i965/fs: Handle explicit flag destinations in flags_written()
[mesa.git] / src / intel / compiler / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include "main/macros.h"
32 #include "brw_eu.h"
33 #include "brw_fs.h"
34 #include "brw_nir.h"
35 #include "brw_vec4_gs_visitor.h"
36 #include "brw_cfg.h"
37 #include "brw_dead_control_flow.h"
38 #include "common/gen_debug.h"
39 #include "compiler/glsl_types.h"
40 #include "compiler/nir/nir_builder.h"
41 #include "program/prog_parameter.h"
42
43 using namespace brw;
44
45 static unsigned get_lowered_simd_width(const struct gen_device_info *devinfo,
46 const fs_inst *inst);
47
48 void
49 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
50 const fs_reg *src, unsigned sources)
51 {
52 memset(this, 0, sizeof(*this));
53
54 this->src = new fs_reg[MAX2(sources, 3)];
55 for (unsigned i = 0; i < sources; i++)
56 this->src[i] = src[i];
57
58 this->opcode = opcode;
59 this->dst = dst;
60 this->sources = sources;
61 this->exec_size = exec_size;
62 this->base_mrf = -1;
63
64 assert(dst.file != IMM && dst.file != UNIFORM);
65
66 assert(this->exec_size != 0);
67
68 this->conditional_mod = BRW_CONDITIONAL_NONE;
69
70 /* This will be the case for almost all instructions. */
71 switch (dst.file) {
72 case VGRF:
73 case ARF:
74 case FIXED_GRF:
75 case MRF:
76 case ATTR:
77 this->size_written = dst.component_size(exec_size);
78 break;
79 case BAD_FILE:
80 this->size_written = 0;
81 break;
82 case IMM:
83 case UNIFORM:
84 unreachable("Invalid destination register file");
85 }
86
87 this->writes_accumulator = false;
88 }
89
90 fs_inst::fs_inst()
91 {
92 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
93 }
94
95 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
96 {
97 init(opcode, exec_size, reg_undef, NULL, 0);
98 }
99
100 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
101 {
102 init(opcode, exec_size, dst, NULL, 0);
103 }
104
105 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
106 const fs_reg &src0)
107 {
108 const fs_reg src[1] = { src0 };
109 init(opcode, exec_size, dst, src, 1);
110 }
111
112 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
113 const fs_reg &src0, const fs_reg &src1)
114 {
115 const fs_reg src[2] = { src0, src1 };
116 init(opcode, exec_size, dst, src, 2);
117 }
118
119 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
120 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
121 {
122 const fs_reg src[3] = { src0, src1, src2 };
123 init(opcode, exec_size, dst, src, 3);
124 }
125
126 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
127 const fs_reg src[], unsigned sources)
128 {
129 init(opcode, exec_width, dst, src, sources);
130 }
131
132 fs_inst::fs_inst(const fs_inst &that)
133 {
134 memcpy(this, &that, sizeof(that));
135
136 this->src = new fs_reg[MAX2(that.sources, 3)];
137
138 for (unsigned i = 0; i < that.sources; i++)
139 this->src[i] = that.src[i];
140 }
141
142 fs_inst::~fs_inst()
143 {
144 delete[] this->src;
145 }
146
147 void
148 fs_inst::resize_sources(uint8_t num_sources)
149 {
150 if (this->sources != num_sources) {
151 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
152
153 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
154 src[i] = this->src[i];
155
156 delete[] this->src;
157 this->src = src;
158 this->sources = num_sources;
159 }
160 }
161
162 void
163 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
164 const fs_reg &dst,
165 const fs_reg &surf_index,
166 const fs_reg &varying_offset,
167 uint32_t const_offset)
168 {
169 /* We have our constant surface use a pitch of 4 bytes, so our index can
170 * be any component of a vector, and then we load 4 contiguous
171 * components starting from that.
172 *
173 * We break down the const_offset to a portion added to the variable offset
174 * and a portion done using fs_reg::offset, which means that if you have
175 * GLSL using something like "uniform vec4 a[20]; gl_FragColor = a[i]",
176 * we'll temporarily generate 4 vec4 loads from offset i * 4, and CSE can
177 * later notice that those loads are all the same and eliminate the
178 * redundant ones.
179 */
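   /* Worked example derived from the math below: for a 32-bit result with
    * const_offset == 28 (0x1c), the aligned part 0x10 is added into
    * vec4_offset here, and the remaining 0xc / 4 selects component 3 of the
    * loaded vec4 in the final MOV at the end of this function.
    */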
180 fs_reg vec4_offset = vgrf(glsl_type::uint_type);
181 bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf));
182
183 /* The pull load message will load a vec4 (16 bytes). If we are loading
184 * a double this means we are only loading 2 elements worth of data.
185 * We also want to use a 32-bit data type for the dst of the load operation
186 * so other parts of the driver don't get confused about the size of the
187 * result.
188 */
189 fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
190 fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
191 vec4_result, surf_index, vec4_offset);
192 inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
193
194 if (type_sz(dst.type) == 8) {
195 shuffle_32bit_load_result_to_64bit_data(
196 bld, retype(vec4_result, dst.type), vec4_result, 2);
197 }
198
199 vec4_result.type = dst.type;
200 bld.MOV(dst, offset(vec4_result, bld,
201 (const_offset & 0xf) / type_sz(vec4_result.type)));
202 }
203
204 /**
205 * A helper for MOV generation for fixing up broken hardware SEND dependency
206 * handling.
207 */
208 void
209 fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
210 {
211 /* The caller always wants an uncompressed instruction, to emit the minimal
212 * extra dependencies and to avoid having to deal with aligning its regs to 2.
213 */
214 const fs_builder ubld = bld.annotate("send dependency resolve")
215 .half(0);
216
217 ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F));
218 }
219
220 bool
221 fs_inst::equals(fs_inst *inst) const
222 {
223 return (opcode == inst->opcode &&
224 dst.equals(inst->dst) &&
225 src[0].equals(inst->src[0]) &&
226 src[1].equals(inst->src[1]) &&
227 src[2].equals(inst->src[2]) &&
228 saturate == inst->saturate &&
229 predicate == inst->predicate &&
230 conditional_mod == inst->conditional_mod &&
231 mlen == inst->mlen &&
232 base_mrf == inst->base_mrf &&
233 target == inst->target &&
234 eot == inst->eot &&
235 header_size == inst->header_size &&
236 shadow_compare == inst->shadow_compare &&
237 exec_size == inst->exec_size &&
238 offset == inst->offset);
239 }
240
241 bool
242 fs_inst::is_send_from_grf() const
243 {
244 switch (opcode) {
245 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
246 case SHADER_OPCODE_SHADER_TIME_ADD:
247 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
248 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
249 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
250 case SHADER_OPCODE_UNTYPED_ATOMIC:
251 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
252 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
253 case SHADER_OPCODE_TYPED_ATOMIC:
254 case SHADER_OPCODE_TYPED_SURFACE_READ:
255 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
256 case SHADER_OPCODE_URB_WRITE_SIMD8:
257 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
258 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
259 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
260 case SHADER_OPCODE_URB_READ_SIMD8:
261 case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
262 return true;
263 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
264 return src[1].file == VGRF;
265 case FS_OPCODE_FB_WRITE:
266 case FS_OPCODE_FB_READ:
267 return src[0].file == VGRF;
268 default:
269 if (is_tex())
270 return src[0].file == VGRF;
271
272 return false;
273 }
274 }
275
276 /**
277 * Returns true if this instruction's sources and destinations cannot
278 * safely be the same register.
279 *
280 * In most cases, a register can be written over safely by the same
281 * instruction that is its last use. For a single instruction, the
282 * sources are dereferenced before writing of the destination starts
283 * (naturally).
284 *
285 * However, there are a few cases where this can be problematic:
286 *
287 * - Virtual opcodes that translate to multiple instructions in the
288 * code generator: if src == dst and one instruction writes the
289 * destination before a later instruction reads the source, then
290 * src will have been clobbered.
291 *
292 * - SIMD16 compressed instructions with certain regioning (see below).
293 *
294 * The register allocator uses this information to set up conflicts between
295 * GRF sources and the destination.
296 */
297 bool
298 fs_inst::has_source_and_destination_hazard() const
299 {
300 switch (opcode) {
301 case FS_OPCODE_PACK_HALF_2x16_SPLIT:
302 /* Multiple partial writes to the destination */
303 return true;
304 default:
305 /* The SIMD16 compressed instruction
306 *
307 * add(16) g4<1>F g4<8,8,1>F g6<8,8,1>F
308 *
309 * is actually decoded in hardware as:
310 *
311 * add(8) g4<1>F g4<8,8,1>F g6<8,8,1>F
312 * add(8) g5<1>F g5<8,8,1>F g7<8,8,1>F
313 *
314 * Which is safe. However, if we have uniform accesses
315 * happening, we get into trouble:
316 *
317 * add(8) g4<1>F g4<0,1,0>F g6<8,8,1>F
318 * add(8) g5<1>F g4<0,1,0>F g7<8,8,1>F
319 *
320 * Now our destination for the first instruction overwrote the
321 * second instruction's src0, and we get garbage for those 8
322 * pixels. There's a similar issue for the pre-gen6
323 * pixel_x/pixel_y, which are registers of 16-bit values and thus
324 * would get stomped by the first decode as well.
325 */
326 if (exec_size == 16) {
327 for (int i = 0; i < sources; i++) {
328 if (src[i].file == VGRF && (src[i].stride == 0 ||
329 src[i].type == BRW_REGISTER_TYPE_UW ||
330 src[i].type == BRW_REGISTER_TYPE_W ||
331 src[i].type == BRW_REGISTER_TYPE_UB ||
332 src[i].type == BRW_REGISTER_TYPE_B)) {
333 return true;
334 }
335 }
336 }
337 return false;
338 }
339 }
340
341 bool
342 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
343 {
344 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
345 return false;
346
347 fs_reg reg = this->src[0];
348 if (reg.file != VGRF || reg.offset != 0 || reg.stride != 1)
349 return false;
350
351 if (grf_alloc.sizes[reg.nr] * REG_SIZE != this->size_written)
352 return false;
353
354 for (int i = 0; i < this->sources; i++) {
355 reg.type = this->src[i].type;
356 if (!this->src[i].equals(reg))
357 return false;
358
359 if (i < this->header_size) {
360 reg.offset += REG_SIZE;
361 } else {
362 reg = horiz_offset(reg, this->exec_size);
363 }
364 }
365
366 return true;
367 }
368
369 bool
370 fs_inst::can_do_source_mods(const struct gen_device_info *devinfo)
371 {
372 if (devinfo->gen == 6 && is_math())
373 return false;
374
375 if (is_send_from_grf())
376 return false;
377
378 if (!backend_instruction::can_do_source_mods())
379 return false;
380
381 return true;
382 }
383
384 bool
385 fs_inst::can_change_types() const
386 {
387 return dst.type == src[0].type &&
388 !src[0].abs && !src[0].negate && !saturate &&
389 (opcode == BRW_OPCODE_MOV ||
390 (opcode == BRW_OPCODE_SEL &&
391 dst.type == src[1].type &&
392 predicate != BRW_PREDICATE_NONE &&
393 !src[1].abs && !src[1].negate));
394 }
395
396 bool
397 fs_inst::has_side_effects() const
398 {
399 return this->eot || backend_instruction::has_side_effects();
400 }
401
402 void
403 fs_reg::init()
404 {
405 memset(this, 0, sizeof(*this));
406 stride = 1;
407 }
408
409 /** Generic unset register constructor. */
410 fs_reg::fs_reg()
411 {
412 init();
413 this->file = BAD_FILE;
414 }
415
416 fs_reg::fs_reg(struct ::brw_reg reg) :
417 backend_reg(reg)
418 {
419 this->offset = 0;
420 this->stride = 1;
421 if (this->file == IMM &&
422 (this->type != BRW_REGISTER_TYPE_V &&
423 this->type != BRW_REGISTER_TYPE_UV &&
424 this->type != BRW_REGISTER_TYPE_VF)) {
425 this->stride = 0;
426 }
427 }
428
429 bool
430 fs_reg::equals(const fs_reg &r) const
431 {
432 return (this->backend_reg::equals(r) &&
433 stride == r.stride);
434 }
435
436 bool
437 fs_reg::is_contiguous() const
438 {
439 return stride == 1;
440 }
441
442 unsigned
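/* Size in bytes of the register region spanned by 'width' channels of this
 * register: width * stride * type size, with a minimum of one element for
 * stride-0 (scalar) regions.  For example, an 8-wide float region with a
 * stride of 2 spans 8 * 2 * 4 = 64 bytes.
 */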
443 fs_reg::component_size(unsigned width) const
444 {
445 const unsigned stride = ((file != ARF && file != FIXED_GRF) ? this->stride :
446 hstride == 0 ? 0 :
447 1 << (hstride - 1));
448 return MAX2(width * stride, 1) * type_sz(type);
449 }
450
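/* Number of 32-bit scalar slots occupied by a GLSL type in the scalar
 * backend: e.g. a vec4 takes 4 slots, a dvec4 takes 8 (64-bit types count
 * twice), and an array takes its length times the element size.
 */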
451 extern "C" int
452 type_size_scalar(const struct glsl_type *type)
453 {
454 unsigned int size, i;
455
456 switch (type->base_type) {
457 case GLSL_TYPE_UINT:
458 case GLSL_TYPE_INT:
459 case GLSL_TYPE_FLOAT:
460 case GLSL_TYPE_BOOL:
461 return type->components();
462 case GLSL_TYPE_DOUBLE:
463 case GLSL_TYPE_UINT64:
464 case GLSL_TYPE_INT64:
465 return type->components() * 2;
466 case GLSL_TYPE_ARRAY:
467 return type_size_scalar(type->fields.array) * type->length;
468 case GLSL_TYPE_STRUCT:
469 size = 0;
470 for (i = 0; i < type->length; i++) {
471 size += type_size_scalar(type->fields.structure[i].type);
472 }
473 return size;
474 case GLSL_TYPE_SAMPLER:
475 /* Samplers take up no register space, since they're baked in at
476 * link time.
477 */
478 return 0;
479 case GLSL_TYPE_ATOMIC_UINT:
480 return 0;
481 case GLSL_TYPE_SUBROUTINE:
482 return 1;
483 case GLSL_TYPE_IMAGE:
484 return BRW_IMAGE_PARAM_SIZE;
485 case GLSL_TYPE_VOID:
486 case GLSL_TYPE_ERROR:
487 case GLSL_TYPE_INTERFACE:
488 case GLSL_TYPE_FUNCTION:
489 unreachable("not reached");
490 }
491
492 return 0;
493 }
494
495 /**
496 * Create a MOV to read the timestamp register.
497 *
498 * The caller is responsible for emitting the MOV. The return value is
499 * the destination of the MOV, with extra parameters set.
500 */
501 fs_reg
502 fs_visitor::get_timestamp(const fs_builder &bld)
503 {
504 assert(devinfo->gen >= 7);
505
506 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
507 BRW_ARF_TIMESTAMP,
508 0),
509 BRW_REGISTER_TYPE_UD));
510
511 fs_reg dst = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
512
513 /* We want to read the 3 fields we care about even if it's not enabled in
514 * the dispatch.
515 */
516 bld.group(4, 0).exec_all().MOV(dst, ts);
517
518 return dst;
519 }
520
521 void
522 fs_visitor::emit_shader_time_begin()
523 {
524 /* We want only the low 32 bits of the timestamp. Since it's running
525 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
526 * which is plenty of time for our purposes. It is identical across the
527 * EUs, but since it's tracking GPU core speed it will increment at a
528 * varying rate as render P-states change.
529 */
530 shader_start_time = component(
531 get_timestamp(bld.annotate("shader time start")), 0);
532 }
533
534 void
535 fs_visitor::emit_shader_time_end()
536 {
537 /* Insert our code just before the final SEND with EOT. */
538 exec_node *end = this->instructions.get_tail();
539 assert(end && ((fs_inst *) end)->eot);
540 const fs_builder ibld = bld.annotate("shader time end")
541 .exec_all().at(NULL, end);
542 const fs_reg timestamp = get_timestamp(ibld);
543
544 /* We only use the low 32 bits of the timestamp - see
545 * emit_shader_time_begin().
546 *
547 * We could also check if render P-states have changed (or anything
548 * else that might disrupt timing) by setting smear to 2 and checking if
549 * that field is != 0.
550 */
551 const fs_reg shader_end_time = component(timestamp, 0);
552
553 /* Check that there weren't any timestamp reset events (assuming these
554 * were the only two timestamp reads that happened).
555 */
556 const fs_reg reset = component(timestamp, 2);
557 set_condmod(BRW_CONDITIONAL_Z,
558 ibld.AND(ibld.null_reg_ud(), reset, brw_imm_ud(1u)));
559 ibld.IF(BRW_PREDICATE_NORMAL);
560
561 fs_reg start = shader_start_time;
562 start.negate = true;
563 const fs_reg diff = component(fs_reg(VGRF, alloc.allocate(1),
564 BRW_REGISTER_TYPE_UD),
565 0);
566 const fs_builder cbld = ibld.group(1, 0);
567 cbld.group(1, 0).ADD(diff, start, shader_end_time);
568
569 /* If there were no instructions between the two timestamp gets, the diff
570 * is 2 cycles. Remove that overhead, so I can forget about that when
571 * trying to determine the time taken for single instructions.
572 */
573 cbld.ADD(diff, diff, brw_imm_ud(-2u));
574 SHADER_TIME_ADD(cbld, 0, diff);
575 SHADER_TIME_ADD(cbld, 1, brw_imm_ud(1u));
576 ibld.emit(BRW_OPCODE_ELSE);
577 SHADER_TIME_ADD(cbld, 2, brw_imm_ud(1u));
578 ibld.emit(BRW_OPCODE_ENDIF);
579 }
580
581 void
582 fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
583 int shader_time_subindex,
584 fs_reg value)
585 {
586 int index = shader_time_index * 3 + shader_time_subindex;
587 struct brw_reg offset = brw_imm_d(index * BRW_SHADER_TIME_STRIDE);
588
589 fs_reg payload;
590 if (dispatch_width == 8)
591 payload = vgrf(glsl_type::uvec2_type);
592 else
593 payload = vgrf(glsl_type::uint_type);
594
595 bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
596 }
597
598 void
599 fs_visitor::vfail(const char *format, va_list va)
600 {
601 char *msg;
602
603 if (failed)
604 return;
605
606 failed = true;
607
608 msg = ralloc_vasprintf(mem_ctx, format, va);
609 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
610
611 this->fail_msg = msg;
612
613 if (debug_enabled) {
614 fprintf(stderr, "%s", msg);
615 }
616 }
617
618 void
619 fs_visitor::fail(const char *format, ...)
620 {
621 va_list va;
622
623 va_start(va, format);
624 vfail(format, va);
625 va_end(va);
626 }
627
628 /**
629 * Mark this program as impossible to compile with dispatch width greater
630 * than n.
631 *
632 * During the SIMD8 compile (which happens first), we can detect and flag
633 * things that are unsupported in SIMD16+ mode, so the compiler can skip the
634 * SIMD16+ compile altogether.
635 *
636 * During a compile of dispatch width greater than n (if one happens anyway),
637 * this just calls fail().
638 */
639 void
640 fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
641 {
642 if (dispatch_width > n) {
643 fail("%s", msg);
644 } else {
645 max_dispatch_width = n;
646 compiler->shader_perf_log(log_data,
647 "Shader dispatch width limited to SIMD%d: %s",
648 n, msg);
649 }
650 }
651
652 /**
653 * Returns true if the instruction has a flag that means it won't
654 * update an entire destination register.
655 *
656 * For example, dead code elimination and live variable analysis want to know
657 * when a write to a variable screens off any preceding values that were in
658 * it.
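 *
 * For example, a predicated write (other than SEL) only updates enabled
 * channels, an 8-wide write of a 16-bit type covers only half a GRF, and a
 * strided or sub-register-offset destination leaves bytes untouched.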
659 */
660 bool
661 fs_inst::is_partial_write() const
662 {
663 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
664 (this->exec_size * type_sz(this->dst.type)) < 32 ||
665 !this->dst.is_contiguous() ||
666 this->dst.offset % REG_SIZE != 0);
667 }
668
669 unsigned
670 fs_inst::components_read(unsigned i) const
671 {
672 /* Return zero if the source is not present. */
673 if (src[i].file == BAD_FILE)
674 return 0;
675
676 switch (opcode) {
677 case FS_OPCODE_LINTERP:
678 if (i == 0)
679 return 2;
680 else
681 return 1;
682
683 case FS_OPCODE_PIXEL_X:
684 case FS_OPCODE_PIXEL_Y:
685 assert(i == 0);
686 return 2;
687
688 case FS_OPCODE_FB_WRITE_LOGICAL:
689 assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
690 /* First/second FB write color. */
691 if (i < 2)
692 return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
693 else
694 return 1;
695
696 case SHADER_OPCODE_TEX_LOGICAL:
697 case SHADER_OPCODE_TXD_LOGICAL:
698 case SHADER_OPCODE_TXF_LOGICAL:
699 case SHADER_OPCODE_TXL_LOGICAL:
700 case SHADER_OPCODE_TXS_LOGICAL:
701 case FS_OPCODE_TXB_LOGICAL:
702 case SHADER_OPCODE_TXF_CMS_LOGICAL:
703 case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
704 case SHADER_OPCODE_TXF_UMS_LOGICAL:
705 case SHADER_OPCODE_TXF_MCS_LOGICAL:
706 case SHADER_OPCODE_LOD_LOGICAL:
707 case SHADER_OPCODE_TG4_LOGICAL:
708 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
709 case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
710 assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
711 src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
712 /* Texture coordinates. */
713 if (i == TEX_LOGICAL_SRC_COORDINATE)
714 return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
715 /* Texture derivatives. */
716 else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
717 opcode == SHADER_OPCODE_TXD_LOGICAL)
718 return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
719 /* Texture offset. */
720 else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
721 return 2;
722 /* MCS */
723 else if (i == TEX_LOGICAL_SRC_MCS && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
724 return 2;
725 else
726 return 1;
727
728 case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
729 case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
730 assert(src[3].file == IMM);
731 /* Surface coordinates. */
732 if (i == 0)
733 return src[3].ud;
734 /* Surface operation source (ignored for reads). */
735 else if (i == 1)
736 return 0;
737 else
738 return 1;
739
740 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
741 case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
742 assert(src[3].file == IMM &&
743 src[4].file == IMM);
744 /* Surface coordinates. */
745 if (i == 0)
746 return src[3].ud;
747 /* Surface operation source. */
748 else if (i == 1)
749 return src[4].ud;
750 else
751 return 1;
752
753 case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
754 case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
755 assert(src[3].file == IMM &&
756 src[4].file == IMM);
757 const unsigned op = src[4].ud;
758 /* Surface coordinates. */
759 if (i == 0)
760 return src[3].ud;
761 /* Surface operation source. */
762 else if (i == 1 && op == BRW_AOP_CMPWR)
763 return 2;
764 else if (i == 1 && (op == BRW_AOP_INC || op == BRW_AOP_DEC ||
765 op == BRW_AOP_PREDEC))
766 return 0;
767 else
768 return 1;
769 }
770
771 default:
772 return 1;
773 }
774 }
775
776 unsigned
777 fs_inst::size_read(int arg) const
778 {
779 switch (opcode) {
780 case FS_OPCODE_FB_WRITE:
781 case FS_OPCODE_FB_READ:
782 case SHADER_OPCODE_URB_WRITE_SIMD8:
783 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
784 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
785 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
786 case SHADER_OPCODE_URB_READ_SIMD8:
787 case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
788 case SHADER_OPCODE_UNTYPED_ATOMIC:
789 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
790 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
791 case SHADER_OPCODE_TYPED_ATOMIC:
792 case SHADER_OPCODE_TYPED_SURFACE_READ:
793 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
794 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
795 if (arg == 0)
796 return mlen * REG_SIZE;
797 break;
798
799 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
800 /* The payload is actually stored in src1 */
801 if (arg == 1)
802 return mlen * REG_SIZE;
803 break;
804
805 case FS_OPCODE_LINTERP:
806 if (arg == 1)
807 return 16;
808 break;
809
810 case SHADER_OPCODE_LOAD_PAYLOAD:
811 if (arg < this->header_size)
812 return REG_SIZE;
813 break;
814
815 case CS_OPCODE_CS_TERMINATE:
816 case SHADER_OPCODE_BARRIER:
817 return REG_SIZE;
818
819 case SHADER_OPCODE_MOV_INDIRECT:
820 if (arg == 0) {
821 assert(src[2].file == IMM);
822 return src[2].ud;
823 }
824 break;
825
826 default:
827 if (is_tex() && arg == 0 && src[0].file == VGRF)
828 return mlen * REG_SIZE;
829 break;
830 }
831
832 switch (src[arg].file) {
833 case UNIFORM:
834 case IMM:
835 return components_read(arg) * type_sz(src[arg].type);
836 case BAD_FILE:
837 case ARF:
838 case FIXED_GRF:
839 case VGRF:
840 case ATTR:
841 return components_read(arg) * src[arg].component_size(exec_size);
842 case MRF:
843 unreachable("MRF registers are not allowed as sources");
844 }
845 return 0;
846 }
847
848 namespace {
849 /* Return the subset of flag registers that an instruction could
850 * potentially read or write based on the execution controls and flag
851 * subregister number of the instruction.
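    *
    * Each bit of the mask covers one byte (8 channels) of flag storage; for
    * example, a SIMD16 instruction in group 0 with flag_subreg 0 spans
    * channel bits 0..15 of f0 and yields the mask 0x3.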
852 */
853 unsigned
854 flag_mask(const fs_inst *inst)
855 {
856 const unsigned start = inst->flag_subreg * 16 + inst->group;
857 const unsigned end = start + inst->exec_size;
858 return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
859 }
860
861 unsigned
862 bit_mask(unsigned n)
863 {
864 return (n >= CHAR_BIT * sizeof(bit_mask(n)) ? ~0u : (1u << n) - 1);
865 }
866
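   /* Return the subset of flag registers that a write of 'sz' bytes to the
    * register 'r' could touch, again one bit per byte of flag storage.  Only
    * ARF flag destinations count; e.g. a 4-byte region at the start of f1
    * covers flag-file bytes 4..7 and yields the mask 0xf0.
    */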
867 unsigned
868 flag_mask(const fs_reg &r, unsigned sz)
869 {
870 if (r.file == ARF) {
871 const unsigned start = (r.nr - BRW_ARF_FLAG) * 4 + r.subnr;
872 const unsigned end = start + sz;
873 return bit_mask(end) & ~bit_mask(start);
874 } else {
875 return 0;
876 }
877 }
878 }
879
880 unsigned
881 fs_inst::flags_read(const gen_device_info *devinfo) const
882 {
883 /* XXX - This doesn't consider explicit uses of the flag register as source
884 * region.
885 */
886 if (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
887 predicate == BRW_PREDICATE_ALIGN1_ALLV) {
888 /* The vertical predication modes combine corresponding bits from
889 * f0.0 and f1.0 on Gen7+, and f0.0 and f0.1 on older hardware.
890 */
891 const unsigned shift = devinfo->gen >= 7 ? 4 : 2;
892 return flag_mask(this) << shift | flag_mask(this);
893 } else if (predicate) {
894 return flag_mask(this);
895 } else {
896 return 0;
897 }
898 }
899
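/* Return the subset of flag registers written by this instruction, one bit
 * per byte (8 channels) of flag storage.  Instructions with a conditional
 * mod (other than SEL/IF/WHILE) and FS_OPCODE_MOV_DISPATCH_TO_FLAGS write
 * the bytes covered by their execution controls, e.g. a SIMD16 CMP in
 * group 0 using f0.0 yields 0x3; anything else only writes flags through an
 * explicit ARF flag destination, e.g. a 4-byte write to f1.0 covers bytes
 * 4..7 and yields 0xf0.
 */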
900 unsigned
901 fs_inst::flags_written() const
902 {
903 if ((conditional_mod && (opcode != BRW_OPCODE_SEL &&
904 opcode != BRW_OPCODE_IF &&
905 opcode != BRW_OPCODE_WHILE)) ||
906 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
907 return flag_mask(this);
908 } else {
909 return flag_mask(dst, size_written);
910 }
911 }
912
913 /**
914 * Returns how many MRFs an FS opcode will write over.
915 *
916 * Note that this is not the 0 or 1 implied writes in an actual gen
917 * instruction -- the FS opcodes often generate MOVs in addition.
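 *
 * For example, a two-source math opcode such as SHADER_OPCODE_POW writes
 * 2 * dispatch_width / 8 MRFs, i.e. 4 MRFs in SIMD16.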
918 */
919 int
920 fs_visitor::implied_mrf_writes(fs_inst *inst)
921 {
922 if (inst->mlen == 0)
923 return 0;
924
925 if (inst->base_mrf == -1)
926 return 0;
927
928 switch (inst->opcode) {
929 case SHADER_OPCODE_RCP:
930 case SHADER_OPCODE_RSQ:
931 case SHADER_OPCODE_SQRT:
932 case SHADER_OPCODE_EXP2:
933 case SHADER_OPCODE_LOG2:
934 case SHADER_OPCODE_SIN:
935 case SHADER_OPCODE_COS:
936 return 1 * dispatch_width / 8;
937 case SHADER_OPCODE_POW:
938 case SHADER_OPCODE_INT_QUOTIENT:
939 case SHADER_OPCODE_INT_REMAINDER:
940 return 2 * dispatch_width / 8;
941 case SHADER_OPCODE_TEX:
942 case FS_OPCODE_TXB:
943 case SHADER_OPCODE_TXD:
944 case SHADER_OPCODE_TXF:
945 case SHADER_OPCODE_TXF_CMS:
946 case SHADER_OPCODE_TXF_MCS:
947 case SHADER_OPCODE_TG4:
948 case SHADER_OPCODE_TG4_OFFSET:
949 case SHADER_OPCODE_TXL:
950 case SHADER_OPCODE_TXS:
951 case SHADER_OPCODE_LOD:
952 case SHADER_OPCODE_SAMPLEINFO:
953 return 1;
954 case FS_OPCODE_FB_WRITE:
955 return 2;
956 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
957 case SHADER_OPCODE_GEN4_SCRATCH_READ:
958 return 1;
959 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
960 return inst->mlen;
961 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
962 return inst->mlen;
963 default:
964 unreachable("not reached");
965 }
966 }
967
968 fs_reg
969 fs_visitor::vgrf(const glsl_type *const type)
970 {
971 int reg_width = dispatch_width / 8;
972 return fs_reg(VGRF, alloc.allocate(type_size_scalar(type) * reg_width),
973 brw_type_for_base_type(type));
974 }
975
976 fs_reg::fs_reg(enum brw_reg_file file, int nr)
977 {
978 init();
979 this->file = file;
980 this->nr = nr;
981 this->type = BRW_REGISTER_TYPE_F;
982 this->stride = (file == UNIFORM ? 0 : 1);
983 }
984
985 fs_reg::fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type)
986 {
987 init();
988 this->file = file;
989 this->nr = nr;
990 this->type = type;
991 this->stride = (file == UNIFORM ? 0 : 1);
992 }
993
994 /* For SIMD16, we need to reuse the uniform setup from the SIMD8 dispatch
995 * compile.  This brings in those uniform definitions.
996 */
997 void
998 fs_visitor::import_uniforms(fs_visitor *v)
999 {
1000 this->push_constant_loc = v->push_constant_loc;
1001 this->pull_constant_loc = v->pull_constant_loc;
1002 this->uniforms = v->uniforms;
1003 }
1004
1005 void
1006 fs_visitor::emit_fragcoord_interpolation(fs_reg wpos)
1007 {
1008 assert(stage == MESA_SHADER_FRAGMENT);
1009
1010 /* gl_FragCoord.x */
1011 bld.MOV(wpos, this->pixel_x);
1012 wpos = offset(wpos, bld, 1);
1013
1014 /* gl_FragCoord.y */
1015 bld.MOV(wpos, this->pixel_y);
1016 wpos = offset(wpos, bld, 1);
1017
1018 /* gl_FragCoord.z */
1019 if (devinfo->gen >= 6) {
1020 bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
1021 } else {
1022 bld.emit(FS_OPCODE_LINTERP, wpos,
1023 this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL],
1024 interp_reg(VARYING_SLOT_POS, 2));
1025 }
1026 wpos = offset(wpos, bld, 1);
1027
1028 /* gl_FragCoord.w: Already set up in emit_interpolation */
1029 bld.MOV(wpos, this->wpos_w);
1030 }
1031
1032 enum brw_barycentric_mode
1033 brw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op)
1034 {
1035 /* Barycentric modes don't make sense for flat inputs. */
1036 assert(mode != INTERP_MODE_FLAT);
1037
1038 unsigned bary;
1039 switch (op) {
1040 case nir_intrinsic_load_barycentric_pixel:
1041 case nir_intrinsic_load_barycentric_at_offset:
1042 bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
1043 break;
1044 case nir_intrinsic_load_barycentric_centroid:
1045 bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
1046 break;
1047 case nir_intrinsic_load_barycentric_sample:
1048 case nir_intrinsic_load_barycentric_at_sample:
1049 bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
1050 break;
1051 default:
1052 unreachable("invalid intrinsic");
1053 }
1054
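   /* The non-perspective modes are assumed to follow their perspective
    * counterparts in brw_barycentric_mode, three entries apart, so switching
    * to NOPERSPECTIVE is a fixed offset.
    */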
1055 if (mode == INTERP_MODE_NOPERSPECTIVE)
1056 bary += 3;
1057
1058 return (enum brw_barycentric_mode) bary;
1059 }
1060
1061 /**
1062 * Turn one of the two CENTROID barycentric modes into PIXEL mode.
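 *
 * This relies on each PIXEL mode immediately preceding the corresponding
 * CENTROID mode in brw_barycentric_mode, hence the "- 1".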
1063 */
1064 static enum brw_barycentric_mode
1065 centroid_to_pixel(enum brw_barycentric_mode bary)
1066 {
1067 assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID ||
1068 bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
1069 return (enum brw_barycentric_mode) ((unsigned) bary - 1);
1070 }
1071
1072 fs_reg *
1073 fs_visitor::emit_frontfacing_interpolation()
1074 {
1075 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1076
1077 if (devinfo->gen >= 6) {
1078 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1079 * a boolean result from this (~0/true or 0/false).
1080 *
1081 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1082 * this task in only one instruction:
1083 * - a negation source modifier will flip the bit; and
1084 * - a W -> D type conversion will sign extend the bit into the high
1085 * word of the destination.
1086 *
1087 * An ASR 15 fills the low word of the destination.
1088 */
1089 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1090 g0.negate = true;
1091
1092 bld.ASR(*reg, g0, brw_imm_d(15));
1093 } else {
1094 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1095 * a boolean result from this (1/true or 0/false).
1096 *
1097 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1098 * the negation source modifier to flip it. Unfortunately the SHR
1099 * instruction only operates on UD (or D with an abs source modifier)
1100 * sources without negation.
1101 *
1102 * Instead, use ASR (which will give ~0/true or 0/false).
1103 */
1104 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1105 g1_6.negate = true;
1106
1107 bld.ASR(*reg, g1_6, brw_imm_d(31));
1108 }
1109
1110 return reg;
1111 }
1112
1113 void
1114 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1115 {
1116 assert(stage == MESA_SHADER_FRAGMENT);
1117 struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
1118 assert(dst.type == BRW_REGISTER_TYPE_F);
1119
1120 if (wm_prog_data->persample_dispatch) {
1121 /* Convert int_sample_pos to floating point */
1122 bld.MOV(dst, int_sample_pos);
1123 /* Scale to the range [0, 1] */
1124 bld.MUL(dst, dst, brw_imm_f(1 / 16.0f));
1125 }
1126 else {
1127 /* From the ARB_sample_shading specification:
1128 * "When rendering to a non-multisample buffer, or if multisample
1129 * rasterization is disabled, gl_SamplePosition will always be
1130 * (0.5, 0.5)."
1131 */
1132 bld.MOV(dst, brw_imm_f(0.5f));
1133 }
1134 }
1135
1136 fs_reg *
1137 fs_visitor::emit_samplepos_setup()
1138 {
1139 assert(devinfo->gen >= 6);
1140
1141 const fs_builder abld = bld.annotate("compute sample position");
1142 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1143 fs_reg pos = *reg;
1144 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1145 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1146
1147 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1148 * mode will be enabled.
1149 *
1150 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1151 * R31.1:0 Position Offset X/Y for Slot[3:0]
1152 * R31.3:2 Position Offset X/Y for Slot[7:4]
1153 * .....
1154 *
1155 * The X, Y sample positions come in as bytes in the thread payload. So, read
1156 * the positions using vstride=16, width=8, hstride=2.
1157 */
1158 struct brw_reg sample_pos_reg =
1159 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1160 BRW_REGISTER_TYPE_B), 16, 8, 2);
1161
1162 if (dispatch_width == 8) {
1163 abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
1164 } else {
1165 abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
1166 abld.half(1).MOV(half(int_sample_x, 1),
1167 fs_reg(suboffset(sample_pos_reg, 16)));
1168 }
1169 /* Compute gl_SamplePosition.x */
1170 compute_sample_position(pos, int_sample_x);
1171 pos = offset(pos, abld, 1);
1172 if (dispatch_width == 8) {
1173 abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
1174 } else {
1175 abld.half(0).MOV(half(int_sample_y, 0),
1176 fs_reg(suboffset(sample_pos_reg, 1)));
1177 abld.half(1).MOV(half(int_sample_y, 1),
1178 fs_reg(suboffset(sample_pos_reg, 17)));
1179 }
1180 /* Compute gl_SamplePosition.y */
1181 compute_sample_position(pos, int_sample_y);
1182 return reg;
1183 }
1184
1185 fs_reg *
1186 fs_visitor::emit_sampleid_setup()
1187 {
1188 assert(stage == MESA_SHADER_FRAGMENT);
1189 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1190 assert(devinfo->gen >= 6);
1191
1192 const fs_builder abld = bld.annotate("compute sample id");
1193 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1194
1195 if (!key->multisample_fbo) {
1196 /* As per GL_ARB_sample_shading specification:
1197 * "When rendering to a non-multisample buffer, or if multisample
1198 * rasterization is disabled, gl_SampleID will always be zero."
1199 */
1200 abld.MOV(*reg, brw_imm_d(0));
1201 } else if (devinfo->gen >= 8) {
1202 /* Sample ID comes in as 4-bit numbers in g1.0:
1203 *
1204 * 15:12 Slot 3 SampleID (only used in SIMD16)
1205 * 11:8 Slot 2 SampleID (only used in SIMD16)
1206 * 7:4 Slot 1 SampleID
1207 * 3:0 Slot 0 SampleID
1208 *
1209 * Each slot corresponds to four channels, so we want to replicate each
1210 * half-byte value to 4 channels in a row:
1211 *
1212 * dst+0: .7 .6 .5 .4 .3 .2 .1 .0
1213 * 7:4 7:4 7:4 7:4 3:0 3:0 3:0 3:0
1214 *
1215 * dst+1: .7 .6 .5 .4 .3 .2 .1 .0 (if SIMD16)
1216 * 15:12 15:12 15:12 15:12 11:8 11:8 11:8 11:8
1217 *
1218 * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
1219 * channels to read the first byte (7:0), and the second group of 8
1220 * channels to read the second byte (15:8). Then, we shift right by
1221 * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
1222 * values into place. Finally, we AND with 0xf to keep the low nibble.
1223 *
1224 * shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
1225 * and(16) dst<1>D tmp<8,8,1>W 0xf:W
1226 *
1227 * TODO: These payload bits exist on Gen7 too, but they appear to always
1228 * be zero, so this code fails to work. We should find out why.
1229 */
1230 fs_reg tmp(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
1231
1232 abld.SHR(tmp, fs_reg(stride(retype(brw_vec1_grf(1, 0),
1233 BRW_REGISTER_TYPE_B), 1, 8, 0)),
1234 brw_imm_v(0x44440000));
1235 abld.AND(*reg, tmp, brw_imm_w(0xf));
1236 } else {
1237 const fs_reg t1 = component(fs_reg(VGRF, alloc.allocate(1),
1238 BRW_REGISTER_TYPE_D), 0);
1239 const fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
1240
1241 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1242 * 8x multisampling, subspan 0 will represent sample N (where N
1243 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1244 * 7. We can find the value of N by looking at R0.0 bits 7:6
1245 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1246 * (since samples are always delivered in pairs). That is, we
1247 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1248 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1249 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1250 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1251 * populating a temporary variable with the sequence (0, 1, 2, 3),
1252 * and then reading from it using vstride=1, width=4, hstride=0.
1253 * These computations hold good for 4x multisampling as well.
1254 *
1255 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1256 * the first four slots are sample 0 of subspan 0; the next four
1257 * are sample 1 of subspan 0; the third group is sample 0 of
1258 * subspan 1, and finally sample 1 of subspan 1.
1259 */
1260
1261 /* SKL+ has an extra bit for the Starting Sample Pair Index to
1262 * accommodate 16x MSAA.
1263 */
1264 abld.exec_all().group(1, 0)
1265 .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1266 brw_imm_ud(0xc0));
1267 abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));
1268
1269 /* This works for both SIMD8 and SIMD16 */
1270 abld.exec_all().group(4, 0).MOV(t2, brw_imm_v(0x3210));
1271
1272 /* This special instruction takes care of setting vstride=1,
1273 * width=4, hstride=0 of t2 during an ADD instruction.
1274 */
1275 abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1276 }
1277
1278 return reg;
1279 }
1280
1281 fs_reg *
1282 fs_visitor::emit_samplemaskin_setup()
1283 {
1284 assert(stage == MESA_SHADER_FRAGMENT);
1285 struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
1286 assert(devinfo->gen >= 6);
1287
1288 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1289
1290 fs_reg coverage_mask(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
1291 BRW_REGISTER_TYPE_D));
1292
1293 if (wm_prog_data->persample_dispatch) {
1294 /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
1295 * and a mask representing which sample is being processed by the
1296 * current shader invocation.
1297 *
1298 * From the OES_sample_variables specification:
1299 * "When per-sample shading is active due to the use of a fragment input
1300 * qualified by "sample" or due to the use of the gl_SampleID or
1301 * gl_SamplePosition variables, only the bit for the current sample is
1302 * set in gl_SampleMaskIn."
1303 */
1304 const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
1305
1306 if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
1307 nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
1308
1309 fs_reg one = vgrf(glsl_type::int_type);
1310 fs_reg enabled_mask = vgrf(glsl_type::int_type);
1311 abld.MOV(one, brw_imm_d(1));
1312 abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]);
1313 abld.AND(*reg, enabled_mask, coverage_mask);
1314 } else {
1315 /* In per-pixel mode, the coverage mask is sufficient. */
1316 *reg = coverage_mask;
1317 }
1318 return reg;
1319 }
1320
1321 fs_reg
1322 fs_visitor::resolve_source_modifiers(const fs_reg &src)
1323 {
1324 if (!src.abs && !src.negate)
1325 return src;
1326
1327 fs_reg temp = bld.vgrf(src.type);
1328 bld.MOV(temp, src);
1329
1330 return temp;
1331 }
1332
1333 void
1334 fs_visitor::emit_discard_jump()
1335 {
1336 assert(brw_wm_prog_data(this->prog_data)->uses_kill);
1337
1338 /* For performance, after a discard, jump to the end of the
1339 * shader if all relevant channels have been discarded.
1340 */
1341 fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
1342 discard_jump->flag_subreg = 1;
1343
1344 discard_jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H;
1345 discard_jump->predicate_inverse = true;
1346 }
1347
1348 void
1349 fs_visitor::emit_gs_thread_end()
1350 {
1351 assert(stage == MESA_SHADER_GEOMETRY);
1352
1353 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1354
1355 if (gs_compile->control_data_header_size_bits > 0) {
1356 emit_gs_control_data_bits(this->final_gs_vertex_count);
1357 }
1358
1359 const fs_builder abld = bld.annotate("thread end");
1360 fs_inst *inst;
1361
1362 if (gs_prog_data->static_vertex_count != -1) {
1363 foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
1364 if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 ||
1365 prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
1366 prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
1367 prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) {
1368 prev->eot = true;
1369
1370 /* Delete now dead instructions. */
1371 foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
1372 if (dead == prev)
1373 break;
1374 dead->remove();
1375 }
1376 return;
1377 } else if (prev->is_control_flow() || prev->has_side_effects()) {
1378 break;
1379 }
1380 }
1381 fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1382 abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)));
1383 inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr);
1384 inst->mlen = 1;
1385 } else {
1386 fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2);
1387 fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
1388 sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
1389 sources[1] = this->final_gs_vertex_count;
1390 abld.LOAD_PAYLOAD(payload, sources, 2, 2);
1391 inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
1392 inst->mlen = 2;
1393 }
1394 inst->eot = true;
1395 inst->offset = 0;
1396 }
1397
1398 void
1399 fs_visitor::assign_curb_setup()
1400 {
1401 unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
1402
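   /* Pushed UBO ranges are laid out directly after the regular push
    * constants.  ubo_push_start[] is in 32-bit units (8 per GRF); e.g. with
    * two GRFs of uniforms and a 3-GRF range 0, range 1 starts at
    * 8 * (3 + 2) = 40.
    */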
1403 unsigned ubo_push_length = 0;
1404 unsigned ubo_push_start[4];
1405 for (int i = 0; i < 4; i++) {
1406 ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
1407 ubo_push_length += stage_prog_data->ubo_ranges[i].length;
1408 }
1409
1410 prog_data->curb_read_length = uniform_push_length + ubo_push_length;
1411
1412 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1413 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1414 for (unsigned int i = 0; i < inst->sources; i++) {
1415 if (inst->src[i].file == UNIFORM) {
1416 int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
1417 int constant_nr;
1418 if (inst->src[i].nr >= UBO_START) {
1419 /* constant_nr is in 32-bit units, the rest are in bytes */
1420 constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
1421 inst->src[i].offset / 4;
1422 } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1423 constant_nr = push_constant_loc[uniform_nr];
1424 } else {
1425 /* Section 5.11 of the OpenGL 4.1 spec says:
1426 * "Out-of-bounds reads return undefined values, which include
1427 * values from other variables of the active program or zero."
1428 * Just return the first push constant.
1429 */
1430 constant_nr = 0;
1431 }
1432
1433 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1434 constant_nr / 8,
1435 constant_nr % 8);
1436 brw_reg.abs = inst->src[i].abs;
1437 brw_reg.negate = inst->src[i].negate;
1438
1439 assert(inst->src[i].stride == 0);
1440 inst->src[i] = byte_offset(
1441 retype(brw_reg, inst->src[i].type),
1442 inst->src[i].offset % 4);
1443 }
1444 }
1445 }
1446
1447 /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
1448 this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length;
1449 }
1450
1451 void
1452 fs_visitor::calculate_urb_setup()
1453 {
1454 assert(stage == MESA_SHADER_FRAGMENT);
1455 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
1456 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1457
1458 memset(prog_data->urb_setup, -1,
1459 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1460
1461 int urb_next = 0;
1462 /* Figure out where each of the incoming setup attributes lands. */
1463 if (devinfo->gen >= 6) {
1464 if (_mesa_bitcount_64(nir->info.inputs_read &
1465 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1466 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1467 * first 16 varying inputs, so we can put them wherever we want.
1468 * Just put them in order.
1469 *
1470 * This is useful because it means that (a) inputs not used by the
1471 * fragment shader won't take up valuable register space, and (b) we
1472 * won't have to recompile the fragment shader if it gets paired with
1473 * a different vertex (or geometry) shader.
1474 */
1475 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1476 if (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
1477 BITFIELD64_BIT(i)) {
1478 prog_data->urb_setup[i] = urb_next++;
1479 }
1480 }
1481 } else {
1482 bool include_vue_header =
1483 nir->info.inputs_read & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
1484
1485 /* We have enough input varyings that the SF/SBE pipeline stage can't
1486 * arbitrarily rearrange them to suit our whim; we have to put them
1487 * in an order that matches the output of the previous pipeline stage
1488 * (geometry or vertex shader).
1489 */
1490 struct brw_vue_map prev_stage_vue_map;
1491 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1492 key->input_slots_valid,
1493 nir->info.separate_shader);
1494 int first_slot =
1495 include_vue_header ? 0 : 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1496
1497 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1498 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1499 slot++) {
1500 int varying = prev_stage_vue_map.slot_to_varying[slot];
1501 if (varying != BRW_VARYING_SLOT_PAD &&
1502 (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
1503 BITFIELD64_BIT(varying))) {
1504 prog_data->urb_setup[varying] = slot - first_slot;
1505 }
1506 }
1507 urb_next = prev_stage_vue_map.num_slots - first_slot;
1508 }
1509 } else {
1510 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1511 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1512 /* Point size is packed into the header, not as a general attribute */
1513 if (i == VARYING_SLOT_PSIZ)
1514 continue;
1515
1516 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1517 /* The back color slot is skipped when the front color is
1518 * also written to. In addition, some slots can be
1519 * written in the vertex shader and not read in the
1520 * fragment shader. So the register number must always be
1521 * incremented, mapped or not.
1522 */
1523 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1524 prog_data->urb_setup[i] = urb_next;
1525 urb_next++;
1526 }
1527 }
1528
1529 /*
1530 * It's an FS-only attribute, and we did the interpolation for this
1531 * attribute in the SF thread.  So count it here, too.
1532 *
1533 * See compile_sf_prog() for more info.
1534 */
1535 if (nir->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1536 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1537 }
1538
1539 prog_data->num_varying_inputs = urb_next;
1540 }
1541
1542 void
1543 fs_visitor::assign_urb_setup()
1544 {
1545 assert(stage == MESA_SHADER_FRAGMENT);
1546 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
1547
1548 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1549
1550 /* Offset all the urb_setup[] index by the actual position of the
1551 * setup regs, now that the location of the constants has been chosen.
1552 */
1553 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1554 if (inst->opcode == FS_OPCODE_LINTERP) {
1555 assert(inst->src[1].file == FIXED_GRF);
1556 inst->src[1].nr += urb_start;
1557 }
1558
1559 if (inst->opcode == FS_OPCODE_CINTERP) {
1560 assert(inst->src[0].file == FIXED_GRF);
1561 inst->src[0].nr += urb_start;
1562 }
1563 }
1564
1565 /* Each attribute is 4 setup channels, each of which is half a reg. */
1566 this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
1567 }
1568
1569 void
1570 fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
1571 {
1572 for (int i = 0; i < inst->sources; i++) {
1573 if (inst->src[i].file == ATTR) {
1574 int grf = payload.num_regs +
1575 prog_data->curb_read_length +
1576 inst->src[i].nr +
1577 inst->src[i].offset / REG_SIZE;
1578
1579 /* As explained at brw_reg_from_fs_reg, From the Haswell PRM:
1580 *
1581 * VertStride must be used to cross GRF register boundaries. This
1582 * rule implies that elements within a 'Width' cannot cross GRF
1583 * boundaries.
1584 *
1585 * So, for registers that are large enough, we have to split the exec
1586 * size in two and trust the compression state to sort it out.
1587 */
1588 unsigned total_size = inst->exec_size *
1589 inst->src[i].stride *
1590 type_sz(inst->src[i].type);
1591
1592 assert(total_size <= 2 * REG_SIZE);
1593 const unsigned exec_size =
1594 (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
1595
1596 unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
1597 struct brw_reg reg =
1598 stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1599 inst->src[i].offset % REG_SIZE),
1600 exec_size * inst->src[i].stride,
1601 width, inst->src[i].stride);
1602 reg.abs = inst->src[i].abs;
1603 reg.negate = inst->src[i].negate;
1604
1605 inst->src[i] = reg;
1606 }
1607 }
1608 }
1609
1610 void
1611 fs_visitor::assign_vs_urb_setup()
1612 {
1613 struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);
1614
1615 assert(stage == MESA_SHADER_VERTEX);
1616
1617 /* Each attribute is 4 regs. */
1618 this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;
1619
1620 assert(vs_prog_data->base.urb_read_length <= 15);
1621
1622 /* Rewrite all ATTR file references to the hw grf that they land in. */
1623 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1624 convert_attr_sources_to_hw_regs(inst);
1625 }
1626 }
1627
1628 void
1629 fs_visitor::assign_tcs_single_patch_urb_setup()
1630 {
1631 assert(stage == MESA_SHADER_TESS_CTRL);
1632
1633 /* Rewrite all ATTR file references to HW_REGs. */
1634 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1635 convert_attr_sources_to_hw_regs(inst);
1636 }
1637 }
1638
1639 void
1640 fs_visitor::assign_tes_urb_setup()
1641 {
1642 assert(stage == MESA_SHADER_TESS_EVAL);
1643
1644 struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
1645
1646 first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
1647
1648 /* Rewrite all ATTR file references to HW_REGs. */
1649 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1650 convert_attr_sources_to_hw_regs(inst);
1651 }
1652 }
1653
1654 void
1655 fs_visitor::assign_gs_urb_setup()
1656 {
1657 assert(stage == MESA_SHADER_GEOMETRY);
1658
1659 struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
1660
1661 first_non_payload_grf +=
1662 8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
1663
1664 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1665 /* Rewrite all ATTR file references to GRFs. */
1666 convert_attr_sources_to_hw_regs(inst);
1667 }
1668 }
1669
1670
1671 /**
1672 * Split large virtual GRFs into separate components if we can.
1673 *
1674 * This is mostly duplicated with what brw_fs_vector_splitting does,
1675 * but that's really conservative because it's afraid of doing
1676 * splitting that doesn't result in real progress after the rest of
1677 * the optimization phases, which would cause infinite looping in
1678 * optimization. We can do it once here, safely. This also has the
1679 * opportunity to split interpolated values, or maybe even uniforms,
1680 * which we don't have at the IR level.
1681 *
1682 * We want to split, because virtual GRFs are what we register
1683 * allocate and spill (due to contiguousness requirements for some
1684 * instructions), and they're what we naturally generate in the
1685 * codegen process, but most virtual GRFs don't actually need to be
1686 * contiguous sets of GRFs. If we split, we'll end up with reduced
1687 * live intervals and better dead code elimination and coalescing.
1688 */
1689 void
1690 fs_visitor::split_virtual_grfs()
1691 {
1692 /* Compact the register file so we eliminate dead vgrfs. This
1693 * only defines split points for live registers, so if we left overly
1694 * large dead registers around they would hit assertions later.
1695 */
1696 compact_virtual_grfs();
1697
1698 int num_vars = this->alloc.count;
1699
1700 /* Count the total number of registers */
1701 int reg_count = 0;
1702 int vgrf_to_reg[num_vars];
1703 for (int i = 0; i < num_vars; i++) {
1704 vgrf_to_reg[i] = reg_count;
1705 reg_count += alloc.sizes[i];
1706 }
1707
1708 /* An array of "split points". For each register slot, this indicates
1709 * if this slot can be separated from the previous slot. Every time an
1710 * instruction uses multiple elements of a register (as a source or
1711 * destination), we mark the used slots as inseparable. Then we go
1712 * through and split the registers into the smallest pieces we can.
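    *
    * For example, a four-register VGRF whose first and last two registers
    * are each written by SIMD16 instructions keeps a split point only
    * between registers 1 and 2, so it ends up split into two two-register
    * VGRFs.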
1713 */
1714 bool split_points[reg_count];
1715 memset(split_points, 0, sizeof(split_points));
1716
1717 /* Mark all used registers as fully splittable */
1718 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1719 if (inst->dst.file == VGRF) {
1720 int reg = vgrf_to_reg[inst->dst.nr];
1721 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
1722 split_points[reg + j] = true;
1723 }
1724
1725 for (int i = 0; i < inst->sources; i++) {
1726 if (inst->src[i].file == VGRF) {
1727 int reg = vgrf_to_reg[inst->src[i].nr];
1728 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
1729 split_points[reg + j] = true;
1730 }
1731 }
1732 }
1733
1734 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1735 if (inst->dst.file == VGRF) {
1736 int reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
1737 for (unsigned j = 1; j < regs_written(inst); j++)
1738 split_points[reg + j] = false;
1739 }
1740 for (int i = 0; i < inst->sources; i++) {
1741 if (inst->src[i].file == VGRF) {
1742 int reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
1743 for (unsigned j = 1; j < regs_read(inst, i); j++)
1744 split_points[reg + j] = false;
1745 }
1746 }
1747 }
1748
1749 int new_virtual_grf[reg_count];
1750 int new_reg_offset[reg_count];
1751
1752 int reg = 0;
1753 for (int i = 0; i < num_vars; i++) {
1754 /* The first one should always be 0 as a quick sanity check. */
1755 assert(split_points[reg] == false);
1756
1757 /* j = 0 case */
1758 new_reg_offset[reg] = 0;
1759 reg++;
1760 int offset = 1;
1761
1762 /* j > 0 case */
1763 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1764 /* If this is a split point, allocate a new virtual GRF covering the
1765 * previous `offset` registers and reset the offset to 0.
1766 */
1767 if (split_points[reg]) {
1768 assert(offset <= MAX_VGRF_SIZE);
1769 int grf = alloc.allocate(offset);
1770 for (int k = reg - offset; k < reg; k++)
1771 new_virtual_grf[k] = grf;
1772 offset = 0;
1773 }
1774 new_reg_offset[reg] = offset;
1775 offset++;
1776 reg++;
1777 }
1778
1779 /* The last one gets the original register number */
1780 assert(offset <= MAX_VGRF_SIZE);
1781 alloc.sizes[i] = offset;
1782 for (int k = reg - offset; k < reg; k++)
1783 new_virtual_grf[k] = i;
1784 }
1785 assert(reg == reg_count);
1786
1787 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1788 if (inst->dst.file == VGRF) {
1789 reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
1790 inst->dst.nr = new_virtual_grf[reg];
1791 inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
1792 inst->dst.offset % REG_SIZE;
1793 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1794 }
1795 for (int i = 0; i < inst->sources; i++) {
1796 if (inst->src[i].file == VGRF) {
1797 reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
1798 inst->src[i].nr = new_virtual_grf[reg];
1799 inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
1800 inst->src[i].offset % REG_SIZE;
1801 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1802 }
1803 }
1804 }
1805 invalidate_live_intervals();
1806 }
1807
1808 /**
1809 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1810 *
1811 * During code generation, we create tons of temporary variables, many of
1812 * which get immediately killed and are never used again. Yet, in later
1813 * optimization and analysis passes, such as compute_live_intervals, we need
1814 * to loop over all the virtual GRFs. Compacting them can save a lot of
1815 * overhead.
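*
* Concretely (hypothetical numbering): if only VGRFs 0, 3 and 7 are still
* referenced, the remap table renumbers them to 0, 1 and 2 and every
* instruction source and destination is patched accordingly.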
1816 */
1817 bool
1818 fs_visitor::compact_virtual_grfs()
1819 {
1820 bool progress = false;
1821 int remap_table[this->alloc.count];
1822 memset(remap_table, -1, sizeof(remap_table));
1823
1824 /* Mark which virtual GRFs are used. */
1825 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1826 if (inst->dst.file == VGRF)
1827 remap_table[inst->dst.nr] = 0;
1828
1829 for (int i = 0; i < inst->sources; i++) {
1830 if (inst->src[i].file == VGRF)
1831 remap_table[inst->src[i].nr] = 0;
1832 }
1833 }
1834
1835 /* Compact the GRF arrays. */
1836 int new_index = 0;
1837 for (unsigned i = 0; i < this->alloc.count; i++) {
1838 if (remap_table[i] == -1) {
1839 /* We just found an unused register. This means that we are
1840 * actually going to compact something.
1841 */
1842 progress = true;
1843 } else {
1844 remap_table[i] = new_index;
1845 alloc.sizes[new_index] = alloc.sizes[i];
1846 invalidate_live_intervals();
1847 ++new_index;
1848 }
1849 }
1850
1851 this->alloc.count = new_index;
1852
1853 /* Patch all the instructions to use the newly renumbered registers */
1854 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1855 if (inst->dst.file == VGRF)
1856 inst->dst.nr = remap_table[inst->dst.nr];
1857
1858 for (int i = 0; i < inst->sources; i++) {
1859 if (inst->src[i].file == VGRF)
1860 inst->src[i].nr = remap_table[inst->src[i].nr];
1861 }
1862 }
1863
1864 /* Patch all the references to delta_xy, since they're used in register
1865 * allocation. If they're unused, switch them to BAD_FILE so we don't
1866 * think some random VGRF is delta_xy.
1867 */
1868 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
1869 if (delta_xy[i].file == VGRF) {
1870 if (remap_table[delta_xy[i].nr] != -1) {
1871 delta_xy[i].nr = remap_table[delta_xy[i].nr];
1872 } else {
1873 delta_xy[i].file = BAD_FILE;
1874 }
1875 }
1876 }
1877
1878 return progress;
1879 }
1880
1881 static void
1882 set_push_pull_constant_loc(unsigned uniform, int *chunk_start,
1883 unsigned *max_chunk_bitsize,
1884 bool contiguous, unsigned bitsize,
1885 const unsigned target_bitsize,
1886 int *push_constant_loc, int *pull_constant_loc,
1887 unsigned *num_push_constants,
1888 unsigned *num_pull_constants,
1889 const unsigned max_push_components,
1890 const unsigned max_chunk_size,
1891 struct brw_stage_prog_data *stage_prog_data)
1892 {
1893 /* This is the first live uniform in the chunk */
1894 if (*chunk_start < 0)
1895 *chunk_start = uniform;
1896
1897 /* Keep track of the maximum bit-size access in the contiguous uniforms */
1898 *max_chunk_bitsize = MAX2(*max_chunk_bitsize, bitsize);
1899
1900 /* If this element does not need to be contiguous with the next, we
1901 * split at this point and everything between *chunk_start and uniform
1902 * forms a single chunk.
1903 */
1904 if (!contiguous) {
1905 /* If bitsize doesn't match the target one, skip it */
1906 if (*max_chunk_bitsize != target_bitsize) {
1907 /* FIXME: right now we only support 32 and 64-bit accesses */
1908 assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8);
1909 *max_chunk_bitsize = 0;
1910 *chunk_start = -1;
1911 return;
1912 }
1913
1914 unsigned chunk_size = uniform - *chunk_start + 1;
1915
1916 /* Decide whether we should push or pull this parameter. In the
1917 * Vulkan driver, push constants are explicitly exposed via the API
1918 * so we push everything. In GL, we only push small arrays.
1919 */
1920 if (stage_prog_data->pull_param == NULL ||
1921 (*num_push_constants + chunk_size <= max_push_components &&
1922 chunk_size <= max_chunk_size)) {
1923 assert(*num_push_constants + chunk_size <= max_push_components);
1924 for (unsigned j = *chunk_start; j <= uniform; j++)
1925 push_constant_loc[j] = (*num_push_constants)++;
1926 } else {
1927 for (unsigned j = *chunk_start; j <= uniform; j++)
1928 pull_constant_loc[j] = (*num_pull_constants)++;
1929 }
1930
1931 *max_chunk_bitsize = 0;
1932 *chunk_start = -1;
1933 }
1934 }
1935
1936 /**
1937 * Assign UNIFORM file registers to either push constants or pull constants.
1938 *
1939 * We allow a fragment shader to have more than the specified minimum
1940 * maximum number of fragment shader uniform components (64). If
1941 * there are too many of these, they'd fill up all of the register space.
1942 * So, this will push some of them out to the pull constant buffer and
1943 * update the program to load them.
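*
* The end result is a per-uniform-slot mapping: each live slot gets either
* a push_constant_loc[] entry (uploaded with the thread payload) or a
* pull_constant_loc[] entry (fetched via a pull constant load), never both.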
1944 */
1945 void
1946 fs_visitor::assign_constant_locations()
1947 {
1948 /* Only the first compile gets to decide on locations. */
1949 if (dispatch_width != min_dispatch_width)
1950 return;
1951
1952 bool is_live[uniforms];
1953 memset(is_live, 0, sizeof(is_live));
1954 unsigned bitsize_access[uniforms];
1955 memset(bitsize_access, 0, sizeof(bitsize_access));
1956
1957 /* For each uniform slot, a value of true indicates that the given slot and
1958 * the next slot must remain contiguous. This is used to keep us from
1959 * splitting arrays apart.
1960 */
1961 bool contiguous[uniforms];
1962 memset(contiguous, 0, sizeof(contiguous));
1963
1964 int thread_local_id_index =
1965 (stage == MESA_SHADER_COMPUTE) ?
1966 brw_cs_prog_data(stage_prog_data)->thread_local_id_index : -1;
1967
1968 /* First, we walk through the instructions and do two things:
1969 *
1970 * 1) Figure out which uniforms are live.
1971 *
1972 * 2) Mark any indirectly used ranges of registers as contiguous.
1973 *
1974 * Note that we don't move constant-indexed accesses to arrays. No
1975 * testing has been done of the performance impact of this choice.
1976 */
1977 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
1978 for (int i = 0 ; i < inst->sources; i++) {
1979 if (inst->src[i].file != UNIFORM)
1980 continue;
1981
1982 int constant_nr = inst->src[i].nr + inst->src[i].offset / 4;
1983
1984 if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
1985 assert(inst->src[2].ud % 4 == 0);
1986 unsigned last = constant_nr + (inst->src[2].ud / 4) - 1;
1987 assert(last < uniforms);
1988
1989 for (unsigned j = constant_nr; j < last; j++) {
1990 is_live[j] = true;
1991 contiguous[j] = true;
1992 bitsize_access[j] = MAX2(bitsize_access[j], type_sz(inst->src[i].type));
1993 }
1994 is_live[last] = true;
1995 bitsize_access[last] = MAX2(bitsize_access[last], type_sz(inst->src[i].type));
1996 } else {
1997 if (constant_nr >= 0 && constant_nr < (int) uniforms) {
1998 int regs_read = inst->components_read(i) *
1999 type_sz(inst->src[i].type) / 4;
2000 for (int j = 0; j < regs_read; j++) {
2001 is_live[constant_nr + j] = true;
2002 bitsize_access[constant_nr + j] =
2003 MAX2(bitsize_access[constant_nr + j], type_sz(inst->src[i].type));
2004 }
2005 }
2006 }
2007 }
2008 }
2009
2010 if (thread_local_id_index >= 0 && !is_live[thread_local_id_index])
2011 thread_local_id_index = -1;
2012
2013 /* Only allow 16 registers (128 uniform components) as push constants.
2014 *
2015 * Just demote the end of the list. We could probably do better
2016 * here, demoting things that are rarely used in the program first.
2017 *
2018 * If changing this value, note the limitation about total_regs in
2019 * brw_curbe.c.
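*
* (16 GRFs x 8 32-bit components per GRF is where the 16 * 8 = 128
* figure below comes from.)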
2020 */
2021 unsigned int max_push_components = 16 * 8;
2022 if (thread_local_id_index >= 0)
2023 max_push_components--; /* Save a slot for the thread ID */
2024
2025 /* We push small arrays, but no bigger than 16 floats. This is big enough
2026 * for a vec4 but hopefully not large enough to push out other stuff. We
2027 * should probably use a better heuristic at some point.
2028 */
2029 const unsigned int max_chunk_size = 16;
2030
2031 unsigned int num_push_constants = 0;
2032 unsigned int num_pull_constants = 0;
2033
2034 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2035 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2036
2037 /* Default to -1 meaning no location */
2038 memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
2039 memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
2040
2041 int chunk_start = -1;
2042 unsigned max_chunk_bitsize = 0;
2043
2044 /* First push 64-bit uniforms to ensure they are properly aligned */
2045 const unsigned uniform_64_bit_size = type_sz(BRW_REGISTER_TYPE_DF);
2046 for (unsigned u = 0; u < uniforms; u++) {
2047 if (!is_live[u])
2048 continue;
2049
2050 set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize,
2051 contiguous[u], bitsize_access[u],
2052 uniform_64_bit_size,
2053 push_constant_loc, pull_constant_loc,
2054 &num_push_constants, &num_pull_constants,
2055 max_push_components, max_chunk_size,
2056 stage_prog_data);
2057
2058 }
2059
2060 /* Then push the rest of uniforms */
2061 const unsigned uniform_32_bit_size = type_sz(BRW_REGISTER_TYPE_F);
2062 for (unsigned u = 0; u < uniforms; u++) {
2063 if (!is_live[u])
2064 continue;
2065
2066 /* Skip thread_local_id_index to put it in the last push register. */
2067 if (thread_local_id_index == (int)u)
2068 continue;
2069
2070 set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize,
2071 contiguous[u], bitsize_access[u],
2072 uniform_32_bit_size,
2073 push_constant_loc, pull_constant_loc,
2074 &num_push_constants, &num_pull_constants,
2075 max_push_components, max_chunk_size,
2076 stage_prog_data);
2077 }
2078
2079 /* Add the CS local thread ID uniform at the end of the push constants */
2080 if (thread_local_id_index >= 0)
2081 push_constant_loc[thread_local_id_index] = num_push_constants++;
2082
2083 /* As the uniforms are going to be reordered, take the data from a temporary
2084 * copy of the original param[].
2085 */
2086 gl_constant_value **param = ralloc_array(NULL, gl_constant_value*,
2087 stage_prog_data->nr_params);
2088 memcpy(param, stage_prog_data->param,
2089 sizeof(gl_constant_value*) * stage_prog_data->nr_params);
2090 stage_prog_data->nr_params = num_push_constants;
2091 stage_prog_data->nr_pull_params = num_pull_constants;
2092
2093 /* Now that we know how many regular uniforms we'll push, reduce the
2094 * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
2095 */
2096 unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
2097 for (int i = 0; i < 4; i++) {
2098 struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
2099
2100 if (push_length + range->length > 64)
2101 range->length = 64 - push_length;
2102
2103 push_length += range->length;
2104 }
2105 assert(push_length <= 64);
2106
2107 /* Up until now, the param[] array has been indexed by reg + offset
2108 * of UNIFORM registers. Move pull constants into pull_param[] and
2109 * condense param[] to only contain the uniforms we chose to push.
2110 *
2111 * NOTE: Because we are condensing the param[] array, we know that
2112 * push_constant_loc[i] <= i and we can do it in one smooth loop without
2113 * having to make a copy.
2114 */
2115 int new_thread_local_id_index = -1;
2116 for (unsigned int i = 0; i < uniforms; i++) {
2117 const gl_constant_value *value = param[i];
2118
2119 if (pull_constant_loc[i] != -1) {
2120 stage_prog_data->pull_param[pull_constant_loc[i]] = value;
2121 } else if (push_constant_loc[i] != -1) {
2122 stage_prog_data->param[push_constant_loc[i]] = value;
2123 if (thread_local_id_index == (int)i)
2124 new_thread_local_id_index = push_constant_loc[i];
2125 }
2126 }
2127 ralloc_free(param);
2128
2129 if (stage == MESA_SHADER_COMPUTE)
2130 brw_cs_prog_data(stage_prog_data)->thread_local_id_index =
2131 new_thread_local_id_index;
2132 }
2133
2134 bool
2135 fs_visitor::get_pull_locs(const fs_reg &src,
2136 unsigned *out_surf_index,
2137 unsigned *out_pull_index)
2138 {
2139 assert(src.file == UNIFORM);
2140
2141 if (src.nr >= UBO_START) {
2142 const struct brw_ubo_range *range =
2143 &prog_data->ubo_ranges[src.nr - UBO_START];
2144
2145 /* If this access is in our (reduced) range, use the push data. */
2146 if (src.offset / 32 < range->length)
2147 return false;
2148
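/* Otherwise fall back to a pull load. range->start is in units of
* 32-byte blocks, so the pull index below is expressed in 32-bit
* components from the start of the surface.
*/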
2149 *out_surf_index = prog_data->binding_table.ubo_start + range->block;
2150 *out_pull_index = (32 * range->start + src.offset) / 4;
2151 return true;
2152 }
2153
2154 const unsigned location = src.nr + src.offset / 4;
2155
2156 if (location < uniforms && pull_constant_loc[location] != -1) {
2157 /* A regular uniform pull constant */
2158 *out_surf_index = stage_prog_data->binding_table.pull_constants_start;
2159 *out_pull_index = pull_constant_loc[location];
2160 return true;
2161 }
2162
2163 return false;
2164 }
2165
2166 /**
2167 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2168 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2169 */
2170 void
2171 fs_visitor::lower_constant_loads()
2172 {
2173 unsigned index, pull_index;
2174
2175 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2176 /* Set up the annotation tracking for newly generated instructions. */
2177 const fs_builder ibld(this, block, inst);
2178
2179 for (int i = 0; i < inst->sources; i++) {
2180 if (inst->src[i].file != UNIFORM)
2181 continue;
2182
2183 /* We'll handle this case later */
2184 if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
2185 continue;
2186
2187 if (!get_pull_locs(inst->src[i], &index, &pull_index))
2188 continue;
2189
2190 assert(inst->src[i].stride == 0);
2191
2192 const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
2193 const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
2194 const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
2195 const unsigned base = pull_index * 4;
2196
2197 ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
2198 dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1)));
2199
2200 /* Rewrite the instruction to use the temporary VGRF. */
2201 inst->src[i].file = VGRF;
2202 inst->src[i].nr = dst.nr;
2203 inst->src[i].offset = (base & (block_sz - 1)) +
2204 inst->src[i].offset % 4;
2205
2206 brw_mark_surface_used(prog_data, index);
2207 }
2208
2209 if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
2210 inst->src[0].file == UNIFORM) {
2211
2212 if (!get_pull_locs(inst->src[0], &index, &pull_index))
2213 continue;
2214
2215 VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
2216 brw_imm_ud(index),
2217 inst->src[1],
2218 pull_index * 4);
2219 inst->remove(block);
2220
2221 brw_mark_surface_used(prog_data, index);
2222 }
2223 }
2224 invalidate_live_intervals();
2225 }
2226
2227 bool
2228 fs_visitor::opt_algebraic()
2229 {
2230 bool progress = false;
2231
2232 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2233 switch (inst->opcode) {
2234 case BRW_OPCODE_MOV:
2235 if (inst->src[0].file != IMM)
2236 break;
2237
2238 if (inst->saturate) {
2239 if (inst->dst.type != inst->src[0].type)
2240 assert(!"unimplemented: saturate mixed types");
2241
2242 if (brw_saturate_immediate(inst->dst.type,
2243 &inst->src[0].as_brw_reg())) {
2244 inst->saturate = false;
2245 progress = true;
2246 }
2247 }
2248 break;
2249
2250 case BRW_OPCODE_MUL:
2251 if (inst->src[1].file != IMM)
2252 continue;
2253
2254 /* a * 1.0 = a */
2255 if (inst->src[1].is_one()) {
2256 inst->opcode = BRW_OPCODE_MOV;
2257 inst->src[1] = reg_undef;
2258 progress = true;
2259 break;
2260 }
2261
2262 /* a * -1.0 = -a */
2263 if (inst->src[1].is_negative_one()) {
2264 inst->opcode = BRW_OPCODE_MOV;
2265 inst->src[0].negate = !inst->src[0].negate;
2266 inst->src[1] = reg_undef;
2267 progress = true;
2268 break;
2269 }
2270
2271 /* a * 0.0 = 0.0 */
2272 if (inst->src[1].is_zero()) {
2273 inst->opcode = BRW_OPCODE_MOV;
2274 inst->src[0] = inst->src[1];
2275 inst->src[1] = reg_undef;
2276 progress = true;
2277 break;
2278 }
2279
2280 if (inst->src[0].file == IMM) {
2281 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2282 inst->opcode = BRW_OPCODE_MOV;
2283 inst->src[0].f *= inst->src[1].f;
2284 inst->src[1] = reg_undef;
2285 progress = true;
2286 break;
2287 }
2288 break;
2289 case BRW_OPCODE_ADD:
2290 if (inst->src[1].file != IMM)
2291 continue;
2292
2293 /* a + 0.0 = a */
2294 if (inst->src[1].is_zero()) {
2295 inst->opcode = BRW_OPCODE_MOV;
2296 inst->src[1] = reg_undef;
2297 progress = true;
2298 break;
2299 }
2300
2301 if (inst->src[0].file == IMM) {
2302 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2303 inst->opcode = BRW_OPCODE_MOV;
2304 inst->src[0].f += inst->src[1].f;
2305 inst->src[1] = reg_undef;
2306 progress = true;
2307 break;
2308 }
2309 break;
2310 case BRW_OPCODE_OR:
2311 if (inst->src[0].equals(inst->src[1])) {
2312 inst->opcode = BRW_OPCODE_MOV;
2313 inst->src[1] = reg_undef;
2314 progress = true;
2315 break;
2316 }
2317 break;
2318 case BRW_OPCODE_LRP:
2319 if (inst->src[1].equals(inst->src[2])) {
2320 inst->opcode = BRW_OPCODE_MOV;
2321 inst->src[0] = inst->src[1];
2322 inst->src[1] = reg_undef;
2323 inst->src[2] = reg_undef;
2324 progress = true;
2325 break;
2326 }
2327 break;
2328 case BRW_OPCODE_CMP:
2329 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2330 inst->src[0].abs &&
2331 inst->src[0].negate &&
2332 inst->src[1].is_zero()) {
2333 inst->src[0].abs = false;
2334 inst->src[0].negate = false;
2335 inst->conditional_mod = BRW_CONDITIONAL_Z;
2336 progress = true;
2337 break;
2338 }
2339 break;
2340 case BRW_OPCODE_SEL:
2341 if (inst->src[0].equals(inst->src[1])) {
2342 inst->opcode = BRW_OPCODE_MOV;
2343 inst->src[1] = reg_undef;
2344 inst->predicate = BRW_PREDICATE_NONE;
2345 inst->predicate_inverse = false;
2346 progress = true;
2347 } else if (inst->saturate && inst->src[1].file == IMM) {
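/* With saturate, a SEL.L/LE against an immediate >= 1.0 (or a SEL.G/GE
* against an immediate <= 0.0) is a redundant clamp: the result is just
* saturate(src0), so the SEL can be turned into a saturating MOV.
*/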
2348 switch (inst->conditional_mod) {
2349 case BRW_CONDITIONAL_LE:
2350 case BRW_CONDITIONAL_L:
2351 switch (inst->src[1].type) {
2352 case BRW_REGISTER_TYPE_F:
2353 if (inst->src[1].f >= 1.0f) {
2354 inst->opcode = BRW_OPCODE_MOV;
2355 inst->src[1] = reg_undef;
2356 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2357 progress = true;
2358 }
2359 break;
2360 default:
2361 break;
2362 }
2363 break;
2364 case BRW_CONDITIONAL_GE:
2365 case BRW_CONDITIONAL_G:
2366 switch (inst->src[1].type) {
2367 case BRW_REGISTER_TYPE_F:
2368 if (inst->src[1].f <= 0.0f) {
2369 inst->opcode = BRW_OPCODE_MOV;
2370 inst->src[1] = reg_undef;
2371 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2372 progress = true;
2373 }
2374 break;
2375 default:
2376 break;
2377 }
2378 default:
2379 break;
2380 }
2381 }
2382 break;
2383 case BRW_OPCODE_MAD:
2384 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2385 inst->opcode = BRW_OPCODE_MOV;
2386 inst->src[1] = reg_undef;
2387 inst->src[2] = reg_undef;
2388 progress = true;
2389 } else if (inst->src[0].is_zero()) {
2390 inst->opcode = BRW_OPCODE_MUL;
2391 inst->src[0] = inst->src[2];
2392 inst->src[2] = reg_undef;
2393 progress = true;
2394 } else if (inst->src[1].is_one()) {
2395 inst->opcode = BRW_OPCODE_ADD;
2396 inst->src[1] = inst->src[2];
2397 inst->src[2] = reg_undef;
2398 progress = true;
2399 } else if (inst->src[2].is_one()) {
2400 inst->opcode = BRW_OPCODE_ADD;
2401 inst->src[2] = reg_undef;
2402 progress = true;
2403 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2404 inst->opcode = BRW_OPCODE_ADD;
2405 inst->src[1].f *= inst->src[2].f;
2406 inst->src[2] = reg_undef;
2407 progress = true;
2408 }
2409 break;
2410 case SHADER_OPCODE_BROADCAST:
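/* A BROADCAST whose value is already uniform, or whose channel index is
* an immediate, reduces to a force_writemask_all MOV of the selected
* component.
*/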
2411 if (is_uniform(inst->src[0])) {
2412 inst->opcode = BRW_OPCODE_MOV;
2413 inst->sources = 1;
2414 inst->force_writemask_all = true;
2415 progress = true;
2416 } else if (inst->src[1].file == IMM) {
2417 inst->opcode = BRW_OPCODE_MOV;
2418 inst->src[0] = component(inst->src[0],
2419 inst->src[1].ud);
2420 inst->sources = 1;
2421 inst->force_writemask_all = true;
2422 progress = true;
2423 }
2424 break;
2425
2426 default:
2427 break;
2428 }
2429
2430 /* Swap if src[0] is immediate. */
2431 if (progress && inst->is_commutative()) {
2432 if (inst->src[0].file == IMM) {
2433 fs_reg tmp = inst->src[1];
2434 inst->src[1] = inst->src[0];
2435 inst->src[0] = tmp;
2436 }
2437 }
2438 }
2439 return progress;
2440 }
2441
2442 /**
2443 * Optimize sample messages that have constant zero values for the trailing
2444 * texture coordinates. We can just reduce the message length for these
2445 * instructions instead of reserving a register for it. Trailing parameters
2446 * that aren't sent default to zero anyway. This will cause the dead code
2447 * eliminator to remove the MOV instruction that would otherwise be emitted to
2448 * set up the zero value.
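*
* For example, a sample_l message whose LOD argument is a compile-time
* 0.0 can simply drop that trailing payload parameter and shrink mlen
* accordingly.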
2449 */
2450 bool
2451 fs_visitor::opt_zero_samples()
2452 {
2453 /* Gen4 infers the texturing opcode based on the message length so we can't
2454 * change it.
2455 */
2456 if (devinfo->gen < 5)
2457 return false;
2458
2459 bool progress = false;
2460
2461 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2462 if (!inst->is_tex())
2463 continue;
2464
2465 fs_inst *load_payload = (fs_inst *) inst->prev;
2466
2467 if (load_payload->is_head_sentinel() ||
2468 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2469 continue;
2470
2471 /* We don't want to remove the message header or the first parameter.
2472 * Removing the first parameter is not allowed, see the Haswell PRM
2473 * volume 7, page 149:
2474 *
2475 * "Parameter 0 is required except for the sampleinfo message, which
2476 * has no parameter 0"
2477 */
2478 while (inst->mlen > inst->header_size + inst->exec_size / 8 &&
2479 load_payload->src[(inst->mlen - inst->header_size) /
2480 (inst->exec_size / 8) +
2481 inst->header_size - 1].is_zero()) {
2482 inst->mlen -= inst->exec_size / 8;
2483 progress = true;
2484 }
2485 }
2486
2487 if (progress)
2488 invalidate_live_intervals();
2489
2490 return progress;
2491 }
2492
2493 /**
2494 * Optimize sample messages which are followed by the final RT write.
2495 *
2496 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2497 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2498 * final texturing results copied to the framebuffer write payload and modify
2499 * them to write to the framebuffer directly.
2500 */
2501 bool
2502 fs_visitor::opt_sampler_eot()
2503 {
2504 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2505
2506 if (stage != MESA_SHADER_FRAGMENT)
2507 return false;
2508
2509 if (devinfo->gen != 9 && !devinfo->is_cherryview)
2510 return false;
2511
2512 /* FINISHME: It should be possible to implement this optimization when there
2513 * are multiple drawbuffers.
2514 */
2515 if (key->nr_color_regions != 1)
2516 return false;
2517
2518 /* Requires emitting a bunch of saturating MOV instructions during logical
2519 * send lowering to clamp the color payload, which the sampler unit isn't
2520 * going to do for us.
2521 */
2522 if (key->clamp_fragment_color)
2523 return false;
2524
2525 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2526 bblock_t *block = cfg->blocks[cfg->num_blocks - 1];
2527 fs_inst *fb_write = (fs_inst *)block->end();
2528 assert(fb_write->eot);
2529 assert(fb_write->opcode == FS_OPCODE_FB_WRITE_LOGICAL);
2530
2531 /* There wasn't one; nothing to do. */
2532 if (unlikely(fb_write->prev->is_head_sentinel()))
2533 return false;
2534
2535 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2536
2537 /* 3D Sampler » Messages » Message Format
2538 *
2539 * “Response Length of zero is allowed on all SIMD8* and SIMD16* sampler
2540 * messages except sample+killpix, resinfo, sampleinfo, LOD, and gather4*”
2541 */
2542 if (tex_inst->opcode != SHADER_OPCODE_TEX_LOGICAL &&
2543 tex_inst->opcode != SHADER_OPCODE_TXD_LOGICAL &&
2544 tex_inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&
2545 tex_inst->opcode != SHADER_OPCODE_TXL_LOGICAL &&
2546 tex_inst->opcode != FS_OPCODE_TXB_LOGICAL &&
2547 tex_inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL &&
2548 tex_inst->opcode != SHADER_OPCODE_TXF_CMS_W_LOGICAL &&
2549 tex_inst->opcode != SHADER_OPCODE_TXF_UMS_LOGICAL)
2550 return false;
2551
2552 /* XXX - This shouldn't be necessary. */
2553 if (tex_inst->prev->is_head_sentinel())
2554 return false;
2555
2556 /* Check that the FB write sources are fully initialized by the single
2557 * texturing instruction.
2558 */
2559 for (unsigned i = 0; i < FB_WRITE_LOGICAL_NUM_SRCS; i++) {
2560 if (i == FB_WRITE_LOGICAL_SRC_COLOR0) {
2561 if (!fb_write->src[i].equals(tex_inst->dst) ||
2562 fb_write->size_read(i) != tex_inst->size_written)
2563 return false;
2564 } else if (i != FB_WRITE_LOGICAL_SRC_COMPONENTS) {
2565 if (fb_write->src[i].file != BAD_FILE)
2566 return false;
2567 }
2568 }
2569
2570 assert(!tex_inst->eot); /* We can't get here twice */
2571 assert((tex_inst->offset & (0xff << 24)) == 0);
2572
2573 const fs_builder ibld(this, block, tex_inst);
2574
2575 tex_inst->offset |= fb_write->target << 24;
2576 tex_inst->eot = true;
2577 tex_inst->dst = ibld.null_reg_ud();
2578 tex_inst->size_written = 0;
2579 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2580
2581 /* Marking EOT is sufficient; lower_logical_sends() will notice the EOT
2582 * flag and submit a header together with the sampler message as required
2583 * by the hardware.
2584 */
2585 invalidate_live_intervals();
2586 return true;
2587 }
2588
2589 bool
2590 fs_visitor::opt_register_renaming()
2591 {
2592 bool progress = false;
2593 int depth = 0;
2594
2595 int remap[alloc.count];
2596 memset(remap, -1, sizeof(int) * alloc.count);
2597
2598 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2599 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2600 depth++;
2601 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2602 inst->opcode == BRW_OPCODE_WHILE) {
2603 depth--;
2604 }
2605
2606 /* Rewrite instruction sources. */
2607 for (int i = 0; i < inst->sources; i++) {
2608 if (inst->src[i].file == VGRF &&
2609 remap[inst->src[i].nr] != -1 &&
2610 remap[inst->src[i].nr] != inst->src[i].nr) {
2611 inst->src[i].nr = remap[inst->src[i].nr];
2612 progress = true;
2613 }
2614 }
2615
2616 const int dst = inst->dst.nr;
2617
2618 if (depth == 0 &&
2619 inst->dst.file == VGRF &&
2620 alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written &&
2621 !inst->is_partial_write()) {
2622 if (remap[dst] == -1) {
2623 remap[dst] = dst;
2624 } else {
2625 remap[dst] = alloc.allocate(regs_written(inst));
2626 inst->dst.nr = remap[dst];
2627 progress = true;
2628 }
2629 } else if (inst->dst.file == VGRF &&
2630 remap[dst] != -1 &&
2631 remap[dst] != dst) {
2632 inst->dst.nr = remap[dst];
2633 progress = true;
2634 }
2635 }
2636
2637 if (progress) {
2638 invalidate_live_intervals();
2639
2640 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2641 if (delta_xy[i].file == VGRF && remap[delta_xy[i].nr] != -1) {
2642 delta_xy[i].nr = remap[delta_xy[i].nr];
2643 }
2644 }
2645 }
2646
2647 return progress;
2648 }
2649
2650 /**
2651 * Remove redundant or useless discard jumps.
2652 *
2653 * For example, we can eliminate jumps in the following sequence:
2654 *
2655 * discard-jump (redundant with the next jump)
2656 * discard-jump (useless; jumps to the next instruction)
2657 * placeholder-halt
2658 */
2659 bool
2660 fs_visitor::opt_redundant_discard_jumps()
2661 {
2662 bool progress = false;
2663
2664 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2665
2666 fs_inst *placeholder_halt = NULL;
2667 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2668 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2669 placeholder_halt = inst;
2670 break;
2671 }
2672 }
2673
2674 if (!placeholder_halt)
2675 return false;
2676
2677 /* Delete any HALTs immediately before the placeholder halt. */
2678 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2679 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2680 prev = (fs_inst *) placeholder_halt->prev) {
2681 prev->remove(last_bblock);
2682 progress = true;
2683 }
2684
2685 if (progress)
2686 invalidate_live_intervals();
2687
2688 return progress;
2689 }
2690
2691 /**
2692 * Compute a bitmask with GRF granularity with a bit set for each GRF starting
2693 * from \p r.offset which overlaps the region starting at \p s.offset and
2694 * spanning \p ds bytes.
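*
* For example, with 32-byte GRFs: if \p s starts 40 bytes past \p r and
* \p ds is 48 bytes, then shift is 1 and the region touches two GRFs,
* yielding the mask 0b110.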
2695 */
2696 static inline unsigned
2697 mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds)
2698 {
2699 const int rel_offset = reg_offset(s) - reg_offset(r);
2700 const int shift = rel_offset / REG_SIZE;
2701 const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE);
2702 assert(reg_space(r) == reg_space(s) &&
2703 shift >= 0 && shift < int(8 * sizeof(unsigned)));
2704 return ((1 << n) - 1) << shift;
2705 }
2706
2707 bool
2708 fs_visitor::compute_to_mrf()
2709 {
2710 bool progress = false;
2711 int next_ip = 0;
2712
2713 /* No MRFs on Gen >= 7. */
2714 if (devinfo->gen >= 7)
2715 return false;
2716
2717 calculate_live_intervals();
2718
2719 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2720 int ip = next_ip;
2721 next_ip++;
2722
2723 if (inst->opcode != BRW_OPCODE_MOV ||
2724 inst->is_partial_write() ||
2725 inst->dst.file != MRF || inst->src[0].file != VGRF ||
2726 inst->dst.type != inst->src[0].type ||
2727 inst->src[0].abs || inst->src[0].negate ||
2728 !inst->src[0].is_contiguous() ||
2729 inst->src[0].offset % REG_SIZE != 0)
2730 continue;
2731
2732 /* Can't compute-to-MRF this GRF if someone else was going to
2733 * read it later.
2734 */
2735 if (this->virtual_grf_end[inst->src[0].nr] > ip)
2736 continue;
2737
2738 * Found a move of a GRF to an MRF. Let's see if we can go rewrite the
2739 * things that computed the value of all GRFs of the source region. The
2740 * regs_left bitset keeps track of the registers we haven't yet found a
2741 * generating instruction for.
2742 */
2743 unsigned regs_left = (1 << regs_read(inst, 0)) - 1;
2744
2745 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
2746 if (regions_overlap(scan_inst->dst, scan_inst->size_written,
2747 inst->src[0], inst->size_read(0))) {
2748 /* Found the last thing to write our reg we want to turn
2749 * into a compute-to-MRF.
2750 */
2751
2752 /* If this one instruction didn't populate all the
2753 * channels, bail. We might be able to rewrite everything
2754 * that writes that reg, but it would require smarter
2755 * tracking.
2756 */
2757 if (scan_inst->is_partial_write())
2758 break;
2759
2760 /* Handling things not fully contained in the source of the copy
2761 * would need us to understand coalescing out more than one MOV at
2762 * a time.
2763 */
2764 if (!region_contained_in(scan_inst->dst, scan_inst->size_written,
2765 inst->src[0], inst->size_read(0)))
2766 break;
2767
2768 /* SEND instructions can't have MRF as a destination. */
2769 if (scan_inst->mlen)
2770 break;
2771
2772 if (devinfo->gen == 6) {
2773 /* gen6 math instructions must have the destination be
2774 * GRF, so no compute-to-MRF for them.
2775 */
2776 if (scan_inst->is_math()) {
2777 break;
2778 }
2779 }
2780
2781 /* Clear the bits for any registers this instruction overwrites. */
2782 regs_left &= ~mask_relative_to(
2783 inst->src[0], scan_inst->dst, scan_inst->size_written);
2784 if (!regs_left)
2785 break;
2786 }
2787
2788 /* We don't handle control flow here. Most computation of
2789 * values that end up in MRFs are shortly before the MRF
2790 * write anyway.
2791 */
2792 if (block->start() == scan_inst)
2793 break;
2794
2795 /* You can't read from an MRF, so if someone else reads our
2796 * MRF's source GRF that we wanted to rewrite, that stops us.
2797 */
2798 bool interfered = false;
2799 for (int i = 0; i < scan_inst->sources; i++) {
2800 if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i),
2801 inst->src[0], inst->size_read(0))) {
2802 interfered = true;
2803 }
2804 }
2805 if (interfered)
2806 break;
2807
2808 if (regions_overlap(scan_inst->dst, scan_inst->size_written,
2809 inst->dst, inst->size_written)) {
2810 /* If somebody else writes our MRF here, we can't
2811 * compute-to-MRF before that.
2812 */
2813 break;
2814 }
2815
2816 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 &&
2817 regions_overlap(fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE,
2818 inst->dst, inst->size_written)) {
2819 /* Found a SEND instruction, which means that there are
2820 * live values in MRFs from base_mrf to base_mrf +
2821 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2822 * above it.
2823 */
2824 break;
2825 }
2826 }
2827
2828 if (regs_left)
2829 continue;
2830
2831 /* Found all generating instructions of our MRF's source value, so it
2832 * should be safe to rewrite them to point to the MRF directly.
2833 */
2834 regs_left = (1 << regs_read(inst, 0)) - 1;
2835
2836 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
2837 if (regions_overlap(scan_inst->dst, scan_inst->size_written,
2838 inst->src[0], inst->size_read(0))) {
2839 /* Clear the bits for any registers this instruction overwrites. */
2840 regs_left &= ~mask_relative_to(
2841 inst->src[0], scan_inst->dst, scan_inst->size_written);
2842
2843 const unsigned rel_offset = reg_offset(scan_inst->dst) -
2844 reg_offset(inst->src[0]);
2845
2846 if (inst->dst.nr & BRW_MRF_COMPR4) {
2847 /* Apply the same address transformation done by the hardware
2848 * for COMPR4 MRF writes.
2849 */
2850 assert(rel_offset < 2 * REG_SIZE);
2851 scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4;
2852
2853 /* Clear the COMPR4 bit if the generating instruction is not
2854 * compressed.
2855 */
2856 if (scan_inst->size_written < 2 * REG_SIZE)
2857 scan_inst->dst.nr &= ~BRW_MRF_COMPR4;
2858
2859 } else {
2860 /* Calculate the MRF number the result of this instruction is
2861 * ultimately written to.
2862 */
2863 scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE;
2864 }
2865
2866 scan_inst->dst.file = MRF;
2867 scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE;
2868 scan_inst->saturate |= inst->saturate;
2869 if (!regs_left)
2870 break;
2871 }
2872 }
2873
2874 assert(!regs_left);
2875 inst->remove(block);
2876 progress = true;
2877 }
2878
2879 if (progress)
2880 invalidate_live_intervals();
2881
2882 return progress;
2883 }
2884
2885 /**
2886 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2887 * flow. We could probably do better here with some form of divergence
2888 * analysis.
2889 */
2890 bool
2891 fs_visitor::eliminate_find_live_channel()
2892 {
2893 bool progress = false;
2894 unsigned depth = 0;
2895
2896 if (!brw_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
2897 /* The optimization below assumes that channel zero is live on thread
2898 * dispatch, which may not be the case if the fixed function dispatches
2899 * threads sparsely.
2900 */
2901 return false;
2902 }
2903
2904 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2905 switch (inst->opcode) {
2906 case BRW_OPCODE_IF:
2907 case BRW_OPCODE_DO:
2908 depth++;
2909 break;
2910
2911 case BRW_OPCODE_ENDIF:
2912 case BRW_OPCODE_WHILE:
2913 depth--;
2914 break;
2915
2916 case FS_OPCODE_DISCARD_JUMP:
2917 /* This can potentially make control flow non-uniform until the end
2918 * of the program.
2919 */
2920 return progress;
2921
2922 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
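/* Outside of control flow (depth == 0) channel 0 is known to be live
* thanks to the packed-dispatch check above, so the search trivially
* resolves to the constant 0.
*/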
2923 if (depth == 0) {
2924 inst->opcode = BRW_OPCODE_MOV;
2925 inst->src[0] = brw_imm_ud(0u);
2926 inst->sources = 1;
2927 inst->force_writemask_all = true;
2928 progress = true;
2929 }
2930 break;
2931
2932 default:
2933 break;
2934 }
2935 }
2936
2937 return progress;
2938 }
2939
2940 /**
2941 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2942 * instructions to FS_OPCODE_REP_FB_WRITE.
2943 */
2944 void
2945 fs_visitor::emit_repclear_shader()
2946 {
2947 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2948 int base_mrf = 0;
2949 int color_mrf = base_mrf + 2;
2950 fs_inst *mov;
2951
2952 if (uniforms > 0) {
2953 mov = bld.exec_all().group(4, 0)
2954 .MOV(brw_message_reg(color_mrf),
2955 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
2956 } else {
2957 struct brw_reg reg =
2958 brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_F,
2959 BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
2960 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2961
2962 mov = bld.exec_all().group(4, 0)
2963 .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg));
2964 }
2965
2966 fs_inst *write;
2967 if (key->nr_color_regions == 1) {
2968 write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2969 write->saturate = key->clamp_fragment_color;
2970 write->base_mrf = color_mrf;
2971 write->target = 0;
2972 write->header_size = 0;
2973 write->mlen = 1;
2974 } else {
2975 assume(key->nr_color_regions > 0);
2976 for (int i = 0; i < key->nr_color_regions; ++i) {
2977 write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2978 write->saturate = key->clamp_fragment_color;
2979 write->base_mrf = base_mrf;
2980 write->target = i;
2981 write->header_size = 2;
2982 write->mlen = 3;
2983 }
2984 }
2985 write->eot = true;
2986
2987 calculate_cfg();
2988
2989 assign_constant_locations();
2990 assign_curb_setup();
2991
2992 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2993 if (uniforms > 0) {
2994 assert(mov->src[0].file == FIXED_GRF);
2995 mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
2996 }
2997 }
2998
2999 /**
3000 * Walks through basic blocks, looking for repeated MRF writes and
3001 * removing the later ones.
3002 */
3003 bool
3004 fs_visitor::remove_duplicate_mrf_writes()
3005 {
3006 fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->gen)];
3007 bool progress = false;
3008
3009 /* Need to update the MRF tracking for compressed instructions. */
3010 if (dispatch_width >= 16)
3011 return false;
3012
3013 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3014
3015 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3016 if (inst->is_control_flow()) {
3017 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3018 }
3019
3020 if (inst->opcode == BRW_OPCODE_MOV &&
3021 inst->dst.file == MRF) {
3022 fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
3023 if (prev_inst && inst->equals(prev_inst)) {
3024 inst->remove(block);
3025 progress = true;
3026 continue;
3027 }
3028 }
3029
3030 /* Clear out the last-write records for MRFs that were overwritten. */
3031 if (inst->dst.file == MRF) {
3032 last_mrf_move[inst->dst.nr] = NULL;
3033 }
3034
3035 if (inst->mlen > 0 && inst->base_mrf != -1) {
3036 /* Found a SEND instruction, which will include two or fewer
3037 * implied MRF writes. We could do better here.
3038 */
3039 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3040 last_mrf_move[inst->base_mrf + i] = NULL;
3041 }
3042 }
3043
3044 /* Clear out any MRF move records whose sources got overwritten. */
3045 for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3046 if (last_mrf_move[i] &&
3047 regions_overlap(inst->dst, inst->size_written,
3048 last_mrf_move[i]->src[0],
3049 last_mrf_move[i]->size_read(0))) {
3050 last_mrf_move[i] = NULL;
3051 }
3052 }
3053
3054 if (inst->opcode == BRW_OPCODE_MOV &&
3055 inst->dst.file == MRF &&
3056 inst->src[0].file != ARF &&
3057 !inst->is_partial_write()) {
3058 last_mrf_move[inst->dst.nr] = inst;
3059 }
3060 }
3061
3062 if (progress)
3063 invalidate_live_intervals();
3064
3065 return progress;
3066 }
3067
3068 static void
3069 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3070 {
3071 /* Clear the flag for registers that actually got read (as expected). */
3072 for (int i = 0; i < inst->sources; i++) {
3073 int grf;
3074 if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) {
3075 grf = inst->src[i].nr;
3076 } else {
3077 continue;
3078 }
3079
3080 if (grf >= first_grf &&
3081 grf < first_grf + grf_len) {
3082 deps[grf - first_grf] = false;
3083 if (inst->exec_size == 16)
3084 deps[grf - first_grf + 1] = false;
3085 }
3086 }
3087 }
3088
3089 /**
3090 * Implements this workaround for the original 965:
3091 *
3092 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3093 * check for post destination dependencies on this instruction, software
3094 * must ensure that there is no destination hazard for the case of ‘write
3095 * followed by a posted write’ shown in the following example.
3096 *
3097 * 1. mov r3 0
3098 * 2. send r3.xy <rest of send instruction>
3099 * 3. mov r2 r3
3100 *
3101 * Due to no post-destination dependency check on the ‘send’, the above
3102 * code sequence could have two instructions (1 and 2) in flight at the
3103 * same time that both consider ‘r3’ as the target of their final writes.
3104 */
3105 void
3106 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3107 fs_inst *inst)
3108 {
3109 int write_len = regs_written(inst);
3110 int first_write_grf = inst->dst.nr;
3111 bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
3112 assert(write_len < (int)sizeof(needs_dep) - 1);
3113
3114 memset(needs_dep, false, sizeof(needs_dep));
3115 memset(needs_dep, true, write_len);
3116
3117 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3118
3119 /* Walk backwards looking for writes to registers we're writing which
3120 * aren't read since being written. If we hit the start of the program,
3121 * we assume that there are no outstanding dependencies on entry to the
3122 * program.
3123 */
3124 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
3125 /* If we hit control flow, assume that there *are* outstanding
3126 * dependencies, and force their cleanup before our instruction.
3127 */
3128 if (block->start() == scan_inst && block->num != 0) {
3129 for (int i = 0; i < write_len; i++) {
3130 if (needs_dep[i])
3131 DEP_RESOLVE_MOV(fs_builder(this, block, inst),
3132 first_write_grf + i);
3133 }
3134 return;
3135 }
3136
3137 /* We insert our reads as late as possible on the assumption that any
3138 * instruction but a MOV that might have left us an outstanding
3139 * dependency has more latency than a MOV.
3140 */
3141 if (scan_inst->dst.file == VGRF) {
3142 for (unsigned i = 0; i < regs_written(scan_inst); i++) {
3143 int reg = scan_inst->dst.nr + i;
3144
3145 if (reg >= first_write_grf &&
3146 reg < first_write_grf + write_len &&
3147 needs_dep[reg - first_write_grf]) {
3148 DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
3149 needs_dep[reg - first_write_grf] = false;
3150 if (scan_inst->exec_size == 16)
3151 needs_dep[reg - first_write_grf + 1] = false;
3152 }
3153 }
3154 }
3155
3156 /* Clear the flag for registers that actually got read (as expected). */
3157 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3158
3159 /* Continue the loop only if we haven't resolved all the dependencies */
3160 int i;
3161 for (i = 0; i < write_len; i++) {
3162 if (needs_dep[i])
3163 break;
3164 }
3165 if (i == write_len)
3166 return;
3167 }
3168 }
3169
3170 /**
3171 * Implements this workaround for the original 965:
3172 *
3173 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3174 * used as a destination register until after it has been sourced by an
3175 * instruction with a different destination register."
3176 */
3177 void
3178 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3179 {
3180 int write_len = regs_written(inst);
3181 int first_write_grf = inst->dst.nr;
3182 bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
3183 assert(write_len < (int)sizeof(needs_dep) - 1);
3184
3185 memset(needs_dep, false, sizeof(needs_dep));
3186 memset(needs_dep, true, write_len);
3187 /* Walk forwards looking for writes to registers we're writing which aren't
3188 * read before being written.
3189 */
3190 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) {
3191 /* If we hit control flow, force resolve all remaining dependencies. */
3192 if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) {
3193 for (int i = 0; i < write_len; i++) {
3194 if (needs_dep[i])
3195 DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
3196 first_write_grf + i);
3197 }
3198 return;
3199 }
3200
3201 /* Clear the flag for registers that actually got read (as expected). */
3202 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3203
3204 /* We insert our reads as late as possible since they're reading the
3205 * result of a SEND, which has massive latency.
3206 */
3207 if (scan_inst->dst.file == VGRF &&
3208 scan_inst->dst.nr >= first_write_grf &&
3209 scan_inst->dst.nr < first_write_grf + write_len &&
3210 needs_dep[scan_inst->dst.nr - first_write_grf]) {
3211 DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
3212 scan_inst->dst.nr);
3213 needs_dep[scan_inst->dst.nr - first_write_grf] = false;
3214 }
3215
3216 /* Continue the loop only if we haven't resolved all the dependencies */
3217 int i;
3218 for (i = 0; i < write_len; i++) {
3219 if (needs_dep[i])
3220 break;
3221 }
3222 if (i == write_len)
3223 return;
3224 }
3225 }
3226
3227 void
3228 fs_visitor::insert_gen4_send_dependency_workarounds()
3229 {
3230 if (devinfo->gen != 4 || devinfo->is_g4x)
3231 return;
3232
3233 bool progress = false;
3234
3235 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3236 if (inst->mlen != 0 && inst->dst.file == VGRF) {
3237 insert_gen4_pre_send_dependency_workarounds(block, inst);
3238 insert_gen4_post_send_dependency_workarounds(block, inst);
3239 progress = true;
3240 }
3241 }
3242
3243 if (progress)
3244 invalidate_live_intervals();
3245 }
3246
3247 /**
3248 * Turns the generic expression-style uniform pull constant load instruction
3249 * into a hardware-specific series of instructions for loading a pull
3250 * constant.
3251 *
3252 * The expression style allows the CSE pass before this to optimize out
3253 * repeated loads from the same offset, and gives the pre-register-allocation
3254 * scheduling full flexibility, while the conversion to native instructions
3255 * allows the post-register-allocation scheduler the best information
3256 * possible.
3257 *
3258 * Note that execution masking for setting up pull constant loads is special:
3259 * the channels that need to be written are unrelated to the current execution
3260 * mask, since a later instruction will use one of the result channels as a
3261 * source operand for all 8 or 16 of its channels.
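*
* On Gen7+ the load is rewritten into a single-register header (a copy of
* g0 with the constant buffer offset in DWord 2) feeding the GEN7 variant
* of the opcode; on older hardware it instead goes out through a fixed MRF.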
3262 */
3263 void
3264 fs_visitor::lower_uniform_pull_constant_loads()
3265 {
3266 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3267 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3268 continue;
3269
3270 if (devinfo->gen >= 7) {
3271 const fs_builder ubld = fs_builder(this, block, inst).exec_all();
3272 const fs_reg payload = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD);
3273
3274 ubld.group(8, 0).MOV(payload,
3275 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
3276 ubld.group(1, 0).MOV(component(payload, 2),
3277 brw_imm_ud(inst->src[1].ud / 16));
3278
3279 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3280 inst->src[1] = payload;
3281 inst->header_size = 1;
3282 inst->mlen = 1;
3283
3284 invalidate_live_intervals();
3285 } else {
3286 /* Before register allocation, we didn't tell the scheduler about the
3287 * MRF we use. We know it's safe to use this MRF because nothing
3288 * else does except for register spill/unspill, which generates and
3289 * uses its MRF within a single IR instruction.
3290 */
3291 inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
3292 inst->mlen = 1;
3293 }
3294 }
3295 }
3296
3297 bool
3298 fs_visitor::lower_load_payload()
3299 {
3300 bool progress = false;
3301
3302 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3303 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3304 continue;
3305
3306 assert(inst->dst.file == MRF || inst->dst.file == VGRF);
3307 assert(inst->saturate == false);
3308 fs_reg dst = inst->dst;
3309
3310 /* Get rid of COMPR4. We'll add it back in if we need it */
3311 if (dst.file == MRF)
3312 dst.nr = dst.nr & ~BRW_MRF_COMPR4;
3313
3314 const fs_builder ibld(this, block, inst);
3315 const fs_builder hbld = ibld.exec_all().group(8, 0);
3316
3317 for (uint8_t i = 0; i < inst->header_size; i++) {
3318 if (inst->src[i].file != BAD_FILE) {
3319 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3320 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3321 hbld.MOV(mov_dst, mov_src);
3322 }
3323 dst = offset(dst, hbld, 1);
3324 }
3325
3326 if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) &&
3327 inst->exec_size > 8) {
3328 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3329 * a straightforward copy. Instead, the result of the
3330 * LOAD_PAYLOAD is treated as interleaved and the first four
3331 * non-header sources are unpacked as:
3332 *
3333 * m + 0: r0
3334 * m + 1: g0
3335 * m + 2: b0
3336 * m + 3: a0
3337 * m + 4: r1
3338 * m + 5: g1
3339 * m + 6: b1
3340 * m + 7: a1
3341 *
3342 * This is used for gen <= 5 fb writes.
3343 */
3344 assert(inst->exec_size == 16);
3345 assert(inst->header_size + 4 <= inst->sources);
3346 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3347 if (inst->src[i].file != BAD_FILE) {
3348 if (devinfo->has_compr4) {
3349 fs_reg compr4_dst = retype(dst, inst->src[i].type);
3350 compr4_dst.nr |= BRW_MRF_COMPR4;
3351 ibld.MOV(compr4_dst, inst->src[i]);
3352 } else {
3353 /* Platform doesn't have COMPR4. We have to fake it */
3354 fs_reg mov_dst = retype(dst, inst->src[i].type);
3355 ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
3356 mov_dst.nr += 4;
3357 ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
3358 }
3359 }
3360
3361 dst.nr++;
3362 }
3363
3364 /* The loop above only ever incremented us through the first set
3365 * of 4 registers. However, thanks to the magic of COMPR4, we
3366 * actually wrote to the first 8 registers, so we need to take
3367 * that into account now.
3368 */
3369 dst.nr += 4;
3370
3371 /* The COMPR4 code took care of the first 4 sources. We'll let
3372 * the regular path handle any remaining sources. Yes, we are
3373 * modifying the instruction but we're about to delete it so
3374 * this really doesn't hurt anything.
3375 */
3376 inst->header_size += 4;
3377 }
3378
3379 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3380 if (inst->src[i].file != BAD_FILE)
3381 ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
3382 dst = offset(dst, ibld, 1);
3383 }
3384
3385 inst->remove(block);
3386 progress = true;
3387 }
3388
3389 if (progress)
3390 invalidate_live_intervals();
3391
3392 return progress;
3393 }
3394
3395 bool
3396 fs_visitor::lower_integer_multiplication()
3397 {
3398 bool progress = false;
3399
3400 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3401 const fs_builder ibld(this, block, inst);
3402
3403 if (inst->opcode == BRW_OPCODE_MUL) {
3404 if (inst->dst.is_accumulator() ||
3405 (inst->dst.type != BRW_REGISTER_TYPE_D &&
3406 inst->dst.type != BRW_REGISTER_TYPE_UD))
3407 continue;
3408
3409 /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit
3410 * operation directly, but CHV/BXT cannot.
3411 */
3412 if (devinfo->gen >= 8 &&
3413 !devinfo->is_cherryview && !gen_device_info_is_9lp(devinfo))
3414 continue;
3415
3416 if (inst->src[1].file == IMM &&
3417 inst->src[1].ud < (1 << 16)) {
3418 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3419 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3420 * src1 are used.
3421 *
3422 * If multiplying by an immediate value that fits in 16-bits, do a
3423 * single MUL instruction with that value in the proper location.
3424 */
3425 if (devinfo->gen < 7) {
3426 fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8),
3427 inst->dst.type);
3428 ibld.MOV(imm, inst->src[1]);
3429 ibld.MUL(inst->dst, imm, inst->src[0]);
3430 } else {
3431 const bool ud = (inst->src[1].type == BRW_REGISTER_TYPE_UD);
3432 ibld.MUL(inst->dst, inst->src[0],
3433 ud ? brw_imm_uw(inst->src[1].ud)
3434 : brw_imm_w(inst->src[1].d));
3435 }
3436 } else {
3437 /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3438 * do 32-bit integer multiplication in one instruction, but instead
3439 * must do a sequence (which actually calculates a 64-bit result):
3440 *
3441 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3442 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3443 * mov(8) g2<1>D acc0<8,8,1>D
3444 *
3445 * But on Gen > 6, the ability to use the second accumulator register
3446 * (acc1) for non-float data types was removed, preventing a simple
3447 * implementation in SIMD16. A 16-channel result can be calculated by
3448 * executing the three instructions twice in SIMD8, once with quarter
3449 * control of 1Q for the first eight channels and again with 2Q for
3450 * the second eight channels.
3451 *
3452 * Which accumulator register is implicitly accessed (by AccWrEnable
3453 * for instance) is determined by the quarter control. Unfortunately
3454 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3455 * implicit accumulator access by an instruction with 2Q will access
3456 * acc1 regardless of whether the data type is usable in acc1.
3457 *
3458 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3459 * integer data types.
3460 *
3461 * Since we only want the low 32-bits of the result, we can do two
3462 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3463 * adjust the high result and add them (like the mach is doing):
3464 *
3465 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3466 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3467 * shl(8) g9<1>D g8<8,8,1>D 16D
3468 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3469 *
3470 * We avoid the shl instruction by realizing that we only want to add
3471 * the low 16-bits of the "high" result to the high 16-bits of the
3472 * "low" result and using proper regioning on the add:
3473 *
3474 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3475 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3476 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3477 *
3478 * Since it does not use the (single) accumulator register, we can
3479 * schedule multi-component multiplications much better.
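*
* Put differently, the sequence computes
* lo32(a * b) = lo32(a * lo16(b)) + (lo32(a * hi16(b)) << 16),
* with the shift folded into the destination region of the ADD.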
3480 */
3481
3482 fs_reg orig_dst = inst->dst;
3483 if (orig_dst.is_null() || orig_dst.file == MRF) {
3484 inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
3485 inst->dst.type);
3486 }
3487 fs_reg low = inst->dst;
3488 fs_reg high(VGRF, alloc.allocate(dispatch_width / 8),
3489 inst->dst.type);
3490
3491 if (devinfo->gen >= 7) {
3492 if (inst->src[1].file == IMM) {
3493 ibld.MUL(low, inst->src[0],
3494 brw_imm_uw(inst->src[1].ud & 0xffff));
3495 ibld.MUL(high, inst->src[0],
3496 brw_imm_uw(inst->src[1].ud >> 16));
3497 } else {
3498 ibld.MUL(low, inst->src[0],
3499 subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));
3500 ibld.MUL(high, inst->src[0],
3501 subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1));
3502 }
3503 } else {
3504 ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0),
3505 inst->src[1]);
3506 ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1),
3507 inst->src[1]);
3508 }
3509
3510 ibld.ADD(subscript(inst->dst, BRW_REGISTER_TYPE_UW, 1),
3511 subscript(low, BRW_REGISTER_TYPE_UW, 1),
3512 subscript(high, BRW_REGISTER_TYPE_UW, 0));
3513
3514 if (inst->conditional_mod || orig_dst.file == MRF) {
3515 set_condmod(inst->conditional_mod,
3516 ibld.MOV(orig_dst, inst->dst));
3517 }
3518 }
3519
3520 } else if (inst->opcode == SHADER_OPCODE_MULH) {
3521 /* Should have been lowered to 8-wide. */
3522 assert(inst->exec_size <= get_lowered_simd_width(devinfo, inst));
3523 const fs_reg acc = retype(brw_acc_reg(inst->exec_size),
3524 inst->dst.type);
3525 fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
3526 fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
3527
3528 if (devinfo->gen >= 8) {
3529 /* Until Gen8, integer multiplies read 32-bits from one source and
3530 * 16-bits from the other, and rely on the MACH instruction to
3531 * generate the high bits of the result.
3532 *
3533 * On Gen8, the multiply instruction does a full 32x32-bit
3534 * multiply, but in order to do a 64-bit multiply we can simulate
3535 * the previous behavior and then use a MACH instruction.
3536 *
3537 * FINISHME: Don't use source modifiers on src1.
3538 */
3539 assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
3540 mul->src[1].type == BRW_REGISTER_TYPE_UD);
3541 mul->src[1].type = BRW_REGISTER_TYPE_UW;
3542 mul->src[1].stride *= 2;
3543
3544 } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
3545 inst->group > 0) {
3546 /* Among other things the quarter control bits influence which
3547 * accumulator register is used by the hardware for instructions
3548 * that access the accumulator implicitly (e.g. MACH). A
3549 * second-half instruction would normally map to acc1, which
3550 * doesn't exist on Gen7 and up (the hardware does emulate it for
3551 * floating-point instructions *only* by taking advantage of the
3552 * extra precision of acc0 not normally used for floating point
3553 * arithmetic).
3554 *
3555 * HSW and up are careful enough not to try to access an
3556 * accumulator register that doesn't exist, but on earlier Gen7
3557 * hardware we need to make sure that the quarter control bits are
3558 * zero to avoid non-deterministic behaviour and emit an extra MOV
3559 * to get the result masked correctly according to the current
3560 * channel enables.
3561 */
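/* For illustration only (register numbers are made up), a second-half
 * MACH such as
 *
 *    mach(8) g12<1>D  g10<8,8,1>D  g11<8,8,1>D  { align1 2Q }
 *
 * is rewritten below as
 *
 *    mach(8) g13<1>D  g10<8,8,1>D  g11<8,8,1>D  { align1 1Q WE_all }
 *    mov(8)  g12<1>D  g13<8,8,1>D               { align1 2Q }
 *
 * so the implicit accumulator access stays on acc0 and the trailing MOV
 * re-applies the second-half channel enables.
 */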
3562 mach->group = 0;
3563 mach->force_writemask_all = true;
3564 mach->dst = ibld.vgrf(inst->dst.type);
3565 ibld.MOV(inst->dst, mach->dst);
3566 }
3567 } else {
3568 continue;
3569 }
3570
3571 inst->remove(block);
3572 progress = true;
3573 }
3574
3575 if (progress)
3576 invalidate_live_intervals();
3577
3578 return progress;
3579 }
3580
3581 bool
3582 fs_visitor::lower_minmax()
3583 {
3584 assert(devinfo->gen < 6);
3585
3586 bool progress = false;
3587
3588 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3589 const fs_builder ibld(this, block, inst);
3590
3591 if (inst->opcode == BRW_OPCODE_SEL &&
3592 inst->predicate == BRW_PREDICATE_NONE) {
3593 /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of
3594 * the original SEL.L/GE instruction
3595 */
3596 ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
3597 inst->conditional_mod);
3598 inst->predicate = BRW_PREDICATE_NORMAL;
3599 inst->conditional_mod = BRW_CONDITIONAL_NONE;
3600
3601 progress = true;
3602 }
3603 }
3604
3605 if (progress)
3606 invalidate_live_intervals();
3607
3608 return progress;
3609 }
3610
3611 static void
3612 setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
3613 fs_reg *dst, fs_reg color, unsigned components)
3614 {
3615 if (key->clamp_fragment_color) {
3616 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
3617 assert(color.type == BRW_REGISTER_TYPE_F);
3618
3619 for (unsigned i = 0; i < components; i++)
3620 set_saturate(true,
3621 bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));
3622
3623 color = tmp;
3624 }
3625
3626 for (unsigned i = 0; i < components; i++)
3627 dst[i] = offset(color, bld, i);
3628 }
3629
3630 static void
3631 lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
3632 const struct brw_wm_prog_data *prog_data,
3633 const brw_wm_prog_key *key,
3634 const fs_visitor::thread_payload &payload)
3635 {
3636 assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
3637 const gen_device_info *devinfo = bld.shader->devinfo;
3638 const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
3639 const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
3640 const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
3641 const fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
3642 const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
3643 const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
3644 fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
3645 const unsigned components =
3646 inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
3647
3648 /* We can potentially have a message length of up to 15, so we have to set
3649 * base_mrf to either 0 or 1 in order to fit in m0..m15.
3650 */
3651 fs_reg sources[15];
3652 int header_size = 2, payload_header_size;
3653 unsigned length = 0;
3654
3655 /* From the Sandy Bridge PRM, volume 4, page 198:
3656 *
3657 * "Dispatched Pixel Enables. One bit per pixel indicating
3658 * which pixels were originally enabled when the thread was
3659 * dispatched. This field is only required for the end-of-
3660 * thread message and on all dual-source messages."
3661 */
3662 if (devinfo->gen >= 6 &&
3663 (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
3664 color1.file == BAD_FILE &&
3665 key->nr_color_regions == 1) {
3666 header_size = 0;
3667 }
3668
3669 if (header_size != 0) {
3670 assert(header_size == 2);
3671 /* Allocate 2 registers for a header */
3672 length += 2;
3673 }
3674
3675 if (payload.aa_dest_stencil_reg) {
3676 sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
3677 bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
3678 .MOV(sources[length],
3679 fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
3680 length++;
3681 }
3682
3683 if (sample_mask.file != BAD_FILE) {
3684 sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1),
3685 BRW_REGISTER_TYPE_UD);
3686
3687 /* Hand over gl_SampleMask. Only the lower 16 bits of each channel are
3688 * relevant. Since the data is unsigned single words, one VGRF always
3689 * holds 16 of them, but only the lower or upper 8 channels will be used
3690 * by the hardware when doing a SIMD8 write, depending on whether we have
3691 * selected the subspans for the first or second half respectively.
3692 */
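/* For example (hypothetical register numbers), if gl_SampleMask lives in
 * g5<8,8,1>UD, the retype plus doubled stride turns the source into
 * g5.0<16,8,2>UW, i.e. only the low word of each dword, and offsetting
 * the UW destination by inst->group places those 8 words in the lower or
 * upper half of the 16-wide payload register.
 */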
3693 assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
3694 sample_mask.type = BRW_REGISTER_TYPE_UW;
3695 sample_mask.stride *= 2;
3696
3697 bld.exec_all().annotate("FB write oMask")
3698 .MOV(horiz_offset(retype(sources[length], BRW_REGISTER_TYPE_UW),
3699 inst->group),
3700 sample_mask);
3701 length++;
3702 }
3703
3704 payload_header_size = length;
3705
3706 if (src0_alpha.file != BAD_FILE) {
3707 /* FIXME: This is being passed at the wrong location in the payload and
3708 * doesn't work when gl_SampleMask and MRTs are used simultaneously.
3709 * It's supposed to be immediately before oMask but there seems to be no
3710 * reasonable way to pass them in the correct order because LOAD_PAYLOAD
3711 * requires header sources to form a contiguous segment at the beginning
3712 * of the message and src0_alpha has per-channel semantics.
3713 */
3714 setup_color_payload(bld, key, &sources[length], src0_alpha, 1);
3715 length++;
3716 } else if (key->replicate_alpha && inst->target != 0) {
3717 /* Handle the case when the fragment shader doesn't write to draw buffer
3718 * zero. There is no need to call setup_color_payload() for src0_alpha
3719 * because the alpha value will be undefined.
3720 */
3721 length++;
3722 }
3723
3724 setup_color_payload(bld, key, &sources[length], color0, components);
3725 length += 4;
3726
3727 if (color1.file != BAD_FILE) {
3728 setup_color_payload(bld, key, &sources[length], color1, components);
3729 length += 4;
3730 }
3731
3732 if (src_depth.file != BAD_FILE) {
3733 sources[length] = src_depth;
3734 length++;
3735 }
3736
3737 if (dst_depth.file != BAD_FILE) {
3738 sources[length] = dst_depth;
3739 length++;
3740 }
3741
3742 if (src_stencil.file != BAD_FILE) {
3743 assert(devinfo->gen >= 9);
3744 assert(bld.dispatch_width() != 16);
3745
3746 /* XXX: src_stencil is only available on gen9+. dst_depth is never
3747 * available on gen9+. As such it's impossible to have both enabled at the
3748 * same time and therefore length cannot overrun the array.
3749 */
3750 assert(length < 15);
3751
3752 sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD);
3753 bld.exec_all().annotate("FB write OS")
3754 .MOV(retype(sources[length], BRW_REGISTER_TYPE_UB),
3755 subscript(src_stencil, BRW_REGISTER_TYPE_UB, 0));
3756 length++;
3757 }
3758
3759 fs_inst *load;
3760 if (devinfo->gen >= 7) {
3761 /* Send from the GRF */
3762 fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F);
3763 load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
3764 payload.nr = bld.shader->alloc.allocate(regs_written(load));
3765 load->dst = payload;
3766
3767 inst->src[0] = payload;
3768 inst->resize_sources(1);
3769 } else {
3770 /* Send from the MRF */
3771 load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
3772 sources, length, payload_header_size);
3773
3774 /* On pre-SNB, we have to interlace the color values. LOAD_PAYLOAD
3775 * will do this for us if we just give it a COMPR4 destination.
3776 */
3777 if (devinfo->gen < 6 && bld.dispatch_width() == 16)
3778 load->dst.nr |= BRW_MRF_COMPR4;
3779
3780 inst->resize_sources(0);
3781 inst->base_mrf = 1;
3782 }
3783
3784 inst->opcode = FS_OPCODE_FB_WRITE;
3785 inst->mlen = regs_written(load);
3786 inst->header_size = header_size;
3787 }
3788
3789 static void
3790 lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst)
3791 {
3792 const fs_builder &ubld = bld.exec_all();
3793 const unsigned length = 2;
3794 const fs_reg header = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD, length);
3795
3796 ubld.group(16, 0)
3797 .MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
3798
3799 inst->resize_sources(1);
3800 inst->src[0] = header;
3801 inst->opcode = FS_OPCODE_FB_READ;
3802 inst->mlen = length;
3803 inst->header_size = length;
3804 }
3805
3806 static void
3807 lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
3808 const fs_reg &coordinate,
3809 const fs_reg &shadow_c,
3810 const fs_reg &lod, const fs_reg &lod2,
3811 const fs_reg &surface,
3812 const fs_reg &sampler,
3813 unsigned coord_components,
3814 unsigned grad_components)
3815 {
3816 const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
3817 op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
3818 fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
3819 fs_reg msg_end = msg_begin;
3820
3821 /* g0 header. */
3822 msg_end = offset(msg_end, bld.group(8, 0), 1);
3823
3824 for (unsigned i = 0; i < coord_components; i++)
3825 bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
3826 offset(coordinate, bld, i));
3827
3828 msg_end = offset(msg_end, bld, coord_components);
3829
3830 /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
3831 * require all three components to be present and zero if they are unused.
3832 */
3833 if (coord_components > 0 &&
3834 (has_lod || shadow_c.file != BAD_FILE ||
3835 (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
3836 for (unsigned i = coord_components; i < 3; i++)
3837 bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f));
3838
3839 msg_end = offset(msg_end, bld, 3 - coord_components);
3840 }
3841
3842 if (op == SHADER_OPCODE_TXD) {
3843 /* TXD unsupported in SIMD16 mode. */
3844 assert(bld.dispatch_width() == 8);
3845
3846 /* the slots for u and v are always present, but r is optional */
3847 if (coord_components < 2)
3848 msg_end = offset(msg_end, bld, 2 - coord_components);
3849
3850 /* P = u, v, r
3851 * dPdx = dudx, dvdx, drdx
3852 * dPdy = dudy, dvdy, drdy
3853 *
3854 * 1-arg: Does not exist.
3855 *
3856 * 2-arg: dudx dvdx dudy dvdy
3857 * dPdx.x dPdx.y dPdy.x dPdy.y
3858 * m4 m5 m6 m7
3859 *
3860 * 3-arg: dudx dvdx drdx dudy dvdy drdy
3861 * dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
3862 * m5 m6 m7 m8 m9 m10
3863 */
3864 for (unsigned i = 0; i < grad_components; i++)
3865 bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));
3866
3867 msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
3868
3869 for (unsigned i = 0; i < grad_components; i++)
3870 bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));
3871
3872 msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
3873 }
3874
3875 if (has_lod) {
3876 /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without*
3877 * shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
3878 */
3879 assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
3880 bld.dispatch_width() == 16);
3881
3882 const brw_reg_type type =
3883 (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
3884 BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
3885 bld.MOV(retype(msg_end, type), lod);
3886 msg_end = offset(msg_end, bld, 1);
3887 }
3888
3889 if (shadow_c.file != BAD_FILE) {
3890 if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
3891 /* There's no plain shadow compare message, so we use shadow
3892 * compare with a bias of 0.0.
3893 */
3894 bld.MOV(msg_end, brw_imm_f(0.0f));
3895 msg_end = offset(msg_end, bld, 1);
3896 }
3897
3898 bld.MOV(msg_end, shadow_c);
3899 msg_end = offset(msg_end, bld, 1);
3900 }
3901
3902 inst->opcode = op;
3903 inst->src[0] = reg_undef;
3904 inst->src[1] = surface;
3905 inst->src[2] = sampler;
3906 inst->resize_sources(3);
3907 inst->base_mrf = msg_begin.nr;
3908 inst->mlen = msg_end.nr - msg_begin.nr;
3909 inst->header_size = 1;
3910 }
3911
3912 static void
3913 lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
3914 const fs_reg &coordinate,
3915 const fs_reg &shadow_c,
3916 const fs_reg &lod, const fs_reg &lod2,
3917 const fs_reg &sample_index,
3918 const fs_reg &surface,
3919 const fs_reg &sampler,
3920 unsigned coord_components,
3921 unsigned grad_components)
3922 {
3923 fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
3924 fs_reg msg_coords = message;
3925 unsigned header_size = 0;
3926
3927 if (inst->offset != 0) {
3928 /* The offsets set up by the visitor are in the m1 header, so we can't
3929 * go headerless.
3930 */
3931 header_size = 1;
3932 message.nr--;
3933 }
3934
3935 for (unsigned i = 0; i < coord_components; i++)
3936 bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type),
3937 offset(coordinate, bld, i));
3938
3939 fs_reg msg_end = offset(msg_coords, bld, coord_components);
3940 fs_reg msg_lod = offset(msg_coords, bld, 4);
3941
3942 if (shadow_c.file != BAD_FILE) {
3943 fs_reg msg_shadow = msg_lod;
3944 bld.MOV(msg_shadow, shadow_c);
3945 msg_lod = offset(msg_shadow, bld, 1);
3946 msg_end = msg_lod;
3947 }
3948
3949 switch (op) {
3950 case SHADER_OPCODE_TXL:
3951 case FS_OPCODE_TXB:
3952 bld.MOV(msg_lod, lod);
3953 msg_end = offset(msg_lod, bld, 1);
3954 break;
3955 case SHADER_OPCODE_TXD:
3956 /**
3957 * P = u, v, r
3958 * dPdx = dudx, dvdx, drdx
3959 * dPdy = dudy, dvdy, drdy
3960 *
3961 * Load up these values:
3962 * - dudx dudy dvdx dvdy drdx drdy
3963 * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
3964 */
3965 msg_end = msg_lod;
3966 for (unsigned i = 0; i < grad_components; i++) {
3967 bld.MOV(msg_end, offset(lod, bld, i));
3968 msg_end = offset(msg_end, bld, 1);
3969
3970 bld.MOV(msg_end, offset(lod2, bld, i));
3971 msg_end = offset(msg_end, bld, 1);
3972 }
3973 break;
3974 case SHADER_OPCODE_TXS:
3975 msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
3976 bld.MOV(msg_lod, lod);
3977 msg_end = offset(msg_lod, bld, 1);
3978 break;
3979 case SHADER_OPCODE_TXF:
3980 msg_lod = offset(msg_coords, bld, 3);
3981 bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
3982 msg_end = offset(msg_lod, bld, 1);
3983 break;
3984 case SHADER_OPCODE_TXF_CMS:
3985 msg_lod = offset(msg_coords, bld, 3);
3986 /* lod */
3987 bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
3988 /* sample index */
3989 bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
3990 msg_end = offset(msg_lod, bld, 2);
3991 break;
3992 default:
3993 break;
3994 }
3995
3996 inst->opcode = op;
3997 inst->src[0] = reg_undef;
3998 inst->src[1] = surface;
3999 inst->src[2] = sampler;
4000 inst->resize_sources(3);
4001 inst->base_mrf = message.nr;
4002 inst->mlen = msg_end.nr - message.nr;
4003 inst->header_size = header_size;
4004
4005 /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
4006 assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
4007 }
4008
4009 static bool
4010 is_high_sampler(const struct gen_device_info *devinfo, const fs_reg &sampler)
4011 {
4012 if (devinfo->gen < 8 && !devinfo->is_haswell)
4013 return false;
4014
4015 return sampler.file != IMM || sampler.ud >= 16;
4016 }
4017
4018 static void
4019 lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
4020 const fs_reg &coordinate,
4021 const fs_reg &shadow_c,
4022 fs_reg lod, const fs_reg &lod2,
4023 const fs_reg &sample_index,
4024 const fs_reg &mcs,
4025 const fs_reg &surface,
4026 const fs_reg &sampler,
4027 const fs_reg &tg4_offset,
4028 unsigned coord_components,
4029 unsigned grad_components)
4030 {
4031 const gen_device_info *devinfo = bld.shader->devinfo;
4032 unsigned reg_width = bld.dispatch_width() / 8;
4033 unsigned header_size = 0, length = 0;
4034 fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
4035 for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
4036 sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
4037
4038 if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
4039 inst->offset != 0 || inst->eot ||
4040 op == SHADER_OPCODE_SAMPLEINFO ||
4041 is_high_sampler(devinfo, sampler)) {
4042 /* For general texture offsets (no txf workaround), we need a header to
4043 * put them in. Note that we're only reserving space for it in the
4044 * message payload as it will be initialized implicitly by the
4045 * generator.
4046 *
4047 * TG4 needs to place its channel select in the header, for interaction
4048 * with ARB_texture_swizzle. The sampler index is only 4-bits, so for
4049 * larger sampler numbers we need to offset the Sampler State Pointer in
4050 * the header.
4051 */
4052 header_size = 1;
4053 sources[0] = fs_reg();
4054 length++;
4055
4056 /* If we're requesting fewer than four channels worth of response,
4057 * and we have an explicit header, we need to set up the sampler
4058 * writemask. It's reversed from normal: 1 means "don't write".
4059 */
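/* Worked example (SIMD8, reg_width == 1): a sampler result of which only
 * two components are consumed has regs_written(inst) == 2, so
 * mask = ~((1 << 2) - 1) & 0xf = 0xc, which tells the sampler not to
 * write the z and w channels of the response; the mask lands in bits
 * 15:12 of the message descriptor via inst->offset below.
 */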
4060 if (!inst->eot && regs_written(inst) != 4 * reg_width) {
4061 assert(regs_written(inst) % reg_width == 0);
4062 unsigned mask = ~((1 << (regs_written(inst) / reg_width)) - 1) & 0xf;
4063 inst->offset |= mask << 12;
4064 }
4065 }
4066
4067 if (shadow_c.file != BAD_FILE) {
4068 bld.MOV(sources[length], shadow_c);
4069 length++;
4070 }
4071
4072 bool coordinate_done = false;
4073
4074 /* Set up the LOD info */
4075 switch (op) {
4076 case FS_OPCODE_TXB:
4077 case SHADER_OPCODE_TXL:
4078 if (devinfo->gen >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) {
4079 op = SHADER_OPCODE_TXL_LZ;
4080 break;
4081 }
4082 bld.MOV(sources[length], lod);
4083 length++;
4084 break;
4085 case SHADER_OPCODE_TXD:
4086 /* TXD should have been lowered in SIMD16 mode. */
4087 assert(bld.dispatch_width() == 8);
4088
4089 /* Load dPdx and the coordinate together:
4090 * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
4091 */
4092 for (unsigned i = 0; i < coord_components; i++) {
4093 bld.MOV(sources[length++], offset(coordinate, bld, i));
4094
4095 /* For cube map array, the coordinate is (u,v,r,ai) but there are
4096 * only derivatives for (u, v, r).
4097 */
4098 if (i < grad_components) {
4099 bld.MOV(sources[length++], offset(lod, bld, i));
4100 bld.MOV(sources[length++], offset(lod2, bld, i));
4101 }
4102 }
4103
4104 coordinate_done = true;
4105 break;
4106 case SHADER_OPCODE_TXS:
4107 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
4108 length++;
4109 break;
4110 case SHADER_OPCODE_TXF:
4111 /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
4112 * On Gen9 they are u, v, lod, r.
4113 */
4114 bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D), coordinate);
4115
4116 if (devinfo->gen >= 9) {
4117 if (coord_components >= 2) {
4118 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D),
4119 offset(coordinate, bld, 1));
4120 } else {
4121 sources[length] = brw_imm_d(0);
4122 }
4123 length++;
4124 }
4125
4126 if (devinfo->gen >= 9 && lod.is_zero()) {
4127 op = SHADER_OPCODE_TXF_LZ;
4128 } else {
4129 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
4130 length++;
4131 }
4132
4133 for (unsigned i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++)
4134 bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
4135 offset(coordinate, bld, i));
4136
4137 coordinate_done = true;
4138 break;
4139
4140 case SHADER_OPCODE_TXF_CMS:
4141 case SHADER_OPCODE_TXF_CMS_W:
4142 case SHADER_OPCODE_TXF_UMS:
4143 case SHADER_OPCODE_TXF_MCS:
4144 if (op == SHADER_OPCODE_TXF_UMS ||
4145 op == SHADER_OPCODE_TXF_CMS ||
4146 op == SHADER_OPCODE_TXF_CMS_W) {
4147 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
4148 length++;
4149 }
4150
4151 if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
4152 /* Data from the multisample control surface. */
4153 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
4154 length++;
4155
4156 /* On Gen9+ we'll use ld2dms_w instead which has two registers for
4157 * the MCS data.
4158 */
4159 if (op == SHADER_OPCODE_TXF_CMS_W) {
4160 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD),
4161 mcs.file == IMM ?
4162 mcs :
4163 offset(mcs, bld, 1));
4164 length++;
4165 }
4166 }
4167
4168 /* There is no offsetting for this message; just copy in the integer
4169 * texture coordinates.
4170 */
4171 for (unsigned i = 0; i < coord_components; i++)
4172 bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
4173 offset(coordinate, bld, i));
4174
4175 coordinate_done = true;
4176 break;
4177 case SHADER_OPCODE_TG4_OFFSET:
4178 /* More crazy intermixing */
4179 for (unsigned i = 0; i < 2; i++) /* u, v */
4180 bld.MOV(sources[length++], offset(coordinate, bld, i));
4181
4182 for (unsigned i = 0; i < 2; i++) /* offu, offv */
4183 bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
4184 offset(tg4_offset, bld, i));
4185
4186 if (coord_components == 3) /* r if present */
4187 bld.MOV(sources[length++], offset(coordinate, bld, 2));
4188
4189 coordinate_done = true;
4190 break;
4191 default:
4192 break;
4193 }
4194
4195 /* Set up the coordinate (except for cases where it was done above) */
4196 if (!coordinate_done) {
4197 for (unsigned i = 0; i < coord_components; i++)
4198 bld.MOV(sources[length++], offset(coordinate, bld, i));
4199 }
4200
4201 int mlen;
4202 if (reg_width == 2)
4203 mlen = length * reg_width - header_size;
4204 else
4205 mlen = length * reg_width;
4206
4207 const fs_reg src_payload = fs_reg(VGRF, bld.shader->alloc.allocate(mlen),
4208 BRW_REGISTER_TYPE_F);
4209 bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
4210
4211 /* Generate the SEND. */
4212 inst->opcode = op;
4213 inst->src[0] = src_payload;
4214 inst->src[1] = surface;
4215 inst->src[2] = sampler;
4216 inst->resize_sources(3);
4217 inst->mlen = mlen;
4218 inst->header_size = header_size;
4219
4220 /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
4221 assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
4222 }
4223
4224 static void
4225 lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
4226 {
4227 const gen_device_info *devinfo = bld.shader->devinfo;
4228 const fs_reg &coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
4229 const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
4230 const fs_reg &lod = inst->src[TEX_LOGICAL_SRC_LOD];
4231 const fs_reg &lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
4232 const fs_reg &sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
4233 const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS];
4234 const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
4235 const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
4236 const fs_reg &tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
4237 assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
4238 const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
4239 assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
4240 const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
4241
4242 if (devinfo->gen >= 7) {
4243 lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
4244 shadow_c, lod, lod2, sample_index,
4245 mcs, surface, sampler, tg4_offset,
4246 coord_components, grad_components);
4247 } else if (devinfo->gen >= 5) {
4248 lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
4249 shadow_c, lod, lod2, sample_index,
4250 surface, sampler,
4251 coord_components, grad_components);
4252 } else {
4253 lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
4254 shadow_c, lod, lod2,
4255 surface, sampler,
4256 coord_components, grad_components);
4257 }
4258 }
4259
4260 /**
4261 * Initialize the header present in some typed and untyped surface
4262 * messages.
4263 */
4264 static fs_reg
4265 emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask)
4266 {
4267 fs_builder ubld = bld.exec_all().group(8, 0);
4268 const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4269 ubld.MOV(dst, brw_imm_d(0));
4270 ubld.MOV(component(dst, 7), sample_mask);
4271 return dst;
4272 }
4273
4274 static void
4275 lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
4276 const fs_reg &sample_mask)
4277 {
4278 /* Get the logical send arguments. */
4279 const fs_reg &addr = inst->src[0];
4280 const fs_reg &src = inst->src[1];
4281 const fs_reg &surface = inst->src[2];
4282 const UNUSED fs_reg &dims = inst->src[3];
4283 const fs_reg &arg = inst->src[4];
4284
4285 /* Calculate the total number of components of the payload. */
4286 const unsigned addr_sz = inst->components_read(0);
4287 const unsigned src_sz = inst->components_read(1);
4288 const unsigned header_sz = (sample_mask.file == BAD_FILE ? 0 : 1);
4289 const unsigned sz = header_sz + addr_sz + src_sz;
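/* As a worked example (the component counts are hypothetical), an untyped
 * surface write of a vec4 at a single-component offset with a sample-mask
 * header has sz = 1 + 1 + 4 = 6 payload components, and at SIMD16 the
 * mlen computed further down is 1 + (1 + 4) * 16 / 8 = 11 registers.
 */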
4290
4291 /* Allocate space for the payload. */
4292 fs_reg *const components = new fs_reg[sz];
4293 const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
4294 unsigned n = 0;
4295
4296 /* Construct the payload. */
4297 if (header_sz)
4298 components[n++] = emit_surface_header(bld, sample_mask);
4299
4300 for (unsigned i = 0; i < addr_sz; i++)
4301 components[n++] = offset(addr, bld, i);
4302
4303 for (unsigned i = 0; i < src_sz; i++)
4304 components[n++] = offset(src, bld, i);
4305
4306 bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
4307
4308 /* Update the original instruction. */
4309 inst->opcode = op;
4310 inst->mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
4311 inst->header_size = header_sz;
4312
4313 inst->src[0] = payload;
4314 inst->src[1] = surface;
4315 inst->src[2] = arg;
4316 inst->resize_sources(3);
4317
4318 delete[] components;
4319 }
4320
4321 static void
4322 lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
4323 {
4324 const gen_device_info *devinfo = bld.shader->devinfo;
4325
4326 if (devinfo->gen >= 7) {
4327 /* We are switching the instruction from an ALU-like instruction to a
4328 * send-from-grf instruction. Since sends can't handle strides or
4329 * source modifiers, we have to make a copy of the offset source.
4330 */
4331 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
4332 bld.MOV(tmp, inst->src[1]);
4333 inst->src[1] = tmp;
4334
4335 inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
4336
4337 } else {
4338 const fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->gen),
4339 BRW_REGISTER_TYPE_UD);
4340
4341 bld.MOV(byte_offset(payload, REG_SIZE), inst->src[1]);
4342
4343 inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4;
4344 inst->resize_sources(1);
4345 inst->base_mrf = payload.nr;
4346 inst->header_size = 1;
4347 inst->mlen = 1 + inst->exec_size / 8;
4348 }
4349 }
4350
4351 static void
4352 lower_math_logical_send(const fs_builder &bld, fs_inst *inst)
4353 {
4354 assert(bld.shader->devinfo->gen < 6);
4355
4356 inst->base_mrf = 2;
4357 inst->mlen = inst->sources * inst->exec_size / 8;
4358
4359 if (inst->sources > 1) {
4360 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
4361 * "Message Payload":
4362 *
4363 * "Operand0[7]. For the INT DIV functions, this operand is the
4364 * denominator."
4365 * ...
4366 * "Operand1[7]. For the INT DIV functions, this operand is the
4367 * numerator."
4368 */
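/* E.g. for a logical INT_QUOTIENT computing src[0] / src[1], the
 * denominator src[1] is kept as inst->src[0] (the PRM's Operand0) while
 * the numerator src[0] is copied into m(base_mrf + 1) as Operand1 by the
 * MOV below.
 */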
4369 const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
4370 const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
4371 const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];
4372
4373 inst->resize_sources(1);
4374 inst->src[0] = src0;
4375
4376 assert(inst->exec_size == 8);
4377 bld.MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type), src1);
4378 }
4379 }
4380
4381 bool
4382 fs_visitor::lower_logical_sends()
4383 {
4384 bool progress = false;
4385
4386 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
4387 const fs_builder ibld(this, block, inst);
4388
4389 switch (inst->opcode) {
4390 case FS_OPCODE_FB_WRITE_LOGICAL:
4391 assert(stage == MESA_SHADER_FRAGMENT);
4392 lower_fb_write_logical_send(ibld, inst,
4393 brw_wm_prog_data(prog_data),
4394 (const brw_wm_prog_key *)key,
4395 payload);
4396 break;
4397
4398 case FS_OPCODE_FB_READ_LOGICAL:
4399 lower_fb_read_logical_send(ibld, inst);
4400 break;
4401
4402 case SHADER_OPCODE_TEX_LOGICAL:
4403 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
4404 break;
4405
4406 case SHADER_OPCODE_TXD_LOGICAL:
4407 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
4408 break;
4409
4410 case SHADER_OPCODE_TXF_LOGICAL:
4411 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
4412 break;
4413
4414 case SHADER_OPCODE_TXL_LOGICAL:
4415 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
4416 break;
4417
4418 case SHADER_OPCODE_TXS_LOGICAL:
4419 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
4420 break;
4421
4422 case FS_OPCODE_TXB_LOGICAL:
4423 lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
4424 break;
4425
4426 case SHADER_OPCODE_TXF_CMS_LOGICAL:
4427 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
4428 break;
4429
4430 case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
4431 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
4432 break;
4433
4434 case SHADER_OPCODE_TXF_UMS_LOGICAL:
4435 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
4436 break;
4437
4438 case SHADER_OPCODE_TXF_MCS_LOGICAL:
4439 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
4440 break;
4441
4442 case SHADER_OPCODE_LOD_LOGICAL:
4443 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
4444 break;
4445
4446 case SHADER_OPCODE_TG4_LOGICAL:
4447 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
4448 break;
4449
4450 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
4451 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
4452 break;
4453
4454 case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
4455 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO);
4456 break;
4457
4458 case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
4459 lower_surface_logical_send(ibld, inst,
4460 SHADER_OPCODE_UNTYPED_SURFACE_READ,
4461 fs_reg());
4462 break;
4463
4464 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
4465 lower_surface_logical_send(ibld, inst,
4466 SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
4467 ibld.sample_mask_reg());
4468 break;
4469
4470 case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
4471 lower_surface_logical_send(ibld, inst,
4472 SHADER_OPCODE_UNTYPED_ATOMIC,
4473 ibld.sample_mask_reg());
4474 break;
4475
4476 case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
4477 lower_surface_logical_send(ibld, inst,
4478 SHADER_OPCODE_TYPED_SURFACE_READ,
4479 brw_imm_d(0xffff));
4480 break;
4481
4482 case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
4483 lower_surface_logical_send(ibld, inst,
4484 SHADER_OPCODE_TYPED_SURFACE_WRITE,
4485 ibld.sample_mask_reg());
4486 break;
4487
4488 case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
4489 lower_surface_logical_send(ibld, inst,
4490 SHADER_OPCODE_TYPED_ATOMIC,
4491 ibld.sample_mask_reg());
4492 break;
4493
4494 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
4495 lower_varying_pull_constant_logical_send(ibld, inst);
4496 break;
4497
4498 case SHADER_OPCODE_RCP:
4499 case SHADER_OPCODE_RSQ:
4500 case SHADER_OPCODE_SQRT:
4501 case SHADER_OPCODE_EXP2:
4502 case SHADER_OPCODE_LOG2:
4503 case SHADER_OPCODE_SIN:
4504 case SHADER_OPCODE_COS:
4505 case SHADER_OPCODE_POW:
4506 case SHADER_OPCODE_INT_QUOTIENT:
4507 case SHADER_OPCODE_INT_REMAINDER:
4508 /* The math opcodes are overloaded for the send-like and
4509 * expression-like instructions which seems kind of icky. Gen6+ has
4510 * a native (but rather quirky) MATH instruction so we don't need to
4511 * do anything here. On Gen4-5 we'll have to lower the Gen6-like
4512 * logical instructions (which we can easily recognize because they
4513 * have mlen = 0) into send-like virtual instructions.
4514 */
4515 if (devinfo->gen < 6 && inst->mlen == 0) {
4516 lower_math_logical_send(ibld, inst);
4517 break;
4518
4519 } else {
4520 continue;
4521 }
4522
4523 default:
4524 continue;
4525 }
4526
4527 progress = true;
4528 }
4529
4530 if (progress)
4531 invalidate_live_intervals();
4532
4533 return progress;
4534 }
4535
4536 /**
4537 * Get the closest allowed SIMD width for instruction \p inst accounting for
4538 * some common regioning and execution control restrictions that apply to FPU
4539 * instructions. These restrictions don't necessarily have any relevance to
4540 * instructions not executed by the FPU pipeline like extended math, control
4541 * flow or send message instructions.
4542 *
4543 * For virtual opcodes it's really up to the instruction -- in some cases
4544 * (e.g. where a virtual instruction unrolls into a simple sequence of FPU
4545 * instructions) it may simplify virtual instruction lowering if we can
4546 * enforce FPU-like regioning restrictions already on the virtual instruction;
4547 * in other cases (e.g. virtual send-like instructions) this may be
4548 * excessively restrictive.
4549 */
4550 static unsigned
4551 get_fpu_lowered_simd_width(const struct gen_device_info *devinfo,
4552 const fs_inst *inst)
4553 {
4554 /* Maximum execution size representable in the instruction controls. */
4555 unsigned max_width = MIN2(32, inst->exec_size);
4556
4557 /* According to the PRMs:
4558 * "A. In Direct Addressing mode, a source cannot span more than 2
4559 * adjacent GRF registers.
4560 * B. A destination cannot span more than 2 adjacent GRF registers."
4561 *
4562 * Look for the source or destination with the largest register region
4563 * which is the one that is going to limit the overall execution size of
4564 * the instruction due to this rule.
4565 */
4566 unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
4567
4568 for (unsigned i = 0; i < inst->sources; i++)
4569 reg_count = MAX2(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
4570
4571 /* Calculate the maximum execution size of the instruction based on the
4572 * factor by which it goes over the hardware limit of 2 GRFs.
4573 */
4574 if (reg_count > 2)
4575 max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, 2));
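/* Worked example: a SIMD16 operation on 64-bit data spans
 * 16 * 8 = 128 bytes = 4 GRFs per operand, so reg_count == 4 and the
 * clamp above yields max_width = 16 / DIV_ROUND_UP(4, 2) = 8.
 */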
4576
4577 /* According to the IVB PRMs:
4578 * "When destination spans two registers, the source MUST span two
4579 * registers. The exception to the above rule:
4580 *
4581 * - When source is scalar, the source registers are not incremented.
4582 * - When source is packed integer Word and destination is packed
4583 * integer DWord, the source register is not incremented but the
4584 * source sub register is incremented."
4585 *
4586 * The hardware specs from Gen4 to Gen7.5 mention similar regioning
4587 * restrictions. The code below intentionally doesn't check whether the
4588 * destination type is integer because empirically the hardware doesn't
4589 * seem to care what the actual type is as long as it's dword-aligned.
4590 */
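/* For instance (register numbers are illustrative), both
 *
 *    add(16) g4<1>F  g8<8,8,1>F    g16<0,1,0>F
 *    mov(16) g4<1>D  g8<16,16,1>W
 *
 * are allowed even though one source spans a single GRF while the
 * destination spans two, thanks to the scalar and packed-word exceptions
 * quoted above.
 */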
4591 if (devinfo->gen < 8) {
4592 for (unsigned i = 0; i < inst->sources; i++) {
4593 /* IVB implements DF scalars as <0;2,1> regions. */
4594 const bool is_scalar_exception = is_uniform(inst->src[i]) &&
4595 (devinfo->is_haswell || type_sz(inst->src[i].type) != 8);
4596 const bool is_packed_word_exception =
4597 type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
4598 type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1;
4599
4600 if (inst->size_written > REG_SIZE &&
4601 inst->size_read(i) != 0 && inst->size_read(i) <= REG_SIZE &&
4602 !is_scalar_exception && !is_packed_word_exception) {
4603 const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
4604 max_width = MIN2(max_width, inst->exec_size / reg_count);
4605 }
4606 }
4607 }
4608
4609 /* From the IVB PRMs:
4610 * "When an instruction is SIMD32, the low 16 bits of the execution mask
4611 * are applied for both halves of the SIMD32 instruction. If different
4612 * execution mask channels are required, split the instruction into two
4613 * SIMD16 instructions."
4614 *
4615 * There is similar text in the HSW PRMs. Gen4-6 don't even implement
4616 * 32-wide control flow support in hardware and will behave similarly.
4617 */
4618 if (devinfo->gen < 8 && !inst->force_writemask_all)
4619 max_width = MIN2(max_width, 16);
4620
4621 /* From the IVB PRMs (applies to HSW too):
4622 * "Instructions with condition modifiers must not use SIMD32."
4623 *
4624 * From the BDW PRMs (applies to later hardware too):
4625 * "Ternary instruction with condition modifiers must not use SIMD32."
4626 */
4627 if (inst->conditional_mod && (devinfo->gen < 8 || inst->is_3src(devinfo)))
4628 max_width = MIN2(max_width, 16);
4629
4630 /* From the IVB PRMs (applies to other devices that don't have the
4631 * gen_device_info::supports_simd16_3src flag set):
4632 * "In Align16 access mode, SIMD16 is not allowed for DW operations and
4633 * SIMD8 is not allowed for DF operations."
4634 */
4635 if (inst->is_3src(devinfo) && !devinfo->supports_simd16_3src)
4636 max_width = MIN2(max_width, inst->exec_size / reg_count);
4637
4638 /* Pre-Gen8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is
4639 * the 8-bit quarter of the execution mask signals specified in the
4640 * instruction control fields) for the second compressed half of any
4641 * single-precision instruction (for double-precision instructions
4642 * it's hardwired to use NibCtrl+1, at least on HSW), which means that
4643 * the EU will apply the wrong execution controls for the second
4644 * sequential GRF write if the number of channels per GRF is not exactly
4645 * eight in single-precision mode (or four in double-float mode).
4646 *
4647 * In this situation we calculate the maximum size of the split
4648 * instructions so they only ever write to a single register.
4649 */
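/* For example (hypothetical regioning), a SIMD16 single-precision
 * instruction with a strided <2>:D destination writes
 * 16 * 2 * 4 = 128 bytes = 4 GRFs, so only 16 / 4 = 4 channels land in
 * each GRF instead of the 8 the EU assumes, and the code below clamps
 * max_width to 4.
 */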
4650 if (devinfo->gen < 8 && inst->size_written > REG_SIZE &&
4651 !inst->force_writemask_all) {
4652 const unsigned channels_per_grf = inst->exec_size /
4653 DIV_ROUND_UP(inst->size_written, REG_SIZE);
4654 const unsigned exec_type_size = get_exec_type_size(inst);
4655 assert(exec_type_size);
4656
4657 /* The hardware shifts exactly 8 channels per compressed half of the
4658 * instruction in single-precision mode and exactly 4 in double-precision.
4659 */
4660 if (channels_per_grf != (exec_type_size == 8 ? 4 : 8))
4661 max_width = MIN2(max_width, channels_per_grf);
4662
4663 /* Lower all non-force_writemask_all DF instructions to SIMD4 on IVB/BYT
4664 * because HW applies the same channel enable signals to both halves of
4665 * the compressed instruction which will be just wrong under
4666 * non-uniform control flow.
4667 */
4668 if (devinfo->gen == 7 && !devinfo->is_haswell &&
4669 (exec_type_size == 8 || type_sz(inst->dst.type) == 8))
4670 max_width = MIN2(max_width, 4);
4671 }
4672
4673 /* Only power-of-two execution sizes are representable in the instruction
4674 * control fields.
4675 */
4676 return 1 << _mesa_logbase2(max_width);
4677 }
4678
4679 /**
4680 * Get the maximum allowed SIMD width for instruction \p inst accounting for
4681 * various payload size restrictions that apply to sampler message
4682 * instructions.
4683 *
4684 * This is only intended to provide a maximum theoretical bound for the
4685 * execution size of the message based on the number of argument components
4686 * alone, which in most cases will determine whether the SIMD8 or SIMD16
4687 * variant of the message can be used, though some messages may have
4688 * additional restrictions not accounted for here (e.g. pre-ILK hardware uses
4689 * the message length to determine the exact SIMD width and argument count,
4690 * which makes a number of sampler message combinations impossible to
4691 * represent).
4692 */
4693 static unsigned
4694 get_sampler_lowered_simd_width(const struct gen_device_info *devinfo,
4695 const fs_inst *inst)
4696 {
4697 /* Calculate the number of coordinate components that have to be present
4698 * assuming that additional arguments follow the texel coordinates in the
4699 * message payload. On IVB+ there is no need for padding, on ILK-SNB we
4700 * need to pad to four or three components depending on the message,
4701 * pre-ILK we need to pad to at most three components.
4702 */
4703 const unsigned req_coord_components =
4704 (devinfo->gen >= 7 ||
4705 !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :
4706 (devinfo->gen >= 5 && inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&
4707 inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :
4708 3;
4709
4710 /* On Gen9+ the LOD argument is for free if we're able to use the LZ
4711 * variant of the TXL or TXF message.
4712 */
4713 const bool implicit_lod = devinfo->gen >= 9 &&
4714 (inst->opcode == SHADER_OPCODE_TXL ||
4715 inst->opcode == SHADER_OPCODE_TXF) &&
4716 inst->src[TEX_LOGICAL_SRC_LOD].is_zero();
4717
4718 /* Calculate the total number of argument components that need to be passed
4719 * to the sampler unit.
4720 */
4721 const unsigned num_payload_components =
4722 MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),
4723 req_coord_components) +
4724 inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
4725 (implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +
4726 inst->components_read(TEX_LOGICAL_SRC_LOD2) +
4727 inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
4728 (inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
4729 inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
4730 inst->components_read(TEX_LOGICAL_SRC_MCS);
4731
4732 /* SIMD16 messages with more than five arguments exceed the maximum message
4733 * size supported by the sampler, regardless of whether a header is
4734 * provided or not.
4735 */
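/* Worked example: a shadow-compare TXL with three coordinate components,
 * a shadow comparator and an LOD needs 5 payload components, which at
 * SIMD16 is 10 GRFs plus an optional header and still fits in
 * MAX_SAMPLER_MESSAGE_SIZE; one additional argument would exceed it, so
 * such messages are limited to SIMD8 below.
 */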
4736 return MIN2(inst->exec_size,
4737 num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
4738 }
4739
4740 /**
4741 * Get the closest native SIMD width supported by the hardware for instruction
4742 * \p inst. The instruction will be left untouched by
4743 * fs_visitor::lower_simd_width() if the returned value is equal to the
4744 * original execution size.
4745 */
4746 static unsigned
4747 get_lowered_simd_width(const struct gen_device_info *devinfo,
4748 const fs_inst *inst)
4749 {
4750 switch (inst->opcode) {
4751 case BRW_OPCODE_MOV:
4752 case BRW_OPCODE_SEL:
4753 case BRW_OPCODE_NOT:
4754 case BRW_OPCODE_AND:
4755 case BRW_OPCODE_OR:
4756 case BRW_OPCODE_XOR:
4757 case BRW_OPCODE_SHR:
4758 case BRW_OPCODE_SHL:
4759 case BRW_OPCODE_ASR:
4760 case BRW_OPCODE_CMPN:
4761 case BRW_OPCODE_CSEL:
4762 case BRW_OPCODE_F32TO16:
4763 case BRW_OPCODE_F16TO32:
4764 case BRW_OPCODE_BFREV:
4765 case BRW_OPCODE_BFE:
4766 case BRW_OPCODE_ADD:
4767 case BRW_OPCODE_MUL:
4768 case BRW_OPCODE_AVG:
4769 case BRW_OPCODE_FRC:
4770 case BRW_OPCODE_RNDU:
4771 case BRW_OPCODE_RNDD:
4772 case BRW_OPCODE_RNDE:
4773 case BRW_OPCODE_RNDZ:
4774 case BRW_OPCODE_LZD:
4775 case BRW_OPCODE_FBH:
4776 case BRW_OPCODE_FBL:
4777 case BRW_OPCODE_CBIT:
4778 case BRW_OPCODE_SAD2:
4779 case BRW_OPCODE_MAD:
4780 case BRW_OPCODE_LRP:
4781 case FS_OPCODE_PACK:
4782 return get_fpu_lowered_simd_width(devinfo, inst);
4783
4784 case BRW_OPCODE_CMP: {
4785 /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that
4786 * when the destination is a GRF the dependency-clear bit on the flag
4787 * register is cleared early.
4788 *
4789 * Suggested workarounds are to disable coissuing CMP instructions
4790 * or to split CMP(16) instructions into two CMP(8) instructions.
4791 *
4792 * We choose to split into CMP(8) instructions since disabling
4793 * coissuing would affect CMP instructions not otherwise affected by
4794 * the errata.
4795 */
4796 const unsigned max_width = (devinfo->gen == 7 && !devinfo->is_haswell &&
4797 !inst->dst.is_null() ? 8 : ~0);
4798 return MIN2(max_width, get_fpu_lowered_simd_width(devinfo, inst));
4799 }
4800 case BRW_OPCODE_BFI1:
4801 case BRW_OPCODE_BFI2:
4802 /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
4803 * should
4804 * "Force BFI instructions to be executed always in SIMD8."
4805 */
4806 return MIN2(devinfo->is_haswell ? 8 : ~0u,
4807 get_fpu_lowered_simd_width(devinfo, inst));
4808
4809 case BRW_OPCODE_IF:
4810 assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16);
4811 return inst->exec_size;
4812
4813 case SHADER_OPCODE_RCP:
4814 case SHADER_OPCODE_RSQ:
4815 case SHADER_OPCODE_SQRT:
4816 case SHADER_OPCODE_EXP2:
4817 case SHADER_OPCODE_LOG2:
4818 case SHADER_OPCODE_SIN:
4819 case SHADER_OPCODE_COS:
4820 /* Unary extended math instructions are limited to SIMD8 on Gen4 and
4821 * Gen6.
4822 */
4823 return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
4824 devinfo->gen == 5 || devinfo->is_g4x ? MIN2(16, inst->exec_size) :
4825 MIN2(8, inst->exec_size));
4826
4827 case SHADER_OPCODE_POW:
4828 /* SIMD16 is only allowed on Gen7+. */
4829 return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
4830 MIN2(8, inst->exec_size));
4831
4832 case SHADER_OPCODE_INT_QUOTIENT:
4833 case SHADER_OPCODE_INT_REMAINDER:
4834 /* Integer division is limited to SIMD8 on all generations. */
4835 return MIN2(8, inst->exec_size);
4836
4837 case FS_OPCODE_LINTERP:
4838 case FS_OPCODE_GET_BUFFER_SIZE:
4839 case FS_OPCODE_DDX_COARSE:
4840 case FS_OPCODE_DDX_FINE:
4841 case FS_OPCODE_DDY_COARSE:
4842 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
4843 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
4844 case FS_OPCODE_PACK_HALF_2x16_SPLIT:
4845 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
4846 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
4847 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
4848 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
4849 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
4850 return MIN2(16, inst->exec_size);
4851
4852 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
4853 /* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch
4854 * message used to implement varying pull constant loads, so expand it
4855 * to SIMD16. An alternative with longer message payload length but
4856 * shorter return payload would be to use the SIMD8 sampler message that
4857 * takes (header, u, v, r) as parameters instead of (header, u).
4858 */
4859 return (devinfo->gen == 4 ? 16 : MIN2(16, inst->exec_size));
4860
4861 case FS_OPCODE_DDY_FINE:
4862 /* The implementation of this virtual opcode may require emitting
4863 * compressed Align16 instructions, which are severely limited on some
4864 * generations.
4865 *
4866 * From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
4867 * Region Restrictions):
4868 *
4869 * "In Align16 access mode, SIMD16 is not allowed for DW operations
4870 * and SIMD8 is not allowed for DF operations."
4871 *
4872 * In this context, "DW operations" means "operations acting on 32-bit
4873 * values", so it includes operations on floats.
4874 *
4875 * Gen4 has a similar restriction. From the i965 PRM, section 11.5.3
4876 * (Instruction Compression -> Rules and Restrictions):
4877 *
4878 * "A compressed instruction must be in Align1 access mode. Align16
4879 * mode instructions cannot be compressed."
4880 *
4881 * Similar text exists in the g45 PRM.
4882 *
4883 * Empirically, compressed align16 instructions using odd register
4884 * numbers don't appear to work on Sandybridge either.
4885 */
4886 return (devinfo->gen == 4 || devinfo->gen == 6 ||
4887 (devinfo->gen == 7 && !devinfo->is_haswell) ?
4888 MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size));
4889
4890 case SHADER_OPCODE_MULH:
4891 /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
4892 * is 8-wide on Gen7+.
4893 */
4894 return (devinfo->gen >= 7 ? 8 :
4895 get_fpu_lowered_simd_width(devinfo, inst));
4896
4897 case FS_OPCODE_FB_WRITE_LOGICAL:
4898 /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them
4899 * here.
4900 */
4901 assert(devinfo->gen != 6 ||
4902 inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||
4903 inst->exec_size == 8);
4904 /* Dual-source FB writes are unsupported in SIMD16 mode. */
4905 return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
4906 8 : MIN2(16, inst->exec_size));
4907
4908 case FS_OPCODE_FB_READ_LOGICAL:
4909 return MIN2(16, inst->exec_size);
4910
4911 case SHADER_OPCODE_TEX_LOGICAL:
4912 case SHADER_OPCODE_TXF_CMS_LOGICAL:
4913 case SHADER_OPCODE_TXF_UMS_LOGICAL:
4914 case SHADER_OPCODE_TXF_MCS_LOGICAL:
4915 case SHADER_OPCODE_LOD_LOGICAL:
4916 case SHADER_OPCODE_TG4_LOGICAL:
4917 case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
4918 case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
4919 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
4920 return get_sampler_lowered_simd_width(devinfo, inst);
4921
4922 case SHADER_OPCODE_TXD_LOGICAL:
4923 /* TXD is unsupported in SIMD16 mode. */
4924 return 8;
4925
4926 case SHADER_OPCODE_TXL_LOGICAL:
4927 case FS_OPCODE_TXB_LOGICAL:
4928 /* Only one execution size is representable pre-ILK depending on whether
4929 * the shadow reference argument is present.
4930 */
4931 if (devinfo->gen == 4)
4932 return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8;
4933 else
4934 return get_sampler_lowered_simd_width(devinfo, inst);
4935
4936 case SHADER_OPCODE_TXF_LOGICAL:
4937 case SHADER_OPCODE_TXS_LOGICAL:
4938 /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
4939 * messages. Use SIMD16 instead.
4940 */
4941 if (devinfo->gen == 4)
4942 return 16;
4943 else
4944 return get_sampler_lowered_simd_width(devinfo, inst);
4945
4946 case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
4947 case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
4948 case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
4949 return 8;
4950
4951 case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
4952 case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
4953 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
4954 return MIN2(16, inst->exec_size);
4955
4956 case SHADER_OPCODE_URB_READ_SIMD8:
4957 case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
4958 case SHADER_OPCODE_URB_WRITE_SIMD8:
4959 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
4960 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
4961 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
4962 return MIN2(8, inst->exec_size);
4963
4964 case SHADER_OPCODE_MOV_INDIRECT: {
4965 /* From IVB and HSW PRMs:
4966 *
4967 * "2.When the destination requires two registers and the sources are
4968 * indirect, the sources must use 1x1 regioning mode.
4969 *
4970 * In case of DF instructions in HSW/IVB, the exec_size is limited by
4971 * the EU decompression logic not handling VxH indirect addressing
4972 * correctly.
4973 */
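/* For example, on Gen8+ with a packed dword destination max_size is
 * 2 * REG_SIZE = 64 bytes, allowing up to 64 / (1 * 4) = 16 channels,
 * while on IVB/HSW the single-register limit (and the 8 available
 * address subregisters) caps the width at 8.
 */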
4974 const unsigned max_size = (devinfo->gen >= 8 ? 2 : 1) * REG_SIZE;
4975 /* Prior to Broadwell, we only have 8 address subregisters. */
4976 return MIN3(devinfo->gen >= 8 ? 16 : 8,
4977 max_size / (inst->dst.stride * type_sz(inst->dst.type)),
4978 inst->exec_size);
4979 }
4980
4981 case SHADER_OPCODE_LOAD_PAYLOAD: {
4982 const unsigned reg_count =
4983 DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
4984
4985 if (reg_count > 2) {
4986 /* Only LOAD_PAYLOAD instructions with per-channel destination region
4987 * can be easily lowered (which excludes headers and heterogeneous
4988 * types).
4989 */
4990 assert(!inst->header_size);
4991 for (unsigned i = 0; i < inst->sources; i++)
4992 assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) ||
4993 inst->src[i].file == BAD_FILE);
4994
4995 return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
4996 } else {
4997 return inst->exec_size;
4998 }
4999 }
5000 default:
5001 return inst->exec_size;
5002 }
5003 }
5004
5005 /**
5006 * Return true if splitting out the group of channels of instruction \p inst
5007 * given by lbld.group() requires allocating a temporary for the i-th source
5008 * of the lowered instruction.
5009 */
5010 static inline bool
5011 needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i)
5012 {
5013 return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
5014 (inst->components_read(i) == 1 &&
5015 lbld.dispatch_width() <= inst->exec_size));
5016 }
5017
5018 /**
5019 * Extract the data that would be consumed by the channel group given by
5020 * lbld.group() from the i-th source region of instruction \p inst and return
5021 * it as result in packed form. If any copy instructions are required they
5022 * will be emitted before the given \p inst in \p block.
5023 */
5024 static fs_reg
5025 emit_unzip(const fs_builder &lbld, bblock_t *block, fs_inst *inst,
5026 unsigned i)
5027 {
5028 /* Specified channel group from the source region. */
5029 const fs_reg src = horiz_offset(inst->src[i], lbld.group());
5030
5031 if (needs_src_copy(lbld, inst, i)) {
5032 /* Builder of the right width to perform the copy avoiding uninitialized
5033 * data if the lowered execution size is greater than the original
5034 * execution size of the instruction.
5035 */
5036 const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
5037 inst->exec_size), 0);
5038 const fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i));
5039
5040 for (unsigned k = 0; k < inst->components_read(i); ++k)
5041 cbld.at(block, inst)
5042 .MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
5043
5044 return tmp;
5045
5046 } else if (is_periodic(inst->src[i], lbld.dispatch_width())) {
5047 /* The source is invariant for all dispatch_width-wide groups of the
5048 * original region.
5049 */
5050 return inst->src[i];
5051
5052 } else {
5053 /* We can just point the lowered instruction at the right channel group
5054 * from the original region.
5055 */
5056 return src;
5057 }
5058 }
5059
5060 /**
5061 * Return true if splitting out the group of channels of instruction \p inst
5062 * given by lbld.group() requires allocating a temporary for the destination
5063 * of the lowered instruction and copying the data back to the original
5064 * destination region.
5065 */
5066 static inline bool
5067 needs_dst_copy(const fs_builder &lbld, const fs_inst *inst)
5068 {
5069 /* If the instruction writes more than one component we'll have to shuffle
5070 * the results of multiple lowered instructions in order to make sure that
5071 * they end up arranged correctly in the original destination region.
5072 */
5073 if (inst->size_written > inst->dst.component_size(inst->exec_size))
5074 return true;
5075
5076 /* If the lowered execution size is larger than the original the result of
5077 * the instruction won't fit in the original destination, so we'll have to
5078 * allocate a temporary in any case.
5079 */
5080 if (lbld.dispatch_width() > inst->exec_size)
5081 return true;
5082
5083 for (unsigned i = 0; i < inst->sources; i++) {
5084 /* If we already made a copy of the source for other reasons there won't
5085 * be any overlap with the destination.
5086 */
5087 if (needs_src_copy(lbld, inst, i))
5088 continue;
5089
5090 /* In order to keep the logic simple we emit a copy whenever the
5091 * destination region doesn't exactly match an overlapping source. Such a
5092 * mismatch may mean that the source and destination are not aligned group
5093 * by group, which could cause one of the lowered instructions to overwrite
5094 * the data read from the same source by other lowered instructions.
5095 */
5096 if (regions_overlap(inst->dst, inst->size_written,
5097 inst->src[i], inst->size_read(i)) &&
5098 !inst->dst.equals(inst->src[i]))
5099 return true;
5100 }
5101
5102 return false;
5103 }
5104
5105 /**
5106 * Insert data from a packed temporary into the channel group given by
5107 * lbld.group() of the destination region of instruction \p inst and return
5108 * the temporary as result. If any copy instructions are required they will
5109 * be emitted around the given \p inst in \p block.
5110 */
5111 static fs_reg
5112 emit_zip(const fs_builder &lbld, bblock_t *block, fs_inst *inst)
5113 {
5114 /* Builder of the right width to perform the copy avoiding uninitialized
5115 * data if the lowered execution size is greater than the original
5116 * execution size of the instruction.
5117 */
5118 const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
5119 inst->exec_size), 0);
5120
5121 /* Specified channel group from the destination region. */
5122 const fs_reg dst = horiz_offset(inst->dst, lbld.group());
5123 const unsigned dst_size = inst->size_written /
5124 inst->dst.component_size(inst->exec_size);
5125
5126 if (needs_dst_copy(lbld, inst)) {
5127 const fs_reg tmp = lbld.vgrf(inst->dst.type, dst_size);
5128
5129 if (inst->predicate) {
5130 /* Handle predication by copying the original contents of
5131 * the destination into the temporary before emitting the
5132 * lowered instruction.
5133 */
5134 for (unsigned k = 0; k < dst_size; ++k)
5135 cbld.at(block, inst)
5136 .MOV(offset(tmp, lbld, k), offset(dst, inst->exec_size, k));
5137 }
5138
5139 for (unsigned k = 0; k < dst_size; ++k)
5140 cbld.at(block, inst->next)
5141 .MOV(offset(dst, inst->exec_size, k), offset(tmp, lbld, k));
5142
5143 return tmp;
5144
5145 } else {
5146 /* No need to allocate a temporary for the lowered instruction, just
5147 * take the right group of channels from the original region.
5148 */
5149 return dst;
5150 }
5151 }
5152
5153 bool
5154 fs_visitor::lower_simd_width()
5155 {
5156 bool progress = false;
5157
5158 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
5159 const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
5160
5161 if (lower_width != inst->exec_size) {
5162 /* Builder matching the original instruction. We may also need to
5163          * emit an instruction of width larger than the original, so set the
5164          * execution size of the builder to the larger of the two for now to
5165          * make sure that both cases can be handled.
5166 */
5167 const unsigned max_width = MAX2(inst->exec_size, lower_width);
5168 const fs_builder ibld = bld.at(block, inst)
5169 .exec_all(inst->force_writemask_all)
5170 .group(max_width, inst->group / max_width);
5171
5172          /* Split the copies into chunks of the execution width of either the
5173 * original or the lowered instruction, whichever is lower.
5174 */
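         /* For example, lowering a SIMD16 instruction to SIMD8 gives
          * n = DIV_ROUND_UP(16, 8) = 2 lowered instructions, each covering its
          * own group of 8 channels.
          */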
5175 const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
5176 const unsigned dst_size = inst->size_written /
5177 inst->dst.component_size(inst->exec_size);
5178
5179 assert(!inst->writes_accumulator && !inst->mlen);
5180
5181 for (unsigned i = 0; i < n; i++) {
5182 /* Emit a copy of the original instruction with the lowered width.
5183 * If the EOT flag was set throw it away except for the last
5184 * instruction to avoid killing the thread prematurely.
5185 */
5186 fs_inst split_inst = *inst;
5187 split_inst.exec_size = lower_width;
5188 split_inst.eot = inst->eot && i == n - 1;
5189
5190 /* Select the correct channel enables for the i-th group, then
5191 * transform the sources and destination and emit the lowered
5192 * instruction.
5193 */
5194 const fs_builder lbld = ibld.group(lower_width, i);
5195
5196 for (unsigned j = 0; j < inst->sources; j++)
5197 split_inst.src[j] = emit_unzip(lbld, block, inst, j);
5198
5199 split_inst.dst = emit_zip(lbld, block, inst);
5200 split_inst.size_written =
5201 split_inst.dst.component_size(lower_width) * dst_size;
5202
5203 lbld.emit(split_inst);
5204 }
5205
5206 inst->remove(block);
5207 progress = true;
5208 }
5209 }
5210
5211 if (progress)
5212 invalidate_live_intervals();
5213
5214 return progress;
5215 }
5216
5217 void
5218 fs_visitor::dump_instructions()
5219 {
5220 dump_instructions(NULL);
5221 }
5222
5223 void
5224 fs_visitor::dump_instructions(const char *name)
5225 {
5226 FILE *file = stderr;
5227 if (name && geteuid() != 0) {
5228 file = fopen(name, "w");
5229 if (!file)
5230 file = stderr;
5231 }
5232
5233 if (cfg) {
5234 calculate_register_pressure();
5235 int ip = 0, max_pressure = 0;
5236 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
5237 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
5238 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
5239 dump_instruction(inst, file);
5240 ip++;
5241 }
5242 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
5243 } else {
5244 int ip = 0;
5245 foreach_in_list(backend_instruction, inst, &instructions) {
5246 fprintf(file, "%4d: ", ip++);
5247 dump_instruction(inst, file);
5248 }
5249 }
5250
5251 if (file != stderr) {
5252 fclose(file);
5253 }
5254 }
5255
5256 void
5257 fs_visitor::dump_instruction(backend_instruction *be_inst)
5258 {
5259 dump_instruction(be_inst, stderr);
5260 }
5261
5262 void
5263 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
5264 {
5265 fs_inst *inst = (fs_inst *)be_inst;
5266
5267 if (inst->predicate) {
5268 fprintf(file, "(%cf0.%d) ",
5269 inst->predicate_inverse ? '-' : '+',
5270 inst->flag_subreg);
5271 }
5272
5273 fprintf(file, "%s", brw_instruction_name(devinfo, inst->opcode));
5274 if (inst->saturate)
5275 fprintf(file, ".sat");
5276 if (inst->conditional_mod) {
5277 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
5278 if (!inst->predicate &&
5279 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
5280 inst->opcode != BRW_OPCODE_IF &&
5281 inst->opcode != BRW_OPCODE_WHILE))) {
5282 fprintf(file, ".f0.%d", inst->flag_subreg);
5283 }
5284 }
5285 fprintf(file, "(%d) ", inst->exec_size);
5286
5287 if (inst->mlen) {
5288 fprintf(file, "(mlen: %d) ", inst->mlen);
5289 }
5290
5291 if (inst->eot) {
5292 fprintf(file, "(EOT) ");
5293 }
5294
5295 switch (inst->dst.file) {
5296 case VGRF:
5297 fprintf(file, "vgrf%d", inst->dst.nr);
5298 break;
5299 case FIXED_GRF:
5300 fprintf(file, "g%d", inst->dst.nr);
5301 break;
5302 case MRF:
5303 fprintf(file, "m%d", inst->dst.nr);
5304 break;
5305 case BAD_FILE:
5306 fprintf(file, "(null)");
5307 break;
5308 case UNIFORM:
5309 fprintf(file, "***u%d***", inst->dst.nr);
5310 break;
5311 case ATTR:
5312 fprintf(file, "***attr%d***", inst->dst.nr);
5313 break;
5314 case ARF:
5315 switch (inst->dst.nr) {
5316 case BRW_ARF_NULL:
5317 fprintf(file, "null");
5318 break;
5319 case BRW_ARF_ADDRESS:
5320 fprintf(file, "a0.%d", inst->dst.subnr);
5321 break;
5322 case BRW_ARF_ACCUMULATOR:
5323 fprintf(file, "acc%d", inst->dst.subnr);
5324 break;
5325 case BRW_ARF_FLAG:
5326 fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
5327 break;
5328 default:
5329 fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
5330 break;
5331 }
5332 break;
5333 case IMM:
5334 unreachable("not reached");
5335 }
5336
5337 if (inst->dst.offset ||
5338 (inst->dst.file == VGRF &&
5339 alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
5340 const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
5341 fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
5342 inst->dst.offset % reg_size);
5343 }
5344
5345 if (inst->dst.stride != 1)
5346 fprintf(file, "<%u>", inst->dst.stride);
5347 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
5348
5349 for (int i = 0; i < inst->sources; i++) {
5350 if (inst->src[i].negate)
5351 fprintf(file, "-");
5352 if (inst->src[i].abs)
5353 fprintf(file, "|");
5354 switch (inst->src[i].file) {
5355 case VGRF:
5356 fprintf(file, "vgrf%d", inst->src[i].nr);
5357 break;
5358 case FIXED_GRF:
5359 fprintf(file, "g%d", inst->src[i].nr);
5360 break;
5361 case MRF:
5362 fprintf(file, "***m%d***", inst->src[i].nr);
5363 break;
5364 case ATTR:
5365 fprintf(file, "attr%d", inst->src[i].nr);
5366 break;
5367 case UNIFORM:
5368 fprintf(file, "u%d", inst->src[i].nr);
5369 break;
5370 case BAD_FILE:
5371 fprintf(file, "(null)");
5372 break;
5373 case IMM:
5374 switch (inst->src[i].type) {
5375 case BRW_REGISTER_TYPE_F:
5376 fprintf(file, "%-gf", inst->src[i].f);
5377 break;
5378 case BRW_REGISTER_TYPE_DF:
5379 fprintf(file, "%fdf", inst->src[i].df);
5380 break;
5381 case BRW_REGISTER_TYPE_W:
5382 case BRW_REGISTER_TYPE_D:
5383 fprintf(file, "%dd", inst->src[i].d);
5384 break;
5385 case BRW_REGISTER_TYPE_UW:
5386 case BRW_REGISTER_TYPE_UD:
5387 fprintf(file, "%uu", inst->src[i].ud);
5388 break;
5389 case BRW_REGISTER_TYPE_VF:
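         /* A VF immediate packs four 8-bit restricted-precision floats, one
          * per byte, which brw_vf_to_float() decodes below.
          */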
5390 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
5391 brw_vf_to_float((inst->src[i].ud >> 0) & 0xff),
5392 brw_vf_to_float((inst->src[i].ud >> 8) & 0xff),
5393 brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
5394 brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
5395 break;
5396 default:
5397 fprintf(file, "???");
5398 break;
5399 }
5400 break;
5401 case ARF:
5402 switch (inst->src[i].nr) {
5403 case BRW_ARF_NULL:
5404 fprintf(file, "null");
5405 break;
5406 case BRW_ARF_ADDRESS:
5407 fprintf(file, "a0.%d", inst->src[i].subnr);
5408 break;
5409 case BRW_ARF_ACCUMULATOR:
5410 fprintf(file, "acc%d", inst->src[i].subnr);
5411 break;
5412 case BRW_ARF_FLAG:
5413 fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
5414 break;
5415 default:
5416 fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
5417 break;
5418 }
5419 break;
5420 }
5421
5422 if (inst->src[i].offset ||
5423 (inst->src[i].file == VGRF &&
5424 alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
5425 const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
5426 fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
5427 inst->src[i].offset % reg_size);
5428 }
5429
5430 if (inst->src[i].abs)
5431 fprintf(file, "|");
5432
5433 if (inst->src[i].file != IMM) {
5434 unsigned stride;
5435 if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
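         /* ARF/FIXED_GRF regions encode the horizontal stride logarithmically:
          * hstride 0 means a stride of 0 and hstride n means 2^(n-1).
          */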
5436 unsigned hstride = inst->src[i].hstride;
5437 stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
5438 } else {
5439 stride = inst->src[i].stride;
5440 }
5441 if (stride != 1)
5442 fprintf(file, "<%u>", stride);
5443
5444 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
5445 }
5446
5447 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
5448 fprintf(file, ", ");
5449 }
5450
5451 fprintf(file, " ");
5452
5453 if (inst->force_writemask_all)
5454 fprintf(file, "NoMask ");
5455
5456 if (inst->exec_size != dispatch_width)
5457 fprintf(file, "group%d ", inst->group);
5458
5459 fprintf(file, "\n");
5460 }
5461
5462 /**
5463 * Possibly returns an instruction that set up @param reg.
5464 *
5465 * Sometimes we want to take the result of some expression/variable
5466 * dereference tree and rewrite the instruction generating the result
5467 * of the tree. When processing the tree, we know that the
5468 * instructions generated are all writing temporaries that are dead
5469 * outside of this tree. So, if we have some instructions that write
5470 * a temporary, we're free to point that temp write somewhere else.
5471 *
5472  * Note that this doesn't guarantee that the returned instruction wrote only
5473  * \p reg -- it might be the size=4 destination of a texture instruction.
5474 */
5475 fs_inst *
5476 fs_visitor::get_instruction_generating_reg(fs_inst *start,
5477 fs_inst *end,
5478 const fs_reg &reg)
5479 {
5480 if (end == start ||
5481 end->is_partial_write() ||
5482 !reg.equals(end->dst)) {
5483 return NULL;
5484 } else {
5485 return end;
5486 }
5487 }
5488
5489 void
5490 fs_visitor::setup_fs_payload_gen6()
5491 {
5492 assert(stage == MESA_SHADER_FRAGMENT);
5493 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
5494
5495 assert(devinfo->gen >= 6);
5496
5497 /* R0-1: masks, pixel X/Y coordinates. */
5498 payload.num_regs = 2;
5499    /* R2: only for 32-pixel dispatch. */
5500
5501 /* R3-26: barycentric interpolation coordinates. These appear in the
5502 * same order that they appear in the brw_barycentric_mode
5503 * enum. Each set of coordinates occupies 2 registers if dispatch width
5504 * == 8 and 4 registers if dispatch width == 16. Coordinates only
5505 * appear if they were enabled using the "Barycentric Interpolation
5506 * Mode" bits in WM_STATE.
5507 */
5508 for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
5509 if (prog_data->barycentric_interp_modes & (1 << i)) {
5510 payload.barycentric_coord_reg[i] = payload.num_regs;
5511 payload.num_regs += 2;
5512 if (dispatch_width == 16) {
5513 payload.num_regs += 2;
5514 }
5515 }
5516 }
5517
5518 /* R27: interpolated depth if uses source depth */
5519 prog_data->uses_src_depth =
5520 (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
5521 if (prog_data->uses_src_depth) {
5522 payload.source_depth_reg = payload.num_regs;
5523 payload.num_regs++;
5524 if (dispatch_width == 16) {
5525 /* R28: interpolated depth if not SIMD8. */
5526 payload.num_regs++;
5527 }
5528 }
5529
5530 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
5531 prog_data->uses_src_w =
5532 (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
5533 if (prog_data->uses_src_w) {
5534 payload.source_w_reg = payload.num_regs;
5535 payload.num_regs++;
5536 if (dispatch_width == 16) {
5537 /* R30: interpolated W if not SIMD8. */
5538 payload.num_regs++;
5539 }
5540 }
5541
5542 /* R31: MSAA position offsets. */
5543 if (prog_data->persample_dispatch &&
5544 (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS)) {
5545 /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
5546 *
5547 * "MSDISPMODE_PERSAMPLE is required in order to select
5548 * POSOFFSET_SAMPLE"
5549 *
5550 * So we can only really get sample positions if we are doing real
5551 * per-sample dispatch. If we need gl_SamplePosition and we don't have
5552 * persample dispatch, we hard-code it to 0.5.
5553 */
5554 prog_data->uses_pos_offset = true;
5555 payload.sample_pos_reg = payload.num_regs;
5556 payload.num_regs++;
5557 }
5558
5559 /* R32: MSAA input coverage mask */
5560 prog_data->uses_sample_mask =
5561 (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0;
5562 if (prog_data->uses_sample_mask) {
5563 assert(devinfo->gen >= 7);
5564 payload.sample_mask_in_reg = payload.num_regs;
5565 payload.num_regs++;
5566 if (dispatch_width == 16) {
5567 /* R33: input coverage mask if not SIMD8. */
5568 payload.num_regs++;
5569 }
5570 }
5571
5572 /* R34-: bary for 32-pixel. */
5573 /* R58-59: interp W for 32-pixel. */
5574
5575 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
5576 source_depth_to_render_target = true;
5577 }
5578 }
5579
5580 void
5581 fs_visitor::setup_vs_payload()
5582 {
5583 /* R0: thread header, R1: urb handles */
5584 payload.num_regs = 2;
5585 }
5586
5587 void
5588 fs_visitor::setup_gs_payload()
5589 {
5590 assert(stage == MESA_SHADER_GEOMETRY);
5591
5592 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
5593 struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
5594
5595 /* R0: thread header, R1: output URB handles */
5596 payload.num_regs = 2;
5597
5598 if (gs_prog_data->include_primitive_id) {
5599 /* R2: Primitive ID 0..7 */
5600 payload.num_regs++;
5601 }
5602
5603 /* Use a maximum of 24 registers for push-model inputs. */
5604 const unsigned max_push_components = 24;
5605
5606 /* If pushing our inputs would take too many registers, reduce the URB read
5607 * length (which is in HWords, or 8 registers), and resort to pulling.
5608 *
5609 * Note that the GS reads <URB Read Length> HWords for every vertex - so we
5610 * have to multiply by VerticesIn to obtain the total storage requirement.
5611 */
5612 if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in >
5613 max_push_components || gs_prog_data->invocations > 1) {
5614 gs_prog_data->base.include_vue_handles = true;
5615
5616 /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
5617 payload.num_regs += nir->info.gs.vertices_in;
5618
5619 vue_prog_data->urb_read_length =
5620 ROUND_DOWN_TO(max_push_components / nir->info.gs.vertices_in, 8) / 8;
5621 }
5622 }
5623
5624 void
5625 fs_visitor::setup_cs_payload()
5626 {
5627 assert(devinfo->gen >= 7);
5628 payload.num_regs = 1;
5629 }
5630
5631 void
5632 fs_visitor::calculate_register_pressure()
5633 {
5634 invalidate_live_intervals();
5635 calculate_live_intervals();
5636
5637 unsigned num_instructions = 0;
5638 foreach_block(block, cfg)
5639 num_instructions += block->instructions.length();
5640
5641 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
5642
5643 for (unsigned reg = 0; reg < alloc.count; reg++) {
5644 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
5645 regs_live_at_ip[ip] += alloc.sizes[reg];
5646 }
5647 }
5648
5649 /**
5650 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
5651 *
5652 * The needs_unlit_centroid_workaround ends up producing one of these per
5653 * channel of centroid input, so it's good to clean them up.
5654 *
5655 * An assumption here is that nothing ever modifies the dispatched pixels
5656 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
5657 * dictates that anyway.
5658 */
5659 bool
5660 fs_visitor::opt_drop_redundant_mov_to_flags()
5661 {
5662 bool flag_mov_found[2] = {false};
5663 bool progress = false;
5664
5665    /* The instructions removed by this pass are only ever emitted when this is true. */
5666 if (!devinfo->needs_unlit_centroid_workaround)
5667 return false;
5668
5669 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
5670 if (inst->is_control_flow()) {
5671 memset(flag_mov_found, 0, sizeof(flag_mov_found));
5672 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
5673 if (!flag_mov_found[inst->flag_subreg]) {
5674 flag_mov_found[inst->flag_subreg] = true;
5675 } else {
5676 inst->remove(block);
5677 progress = true;
5678 }
5679 } else if (inst->flags_written()) {
5680 flag_mov_found[inst->flag_subreg] = false;
5681 }
5682 }
5683
5684 return progress;
5685 }
5686
5687 void
5688 fs_visitor::optimize()
5689 {
5690 /* Start by validating the shader we currently have. */
5691 validate();
5692
5693 /* bld is the common builder object pointing at the end of the program we
5694 * used to translate it into i965 IR. For the optimization and lowering
5695 * passes coming next, any code added after the end of the program without
5696 * having explicitly called fs_builder::at() clearly points at a mistake.
5697 * Ideally optimization passes wouldn't be part of the visitor so they
5698 * wouldn't have access to bld at all, but they do, so just in case some
5699 * pass forgets to ask for a location explicitly set it to NULL here to
5700 * make it trip. The dispatch width is initialized to a bogus value to
5701 * make sure that optimizations set the execution controls explicitly to
5702 * match the code they are manipulating instead of relying on the defaults.
5703 */
5704 bld = fs_builder(this, 64);
5705
5706 assign_constant_locations();
5707 lower_constant_loads();
5708
5709 validate();
5710
5711 split_virtual_grfs();
5712 validate();
5713
5714 #define OPT(pass, args...) ({ \
5715 pass_num++; \
5716 bool this_progress = pass(args); \
5717 \
5718 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
5719 char filename[64]; \
5720 snprintf(filename, 64, "%s%d-%s-%02d-%02d-" #pass, \
5721 stage_abbrev, dispatch_width, nir->info.name, iteration, pass_num); \
5722 \
5723 backend_shader::dump_instructions(filename); \
5724 } \
5725 \
5726 validate(); \
5727 \
5728 progress = progress || this_progress; \
5729 this_progress; \
5730 })
5731
5732 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
5733 char filename[64];
5734 snprintf(filename, 64, "%s%d-%s-00-00-start",
5735 stage_abbrev, dispatch_width, nir->info.name);
5736
5737 backend_shader::dump_instructions(filename);
5738 }
5739
5740 bool progress = false;
5741 int iteration = 0;
5742 int pass_num = 0;
5743
5744 OPT(opt_drop_redundant_mov_to_flags);
5745
5746 do {
5747 progress = false;
5748 pass_num = 0;
5749 iteration++;
5750
5751 OPT(remove_duplicate_mrf_writes);
5752
5753 OPT(opt_algebraic);
5754 OPT(opt_cse);
5755 OPT(opt_copy_propagation);
5756 OPT(opt_predicated_break, this);
5757 OPT(opt_cmod_propagation);
5758 OPT(dead_code_eliminate);
5759 OPT(opt_peephole_sel);
5760 OPT(dead_control_flow_eliminate, this);
5761 OPT(opt_register_renaming);
5762 OPT(opt_saturate_propagation);
5763 OPT(register_coalesce);
5764 OPT(compute_to_mrf);
5765 OPT(eliminate_find_live_channel);
5766
5767 OPT(compact_virtual_grfs);
5768 } while (progress);
5769
5770 progress = false;
5771 pass_num = 0;
5772
5773 if (OPT(lower_pack)) {
5774 OPT(register_coalesce);
5775 OPT(dead_code_eliminate);
5776 }
5777
5778 OPT(lower_simd_width);
5779
5780 /* After SIMD lowering just in case we had to unroll the EOT send. */
5781 OPT(opt_sampler_eot);
5782
5783 OPT(lower_logical_sends);
5784
5785 if (progress) {
5786 OPT(opt_copy_propagation);
5787 /* Only run after logical send lowering because it's easier to implement
5788 * in terms of physical sends.
5789 */
5790 if (OPT(opt_zero_samples))
5791 OPT(opt_copy_propagation);
5792 /* Run after logical send lowering to give it a chance to CSE the
5793 * LOAD_PAYLOAD instructions created to construct the payloads of
5794 * e.g. texturing messages in cases where it wasn't possible to CSE the
5795 * whole logical instruction.
5796 */
5797 OPT(opt_cse);
5798 OPT(register_coalesce);
5799 OPT(compute_to_mrf);
5800 OPT(dead_code_eliminate);
5801 OPT(remove_duplicate_mrf_writes);
5802 OPT(opt_peephole_sel);
5803 }
5804
5805 OPT(opt_redundant_discard_jumps);
5806
5807 if (OPT(lower_load_payload)) {
5808 split_virtual_grfs();
5809 OPT(register_coalesce);
5810 OPT(compute_to_mrf);
5811 OPT(dead_code_eliminate);
5812 }
5813
5814 OPT(opt_combine_constants);
5815 OPT(lower_integer_multiplication);
5816
5817 if (devinfo->gen <= 5 && OPT(lower_minmax)) {
5818 OPT(opt_cmod_propagation);
5819 OPT(opt_cse);
5820 OPT(opt_copy_propagation);
5821 OPT(dead_code_eliminate);
5822 }
5823
5824 if (OPT(lower_conversions)) {
5825 OPT(opt_copy_propagation);
5826 OPT(dead_code_eliminate);
5827 OPT(lower_simd_width);
5828 }
5829
5830 lower_uniform_pull_constant_loads();
5831
5832 validate();
5833 }
5834
5835 /**
5836  * Three-source instructions must have a GRF/MRF destination register.
5837 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
5838 */
5839 void
5840 fs_visitor::fixup_3src_null_dest()
5841 {
5842 bool progress = false;
5843
5844 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
5845 if (inst->is_3src(devinfo) && inst->dst.is_null()) {
5846 inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
5847 inst->dst.type);
5848 progress = true;
5849 }
5850 }
5851
5852 if (progress)
5853 invalidate_live_intervals();
5854 }
5855
5856 void
5857 fs_visitor::allocate_registers(bool allow_spilling)
5858 {
5859 bool allocated_without_spills;
5860
5861 static const enum instruction_scheduler_mode pre_modes[] = {
5862 SCHEDULE_PRE,
5863 SCHEDULE_PRE_NON_LIFO,
5864 SCHEDULE_PRE_LIFO,
5865 };
5866
5867 bool spill_all = allow_spilling && (INTEL_DEBUG & DEBUG_SPILL_FS);
5868
5869 /* Try each scheduling heuristic to see if it can successfully register
5870 * allocate without spilling. They should be ordered by decreasing
5871 * performance but increasing likelihood of allocating.
5872 */
5873 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
5874 schedule_instructions(pre_modes[i]);
5875
5876 if (0) {
5877 assign_regs_trivial();
5878 allocated_without_spills = true;
5879 } else {
5880 allocated_without_spills = assign_regs(false, spill_all);
5881 }
5882 if (allocated_without_spills)
5883 break;
5884 }
5885
5886 if (!allocated_without_spills) {
5887 if (!allow_spilling)
5888 fail("Failure to register allocate and spilling is not allowed.");
5889
5890 /* We assume that any spilling is worse than just dropping back to
5891 * SIMD8. There's probably actually some intermediate point where
5892 * SIMD16 with a couple of spills is still better.
5893 */
5894 if (dispatch_width > min_dispatch_width) {
5895 fail("Failure to register allocate. Reduce number of "
5896 "live scalar values to avoid this.");
5897 } else {
5898 compiler->shader_perf_log(log_data,
5899 "%s shader triggered register spilling. "
5900 "Try reducing the number of live scalar "
5901 "values to improve performance.\n",
5902 stage_name);
5903 }
5904
5905 /* Since we're out of heuristics, just go spill registers until we
5906 * get an allocation.
5907 */
5908 while (!assign_regs(true, spill_all)) {
5909 if (failed)
5910 break;
5911 }
5912 }
5913
5914 /* This must come after all optimization and register allocation, since
5915 * it inserts dead code that happens to have side effects, and it does
5916 * so based on the actual physical registers in use.
5917 */
5918 insert_gen4_send_dependency_workarounds();
5919
5920 if (failed)
5921 return;
5922
5923 schedule_instructions(SCHEDULE_POST);
5924
5925 if (last_scratch > 0) {
5926 MAYBE_UNUSED unsigned max_scratch_size = 2 * 1024 * 1024;
5927
5928 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
5929
5930 if (stage == MESA_SHADER_COMPUTE) {
5931 if (devinfo->is_haswell) {
5932 /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
5933 * field documentation, Haswell supports a minimum of 2kB of
5934 * scratch space for compute shaders, unlike every other stage
5935 * and platform.
5936 */
5937 prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048);
5938 } else if (devinfo->gen <= 7) {
5939 /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
5940 * field documentation, platforms prior to Haswell measure scratch
5941 * size linearly with a range of [1kB, 12kB] and 1kB granularity.
5942 */
5943 prog_data->total_scratch = ALIGN(last_scratch, 1024);
5944 max_scratch_size = 12 * 1024;
5945 }
5946 }
5947
5948 /* We currently only support up to 2MB of scratch space. If we
5949 * need to support more eventually, the documentation suggests
5950 * that we could allocate a larger buffer, and partition it out
5951 * ourselves. We'd just have to undo the hardware's address
5952 * calculation by subtracting (FFTID * Per Thread Scratch Space)
5953        * and then adding FFTID * (Larger Per Thread Scratch Space).
5954 *
5955 * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
5956 * Thread Group Tracking > Local Memory/Scratch Space.
5957 */
5958 assert(prog_data->total_scratch < max_scratch_size);
5959 }
5960 }
5961
5962 bool
5963 fs_visitor::run_vs(gl_clip_plane *clip_planes)
5964 {
5965 assert(stage == MESA_SHADER_VERTEX);
5966
5967 setup_vs_payload();
5968
5969 if (shader_time_index >= 0)
5970 emit_shader_time_begin();
5971
5972 emit_nir_code();
5973
5974 if (failed)
5975 return false;
5976
5977 compute_clip_distance(clip_planes);
5978
5979 emit_urb_writes();
5980
5981 if (shader_time_index >= 0)
5982 emit_shader_time_end();
5983
5984 calculate_cfg();
5985
5986 optimize();
5987
5988 assign_curb_setup();
5989 assign_vs_urb_setup();
5990
5991 fixup_3src_null_dest();
5992 allocate_registers(true);
5993
5994 return !failed;
5995 }
5996
5997 bool
5998 fs_visitor::run_tcs_single_patch()
5999 {
6000 assert(stage == MESA_SHADER_TESS_CTRL);
6001
6002 struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
6003
6004 /* r1-r4 contain the ICP handles. */
6005 payload.num_regs = 5;
6006
6007 if (shader_time_index >= 0)
6008 emit_shader_time_begin();
6009
6010 /* Initialize gl_InvocationID */
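   /* The UV immediate 0x76543210 is a packed vector of 4-bit values 7..0, so
    * each of the 8 channels starts out holding its own channel index.
    */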
6011 fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);
6012 fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);
6013 bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
6014 bld.MOV(channels_ud, channels_uw);
6015
6016 if (tcs_prog_data->instances == 1) {
6017 invocation_id = channels_ud;
6018 } else {
6019 invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
6020
6021 /* Get instance number from g0.2 bits 23:17, and multiply it by 8. */
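      /* Masking bits 23:17 and shifting right by 17 - 3 = 14 extracts the
       * instance number already scaled by 2^3 = 8, avoiding a separate MUL.
       */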
6022 fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
6023 fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);
6024 bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
6025 brw_imm_ud(INTEL_MASK(23, 17)));
6026 bld.SHR(instance_times_8, t, brw_imm_ud(17 - 3));
6027
6028 bld.ADD(invocation_id, instance_times_8, channels_ud);
6029 }
6030
6031    /* Fix the dispatch mask */
6032 if (nir->info.tess.tcs_vertices_out % 8) {
6033 bld.CMP(bld.null_reg_ud(), invocation_id,
6034 brw_imm_ud(nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L);
6035 bld.IF(BRW_PREDICATE_NORMAL);
6036 }
6037
6038 emit_nir_code();
6039
6040 if (nir->info.tess.tcs_vertices_out % 8) {
6041 bld.emit(BRW_OPCODE_ENDIF);
6042 }
6043
6044 /* Emit EOT write; set TR DS Cache bit */
6045 fs_reg srcs[3] = {
6046 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
6047 fs_reg(brw_imm_ud(WRITEMASK_X << 16)),
6048 fs_reg(brw_imm_ud(0)),
6049 };
6050 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
6051 bld.LOAD_PAYLOAD(payload, srcs, 3, 2);
6052
6053 fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
6054 bld.null_reg_ud(), payload);
6055 inst->mlen = 3;
6056 inst->eot = true;
6057
6058 if (shader_time_index >= 0)
6059 emit_shader_time_end();
6060
6061 if (failed)
6062 return false;
6063
6064 calculate_cfg();
6065
6066 optimize();
6067
6068 assign_curb_setup();
6069 assign_tcs_single_patch_urb_setup();
6070
6071 fixup_3src_null_dest();
6072 allocate_registers(true);
6073
6074 return !failed;
6075 }
6076
6077 bool
6078 fs_visitor::run_tes()
6079 {
6080 assert(stage == MESA_SHADER_TESS_EVAL);
6081
6082 /* R0: thread header, R1-3: gl_TessCoord.xyz, R4: URB handles */
6083 payload.num_regs = 5;
6084
6085 if (shader_time_index >= 0)
6086 emit_shader_time_begin();
6087
6088 emit_nir_code();
6089
6090 if (failed)
6091 return false;
6092
6093 emit_urb_writes();
6094
6095 if (shader_time_index >= 0)
6096 emit_shader_time_end();
6097
6098 calculate_cfg();
6099
6100 optimize();
6101
6102 assign_curb_setup();
6103 assign_tes_urb_setup();
6104
6105 fixup_3src_null_dest();
6106 allocate_registers(true);
6107
6108 return !failed;
6109 }
6110
6111 bool
6112 fs_visitor::run_gs()
6113 {
6114 assert(stage == MESA_SHADER_GEOMETRY);
6115
6116 setup_gs_payload();
6117
6118 this->final_gs_vertex_count = vgrf(glsl_type::uint_type);
6119
6120 if (gs_compile->control_data_header_size_bits > 0) {
6121 /* Create a VGRF to store accumulated control data bits. */
6122 this->control_data_bits = vgrf(glsl_type::uint_type);
6123
6124 /* If we're outputting more than 32 control data bits, then EmitVertex()
6125 * will set control_data_bits to 0 after emitting the first vertex.
6126 * Otherwise, we need to initialize it to 0 here.
6127 */
6128 if (gs_compile->control_data_header_size_bits <= 32) {
6129 const fs_builder abld = bld.annotate("initialize control data bits");
6130 abld.MOV(this->control_data_bits, brw_imm_ud(0u));
6131 }
6132 }
6133
6134 if (shader_time_index >= 0)
6135 emit_shader_time_begin();
6136
6137 emit_nir_code();
6138
6139 emit_gs_thread_end();
6140
6141 if (shader_time_index >= 0)
6142 emit_shader_time_end();
6143
6144 if (failed)
6145 return false;
6146
6147 calculate_cfg();
6148
6149 optimize();
6150
6151 assign_curb_setup();
6152 assign_gs_urb_setup();
6153
6154 fixup_3src_null_dest();
6155 allocate_registers(true);
6156
6157 return !failed;
6158 }
6159
6160 bool
6161 fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
6162 {
6163 struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
6164 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
6165
6166 assert(stage == MESA_SHADER_FRAGMENT);
6167
6168 if (devinfo->gen >= 6)
6169 setup_fs_payload_gen6();
6170 else
6171 setup_fs_payload_gen4();
6172
6173 if (0) {
6174 emit_dummy_fs();
6175 } else if (do_rep_send) {
6176 assert(dispatch_width == 16);
6177 emit_repclear_shader();
6178 } else {
6179 if (shader_time_index >= 0)
6180 emit_shader_time_begin();
6181
6182 calculate_urb_setup();
6183 if (nir->info.inputs_read > 0 ||
6184 (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
6185 if (devinfo->gen < 6)
6186 emit_interpolation_setup_gen4();
6187 else
6188 emit_interpolation_setup_gen6();
6189 }
6190
6191 /* We handle discards by keeping track of the still-live pixels in f0.1.
6192 * Initialize it with the dispatched pixels.
6193 */
6194 if (wm_prog_data->uses_kill) {
6195 fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
6196 discard_init->flag_subreg = 1;
6197 }
6198
6199 /* Generate FS IR for main(). (the visitor only descends into
6200 * functions called "main").
6201 */
6202 emit_nir_code();
6203
6204 if (failed)
6205 return false;
6206
6207 if (wm_prog_data->uses_kill)
6208 bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
6209
6210 if (wm_key->alpha_test_func)
6211 emit_alpha_test();
6212
6213 emit_fb_writes();
6214
6215 if (shader_time_index >= 0)
6216 emit_shader_time_end();
6217
6218 calculate_cfg();
6219
6220 optimize();
6221
6222 assign_curb_setup();
6223 assign_urb_setup();
6224
6225 fixup_3src_null_dest();
6226 allocate_registers(allow_spilling);
6227
6228 if (failed)
6229 return false;
6230 }
6231
6232 return !failed;
6233 }
6234
6235 bool
6236 fs_visitor::run_cs()
6237 {
6238 assert(stage == MESA_SHADER_COMPUTE);
6239
6240 setup_cs_payload();
6241
6242 if (shader_time_index >= 0)
6243 emit_shader_time_begin();
6244
6245 if (devinfo->is_haswell && prog_data->total_shared > 0) {
6246 /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
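      /* Copying the high word of g0.0 into the low word of sr0.1 places bits
       * 27:24 of the former at bit positions 11:8 of the latter.
       */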
6247 const fs_builder abld = bld.exec_all().group(1, 0);
6248 abld.MOV(retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW),
6249 suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));
6250 }
6251
6252 emit_nir_code();
6253
6254 if (failed)
6255 return false;
6256
6257 emit_cs_terminate();
6258
6259 if (shader_time_index >= 0)
6260 emit_shader_time_end();
6261
6262 calculate_cfg();
6263
6264 optimize();
6265
6266 assign_curb_setup();
6267
6268 fixup_3src_null_dest();
6269 allocate_registers(true);
6270
6271 if (failed)
6272 return false;
6273
6274 return !failed;
6275 }
6276
6277 /**
6278 * Return a bitfield where bit n is set if barycentric interpolation mode n
6279 * (see enum brw_barycentric_mode) is needed by the fragment shader.
6280 *
6281 * We examine the load_barycentric intrinsics rather than looking at input
6282 * variables so that we catch interpolateAtCentroid() messages too, which
6283 * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
6284 */
6285 static unsigned
6286 brw_compute_barycentric_interp_modes(const struct gen_device_info *devinfo,
6287 const nir_shader *shader)
6288 {
6289 unsigned barycentric_interp_modes = 0;
6290
6291 nir_foreach_function(f, shader) {
6292 if (!f->impl)
6293 continue;
6294
6295 nir_foreach_block(block, f->impl) {
6296 nir_foreach_instr(instr, block) {
6297 if (instr->type != nir_instr_type_intrinsic)
6298 continue;
6299
6300 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
6301 if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
6302 continue;
6303
6304 /* Ignore WPOS; it doesn't require interpolation. */
6305 if (nir_intrinsic_base(intrin) == VARYING_SLOT_POS)
6306 continue;
6307
6308 intrin = nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
6309 enum glsl_interp_mode interp = (enum glsl_interp_mode)
6310 nir_intrinsic_interp_mode(intrin);
6311 nir_intrinsic_op bary_op = intrin->intrinsic;
6312 enum brw_barycentric_mode bary =
6313 brw_barycentric_mode(interp, bary_op);
6314
6315 barycentric_interp_modes |= 1 << bary;
6316
6317 if (devinfo->needs_unlit_centroid_workaround &&
6318 bary_op == nir_intrinsic_load_barycentric_centroid)
6319 barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
6320 }
6321 }
6322 }
6323
6324 return barycentric_interp_modes;
6325 }
6326
6327 static void
6328 brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
6329 const nir_shader *shader)
6330 {
6331 prog_data->flat_inputs = 0;
6332
6333 nir_foreach_variable(var, &shader->inputs) {
6334 int input_index = prog_data->urb_setup[var->data.location];
6335
6336 if (input_index < 0)
6337 continue;
6338
6339 /* flat shading */
6340 if (var->data.interpolation == INTERP_MODE_FLAT)
6341 prog_data->flat_inputs |= (1 << input_index);
6342 }
6343 }
6344
6345 static uint8_t
6346 computed_depth_mode(const nir_shader *shader)
6347 {
6348 if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
6349 switch (shader->info.fs.depth_layout) {
6350 case FRAG_DEPTH_LAYOUT_NONE:
6351 case FRAG_DEPTH_LAYOUT_ANY:
6352 return BRW_PSCDEPTH_ON;
6353 case FRAG_DEPTH_LAYOUT_GREATER:
6354 return BRW_PSCDEPTH_ON_GE;
6355 case FRAG_DEPTH_LAYOUT_LESS:
6356 return BRW_PSCDEPTH_ON_LE;
6357 case FRAG_DEPTH_LAYOUT_UNCHANGED:
6358 return BRW_PSCDEPTH_OFF;
6359 }
6360 }
6361 return BRW_PSCDEPTH_OFF;
6362 }
6363
6364 /**
6365 * Move load_interpolated_input with simple (payload-based) barycentric modes
6366 * to the top of the program so we don't emit multiple PLNs for the same input.
6367 *
6368 * This works around CSE not being able to handle non-dominating cases
6369 * such as:
6370 *
6371 * if (...) {
6372 * interpolate input
6373 * } else {
6374 * interpolate the same exact input
6375 * }
6376 *
6377 * This should be replaced by global value numbering someday.
6378 */
6379 static bool
6380 move_interpolation_to_top(nir_shader *nir)
6381 {
6382 bool progress = false;
6383
6384 nir_foreach_function(f, nir) {
6385 if (!f->impl)
6386 continue;
6387
6388 nir_block *top = nir_start_block(f->impl);
6389 exec_node *cursor_node = NULL;
6390
6391 nir_foreach_block(block, f->impl) {
6392 if (block == top)
6393 continue;
6394
6395 nir_foreach_instr_safe(instr, block) {
6396 if (instr->type != nir_instr_type_intrinsic)
6397 continue;
6398
6399 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
6400 if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
6401 continue;
6402 nir_intrinsic_instr *bary_intrinsic =
6403 nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
6404 nir_intrinsic_op op = bary_intrinsic->intrinsic;
6405
6406 /* Leave interpolateAtSample/Offset() where they are. */
6407 if (op == nir_intrinsic_load_barycentric_at_sample ||
6408 op == nir_intrinsic_load_barycentric_at_offset)
6409 continue;
6410
6411 nir_instr *move[3] = {
6412 &bary_intrinsic->instr,
6413 intrin->src[1].ssa->parent_instr,
6414 instr
6415 };
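         /* Move the barycentric setup, the input offset and the load itself in
          * that order, so each definition still precedes the load once they
          * all land in the top block.
          */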
6416
6417 for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
6418 if (move[i]->block != top) {
6419 move[i]->block = top;
6420 exec_node_remove(&move[i]->node);
6421 if (cursor_node) {
6422 exec_node_insert_after(cursor_node, &move[i]->node);
6423 } else {
6424 exec_list_push_head(&top->instr_list, &move[i]->node);
6425 }
6426 cursor_node = &move[i]->node;
6427 progress = true;
6428 }
6429 }
6430 }
6431 }
6432 nir_metadata_preserve(f->impl, (nir_metadata)
6433 ((unsigned) nir_metadata_block_index |
6434 (unsigned) nir_metadata_dominance));
6435 }
6436
6437 return progress;
6438 }
6439
6440 /**
6441 * Demote per-sample barycentric intrinsics to centroid.
6442 *
6443 * Useful when rendering to a non-multisampled buffer.
6444 */
6445 static bool
6446 demote_sample_qualifiers(nir_shader *nir)
6447 {
6448    bool progress = false;
6449
6450 nir_foreach_function(f, nir) {
6451 if (!f->impl)
6452 continue;
6453
6454 nir_builder b;
6455 nir_builder_init(&b, f->impl);
6456
6457 nir_foreach_block(block, f->impl) {
6458 nir_foreach_instr_safe(instr, block) {
6459 if (instr->type != nir_instr_type_intrinsic)
6460 continue;
6461
6462 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
6463 if (intrin->intrinsic != nir_intrinsic_load_barycentric_sample &&
6464 intrin->intrinsic != nir_intrinsic_load_barycentric_at_sample)
6465 continue;
6466
6467 b.cursor = nir_before_instr(instr);
6468 nir_ssa_def *centroid =
6469 nir_load_barycentric(&b, nir_intrinsic_load_barycentric_centroid,
6470 nir_intrinsic_interp_mode(intrin));
6471 nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
6472 nir_src_for_ssa(centroid));
6473 nir_instr_remove(instr);
6474 progress = true;
6475 }
6476 }
6477
6478 nir_metadata_preserve(f->impl, (nir_metadata)
6479 ((unsigned) nir_metadata_block_index |
6480 (unsigned) nir_metadata_dominance));
6481 }
6482
6483 return progress;
6484 }
6485
6486 /**
6487 * Pre-gen6, the register file of the EUs was shared between threads,
6488 * and each thread used some subset allocated on a 16-register block
6489 * granularity. The unit states wanted these block counts.
6490 */
6491 static inline int
6492 brw_register_blocks(int reg_count)
6493 {
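   /* E.g. reg_count = 17 aligns to 32 and yields 32 / 16 - 1 = 1, the
    * encoding for two 16-register blocks.
    */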
6494 return ALIGN(reg_count, 16) / 16 - 1;
6495 }
6496
6497 const unsigned *
6498 brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
6499 void *mem_ctx,
6500 const struct brw_wm_prog_key *key,
6501 struct brw_wm_prog_data *prog_data,
6502 const nir_shader *src_shader,
6503 struct gl_program *prog,
6504 int shader_time_index8, int shader_time_index16,
6505 bool allow_spilling,
6506 bool use_rep_send, struct brw_vue_map *vue_map,
6507 unsigned *final_assembly_size,
6508 char **error_str)
6509 {
6510 const struct gen_device_info *devinfo = compiler->devinfo;
6511
6512 nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
6513 shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
6514 brw_nir_lower_fs_inputs(shader, devinfo, key);
6515 brw_nir_lower_fs_outputs(shader);
6516
6517 if (devinfo->gen < 6) {
6518 brw_setup_vue_interpolation(vue_map, shader, prog_data, devinfo);
6519 }
6520
6521 if (!key->multisample_fbo)
6522 NIR_PASS_V(shader, demote_sample_qualifiers);
6523 NIR_PASS_V(shader, move_interpolation_to_top);
6524 shader = brw_postprocess_nir(shader, compiler, true);
6525
6526 /* key->alpha_test_func means simulating alpha testing via discards,
6527 * so the shader definitely kills pixels.
6528 */
6529 prog_data->uses_kill = shader->info.fs.uses_discard ||
6530 key->alpha_test_func;
6531 prog_data->uses_omask = key->multisample_fbo &&
6532 shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
6533 prog_data->computed_depth_mode = computed_depth_mode(shader);
6534 prog_data->computed_stencil =
6535 shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
6536
6537 prog_data->persample_dispatch =
6538 key->multisample_fbo &&
6539 (key->persample_interp ||
6540 (shader->info.system_values_read & (SYSTEM_BIT_SAMPLE_ID |
6541 SYSTEM_BIT_SAMPLE_POS)) ||
6542 shader->info.fs.uses_sample_qualifier ||
6543 shader->info.outputs_read);
6544
6545 prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
6546 prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
6547 prog_data->inner_coverage = shader->info.fs.inner_coverage;
6548
6549 prog_data->barycentric_interp_modes =
6550 brw_compute_barycentric_interp_modes(compiler->devinfo, shader);
6551
6552 cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL;
6553 uint8_t simd8_grf_start = 0, simd16_grf_start = 0;
6554 unsigned simd8_grf_used = 0, simd16_grf_used = 0;
6555
6556 fs_visitor v8(compiler, log_data, mem_ctx, key,
6557 &prog_data->base, prog, shader, 8,
6558 shader_time_index8);
6559 if (!v8.run_fs(allow_spilling, false /* do_rep_send */)) {
6560 if (error_str)
6561 *error_str = ralloc_strdup(mem_ctx, v8.fail_msg);
6562
6563 return NULL;
6564 } else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) {
6565 simd8_cfg = v8.cfg;
6566 simd8_grf_start = v8.payload.num_regs;
6567 simd8_grf_used = v8.grf_used;
6568 }
6569
6570 if (v8.max_dispatch_width >= 16 &&
6571 likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) {
6572 /* Try a SIMD16 compile */
6573 fs_visitor v16(compiler, log_data, mem_ctx, key,
6574 &prog_data->base, prog, shader, 16,
6575 shader_time_index16);
6576 v16.import_uniforms(&v8);
6577 if (!v16.run_fs(allow_spilling, use_rep_send)) {
6578 compiler->shader_perf_log(log_data,
6579 "SIMD16 shader failed to compile: %s",
6580 v16.fail_msg);
6581 } else {
6582 simd16_cfg = v16.cfg;
6583 simd16_grf_start = v16.payload.num_regs;
6584 simd16_grf_used = v16.grf_used;
6585 }
6586 }
6587
6588 /* When the caller requests a repclear shader, they want SIMD16-only */
6589 if (use_rep_send)
6590 simd8_cfg = NULL;
6591
6592 /* Prior to Iron Lake, the PS had a single shader offset with a jump table
6593 * at the top to select the shader. We've never implemented that.
6594 * Instead, we just give them exactly one shader and we pick the widest one
6595 * available.
6596 */
6597 if (compiler->devinfo->gen < 5 && simd16_cfg)
6598 simd8_cfg = NULL;
6599
6600 if (prog_data->persample_dispatch) {
6601 /* Starting with SandyBridge (where we first get MSAA), the different
6602 * pixel dispatch combinations are grouped into classifications A
6603 * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1). On all hardware
6604 * generations, the only configurations supporting persample dispatch
6605        * are those in which only one dispatch width is enabled.
6606 *
6607 * If computed depth is enabled, SNB only allows SIMD8 while IVB+
6608 * allow SIMD8 or SIMD16 so we choose SIMD16 if available.
6609 */
6610 if (compiler->devinfo->gen == 6 &&
6611 prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) {
6612 simd16_cfg = NULL;
6613 } else if (simd16_cfg) {
6614 simd8_cfg = NULL;
6615 }
6616 }
6617
6618 /* We have to compute the flat inputs after the visitor is finished running
6619 * because it relies on prog_data->urb_setup which is computed in
6620 * fs_visitor::calculate_urb_setup().
6621 */
6622 brw_compute_flat_inputs(prog_data, shader);
6623
6624 fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base,
6625 v8.promoted_constants, v8.runtime_check_aads_emit,
6626 MESA_SHADER_FRAGMENT);
6627
6628 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
6629 g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s",
6630 shader->info.label ?
6631 shader->info.label : "unnamed",
6632 shader->info.name));
6633 }
6634
6635 if (simd8_cfg) {
6636 prog_data->dispatch_8 = true;
6637 g.generate_code(simd8_cfg, 8);
6638 prog_data->base.dispatch_grf_start_reg = simd8_grf_start;
6639 prog_data->reg_blocks_0 = brw_register_blocks(simd8_grf_used);
6640
6641 if (simd16_cfg) {
6642 prog_data->dispatch_16 = true;
6643 prog_data->prog_offset_2 = g.generate_code(simd16_cfg, 16);
6644 prog_data->dispatch_grf_start_reg_2 = simd16_grf_start;
6645 prog_data->reg_blocks_2 = brw_register_blocks(simd16_grf_used);
6646 }
6647 } else if (simd16_cfg) {
6648 prog_data->dispatch_16 = true;
6649 g.generate_code(simd16_cfg, 16);
6650 prog_data->base.dispatch_grf_start_reg = simd16_grf_start;
6651 prog_data->reg_blocks_0 = brw_register_blocks(simd16_grf_used);
6652 }
6653
6654 return g.get_assembly(final_assembly_size);
6655 }
6656
6657 fs_reg *
6658 fs_visitor::emit_cs_work_group_id_setup()
6659 {
6660 assert(stage == MESA_SHADER_COMPUTE);
6661
6662 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
6663
6664 struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
6665 struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD));
6666 struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD));
6667
6668 bld.MOV(*reg, r0_1);
6669 bld.MOV(offset(*reg, bld, 1), r0_6);
6670 bld.MOV(offset(*reg, bld, 2), r0_7);
6671
6672 return reg;
6673 }
6674
6675 static void
6676 fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords)
6677 {
6678 block->dwords = dwords;
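   /* A GRF register holds 8 dwords (32 bytes), so round the dword count up to
    * whole registers and then convert to bytes.
    */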
6679 block->regs = DIV_ROUND_UP(dwords, 8);
6680 block->size = block->regs * 32;
6681 }
6682
6683 static void
6684 cs_fill_push_const_info(const struct gen_device_info *devinfo,
6685 struct brw_cs_prog_data *cs_prog_data)
6686 {
6687 const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
6688 bool fill_thread_id =
6689 cs_prog_data->thread_local_id_index >= 0 &&
6690 cs_prog_data->thread_local_id_index < (int)prog_data->nr_params;
6691 bool cross_thread_supported = devinfo->gen > 7 || devinfo->is_haswell;
6692
6693 /* The thread ID should be stored in the last param dword */
6694 assert(prog_data->nr_params > 0 || !fill_thread_id);
6695 assert(!fill_thread_id ||
6696 cs_prog_data->thread_local_id_index ==
6697 (int)prog_data->nr_params - 1);
6698
6699 unsigned cross_thread_dwords, per_thread_dwords;
6700 if (!cross_thread_supported) {
6701 cross_thread_dwords = 0u;
6702 per_thread_dwords = prog_data->nr_params;
6703 } else if (fill_thread_id) {
6704 /* Fill all but the last register with cross-thread payload */
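      /* Rounding thread_local_id_index down to a multiple of 8 keeps only
       * whole 8-dword registers in the cross-thread block, so the per-thread
       * block holding the thread-local ID starts on a register boundary.
       */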
6705 cross_thread_dwords = 8 * (cs_prog_data->thread_local_id_index / 8);
6706 per_thread_dwords = prog_data->nr_params - cross_thread_dwords;
6707 assert(per_thread_dwords > 0 && per_thread_dwords <= 8);
6708 } else {
6709 /* Fill all data using cross-thread payload */
6710 cross_thread_dwords = prog_data->nr_params;
6711 per_thread_dwords = 0u;
6712 }
6713
6714 fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
6715 fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);
6716
6717 unsigned total_dwords =
6718 (cs_prog_data->push.per_thread.size * cs_prog_data->threads +
6719 cs_prog_data->push.cross_thread.size) / 4;
6720 fill_push_const_block_info(&cs_prog_data->push.total, total_dwords);
6721
6722 assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
6723 cs_prog_data->push.per_thread.size == 0);
6724 assert(cs_prog_data->push.cross_thread.dwords +
6725 cs_prog_data->push.per_thread.dwords ==
6726 prog_data->nr_params);
6727 }
6728
6729 static void
6730 cs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, unsigned size)
6731 {
6732 cs_prog_data->simd_size = size;
6733 unsigned group_size = cs_prog_data->local_size[0] *
6734 cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
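   /* Round up: e.g. a 40-invocation workgroup at SIMD16 needs
    * (40 + 15) / 16 = 3 hardware threads.
    */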
6735 cs_prog_data->threads = (group_size + size - 1) / size;
6736 }
6737
6738 const unsigned *
6739 brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
6740 void *mem_ctx,
6741 const struct brw_cs_prog_key *key,
6742 struct brw_cs_prog_data *prog_data,
6743 const nir_shader *src_shader,
6744 int shader_time_index,
6745 unsigned *final_assembly_size,
6746 char **error_str)
6747 {
6748 nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
6749 shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
6750 brw_nir_lower_cs_shared(shader);
6751 prog_data->base.total_shared += shader->num_shared;
6752
6753 /* Now that we cloned the nir_shader, we can update num_uniforms based on
6754 * the thread_local_id_index.
6755 */
6756 assert(prog_data->thread_local_id_index >= 0);
6757 shader->num_uniforms =
6758 MAX2(shader->num_uniforms,
6759 (unsigned)4 * (prog_data->thread_local_id_index + 1));
6760
6761 brw_nir_lower_intrinsics(shader, &prog_data->base);
6762 shader = brw_postprocess_nir(shader, compiler, true);
6763
6764 prog_data->local_size[0] = shader->info.cs.local_size[0];
6765 prog_data->local_size[1] = shader->info.cs.local_size[1];
6766 prog_data->local_size[2] = shader->info.cs.local_size[2];
6767 unsigned local_workgroup_size =
6768 shader->info.cs.local_size[0] * shader->info.cs.local_size[1] *
6769 shader->info.cs.local_size[2];
6770
6771 unsigned max_cs_threads = compiler->devinfo->max_cs_threads;
6772 unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads);
6773
6774 cfg_t *cfg = NULL;
6775 const char *fail_msg = NULL;
6776
6777 /* Now the main event: Visit the shader IR and generate our CS IR for it.
6778 */
6779 fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base,
6780 NULL, /* Never used in core profile */
6781 shader, 8, shader_time_index);
6782 if (simd_required <= 8) {
6783 if (!v8.run_cs()) {
6784 fail_msg = v8.fail_msg;
6785 } else {
6786 cfg = v8.cfg;
6787 cs_set_simd_size(prog_data, 8);
6788 cs_fill_push_const_info(compiler->devinfo, prog_data);
6789 prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs;
6790 }
6791 }
6792
6793 fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base,
6794 NULL, /* Never used in core profile */
6795 shader, 16, shader_time_index);
6796 if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
6797 !fail_msg && v8.max_dispatch_width >= 16 &&
6798 simd_required <= 16) {
6799 /* Try a SIMD16 compile */
6800 if (simd_required <= 8)
6801 v16.import_uniforms(&v8);
6802 if (!v16.run_cs()) {
6803 compiler->shader_perf_log(log_data,
6804 "SIMD16 shader failed to compile: %s",
6805 v16.fail_msg);
6806 if (!cfg) {
6807 fail_msg =
6808 "Couldn't generate SIMD16 program and not "
6809 "enough threads for SIMD8";
6810 }
6811 } else {
6812 cfg = v16.cfg;
6813 cs_set_simd_size(prog_data, 16);
6814 cs_fill_push_const_info(compiler->devinfo, prog_data);
6815 prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
6816 }
6817 }
6818
6819 fs_visitor v32(compiler, log_data, mem_ctx, key, &prog_data->base,
6820 NULL, /* Never used in core profile */
6821 shader, 32, shader_time_index);
6822 if (!fail_msg && v8.max_dispatch_width >= 32 &&
6823 (simd_required > 16 || (INTEL_DEBUG & DEBUG_DO32))) {
6824 /* Try a SIMD32 compile */
6825 if (simd_required <= 8)
6826 v32.import_uniforms(&v8);
6827 else if (simd_required <= 16)
6828 v32.import_uniforms(&v16);
6829
6830 if (!v32.run_cs()) {
6831 compiler->shader_perf_log(log_data,
6832 "SIMD32 shader failed to compile: %s",
6833                                   v32.fail_msg);
6834 if (!cfg) {
6835 fail_msg =
6836 "Couldn't generate SIMD32 program and not "
6837 "enough threads for SIMD16";
6838 }
6839 } else {
6840 cfg = v32.cfg;
6841 cs_set_simd_size(prog_data, 32);
6842 cs_fill_push_const_info(compiler->devinfo, prog_data);
6843 }
6844 }
6845
6846 if (unlikely(cfg == NULL)) {
6847 assert(fail_msg);
6848 if (error_str)
6849 *error_str = ralloc_strdup(mem_ctx, fail_msg);
6850
6851 return NULL;
6852 }
6853
6854 fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base,
6855 v8.promoted_constants, v8.runtime_check_aads_emit,
6856 MESA_SHADER_COMPUTE);
6857 if (INTEL_DEBUG & DEBUG_CS) {
6858 char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
6859 shader->info.label ? shader->info.label :
6860 "unnamed",
6861 shader->info.name);
6862 g.enable_debug(name);
6863 }
6864
6865 g.generate_code(cfg, prog_data->simd_size);
6866
6867 return g.get_assembly(final_assembly_size);
6868 }
6869
6870 /**
6871 * Test the dispatch mask packing assumptions of
6872 * brw_stage_has_packed_dispatch(). Call this from e.g. the top of
6873 * fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is
6874 * executed with an unexpected dispatch mask.
6875 */
6876 static UNUSED void
6877 brw_fs_test_dispatch_packing(const fs_builder &bld)
6878 {
6879 const gl_shader_stage stage = bld.shader->stage;
6880
6881 if (brw_stage_has_packed_dispatch(bld.shader->devinfo, stage,
6882 bld.shader->stage_prog_data)) {
6883 const fs_builder ubld = bld.exec_all().group(1, 0);
6884 const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0);
6885 const fs_reg mask = (stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
6886 brw_dmask_reg());
6887
6888 ubld.ADD(tmp, mask, brw_imm_ud(1));
6889 ubld.AND(tmp, mask, tmp);
6890
6891 /* This will loop forever if the dispatch mask doesn't have the expected
6892 * form '2^n-1', in which case tmp will be non-zero.
6893 */
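      /* E.g. a fully packed SIMD8 mask of 0xff gives (0xff + 1) & 0xff = 0 and
       * the loop exits immediately, while a sparse mask such as 0xf7 leaves
       * tmp non-zero and the thread spins, triggering the intended hang.
       */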
6894 bld.emit(BRW_OPCODE_DO);
6895 bld.CMP(bld.null_reg_ud(), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
6896 set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE));
6897 }
6898 }