i965: Clean up #includes in the compiler.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include "main/macros.h"
32 #include "brw_context.h"
33 #include "brw_eu.h"
34 #include "brw_fs.h"
35 #include "brw_cs.h"
36 #include "brw_nir.h"
37 #include "brw_vec4_gs_visitor.h"
38 #include "brw_cfg.h"
39 #include "brw_program.h"
40 #include "brw_dead_control_flow.h"
41 #include "glsl/nir/glsl_types.h"
42
43 using namespace brw;
44
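/**
 * Shared constructor helper: zero the instruction, copy the sources into a
 * freshly allocated array (always at least three entries, since code such as
 * equals() inspects src[0..2] unconditionally), and derive regs_written from
 * the destination's per-channel size.
 */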
45 void
46 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
47 const fs_reg *src, unsigned sources)
48 {
49 memset(this, 0, sizeof(*this));
50
51 this->src = new fs_reg[MAX2(sources, 3)];
52 for (unsigned i = 0; i < sources; i++)
53 this->src[i] = src[i];
54
55 this->opcode = opcode;
56 this->dst = dst;
57 this->sources = sources;
58 this->exec_size = exec_size;
59
60 assert(dst.file != IMM && dst.file != UNIFORM);
61
62 assert(this->exec_size != 0);
63
64 this->conditional_mod = BRW_CONDITIONAL_NONE;
65
66 /* This will be the case for almost all instructions. */
67 switch (dst.file) {
68 case VGRF:
69 case ARF:
70 case FIXED_GRF:
71 case MRF:
72 case ATTR:
73 this->regs_written = DIV_ROUND_UP(dst.component_size(exec_size),
74 REG_SIZE);
75 break;
76 case BAD_FILE:
77 this->regs_written = 0;
78 break;
79 case IMM:
80 case UNIFORM:
81 unreachable("Invalid destination register file");
82 }
83
84 this->writes_accumulator = false;
85 }
86
87 fs_inst::fs_inst()
88 {
89 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
90 }
91
92 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
93 {
94 init(opcode, exec_size, reg_undef, NULL, 0);
95 }
96
97 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
98 {
99 init(opcode, exec_size, dst, NULL, 0);
100 }
101
102 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
103 const fs_reg &src0)
104 {
105 const fs_reg src[1] = { src0 };
106 init(opcode, exec_size, dst, src, 1);
107 }
108
109 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
110 const fs_reg &src0, const fs_reg &src1)
111 {
112 const fs_reg src[2] = { src0, src1 };
113 init(opcode, exec_size, dst, src, 2);
114 }
115
116 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
117 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
118 {
119 const fs_reg src[3] = { src0, src1, src2 };
120 init(opcode, exec_size, dst, src, 3);
121 }
122
123 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
124 const fs_reg src[], unsigned sources)
125 {
126 init(opcode, exec_width, dst, src, sources);
127 }
128
129 fs_inst::fs_inst(const fs_inst &that)
130 {
131 memcpy(this, &that, sizeof(that));
132
133 this->src = new fs_reg[MAX2(that.sources, 3)];
134
135 for (unsigned i = 0; i < that.sources; i++)
136 this->src[i] = that.src[i];
137 }
138
139 fs_inst::~fs_inst()
140 {
141 delete[] this->src;
142 }
143
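/**
 * Grow or shrink the source array to num_sources entries, preserving the
 * sources that remain in range.
 */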
144 void
145 fs_inst::resize_sources(uint8_t num_sources)
146 {
147 if (this->sources != num_sources) {
148 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
149
150 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
151 src[i] = this->src[i];
152
153 delete[] this->src;
154 this->src = src;
155 this->sources = num_sources;
156 }
157 }
158
159 void
160 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
161 const fs_reg &dst,
162 const fs_reg &surf_index,
163 const fs_reg &varying_offset,
164 uint32_t const_offset)
165 {
166 /* We have our constant surface use a pitch of 4 bytes, so our index can
167 * be any component of a vector, and then we load 4 contiguous
168 * components starting from that.
169 *
170 * We break the const_offset down into a portion added to the variable
171 * offset and a portion handled via reg_offset, which means that if you
172 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
173 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
174 * CSE can later notice that those loads are all the same and eliminate
175 * the redundant ones.
176 */
177 fs_reg vec4_offset = vgrf(glsl_type::int_type);
178 bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~3));
179
180 int scale = 1;
181 if (devinfo->gen == 4 && bld.dispatch_width() == 8) {
182 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
183 * u, v, r) as parameters, or we can just use the SIMD16 message
184 * consisting of (header, u). We choose the second, at the cost of a
185 * longer return length.
186 */
187 scale = 2;
188 }
189
190 enum opcode op;
191 if (devinfo->gen >= 7)
192 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
193 else
194 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
195
196 int regs_written = 4 * (bld.dispatch_width() / 8) * scale;
197 fs_reg vec4_result = fs_reg(VGRF, alloc.allocate(regs_written), dst.type);
198 fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
199 inst->regs_written = regs_written;
200
201 if (devinfo->gen < 7) {
202 inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen);
203 inst->header_size = 1;
204 if (devinfo->gen == 4)
205 inst->mlen = 3;
206 else
207 inst->mlen = 1 + bld.dispatch_width() / 8;
208 }
209
210 bld.MOV(dst, offset(vec4_result, bld, (const_offset & 3) * scale));
211 }
212
213 /**
214 * A helper for MOV generation for fixing up broken hardware SEND dependency
215 * handling.
216 */
217 void
218 fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
219 {
220 /* The caller always wants the MOV emitted uncompressed, to add the minimal
221 * extra dependencies and to avoid having to align its registers to 2.
222 */
223 const fs_builder ubld = bld.annotate("send dependency resolve")
224 .half(0);
225
226 ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F));
227 }
228
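/**
 * Field-by-field comparison of two instructions' opcode, operands and send
 * parameters.
 */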
229 bool
230 fs_inst::equals(fs_inst *inst) const
231 {
232 return (opcode == inst->opcode &&
233 dst.equals(inst->dst) &&
234 src[0].equals(inst->src[0]) &&
235 src[1].equals(inst->src[1]) &&
236 src[2].equals(inst->src[2]) &&
237 saturate == inst->saturate &&
238 predicate == inst->predicate &&
239 conditional_mod == inst->conditional_mod &&
240 mlen == inst->mlen &&
241 base_mrf == inst->base_mrf &&
242 target == inst->target &&
243 eot == inst->eot &&
244 header_size == inst->header_size &&
245 shadow_compare == inst->shadow_compare &&
246 exec_size == inst->exec_size &&
247 offset == inst->offset);
248 }
249
250 bool
251 fs_inst::overwrites_reg(const fs_reg &reg) const
252 {
253 return reg.in_range(dst, regs_written);
254 }
255
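/**
 * Returns true if this instruction's message payload is sourced from GRFs
 * rather than being copied to MRFs before the send.
 */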
256 bool
257 fs_inst::is_send_from_grf() const
258 {
259 switch (opcode) {
260 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
261 case SHADER_OPCODE_SHADER_TIME_ADD:
262 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
263 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
264 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
265 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
266 case SHADER_OPCODE_UNTYPED_ATOMIC:
267 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
268 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
269 case SHADER_OPCODE_TYPED_ATOMIC:
270 case SHADER_OPCODE_TYPED_SURFACE_READ:
271 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
272 case SHADER_OPCODE_URB_WRITE_SIMD8:
273 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
274 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
275 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
276 case SHADER_OPCODE_URB_READ_SIMD8:
277 case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
278 return true;
279 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
280 return src[1].file == VGRF;
281 case FS_OPCODE_FB_WRITE:
282 return src[0].file == VGRF;
283 default:
284 if (is_tex())
285 return src[0].file == VGRF;
286
287 return false;
288 }
289 }
290
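/**
 * Returns true if this LOAD_PAYLOAD just reassembles a single contiguous
 * VGRF, copied in order and in full.
 */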
291 bool
292 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
293 {
294 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
295 return false;
296
297 fs_reg reg = this->src[0];
298 if (reg.file != VGRF || reg.reg_offset != 0 || reg.stride == 0)
299 return false;
300
301 if (grf_alloc.sizes[reg.nr] != this->regs_written)
302 return false;
303
304 for (int i = 0; i < this->sources; i++) {
305 reg.type = this->src[i].type;
306 if (!this->src[i].equals(reg))
307 return false;
308
309 if (i < this->header_size) {
310 reg.reg_offset += 1;
311 } else {
312 reg.reg_offset += this->exec_size / 8;
313 }
314 }
315
316 return true;
317 }
318
319 bool
320 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
321 {
322 if (devinfo->gen == 6 && is_math())
323 return false;
324
325 if (is_send_from_grf())
326 return false;
327
328 if (!backend_instruction::can_do_source_mods())
329 return false;
330
331 return true;
332 }
333
334 bool
335 fs_inst::can_change_types() const
336 {
337 return dst.type == src[0].type &&
338 !src[0].abs && !src[0].negate && !saturate &&
339 (opcode == BRW_OPCODE_MOV ||
340 (opcode == BRW_OPCODE_SEL &&
341 dst.type == src[1].type &&
342 predicate != BRW_PREDICATE_NONE &&
343 !src[1].abs && !src[1].negate));
344 }
345
346 bool
347 fs_inst::has_side_effects() const
348 {
349 return this->eot || backend_instruction::has_side_effects();
350 }
351
352 void
353 fs_reg::init()
354 {
355 memset(this, 0, sizeof(*this));
356 stride = 1;
357 }
358
359 /** Generic unset register constructor. */
360 fs_reg::fs_reg()
361 {
362 init();
363 this->file = BAD_FILE;
364 }
365
366 fs_reg::fs_reg(struct ::brw_reg reg) :
367 backend_reg(reg)
368 {
369 this->reg_offset = 0;
370 this->subreg_offset = 0;
371 this->reladdr = NULL;
372 this->stride = 1;
373 if (this->file == IMM &&
374 (this->type != BRW_REGISTER_TYPE_V &&
375 this->type != BRW_REGISTER_TYPE_UV &&
376 this->type != BRW_REGISTER_TYPE_VF)) {
377 this->stride = 0;
378 }
379 }
380
381 bool
382 fs_reg::equals(const fs_reg &r) const
383 {
384 return (this->backend_reg::equals(r) &&
385 subreg_offset == r.subreg_offset &&
386 !reladdr && !r.reladdr &&
387 stride == r.stride);
388 }
389
390 fs_reg &
391 fs_reg::set_smear(unsigned subreg)
392 {
393 assert(file != ARF && file != FIXED_GRF && file != IMM);
394 subreg_offset = subreg * type_sz(type);
395 stride = 0;
396 return *this;
397 }
398
399 bool
400 fs_reg::is_contiguous() const
401 {
402 return stride == 1;
403 }
404
405 unsigned
406 fs_reg::component_size(unsigned width) const
407 {
408 const unsigned stride = ((file != ARF && file != FIXED_GRF) ? this->stride :
409 hstride == 0 ? 0 :
410 1 << (hstride - 1));
411 return MAX2(width * stride, 1) * type_sz(type);
412 }
413
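/**
 * Returns the number of scalar components needed to store a value of the
 * given GLSL type, with no vec4 padding (each component takes one slot).
 */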
414 extern "C" int
415 type_size_scalar(const struct glsl_type *type)
416 {
417 unsigned int size, i;
418
419 switch (type->base_type) {
420 case GLSL_TYPE_UINT:
421 case GLSL_TYPE_INT:
422 case GLSL_TYPE_FLOAT:
423 case GLSL_TYPE_BOOL:
424 return type->components();
425 case GLSL_TYPE_ARRAY:
426 return type_size_scalar(type->fields.array) * type->length;
427 case GLSL_TYPE_STRUCT:
428 size = 0;
429 for (i = 0; i < type->length; i++) {
430 size += type_size_scalar(type->fields.structure[i].type);
431 }
432 return size;
433 case GLSL_TYPE_SAMPLER:
434 /* Samplers take up no register space, since they're baked in at
435 * link time.
436 */
437 return 0;
438 case GLSL_TYPE_ATOMIC_UINT:
439 return 0;
440 case GLSL_TYPE_SUBROUTINE:
441 return 1;
442 case GLSL_TYPE_IMAGE:
443 return BRW_IMAGE_PARAM_SIZE;
444 case GLSL_TYPE_VOID:
445 case GLSL_TYPE_ERROR:
446 case GLSL_TYPE_INTERFACE:
447 case GLSL_TYPE_DOUBLE:
448 unreachable("not reached");
449 }
450
451 return 0;
452 }
453
454 /**
455 * Returns the number of scalar components needed to store type, assuming
456 * that vectors are padded out to vec4.
457 *
458 * This has the packing rules of type_size_vec4(), but counts components
459 * similar to type_size_scalar().
460 */
461 extern "C" int
462 type_size_vec4_times_4(const struct glsl_type *type)
463 {
464 return 4 * type_size_vec4(type);
465 }
466
467 /**
468 * Create a MOV to read the timestamp register.
469 *
470 * The caller is responsible for emitting the MOV. The return value is
471 * the destination of the MOV, with extra parameters set.
472 */
473 fs_reg
474 fs_visitor::get_timestamp(const fs_builder &bld)
475 {
476 assert(devinfo->gen >= 7);
477
478 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
479 BRW_ARF_TIMESTAMP,
480 0),
481 BRW_REGISTER_TYPE_UD));
482
483 fs_reg dst = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
484
485 /* We want to read the 3 fields we care about even if it's not enabled in
486 * the dispatch.
487 */
488 bld.group(4, 0).exec_all().MOV(dst, ts);
489
490 return dst;
491 }
492
493 void
494 fs_visitor::emit_shader_time_begin()
495 {
496 shader_start_time = get_timestamp(bld.annotate("shader time start"));
497
498 /* We want only the low 32 bits of the timestamp. Since it's running
499 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
500 * which is plenty of time for our purposes. It is identical across the
501 * EUs, but since it's tracking GPU core speed it will increment at a
502 * varying rate as render P-states change.
503 */
504 shader_start_time.set_smear(0);
505 }
506
507 void
508 fs_visitor::emit_shader_time_end()
509 {
510 /* Insert our code just before the final SEND with EOT. */
511 exec_node *end = this->instructions.get_tail();
512 assert(end && ((fs_inst *) end)->eot);
513 const fs_builder ibld = bld.annotate("shader time end")
514 .exec_all().at(NULL, end);
515
516 fs_reg shader_end_time = get_timestamp(ibld);
517
518 /* We only use the low 32 bits of the timestamp - see
519 * emit_shader_time_begin().
520 *
521 * We could also check if render P-states have changed (or anything
522 * else that might disrupt timing) by setting smear to 2 and checking if
523 * that field is != 0.
524 */
525 shader_end_time.set_smear(0);
526
527 /* Check that there weren't any timestamp reset events (assuming these
528 * were the only two timestamp reads that happened).
529 */
530 fs_reg reset = shader_end_time;
531 reset.set_smear(2);
532 set_condmod(BRW_CONDITIONAL_Z,
533 ibld.AND(ibld.null_reg_ud(), reset, brw_imm_ud(1u)));
534 ibld.IF(BRW_PREDICATE_NORMAL);
535
536 fs_reg start = shader_start_time;
537 start.negate = true;
538 fs_reg diff = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
539 diff.set_smear(0);
540
541 const fs_builder cbld = ibld.group(1, 0);
542 cbld.group(1, 0).ADD(diff, start, shader_end_time);
543
544 /* If there were no instructions between the two timestamp gets, the diff
545 * is 2 cycles. Remove that overhead, so we can ignore it when trying to
546 * determine the time taken by individual instructions.
547 */
548 cbld.ADD(diff, diff, brw_imm_ud(-2u));
549 SHADER_TIME_ADD(cbld, 0, diff);
550 SHADER_TIME_ADD(cbld, 1, brw_imm_ud(1u));
551 ibld.emit(BRW_OPCODE_ELSE);
552 SHADER_TIME_ADD(cbld, 2, brw_imm_ud(1u));
553 ibld.emit(BRW_OPCODE_ENDIF);
554 }
555
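/**
 * Emit the SHADER_TIME_ADD pseudo-instruction that accumulates "value" into
 * the shader-time buffer slot selected by shader_time_index and the given
 * subindex.
 */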
556 void
557 fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
558 int shader_time_subindex,
559 fs_reg value)
560 {
561 int index = shader_time_index * 3 + shader_time_subindex;
562 struct brw_reg offset = brw_imm_d(index * SHADER_TIME_STRIDE);
563
564 fs_reg payload;
565 if (dispatch_width == 8)
566 payload = vgrf(glsl_type::uvec2_type);
567 else
568 payload = vgrf(glsl_type::uint_type);
569
570 bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
571 }
572
573 void
574 fs_visitor::vfail(const char *format, va_list va)
575 {
576 char *msg;
577
578 if (failed)
579 return;
580
581 failed = true;
582
583 msg = ralloc_vasprintf(mem_ctx, format, va);
584 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
585
586 this->fail_msg = msg;
587
588 if (debug_enabled) {
589 fprintf(stderr, "%s", msg);
590 }
591 }
592
593 void
594 fs_visitor::fail(const char *format, ...)
595 {
596 va_list va;
597
598 va_start(va, format);
599 vfail(format, va);
600 va_end(va);
601 }
602
603 /**
604 * Mark this program as impossible to compile in SIMD16 mode.
605 *
606 * During the SIMD8 compile (which happens first), we can detect and flag
607 * things that are unsupported in SIMD16 mode, so the compiler can skip
608 * the SIMD16 compile altogether.
609 *
610 * During a SIMD16 compile (if one happens anyway), this just calls fail().
611 */
612 void
613 fs_visitor::no16(const char *msg)
614 {
615 if (dispatch_width == 16) {
616 fail("%s", msg);
617 } else {
618 simd16_unsupported = true;
619
620 compiler->shader_perf_log(log_data,
621 "SIMD16 shader failed to compile: %s", msg);
622 }
623 }
624
625 /**
626 * Returns true if the instruction has a flag that means it won't
627 * update an entire destination register.
628 *
629 * For example, dead code elimination and live variable analysis want to know
630 * when a write to a variable screens off any preceding values that were in
631 * it.
632 */
633 bool
634 fs_inst::is_partial_write() const
635 {
636 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
637 (this->exec_size * type_sz(this->dst.type)) < 32 ||
638 !this->dst.is_contiguous());
639 }
640
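/**
 * Returns the number of logical components this instruction reads from
 * source i: 1 for most opcodes, while logical sends encode their component
 * counts in immediate sources.
 */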
641 unsigned
642 fs_inst::components_read(unsigned i) const
643 {
644 switch (opcode) {
645 case FS_OPCODE_LINTERP:
646 if (i == 0)
647 return 2;
648 else
649 return 1;
650
651 case FS_OPCODE_PIXEL_X:
652 case FS_OPCODE_PIXEL_Y:
653 assert(i == 0);
654 return 2;
655
656 case FS_OPCODE_FB_WRITE_LOGICAL:
657 assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
658 /* First/second FB write color. */
659 if (i < 2)
660 return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
661 else
662 return 1;
663
664 case SHADER_OPCODE_TEX_LOGICAL:
665 case SHADER_OPCODE_TXD_LOGICAL:
666 case SHADER_OPCODE_TXF_LOGICAL:
667 case SHADER_OPCODE_TXL_LOGICAL:
668 case SHADER_OPCODE_TXS_LOGICAL:
669 case FS_OPCODE_TXB_LOGICAL:
670 case SHADER_OPCODE_TXF_CMS_LOGICAL:
671 case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
672 case SHADER_OPCODE_TXF_UMS_LOGICAL:
673 case SHADER_OPCODE_TXF_MCS_LOGICAL:
674 case SHADER_OPCODE_LOD_LOGICAL:
675 case SHADER_OPCODE_TG4_LOGICAL:
676 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
677 assert(src[8].file == IMM && src[9].file == IMM);
678 /* Texture coordinates. */
679 if (i == 0)
680 return src[8].ud;
681 /* Texture derivatives. */
682 else if ((i == 2 || i == 3) && opcode == SHADER_OPCODE_TXD_LOGICAL)
683 return src[9].ud;
684 /* Texture offset. */
685 else if (i == 7)
686 return 2;
687 /* MCS */
688 else if (i == 5 && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
689 return 2;
690 else
691 return 1;
692
693 case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
694 case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
695 assert(src[3].file == IMM);
696 /* Surface coordinates. */
697 if (i == 0)
698 return src[3].ud;
699 /* Surface operation source (ignored for reads). */
700 else if (i == 1)
701 return 0;
702 else
703 return 1;
704
705 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
706 case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
707 assert(src[3].file == IMM &&
708 src[4].file == IMM);
709 /* Surface coordinates. */
710 if (i == 0)
711 return src[3].ud;
712 /* Surface operation source. */
713 else if (i == 1)
714 return src[4].ud;
715 else
716 return 1;
717
718 case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
719 case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
720 assert(src[3].file == IMM &&
721 src[4].file == IMM);
722 const unsigned op = src[4].ud;
723 /* Surface coordinates. */
724 if (i == 0)
725 return src[3].ud;
726 /* Surface operation source. */
727 else if (i == 1 && op == BRW_AOP_CMPWR)
728 return 2;
729 else if (i == 1 && (op == BRW_AOP_INC || op == BRW_AOP_DEC ||
730 op == BRW_AOP_PREDEC))
731 return 0;
732 else
733 return 1;
734 }
735
736 default:
737 return 1;
738 }
739 }
740
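/**
 * Returns the number of hardware registers read by source arg, accounting
 * for message lengths and per-source component counts.
 */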
741 int
742 fs_inst::regs_read(int arg) const
743 {
744 switch (opcode) {
745 case FS_OPCODE_FB_WRITE:
746 case SHADER_OPCODE_URB_WRITE_SIMD8:
747 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
748 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
749 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
750 case SHADER_OPCODE_URB_READ_SIMD8:
751 case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
752 case SHADER_OPCODE_UNTYPED_ATOMIC:
753 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
754 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
755 case SHADER_OPCODE_TYPED_ATOMIC:
756 case SHADER_OPCODE_TYPED_SURFACE_READ:
757 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
758 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
759 if (arg == 0)
760 return mlen;
761 break;
762
763 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
764 /* The payload is actually stored in src[1]. */
765 if (arg == 1)
766 return mlen;
767 break;
768
769 case FS_OPCODE_LINTERP:
770 if (arg == 1)
771 return 1;
772 break;
773
774 case SHADER_OPCODE_LOAD_PAYLOAD:
775 if (arg < this->header_size)
776 return 1;
777 break;
778
779 case CS_OPCODE_CS_TERMINATE:
780 case SHADER_OPCODE_BARRIER:
781 return 1;
782
783 case SHADER_OPCODE_MOV_INDIRECT:
784 if (arg == 0) {
785 assert(src[2].file == IMM);
786 unsigned region_length = src[2].ud;
787
788 if (src[0].file == FIXED_GRF) {
789 /* If the start of the region is not register aligned, then
790 * there's some portion of the register that's technically
791 * unread at the beginning.
792 *
793 * However, the register allocator works in terms of whole
794 * registers, and does not use subnr. It assumes that the
795 * read starts at the beginning of the register, and extends
796 * regs_read() whole registers beyond that.
797 *
798 * To compensate, we extend the region length to include this
799 * unread portion at the beginning.
800 */
801 if (src[0].subnr)
802 region_length += src[0].subnr * type_sz(src[0].type);
803
804 return DIV_ROUND_UP(region_length, REG_SIZE);
805 } else {
806 assert(!"Invalid register file");
807 }
808 }
809 break;
810
811 default:
812 if (is_tex() && arg == 0 && src[0].file == VGRF)
813 return mlen;
814 break;
815 }
816
817 switch (src[arg].file) {
818 case BAD_FILE:
819 return 0;
820 case UNIFORM:
821 case IMM:
822 return 1;
823 case ARF:
824 case FIXED_GRF:
825 case VGRF:
826 case ATTR:
827 return DIV_ROUND_UP(components_read(arg) *
828 src[arg].component_size(exec_size),
829 REG_SIZE);
830 case MRF:
831 unreachable("MRF registers are not allowed as sources");
832 }
833 return 0;
834 }
835
836 bool
837 fs_inst::reads_flag() const
838 {
839 return predicate;
840 }
841
842 bool
843 fs_inst::writes_flag() const
844 {
845 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
846 opcode != BRW_OPCODE_IF &&
847 opcode != BRW_OPCODE_WHILE)) ||
848 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
849 }
850
851 /**
852 * Returns how many MRFs an FS opcode will write over.
853 *
854 * Note that this is not the 0 or 1 implied writes in an actual gen
855 * instruction -- the FS opcodes often generate MOVs in addition.
856 */
857 int
858 fs_visitor::implied_mrf_writes(fs_inst *inst)
859 {
860 if (inst->mlen == 0)
861 return 0;
862
863 if (inst->base_mrf == -1)
864 return 0;
865
866 switch (inst->opcode) {
867 case SHADER_OPCODE_RCP:
868 case SHADER_OPCODE_RSQ:
869 case SHADER_OPCODE_SQRT:
870 case SHADER_OPCODE_EXP2:
871 case SHADER_OPCODE_LOG2:
872 case SHADER_OPCODE_SIN:
873 case SHADER_OPCODE_COS:
874 return 1 * dispatch_width / 8;
875 case SHADER_OPCODE_POW:
876 case SHADER_OPCODE_INT_QUOTIENT:
877 case SHADER_OPCODE_INT_REMAINDER:
878 return 2 * dispatch_width / 8;
879 case SHADER_OPCODE_TEX:
880 case FS_OPCODE_TXB:
881 case SHADER_OPCODE_TXD:
882 case SHADER_OPCODE_TXF:
883 case SHADER_OPCODE_TXF_CMS:
884 case SHADER_OPCODE_TXF_CMS_W:
885 case SHADER_OPCODE_TXF_MCS:
886 case SHADER_OPCODE_TG4:
887 case SHADER_OPCODE_TG4_OFFSET:
888 case SHADER_OPCODE_TXL:
889 case SHADER_OPCODE_TXS:
890 case SHADER_OPCODE_LOD:
891 case SHADER_OPCODE_SAMPLEINFO:
892 return 1;
893 case FS_OPCODE_FB_WRITE:
894 return 2;
895 case FS_OPCODE_GET_BUFFER_SIZE:
896 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
897 case SHADER_OPCODE_GEN4_SCRATCH_READ:
898 return 1;
899 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
900 return inst->mlen;
901 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
902 return inst->mlen;
903 case SHADER_OPCODE_UNTYPED_ATOMIC:
904 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
905 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
906 case SHADER_OPCODE_TYPED_ATOMIC:
907 case SHADER_OPCODE_TYPED_SURFACE_READ:
908 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
909 case SHADER_OPCODE_URB_WRITE_SIMD8:
910 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
911 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
912 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
913 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
914 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
915 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
916 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
917 return 0;
918 default:
919 unreachable("not reached");
920 }
921 }
922
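/**
 * Allocate a virtual GRF large enough to hold one value of the given GLSL
 * type per channel at the current dispatch width.
 */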
923 fs_reg
924 fs_visitor::vgrf(const glsl_type *const type)
925 {
926 int reg_width = dispatch_width / 8;
927 return fs_reg(VGRF, alloc.allocate(type_size_scalar(type) * reg_width),
928 brw_type_for_base_type(type));
929 }
930
931 fs_reg::fs_reg(enum brw_reg_file file, int nr)
932 {
933 init();
934 this->file = file;
935 this->nr = nr;
936 this->type = BRW_REGISTER_TYPE_F;
937 this->stride = (file == UNIFORM ? 0 : 1);
938 }
939
940 fs_reg::fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type)
941 {
942 init();
943 this->file = file;
944 this->nr = nr;
945 this->type = type;
946 this->stride = (file == UNIFORM ? 0 : 1);
947 }
948
949 /* For SIMD16, we need to follow the uniform setup done for the SIMD8
950 * dispatch. This brings in those uniform definitions.
951 */
952 void
953 fs_visitor::import_uniforms(fs_visitor *v)
954 {
955 this->push_constant_loc = v->push_constant_loc;
956 this->pull_constant_loc = v->pull_constant_loc;
957 this->uniforms = v->uniforms;
958 this->param_size = v->param_size;
959 }
960
961 fs_reg *
962 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
963 bool origin_upper_left)
964 {
965 assert(stage == MESA_SHADER_FRAGMENT);
966 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
967 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
968 fs_reg wpos = *reg;
969 bool flip = !origin_upper_left ^ key->render_to_fbo;
970
971 /* gl_FragCoord.x */
972 if (pixel_center_integer) {
973 bld.MOV(wpos, this->pixel_x);
974 } else {
975 bld.ADD(wpos, this->pixel_x, brw_imm_f(0.5f));
976 }
977 wpos = offset(wpos, bld, 1);
978
979 /* gl_FragCoord.y */
980 if (!flip && pixel_center_integer) {
981 bld.MOV(wpos, this->pixel_y);
982 } else {
983 fs_reg pixel_y = this->pixel_y;
984 float offset = (pixel_center_integer ? 0.0f : 0.5f);
985
986 if (flip) {
987 pixel_y.negate = true;
988 offset += key->drawable_height - 1.0f;
989 }
990
991 bld.ADD(wpos, pixel_y, brw_imm_f(offset));
992 }
993 wpos = offset(wpos, bld, 1);
994
995 /* gl_FragCoord.z */
996 if (devinfo->gen >= 6) {
997 bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
998 } else {
999 bld.emit(FS_OPCODE_LINTERP, wpos,
1000 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1001 interp_reg(VARYING_SLOT_POS, 2));
1002 }
1003 wpos = offset(wpos, bld, 1);
1004
1005 /* gl_FragCoord.w: Already set up in emit_interpolation */
1006 bld.MOV(wpos, this->wpos_w);
1007
1008 return reg;
1009 }
1010
1011 fs_inst *
1012 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1013 glsl_interp_qualifier interpolation_mode,
1014 bool is_centroid, bool is_sample)
1015 {
1016 brw_wm_barycentric_interp_mode barycoord_mode;
1017 if (devinfo->gen >= 6) {
1018 if (is_centroid) {
1019 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1020 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1021 else
1022 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1023 } else if (is_sample) {
1024 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1025 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1026 else
1027 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1028 } else {
1029 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1030 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1031 else
1032 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1033 }
1034 } else {
1035 /* On Ironlake and below, there is only one interpolation mode.
1036 * Centroid interpolation doesn't mean anything on this hardware --
1037 * there is no multisampling.
1038 */
1039 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1040 }
1041 return bld.emit(FS_OPCODE_LINTERP, attr,
1042 this->delta_xy[barycoord_mode], interp);
1043 }
1044
1045 void
1046 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1047 const glsl_type *type,
1048 glsl_interp_qualifier interpolation_mode,
1049 int location, bool mod_centroid,
1050 bool mod_sample)
1051 {
1052 attr.type = brw_type_for_base_type(type->get_scalar_type());
1053
1054 assert(stage == MESA_SHADER_FRAGMENT);
1055 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1056 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1057
1058 unsigned int array_elements;
1059
1060 if (type->is_array()) {
1061 array_elements = type->arrays_of_arrays_size();
1062 if (array_elements == 0) {
1063 fail("dereferenced array '%s' has length 0\n", name);
1064 }
1065 type = type->without_array();
1066 } else {
1067 array_elements = 1;
1068 }
1069
1070 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1071 bool is_gl_Color =
1072 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1073 if (key->flat_shade && is_gl_Color) {
1074 interpolation_mode = INTERP_QUALIFIER_FLAT;
1075 } else {
1076 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1077 }
1078 }
1079
1080 for (unsigned int i = 0; i < array_elements; i++) {
1081 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1082 if (prog_data->urb_setup[location] == -1) {
1083 /* If there's no incoming setup data for this slot, don't
1084 * emit interpolation for it.
1085 */
1086 attr = offset(attr, bld, type->vector_elements);
1087 location++;
1088 continue;
1089 }
1090
1091 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1092 /* Constant interpolation (flat shading) case. The SF has
1093 * handed us defined values in only the constant offset
1094 * field of the setup reg.
1095 */
1096 for (unsigned int k = 0; k < type->vector_elements; k++) {
1097 struct brw_reg interp = interp_reg(location, k);
1098 interp = suboffset(interp, 3);
1099 interp.type = attr.type;
1100 bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1101 attr = offset(attr, bld, 1);
1102 }
1103 } else {
1104 /* Smooth/noperspective interpolation case. */
1105 for (unsigned int k = 0; k < type->vector_elements; k++) {
1106 struct brw_reg interp = interp_reg(location, k);
1107 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1108 /* Get the pixel/sample mask into f0 so that we know
1109 * which pixels are lit. Then, for each channel that is
1110 * unlit, replace the centroid data with non-centroid
1111 * data.
1112 */
1113 bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1114
1115 fs_inst *inst;
1116 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1117 false, false);
1118 inst->predicate = BRW_PREDICATE_NORMAL;
1119 inst->predicate_inverse = true;
1120 if (devinfo->has_pln)
1121 inst->no_dd_clear = true;
1122
1123 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1124 mod_centroid && !key->persample_shading,
1125 mod_sample || key->persample_shading);
1126 inst->predicate = BRW_PREDICATE_NORMAL;
1127 inst->predicate_inverse = false;
1128 if (devinfo->has_pln)
1129 inst->no_dd_check = true;
1130
1131 } else {
1132 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1133 mod_centroid && !key->persample_shading,
1134 mod_sample || key->persample_shading);
1135 }
1136 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1137 bld.MUL(attr, attr, this->pixel_w);
1138 }
1139 attr = offset(attr, bld, 1);
1140 }
1141
1142 }
1143 location++;
1144 }
1145 }
1146 }
1147
1148 fs_reg *
1149 fs_visitor::emit_frontfacing_interpolation()
1150 {
1151 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1152
1153 if (devinfo->gen >= 6) {
1154 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1155 * a boolean result from this (~0/true or 0/false).
1156 *
1157 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1158 * this task in only one instruction:
1159 * - a negation source modifier will flip the bit; and
1160 * - a W -> D type conversion will sign extend the bit into the high
1161 * word of the destination.
1162 *
1163 * An ASR 15 fills the low word of the destination.
1164 */
1165 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1166 g0.negate = true;
1167
1168 bld.ASR(*reg, g0, brw_imm_d(15));
1169 } else {
1170 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1171 * a boolean result from this (1/true or 0/false).
1172 *
1173 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1174 * the negation source modifier to flip it. Unfortunately the SHR
1175 * instruction only operates on UD (or D with an abs source modifier)
1176 * sources without negation.
1177 *
1178 * Instead, use ASR (which will give ~0/true or 0/false).
1179 */
1180 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1181 g1_6.negate = true;
1182
1183 bld.ASR(*reg, g1_6, brw_imm_d(31));
1184 }
1185
1186 return reg;
1187 }
1188
1189 void
1190 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1191 {
1192 assert(stage == MESA_SHADER_FRAGMENT);
1193 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1194 assert(dst.type == BRW_REGISTER_TYPE_F);
1195
1196 if (key->compute_pos_offset) {
1197 /* Convert int_sample_pos to floating point */
1198 bld.MOV(dst, int_sample_pos);
1199 /* Scale to the range [0, 1] */
1200 bld.MUL(dst, dst, brw_imm_f(1 / 16.0f));
1201 }
1202 else {
1203 /* From the ARB_sample_shading specification:
1204 * "When rendering to a non-multisample buffer, or if multisample
1205 * rasterization is disabled, gl_SamplePosition will always be
1206 * (0.5, 0.5)."
1207 */
1208 bld.MOV(dst, brw_imm_f(0.5f));
1209 }
1210 }
1211
1212 fs_reg *
1213 fs_visitor::emit_samplepos_setup()
1214 {
1215 assert(devinfo->gen >= 6);
1216
1217 const fs_builder abld = bld.annotate("compute sample position");
1218 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1219 fs_reg pos = *reg;
1220 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1221 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1222
1223 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1224 * mode will be enabled.
1225 *
1226 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1227 * R31.1:0 Position Offset X/Y for Slot[3:0]
1228 * R31.3:2 Position Offset X/Y for Slot[7:4]
1229 * .....
1230 *
1231 * The X, Y sample positions come in as bytes in the thread payload. So, read
1232 * the positions using vstride=16, width=8, hstride=2.
1233 */
1234 struct brw_reg sample_pos_reg =
1235 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1236 BRW_REGISTER_TYPE_B), 16, 8, 2);
1237
1238 if (dispatch_width == 8) {
1239 abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
1240 } else {
1241 abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
1242 abld.half(1).MOV(half(int_sample_x, 1),
1243 fs_reg(suboffset(sample_pos_reg, 16)));
1244 }
1245 /* Compute gl_SamplePosition.x */
1246 compute_sample_position(pos, int_sample_x);
1247 pos = offset(pos, abld, 1);
1248 if (dispatch_width == 8) {
1249 abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
1250 } else {
1251 abld.half(0).MOV(half(int_sample_y, 0),
1252 fs_reg(suboffset(sample_pos_reg, 1)));
1253 abld.half(1).MOV(half(int_sample_y, 1),
1254 fs_reg(suboffset(sample_pos_reg, 17)));
1255 }
1256 /* Compute gl_SamplePosition.y */
1257 compute_sample_position(pos, int_sample_y);
1258 return reg;
1259 }
1260
1261 fs_reg *
1262 fs_visitor::emit_sampleid_setup()
1263 {
1264 assert(stage == MESA_SHADER_FRAGMENT);
1265 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1266 assert(devinfo->gen >= 6);
1267
1268 const fs_builder abld = bld.annotate("compute sample id");
1269 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1270
1271 if (key->compute_sample_id) {
1272 fs_reg t1(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_D);
1273 t1.set_smear(0);
1274 fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
1275
1276 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1277 * 8x multisampling, subspan 0 will represent sample N (where N
1278 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1279 * 7. We can find the value of N by looking at R0.0 bits 7:6
1280 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1281 * (since samples are always delivered in pairs). That is, we
1282 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1283 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1284 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1285 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1286 * populating a temporary variable with the sequence (0, 1, 2, 3),
1287 * and then reading from it using vstride=1, width=4, hstride=0.
1288 * These computations hold good for 4x multisampling as well.
1289 *
1290 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1291 * the first four slots are sample 0 of subspan 0; the next four
1292 * are sample 1 of subspan 0; the third group is sample 0 of
1293 * subspan 1, and finally sample 1 of subspan 1.
1294 */
1295
1296 /* SKL+ has an extra bit for the Starting Sample Pair Index to
1297 * accommodate 16x MSAA.
1298 */
1299 unsigned sspi_mask = devinfo->gen >= 9 ? 0x1c0 : 0xc0;
1300
1301 abld.exec_all().group(1, 0)
1302 .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1303 brw_imm_ud(sspi_mask));
1304 abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));
1305
1306 /* This works for both SIMD8 and SIMD16 */
1307 abld.exec_all().group(4, 0)
1308 .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
1309
1310 /* This special instruction takes care of setting vstride=1,
1311 * width=4, hstride=0 of t2 during an ADD instruction.
1312 */
1313 abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1314 } else {
1315 /* As per GL_ARB_sample_shading specification:
1316 * "When rendering to a non-multisample buffer, or if multisample
1317 * rasterization is disabled, gl_SampleID will always be zero."
1318 */
1319 abld.MOV(*reg, brw_imm_d(0));
1320 }
1321
1322 return reg;
1323 }
1324
1325 fs_reg
1326 fs_visitor::resolve_source_modifiers(const fs_reg &src)
1327 {
1328 if (!src.abs && !src.negate)
1329 return src;
1330
1331 fs_reg temp = bld.vgrf(src.type);
1332 bld.MOV(temp, src);
1333
1334 return temp;
1335 }
1336
1337 void
1338 fs_visitor::emit_discard_jump()
1339 {
1340 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1341
1342 /* For performance, after a discard, jump to the end of the
1343 * shader if all relevant channels have been discarded.
1344 */
1345 fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
1346 discard_jump->flag_subreg = 1;
1347
1348 discard_jump->predicate = (dispatch_width == 8)
1349 ? BRW_PREDICATE_ALIGN1_ANY8H
1350 : BRW_PREDICATE_ALIGN1_ANY16H;
1351 discard_jump->predicate_inverse = true;
1352 }
1353
1354 void
1355 fs_visitor::emit_gs_thread_end()
1356 {
1357 assert(stage == MESA_SHADER_GEOMETRY);
1358
1359 struct brw_gs_prog_data *gs_prog_data =
1360 (struct brw_gs_prog_data *) prog_data;
1361
1362 if (gs_compile->control_data_header_size_bits > 0) {
1363 emit_gs_control_data_bits(this->final_gs_vertex_count);
1364 }
1365
1366 const fs_builder abld = bld.annotate("thread end");
1367 fs_inst *inst;
1368
1369 if (gs_prog_data->static_vertex_count != -1) {
1370 foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
1371 if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 ||
1372 prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
1373 prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
1374 prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) {
1375 prev->eot = true;
1376
1377 /* Delete now dead instructions. */
1378 foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
1379 if (dead == prev)
1380 break;
1381 dead->remove();
1382 }
1383 return;
1384 } else if (prev->is_control_flow() || prev->has_side_effects()) {
1385 break;
1386 }
1387 }
1388 fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1389 abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)));
1390 inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr);
1391 inst->mlen = 1;
1392 } else {
1393 fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2);
1394 fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
1395 sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
1396 sources[1] = this->final_gs_vertex_count;
1397 abld.LOAD_PAYLOAD(payload, sources, 2, 2);
1398 inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
1399 inst->mlen = 2;
1400 }
1401 inst->eot = true;
1402 inst->offset = 0;
1403 }
1404
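/**
 * Assign push constant (CURB) locations: record where the constants start in
 * the thread payload and rewrite UNIFORM-file sources to fixed hardware
 * registers.
 */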
1405 void
1406 fs_visitor::assign_curb_setup()
1407 {
1408 if (dispatch_width == 8) {
1409 prog_data->dispatch_grf_start_reg = payload.num_regs;
1410 } else {
1411 if (stage == MESA_SHADER_FRAGMENT) {
1412 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1413 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1414 } else if (stage == MESA_SHADER_COMPUTE) {
1415 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1416 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1417 } else {
1418 unreachable("Unsupported shader type!");
1419 }
1420 }
1421
1422 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1423
1424 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1425 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1426 for (unsigned int i = 0; i < inst->sources; i++) {
1427 if (inst->src[i].file == UNIFORM) {
1428 int uniform_nr = inst->src[i].nr + inst->src[i].reg_offset;
1429 int constant_nr;
1430 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1431 constant_nr = push_constant_loc[uniform_nr];
1432 } else {
1433 /* Section 5.11 of the OpenGL 4.1 spec says:
1434 * "Out-of-bounds reads return undefined values, which include
1435 * values from other variables of the active program or zero."
1436 * Just return the first push constant.
1437 */
1438 constant_nr = 0;
1439 }
1440
1441 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1442 constant_nr / 8,
1443 constant_nr % 8);
1444 brw_reg.abs = inst->src[i].abs;
1445 brw_reg.negate = inst->src[i].negate;
1446
1447 assert(inst->src[i].stride == 0);
1448 inst->src[i] = byte_offset(
1449 retype(brw_reg, inst->src[i].type),
1450 inst->src[i].subreg_offset);
1451 }
1452 }
1453 }
1454
1455 /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
1456 this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length;
1457 }
1458
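/**
 * Decide which URB slot each fragment shader input varying will be read
 * from, filling in prog_data->urb_setup[].
 */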
1459 void
1460 fs_visitor::calculate_urb_setup()
1461 {
1462 assert(stage == MESA_SHADER_FRAGMENT);
1463 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1464 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1465
1466 memset(prog_data->urb_setup, -1,
1467 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1468
1469 int urb_next = 0;
1470 /* Figure out where each of the incoming setup attributes lands. */
1471 if (devinfo->gen >= 6) {
1472 if (_mesa_bitcount_64(nir->info.inputs_read &
1473 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1474 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1475 * first 16 varying inputs, so we can put them wherever we want.
1476 * Just put them in order.
1477 *
1478 * This is useful because it means that (a) inputs not used by the
1479 * fragment shader won't take up valuable register space, and (b) we
1480 * won't have to recompile the fragment shader if it gets paired with
1481 * a different vertex (or geometry) shader.
1482 */
1483 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1484 if (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
1485 BITFIELD64_BIT(i)) {
1486 prog_data->urb_setup[i] = urb_next++;
1487 }
1488 }
1489 } else {
1490 bool include_vue_header =
1491 nir->info.inputs_read & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
1492
1493 /* We have enough input varyings that the SF/SBE pipeline stage can't
1494 * arbitrarily rearrange them to suit our whim; we have to put them
1495 * in an order that matches the output of the previous pipeline stage
1496 * (geometry or vertex shader).
1497 */
1498 struct brw_vue_map prev_stage_vue_map;
1499 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1500 key->input_slots_valid,
1501 nir->info.separate_shader);
1502 int first_slot =
1503 include_vue_header ? 0 : 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1504
1505 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1506 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1507 slot++) {
1508 int varying = prev_stage_vue_map.slot_to_varying[slot];
1509 if (varying != BRW_VARYING_SLOT_PAD &&
1510 (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
1511 BITFIELD64_BIT(varying))) {
1512 prog_data->urb_setup[varying] = slot - first_slot;
1513 }
1514 }
1515 urb_next = prev_stage_vue_map.num_slots - first_slot;
1516 }
1517 } else {
1518 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1519 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1520 /* Point size is packed into the header, not as a general attribute */
1521 if (i == VARYING_SLOT_PSIZ)
1522 continue;
1523
1524 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1525 /* The back color slot is skipped when the front color is
1526 * also written to. In addition, some slots can be
1527 * written in the vertex shader and not read in the
1528 * fragment shader. So the register number must always be
1529 * incremented, mapped or not.
1530 */
1531 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1532 prog_data->urb_setup[i] = urb_next;
1533 urb_next++;
1534 }
1535 }
1536
1537 /*
1538 * gl_PointCoord is an FS-only attribute, and interpolation for it was
1539 * already done in the SF thread. So, count it here, too.
1540 *
1541 * See compile_sf_prog() for more info.
1542 */
1543 if (nir->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1544 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1545 }
1546
1547 prog_data->num_varying_inputs = urb_next;
1548 }
1549
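/**
 * Now that the payload and push constant sizes are known, offset the LINTERP
 * and CINTERP setup-register sources to their final hardware GRF numbers.
 */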
1550 void
1551 fs_visitor::assign_urb_setup()
1552 {
1553 assert(stage == MESA_SHADER_FRAGMENT);
1554 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1555
1556 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1557
1558 /* Offset all the urb_setup[] index by the actual position of the
1559 * setup regs, now that the location of the constants has been chosen.
1560 */
1561 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1562 if (inst->opcode == FS_OPCODE_LINTERP) {
1563 assert(inst->src[1].file == FIXED_GRF);
1564 inst->src[1].nr += urb_start;
1565 }
1566
1567 if (inst->opcode == FS_OPCODE_CINTERP) {
1568 assert(inst->src[0].file == FIXED_GRF);
1569 inst->src[0].nr += urb_start;
1570 }
1571 }
1572
1573 /* Each attribute is 4 setup channels, each of which is half a reg. */
1574 this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
1575 }
1576
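/**
 * Rewrite ATTR-file sources of an instruction to the fixed hardware GRFs
 * where the attributes were delivered in the thread payload.
 */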
1577 void
1578 fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
1579 {
1580 for (int i = 0; i < inst->sources; i++) {
1581 if (inst->src[i].file == ATTR) {
1582 int grf = payload.num_regs +
1583 prog_data->curb_read_length +
1584 inst->src[i].nr +
1585 inst->src[i].reg_offset;
1586
1587 unsigned width = inst->src[i].stride == 0 ? 1 : inst->exec_size;
1588 struct brw_reg reg =
1589 stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1590 inst->src[i].subreg_offset),
1591 inst->exec_size * inst->src[i].stride,
1592 width, inst->src[i].stride);
1593 reg.abs = inst->src[i].abs;
1594 reg.negate = inst->src[i].negate;
1595
1596 inst->src[i] = reg;
1597 }
1598 }
1599 }
1600
1601 void
1602 fs_visitor::assign_vs_urb_setup()
1603 {
1604 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1605
1606 assert(stage == MESA_SHADER_VERTEX);
1607 int count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1608 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1609 count++;
1610
1611 /* Each attribute is 4 regs. */
1612 this->first_non_payload_grf += 4 * vs_prog_data->nr_attributes;
1613
1614 assert(vs_prog_data->base.urb_read_length <= 15);
1615
1616 /* Rewrite all ATTR file references to the hw grf that they land in. */
1617 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1618 convert_attr_sources_to_hw_regs(inst);
1619 }
1620 }
1621
1622 void
1623 fs_visitor::assign_gs_urb_setup()
1624 {
1625 assert(stage == MESA_SHADER_GEOMETRY);
1626
1627 brw_vue_prog_data *vue_prog_data = (brw_vue_prog_data *) prog_data;
1628
1629 first_non_payload_grf +=
1630 8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
1631
1632 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1633 /* Rewrite all ATTR file references to GRFs. */
1634 convert_attr_sources_to_hw_regs(inst);
1635 }
1636 }
1637
1638
1639 /**
1640 * Split large virtual GRFs into separate components if we can.
1641 *
1642 * This is mostly duplicated with what brw_fs_vector_splitting does,
1643 * but that's really conservative because it's afraid of doing
1644 * splitting that doesn't result in real progress after the rest of
1645 * the optimization phases, which would cause infinite looping in
1646 * optimization. We can do it once here, safely. This also has the
1647 * opportunity to split interpolated values, or maybe even uniforms,
1648 * which we don't have at the IR level.
1649 *
1650 * We want to split, because virtual GRFs are what we register
1651 * allocate and spill (due to contiguousness requirements for some
1652 * instructions), and they're what we naturally generate in the
1653 * codegen process, but most virtual GRFs don't actually need to be
1654 * contiguous sets of GRFs. If we split, we'll end up with reduced
1655 * live intervals and better dead code elimination and coalescing.
1656 */
1657 void
1658 fs_visitor::split_virtual_grfs()
1659 {
1660 int num_vars = this->alloc.count;
1661
1662 /* Count the total number of registers */
1663 int reg_count = 0;
1664 int vgrf_to_reg[num_vars];
1665 for (int i = 0; i < num_vars; i++) {
1666 vgrf_to_reg[i] = reg_count;
1667 reg_count += alloc.sizes[i];
1668 }
1669
1670 /* An array of "split points". For each register slot, this indicates
1671 * if this slot can be separated from the previous slot. Every time an
1672 * instruction uses multiple elements of a register (as a source or
1673 * destination), we mark the used slots as inseparable. Then we go
1674 * through and split the registers into the smallest pieces we can.
1675 */
1676 bool split_points[reg_count];
1677 memset(split_points, 0, sizeof(split_points));
1678
1679 /* Mark all used registers as fully splittable */
1680 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1681 if (inst->dst.file == VGRF) {
1682 int reg = vgrf_to_reg[inst->dst.nr];
1683 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
1684 split_points[reg + j] = true;
1685 }
1686
1687 for (int i = 0; i < inst->sources; i++) {
1688 if (inst->src[i].file == VGRF) {
1689 int reg = vgrf_to_reg[inst->src[i].nr];
1690 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
1691 split_points[reg + j] = true;
1692 }
1693 }
1694 }
1695
1696 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1697 if (inst->dst.file == VGRF) {
1698 int reg = vgrf_to_reg[inst->dst.nr] + inst->dst.reg_offset;
1699 for (int j = 1; j < inst->regs_written; j++)
1700 split_points[reg + j] = false;
1701 }
1702 for (int i = 0; i < inst->sources; i++) {
1703 if (inst->src[i].file == VGRF) {
1704 int reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].reg_offset;
1705 for (int j = 1; j < inst->regs_read(i); j++)
1706 split_points[reg + j] = false;
1707 }
1708 }
1709 }
1710
1711 int new_virtual_grf[reg_count];
1712 int new_reg_offset[reg_count];
1713
1714 int reg = 0;
1715 for (int i = 0; i < num_vars; i++) {
1716 /* The first slot is never a split point; check that as a quick sanity test. */
1717 assert(split_points[reg] == false);
1718
1719 /* j = 0 case */
1720 new_reg_offset[reg] = 0;
1721 reg++;
1722 int offset = 1;
1723
1724 /* j > 0 case */
1725 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1726 /* If this is a split point, reset the offset to 0 and allocate a new
1727 * virtual GRF covering the previous `offset` registers.
1728 */
1729 if (split_points[reg]) {
1730 assert(offset <= MAX_VGRF_SIZE);
1731 int grf = alloc.allocate(offset);
1732 for (int k = reg - offset; k < reg; k++)
1733 new_virtual_grf[k] = grf;
1734 offset = 0;
1735 }
1736 new_reg_offset[reg] = offset;
1737 offset++;
1738 reg++;
1739 }
1740
1741 /* The last one gets the original register number */
1742 assert(offset <= MAX_VGRF_SIZE);
1743 alloc.sizes[i] = offset;
1744 for (int k = reg - offset; k < reg; k++)
1745 new_virtual_grf[k] = i;
1746 }
1747 assert(reg == reg_count);
1748
1749 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1750 if (inst->dst.file == VGRF) {
1751 reg = vgrf_to_reg[inst->dst.nr] + inst->dst.reg_offset;
1752 inst->dst.nr = new_virtual_grf[reg];
1753 inst->dst.reg_offset = new_reg_offset[reg];
1754 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1755 }
1756 for (int i = 0; i < inst->sources; i++) {
1757 if (inst->src[i].file == VGRF) {
1758 reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].reg_offset;
1759 inst->src[i].nr = new_virtual_grf[reg];
1760 inst->src[i].reg_offset = new_reg_offset[reg];
1761 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1762 }
1763 }
1764 }
1765 invalidate_live_intervals();
1766 }
1767
1768 /**
1769 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1770 *
1771 * During code generation, we create tons of temporary variables, many of
1772 * which get immediately killed and are never used again. Yet, in later
1773 * optimization and analysis passes, such as compute_live_intervals, we need
1774 * to loop over all the virtual GRFs. Compacting them can save a lot of
1775 * overhead.
1776 */
1777 bool
1778 fs_visitor::compact_virtual_grfs()
1779 {
1780 bool progress = false;
1781 int remap_table[this->alloc.count];
1782 memset(remap_table, -1, sizeof(remap_table));
1783
1784 /* Mark which virtual GRFs are used. */
1785 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1786 if (inst->dst.file == VGRF)
1787 remap_table[inst->dst.nr] = 0;
1788
1789 for (int i = 0; i < inst->sources; i++) {
1790 if (inst->src[i].file == VGRF)
1791 remap_table[inst->src[i].nr] = 0;
1792 }
1793 }
1794
1795 /* Compact the GRF arrays. */
1796 int new_index = 0;
1797 for (unsigned i = 0; i < this->alloc.count; i++) {
1798 if (remap_table[i] == -1) {
1799 /* We just found an unused register. This means that we are
1800 * actually going to compact something.
1801 */
1802 progress = true;
1803 } else {
1804 remap_table[i] = new_index;
1805 alloc.sizes[new_index] = alloc.sizes[i];
1806 invalidate_live_intervals();
1807 ++new_index;
1808 }
1809 }
1810
1811 this->alloc.count = new_index;
1812
1813 /* Patch all the instructions to use the newly renumbered registers */
1814 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1815 if (inst->dst.file == VGRF)
1816 inst->dst.nr = remap_table[inst->dst.nr];
1817
1818 for (int i = 0; i < inst->sources; i++) {
1819 if (inst->src[i].file == VGRF)
1820 inst->src[i].nr = remap_table[inst->src[i].nr];
1821 }
1822 }
1823
1824 /* Patch all the references to delta_xy, since they're used in register
1825 * allocation. If they're unused, switch them to BAD_FILE so we don't
1826 * think some random VGRF is delta_xy.
1827 */
1828 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
1829 if (delta_xy[i].file == VGRF) {
1830 if (remap_table[delta_xy[i].nr] != -1) {
1831 delta_xy[i].nr = remap_table[delta_xy[i].nr];
1832 } else {
1833 delta_xy[i].file = BAD_FILE;
1834 }
1835 }
1836 }
1837
1838 return progress;
1839 }
1840
1841 /**
1842 * Assign UNIFORM file registers to either push constants or pull constants.
1843 *
1844 * We allow a fragment shader to have more than the specified minimum
1845 * maximum number of fragment shader uniform components (64). If
1846 * there are too many of these, they'd fill up all of the register space.
1847 * So, this will push some of them out to the pull constant buffer and
1848 * update the program to load them. We also use pull constants for all
1849 * indirect constant loads because we don't support indirect accesses in
1850 * registers yet.
1851 */
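/* A hypothetical example of the resulting split: with 160 live uniform
 * components and the 128-component push limit below, the first 128 live
 * uniforms (in index order) get slots in push_constant_loc[], while the
 * remaining 32, plus anything accessed through reladdr, get slots in
 * pull_constant_loc[] instead.
 */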
1852 void
1853 fs_visitor::assign_constant_locations()
1854 {
1855 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1856 if (dispatch_width != 8)
1857 return;
1858
1859 unsigned int num_pull_constants = 0;
1860
1861 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1862 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
1863
1864 bool is_live[uniforms];
1865 memset(is_live, 0, sizeof(is_live));
1866
1867 /* First, we walk through the instructions and do two things:
1868 *
1869 * 1) Figure out which uniforms are live.
1870 *
1871 * 2) Find all indirect accesses of uniform arrays and flag them as needing
1872 * to go into the pull constant buffer.
1873 *
1874 * Note that we don't move constant-indexed accesses to arrays. No
1875 * testing has been done of the performance impact of this choice.
1876 */
1877 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
1878 for (int i = 0 ; i < inst->sources; i++) {
1879 if (inst->src[i].file != UNIFORM)
1880 continue;
1881
1882 if (inst->src[i].reladdr) {
1883 int uniform = inst->src[i].nr;
1884
1885 /* If this array isn't already present in the pull constant buffer,
1886 * add it.
1887 */
1888 if (pull_constant_loc[uniform] == -1) {
1889 assert(param_size[uniform]);
1890 for (int j = 0; j < param_size[uniform]; j++)
1891 pull_constant_loc[uniform + j] = num_pull_constants++;
1892 }
1893 } else {
1894 /* Mark the one accessed uniform as live */
1895 int constant_nr = inst->src[i].nr + inst->src[i].reg_offset;
1896 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1897 is_live[constant_nr] = true;
1898 }
1899 }
1900 }
1901
1902 /* Only allow 16 registers (128 uniform components) as push constants.
1903 *
1904 * Just demote the end of the list. We could probably do better
1905 * here, demoting things that are rarely used in the program first.
1906 *
1907 * If changing this value, note the limitation about total_regs in
1908 * brw_curbe.c.
1909 */
1910 unsigned int max_push_components = 16 * 8;
1911 unsigned int num_push_constants = 0;
1912
1913 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1914
1915 for (unsigned int i = 0; i < uniforms; i++) {
1916 if (!is_live[i] || pull_constant_loc[i] != -1) {
1917 /* This UNIFORM register is either dead, or has already been demoted
1918 * to a pull const. Mark it as no longer living in the param[] array.
1919 */
1920 push_constant_loc[i] = -1;
1921 continue;
1922 }
1923
1924 if (num_push_constants < max_push_components) {
1925 /* Retain as a push constant. Record the location in the param[]
1926 * array.
1927 */
1928 push_constant_loc[i] = num_push_constants++;
1929 } else {
1930 /* Demote to a pull constant. */
1931 push_constant_loc[i] = -1;
1932 pull_constant_loc[i] = num_pull_constants++;
1933 }
1934 }
1935
1936 stage_prog_data->nr_params = num_push_constants;
1937 stage_prog_data->nr_pull_params = num_pull_constants;
1938
1939 /* Up until now, the param[] array has been indexed by reg + reg_offset
1940 * of UNIFORM registers. Move pull constants into pull_param[] and
1941 * condense param[] to only contain the uniforms we chose to push.
1942 *
1943 * NOTE: Because we are condensing the param[] array, we know that
1944 * push_constant_loc[i] <= i and we can do it in one smooth loop without
1945 * having to make a copy.
1946 */
1947 for (unsigned int i = 0; i < uniforms; i++) {
1948 const gl_constant_value *value = stage_prog_data->param[i];
1949
1950 if (pull_constant_loc[i] != -1) {
1951 stage_prog_data->pull_param[pull_constant_loc[i]] = value;
1952 } else if (push_constant_loc[i] != -1) {
1953 stage_prog_data->param[push_constant_loc[i]] = value;
1954 }
1955 }
1956 }
1957
1958 /**
1959 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1960 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1961 */
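/* Schematically (a sketch in IR-dump notation, register numbers
 * hypothetical), a UNIFORM source such as
 *
 *    add(8)   vgrf4:F   u8:F   vgrf2:F
 *
 * is rewritten so that u8 is first loaded from the pull constant buffer
 * into a fresh VGRF, and the ADD then reads the appropriate component of
 * that temporary instead of the uniform.
 */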
1962 void
1963 fs_visitor::demote_pull_constants()
1964 {
1965 foreach_block_and_inst (block, fs_inst, inst, cfg) {
1966 for (int i = 0; i < inst->sources; i++) {
1967 if (inst->src[i].file != UNIFORM)
1968 continue;
1969
1970 int pull_index;
1971 unsigned location = inst->src[i].nr + inst->src[i].reg_offset;
1972 if (location >= uniforms) /* Out of bounds access */
1973 pull_index = -1;
1974 else
1975 pull_index = pull_constant_loc[location];
1976
1977 if (pull_index == -1)
1978 continue;
1979
1980 /* Set up the annotation tracking for newly generated instructions. */
1981 const fs_builder ibld(this, block, inst);
1982 const unsigned index = stage_prog_data->binding_table.pull_constants_start;
1983 fs_reg dst = vgrf(glsl_type::float_type);
1984
1985 assert(inst->src[i].stride == 0);
1986
1987 /* Generate a pull load into dst. */
1988 if (inst->src[i].reladdr) {
1989 VARYING_PULL_CONSTANT_LOAD(ibld, dst,
1990 brw_imm_ud(index),
1991 *inst->src[i].reladdr,
1992 pull_index);
1993 inst->src[i].reladdr = NULL;
1994 inst->src[i].stride = 1;
1995 } else {
1996 const fs_builder ubld = ibld.exec_all().group(8, 0);
1997 struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
1998 ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1999 dst, brw_imm_ud(index), offset);
2000 inst->src[i].set_smear(pull_index & 3);
2001 }
2002 brw_mark_surface_used(prog_data, index);
2003
2004 /* Rewrite the instruction to use the temporary VGRF. */
2005 inst->src[i].file = VGRF;
2006 inst->src[i].nr = dst.nr;
2007 inst->src[i].reg_offset = 0;
2008 }
2009 }
2010 invalidate_live_intervals();
2011 }
2012
2013 bool
2014 fs_visitor::opt_algebraic()
2015 {
2016 bool progress = false;
2017
2018 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2019 switch (inst->opcode) {
2020 case BRW_OPCODE_MOV:
2021 if (inst->src[0].file != IMM)
2022 break;
2023
2024 if (inst->saturate) {
2025 if (inst->dst.type != inst->src[0].type)
2026 assert(!"unimplemented: saturate mixed types");
2027
2028 if (brw_saturate_immediate(inst->dst.type,
2029 &inst->src[0].as_brw_reg())) {
2030 inst->saturate = false;
2031 progress = true;
2032 }
2033 }
2034 break;
2035
2036 case BRW_OPCODE_MUL:
2037 if (inst->src[1].file != IMM)
2038 continue;
2039
2040 /* a * 1.0 = a */
2041 if (inst->src[1].is_one()) {
2042 inst->opcode = BRW_OPCODE_MOV;
2043 inst->src[1] = reg_undef;
2044 progress = true;
2045 break;
2046 }
2047
2048 /* a * -1.0 = -a */
2049 if (inst->src[1].is_negative_one()) {
2050 inst->opcode = BRW_OPCODE_MOV;
2051 inst->src[0].negate = !inst->src[0].negate;
2052 inst->src[1] = reg_undef;
2053 progress = true;
2054 break;
2055 }
2056
2057 /* a * 0.0 = 0.0 */
2058 if (inst->src[1].is_zero()) {
2059 inst->opcode = BRW_OPCODE_MOV;
2060 inst->src[0] = inst->src[1];
2061 inst->src[1] = reg_undef;
2062 progress = true;
2063 break;
2064 }
2065
2066 if (inst->src[0].file == IMM) {
2067 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2068 inst->opcode = BRW_OPCODE_MOV;
2069 inst->src[0].f *= inst->src[1].f;
2070 inst->src[1] = reg_undef;
2071 progress = true;
2072 break;
2073 }
2074 break;
2075 case BRW_OPCODE_ADD:
2076 if (inst->src[1].file != IMM)
2077 continue;
2078
2079 /* a + 0.0 = a */
2080 if (inst->src[1].is_zero()) {
2081 inst->opcode = BRW_OPCODE_MOV;
2082 inst->src[1] = reg_undef;
2083 progress = true;
2084 break;
2085 }
2086
2087 if (inst->src[0].file == IMM) {
2088 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2089 inst->opcode = BRW_OPCODE_MOV;
2090 inst->src[0].f += inst->src[1].f;
2091 inst->src[1] = reg_undef;
2092 progress = true;
2093 break;
2094 }
2095 break;
2096 case BRW_OPCODE_OR:
2097 if (inst->src[0].equals(inst->src[1])) {
2098 inst->opcode = BRW_OPCODE_MOV;
2099 inst->src[1] = reg_undef;
2100 progress = true;
2101 break;
2102 }
2103 break;
2104 case BRW_OPCODE_LRP:
2105 if (inst->src[1].equals(inst->src[2])) {
2106 inst->opcode = BRW_OPCODE_MOV;
2107 inst->src[0] = inst->src[1];
2108 inst->src[1] = reg_undef;
2109 inst->src[2] = reg_undef;
2110 progress = true;
2111 break;
2112 }
2113 break;
2114 case BRW_OPCODE_CMP:
2115 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2116 inst->src[0].abs &&
2117 inst->src[0].negate &&
2118 inst->src[1].is_zero()) {
2119 inst->src[0].abs = false;
2120 inst->src[0].negate = false;
2121 inst->conditional_mod = BRW_CONDITIONAL_Z;
2122 progress = true;
2123 break;
2124 }
2125 break;
2126 case BRW_OPCODE_SEL:
2127 if (inst->src[0].equals(inst->src[1])) {
2128 inst->opcode = BRW_OPCODE_MOV;
2129 inst->src[1] = reg_undef;
2130 inst->predicate = BRW_PREDICATE_NONE;
2131 inst->predicate_inverse = false;
2132 progress = true;
2133 } else if (inst->saturate && inst->src[1].file == IMM) {
2134 switch (inst->conditional_mod) {
2135 case BRW_CONDITIONAL_LE:
2136 case BRW_CONDITIONAL_L:
2137 switch (inst->src[1].type) {
2138 case BRW_REGISTER_TYPE_F:
2139 if (inst->src[1].f >= 1.0f) {
2140 inst->opcode = BRW_OPCODE_MOV;
2141 inst->src[1] = reg_undef;
2142 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2143 progress = true;
2144 }
2145 break;
2146 default:
2147 break;
2148 }
2149 break;
2150 case BRW_CONDITIONAL_GE:
2151 case BRW_CONDITIONAL_G:
2152 switch (inst->src[1].type) {
2153 case BRW_REGISTER_TYPE_F:
2154 if (inst->src[1].f <= 0.0f) {
2155 inst->opcode = BRW_OPCODE_MOV;
2156 inst->src[1] = reg_undef;
2157 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2158 progress = true;
2159 }
2160 break;
2161 default:
2162 break;
2163 }
2164 default:
2165 break;
2166 }
2167 }
2168 break;
2169 case BRW_OPCODE_MAD:
2170 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2171 inst->opcode = BRW_OPCODE_MOV;
2172 inst->src[1] = reg_undef;
2173 inst->src[2] = reg_undef;
2174 progress = true;
2175 } else if (inst->src[0].is_zero()) {
2176 inst->opcode = BRW_OPCODE_MUL;
2177 inst->src[0] = inst->src[2];
2178 inst->src[2] = reg_undef;
2179 progress = true;
2180 } else if (inst->src[1].is_one()) {
2181 inst->opcode = BRW_OPCODE_ADD;
2182 inst->src[1] = inst->src[2];
2183 inst->src[2] = reg_undef;
2184 progress = true;
2185 } else if (inst->src[2].is_one()) {
2186 inst->opcode = BRW_OPCODE_ADD;
2187 inst->src[2] = reg_undef;
2188 progress = true;
2189 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2190 inst->opcode = BRW_OPCODE_ADD;
2191 inst->src[1].f *= inst->src[2].f;
2192 inst->src[2] = reg_undef;
2193 progress = true;
2194 }
2195 break;
2196 case SHADER_OPCODE_RCP: {
2197 fs_inst *prev = (fs_inst *)inst->prev;
2198 if (prev->opcode == SHADER_OPCODE_SQRT) {
2199 if (inst->src[0].equals(prev->dst)) {
2200 inst->opcode = SHADER_OPCODE_RSQ;
2201 inst->src[0] = prev->src[0];
2202 progress = true;
2203 }
2204 }
2205 break;
2206 }
2207 case SHADER_OPCODE_BROADCAST:
2208 if (is_uniform(inst->src[0])) {
2209 inst->opcode = BRW_OPCODE_MOV;
2210 inst->sources = 1;
2211 inst->force_writemask_all = true;
2212 progress = true;
2213 } else if (inst->src[1].file == IMM) {
2214 inst->opcode = BRW_OPCODE_MOV;
2215 inst->src[0] = component(inst->src[0],
2216 inst->src[1].ud);
2217 inst->sources = 1;
2218 inst->force_writemask_all = true;
2219 progress = true;
2220 }
2221 break;
2222
2223 default:
2224 break;
2225 }
2226
2227 /* Swap if src[0] is immediate. */
2228 if (progress && inst->is_commutative()) {
2229 if (inst->src[0].file == IMM) {
2230 fs_reg tmp = inst->src[1];
2231 inst->src[1] = inst->src[0];
2232 inst->src[0] = tmp;
2233 }
2234 }
2235 }
2236 return progress;
2237 }
2238
2239 /**
2240 * Optimize sample messages that have constant zero values for the trailing
2241 * texture coordinates. We can just reduce the message length for these
2242 * instructions instead of reserving registers for them. Trailing parameters
2243 * that aren't sent default to zero anyway. This will cause the dead code
2244 * eliminator to remove the MOV instruction that would otherwise be emitted to
2245 * set up the zero value.
2246 */
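/* As a hypothetical SIMD8 example, a texturing message whose payload was
 * built as
 *
 *    load_payload  { header, u, v, 0.0f, 0.0f }      mlen 5
 *
 * can have its mlen reduced to 3: the two trailing zero parameters are
 * simply not sent, and the hardware treats unsent parameters as zero.
 */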
2247 bool
2248 fs_visitor::opt_zero_samples()
2249 {
2250 /* Gen4 infers the texturing opcode based on the message length so we can't
2251 * change it.
2252 */
2253 if (devinfo->gen < 5)
2254 return false;
2255
2256 bool progress = false;
2257
2258 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2259 if (!inst->is_tex())
2260 continue;
2261
2262 fs_inst *load_payload = (fs_inst *) inst->prev;
2263
2264 if (load_payload->is_head_sentinel() ||
2265 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2266 continue;
2267
2268 /* We don't want to remove the message header or the first parameter.
2269 * Removing the first parameter is not allowed, see the Haswell PRM
2270 * volume 7, page 149:
2271 *
2272 * "Parameter 0 is required except for the sampleinfo message, which
2273 * has no parameter 0"
2274 */
2275 while (inst->mlen > inst->header_size + inst->exec_size / 8 &&
2276 load_payload->src[(inst->mlen - inst->header_size) /
2277 (inst->exec_size / 8) +
2278 inst->header_size - 1].is_zero()) {
2279 inst->mlen -= inst->exec_size / 8;
2280 progress = true;
2281 }
2282 }
2283
2284 if (progress)
2285 invalidate_live_intervals();
2286
2287 return progress;
2288 }
2289
2290 /**
2291 * Optimize sample messages which are followed by the final RT write.
2292 *
2293 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2294 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2295 * final texturing results copied to the framebuffer write payload and modify
2296 * them to write to the framebuffer directly.
2297 */
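/* Schematically (not actual assembly), the final pair
 *
 *    tex(16)       vgrf10  ...
 *    fb_write(16)  ...     vgrf10  { EOT }
 *
 * is collapsed into a single texturing SEND with EOT set and the render
 * target encoded in bits 31:24 of the message header offset, after which
 * the trailing FB write is removed.
 */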
2298 bool
2299 fs_visitor::opt_sampler_eot()
2300 {
2301 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2302
2303 if (stage != MESA_SHADER_FRAGMENT)
2304 return false;
2305
2306 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2307 return false;
2308
2309 /* FINISHME: It should be possible to implement this optimization when there
2310 * are multiple drawbuffers.
2311 */
2312 if (key->nr_color_regions != 1)
2313 return false;
2314
2315 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2316 bblock_t *block = cfg->blocks[cfg->num_blocks - 1];
2317 fs_inst *fb_write = (fs_inst *)block->end();
2318 assert(fb_write->eot);
2319 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2320
2321 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2322
2323 /* There wasn't one; nothing to do. */
2324 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2325 return false;
2326
2327 /* 3D Sampler » Messages » Message Format
2328 *
2329 * “Response Length of zero is allowed on all SIMD8* and SIMD16* sampler
2330 * messages except sample+killpix, resinfo, sampleinfo, LOD, and gather4*”
2331 */
2332 if (tex_inst->opcode == SHADER_OPCODE_TXS ||
2333 tex_inst->opcode == SHADER_OPCODE_SAMPLEINFO ||
2334 tex_inst->opcode == SHADER_OPCODE_LOD ||
2335 tex_inst->opcode == SHADER_OPCODE_TG4 ||
2336 tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2337 return false;
2338
2339 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2340 * It's very likely to be the previous instruction.
2341 */
2342 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2343 if (load_payload->is_head_sentinel() ||
2344 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2345 return false;
2346
2347 assert(!tex_inst->eot); /* We can't get here twice */
2348 assert((tex_inst->offset & (0xff << 24)) == 0);
2349
2350 const fs_builder ibld(this, block, tex_inst);
2351
2352 tex_inst->offset |= fb_write->target << 24;
2353 tex_inst->eot = true;
2354 tex_inst->dst = ibld.null_reg_ud();
2355 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2356
2357 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2358 * to create a new LOAD_PAYLOAD command with the same sources and a space
2359 * saved for the header. Using a new destination register not only makes sure
2360 * we have enough space, but also ensures that the dead code eliminator kills
2361 * the instruction that this will replace.
2362 */
2363 if (tex_inst->header_size != 0)
2364 return true;
2365
2366 fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F,
2367 load_payload->sources + 1);
2368 fs_reg *new_sources =
2369 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2370
2371 new_sources[0] = fs_reg();
2372 for (int i = 0; i < load_payload->sources; i++)
2373 new_sources[i+1] = load_payload->src[i];
2374
2375 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2376 * requires a lot of information about the sources to appropriately figure
2377 * out the number of registers that need to be used. Given this stage in our
2378 * optimization, we may not have the appropriate GRFs required by
2379 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2380 * manually emit the instruction.
2381 */
2382 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2383 load_payload->exec_size,
2384 send_header,
2385 new_sources,
2386 load_payload->sources + 1);
2387
2388 new_load_payload->regs_written = load_payload->regs_written + 1;
2389 new_load_payload->header_size = 1;
2390 tex_inst->mlen++;
2391 tex_inst->header_size = 1;
2392 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2393 tex_inst->src[0] = send_header;
2394
2395 return true;
2396 }
2397
2398 bool
2399 fs_visitor::opt_register_renaming()
2400 {
2401 bool progress = false;
2402 int depth = 0;
2403
2404 int remap[alloc.count];
2405 memset(remap, -1, sizeof(int) * alloc.count);
2406
2407 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2408 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2409 depth++;
2410 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2411 inst->opcode == BRW_OPCODE_WHILE) {
2412 depth--;
2413 }
2414
2415 /* Rewrite instruction sources. */
2416 for (int i = 0; i < inst->sources; i++) {
2417 if (inst->src[i].file == VGRF &&
2418 remap[inst->src[i].nr] != -1 &&
2419 remap[inst->src[i].nr] != inst->src[i].nr) {
2420 inst->src[i].nr = remap[inst->src[i].nr];
2421 progress = true;
2422 }
2423 }
2424
2425 const int dst = inst->dst.nr;
2426
2427 if (depth == 0 &&
2428 inst->dst.file == VGRF &&
2429 alloc.sizes[inst->dst.nr] == inst->exec_size / 8 &&
2430 !inst->is_partial_write()) {
2431 if (remap[dst] == -1) {
2432 remap[dst] = dst;
2433 } else {
2434 remap[dst] = alloc.allocate(inst->exec_size / 8);
2435 inst->dst.nr = remap[dst];
2436 progress = true;
2437 }
2438 } else if (inst->dst.file == VGRF &&
2439 remap[dst] != -1 &&
2440 remap[dst] != dst) {
2441 inst->dst.nr = remap[dst];
2442 progress = true;
2443 }
2444 }
2445
2446 if (progress) {
2447 invalidate_live_intervals();
2448
2449 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2450 if (delta_xy[i].file == VGRF && remap[delta_xy[i].nr] != -1) {
2451 delta_xy[i].nr = remap[delta_xy[i].nr];
2452 }
2453 }
2454 }
2455
2456 return progress;
2457 }
2458
2459 /**
2460 * Remove redundant or useless discard jumps.
2461 *
2462 * For example, we can eliminate jumps in the following sequence:
2463 *
2464 * discard-jump (redundant with the next jump)
2465 * discard-jump (useless; jumps to the next instruction)
2466 * placeholder-halt
2467 */
2468 bool
2469 fs_visitor::opt_redundant_discard_jumps()
2470 {
2471 bool progress = false;
2472
2473 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2474
2475 fs_inst *placeholder_halt = NULL;
2476 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2477 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2478 placeholder_halt = inst;
2479 break;
2480 }
2481 }
2482
2483 if (!placeholder_halt)
2484 return false;
2485
2486 /* Delete any HALTs immediately before the placeholder halt. */
2487 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2488 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2489 prev = (fs_inst *) placeholder_halt->prev) {
2490 prev->remove(last_bblock);
2491 progress = true;
2492 }
2493
2494 if (progress)
2495 invalidate_live_intervals();
2496
2497 return progress;
2498 }
2499
2500 bool
2501 fs_visitor::compute_to_mrf()
2502 {
2503 bool progress = false;
2504 int next_ip = 0;
2505
2506 /* No MRFs on Gen >= 7. */
2507 if (devinfo->gen >= 7)
2508 return false;
2509
2510 calculate_live_intervals();
2511
2512 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2513 int ip = next_ip;
2514 next_ip++;
2515
2516 if (inst->opcode != BRW_OPCODE_MOV ||
2517 inst->is_partial_write() ||
2518 inst->dst.file != MRF || inst->src[0].file != VGRF ||
2519 inst->dst.type != inst->src[0].type ||
2520 inst->src[0].abs || inst->src[0].negate ||
2521 !inst->src[0].is_contiguous() ||
2522 inst->src[0].subreg_offset)
2523 continue;
2524
2525 /* Work out which hardware MRF registers are written by this
2526 * instruction.
2527 */
2528 int mrf_low = inst->dst.nr & ~BRW_MRF_COMPR4;
2529 int mrf_high;
2530 if (inst->dst.nr & BRW_MRF_COMPR4) {
2531 mrf_high = mrf_low + 4;
2532 } else if (inst->exec_size == 16) {
2533 mrf_high = mrf_low + 1;
2534 } else {
2535 mrf_high = mrf_low;
2536 }
2537
2538 /* Can't compute-to-MRF this GRF if someone else was going to
2539 * read it later.
2540 */
2541 if (this->virtual_grf_end[inst->src[0].nr] > ip)
2542 continue;
2543
2544 /* Found a move of a GRF to an MRF. Let's see if we can rewrite
2545 * the instruction that generated this GRF to write into the MRF instead.
2546 */
2547 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
2548 if (scan_inst->dst.file == VGRF &&
2549 scan_inst->dst.nr == inst->src[0].nr) {
2550 /* Found the last instruction that writes the register we want to
2551 * turn into a compute-to-MRF.
2552 */
2553
2554 /* If this one instruction didn't populate all the
2555 * channels, bail. We might be able to rewrite everything
2556 * that writes that reg, but it would require smarter
2557 * tracking to delay the rewriting until complete success.
2558 */
2559 if (scan_inst->is_partial_write())
2560 break;
2561
2562 /* Instructions returning more than one register would require us to
2563 * understand how to coalesce more than one MOV at a time.
2564 */
2565 if (scan_inst->regs_written > scan_inst->exec_size / 8)
2566 break;
2567
2568 /* SEND instructions can't have MRF as a destination. */
2569 if (scan_inst->mlen)
2570 break;
2571
2572 if (devinfo->gen == 6) {
2573 /* gen6 math instructions must have the destination be
2574 * GRF, so no compute-to-MRF for them.
2575 */
2576 if (scan_inst->is_math()) {
2577 break;
2578 }
2579 }
2580
2581 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2582 /* Found the creator of our MRF's source value. */
2583 scan_inst->dst.file = MRF;
2584 scan_inst->dst.nr = inst->dst.nr;
2585 scan_inst->saturate |= inst->saturate;
2586 inst->remove(block);
2587 progress = true;
2588 }
2589 break;
2590 }
2591
2592 /* We don't handle control flow here. Most computation of
2593 * values that end up in MRFs are shortly before the MRF
2594 * write anyway.
2595 */
2596 if (block->start() == scan_inst)
2597 break;
2598
2599 /* You can't read from an MRF, so if someone else reads our
2600 * MRF's source GRF that we wanted to rewrite, that stops us.
2601 */
2602 bool interfered = false;
2603 for (int i = 0; i < scan_inst->sources; i++) {
2604 if (scan_inst->src[i].file == VGRF &&
2605 scan_inst->src[i].nr == inst->src[0].nr &&
2606 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2607 interfered = true;
2608 }
2609 }
2610 if (interfered)
2611 break;
2612
2613 if (scan_inst->dst.file == MRF) {
2614 /* If somebody else writes our MRF here, we can't
2615 * compute-to-MRF before that.
2616 */
2617 int scan_mrf_low = scan_inst->dst.nr & ~BRW_MRF_COMPR4;
2618 int scan_mrf_high;
2619
2620 if (scan_inst->dst.nr & BRW_MRF_COMPR4) {
2621 scan_mrf_high = scan_mrf_low + 4;
2622 } else if (scan_inst->exec_size == 16) {
2623 scan_mrf_high = scan_mrf_low + 1;
2624 } else {
2625 scan_mrf_high = scan_mrf_low;
2626 }
2627
2628 if (mrf_low == scan_mrf_low ||
2629 mrf_low == scan_mrf_high ||
2630 mrf_high == scan_mrf_low ||
2631 mrf_high == scan_mrf_high) {
2632 break;
2633 }
2634 }
2635
2636 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2637 /* Found a SEND instruction, which means that there are
2638 * live values in MRFs from base_mrf to base_mrf +
2639 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2640 * above it.
2641 */
2642 if (mrf_low >= scan_inst->base_mrf &&
2643 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2644 break;
2645 }
2646 if (mrf_high >= scan_inst->base_mrf &&
2647 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2648 break;
2649 }
2650 }
2651 }
2652 }
2653
2654 if (progress)
2655 invalidate_live_intervals();
2656
2657 return progress;
2658 }
2659
2660 /**
2661 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2662 * flow. We could probably do better here with some form of divergence
2663 * analysis.
2664 */
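/* Outside of any control flow, channel 0 is always live, so the instruction
 * can simply be replaced by an immediate zero. Roughly (register numbers
 * hypothetical):
 *
 *    find_live_channel(8)  vgrf4:UD    ->    mov(8)  vgrf4:UD  0x0UD
 */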
2665 bool
2666 fs_visitor::eliminate_find_live_channel()
2667 {
2668 bool progress = false;
2669 unsigned depth = 0;
2670
2671 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2672 switch (inst->opcode) {
2673 case BRW_OPCODE_IF:
2674 case BRW_OPCODE_DO:
2675 depth++;
2676 break;
2677
2678 case BRW_OPCODE_ENDIF:
2679 case BRW_OPCODE_WHILE:
2680 depth--;
2681 break;
2682
2683 case FS_OPCODE_DISCARD_JUMP:
2684 /* This can potentially make control flow non-uniform until the end
2685 * of the program.
2686 */
2687 return progress;
2688
2689 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2690 if (depth == 0) {
2691 inst->opcode = BRW_OPCODE_MOV;
2692 inst->src[0] = brw_imm_ud(0u);
2693 inst->sources = 1;
2694 inst->force_writemask_all = true;
2695 progress = true;
2696 }
2697 break;
2698
2699 default:
2700 break;
2701 }
2702 }
2703
2704 return progress;
2705 }
2706
2707 /**
2708 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2709 * instructions to FS_OPCODE_REP_FB_WRITE.
2710 */
2711 void
2712 fs_visitor::emit_repclear_shader()
2713 {
2714 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2715 int base_mrf = 1;
2716 int color_mrf = base_mrf + 2;
2717
2718 fs_inst *mov = bld.exec_all().group(4, 0)
2719 .MOV(brw_message_reg(color_mrf),
2720 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
2721
2722 fs_inst *write;
2723 if (key->nr_color_regions == 1) {
2724 write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2725 write->saturate = key->clamp_fragment_color;
2726 write->base_mrf = color_mrf;
2727 write->target = 0;
2728 write->header_size = 0;
2729 write->mlen = 1;
2730 } else {
2731 assume(key->nr_color_regions > 0);
2732 for (int i = 0; i < key->nr_color_regions; ++i) {
2733 write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2734 write->saturate = key->clamp_fragment_color;
2735 write->base_mrf = base_mrf;
2736 write->target = i;
2737 write->header_size = 2;
2738 write->mlen = 3;
2739 }
2740 }
2741 write->eot = true;
2742
2743 calculate_cfg();
2744
2745 assign_constant_locations();
2746 assign_curb_setup();
2747
2748 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2749 assert(mov->src[0].file == FIXED_GRF);
2750 mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
2751 }
2752
2753 /**
2754 * Walks through basic blocks, looking for repeated MRF writes and
2755 * removing the later ones.
2756 */
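/* For instance (a hypothetical sequence), in
 *
 *    mov(8)  m2  vgrf4
 *    ...                      (nothing writes m2 or vgrf4 in between)
 *    mov(8)  m2  vgrf4
 *
 * the second MOV is redundant and gets removed.
 */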
2757 bool
2758 fs_visitor::remove_duplicate_mrf_writes()
2759 {
2760 fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->gen)];
2761 bool progress = false;
2762
2763 /* We'd need to update the MRF tracking to handle compressed instructions, so skip SIMD16 for now. */
2764 if (dispatch_width == 16)
2765 return false;
2766
2767 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2768
2769 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2770 if (inst->is_control_flow()) {
2771 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2772 }
2773
2774 if (inst->opcode == BRW_OPCODE_MOV &&
2775 inst->dst.file == MRF) {
2776 fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
2777 if (prev_inst && inst->equals(prev_inst)) {
2778 inst->remove(block);
2779 progress = true;
2780 continue;
2781 }
2782 }
2783
2784 /* Clear out the last-write records for MRFs that were overwritten. */
2785 if (inst->dst.file == MRF) {
2786 last_mrf_move[inst->dst.nr] = NULL;
2787 }
2788
2789 if (inst->mlen > 0 && inst->base_mrf != -1) {
2790 /* Found a SEND instruction, which will include two or fewer
2791 * implied MRF writes. We could do better here.
2792 */
2793 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2794 last_mrf_move[inst->base_mrf + i] = NULL;
2795 }
2796 }
2797
2798 /* Clear out any MRF move records whose sources got overwritten. */
2799 if (inst->dst.file == VGRF) {
2800 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2801 if (last_mrf_move[i] &&
2802 last_mrf_move[i]->src[0].nr == inst->dst.nr) {
2803 last_mrf_move[i] = NULL;
2804 }
2805 }
2806 }
2807
2808 if (inst->opcode == BRW_OPCODE_MOV &&
2809 inst->dst.file == MRF &&
2810 inst->src[0].file == VGRF &&
2811 !inst->is_partial_write()) {
2812 last_mrf_move[inst->dst.nr] = inst;
2813 }
2814 }
2815
2816 if (progress)
2817 invalidate_live_intervals();
2818
2819 return progress;
2820 }
2821
2822 static void
2823 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2824 {
2825 /* Clear the flag for registers that actually got read (as expected). */
2826 for (int i = 0; i < inst->sources; i++) {
2827 int grf;
2828 if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) {
2829 grf = inst->src[i].nr;
2830 } else {
2831 continue;
2832 }
2833
2834 if (grf >= first_grf &&
2835 grf < first_grf + grf_len) {
2836 deps[grf - first_grf] = false;
2837 if (inst->exec_size == 16)
2838 deps[grf - first_grf + 1] = false;
2839 }
2840 }
2841 }
2842
2843 /**
2844 * Implements this workaround for the original 965:
2845 *
2846 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2847 * check for post destination dependencies on this instruction, software
2848 * must ensure that there is no destination hazard for the case of ‘write
2849 * followed by a posted write’ shown in the following example.
2850 *
2851 * 1. mov r3 0
2852 * 2. send r3.xy <rest of send instruction>
2853 * 3. mov r2 r3
2854 *
2855 * Due to no post-destination dependency check on the ‘send’, the above
2856 * code sequence could have two instructions (1 and 2) in flight at the
2857 * same time that both consider ‘r3’ as the target of their final writes.
2858 */
2859 void
2860 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2861 fs_inst *inst)
2862 {
2863 int write_len = inst->regs_written;
2864 int first_write_grf = inst->dst.nr;
2865 bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
2866 assert(write_len < (int)sizeof(needs_dep) - 1);
2867
2868 memset(needs_dep, false, sizeof(needs_dep));
2869 memset(needs_dep, true, write_len);
2870
2871 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2872
2873 /* Walk backwards looking for writes to registers we're writing which
2874 * aren't read since being written. If we hit the start of the program,
2875 * we assume that there are no outstanding dependencies on entry to the
2876 * program.
2877 */
2878 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
2879 /* If we hit control flow, assume that there *are* outstanding
2880 * dependencies, and force their cleanup before our instruction.
2881 */
2882 if (block->start() == scan_inst) {
2883 for (int i = 0; i < write_len; i++) {
2884 if (needs_dep[i])
2885 DEP_RESOLVE_MOV(fs_builder(this, block, inst),
2886 first_write_grf + i);
2887 }
2888 return;
2889 }
2890
2891 /* We insert our reads as late as possible on the assumption that any
2892 * instruction but a MOV that might have left us an outstanding
2893 * dependency has more latency than a MOV.
2894 */
2895 if (scan_inst->dst.file == VGRF) {
2896 for (int i = 0; i < scan_inst->regs_written; i++) {
2897 int reg = scan_inst->dst.nr + i;
2898
2899 if (reg >= first_write_grf &&
2900 reg < first_write_grf + write_len &&
2901 needs_dep[reg - first_write_grf]) {
2902 DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
2903 needs_dep[reg - first_write_grf] = false;
2904 if (scan_inst->exec_size == 16)
2905 needs_dep[reg - first_write_grf + 1] = false;
2906 }
2907 }
2908 }
2909
2910 /* Clear the flag for registers that actually got read (as expected). */
2911 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2912
2913 /* Continue the loop only if we haven't resolved all the dependencies */
2914 int i;
2915 for (i = 0; i < write_len; i++) {
2916 if (needs_dep[i])
2917 break;
2918 }
2919 if (i == write_len)
2920 return;
2921 }
2922 }
2923
2924 /**
2925 * Implements this workaround for the original 965:
2926 *
2927 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2928 * used as a destination register until after it has been sourced by an
2929 * instruction with a different destination register.
2930 */
2931 void
2932 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2933 {
2934 int write_len = inst->regs_written;
2935 int first_write_grf = inst->dst.nr;
2936 bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
2937 assert(write_len < (int)sizeof(needs_dep) - 1);
2938
2939 memset(needs_dep, false, sizeof(needs_dep));
2940 memset(needs_dep, true, write_len);
2941 /* Walk forwards looking for writes to registers we're writing which aren't
2942 * read before being written.
2943 */
2944 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) {
2945 /* If we hit control flow, force resolve all remaining dependencies. */
2946 if (block->end() == scan_inst) {
2947 for (int i = 0; i < write_len; i++) {
2948 if (needs_dep[i])
2949 DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
2950 first_write_grf + i);
2951 }
2952 return;
2953 }
2954
2955 /* Clear the flag for registers that actually got read (as expected). */
2956 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2957
2958 /* We insert our reads as late as possible since they're reading the
2959 * result of a SEND, which has massive latency.
2960 */
2961 if (scan_inst->dst.file == VGRF &&
2962 scan_inst->dst.nr >= first_write_grf &&
2963 scan_inst->dst.nr < first_write_grf + write_len &&
2964 needs_dep[scan_inst->dst.nr - first_write_grf]) {
2965 DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
2966 scan_inst->dst.nr);
2967 needs_dep[scan_inst->dst.nr - first_write_grf] = false;
2968 }
2969
2970 /* Continue the loop only if we haven't resolved all the dependencies */
2971 int i;
2972 for (i = 0; i < write_len; i++) {
2973 if (needs_dep[i])
2974 break;
2975 }
2976 if (i == write_len)
2977 return;
2978 }
2979 }
2980
2981 void
2982 fs_visitor::insert_gen4_send_dependency_workarounds()
2983 {
2984 if (devinfo->gen != 4 || devinfo->is_g4x)
2985 return;
2986
2987 bool progress = false;
2988
2989 /* Note that we're done with register allocation, so GRF fs_regs always
2990 * have a .reg_offset of 0.
2991 */
2992
2993 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2994 if (inst->mlen != 0 && inst->dst.file == VGRF) {
2995 insert_gen4_pre_send_dependency_workarounds(block, inst);
2996 insert_gen4_post_send_dependency_workarounds(block, inst);
2997 progress = true;
2998 }
2999 }
3000
3001 if (progress)
3002 invalidate_live_intervals();
3003 }
3004
3005 /**
3006 * Turns the generic expression-style uniform pull constant load instruction
3007 * into a hardware-specific series of instructions for loading a pull
3008 * constant.
3009 *
3010 * The expression style allows the CSE pass before this to optimize out
3011 * repeated loads from the same offset, and gives the pre-register-allocation
3012 * scheduling full flexibility, while the conversion to native instructions
3013 * allows the post-register-allocation scheduler the best information
3014 * possible.
3015 *
3016 * Note that execution masking for setting up pull constant loads is special:
3017 * the channels that need to be written are unrelated to the current execution
3018 * mask, since a later instruction will use one of the result channels as a
3019 * source operand for all 8 or 16 of its channels.
3020 */
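/* On Gen7+, roughly speaking (register numbers hypothetical), the generic
 *
 *    uniform_pull_constant_load(8)  vgrf10:F  surface_index  byte_offset
 *
 * is split into a force_writemask_all setup instruction that places the
 * dword offset into a payload register, followed by the Gen7-specific
 * SIMD4x2 pull constant load that reads from that payload.
 */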
3021 void
3022 fs_visitor::lower_uniform_pull_constant_loads()
3023 {
3024 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3025 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3026 continue;
3027
3028 if (devinfo->gen >= 7) {
3029 /* The offset arg before was a vec4-aligned byte offset. We need to
3030 * turn it into a dword offset.
3031 */
3032 fs_reg const_offset_reg = inst->src[1];
3033 assert(const_offset_reg.file == IMM &&
3034 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3035 const_offset_reg.ud /= 4;
3036
3037 fs_reg payload, offset;
3038 if (devinfo->gen >= 9) {
3039 /* We have to use a message header on Skylake to get SIMD4x2
3040 * mode. Reserve space for the register.
3041 */
3042 offset = payload = fs_reg(VGRF, alloc.allocate(2));
3043 offset.reg_offset++;
3044 inst->mlen = 2;
3045 } else {
3046 offset = payload = fs_reg(VGRF, alloc.allocate(1));
3047 inst->mlen = 1;
3048 }
3049
3050 /* This is actually going to be a MOV, but since only the first dword
3051 * is accessed, we have a special opcode to do just that one. Note
3052 * that this needs to be an operation that will be considered a def
3053 * by live variable analysis, or register allocation will explode.
3054 */
3055 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3056 8, offset, const_offset_reg);
3057 setup->force_writemask_all = true;
3058
3059 setup->ir = inst->ir;
3060 setup->annotation = inst->annotation;
3061 inst->insert_before(block, setup);
3062
3063 /* Similarly, this will only populate the first 4 channels of the
3064 * result register (since we only use smear values from 0-3), but we
3065 * don't tell the optimizer.
3066 */
3067 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3068 inst->src[1] = payload;
3069 inst->base_mrf = -1;
3070
3071 invalidate_live_intervals();
3072 } else {
3073 /* Before register allocation, we didn't tell the scheduler about the
3074 * MRF we use. We know it's safe to use this MRF because nothing
3075 * else does except for register spill/unspill, which generates and
3076 * uses its MRF within a single IR instruction.
3077 */
3078 inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
3079 inst->mlen = 1;
3080 }
3081 }
3082 }
3083
3084 bool
3085 fs_visitor::lower_load_payload()
3086 {
3087 bool progress = false;
3088
3089 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3090 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3091 continue;
3092
3093 assert(inst->dst.file == MRF || inst->dst.file == VGRF);
3094 assert(inst->saturate == false);
3095 fs_reg dst = inst->dst;
3096
3097 /* Get rid of COMPR4. We'll add it back in if we need it */
3098 if (dst.file == MRF)
3099 dst.nr = dst.nr & ~BRW_MRF_COMPR4;
3100
3101 const fs_builder ibld(this, block, inst);
3102 const fs_builder hbld = ibld.exec_all().group(8, 0);
3103
3104 for (uint8_t i = 0; i < inst->header_size; i++) {
3105 if (inst->src[i].file != BAD_FILE) {
3106 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3107 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3108 hbld.MOV(mov_dst, mov_src);
3109 }
3110 dst = offset(dst, hbld, 1);
3111 }
3112
3113 if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) &&
3114 inst->exec_size > 8) {
3115 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3116 * a straightforward copy. Instead, the result of the
3117 * LOAD_PAYLOAD is treated as interleaved and the first four
3118 * non-header sources are unpacked as:
3119 *
3120 * m + 0: r0
3121 * m + 1: g0
3122 * m + 2: b0
3123 * m + 3: a0
3124 * m + 4: r1
3125 * m + 5: g1
3126 * m + 6: b1
3127 * m + 7: a1
3128 *
3129 * This is used for gen <= 5 fb writes.
3130 */
3131 assert(inst->exec_size == 16);
3132 assert(inst->header_size + 4 <= inst->sources);
3133 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3134 if (inst->src[i].file != BAD_FILE) {
3135 if (devinfo->has_compr4) {
3136 fs_reg compr4_dst = retype(dst, inst->src[i].type);
3137 compr4_dst.nr |= BRW_MRF_COMPR4;
3138 ibld.MOV(compr4_dst, inst->src[i]);
3139 } else {
3140 /* Platform doesn't have COMPR4. We have to fake it */
3141 fs_reg mov_dst = retype(dst, inst->src[i].type);
3142 ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
3143 mov_dst.nr += 4;
3144 ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
3145 }
3146 }
3147
3148 dst.nr++;
3149 }
3150
3151 /* The loop above only ever incremented us through the first set
3152 * of 4 registers. However, thanks to the magic of COMPR4, we
3153 * actually wrote to the first 8 registers, so we need to take
3154 * that into account now.
3155 */
3156 dst.nr += 4;
3157
3158 /* The COMPR4 code took care of the first 4 sources. We'll let
3159 * the regular path handle any remaining sources. Yes, we are
3160 * modifying the instruction but we're about to delete it so
3161 * this really doesn't hurt anything.
3162 */
3163 inst->header_size += 4;
3164 }
3165
3166 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3167 if (inst->src[i].file != BAD_FILE)
3168 ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
3169 dst = offset(dst, ibld, 1);
3170 }
3171
3172 inst->remove(block);
3173 progress = true;
3174 }
3175
3176 if (progress)
3177 invalidate_live_intervals();
3178
3179 return progress;
3180 }
3181
3182 bool
3183 fs_visitor::lower_integer_multiplication()
3184 {
3185 bool progress = false;
3186
3187 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3188 const fs_builder ibld(this, block, inst);
3189
3190 if (inst->opcode == BRW_OPCODE_MUL) {
3191 if (inst->dst.is_accumulator() ||
3192 (inst->dst.type != BRW_REGISTER_TYPE_D &&
3193 inst->dst.type != BRW_REGISTER_TYPE_UD))
3194 continue;
3195
3196 /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit
3197 * operation directly, but CHV/BXT cannot.
3198 */
3199 if (devinfo->gen >= 8 &&
3200 !devinfo->is_cherryview && !devinfo->is_broxton)
3201 continue;
3202
3203 if (inst->src[1].file == IMM &&
3204 inst->src[1].ud < (1 << 16)) {
3205 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3206 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3207 * src1 are used.
3208 *
3209 * If multiplying by an immediate value that fits in 16-bits, do a
3210 * single MUL instruction with that value in the proper location.
3211 */
3212 if (devinfo->gen < 7) {
3213 fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8),
3214 inst->dst.type);
3215 ibld.MOV(imm, inst->src[1]);
3216 ibld.MUL(inst->dst, imm, inst->src[0]);
3217 } else {
3218 ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
3219 }
3220 } else {
3221 /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3222 * do 32-bit integer multiplication in one instruction, but instead
3223 * must do a sequence (which actually calculates a 64-bit result):
3224 *
3225 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3226 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3227 * mov(8) g2<1>D acc0<8,8,1>D
3228 *
3229 * But on Gen > 6, the ability to use the second accumulator register
3230 * (acc1) for non-float data types was removed, preventing a simple
3231 * implementation in SIMD16. A 16-channel result can be calculated by
3232 * executing the three instructions twice in SIMD8, once with quarter
3233 * control of 1Q for the first eight channels and again with 2Q for
3234 * the second eight channels.
3235 *
3236 * Which accumulator register is implicitly accessed (by AccWrEnable
3237 * for instance) is determined by the quarter control. Unfortunately
3238 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3239 * implicit accumulator access by an instruction with 2Q will access
3240 * acc1 regardless of whether the data type is usable in acc1.
3241 *
3242 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3243 * integer data types.
3244 *
3245 * Since we only want the low 32-bits of the result, we can do two
3246 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3247 * adjust the high result and add them (like the mach is doing):
3248 *
3249 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3250 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3251 * shl(8) g9<1>D g8<8,8,1>D 16D
3252 * add(8) g2<1>D g7<8,8,1>D g9<8,8,1>D
3253 *
3254 * We avoid the shl instruction by realizing that we only want to add
3255 * the low 16-bits of the "high" result to the high 16-bits of the
3256 * "low" result and using proper regioning on the add:
3257 *
3258 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3259 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3260 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3261 *
3262 * Since it does not use the (single) accumulator register, we can
3263 * schedule multi-component multiplications much better.
3264 */
3265
3266 fs_reg orig_dst = inst->dst;
3267 if (orig_dst.is_null() || orig_dst.file == MRF) {
3268 inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
3269 inst->dst.type);
3270 }
3271 fs_reg low = inst->dst;
3272 fs_reg high(VGRF, alloc.allocate(dispatch_width / 8),
3273 inst->dst.type);
3274
3275 if (devinfo->gen >= 7) {
3276 fs_reg src1_0_w = inst->src[1];
3277 fs_reg src1_1_w = inst->src[1];
3278
3279 if (inst->src[1].file == IMM) {
3280 src1_0_w.ud &= 0xffff;
3281 src1_1_w.ud >>= 16;
3282 } else {
3283 src1_0_w.type = BRW_REGISTER_TYPE_UW;
3284 if (src1_0_w.stride != 0) {
3285 assert(src1_0_w.stride == 1);
3286 src1_0_w.stride = 2;
3287 }
3288
3289 src1_1_w.type = BRW_REGISTER_TYPE_UW;
3290 if (src1_1_w.stride != 0) {
3291 assert(src1_1_w.stride == 1);
3292 src1_1_w.stride = 2;
3293 }
3294 src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3295 }
3296 ibld.MUL(low, inst->src[0], src1_0_w);
3297 ibld.MUL(high, inst->src[0], src1_1_w);
3298 } else {
3299 fs_reg src0_0_w = inst->src[0];
3300 fs_reg src0_1_w = inst->src[0];
3301
3302 src0_0_w.type = BRW_REGISTER_TYPE_UW;
3303 if (src0_0_w.stride != 0) {
3304 assert(src0_0_w.stride == 1);
3305 src0_0_w.stride = 2;
3306 }
3307
3308 src0_1_w.type = BRW_REGISTER_TYPE_UW;
3309 if (src0_1_w.stride != 0) {
3310 assert(src0_1_w.stride == 1);
3311 src0_1_w.stride = 2;
3312 }
3313 src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3314
3315 ibld.MUL(low, src0_0_w, inst->src[1]);
3316 ibld.MUL(high, src0_1_w, inst->src[1]);
3317 }
3318
3319 fs_reg dst = inst->dst;
3320 dst.type = BRW_REGISTER_TYPE_UW;
3321 dst.subreg_offset = 2;
3322 dst.stride = 2;
3323
3324 high.type = BRW_REGISTER_TYPE_UW;
3325 high.stride = 2;
3326
3327 low.type = BRW_REGISTER_TYPE_UW;
3328 low.subreg_offset = 2;
3329 low.stride = 2;
3330
3331 ibld.ADD(dst, low, high);
3332
3333 if (inst->conditional_mod || orig_dst.file == MRF) {
3334 set_condmod(inst->conditional_mod,
3335 ibld.MOV(orig_dst, inst->dst));
3336 }
3337 }
3338
3339 } else if (inst->opcode == SHADER_OPCODE_MULH) {
3340 /* Should have been lowered to 8-wide. */
3341 assert(inst->exec_size <= 8);
3342 const fs_reg acc = retype(brw_acc_reg(inst->exec_size),
3343 inst->dst.type);
3344 fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
3345 fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
3346
3347 if (devinfo->gen >= 8) {
3348 /* Until Gen8, integer multiplies read 32-bits from one source
3349 * and 16-bits from the other, relying on the MACH instruction
3350 * to generate the high bits of the result.
3351 *
3352 * On Gen8, the multiply instruction does a full 32x32-bit
3353 * multiply, but in order to do a 64-bit multiply we can simulate
3354 * the previous behavior and then use a MACH instruction.
3355 *
3356 * FINISHME: Don't use source modifiers on src1.
3357 */
3358 assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
3359 mul->src[1].type == BRW_REGISTER_TYPE_UD);
3360 mul->src[1].type = (type_is_signed(mul->src[1].type) ?
3361 BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW);
3362 mul->src[1].stride *= 2;
3363
3364 } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
3365 inst->force_sechalf) {
3366 /* Among other things the quarter control bits influence which
3367 * accumulator register is used by the hardware for instructions
3368 * that access the accumulator implicitly (e.g. MACH). A
3369 * second-half instruction would normally map to acc1, which
3370 * doesn't exist on Gen7 and up (the hardware does emulate it for
3371 * floating-point instructions *only* by taking advantage of the
3372 * extra precision of acc0 not normally used for floating point
3373 * arithmetic).
3374 *
3375 * HSW and up are careful enough not to try to access an
3376 * accumulator register that doesn't exist, but on earlier Gen7
3377 * hardware we need to make sure that the quarter control bits are
3378 * zero to avoid non-deterministic behaviour and emit an extra MOV
3379 * to get the result masked correctly according to the current
3380 * channel enables.
3381 */
3382 mach->force_sechalf = false;
3383 mach->force_writemask_all = true;
3384 mach->dst = ibld.vgrf(inst->dst.type);
3385 ibld.MOV(inst->dst, mach->dst);
3386 }
3387 } else {
3388 continue;
3389 }
3390
3391 inst->remove(block);
3392 progress = true;
3393 }
3394
3395 if (progress)
3396 invalidate_live_intervals();
3397
3398 return progress;
3399 }
3400
3401 static void
3402 setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
3403 fs_reg *dst, fs_reg color, unsigned components)
3404 {
3405 if (key->clamp_fragment_color) {
3406 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
3407 assert(color.type == BRW_REGISTER_TYPE_F);
3408
3409 for (unsigned i = 0; i < components; i++)
3410 set_saturate(true,
3411 bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));
3412
3413 color = tmp;
3414 }
3415
3416 for (unsigned i = 0; i < components; i++)
3417 dst[i] = offset(color, bld, i);
3418 }
3419
3420 static void
3421 lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
3422 const brw_wm_prog_data *prog_data,
3423 const brw_wm_prog_key *key,
3424 const fs_visitor::thread_payload &payload)
3425 {
3426 assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
3427 const brw_device_info *devinfo = bld.shader->devinfo;
3428 const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
3429 const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
3430 const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
3431 const fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
3432 const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
3433 const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
3434 fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
3435 const unsigned components =
3436 inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
3437
3438 /* We can potentially have a message length of up to 15, so we have to set
3439 * base_mrf to either 0 or 1 in order to fit in m0..m15.
3440 */
3441 fs_reg sources[15];
3442 int header_size = 2, payload_header_size;
3443 unsigned length = 0;
3444
3445 /* From the Sandy Bridge PRM, volume 4, page 198:
3446 *
3447 * "Dispatched Pixel Enables. One bit per pixel indicating
3448 * which pixels were originally enabled when the thread was
3449 * dispatched. This field is only required for the end-of-
3450 * thread message and on all dual-source messages."
3451 */
3452 if (devinfo->gen >= 6 &&
3453 (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
3454 color1.file == BAD_FILE &&
3455 key->nr_color_regions == 1) {
3456 header_size = 0;
3457 }
3458
3459 if (header_size != 0) {
3460 assert(header_size == 2);
3461 /* Allocate 2 registers for a header */
3462 length += 2;
3463 }
3464
3465 if (payload.aa_dest_stencil_reg) {
3466 sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
3467 bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
3468 .MOV(sources[length],
3469 fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
3470 length++;
3471 }
3472
3473 if (prog_data->uses_omask) {
3474 sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1),
3475 BRW_REGISTER_TYPE_UD);
3476
3477 /* Hand over gl_SampleMask. Only the lower 16 bits of each channel are
3478 * relevant. Since it's unsigned single words, one VGRF is always
3479 * 16-wide, but only the lower or higher 8 channels will be used by the
3480 * hardware when doing a SIMD8 write depending on whether we have
3481 * selected the subspans for the first or second half respectively.
3482 */
3483 assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
3484 sample_mask.type = BRW_REGISTER_TYPE_UW;
3485 sample_mask.stride *= 2;
3486
3487 bld.exec_all().annotate("FB write oMask")
3488 .MOV(half(retype(sources[length], BRW_REGISTER_TYPE_UW),
3489 inst->force_sechalf),
3490 sample_mask);
3491 length++;
3492 }
3493
3494 payload_header_size = length;
3495
3496 if (src0_alpha.file != BAD_FILE) {
3497 /* FIXME: This is being passed at the wrong location in the payload and
3498 * doesn't work when gl_SampleMask and MRTs are used simultaneously.
3499 * It's supposed to be immediately before oMask but there seems to be no
3500 * reasonable way to pass them in the correct order because LOAD_PAYLOAD
3501 * requires header sources to form a contiguous segment at the beginning
3502 * of the message and src0_alpha has per-channel semantics.
3503 */
3504 setup_color_payload(bld, key, &sources[length], src0_alpha, 1);
3505 length++;
3506 }
3507
3508 setup_color_payload(bld, key, &sources[length], color0, components);
3509 length += 4;
3510
3511 if (color1.file != BAD_FILE) {
3512 setup_color_payload(bld, key, &sources[length], color1, components);
3513 length += 4;
3514 }
3515
3516 if (src_depth.file != BAD_FILE) {
3517 sources[length] = src_depth;
3518 length++;
3519 }
3520
3521 if (dst_depth.file != BAD_FILE) {
3522 sources[length] = dst_depth;
3523 length++;
3524 }
3525
3526 if (src_stencil.file != BAD_FILE) {
3527 assert(devinfo->gen >= 9);
3528 assert(bld.dispatch_width() != 16);
3529
3530 /* XXX: src_stencil is only available on gen9+. dst_depth is never
3531 * available on gen9+. As such it's impossible to have both enabled at the
3532 * same time and therefore length cannot overrun the array.
3533 */
3534 assert(length < 15);
3535
3536 sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD);
3537 bld.exec_all().annotate("FB write OS")
3538 .emit(FS_OPCODE_PACK_STENCIL_REF, sources[length],
3539 retype(src_stencil, BRW_REGISTER_TYPE_UB));
3540 length++;
3541 }
3542
3543 fs_inst *load;
3544 if (devinfo->gen >= 7) {
3545 /* Send from the GRF */
3546 fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F);
3547 load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
3548 payload.nr = bld.shader->alloc.allocate(load->regs_written);
3549 load->dst = payload;
3550
3551 inst->src[0] = payload;
3552 inst->resize_sources(1);
3553 inst->base_mrf = -1;
3554 } else {
3555 /* Send from the MRF */
3556 load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
3557 sources, length, payload_header_size);
3558
3559 /* On pre-SNB, we have to interlace the color values. LOAD_PAYLOAD
3560 * will do this for us if we just give it a COMPR4 destination.
3561 */
3562 if (devinfo->gen < 6 && bld.dispatch_width() == 16)
3563 load->dst.nr |= BRW_MRF_COMPR4;
3564
3565 inst->resize_sources(0);
3566 inst->base_mrf = 1;
3567 }
3568
3569 inst->opcode = FS_OPCODE_FB_WRITE;
3570 inst->mlen = load->regs_written;
3571 inst->header_size = header_size;
3572 }
3573
3574 static void
3575 lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
3576 const fs_reg &coordinate,
3577 const fs_reg &shadow_c,
3578 const fs_reg &lod, const fs_reg &lod2,
3579 const fs_reg &sampler,
3580 unsigned coord_components,
3581 unsigned grad_components)
3582 {
3583 const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
3584 op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
3585 fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
3586 fs_reg msg_end = msg_begin;
3587
3588 /* g0 header. */
3589 msg_end = offset(msg_end, bld.group(8, 0), 1);
3590
3591 for (unsigned i = 0; i < coord_components; i++)
3592 bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
3593 offset(coordinate, bld, i));
3594
3595 msg_end = offset(msg_end, bld, coord_components);
3596
3597 /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
3598 * require all three components to be present and zero if they are unused.
3599 */
3600 if (coord_components > 0 &&
3601 (has_lod || shadow_c.file != BAD_FILE ||
3602 (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
3603 for (unsigned i = coord_components; i < 3; i++)
3604 bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f));
3605
3606 msg_end = offset(msg_end, bld, 3 - coord_components);
3607 }
3608
3609 if (op == SHADER_OPCODE_TXD) {
3610 /* TXD unsupported in SIMD16 mode. */
3611 assert(bld.dispatch_width() == 8);
3612
3613 /* the slots for u and v are always present, but r is optional */
3614 if (coord_components < 2)
3615 msg_end = offset(msg_end, bld, 2 - coord_components);
3616
3617 /* P = u, v, r
3618 * dPdx = dudx, dvdx, drdx
3619 * dPdy = dudy, dvdy, drdy
3620 *
3621 * 1-arg: Does not exist.
3622 *
3623 * 2-arg: dudx dvdx dudy dvdy
3624 * dPdx.x dPdx.y dPdy.x dPdy.y
3625 * m4 m5 m6 m7
3626 *
3627 * 3-arg: dudx dvdx drdx dudy dvdy drdy
3628 * dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
3629 * m5 m6 m7 m8 m9 m10
3630 */
3631 for (unsigned i = 0; i < grad_components; i++)
3632 bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));
3633
3634 msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
3635
3636 for (unsigned i = 0; i < grad_components; i++)
3637 bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));
3638
3639 msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
3640 }
3641
3642 if (has_lod) {
3643       /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *without*
3644        * a shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
3645 */
3646 assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
3647 bld.dispatch_width() == 16);
3648
3649 const brw_reg_type type =
3650 (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
3651 BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
3652 bld.MOV(retype(msg_end, type), lod);
3653 msg_end = offset(msg_end, bld, 1);
3654 }
3655
3656 if (shadow_c.file != BAD_FILE) {
3657 if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
3658 /* There's no plain shadow compare message, so we use shadow
3659 * compare with a bias of 0.0.
3660 */
3661 bld.MOV(msg_end, brw_imm_f(0.0f));
3662 msg_end = offset(msg_end, bld, 1);
3663 }
3664
3665 bld.MOV(msg_end, shadow_c);
3666 msg_end = offset(msg_end, bld, 1);
3667 }
3668
3669 inst->opcode = op;
3670 inst->src[0] = reg_undef;
3671 inst->src[1] = sampler;
3672 inst->resize_sources(2);
3673 inst->base_mrf = msg_begin.nr;
3674 inst->mlen = msg_end.nr - msg_begin.nr;
3675 inst->header_size = 1;
3676 }
3677
3678 static void
3679 lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
3680 fs_reg coordinate,
3681 const fs_reg &shadow_c,
3682 fs_reg lod, fs_reg lod2,
3683 const fs_reg &sample_index,
3684 const fs_reg &sampler,
3685 const fs_reg &offset_value,
3686 unsigned coord_components,
3687 unsigned grad_components)
3688 {
3689 fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
3690 fs_reg msg_coords = message;
3691 unsigned header_size = 0;
3692
3693 if (offset_value.file != BAD_FILE) {
3694 /* The offsets set up by the visitor are in the m1 header, so we can't
3695 * go headerless.
3696 */
3697 header_size = 1;
3698 message.nr--;
3699 }
3700
3701 for (unsigned i = 0; i < coord_components; i++) {
3702 bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type), coordinate);
3703 coordinate = offset(coordinate, bld, 1);
3704 }
3705 fs_reg msg_end = offset(msg_coords, bld, coord_components);
3706 fs_reg msg_lod = offset(msg_coords, bld, 4);
3707
3708 if (shadow_c.file != BAD_FILE) {
3709 fs_reg msg_shadow = msg_lod;
3710 bld.MOV(msg_shadow, shadow_c);
3711 msg_lod = offset(msg_shadow, bld, 1);
3712 msg_end = msg_lod;
3713 }
3714
3715 switch (op) {
3716 case SHADER_OPCODE_TXL:
3717 case FS_OPCODE_TXB:
3718 bld.MOV(msg_lod, lod);
3719 msg_end = offset(msg_lod, bld, 1);
3720 break;
3721 case SHADER_OPCODE_TXD:
3722 /**
3723 * P = u, v, r
3724 * dPdx = dudx, dvdx, drdx
3725 * dPdy = dudy, dvdy, drdy
3726 *
3727 * Load up these values:
3728 * - dudx dudy dvdx dvdy drdx drdy
3729 * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
3730 */
3731 msg_end = msg_lod;
3732 for (unsigned i = 0; i < grad_components; i++) {
3733 bld.MOV(msg_end, lod);
3734 lod = offset(lod, bld, 1);
3735 msg_end = offset(msg_end, bld, 1);
3736
3737 bld.MOV(msg_end, lod2);
3738 lod2 = offset(lod2, bld, 1);
3739 msg_end = offset(msg_end, bld, 1);
3740 }
3741 break;
3742 case SHADER_OPCODE_TXS:
3743 msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
3744 bld.MOV(msg_lod, lod);
3745 msg_end = offset(msg_lod, bld, 1);
3746 break;
3747 case SHADER_OPCODE_TXF:
3748 msg_lod = offset(msg_coords, bld, 3);
3749 bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
3750 msg_end = offset(msg_lod, bld, 1);
3751 break;
3752 case SHADER_OPCODE_TXF_CMS:
3753 msg_lod = offset(msg_coords, bld, 3);
3754 /* lod */
3755 bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
3756 /* sample index */
3757 bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
3758 msg_end = offset(msg_lod, bld, 2);
3759 break;
3760 default:
3761 break;
3762 }
3763
3764 inst->opcode = op;
3765 inst->src[0] = reg_undef;
3766 inst->src[1] = sampler;
3767 inst->resize_sources(2);
3768 inst->base_mrf = message.nr;
3769 inst->mlen = msg_end.nr - message.nr;
3770 inst->header_size = header_size;
3771
3772 /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
3773 assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
3774 }
3775
3776 static bool
3777 is_high_sampler(const struct brw_device_info *devinfo, const fs_reg &sampler)
3778 {
3779 if (devinfo->gen < 8 && !devinfo->is_haswell)
3780 return false;
3781
3782 return sampler.file != IMM || sampler.ud >= 16;
3783 }
3784
3785 static void
3786 lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
3787 fs_reg coordinate,
3788 const fs_reg &shadow_c,
3789 fs_reg lod, fs_reg lod2,
3790 const fs_reg &sample_index,
3791 const fs_reg &mcs, const fs_reg &sampler,
3792 fs_reg offset_value,
3793 unsigned coord_components,
3794 unsigned grad_components)
3795 {
3796 const brw_device_info *devinfo = bld.shader->devinfo;
3797 int reg_width = bld.dispatch_width() / 8;
3798 unsigned header_size = 0, length = 0;
3799 fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
3800 for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
3801 sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
3802
3803 if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
3804 offset_value.file != BAD_FILE ||
3805 is_high_sampler(devinfo, sampler)) {
3806 /* For general texture offsets (no txf workaround), we need a header to
3807 * put them in. Note that we're only reserving space for it in the
3808 * message payload as it will be initialized implicitly by the
3809 * generator.
3810 *
3811 * TG4 needs to place its channel select in the header, for interaction
3812        * with ARB_texture_swizzle. The sampler index is only 4 bits, so for
3813 * larger sampler numbers we need to offset the Sampler State Pointer in
3814 * the header.
3815 */
3816 header_size = 1;
3817 sources[0] = fs_reg();
3818 length++;
3819 }
3820
3821 if (shadow_c.file != BAD_FILE) {
3822 bld.MOV(sources[length], shadow_c);
3823 length++;
3824 }
3825
3826 bool coordinate_done = false;
3827
3828 /* The sampler can only meaningfully compute LOD for fragment shader
3829 * messages. For all other stages, we change the opcode to TXL and
3830 * hardcode the LOD to 0.
3831 */
3832 if (bld.shader->stage != MESA_SHADER_FRAGMENT &&
3833 op == SHADER_OPCODE_TEX) {
3834 op = SHADER_OPCODE_TXL;
3835 lod = brw_imm_f(0.0f);
3836 }
3837
3838 /* Set up the LOD info */
3839 switch (op) {
3840 case FS_OPCODE_TXB:
3841 case SHADER_OPCODE_TXL:
3842 bld.MOV(sources[length], lod);
3843 length++;
3844 break;
3845 case SHADER_OPCODE_TXD:
3846 /* TXD should have been lowered in SIMD16 mode. */
3847 assert(bld.dispatch_width() == 8);
3848
3849 /* Load dPdx and the coordinate together:
3850 * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
3851 */
3852 for (unsigned i = 0; i < coord_components; i++) {
3853 bld.MOV(sources[length], coordinate);
3854 coordinate = offset(coordinate, bld, 1);
3855 length++;
3856
3857 /* For cube map array, the coordinate is (u,v,r,ai) but there are
3858 * only derivatives for (u, v, r).
3859 */
3860 if (i < grad_components) {
3861 bld.MOV(sources[length], lod);
3862 lod = offset(lod, bld, 1);
3863 length++;
3864
3865 bld.MOV(sources[length], lod2);
3866 lod2 = offset(lod2, bld, 1);
3867 length++;
3868 }
3869 }
3870
3871 coordinate_done = true;
3872 break;
3873 case SHADER_OPCODE_TXS:
3874 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
3875 length++;
3876 break;
3877 case SHADER_OPCODE_TXF:
3878 /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
3879 * On Gen9 they are u, v, lod, r
3880 */
3881 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
3882 coordinate = offset(coordinate, bld, 1);
3883 length++;
3884
3885 if (devinfo->gen >= 9) {
3886 if (coord_components >= 2) {
3887 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
3888 coordinate = offset(coordinate, bld, 1);
3889 }
3890 length++;
3891 }
3892
3893 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
3894 length++;
3895
3896 for (unsigned i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) {
3897 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
3898 coordinate = offset(coordinate, bld, 1);
3899 length++;
3900 }
3901
3902 coordinate_done = true;
3903 break;
3904 case SHADER_OPCODE_TXF_CMS:
3905 case SHADER_OPCODE_TXF_CMS_W:
3906 case SHADER_OPCODE_TXF_UMS:
3907 case SHADER_OPCODE_TXF_MCS:
3908 if (op == SHADER_OPCODE_TXF_UMS ||
3909 op == SHADER_OPCODE_TXF_CMS ||
3910 op == SHADER_OPCODE_TXF_CMS_W) {
3911 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
3912 length++;
3913 }
3914
3915 if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
3916 /* Data from the multisample control surface. */
3917 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
3918 length++;
3919
3920 /* On Gen9+ we'll use ld2dms_w instead which has two registers for
3921 * the MCS data.
3922 */
3923 if (op == SHADER_OPCODE_TXF_CMS_W) {
3924 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD),
3925 mcs.file == IMM ?
3926 mcs :
3927 offset(mcs, bld, 1));
3928 length++;
3929 }
3930 }
3931
3932 /* There is no offsetting for this message; just copy in the integer
3933 * texture coordinates.
3934 */
3935 for (unsigned i = 0; i < coord_components; i++) {
3936 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
3937 coordinate = offset(coordinate, bld, 1);
3938 length++;
3939 }
3940
3941 coordinate_done = true;
3942 break;
3943 case SHADER_OPCODE_TG4_OFFSET:
3944 /* gather4_po_c should have been lowered in SIMD16 mode. */
3945 assert(bld.dispatch_width() == 8 || shadow_c.file == BAD_FILE);
3946
3947 /* More crazy intermixing */
3948 for (unsigned i = 0; i < 2; i++) { /* u, v */
3949 bld.MOV(sources[length], coordinate);
3950 coordinate = offset(coordinate, bld, 1);
3951 length++;
3952 }
3953
3954 for (unsigned i = 0; i < 2; i++) { /* offu, offv */
3955 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value);
3956 offset_value = offset(offset_value, bld, 1);
3957 length++;
3958 }
3959
3960 if (coord_components == 3) { /* r if present */
3961 bld.MOV(sources[length], coordinate);
3962 coordinate = offset(coordinate, bld, 1);
3963 length++;
3964 }
3965
3966 coordinate_done = true;
3967 break;
3968 default:
3969 break;
3970 }
3971
3972 /* Set up the coordinate (except for cases where it was done above) */
3973 if (!coordinate_done) {
3974 for (unsigned i = 0; i < coord_components; i++) {
3975 bld.MOV(sources[length], coordinate);
3976 coordinate = offset(coordinate, bld, 1);
3977 length++;
3978 }
3979 }
3980
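        /* Each per-channel payload element counted in "length" occupies
         * reg_width GRFs, but a header element is always a single GRF, so in
         * SIMD16 we subtract one register per header element.  For example
         * (illustrative numbers): a header plus 7 payload elements in SIMD16
         * gives mlen = 8 * 2 - 1 = 15.
         */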
3981 int mlen;
3982 if (reg_width == 2)
3983 mlen = length * reg_width - header_size;
3984 else
3985 mlen = length * reg_width;
3986
3987 const fs_reg src_payload = fs_reg(VGRF, bld.shader->alloc.allocate(mlen),
3988 BRW_REGISTER_TYPE_F);
3989 bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
3990
3991 /* Generate the SEND. */
3992 inst->opcode = op;
3993 inst->src[0] = src_payload;
3994 inst->src[1] = sampler;
3995 inst->resize_sources(2);
3996 inst->base_mrf = -1;
3997 inst->mlen = mlen;
3998 inst->header_size = header_size;
3999
4000 /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
4001 assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
4002 }
4003
4004 static void
4005 lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
4006 {
4007 const brw_device_info *devinfo = bld.shader->devinfo;
4008 const fs_reg &coordinate = inst->src[0];
4009 const fs_reg &shadow_c = inst->src[1];
4010 const fs_reg &lod = inst->src[2];
4011 const fs_reg &lod2 = inst->src[3];
4012 const fs_reg &sample_index = inst->src[4];
4013 const fs_reg &mcs = inst->src[5];
4014 const fs_reg &sampler = inst->src[6];
4015 const fs_reg &offset_value = inst->src[7];
4016 assert(inst->src[8].file == IMM && inst->src[9].file == IMM);
4017 const unsigned coord_components = inst->src[8].ud;
4018 const unsigned grad_components = inst->src[9].ud;
4019
4020 if (devinfo->gen >= 7) {
4021 lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
4022 shadow_c, lod, lod2, sample_index,
4023 mcs, sampler, offset_value,
4024 coord_components, grad_components);
4025 } else if (devinfo->gen >= 5) {
4026 lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
4027 shadow_c, lod, lod2, sample_index,
4028 sampler, offset_value,
4029 coord_components, grad_components);
4030 } else {
4031 lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
4032 shadow_c, lod, lod2, sampler,
4033 coord_components, grad_components);
4034 }
4035 }
4036
4037 /**
4038 * Initialize the header present in some typed and untyped surface
4039 * messages.
4040 */
4041 static fs_reg
4042 emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask)
4043 {
4044 fs_builder ubld = bld.exec_all().group(8, 0);
4045 const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4046 ubld.MOV(dst, brw_imm_d(0));
4047 ubld.MOV(component(dst, 7), sample_mask);
4048 return dst;
4049 }
4050
4051 static void
4052 lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
4053 const fs_reg &sample_mask)
4054 {
4055 /* Get the logical send arguments. */
4056 const fs_reg &addr = inst->src[0];
4057 const fs_reg &src = inst->src[1];
4058 const fs_reg &surface = inst->src[2];
4059 const UNUSED fs_reg &dims = inst->src[3];
4060 const fs_reg &arg = inst->src[4];
4061
4062 /* Calculate the total number of components of the payload. */
4063 const unsigned addr_sz = inst->components_read(0);
4064 const unsigned src_sz = inst->components_read(1);
4065 const unsigned header_sz = (sample_mask.file == BAD_FILE ? 0 : 1);
4066 const unsigned sz = header_sz + addr_sz + src_sz;
4067
4068 /* Allocate space for the payload. */
4069 fs_reg *const components = new fs_reg[sz];
4070 const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
4071 unsigned n = 0;
4072
4073 /* Construct the payload. */
4074 if (header_sz)
4075 components[n++] = emit_surface_header(bld, sample_mask);
4076
4077 for (unsigned i = 0; i < addr_sz; i++)
4078 components[n++] = offset(addr, bld, i);
4079
4080 for (unsigned i = 0; i < src_sz; i++)
4081 components[n++] = offset(src, bld, i);
4082
4083 bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
4084
4085 /* Update the original instruction. */
4086 inst->opcode = op;
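       /* Each address/data component occupies exec_size / 8 registers, while
        * the header (if present) is a single SIMD8 register.  For example
        * (illustrative numbers): a SIMD16 write with one address component,
        * four data components and a header takes 1 + 5 * 2 = 11 registers.
        */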
4087 inst->mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
4088 inst->header_size = header_sz;
4089
4090 inst->src[0] = payload;
4091 inst->src[1] = surface;
4092 inst->src[2] = arg;
4093 inst->resize_sources(3);
4094
4095 delete[] components;
4096 }
4097
4098 bool
4099 fs_visitor::lower_logical_sends()
4100 {
4101 bool progress = false;
4102
4103 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
4104 const fs_builder ibld(this, block, inst);
4105
4106 switch (inst->opcode) {
4107 case FS_OPCODE_FB_WRITE_LOGICAL:
4108 assert(stage == MESA_SHADER_FRAGMENT);
4109 lower_fb_write_logical_send(ibld, inst,
4110 (const brw_wm_prog_data *)prog_data,
4111 (const brw_wm_prog_key *)key,
4112 payload);
4113 break;
4114
4115 case SHADER_OPCODE_TEX_LOGICAL:
4116 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
4117 break;
4118
4119 case SHADER_OPCODE_TXD_LOGICAL:
4120 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
4121 break;
4122
4123 case SHADER_OPCODE_TXF_LOGICAL:
4124 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
4125 break;
4126
4127 case SHADER_OPCODE_TXL_LOGICAL:
4128 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
4129 break;
4130
4131 case SHADER_OPCODE_TXS_LOGICAL:
4132 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
4133 break;
4134
4135 case FS_OPCODE_TXB_LOGICAL:
4136 lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
4137 break;
4138
4139 case SHADER_OPCODE_TXF_CMS_LOGICAL:
4140 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
4141 break;
4142
4143 case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
4144 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
4145 break;
4146
4147 case SHADER_OPCODE_TXF_UMS_LOGICAL:
4148 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
4149 break;
4150
4151 case SHADER_OPCODE_TXF_MCS_LOGICAL:
4152 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
4153 break;
4154
4155 case SHADER_OPCODE_LOD_LOGICAL:
4156 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
4157 break;
4158
4159 case SHADER_OPCODE_TG4_LOGICAL:
4160 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
4161 break;
4162
4163 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
4164 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
4165 break;
4166
4167 case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
4168 lower_surface_logical_send(ibld, inst,
4169 SHADER_OPCODE_UNTYPED_SURFACE_READ,
4170 fs_reg());
4171 break;
4172
4173 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
4174 lower_surface_logical_send(ibld, inst,
4175 SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
4176 ibld.sample_mask_reg());
4177 break;
4178
4179 case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
4180 lower_surface_logical_send(ibld, inst,
4181 SHADER_OPCODE_UNTYPED_ATOMIC,
4182 ibld.sample_mask_reg());
4183 break;
4184
4185 case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
4186 lower_surface_logical_send(ibld, inst,
4187 SHADER_OPCODE_TYPED_SURFACE_READ,
4188 brw_imm_d(0xffff));
4189 break;
4190
4191 case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
4192 lower_surface_logical_send(ibld, inst,
4193 SHADER_OPCODE_TYPED_SURFACE_WRITE,
4194 ibld.sample_mask_reg());
4195 break;
4196
4197 case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
4198 lower_surface_logical_send(ibld, inst,
4199 SHADER_OPCODE_TYPED_ATOMIC,
4200 ibld.sample_mask_reg());
4201 break;
4202
4203 default:
4204 continue;
4205 }
4206
4207 progress = true;
4208 }
4209
4210 if (progress)
4211 invalidate_live_intervals();
4212
4213 return progress;
4214 }
4215
4216 /**
4217 * Get the closest native SIMD width supported by the hardware for instruction
4218 * \p inst. The instruction will be left untouched by
4219 * fs_visitor::lower_simd_width() if the returned value is equal to the
4220 * original execution size.
4221 */
4222 static unsigned
4223 get_lowered_simd_width(const struct brw_device_info *devinfo,
4224 const fs_inst *inst)
4225 {
4226 switch (inst->opcode) {
4227 case BRW_OPCODE_MOV:
4228 case BRW_OPCODE_SEL:
4229 case BRW_OPCODE_NOT:
4230 case BRW_OPCODE_AND:
4231 case BRW_OPCODE_OR:
4232 case BRW_OPCODE_XOR:
4233 case BRW_OPCODE_SHR:
4234 case BRW_OPCODE_SHL:
4235 case BRW_OPCODE_ASR:
4236 case BRW_OPCODE_CMP:
4237 case BRW_OPCODE_CMPN:
4238 case BRW_OPCODE_CSEL:
4239 case BRW_OPCODE_F32TO16:
4240 case BRW_OPCODE_F16TO32:
4241 case BRW_OPCODE_BFREV:
4242 case BRW_OPCODE_BFE:
4243 case BRW_OPCODE_BFI1:
4244 case BRW_OPCODE_BFI2:
4245 case BRW_OPCODE_ADD:
4246 case BRW_OPCODE_MUL:
4247 case BRW_OPCODE_AVG:
4248 case BRW_OPCODE_FRC:
4249 case BRW_OPCODE_RNDU:
4250 case BRW_OPCODE_RNDD:
4251 case BRW_OPCODE_RNDE:
4252 case BRW_OPCODE_RNDZ:
4253 case BRW_OPCODE_LZD:
4254 case BRW_OPCODE_FBH:
4255 case BRW_OPCODE_FBL:
4256 case BRW_OPCODE_CBIT:
4257 case BRW_OPCODE_SAD2:
4258 case BRW_OPCODE_MAD:
4259 case BRW_OPCODE_LRP:
4260 case SHADER_OPCODE_RCP:
4261 case SHADER_OPCODE_RSQ:
4262 case SHADER_OPCODE_SQRT:
4263 case SHADER_OPCODE_EXP2:
4264 case SHADER_OPCODE_LOG2:
4265 case SHADER_OPCODE_POW:
4266 case SHADER_OPCODE_INT_QUOTIENT:
4267 case SHADER_OPCODE_INT_REMAINDER:
4268 case SHADER_OPCODE_SIN:
4269 case SHADER_OPCODE_COS: {
4270 /* According to the PRMs:
4271 * "A. In Direct Addressing mode, a source cannot span more than 2
4272 * adjacent GRF registers.
4273 * B. A destination cannot span more than 2 adjacent GRF registers."
4274 *
4275 * Look for the source or destination with the largest register region
4276       * which is the one that is going to limit the overall execution size of
4277 * the instruction due to this rule.
4278 */
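           /* For example (illustrative numbers): a SIMD16 instruction whose
            * widest operand region spans 4 GRFs is limited to
            * 16 / DIV_ROUND_UP(4, 2) = 8 channels, i.e. it will be split into
            * two SIMD8 halves by lower_simd_width().
            */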
4279 unsigned reg_count = inst->regs_written;
4280
4281 for (unsigned i = 0; i < inst->sources; i++)
4282 reg_count = MAX2(reg_count, (unsigned)inst->regs_read(i));
4283
4284 /* Calculate the maximum execution size of the instruction based on the
4285 * factor by which it goes over the hardware limit of 2 GRFs.
4286 */
4287 return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
4288 }
4289 case SHADER_OPCODE_MULH:
4290 /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
4291 * is 8-wide on Gen7+.
4292 */
4293 return (devinfo->gen >= 7 ? 8 : inst->exec_size);
4294
4295 case FS_OPCODE_FB_WRITE_LOGICAL:
4296 /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them
4297 * here.
4298 */
4299 assert(devinfo->gen != 6 ||
4300 inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||
4301 inst->exec_size == 8);
4302 /* Dual-source FB writes are unsupported in SIMD16 mode. */
4303 return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
4304 8 : inst->exec_size);
4305
4306 case SHADER_OPCODE_TXD_LOGICAL:
4307 /* TXD is unsupported in SIMD16 mode. */
4308 return 8;
4309
4310 case SHADER_OPCODE_TG4_OFFSET_LOGICAL: {
4311 /* gather4_po_c is unsupported in SIMD16 mode. */
4312 const fs_reg &shadow_c = inst->src[1];
4313 return (shadow_c.file != BAD_FILE ? 8 : inst->exec_size);
4314 }
4315 case SHADER_OPCODE_TXL_LOGICAL:
4316 case FS_OPCODE_TXB_LOGICAL: {
4317 /* Gen4 doesn't have SIMD8 non-shadow-compare bias/LOD instructions, and
4318 * Gen4-6 can't support TXL and TXB with shadow comparison in SIMD16
4319 * mode because the message exceeds the maximum length of 11.
4320 */
4321 const fs_reg &shadow_c = inst->src[1];
4322 if (devinfo->gen == 4 && shadow_c.file == BAD_FILE)
4323 return 16;
4324 else if (devinfo->gen < 7 && shadow_c.file != BAD_FILE)
4325 return 8;
4326 else
4327 return inst->exec_size;
4328 }
4329 case SHADER_OPCODE_TXF_LOGICAL:
4330 case SHADER_OPCODE_TXS_LOGICAL:
4331 /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
4332 * messages. Use SIMD16 instead.
4333 */
4334 if (devinfo->gen == 4)
4335 return 16;
4336 else
4337 return inst->exec_size;
4338
4339 case SHADER_OPCODE_TXF_CMS_W_LOGICAL: {
4340 /* This opcode can take up to 6 arguments which means that in some
4341 * circumstances it can end up with a message that is too long in SIMD16
4342 * mode.
4343 */
4344 const unsigned coord_components = inst->src[8].ud;
4345 /* First three arguments are the sample index and the two arguments for
4346 * the MCS data.
4347 */
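          /* For example: a 3D texel fetch (coord_components == 3) would need
           * (3 + 3) * 2 = 12 registers in SIMD16, exceeding the limit of 11,
           * so it is lowered to SIMD8, while a 2D fetch ((2 + 3) * 2 = 10)
           * keeps the original execution size.
           */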
4348 if ((coord_components + 3) * 2 > MAX_SAMPLER_MESSAGE_SIZE)
4349 return 8;
4350 else
4351 return inst->exec_size;
4352 }
4353
4354 case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
4355 case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
4356 case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
4357 return 8;
4358
4359 default:
4360 return inst->exec_size;
4361 }
4362 }
4363
4364 /**
4365 * The \p rows array of registers represents a \p num_rows by \p num_columns
4366  * matrix in row-major order; write it in column-major order into the register
4367  * passed as the destination. \p stride gives the separation between matrix
4368 * elements in the input in fs_builder::dispatch_width() units.
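 *
 * For example (illustrative): with num_rows = 2 ({r0, r1}), num_columns = 3
 * and stride s, the destination receives the components in the order
 * r0, r1, r0+s, r1+s, r0+2s, r1+2s, i.e. one full column at a time.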
4369 */
4370 static void
4371 emit_transpose(const fs_builder &bld,
4372 const fs_reg &dst, const fs_reg *rows,
4373 unsigned num_rows, unsigned num_columns, unsigned stride)
4374 {
4375 fs_reg *const components = new fs_reg[num_rows * num_columns];
4376
4377 for (unsigned i = 0; i < num_columns; ++i) {
4378 for (unsigned j = 0; j < num_rows; ++j)
4379 components[num_rows * i + j] = offset(rows[j], bld, stride * i);
4380 }
4381
4382 bld.LOAD_PAYLOAD(dst, components, num_rows * num_columns, 0);
4383
4384 delete[] components;
4385 }
4386
4387 bool
4388 fs_visitor::lower_simd_width()
4389 {
4390 bool progress = false;
4391
4392 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
4393 const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
4394
4395 if (lower_width != inst->exec_size) {
4396 /* Builder matching the original instruction. We may also need to
4397           * emit an instruction of width larger than the original, so set the
4398           * execution size of the builder to the larger of the two for now to
4399           * be sure that both cases can be handled.
4400 */
4401 const fs_builder ibld = bld.at(block, inst)
4402 .exec_all(inst->force_writemask_all)
4403 .group(MAX2(inst->exec_size, lower_width),
4404 inst->force_sechalf);
4405
4406 /* Split the copies in chunks of the execution width of either the
4407 * original or the lowered instruction, whichever is lower.
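          * For example, lowering a SIMD16 instruction to SIMD8 gives
          * copy_width = 8 and n = 2 chunks.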
4408 */
4409 const unsigned copy_width = MIN2(lower_width, inst->exec_size);
4410 const unsigned n = inst->exec_size / copy_width;
4411 const unsigned dst_size = inst->regs_written * REG_SIZE /
4412 inst->dst.component_size(inst->exec_size);
4413 fs_reg dsts[4];
4414
4415 assert(n > 0 && n <= ARRAY_SIZE(dsts) &&
4416 !inst->writes_accumulator && !inst->mlen);
4417
4418 for (unsigned i = 0; i < n; i++) {
4419 /* Emit a copy of the original instruction with the lowered width.
4420 * If the EOT flag was set throw it away except for the last
4421 * instruction to avoid killing the thread prematurely.
4422 */
4423 fs_inst split_inst = *inst;
4424 split_inst.exec_size = lower_width;
4425 split_inst.eot = inst->eot && i == n - 1;
4426
4427 /* Select the correct channel enables for the i-th group, then
4428 * transform the sources and destination and emit the lowered
4429 * instruction.
4430 */
4431 const fs_builder lbld = ibld.group(lower_width, i);
4432
4433 for (unsigned j = 0; j < inst->sources; j++) {
4434 if (inst->src[j].file != BAD_FILE &&
4435 !is_uniform(inst->src[j])) {
4436 /* Get the i-th copy_width-wide chunk of the source. */
4437 const fs_reg src = horiz_offset(inst->src[j], copy_width * i);
4438 const unsigned src_size = inst->components_read(j);
4439
4440                /* Use a trivial transposition to copy one out of every n
4441 * copy_width-wide components of the register into a
4442 * temporary passed as source to the lowered instruction.
4443 */
4444 split_inst.src[j] = lbld.vgrf(inst->src[j].type, src_size);
4445 emit_transpose(lbld.group(copy_width, 0),
4446 split_inst.src[j], &src, 1, src_size, n);
4447 }
4448 }
4449
4450 if (inst->regs_written) {
4451 /* Allocate enough space to hold the result of the lowered
4452 * instruction and fix up the number of registers written.
4453 */
4454 split_inst.dst = dsts[i] =
4455 lbld.vgrf(inst->dst.type, dst_size);
4456 split_inst.regs_written =
4457 DIV_ROUND_UP(inst->regs_written * lower_width,
4458 inst->exec_size);
4459 }
4460
4461 lbld.emit(split_inst);
4462 }
4463
4464 if (inst->regs_written) {
4465 /* Distance between useful channels in the temporaries, skipping
4466 * garbage if the lowered instruction is wider than the original.
4467 */
4468 const unsigned m = lower_width / copy_width;
4469
4470 /* Interleave the components of the result from the lowered
4471 * instructions. We need to set exec_all() when copying more than
4472 * one half per component, because LOAD_PAYLOAD (in terms of which
4473 * emit_transpose is implemented) can only use the same channel
4474 * enable signals for all of its non-header sources.
4475 */
4476 emit_transpose(ibld.exec_all(inst->exec_size > copy_width)
4477 .group(copy_width, 0),
4478 inst->dst, dsts, n, dst_size, m);
4479 }
4480
4481 inst->remove(block);
4482 progress = true;
4483 }
4484 }
4485
4486 if (progress)
4487 invalidate_live_intervals();
4488
4489 return progress;
4490 }
4491
4492 void
4493 fs_visitor::dump_instructions()
4494 {
4495 dump_instructions(NULL);
4496 }
4497
4498 void
4499 fs_visitor::dump_instructions(const char *name)
4500 {
4501 FILE *file = stderr;
4502 if (name && geteuid() != 0) {
4503 file = fopen(name, "w");
4504 if (!file)
4505 file = stderr;
4506 }
4507
4508 if (cfg) {
4509 calculate_register_pressure();
4510 int ip = 0, max_pressure = 0;
4511 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
4512 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
4513 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
4514 dump_instruction(inst, file);
4515 ip++;
4516 }
4517 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
4518 } else {
4519 int ip = 0;
4520 foreach_in_list(backend_instruction, inst, &instructions) {
4521 fprintf(file, "%4d: ", ip++);
4522 dump_instruction(inst, file);
4523 }
4524 }
4525
4526 if (file != stderr) {
4527 fclose(file);
4528 }
4529 }
4530
4531 void
4532 fs_visitor::dump_instruction(backend_instruction *be_inst)
4533 {
4534 dump_instruction(be_inst, stderr);
4535 }
4536
4537 void
4538 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
4539 {
4540 fs_inst *inst = (fs_inst *)be_inst;
4541
4542 if (inst->predicate) {
4543 fprintf(file, "(%cf0.%d) ",
4544 inst->predicate_inverse ? '-' : '+',
4545 inst->flag_subreg);
4546 }
4547
4548 fprintf(file, "%s", brw_instruction_name(inst->opcode));
4549 if (inst->saturate)
4550 fprintf(file, ".sat");
4551 if (inst->conditional_mod) {
4552 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
4553 if (!inst->predicate &&
4554 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
4555 inst->opcode != BRW_OPCODE_IF &&
4556 inst->opcode != BRW_OPCODE_WHILE))) {
4557 fprintf(file, ".f0.%d", inst->flag_subreg);
4558 }
4559 }
4560 fprintf(file, "(%d) ", inst->exec_size);
4561
4562 if (inst->mlen) {
4563 fprintf(file, "(mlen: %d) ", inst->mlen);
4564 }
4565
4566 switch (inst->dst.file) {
4567 case VGRF:
4568 fprintf(file, "vgrf%d", inst->dst.nr);
4569 if (alloc.sizes[inst->dst.nr] != inst->regs_written ||
4570 inst->dst.subreg_offset)
4571 fprintf(file, "+%d.%d",
4572 inst->dst.reg_offset, inst->dst.subreg_offset);
4573 break;
4574 case FIXED_GRF:
4575 fprintf(file, "g%d", inst->dst.nr);
4576 break;
4577 case MRF:
4578 fprintf(file, "m%d", inst->dst.nr);
4579 break;
4580 case BAD_FILE:
4581 fprintf(file, "(null)");
4582 break;
4583 case UNIFORM:
4584 fprintf(file, "***u%d***", inst->dst.nr + inst->dst.reg_offset);
4585 break;
4586 case ATTR:
4587 fprintf(file, "***attr%d***", inst->dst.nr + inst->dst.reg_offset);
4588 break;
4589 case ARF:
4590 switch (inst->dst.nr) {
4591 case BRW_ARF_NULL:
4592 fprintf(file, "null");
4593 break;
4594 case BRW_ARF_ADDRESS:
4595 fprintf(file, "a0.%d", inst->dst.subnr);
4596 break;
4597 case BRW_ARF_ACCUMULATOR:
4598 fprintf(file, "acc%d", inst->dst.subnr);
4599 break;
4600 case BRW_ARF_FLAG:
4601 fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
4602 break;
4603 default:
4604 fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
4605 break;
4606 }
4607 if (inst->dst.subnr)
4608 fprintf(file, "+%d", inst->dst.subnr);
4609 break;
4610 case IMM:
4611 unreachable("not reached");
4612 }
4613 if (inst->dst.stride != 1)
4614 fprintf(file, "<%u>", inst->dst.stride);
4615 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
4616
4617 for (int i = 0; i < inst->sources; i++) {
4618 if (inst->src[i].negate)
4619 fprintf(file, "-");
4620 if (inst->src[i].abs)
4621 fprintf(file, "|");
4622 switch (inst->src[i].file) {
4623 case VGRF:
4624 fprintf(file, "vgrf%d", inst->src[i].nr);
4625 if (alloc.sizes[inst->src[i].nr] != (unsigned)inst->regs_read(i) ||
4626 inst->src[i].subreg_offset)
4627 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
4628 inst->src[i].subreg_offset);
4629 break;
4630 case FIXED_GRF:
4631 fprintf(file, "g%d", inst->src[i].nr);
4632 break;
4633 case MRF:
4634 fprintf(file, "***m%d***", inst->src[i].nr);
4635 break;
4636 case ATTR:
4637 fprintf(file, "attr%d+%d", inst->src[i].nr, inst->src[i].reg_offset);
4638 break;
4639 case UNIFORM:
4640 fprintf(file, "u%d", inst->src[i].nr + inst->src[i].reg_offset);
4641 if (inst->src[i].reladdr) {
4642 fprintf(file, "+reladdr");
4643 } else if (inst->src[i].subreg_offset) {
4644 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
4645 inst->src[i].subreg_offset);
4646 }
4647 break;
4648 case BAD_FILE:
4649 fprintf(file, "(null)");
4650 break;
4651 case IMM:
4652 switch (inst->src[i].type) {
4653 case BRW_REGISTER_TYPE_F:
4654 fprintf(file, "%ff", inst->src[i].f);
4655 break;
4656 case BRW_REGISTER_TYPE_W:
4657 case BRW_REGISTER_TYPE_D:
4658 fprintf(file, "%dd", inst->src[i].d);
4659 break;
4660 case BRW_REGISTER_TYPE_UW:
4661 case BRW_REGISTER_TYPE_UD:
4662 fprintf(file, "%uu", inst->src[i].ud);
4663 break;
4664 case BRW_REGISTER_TYPE_VF:
4665 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
4666 brw_vf_to_float((inst->src[i].ud >> 0) & 0xff),
4667 brw_vf_to_float((inst->src[i].ud >> 8) & 0xff),
4668 brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
4669 brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
4670 break;
4671 default:
4672 fprintf(file, "???");
4673 break;
4674 }
4675 break;
4676 case ARF:
4677 switch (inst->src[i].nr) {
4678 case BRW_ARF_NULL:
4679 fprintf(file, "null");
4680 break;
4681 case BRW_ARF_ADDRESS:
4682 fprintf(file, "a0.%d", inst->src[i].subnr);
4683 break;
4684 case BRW_ARF_ACCUMULATOR:
4685 fprintf(file, "acc%d", inst->src[i].subnr);
4686 break;
4687 case BRW_ARF_FLAG:
4688 fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
4689 break;
4690 default:
4691 fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
4692 break;
4693 }
4694 if (inst->src[i].subnr)
4695 fprintf(file, "+%d", inst->src[i].subnr);
4696 break;
4697 }
4698 if (inst->src[i].abs)
4699 fprintf(file, "|");
4700
4701 if (inst->src[i].file != IMM) {
4702 unsigned stride;
4703 if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
4704 unsigned hstride = inst->src[i].hstride;
4705 stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
4706 } else {
4707 stride = inst->src[i].stride;
4708 }
4709 if (stride != 1)
4710 fprintf(file, "<%u>", stride);
4711
4712 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
4713 }
4714
4715 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
4716 fprintf(file, ", ");
4717 }
4718
4719 fprintf(file, " ");
4720
4721 if (inst->force_writemask_all)
4722 fprintf(file, "NoMask ");
4723
4724 if (dispatch_width == 16 && inst->exec_size == 8) {
4725 if (inst->force_sechalf)
4726 fprintf(file, "2ndhalf ");
4727 else
4728 fprintf(file, "1sthalf ");
4729 }
4730
4731 fprintf(file, "\n");
4732 }
4733
4734 /**
4735 * Possibly returns an instruction that set up @param reg.
4736 *
4737 * Sometimes we want to take the result of some expression/variable
4738 * dereference tree and rewrite the instruction generating the result
4739 * of the tree. When processing the tree, we know that the
4740 * instructions generated are all writing temporaries that are dead
4741 * outside of this tree. So, if we have some instructions that write
4742 * a temporary, we're free to point that temp write somewhere else.
4743 *
4744 * Note that this doesn't guarantee that the instruction generated
4745 * only reg -- it might be the size=4 destination of a texture instruction.
4746 */
4747 fs_inst *
4748 fs_visitor::get_instruction_generating_reg(fs_inst *start,
4749 fs_inst *end,
4750 const fs_reg &reg)
4751 {
4752 if (end == start ||
4753 end->is_partial_write() ||
4754 reg.reladdr ||
4755 !reg.equals(end->dst)) {
4756 return NULL;
4757 } else {
4758 return end;
4759 }
4760 }
4761
4762 void
4763 fs_visitor::setup_payload_gen6()
4764 {
4765 bool uses_depth =
4766 (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
4767 unsigned barycentric_interp_modes =
4768 (stage == MESA_SHADER_FRAGMENT) ?
4769 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
4770
4771 assert(devinfo->gen >= 6);
4772
4773 /* R0-1: masks, pixel X/Y coordinates. */
4774 payload.num_regs = 2;
4775 /* R2: only for 32-pixel dispatch.*/
4776
4777 /* R3-26: barycentric interpolation coordinates. These appear in the
4778 * same order that they appear in the brw_wm_barycentric_interp_mode
4779 * enum. Each set of coordinates occupies 2 registers if dispatch width
4780 * == 8 and 4 registers if dispatch width == 16. Coordinates only
4781 * appear if they were enabled using the "Barycentric Interpolation
4782 * Mode" bits in WM_STATE.
4783 */
4784 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
4785 if (barycentric_interp_modes & (1 << i)) {
4786 payload.barycentric_coord_reg[i] = payload.num_regs;
4787 payload.num_regs += 2;
4788 if (dispatch_width == 16) {
4789 payload.num_regs += 2;
4790 }
4791 }
4792 }
4793
4794 /* R27: interpolated depth if uses source depth */
4795 if (uses_depth) {
4796 payload.source_depth_reg = payload.num_regs;
4797 payload.num_regs++;
4798 if (dispatch_width == 16) {
4799 /* R28: interpolated depth if not SIMD8. */
4800 payload.num_regs++;
4801 }
4802 }
4803 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
4804 if (uses_depth) {
4805 payload.source_w_reg = payload.num_regs;
4806 payload.num_regs++;
4807 if (dispatch_width == 16) {
4808 /* R30: interpolated W if not SIMD8. */
4809 payload.num_regs++;
4810 }
4811 }
4812
4813 if (stage == MESA_SHADER_FRAGMENT) {
4814 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
4815 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
4816 prog_data->uses_pos_offset = key->compute_pos_offset;
4817 /* R31: MSAA position offsets. */
4818 if (prog_data->uses_pos_offset) {
4819 payload.sample_pos_reg = payload.num_regs;
4820 payload.num_regs++;
4821 }
4822 }
4823
4824 /* R32: MSAA input coverage mask */
4825 if (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) {
4826 assert(devinfo->gen >= 7);
4827 payload.sample_mask_in_reg = payload.num_regs;
4828 payload.num_regs++;
4829 if (dispatch_width == 16) {
4830 /* R33: input coverage mask if not SIMD8. */
4831 payload.num_regs++;
4832 }
4833 }
4834
4835 /* R34-: bary for 32-pixel. */
4836 /* R58-59: interp W for 32-pixel. */
4837
4838 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
4839 source_depth_to_render_target = true;
4840 }
4841 }
4842
4843 void
4844 fs_visitor::setup_vs_payload()
4845 {
4846 /* R0: thread header, R1: urb handles */
4847 payload.num_regs = 2;
4848 }
4849
4850 /**
4851 * We are building the local ID push constant data using the simplest possible
4852 * method. We simply push the local IDs directly as they should appear in the
4853 * registers for the uvec3 gl_LocalInvocationID variable.
4854 *
4855 * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6
4856 * registers worth of push constant space.
4857 *
4858 * Note: Any updates to brw_cs_prog_local_id_payload_dwords,
4859 * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup need
4860 * to coordinated.
4861  * to be coordinated.
4862 * FINISHME: There are a few easy optimizations to consider.
4863 *
4864 * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is
4865 * no need for using push constant space for that dimension.
4866 *
4867 * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can
4868 * easily use 16-bit words rather than 32-bit dwords in the push constant
4869 * data.
4870 *
4871 * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for
4872 * conveying the data, and thereby reduce push constant usage.
4873 *
4874 */
4875 void
4876 fs_visitor::setup_gs_payload()
4877 {
4878 assert(stage == MESA_SHADER_GEOMETRY);
4879
4880 struct brw_gs_prog_data *gs_prog_data =
4881 (struct brw_gs_prog_data *) prog_data;
4882 struct brw_vue_prog_data *vue_prog_data =
4883 (struct brw_vue_prog_data *) prog_data;
4884
4885 /* R0: thread header, R1: output URB handles */
4886 payload.num_regs = 2;
4887
4888 if (gs_prog_data->include_primitive_id) {
4889 /* R2: Primitive ID 0..7 */
4890 payload.num_regs++;
4891 }
4892
4893 /* Use a maximum of 32 registers for push-model inputs. */
4894 const unsigned max_push_components = 32;
4895
4896 /* If pushing our inputs would take too many registers, reduce the URB read
4897 * length (which is in HWords, or 8 registers), and resort to pulling.
4898 *
4899 * Note that the GS reads <URB Read Length> HWords for every vertex - so we
4900 * have to multiply by VerticesIn to obtain the total storage requirement.
4901 */
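       /* For example (illustrative numbers): with 3 incoming vertices and a
        * URB read length of 2, 8 * 2 * 3 = 48 > 32, so we fall back to the
        * pull model and clamp the read length to
        * ROUND_DOWN_TO(32 / 3, 8) / 8 = 1.
        */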
4902 if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in >
4903 max_push_components) {
4904 gs_prog_data->base.include_vue_handles = true;
4905
4906 /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
4907 payload.num_regs += nir->info.gs.vertices_in;
4908
4909 vue_prog_data->urb_read_length =
4910 ROUND_DOWN_TO(max_push_components / nir->info.gs.vertices_in, 8) / 8;
4911 }
4912 }
4913
4914 void
4915 fs_visitor::setup_cs_payload()
4916 {
4917 assert(devinfo->gen >= 7);
4918 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
4919
4920 payload.num_regs = 1;
4921
4922 if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
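         /* One GRF holds 8 dwords, so each of the three gl_LocalInvocationID
          * components takes dispatch_width / 8 registers: 3 GRFs in SIMD8 and
          * 6 in SIMD16.
          */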
4923 prog_data->local_invocation_id_regs = dispatch_width * 3 / 8;
4924 payload.local_invocation_id_reg = payload.num_regs;
4925 payload.num_regs += prog_data->local_invocation_id_regs;
4926 }
4927 }
4928
4929 void
4930 fs_visitor::calculate_register_pressure()
4931 {
4932 invalidate_live_intervals();
4933 calculate_live_intervals();
4934
4935 unsigned num_instructions = 0;
4936 foreach_block(block, cfg)
4937 num_instructions += block->instructions.length();
4938
4939 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
4940
4941 for (unsigned reg = 0; reg < alloc.count; reg++) {
4942 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
4943 regs_live_at_ip[ip] += alloc.sizes[reg];
4944 }
4945 }
4946
4947 void
4948 fs_visitor::optimize()
4949 {
4950 /* Start by validating the shader we currently have. */
4951 validate();
4952
4953 /* bld is the common builder object pointing at the end of the program we
4954 * used to translate it into i965 IR. For the optimization and lowering
4955 * passes coming next, any code added after the end of the program without
4956 * having explicitly called fs_builder::at() clearly points at a mistake.
4957 * Ideally optimization passes wouldn't be part of the visitor so they
4958 * wouldn't have access to bld at all, but they do, so just in case some
4959 * pass forgets to ask for a location explicitly set it to NULL here to
4960 * make it trip. The dispatch width is initialized to a bogus value to
4961 * make sure that optimizations set the execution controls explicitly to
4962 * match the code they are manipulating instead of relying on the defaults.
4963 */
4964 bld = fs_builder(this, 64);
4965
4966 assign_constant_locations();
4967 demote_pull_constants();
4968
4969 validate();
4970
4971 split_virtual_grfs();
4972 validate();
4973
4974 #define OPT(pass, args...) ({ \
4975 pass_num++; \
4976 bool this_progress = pass(args); \
4977 \
4978 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
4979 char filename[64]; \
4980 snprintf(filename, 64, "%s%d-%s-%02d-%02d-" #pass, \
4981 stage_abbrev, dispatch_width, nir->info.name, iteration, pass_num); \
4982 \
4983 backend_shader::dump_instructions(filename); \
4984 } \
4985 \
4986 validate(); \
4987 \
4988 progress = progress || this_progress; \
4989 this_progress; \
4990 })
4991
4992 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
4993 char filename[64];
4994 snprintf(filename, 64, "%s%d-%s-00-start",
4995 stage_abbrev, dispatch_width, nir->info.name);
4996
4997 backend_shader::dump_instructions(filename);
4998 }
4999
5000 bool progress = false;
5001 int iteration = 0;
5002 int pass_num = 0;
5003
5004 OPT(lower_simd_width);
5005 OPT(lower_logical_sends);
5006
5007 do {
5008 progress = false;
5009 pass_num = 0;
5010 iteration++;
5011
5012 OPT(remove_duplicate_mrf_writes);
5013
5014 OPT(opt_algebraic);
5015 OPT(opt_cse);
5016 OPT(opt_copy_propagate);
5017 OPT(opt_predicated_break, this);
5018 OPT(opt_cmod_propagation);
5019 OPT(dead_code_eliminate);
5020 OPT(opt_peephole_sel);
5021 OPT(dead_control_flow_eliminate, this);
5022 OPT(opt_register_renaming);
5023 OPT(opt_redundant_discard_jumps);
5024 OPT(opt_saturate_propagation);
5025 OPT(opt_zero_samples);
5026 OPT(register_coalesce);
5027 OPT(compute_to_mrf);
5028 OPT(eliminate_find_live_channel);
5029
5030 OPT(compact_virtual_grfs);
5031 } while (progress);
5032
5033 pass_num = 0;
5034
5035 OPT(opt_sampler_eot);
5036
5037 if (OPT(lower_load_payload)) {
5038 split_virtual_grfs();
5039 OPT(register_coalesce);
5040 OPT(compute_to_mrf);
5041 OPT(dead_code_eliminate);
5042 }
5043
5044 OPT(opt_combine_constants);
5045 OPT(lower_integer_multiplication);
5046
5047 lower_uniform_pull_constant_loads();
5048
5049 validate();
5050 }
5051
5052 /**
5053  * Three-source instructions must have a GRF/MRF destination register.
5054 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
5055 */
5056 void
5057 fs_visitor::fixup_3src_null_dest()
5058 {
5059 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
5060 if (inst->is_3src() && inst->dst.is_null()) {
5061 inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
5062 inst->dst.type);
5063 }
5064 }
5065 }
5066
5067 void
5068 fs_visitor::allocate_registers()
5069 {
5070 bool allocated_without_spills;
5071
5072 static const enum instruction_scheduler_mode pre_modes[] = {
5073 SCHEDULE_PRE,
5074 SCHEDULE_PRE_NON_LIFO,
5075 SCHEDULE_PRE_LIFO,
5076 };
5077
5078 /* Try each scheduling heuristic to see if it can successfully register
5079 * allocate without spilling. They should be ordered by decreasing
5080 * performance but increasing likelihood of allocating.
5081 */
5082 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
5083 schedule_instructions(pre_modes[i]);
5084
5085 if (0) {
5086 assign_regs_trivial();
5087 allocated_without_spills = true;
5088 } else {
5089 allocated_without_spills = assign_regs(false);
5090 }
5091 if (allocated_without_spills)
5092 break;
5093 }
5094
5095 if (!allocated_without_spills) {
5096 /* We assume that any spilling is worse than just dropping back to
5097 * SIMD8. There's probably actually some intermediate point where
5098 * SIMD16 with a couple of spills is still better.
5099 */
5100 if (dispatch_width == 16) {
5101 fail("Failure to register allocate. Reduce number of "
5102 "live scalar values to avoid this.");
5103 } else {
5104 compiler->shader_perf_log(log_data,
5105 "%s shader triggered register spilling. "
5106 "Try reducing the number of live scalar "
5107 "values to improve performance.\n",
5108 stage_name);
5109 }
5110
5111 /* Since we're out of heuristics, just go spill registers until we
5112 * get an allocation.
5113 */
5114 while (!assign_regs(true)) {
5115 if (failed)
5116 break;
5117 }
5118 }
5119
5120 /* This must come after all optimization and register allocation, since
5121 * it inserts dead code that happens to have side effects, and it does
5122 * so based on the actual physical registers in use.
5123 */
5124 insert_gen4_send_dependency_workarounds();
5125
5126 if (failed)
5127 return;
5128
5129 schedule_instructions(SCHEDULE_POST);
5130
5131 if (last_scratch > 0)
5132 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
5133 }
5134
5135 bool
5136 fs_visitor::run_vs(gl_clip_plane *clip_planes)
5137 {
5138 assert(stage == MESA_SHADER_VERTEX);
5139
5140 setup_vs_payload();
5141
5142 if (shader_time_index >= 0)
5143 emit_shader_time_begin();
5144
5145 emit_nir_code();
5146
5147 if (failed)
5148 return false;
5149
5150 compute_clip_distance(clip_planes);
5151
5152 emit_urb_writes();
5153
5154 if (shader_time_index >= 0)
5155 emit_shader_time_end();
5156
5157 calculate_cfg();
5158
5159 optimize();
5160
5161 assign_curb_setup();
5162 assign_vs_urb_setup();
5163
5164 fixup_3src_null_dest();
5165 allocate_registers();
5166
5167 return !failed;
5168 }
5169
5170 bool
5171 fs_visitor::run_gs()
5172 {
5173 assert(stage == MESA_SHADER_GEOMETRY);
5174
5175 setup_gs_payload();
5176
5177 this->final_gs_vertex_count = vgrf(glsl_type::uint_type);
5178
5179 if (gs_compile->control_data_header_size_bits > 0) {
5180 /* Create a VGRF to store accumulated control data bits. */
5181 this->control_data_bits = vgrf(glsl_type::uint_type);
5182
5183 /* If we're outputting more than 32 control data bits, then EmitVertex()
5184 * will set control_data_bits to 0 after emitting the first vertex.
5185 * Otherwise, we need to initialize it to 0 here.
5186 */
5187 if (gs_compile->control_data_header_size_bits <= 32) {
5188 const fs_builder abld = bld.annotate("initialize control data bits");
5189 abld.MOV(this->control_data_bits, brw_imm_ud(0u));
5190 }
5191 }
5192
5193 if (shader_time_index >= 0)
5194 emit_shader_time_begin();
5195
5196 emit_nir_code();
5197
5198 emit_gs_thread_end();
5199
5200 if (shader_time_index >= 0)
5201 emit_shader_time_end();
5202
5203 if (failed)
5204 return false;
5205
5206 calculate_cfg();
5207
5208 optimize();
5209
5210 assign_curb_setup();
5211 assign_gs_urb_setup();
5212
5213 fixup_3src_null_dest();
5214 allocate_registers();
5215
5216 return !failed;
5217 }
5218
5219 bool
5220 fs_visitor::run_fs(bool do_rep_send)
5221 {
5222 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
5223 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
5224
5225 assert(stage == MESA_SHADER_FRAGMENT);
5226
5227 if (devinfo->gen >= 6)
5228 setup_payload_gen6();
5229 else
5230 setup_payload_gen4();
5231
5232 if (0) {
5233 emit_dummy_fs();
5234 } else if (do_rep_send) {
5235 assert(dispatch_width == 16);
5236 emit_repclear_shader();
5237 } else {
5238 if (shader_time_index >= 0)
5239 emit_shader_time_begin();
5240
5241 calculate_urb_setup();
5242 if (nir->info.inputs_read > 0) {
5243 if (devinfo->gen < 6)
5244 emit_interpolation_setup_gen4();
5245 else
5246 emit_interpolation_setup_gen6();
5247 }
5248
5249 /* We handle discards by keeping track of the still-live pixels in f0.1.
5250 * Initialize it with the dispatched pixels.
5251 */
5252 if (wm_prog_data->uses_kill) {
5253 fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
5254 discard_init->flag_subreg = 1;
5255 }
5256
5257 /* Generate FS IR for main(). (the visitor only descends into
5258 * functions called "main").
5259 */
5260 emit_nir_code();
5261
5262 if (failed)
5263 return false;
5264
5265 if (wm_prog_data->uses_kill)
5266 bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
5267
5268 if (wm_key->alpha_test_func)
5269 emit_alpha_test();
5270
5271 emit_fb_writes();
5272
5273 if (shader_time_index >= 0)
5274 emit_shader_time_end();
5275
5276 calculate_cfg();
5277
5278 optimize();
5279
5280 assign_curb_setup();
5281 assign_urb_setup();
5282
5283 fixup_3src_null_dest();
5284 allocate_registers();
5285
5286 if (failed)
5287 return false;
5288 }
5289
5290 if (dispatch_width == 8)
5291 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
5292 else
5293 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
5294
5295 return !failed;
5296 }
5297
5298 bool
5299 fs_visitor::run_cs()
5300 {
5301 assert(stage == MESA_SHADER_COMPUTE);
5302
5303 setup_cs_payload();
5304
5305 if (shader_time_index >= 0)
5306 emit_shader_time_begin();
5307
5308 emit_nir_code();
5309
5310 if (failed)
5311 return false;
5312
5313 emit_cs_terminate();
5314
5315 if (shader_time_index >= 0)
5316 emit_shader_time_end();
5317
5318 calculate_cfg();
5319
5320 optimize();
5321
5322 assign_curb_setup();
5323
5324 fixup_3src_null_dest();
5325 allocate_registers();
5326
5327 if (failed)
5328 return false;
5329
5330 return !failed;
5331 }
5332
5333 /**
5334 * Return a bitfield where bit n is set if barycentric interpolation mode n
5335 * (see enum brw_wm_barycentric_interp_mode) is needed by the fragment shader.
5336 */
5337 static unsigned
5338 brw_compute_barycentric_interp_modes(const struct brw_device_info *devinfo,
5339 bool shade_model_flat,
5340 bool persample_shading,
5341 const nir_shader *shader)
5342 {
5343 unsigned barycentric_interp_modes = 0;
5344
5345 nir_foreach_variable(var, &shader->inputs) {
5346 enum glsl_interp_qualifier interp_qualifier =
5347 (enum glsl_interp_qualifier)var->data.interpolation;
5348 bool is_centroid = var->data.centroid && !persample_shading;
5349 bool is_sample = var->data.sample || persample_shading;
5350 bool is_gl_Color = (var->data.location == VARYING_SLOT_COL0) ||
5351 (var->data.location == VARYING_SLOT_COL1);
5352
5353 /* Ignore WPOS and FACE, because they don't require interpolation. */
5354 if (var->data.location == VARYING_SLOT_POS ||
5355 var->data.location == VARYING_SLOT_FACE)
5356 continue;
5357
5358 /* Determine the set (or sets) of barycentric coordinates needed to
5359 * interpolate this variable. Note that when
5360 * brw->needs_unlit_centroid_workaround is set, centroid interpolation
5361 * uses PIXEL interpolation for unlit pixels and CENTROID interpolation
5362 * for lit pixels, so we need both sets of barycentric coordinates.
5363 */
5364 if (interp_qualifier == INTERP_QUALIFIER_NOPERSPECTIVE) {
5365 if (is_centroid) {
5366 barycentric_interp_modes |=
5367 1 << BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
5368 } else if (is_sample) {
5369 barycentric_interp_modes |=
5370 1 << BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
5371 }
5372 if ((!is_centroid && !is_sample) ||
5373 devinfo->needs_unlit_centroid_workaround) {
5374 barycentric_interp_modes |=
5375 1 << BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
5376 }
5377 } else if (interp_qualifier == INTERP_QUALIFIER_SMOOTH ||
5378 (!(shade_model_flat && is_gl_Color) &&
5379 interp_qualifier == INTERP_QUALIFIER_NONE)) {
5380 if (is_centroid) {
5381 barycentric_interp_modes |=
5382 1 << BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
5383 } else if (is_sample) {
5384 barycentric_interp_modes |=
5385 1 << BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
5386 }
5387 if ((!is_centroid && !is_sample) ||
5388 devinfo->needs_unlit_centroid_workaround) {
5389 barycentric_interp_modes |=
5390 1 << BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
5391 }
5392 }
5393 }
5394
5395 return barycentric_interp_modes;
5396 }
5397
5398 static uint8_t
5399 computed_depth_mode(const nir_shader *shader)
5400 {
5401 if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
5402 switch (shader->info.fs.depth_layout) {
5403 case FRAG_DEPTH_LAYOUT_NONE:
5404 case FRAG_DEPTH_LAYOUT_ANY:
5405 return BRW_PSCDEPTH_ON;
5406 case FRAG_DEPTH_LAYOUT_GREATER:
5407 return BRW_PSCDEPTH_ON_GE;
5408 case FRAG_DEPTH_LAYOUT_LESS:
5409 return BRW_PSCDEPTH_ON_LE;
5410 case FRAG_DEPTH_LAYOUT_UNCHANGED:
5411 return BRW_PSCDEPTH_OFF;
5412 }
5413 }
5414 return BRW_PSCDEPTH_OFF;
5415 }
5416
5417 const unsigned *
5418 brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
5419 void *mem_ctx,
5420 const struct brw_wm_prog_key *key,
5421 struct brw_wm_prog_data *prog_data,
5422 const nir_shader *src_shader,
5423 struct gl_program *prog,
5424 int shader_time_index8, int shader_time_index16,
5425 bool use_rep_send,
5426 unsigned *final_assembly_size,
5427 char **error_str)
5428 {
5429 nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
5430 shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
5431 true);
5432 shader = brw_postprocess_nir(shader, compiler->devinfo, true);
5433
5434 /* key->alpha_test_func means simulating alpha testing via discards,
5435 * so the shader definitely kills pixels.
5436 */
5437 prog_data->uses_kill = shader->info.fs.uses_discard || key->alpha_test_func;
5438 prog_data->uses_omask =
5439 shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
5440 prog_data->computed_depth_mode = computed_depth_mode(shader);
5441 prog_data->computed_stencil =
5442 shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
5443
5444 prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
5445
5446 prog_data->barycentric_interp_modes =
5447 brw_compute_barycentric_interp_modes(compiler->devinfo,
5448 key->flat_shade,
5449 key->persample_shading,
5450 shader);
5451
5452 fs_visitor v(compiler, log_data, mem_ctx, key,
5453 &prog_data->base, prog, shader, 8,
5454 shader_time_index8);
5455 if (!v.run_fs(false /* do_rep_send */)) {
5456 if (error_str)
5457 *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
5458
5459 return NULL;
5460 }
5461
5462 cfg_t *simd16_cfg = NULL;
5463 fs_visitor v2(compiler, log_data, mem_ctx, key,
5464 &prog_data->base, prog, shader, 16,
5465 shader_time_index16);
5466 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) {
5467 if (!v.simd16_unsupported) {
5468 /* Try a SIMD16 compile */
5469 v2.import_uniforms(&v);
5470 if (!v2.run_fs(use_rep_send)) {
5471 compiler->shader_perf_log(log_data,
5472 "SIMD16 shader failed to compile: %s",
5473 v2.fail_msg);
5474 } else {
5475 simd16_cfg = v2.cfg;
5476 }
5477 }
5478 }
5479
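/* Decide which programs to hand to the generator.  The SIMD8 program is
 * dropped only when it was explicitly disabled (no8 debug flag or a
 * replicated-data send) or on pre-Gen5 hardware, and even then only if a
 * SIMD16 program is actually available to take its place.
 */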
5480 cfg_t *simd8_cfg;
5481 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || use_rep_send;
5482 if ((no_simd8 || compiler->devinfo->gen < 5) && simd16_cfg) {
5483 simd8_cfg = NULL;
5484 prog_data->no_8 = true;
5485 } else {
5486 simd8_cfg = v.cfg;
5487 prog_data->no_8 = false;
5488 }
5489
5490 fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base,
5491 v.promoted_constants, v.runtime_check_aads_emit, "FS");
5492
5493 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
5494 g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s",
5495 shader->info.label ? shader->info.label :
5496 "unnamed",
5497 shader->info.name));
5498 }
5499
5500 if (simd8_cfg)
5501 g.generate_code(simd8_cfg, 8);
5502 if (simd16_cfg)
5503 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
5504
5505 return g.get_assembly(final_assembly_size);
5506 }
5507
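/* Load gl_LocalInvocationID into a uvec3 VGRF.  The three components are
 * delivered in consecutive registers of the compute thread payload starting
 * at payload.local_invocation_id_reg; each component occupies one GRF per 8
 * channels, hence the dispatch_width / 8 stride between the MOVs.
 */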
5508 fs_reg *
5509 fs_visitor::emit_cs_local_invocation_id_setup()
5510 {
5511 assert(stage == MESA_SHADER_COMPUTE);
5512
5513 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
5514
5515 struct brw_reg src =
5516 brw_vec8_grf(payload.local_invocation_id_reg, 0);
5517 src = retype(src, BRW_REGISTER_TYPE_UD);
5518 bld.MOV(*reg, src);
5519 src.nr += dispatch_width / 8;
5520 bld.MOV(offset(*reg, bld, 1), src);
5521 src.nr += dispatch_width / 8;
5522 bld.MOV(offset(*reg, bld, 2), src);
5523
5524 return reg;
5525 }
5526
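/* Load gl_WorkGroupID into a uvec3 VGRF.  The X, Y and Z group IDs are taken
 * from r0.1, r0.6 and r0.7 of the compute thread payload.
 */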
5527 fs_reg *
5528 fs_visitor::emit_cs_work_group_id_setup()
5529 {
5530 assert(stage == MESA_SHADER_COMPUTE);
5531
5532 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
5533
5534 struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
5535 struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD));
5536 struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD));
5537
5538 bld.MOV(*reg, r0_1);
5539 bld.MOV(offset(*reg, bld, 1), r0_6);
5540 bld.MOV(offset(*reg, bld, 2), r0_7);
5541
5542 return reg;
5543 }
5544
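/* Compile a compute shader.  A SIMD8 program is always built, but it can only
 * be used if the whole local work group fits in max_cs_threads SIMD8 threads.
 * When allowed (no16 debug flag not set, SIMD16 not ruled out by the SIMD8
 * compile, and the group fits in 16-wide threads), a SIMD16 variant is
 * attempted and preferred.  If neither width yields a usable program, NULL is
 * returned and *error_str (when provided) receives the failure message.
 */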
5545 const unsigned *
5546 brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
5547 void *mem_ctx,
5548 const struct brw_cs_prog_key *key,
5549 struct brw_cs_prog_data *prog_data,
5550 const nir_shader *src_shader,
5551 int shader_time_index,
5552 unsigned *final_assembly_size,
5553 char **error_str)
5554 {
5555 nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
5556 shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
5557 true);
5558 shader = brw_postprocess_nir(shader, compiler->devinfo, true);
5559
5560 prog_data->local_size[0] = shader->info.cs.local_size[0];
5561 prog_data->local_size[1] = shader->info.cs.local_size[1];
5562 prog_data->local_size[2] = shader->info.cs.local_size[2];
5563 unsigned local_workgroup_size =
5564 shader->info.cs.local_size[0] * shader->info.cs.local_size[1] *
5565 shader->info.cs.local_size[2];
5566
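/* A SIMDn program can only cover the work group if local_workgroup_size
 * invocations fit in max_cs_threads threads of n channels each.  For example
 * (hypothetical numbers), with max_cs_threads == 64 a 16x16x4 group of 1024
 * invocations is too large for SIMD8 (8 * 64 = 512) but exactly fits SIMD16
 * (16 * 64 = 1024).
 */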
5567 unsigned max_cs_threads = compiler->devinfo->max_cs_threads;
5568
5569 cfg_t *cfg = NULL;
5570 const char *fail_msg = NULL;
5571
5572 /* Now the main event: Visit the shader IR and generate our CS IR for it.
5573 */
5574 fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base,
5575 NULL, /* Never used in core profile */
5576 shader, 8, shader_time_index);
5577 if (!v8.run_cs()) {
5578 fail_msg = v8.fail_msg;
5579 } else if (local_workgroup_size <= 8 * max_cs_threads) {
5580 cfg = v8.cfg;
5581 prog_data->simd_size = 8;
5582 }
5583
5584 fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base,
5585 NULL, /* Never used in core profile */
5586 shader, 16, shader_time_index);
5587 if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
5588 !fail_msg && !v8.simd16_unsupported &&
5589 local_workgroup_size <= 16 * max_cs_threads) {
5590 /* Try a SIMD16 compile */
5591 v16.import_uniforms(&v8);
5592 if (!v16.run_cs()) {
5593 compiler->shader_perf_log(log_data,
5594 "SIMD16 shader failed to compile: %s",
5595 v16.fail_msg);
5596 if (!cfg) {
5597 fail_msg =
5598 "Couldn't generate SIMD16 program and not "
5599 "enough threads for SIMD8";
5600 }
5601 } else {
5602 cfg = v16.cfg;
5603 prog_data->simd_size = 16;
5604 }
5605 }
5606
5607 if (unlikely(cfg == NULL)) {
5608 assert(fail_msg);
5609 if (error_str)
5610 *error_str = ralloc_strdup(mem_ctx, fail_msg);
5611
5612 return NULL;
5613 }
5614
5615 fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base,
5616 v8.promoted_constants, v8.runtime_check_aads_emit, "CS");
5617 if (unlikely(INTEL_DEBUG & DEBUG_CS)) {
5618 char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
5619 shader->info.label ? shader->info.label :
5620 "unnamed",
5621 shader->info.name);
5622 g.enable_debug(name);
5623 }
5624
5625 g.generate_code(cfg, prog_data->simd_size);
5626
5627 return g.get_assembly(final_assembly_size);
5628 }