i965/fs: Remove exec_size guessing from fs_inst::init()
[mesa.git] / src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 using namespace brw;
53
54 void
55 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
56 const fs_reg *src, unsigned sources)
57 {
58 memset(this, 0, sizeof(*this));
59
60 this->src = new fs_reg[MAX2(sources, 3)];
61 for (unsigned i = 0; i < sources; i++)
62 this->src[i] = src[i];
63
64 this->opcode = opcode;
65 this->dst = dst;
66 this->sources = sources;
67 this->exec_size = exec_size;
68
69 assert(dst.file != IMM && dst.file != UNIFORM);
70
71 assert(this->exec_size != 0);
72
73 this->conditional_mod = BRW_CONDITIONAL_NONE;
74
75 /* This will be the case for almost all instructions. */
76 switch (dst.file) {
77 case GRF:
78 case HW_REG:
79 case MRF:
80 case ATTR:
81 this->regs_written =
82 DIV_ROUND_UP(MAX2(exec_size * dst.stride, 1) * type_sz(dst.type), 32);
83 break;
84 case BAD_FILE:
85 this->regs_written = 0;
86 break;
87 case IMM:
88 case UNIFORM:
89 unreachable("Invalid destination register file");
90 default:
91 unreachable("Invalid register file");
92 }
93
94 this->writes_accumulator = false;
95 }
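/* A rough worked example of the regs_written formula above (illustrative
 * values, not from the original source): a SIMD16 write to a float
 * destination with stride 1 covers 16 * 1 * 4 = 64 bytes, so
 * DIV_ROUND_UP(64, 32) = 2 GRFs; the same write in SIMD8 covers 32 bytes
 * and fits in a single GRF.
 */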
96
97 fs_inst::fs_inst()
98 {
99 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
100 }
101
102 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
103 {
104 init(opcode, exec_size, reg_undef, NULL, 0);
105 }
106
107 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
108 {
109 init(opcode, exec_size, dst, NULL, 0);
110 }
111
112 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
113 const fs_reg &src0)
114 {
115 const fs_reg src[1] = { src0 };
116 init(opcode, exec_size, dst, src, 1);
117 }
118
119 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
120 const fs_reg &src0, const fs_reg &src1)
121 {
122 const fs_reg src[2] = { src0, src1 };
123 init(opcode, exec_size, dst, src, 2);
124 }
125
126 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
127 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
128 {
129 const fs_reg src[3] = { src0, src1, src2 };
130 init(opcode, exec_size, dst, src, 3);
131 }
132
133 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
134 const fs_reg src[], unsigned sources)
135 {
136 init(opcode, exec_width, dst, src, sources);
137 }
138
139 fs_inst::fs_inst(const fs_inst &that)
140 {
141 memcpy(this, &that, sizeof(that));
142
143 this->src = new fs_reg[MAX2(that.sources, 3)];
144
145 for (unsigned i = 0; i < that.sources; i++)
146 this->src[i] = that.src[i];
147 }
148
149 fs_inst::~fs_inst()
150 {
151 delete[] this->src;
152 }
153
154 void
155 fs_inst::resize_sources(uint8_t num_sources)
156 {
157 if (this->sources != num_sources) {
158 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
159
160 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
161 src[i] = this->src[i];
162
163 delete[] this->src;
164 this->src = src;
165 this->sources = num_sources;
166 }
167 }
168
169 void
170 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
171 const fs_reg &dst,
172 const fs_reg &surf_index,
173 const fs_reg &varying_offset,
174 uint32_t const_offset)
175 {
176 /* We have our constant surface use a pitch of 4 bytes, so our index can
177 * be any component of a vector, and then we load 4 contiguous
178 * components starting from that.
179 *
180 * We break down the const_offset to a portion added to the variable
181 * offset and a portion done using reg_offset, which means that if you
182 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
183 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
184 * CSE can later notice that those loads are all the same and eliminate
185 * the redundant ones.
186 */
187 fs_reg vec4_offset = vgrf(glsl_type::int_type);
188 bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
189
190 int scale = 1;
191 if (devinfo->gen == 4 && dst.width == 8) {
192 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
193 * u, v, r) as parameters, or we can just use the SIMD16 message
194 * consisting of (header, u). We choose the second, at the cost of a
195 * longer return length.
196 */
197 scale = 2;
198 }
199
200 enum opcode op;
201 if (devinfo->gen >= 7)
202 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
203 else
204 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
205
206 assert(dst.width % 8 == 0);
207 int regs_written = 4 * (dst.width / 8) * scale;
208 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
209 dst.type, dst.width);
210 fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
211 inst->regs_written = regs_written;
212
213 if (devinfo->gen < 7) {
214 inst->base_mrf = 13;
215 inst->header_size = 1;
216 if (devinfo->gen == 4)
217 inst->mlen = 3;
218 else
219 inst->mlen = 1 + dispatch_width / 8;
220 }
221
222 bld.MOV(dst, offset(vec4_result, bld, (const_offset & 3) * scale));
223 }
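/* Illustration of the const_offset split above (example values only): with
 * const_offset = 6, the ADD folds (6 & ~3) = 4 into vec4_offset, the pull
 * load returns the whole vec4 starting at that offset, and the final MOV
 * picks component (6 & 3) * scale = 2 * scale out of vec4_result.
 */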
224
225 /**
226 * A helper for MOV generation for fixing up broken hardware SEND dependency
227 * handling.
228 */
229 void
230 fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
231 {
232 /* The caller always wants an uncompressed instruction, to emit the minimal
233 * extra dependencies and to avoid having to align its registers to 2.
234 */
235 const fs_builder ubld = bld.annotate("send dependency resolve")
236 .half(0);
237
238 ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
239 }
240
241 bool
242 fs_inst::equals(fs_inst *inst) const
243 {
244 return (opcode == inst->opcode &&
245 dst.equals(inst->dst) &&
246 src[0].equals(inst->src[0]) &&
247 src[1].equals(inst->src[1]) &&
248 src[2].equals(inst->src[2]) &&
249 saturate == inst->saturate &&
250 predicate == inst->predicate &&
251 conditional_mod == inst->conditional_mod &&
252 mlen == inst->mlen &&
253 base_mrf == inst->base_mrf &&
254 target == inst->target &&
255 eot == inst->eot &&
256 header_size == inst->header_size &&
257 shadow_compare == inst->shadow_compare &&
258 exec_size == inst->exec_size &&
259 offset == inst->offset);
260 }
261
262 bool
263 fs_inst::overwrites_reg(const fs_reg &reg) const
264 {
265 return reg.in_range(dst, regs_written);
266 }
267
268 bool
269 fs_inst::is_send_from_grf() const
270 {
271 switch (opcode) {
272 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
273 case SHADER_OPCODE_SHADER_TIME_ADD:
274 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
275 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
276 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
277 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
278 case SHADER_OPCODE_UNTYPED_ATOMIC:
279 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
280 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
281 case SHADER_OPCODE_TYPED_ATOMIC:
282 case SHADER_OPCODE_TYPED_SURFACE_READ:
283 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
284 case SHADER_OPCODE_URB_WRITE_SIMD8:
285 return true;
286 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
287 return src[1].file == GRF;
288 case FS_OPCODE_FB_WRITE:
289 return src[0].file == GRF;
290 default:
291 if (is_tex())
292 return src[0].file == GRF;
293
294 return false;
295 }
296 }
297
298 bool
299 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
300 {
301 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
302 return false;
303
304 fs_reg reg = this->src[0];
305 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
306 return false;
307
308 if (grf_alloc.sizes[reg.reg] != this->regs_written)
309 return false;
310
311 for (int i = 0; i < this->sources; i++) {
312 reg.type = this->src[i].type;
313 reg.width = this->src[i].width;
314 if (!this->src[i].equals(reg))
315 return false;
316
317 if (i < this->header_size) {
318 reg.reg_offset += 1;
319 } else {
320 reg.reg_offset += this->exec_size / 8;
321 }
322 }
323
324 return true;
325 }
326
327 bool
328 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
329 {
330 if (devinfo->gen == 6 && is_math())
331 return false;
332
333 if (is_send_from_grf())
334 return false;
335
336 if (!backend_instruction::can_do_source_mods())
337 return false;
338
339 return true;
340 }
341
342 bool
343 fs_inst::has_side_effects() const
344 {
345 return this->eot || backend_instruction::has_side_effects();
346 }
347
348 void
349 fs_reg::init()
350 {
351 memset(this, 0, sizeof(*this));
352 stride = 1;
353 }
354
355 /** Generic unset register constructor. */
356 fs_reg::fs_reg()
357 {
358 init();
359 this->file = BAD_FILE;
360 }
361
362 /** Immediate value constructor. */
363 fs_reg::fs_reg(float f)
364 {
365 init();
366 this->file = IMM;
367 this->type = BRW_REGISTER_TYPE_F;
368 this->fixed_hw_reg.dw1.f = f;
369 this->width = 1;
370 }
371
372 /** Immediate value constructor. */
373 fs_reg::fs_reg(int32_t i)
374 {
375 init();
376 this->file = IMM;
377 this->type = BRW_REGISTER_TYPE_D;
378 this->fixed_hw_reg.dw1.d = i;
379 this->width = 1;
380 }
381
382 /** Immediate value constructor. */
383 fs_reg::fs_reg(uint32_t u)
384 {
385 init();
386 this->file = IMM;
387 this->type = BRW_REGISTER_TYPE_UD;
388 this->fixed_hw_reg.dw1.ud = u;
389 this->width = 1;
390 }
391
392 /** Vector float immediate value constructor. */
393 fs_reg::fs_reg(uint8_t vf[4])
394 {
395 init();
396 this->file = IMM;
397 this->type = BRW_REGISTER_TYPE_VF;
398 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
399 }
400
401 /** Vector float immediate value constructor. */
402 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
403 {
404 init();
405 this->file = IMM;
406 this->type = BRW_REGISTER_TYPE_VF;
407 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
408 (vf1 << 8) |
409 (vf2 << 16) |
410 (vf3 << 24);
411 }
412
413 /** Fixed brw_reg. */
414 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
415 {
416 init();
417 this->file = HW_REG;
418 this->fixed_hw_reg = fixed_hw_reg;
419 this->type = fixed_hw_reg.type;
420 this->width = 1 << fixed_hw_reg.width;
421 }
422
423 bool
424 fs_reg::equals(const fs_reg &r) const
425 {
426 return (file == r.file &&
427 reg == r.reg &&
428 reg_offset == r.reg_offset &&
429 subreg_offset == r.subreg_offset &&
430 type == r.type &&
431 negate == r.negate &&
432 abs == r.abs &&
433 !reladdr && !r.reladdr &&
434 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
435 width == r.width &&
436 stride == r.stride);
437 }
438
439 fs_reg &
440 fs_reg::set_smear(unsigned subreg)
441 {
442 assert(file != HW_REG && file != IMM);
443 subreg_offset = subreg * type_sz(type);
444 stride = 0;
445 return *this;
446 }
447
448 bool
449 fs_reg::is_contiguous() const
450 {
451 return stride == 1;
452 }
453
454 int
455 fs_visitor::type_size(const struct glsl_type *type)
456 {
457 unsigned int size, i;
458
459 switch (type->base_type) {
460 case GLSL_TYPE_UINT:
461 case GLSL_TYPE_INT:
462 case GLSL_TYPE_FLOAT:
463 case GLSL_TYPE_BOOL:
464 return type->components();
465 case GLSL_TYPE_ARRAY:
466 return type_size(type->fields.array) * type->length;
467 case GLSL_TYPE_STRUCT:
468 size = 0;
469 for (i = 0; i < type->length; i++) {
470 size += type_size(type->fields.structure[i].type);
471 }
472 return size;
473 case GLSL_TYPE_SAMPLER:
474 /* Samplers take up no register space, since they're baked in at
475 * link time.
476 */
477 return 0;
478 case GLSL_TYPE_ATOMIC_UINT:
479 return 0;
480 case GLSL_TYPE_IMAGE:
481 case GLSL_TYPE_VOID:
482 case GLSL_TYPE_ERROR:
483 case GLSL_TYPE_INTERFACE:
484 case GLSL_TYPE_DOUBLE:
485 unreachable("not reached");
486 }
487
488 return 0;
489 }
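/* Illustrative results of type_size() (editor's examples): a vec4 counts 4
 * components, a mat4 counts 16, a float[10] array counts 10, a struct sums
 * the sizes of its members, and samplers and atomic counters count 0 since
 * they occupy no register space.
 */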
490
491 /**
492 * Create a MOV to read the timestamp register.
493 *
494 * The caller is responsible for emitting the MOV. The return value is
495 * the destination of the MOV, with extra parameters set.
496 */
497 fs_reg
498 fs_visitor::get_timestamp(const fs_builder &bld)
499 {
500 assert(devinfo->gen >= 7);
501
502 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
503 BRW_ARF_TIMESTAMP,
504 0),
505 BRW_REGISTER_TYPE_UD));
506
507 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
508
509 /* We want to read the 3 fields we care about even if they're not enabled
510 * in the dispatch.
511 */
512 bld.group(4, 0).exec_all().MOV(dst, ts);
513
514 /* The caller wants the low 32 bits of the timestamp. Since it's running
515 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
516 * which is plenty of time for our purposes. It is identical across the
517 * EUs, but since it's tracking GPU core speed it will increment at a
518 * varying rate as render P-states change.
519 *
520 * The caller could also check if render P-states have changed (or anything
521 * else that might disrupt timing) by setting smear to 2 and checking if
522 * that field is != 0.
523 */
524 dst.set_smear(0);
525
526 return dst;
527 }
528
529 void
530 fs_visitor::emit_shader_time_begin()
531 {
532 shader_start_time = get_timestamp(bld.annotate("shader time start"));
533 }
534
535 void
536 fs_visitor::emit_shader_time_end()
537 {
538 /* Insert our code just before the final SEND with EOT. */
539 exec_node *end = this->instructions.get_tail();
540 assert(end && ((fs_inst *) end)->eot);
541 const fs_builder ibld = bld.annotate("shader time end")
542 .exec_all().at(NULL, end);
543
544 fs_reg shader_end_time = get_timestamp(ibld);
545
546 /* Check that there weren't any timestamp reset events (assuming these
547 * were the only two timestamp reads that happened).
548 */
549 fs_reg reset = shader_end_time;
550 reset.set_smear(2);
551 set_condmod(BRW_CONDITIONAL_Z,
552 ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
553 ibld.IF(BRW_PREDICATE_NORMAL);
554
555 fs_reg start = shader_start_time;
556 start.negate = true;
557 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
558 diff.set_smear(0);
559
560 const fs_builder cbld = ibld.group(1, 0);
561 cbld.group(1, 0).ADD(diff, start, shader_end_time);
562
563 /* If there were no instructions between the two timestamp gets, the diff
564 * is 2 cycles. Remove that overhead, so I can forget about that when
565 * trying to determine the time taken for single instructions.
566 */
567 cbld.ADD(diff, diff, fs_reg(-2u));
568 SHADER_TIME_ADD(cbld, 0, diff);
569 SHADER_TIME_ADD(cbld, 1, fs_reg(1u));
570 ibld.emit(BRW_OPCODE_ELSE);
571 SHADER_TIME_ADD(cbld, 2, fs_reg(1u));
572 ibld.emit(BRW_OPCODE_ENDIF);
573 }
574
575 void
576 fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
577 int shader_time_subindex,
578 fs_reg value)
579 {
580 int index = shader_time_index * 3 + shader_time_subindex;
581 fs_reg offset = fs_reg(index * SHADER_TIME_STRIDE);
582
583 fs_reg payload;
584 if (dispatch_width == 8)
585 payload = vgrf(glsl_type::uvec2_type);
586 else
587 payload = vgrf(glsl_type::uint_type);
588
589 bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
590 }
591
592 void
593 fs_visitor::vfail(const char *format, va_list va)
594 {
595 char *msg;
596
597 if (failed)
598 return;
599
600 failed = true;
601
602 msg = ralloc_vasprintf(mem_ctx, format, va);
603 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
604
605 this->fail_msg = msg;
606
607 if (debug_enabled) {
608 fprintf(stderr, "%s", msg);
609 }
610 }
611
612 void
613 fs_visitor::fail(const char *format, ...)
614 {
615 va_list va;
616
617 va_start(va, format);
618 vfail(format, va);
619 va_end(va);
620 }
621
622 /**
623 * Mark this program as impossible to compile in SIMD16 mode.
624 *
625 * During the SIMD8 compile (which happens first), we can detect and flag
626 * things that are unsupported in SIMD16 mode, so the compiler can skip
627 * the SIMD16 compile altogether.
628 *
629 * During a SIMD16 compile (if one happens anyway), this just calls fail().
630 */
631 void
632 fs_visitor::no16(const char *msg)
633 {
634 if (dispatch_width == 16) {
635 fail("%s", msg);
636 } else {
637 simd16_unsupported = true;
638
639 compiler->shader_perf_log(log_data,
640 "SIMD16 shader failed to compile: %s", msg);
641 }
642 }
643
644 /**
645 * Returns true if the instruction has a flag that means it won't
646 * update an entire destination register.
647 *
648 * For example, dead code elimination and live variable analysis want to know
649 * when a write to a variable screens off any preceding values that were in
650 * it.
651 */
652 bool
653 fs_inst::is_partial_write() const
654 {
655 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
656 (this->exec_size * type_sz(this->dst.type)) < 32 ||
657 !this->dst.is_contiguous());
658 }
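/* Examples of the partial-write test above (illustrative, not exhaustive): a
 * SIMD8 write to a contiguous F destination covers 8 * 4 = 32 bytes and
 * updates a whole GRF, so it is not partial; the same SIMD8 write to a W
 * (16-bit) destination covers only 16 bytes, and a predicated non-SEL write
 * may leave some channels untouched, so both count as partial writes.
 */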
659
660 int
661 fs_inst::regs_read(int arg) const
662 {
663 unsigned components = 1;
664 switch (opcode) {
665 case FS_OPCODE_FB_WRITE:
666 case SHADER_OPCODE_URB_WRITE_SIMD8:
667 case SHADER_OPCODE_UNTYPED_ATOMIC:
668 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
669 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
670 case SHADER_OPCODE_TYPED_ATOMIC:
671 case SHADER_OPCODE_TYPED_SURFACE_READ:
672 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
673 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
674 if (arg == 0)
675 return mlen;
676 break;
677
678 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
679 /* The payload is actually stored in src1 */
680 if (arg == 1)
681 return mlen;
682 break;
683
684 case FS_OPCODE_LINTERP:
685 if (arg == 0)
686 return exec_size / 4;
687 break;
688
689 case FS_OPCODE_PIXEL_X:
690 case FS_OPCODE_PIXEL_Y:
691 if (arg == 0)
692 components = 1;
693 break;
694
695 case SHADER_OPCODE_LOAD_PAYLOAD:
696 if (arg < this->header_size)
697 return 1;
698 break;
699
700 default:
701 if (is_tex() && arg == 0 && src[0].file == GRF)
702 return mlen;
703 break;
704 }
705
706 switch (src[arg].file) {
707 case BAD_FILE:
708 case UNIFORM:
709 case IMM:
710 return 1;
711 case GRF:
712 case HW_REG:
713 if (src[arg].stride == 0) {
714 return 1;
715 } else {
716 int size = components * this->exec_size * type_sz(src[arg].type);
717 return DIV_ROUND_UP(size * src[arg].stride, 32);
718 }
719 case MRF:
720 unreachable("MRF registers are not allowed as sources");
721 default:
722 unreachable("Invalid register file");
723 }
724 }
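/* A sketch of the generic GRF/HW_REG case above (example numbers only): a
 * single-component SIMD16 F source with stride 1 reads 1 * 16 * 4 = 64 bytes,
 * so regs_read() reports DIV_ROUND_UP(64 * 1, 32) = 2 registers, while a
 * stride-0 (scalar) source always reports 1.
 */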
725
726 bool
727 fs_inst::reads_flag() const
728 {
729 return predicate;
730 }
731
732 bool
733 fs_inst::writes_flag() const
734 {
735 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
736 opcode != BRW_OPCODE_IF &&
737 opcode != BRW_OPCODE_WHILE)) ||
738 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
739 }
740
741 /**
742 * Returns how many MRFs an FS opcode will write over.
743 *
744 * Note that this is not the 0 or 1 implied writes in an actual gen
745 * instruction -- the FS opcodes often generate MOVs in addition.
746 */
747 int
748 fs_visitor::implied_mrf_writes(fs_inst *inst)
749 {
750 if (inst->mlen == 0)
751 return 0;
752
753 if (inst->base_mrf == -1)
754 return 0;
755
756 switch (inst->opcode) {
757 case SHADER_OPCODE_RCP:
758 case SHADER_OPCODE_RSQ:
759 case SHADER_OPCODE_SQRT:
760 case SHADER_OPCODE_EXP2:
761 case SHADER_OPCODE_LOG2:
762 case SHADER_OPCODE_SIN:
763 case SHADER_OPCODE_COS:
764 return 1 * dispatch_width / 8;
765 case SHADER_OPCODE_POW:
766 case SHADER_OPCODE_INT_QUOTIENT:
767 case SHADER_OPCODE_INT_REMAINDER:
768 return 2 * dispatch_width / 8;
769 case SHADER_OPCODE_TEX:
770 case FS_OPCODE_TXB:
771 case SHADER_OPCODE_TXD:
772 case SHADER_OPCODE_TXF:
773 case SHADER_OPCODE_TXF_CMS:
774 case SHADER_OPCODE_TXF_MCS:
775 case SHADER_OPCODE_TG4:
776 case SHADER_OPCODE_TG4_OFFSET:
777 case SHADER_OPCODE_TXL:
778 case SHADER_OPCODE_TXS:
779 case SHADER_OPCODE_LOD:
780 return 1;
781 case FS_OPCODE_FB_WRITE:
782 return 2;
783 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
784 case SHADER_OPCODE_GEN4_SCRATCH_READ:
785 return 1;
786 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
787 return inst->mlen;
788 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
789 return inst->mlen;
790 case SHADER_OPCODE_UNTYPED_ATOMIC:
791 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
792 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
793 case SHADER_OPCODE_TYPED_ATOMIC:
794 case SHADER_OPCODE_TYPED_SURFACE_READ:
795 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
796 case SHADER_OPCODE_URB_WRITE_SIMD8:
797 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
798 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
799 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
800 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
801 return 0;
802 default:
803 unreachable("not reached");
804 }
805 }
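/* Example readings of implied_mrf_writes() (illustrative only): a SIMD16
 * SHADER_OPCODE_RCP writes 1 * 16 / 8 = 2 MRFs, a SIMD16 SHADER_OPCODE_POW
 * writes 2 * 16 / 8 = 4, and FS_OPCODE_FB_WRITE always reports 2.
 */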
806
807 fs_reg
808 fs_visitor::vgrf(const glsl_type *const type)
809 {
810 int reg_width = dispatch_width / 8;
811 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
812 brw_type_for_base_type(type), dispatch_width);
813 }
814
815 /** Fixed HW reg constructor. */
816 fs_reg::fs_reg(enum register_file file, int reg)
817 {
818 init();
819 this->file = file;
820 this->reg = reg;
821 this->type = BRW_REGISTER_TYPE_F;
822
823 switch (file) {
824 case UNIFORM:
825 this->width = 1;
826 break;
827 default:
828 this->width = 8;
829 }
830 }
831
832 /** Fixed HW reg constructor. */
833 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
834 {
835 init();
836 this->file = file;
837 this->reg = reg;
838 this->type = type;
839
840 switch (file) {
841 case UNIFORM:
842 this->width = 1;
843 break;
844 default:
845 this->width = 8;
846 }
847 }
848
849 /** Fixed HW reg constructor. */
850 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
851 uint8_t width)
852 {
853 init();
854 this->file = file;
855 this->reg = reg;
856 this->type = type;
857 this->width = width;
858 }
859
860 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
861 * This brings in those uniform definitions.
862 */
863 void
864 fs_visitor::import_uniforms(fs_visitor *v)
865 {
866 this->push_constant_loc = v->push_constant_loc;
867 this->pull_constant_loc = v->pull_constant_loc;
868 this->uniforms = v->uniforms;
869 this->param_size = v->param_size;
870 }
871
872 fs_reg *
873 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
874 bool origin_upper_left)
875 {
876 assert(stage == MESA_SHADER_FRAGMENT);
877 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
878 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
879 fs_reg wpos = *reg;
880 bool flip = !origin_upper_left ^ key->render_to_fbo;
881
882 /* gl_FragCoord.x */
883 if (pixel_center_integer) {
884 bld.MOV(wpos, this->pixel_x);
885 } else {
886 bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
887 }
888 wpos = offset(wpos, bld, 1);
889
890 /* gl_FragCoord.y */
891 if (!flip && pixel_center_integer) {
892 bld.MOV(wpos, this->pixel_y);
893 } else {
894 fs_reg pixel_y = this->pixel_y;
895 float offset = (pixel_center_integer ? 0.0 : 0.5);
896
897 if (flip) {
898 pixel_y.negate = true;
899 offset += key->drawable_height - 1.0;
900 }
901
902 bld.ADD(wpos, pixel_y, fs_reg(offset));
903 }
904 wpos = offset(wpos, bld, 1);
905
906 /* gl_FragCoord.z */
907 if (devinfo->gen >= 6) {
908 bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
909 } else {
910 bld.emit(FS_OPCODE_LINTERP, wpos,
911 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
912 interp_reg(VARYING_SLOT_POS, 2));
913 }
914 wpos = offset(wpos, bld, 1);
915
916 /* gl_FragCoord.w: Already set up in emit_interpolation */
917 bld.MOV(wpos, this->wpos_w);
918
919 return reg;
920 }
921
922 fs_inst *
923 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
924 glsl_interp_qualifier interpolation_mode,
925 bool is_centroid, bool is_sample)
926 {
927 brw_wm_barycentric_interp_mode barycoord_mode;
928 if (devinfo->gen >= 6) {
929 if (is_centroid) {
930 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
931 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
932 else
933 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
934 } else if (is_sample) {
935 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
936 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
937 else
938 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
939 } else {
940 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
941 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
942 else
943 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
944 }
945 } else {
946 /* On Ironlake and below, there is only one interpolation mode.
947 * Centroid interpolation doesn't mean anything on this hardware --
948 * there is no multisampling.
949 */
950 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
951 }
952 return bld.emit(FS_OPCODE_LINTERP, attr,
953 this->delta_xy[barycoord_mode], interp);
954 }
955
956 void
957 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
958 const glsl_type *type,
959 glsl_interp_qualifier interpolation_mode,
960 int location, bool mod_centroid,
961 bool mod_sample)
962 {
963 attr.type = brw_type_for_base_type(type->get_scalar_type());
964
965 assert(stage == MESA_SHADER_FRAGMENT);
966 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
967 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
968
969 unsigned int array_elements;
970
971 if (type->is_array()) {
972 array_elements = type->length;
973 if (array_elements == 0) {
974 fail("dereferenced array '%s' has length 0\n", name);
975 }
976 type = type->fields.array;
977 } else {
978 array_elements = 1;
979 }
980
981 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
982 bool is_gl_Color =
983 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
984 if (key->flat_shade && is_gl_Color) {
985 interpolation_mode = INTERP_QUALIFIER_FLAT;
986 } else {
987 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
988 }
989 }
990
991 for (unsigned int i = 0; i < array_elements; i++) {
992 for (unsigned int j = 0; j < type->matrix_columns; j++) {
993 if (prog_data->urb_setup[location] == -1) {
994 /* If there's no incoming setup data for this slot, don't
995 * emit interpolation for it.
996 */
997 attr = offset(attr, bld, type->vector_elements);
998 location++;
999 continue;
1000 }
1001
1002 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1003 /* Constant interpolation (flat shading) case. The SF has
1004 * handed us defined values in only the constant offset
1005 * field of the setup reg.
1006 */
1007 for (unsigned int k = 0; k < type->vector_elements; k++) {
1008 struct brw_reg interp = interp_reg(location, k);
1009 interp = suboffset(interp, 3);
1010 interp.type = attr.type;
1011 bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1012 attr = offset(attr, bld, 1);
1013 }
1014 } else {
1015 /* Smooth/noperspective interpolation case. */
1016 for (unsigned int k = 0; k < type->vector_elements; k++) {
1017 struct brw_reg interp = interp_reg(location, k);
1018 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1019 /* Get the pixel/sample mask into f0 so that we know
1020 * which pixels are lit. Then, for each channel that is
1021 * unlit, replace the centroid data with non-centroid
1022 * data.
1023 */
1024 bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1025
1026 fs_inst *inst;
1027 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1028 false, false);
1029 inst->predicate = BRW_PREDICATE_NORMAL;
1030 inst->predicate_inverse = true;
1031 if (devinfo->has_pln)
1032 inst->no_dd_clear = true;
1033
1034 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1035 mod_centroid && !key->persample_shading,
1036 mod_sample || key->persample_shading);
1037 inst->predicate = BRW_PREDICATE_NORMAL;
1038 inst->predicate_inverse = false;
1039 if (devinfo->has_pln)
1040 inst->no_dd_check = true;
1041
1042 } else {
1043 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1044 mod_centroid && !key->persample_shading,
1045 mod_sample || key->persample_shading);
1046 }
1047 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1048 bld.MUL(attr, attr, this->pixel_w);
1049 }
1050 attr = offset(attr, bld, 1);
1051 }
1052
1053 }
1054 location++;
1055 }
1056 }
1057 }
1058
1059 fs_reg *
1060 fs_visitor::emit_frontfacing_interpolation()
1061 {
1062 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1063
1064 if (devinfo->gen >= 6) {
1065 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1066 * a boolean result from this (~0/true or 0/false).
1067 *
1068 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1069 * this task in only one instruction:
1070 * - a negation source modifier will flip the bit; and
1071 * - a W -> D type conversion will sign extend the bit into the high
1072 * word of the destination.
1073 *
1074 * An ASR 15 fills the low word of the destination.
1075 */
1076 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1077 g0.negate = true;
1078
1079 bld.ASR(*reg, g0, fs_reg(15));
1080 } else {
1081 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1082 * a boolean result from this (1/true or 0/false).
1083 *
1084 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1085 * the negation source modifier to flip it. Unfortunately the SHR
1086 * instruction only operates on UD (or D with an abs source modifier)
1087 * sources without negation.
1088 *
1089 * Instead, use ASR (which will give ~0/true or 0/false).
1090 */
1091 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1092 g1_6.negate = true;
1093
1094 bld.ASR(*reg, g1_6, fs_reg(31));
1095 }
1096
1097 return reg;
1098 }
1099
1100 void
1101 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1102 {
1103 assert(stage == MESA_SHADER_FRAGMENT);
1104 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1105 assert(dst.type == BRW_REGISTER_TYPE_F);
1106
1107 if (key->compute_pos_offset) {
1108 /* Convert int_sample_pos to floating point */
1109 bld.MOV(dst, int_sample_pos);
1110 /* Scale to the range [0, 1] */
1111 bld.MUL(dst, dst, fs_reg(1 / 16.0f));
1112 }
1113 else {
1114 /* From the ARB_sample_shading specification:
1115 * "When rendering to a non-multisample buffer, or if multisample
1116 * rasterization is disabled, gl_SamplePosition will always be
1117 * (0.5, 0.5)."
1118 */
1119 bld.MOV(dst, fs_reg(0.5f));
1120 }
1121 }
1122
1123 fs_reg *
1124 fs_visitor::emit_samplepos_setup()
1125 {
1126 assert(devinfo->gen >= 6);
1127
1128 const fs_builder abld = bld.annotate("compute sample position");
1129 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1130 fs_reg pos = *reg;
1131 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1132 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1133
1134 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of the SIMD8 or
1135 * SIMD16 modes will be enabled.
1136 *
1137 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1138 * R31.1:0 Position Offset X/Y for Slot[3:0]
1139 * R31.3:2 Position Offset X/Y for Slot[7:4]
1140 * .....
1141 *
1142 * The X, Y sample positions come in as bytes in thread payload. So, read
1143 * the positions using vstride=16, width=8, hstride=2.
1144 */
1145 struct brw_reg sample_pos_reg =
1146 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1147 BRW_REGISTER_TYPE_B), 16, 8, 2);
1148
1149 if (dispatch_width == 8) {
1150 abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
1151 } else {
1152 abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
1153 abld.half(1).MOV(half(int_sample_x, 1),
1154 fs_reg(suboffset(sample_pos_reg, 16)));
1155 }
1156 /* Compute gl_SamplePosition.x */
1157 compute_sample_position(pos, int_sample_x);
1158 pos = offset(pos, abld, 1);
1159 if (dispatch_width == 8) {
1160 abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
1161 } else {
1162 abld.half(0).MOV(half(int_sample_y, 0),
1163 fs_reg(suboffset(sample_pos_reg, 1)));
1164 abld.half(1).MOV(half(int_sample_y, 1),
1165 fs_reg(suboffset(sample_pos_reg, 17)));
1166 }
1167 /* Compute gl_SamplePosition.y */
1168 compute_sample_position(pos, int_sample_y);
1169 return reg;
1170 }
1171
1172 fs_reg *
1173 fs_visitor::emit_sampleid_setup()
1174 {
1175 assert(stage == MESA_SHADER_FRAGMENT);
1176 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1177 assert(devinfo->gen >= 6);
1178
1179 const fs_builder abld = bld.annotate("compute sample id");
1180 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1181
1182 if (key->compute_sample_id) {
1183 fs_reg t1 = vgrf(glsl_type::int_type);
1184 fs_reg t2 = vgrf(glsl_type::int_type);
1185 t2.type = BRW_REGISTER_TYPE_UW;
1186
1187 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1188 * 8x multisampling, subspan 0 will represent sample N (where N
1189 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1190 * 7. We can find the value of N by looking at R0.0 bits 7:6
1191 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1192 * (since samples are always delivered in pairs). That is, we
1193 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1194 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1195 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1196 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1197 * populating a temporary variable with the sequence (0, 1, 2, 3),
1198 * and then reading from it using vstride=1, width=4, hstride=0.
1199 * These computations hold good for 4x multisampling as well.
1200 *
1201 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1202 * the first four slots are sample 0 of subspan 0; the next four
1203 * are sample 1 of subspan 0; the third group is sample 0 of
1204 * subspan 1, and finally sample 1 of subspan 1.
1205 */
1206 abld.exec_all()
1207 .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1208 fs_reg(0xc0));
1209 abld.exec_all().SHR(t1, t1, fs_reg(5));
1210
1211 /* This works for both SIMD8 and SIMD16 */
1212 abld.exec_all()
1213 .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
1214
1215 /* This special instruction takes care of setting vstride=1,
1216 * width=4, hstride=0 of t2 during an ADD instruction.
1217 */
1218 abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1219 } else {
1220 /* As per GL_ARB_sample_shading specification:
1221 * "When rendering to a non-multisample buffer, or if multisample
1222 * rasterization is disabled, gl_SampleID will always be zero."
1223 */
1224 abld.MOV(*reg, fs_reg(0));
1225 }
1226
1227 return reg;
1228 }
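/* A worked example of the sample-id math in the comment above (values are
 * illustrative): if R0.0 bits 7:6 (SSPI) read 1, then (R0.0 & 0xc0) >> 5
 * gives t1 = 2, and adding the SIMD8 sequence (0, 0, 0, 0, 1, 1, 1, 1) held
 * in t2 yields per-channel sample ids (2, 2, 2, 2, 3, 3, 3, 3).
 */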
1229
1230 void
1231 fs_visitor::resolve_source_modifiers(fs_reg *src)
1232 {
1233 if (!src->abs && !src->negate)
1234 return;
1235
1236 fs_reg temp = bld.vgrf(src->type);
1237 bld.MOV(temp, *src);
1238 *src = temp;
1239 }
1240
1241 void
1242 fs_visitor::emit_discard_jump()
1243 {
1244 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1245
1246 /* For performance, after a discard, jump to the end of the
1247 * shader if all relevant channels have been discarded.
1248 */
1249 fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
1250 discard_jump->flag_subreg = 1;
1251
1252 discard_jump->predicate = (dispatch_width == 8)
1253 ? BRW_PREDICATE_ALIGN1_ANY8H
1254 : BRW_PREDICATE_ALIGN1_ANY16H;
1255 discard_jump->predicate_inverse = true;
1256 }
1257
1258 void
1259 fs_visitor::assign_curb_setup()
1260 {
1261 if (dispatch_width == 8) {
1262 prog_data->dispatch_grf_start_reg = payload.num_regs;
1263 } else {
1264 if (stage == MESA_SHADER_FRAGMENT) {
1265 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1266 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1267 } else if (stage == MESA_SHADER_COMPUTE) {
1268 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1269 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1270 } else {
1271 unreachable("Unsupported shader type!");
1272 }
1273 }
1274
1275 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1276
1277 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1278 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1279 for (unsigned int i = 0; i < inst->sources; i++) {
1280 if (inst->src[i].file == UNIFORM) {
1281 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1282 int constant_nr;
1283 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1284 constant_nr = push_constant_loc[uniform_nr];
1285 } else {
1286 /* Section 5.11 of the OpenGL 4.1 spec says:
1287 * "Out-of-bounds reads return undefined values, which include
1288 * values from other variables of the active program or zero."
1289 * Just return the first push constant.
1290 */
1291 constant_nr = 0;
1292 }
1293
1294 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1295 constant_nr / 8,
1296 constant_nr % 8);
1297
1298 inst->src[i].file = HW_REG;
1299 inst->src[i].fixed_hw_reg = byte_offset(
1300 retype(brw_reg, inst->src[i].type),
1301 inst->src[i].subreg_offset);
1302 }
1303 }
1304 }
1305 }
1306
1307 void
1308 fs_visitor::calculate_urb_setup()
1309 {
1310 assert(stage == MESA_SHADER_FRAGMENT);
1311 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1312 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1313
1314 memset(prog_data->urb_setup, -1,
1315 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1316
1317 int urb_next = 0;
1318 /* Figure out where each of the incoming setup attributes lands. */
1319 if (devinfo->gen >= 6) {
1320 if (_mesa_bitcount_64(prog->InputsRead &
1321 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1322 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1323 * first 16 varying inputs, so we can put them wherever we want.
1324 * Just put them in order.
1325 *
1326 * This is useful because it means that (a) inputs not used by the
1327 * fragment shader won't take up valuable register space, and (b) we
1328 * won't have to recompile the fragment shader if it gets paired with
1329 * a different vertex (or geometry) shader.
1330 */
1331 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1332 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1333 BITFIELD64_BIT(i)) {
1334 prog_data->urb_setup[i] = urb_next++;
1335 }
1336 }
1337 } else {
1338 /* We have enough input varyings that the SF/SBE pipeline stage can't
1339 * arbitrarily rearrange them to suit our whim; we have to put them
1340 * in an order that matches the output of the previous pipeline stage
1341 * (geometry or vertex shader).
1342 */
1343 struct brw_vue_map prev_stage_vue_map;
1344 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1345 key->input_slots_valid);
1346 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1347 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1348 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1349 slot++) {
1350 int varying = prev_stage_vue_map.slot_to_varying[slot];
1351 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1352 * unused.
1353 */
1354 if (varying != BRW_VARYING_SLOT_COUNT &&
1355 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1356 BITFIELD64_BIT(varying))) {
1357 prog_data->urb_setup[varying] = slot - first_slot;
1358 }
1359 }
1360 urb_next = prev_stage_vue_map.num_slots - first_slot;
1361 }
1362 } else {
1363 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1364 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1365 /* Point size is packed into the header, not as a general attribute */
1366 if (i == VARYING_SLOT_PSIZ)
1367 continue;
1368
1369 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1370 /* The back color slot is skipped when the front color is
1371 * also written to. In addition, some slots can be
1372 * written in the vertex shader and not read in the
1373 * fragment shader. So the register number must always be
1374 * incremented, mapped or not.
1375 */
1376 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1377 prog_data->urb_setup[i] = urb_next;
1378 urb_next++;
1379 }
1380 }
1381
1382 /*
1383 * It's an FS-only attribute, and we did interpolation for this attribute
1384 * in the SF thread. So, count it here, too.
1385 *
1386 * See compile_sf_prog() for more info.
1387 */
1388 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1389 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1390 }
1391
1392 prog_data->num_varying_inputs = urb_next;
1393 }
1394
1395 void
1396 fs_visitor::assign_urb_setup()
1397 {
1398 assert(stage == MESA_SHADER_FRAGMENT);
1399 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1400
1401 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1402
1403 /* Offset all the urb_setup[] index by the actual position of the
1404 * setup regs, now that the location of the constants has been chosen.
1405 */
1406 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1407 if (inst->opcode == FS_OPCODE_LINTERP) {
1408 assert(inst->src[1].file == HW_REG);
1409 inst->src[1].fixed_hw_reg.nr += urb_start;
1410 }
1411
1412 if (inst->opcode == FS_OPCODE_CINTERP) {
1413 assert(inst->src[0].file == HW_REG);
1414 inst->src[0].fixed_hw_reg.nr += urb_start;
1415 }
1416 }
1417
1418 /* Each attribute is 4 setup channels, each of which is half a reg. */
1419 this->first_non_payload_grf =
1420 urb_start + prog_data->num_varying_inputs * 2;
1421 }
1422
1423 void
1424 fs_visitor::assign_vs_urb_setup()
1425 {
1426 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1427 int grf, count, slot, channel, attr;
1428
1429 assert(stage == MESA_SHADER_VERTEX);
1430 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1431 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1432 count++;
1433
1434 /* Each attribute is 4 regs. */
1435 this->first_non_payload_grf =
1436 payload.num_regs + prog_data->curb_read_length + count * 4;
1437
1438 unsigned vue_entries =
1439 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1440
1441 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1442 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1443
1444 assert(vs_prog_data->base.urb_read_length <= 15);
1445
1446 /* Rewrite all ATTR file references to the hw grf that they land in. */
1447 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1448 for (int i = 0; i < inst->sources; i++) {
1449 if (inst->src[i].file == ATTR) {
1450
1451 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1452 slot = count - 1;
1453 } else {
1454 /* Attributes come in as a contiguous block, ordered by their
1455 * gl_vert_attrib value. That means we can compute the slot
1456 * number for an attribute by masking out the enabled
1457 * attributes before it and counting the bits.
1458 */
1459 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1460 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1461 BITFIELD64_MASK(attr));
1462 }
1463
1464 channel = inst->src[i].reg_offset & 3;
1465
1466 grf = payload.num_regs +
1467 prog_data->curb_read_length +
1468 slot * 4 + channel;
1469
1470 inst->src[i].file = HW_REG;
1471 inst->src[i].fixed_hw_reg =
1472 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1473 }
1474 }
1475 }
1476 }
1477
1478 /**
1479 * Split large virtual GRFs into separate components if we can.
1480 *
1481 * This is mostly duplicated with what brw_fs_vector_splitting does,
1482 * but that's really conservative because it's afraid of doing
1483 * splitting that doesn't result in real progress after the rest of
1484 * the optimization phases, which would cause infinite looping in
1485 * optimization. We can do it once here, safely. This also has the
1486 * opportunity to split interpolated values, or maybe even uniforms,
1487 * which we don't have at the IR level.
1488 *
1489 * We want to split, because virtual GRFs are what we register
1490 * allocate and spill (due to contiguousness requirements for some
1491 * instructions), and they're what we naturally generate in the
1492 * codegen process, but most virtual GRFs don't actually need to be
1493 * contiguous sets of GRFs. If we split, we'll end up with reduced
1494 * live intervals and better dead code elimination and coalescing.
1495 */
1496 void
1497 fs_visitor::split_virtual_grfs()
1498 {
1499 int num_vars = this->alloc.count;
1500
1501 /* Count the total number of registers */
1502 int reg_count = 0;
1503 int vgrf_to_reg[num_vars];
1504 for (int i = 0; i < num_vars; i++) {
1505 vgrf_to_reg[i] = reg_count;
1506 reg_count += alloc.sizes[i];
1507 }
1508
1509 /* An array of "split points". For each register slot, this indicates
1510 * if this slot can be separated from the previous slot. Every time an
1511 * instruction uses multiple elements of a register (as a source or
1512 * destination), we mark the used slots as inseparable. Then we go
1513 * through and split the registers into the smallest pieces we can.
1514 */
1515 bool split_points[reg_count];
1516 memset(split_points, 0, sizeof(split_points));
1517
1518 /* Mark all used registers as fully splittable */
1519 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1520 if (inst->dst.file == GRF) {
1521 int reg = vgrf_to_reg[inst->dst.reg];
1522 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1523 split_points[reg + j] = true;
1524 }
1525
1526 for (int i = 0; i < inst->sources; i++) {
1527 if (inst->src[i].file == GRF) {
1528 int reg = vgrf_to_reg[inst->src[i].reg];
1529 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1530 split_points[reg + j] = true;
1531 }
1532 }
1533 }
1534
1535 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1536 if (inst->dst.file == GRF) {
1537 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1538 for (int j = 1; j < inst->regs_written; j++)
1539 split_points[reg + j] = false;
1540 }
1541 for (int i = 0; i < inst->sources; i++) {
1542 if (inst->src[i].file == GRF) {
1543 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1544 for (int j = 1; j < inst->regs_read(i); j++)
1545 split_points[reg + j] = false;
1546 }
1547 }
1548 }
1549
1550 int new_virtual_grf[reg_count];
1551 int new_reg_offset[reg_count];
1552
1553 int reg = 0;
1554 for (int i = 0; i < num_vars; i++) {
1555 /* The first one should always be 0 as a quick sanity check. */
1556 assert(split_points[reg] == false);
1557
1558 /* j = 0 case */
1559 new_reg_offset[reg] = 0;
1560 reg++;
1561 int offset = 1;
1562
1563 /* j > 0 case */
1564 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1565 /* If this is a split point, reset the offset to 0 and allocate a
1566 * new virtual GRF for the previous offset many registers
1567 */
1568 if (split_points[reg]) {
1569 assert(offset <= MAX_VGRF_SIZE);
1570 int grf = alloc.allocate(offset);
1571 for (int k = reg - offset; k < reg; k++)
1572 new_virtual_grf[k] = grf;
1573 offset = 0;
1574 }
1575 new_reg_offset[reg] = offset;
1576 offset++;
1577 reg++;
1578 }
1579
1580 /* The last one gets the original register number */
1581 assert(offset <= MAX_VGRF_SIZE);
1582 alloc.sizes[i] = offset;
1583 for (int k = reg - offset; k < reg; k++)
1584 new_virtual_grf[k] = i;
1585 }
1586 assert(reg == reg_count);
1587
1588 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1589 if (inst->dst.file == GRF) {
1590 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1591 inst->dst.reg = new_virtual_grf[reg];
1592 inst->dst.reg_offset = new_reg_offset[reg];
1593 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1594 }
1595 for (int i = 0; i < inst->sources; i++) {
1596 if (inst->src[i].file == GRF) {
1597 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1598 inst->src[i].reg = new_virtual_grf[reg];
1599 inst->src[i].reg_offset = new_reg_offset[reg];
1600 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1601 }
1602 }
1603 }
1604 invalidate_live_intervals();
1605 }
1606
1607 /**
1608 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1609 *
1610 * During code generation, we create tons of temporary variables, many of
1611 * which get immediately killed and are never used again. Yet, in later
1612 * optimization and analysis passes, such as compute_live_intervals, we need
1613 * to loop over all the virtual GRFs. Compacting them can save a lot of
1614 * overhead.
1615 */
1616 bool
1617 fs_visitor::compact_virtual_grfs()
1618 {
1619 bool progress = false;
1620 int remap_table[this->alloc.count];
1621 memset(remap_table, -1, sizeof(remap_table));
1622
1623 /* Mark which virtual GRFs are used. */
1624 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1625 if (inst->dst.file == GRF)
1626 remap_table[inst->dst.reg] = 0;
1627
1628 for (int i = 0; i < inst->sources; i++) {
1629 if (inst->src[i].file == GRF)
1630 remap_table[inst->src[i].reg] = 0;
1631 }
1632 }
1633
1634 /* Compact the GRF arrays. */
1635 int new_index = 0;
1636 for (unsigned i = 0; i < this->alloc.count; i++) {
1637 if (remap_table[i] == -1) {
1638 /* We just found an unused register. This means that we are
1639 * actually going to compact something.
1640 */
1641 progress = true;
1642 } else {
1643 remap_table[i] = new_index;
1644 alloc.sizes[new_index] = alloc.sizes[i];
1645 invalidate_live_intervals();
1646 ++new_index;
1647 }
1648 }
1649
1650 this->alloc.count = new_index;
1651
1652 /* Patch all the instructions to use the newly renumbered registers */
1653 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1654 if (inst->dst.file == GRF)
1655 inst->dst.reg = remap_table[inst->dst.reg];
1656
1657 for (int i = 0; i < inst->sources; i++) {
1658 if (inst->src[i].file == GRF)
1659 inst->src[i].reg = remap_table[inst->src[i].reg];
1660 }
1661 }
1662
1663 /* Patch all the references to delta_xy, since they're used in register
1664 * allocation. If they're unused, switch them to BAD_FILE so we don't
1665 * think some random VGRF is delta_xy.
1666 */
1667 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
1668 if (delta_xy[i].file == GRF) {
1669 if (remap_table[delta_xy[i].reg] != -1) {
1670 delta_xy[i].reg = remap_table[delta_xy[i].reg];
1671 } else {
1672 delta_xy[i].file = BAD_FILE;
1673 }
1674 }
1675 }
1676
1677 return progress;
1678 }
1679
1680 /*
1681 * Implements array access of uniforms by inserting a
1682 * PULL_CONSTANT_LOAD instruction.
1683 *
1684 * Unlike temporary GRF array access (where we don't support it due to
1685 * the difficulty of doing relative addressing on instruction
1686 * destinations), we could potentially do array access of uniforms
1687 * that were loaded in GRF space as push constants. In real-world
1688 * usage we've seen, though, the arrays being used are always larger
1689 * than we could load as push constants, so just always move all
1690 * uniform array access out to a pull constant buffer.
1691 */
1692 void
1693 fs_visitor::move_uniform_array_access_to_pull_constants()
1694 {
1695 if (dispatch_width != 8)
1696 return;
1697
1698 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1699 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
1700
1701 /* Walk through and find array access of uniforms. Put a copy of that
1702 * uniform in the pull constant buffer.
1703 *
1704 * Note that we don't move constant-indexed accesses to arrays. No
1705 * testing has been done of the performance impact of this choice.
1706 */
1707 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
1708 for (int i = 0 ; i < inst->sources; i++) {
1709 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1710 continue;
1711
1712 int uniform = inst->src[i].reg;
1713
1714 /* If this array isn't already present in the pull constant buffer,
1715 * add it.
1716 */
1717 if (pull_constant_loc[uniform] == -1) {
1718 const gl_constant_value **values = &stage_prog_data->param[uniform];
1719
1720 assert(param_size[uniform]);
1721
1722 for (int j = 0; j < param_size[uniform]; j++) {
1723 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1724
1725 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1726 values[j];
1727 }
1728 }
1729 }
1730 }
1731 }
1732
1733 /**
1734 * Assign UNIFORM file registers to either push constants or pull constants.
1735 *
1736 * We allow a fragment shader to have more than the specified minimum
1737 * maximum number of fragment shader uniform components (64). If
1738 * there are too many of these, they'd fill up all of register space.
1739 * So, this will push some of them out to the pull constant buffer and
1740 * update the program to load them.
1741 */
1742 void
1743 fs_visitor::assign_constant_locations()
1744 {
1745 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1746 if (dispatch_width != 8)
1747 return;
1748
1749 /* Find which UNIFORM registers are still in use. */
1750 bool is_live[uniforms];
1751 for (unsigned int i = 0; i < uniforms; i++) {
1752 is_live[i] = false;
1753 }
1754
1755 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1756 for (int i = 0; i < inst->sources; i++) {
1757 if (inst->src[i].file != UNIFORM)
1758 continue;
1759
1760 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1761 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1762 is_live[constant_nr] = true;
1763 }
1764 }
1765
1766 /* Only allow 16 registers (128 uniform components) as push constants.
1767 *
1768 * Just demote the end of the list. We could probably do better
1769 * here, demoting things that are rarely used in the program first.
1770 *
1771 * If changing this value, note the limitation about total_regs in
1772 * brw_curbe.c.
1773 */
1774 unsigned int max_push_components = 16 * 8;
1775 unsigned int num_push_constants = 0;
1776
1777 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1778
1779 for (unsigned int i = 0; i < uniforms; i++) {
1780 if (!is_live[i] || pull_constant_loc[i] != -1) {
1781 /* This UNIFORM register is either dead, or has already been demoted
1782 * to a pull const. Mark it as no longer living in the param[] array.
1783 */
1784 push_constant_loc[i] = -1;
1785 continue;
1786 }
1787
1788 if (num_push_constants < max_push_components) {
1789 /* Retain as a push constant. Record the location in the params[]
1790 * array.
1791 */
1792 push_constant_loc[i] = num_push_constants++;
1793 } else {
1794 /* Demote to a pull constant. */
1795 push_constant_loc[i] = -1;
1796
1797 int pull_index = stage_prog_data->nr_pull_params++;
1798 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1799 pull_constant_loc[i] = pull_index;
1800 }
1801 }
1802
1803 stage_prog_data->nr_params = num_push_constants;
1804
1805 /* Up until now, the param[] array has been indexed by reg + reg_offset
1806 * of UNIFORM registers. Condense it to only contain the uniforms we
1807 * chose to upload as push constants.
1808 */
1809 for (unsigned int i = 0; i < uniforms; i++) {
1810 int remapped = push_constant_loc[i];
1811
1812 if (remapped == -1)
1813 continue;
1814
1815 assert(remapped <= (int)i);
1816 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1817 }
1818 }
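/* A small example of the push/pull split above (illustrative counts): with
 * max_push_components = 16 * 8 = 128 and, say, 150 live uniform components
 * that were not already demoted, the first 128 receive push_constant_loc
 * slots 0..127 and the remaining 22 are appended to the pull_param array.
 */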
1819
1820 /**
1821 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1822 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1823 */
1824 void
1825 fs_visitor::demote_pull_constants()
1826 {
1827 foreach_block_and_inst (block, fs_inst, inst, cfg) {
1828 for (int i = 0; i < inst->sources; i++) {
1829 if (inst->src[i].file != UNIFORM)
1830 continue;
1831
1832 int pull_index;
1833 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
1834 if (location >= uniforms) /* Out of bounds access */
1835 pull_index = -1;
1836 else
1837 pull_index = pull_constant_loc[location];
1838
1839 if (pull_index == -1)
1840 continue;
1841
1842 /* Set up the annotation tracking for new generated instructions. */
1843 const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
1844 .at(block, inst);
1845 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1846 fs_reg dst = vgrf(glsl_type::float_type);
1847
1848 /* Generate a pull load into dst. */
1849 if (inst->src[i].reladdr) {
1850 VARYING_PULL_CONSTANT_LOAD(ibld, dst,
1851 surf_index,
1852 *inst->src[i].reladdr,
1853 pull_index);
1854 inst->src[i].reladdr = NULL;
1855 } else {
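/* pull_index is a dword index; the message fetches an aligned vec4, so
 * round the byte offset down to a 16-byte boundary and use a smear to
 * pick the right dword out of the four that come back.
 */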
1856 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1857 ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1858 dst, surf_index, offset);
1859 inst->src[i].set_smear(pull_index & 3);
1860 }
1861
1862 /* Rewrite the instruction to use the temporary VGRF. */
1863 inst->src[i].file = GRF;
1864 inst->src[i].reg = dst.reg;
1865 inst->src[i].reg_offset = 0;
1866 inst->src[i].width = dispatch_width;
1867 }
1868 }
1869 invalidate_live_intervals();
1870 }
1871
1872 bool
1873 fs_visitor::opt_algebraic()
1874 {
1875 bool progress = false;
1876
1877 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1878 switch (inst->opcode) {
1879 case BRW_OPCODE_MOV:
1880 if (inst->src[0].file != IMM)
1881 break;
1882
1883 if (inst->saturate) {
1884 if (inst->dst.type != inst->src[0].type)
1885 assert(!"unimplemented: saturate mixed types");
1886
1887 if (brw_saturate_immediate(inst->dst.type,
1888 &inst->src[0].fixed_hw_reg)) {
1889 inst->saturate = false;
1890 progress = true;
1891 }
1892 }
1893 break;
1894
1895 case BRW_OPCODE_MUL:
1896 if (inst->src[1].file != IMM)
1897 continue;
1898
1899 /* a * 1.0 = a */
1900 if (inst->src[1].is_one()) {
1901 inst->opcode = BRW_OPCODE_MOV;
1902 inst->src[1] = reg_undef;
1903 progress = true;
1904 break;
1905 }
1906
1907 /* a * -1.0 = -a */
1908 if (inst->src[1].is_negative_one()) {
1909 inst->opcode = BRW_OPCODE_MOV;
1910 inst->src[0].negate = !inst->src[0].negate;
1911 inst->src[1] = reg_undef;
1912 progress = true;
1913 break;
1914 }
1915
1916 /* a * 0.0 = 0.0 */
1917 if (inst->src[1].is_zero()) {
1918 inst->opcode = BRW_OPCODE_MOV;
1919 inst->src[0] = inst->src[1];
1920 inst->src[1] = reg_undef;
1921 progress = true;
1922 break;
1923 }
1924
1925 if (inst->src[0].file == IMM) {
1926 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
1927 inst->opcode = BRW_OPCODE_MOV;
1928 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
1929 inst->src[1] = reg_undef;
1930 progress = true;
1931 break;
1932 }
1933 break;
1934 case BRW_OPCODE_ADD:
1935 if (inst->src[1].file != IMM)
1936 continue;
1937
1938 /* a + 0.0 = a */
1939 if (inst->src[1].is_zero()) {
1940 inst->opcode = BRW_OPCODE_MOV;
1941 inst->src[1] = reg_undef;
1942 progress = true;
1943 break;
1944 }
1945
1946 if (inst->src[0].file == IMM) {
1947 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
1948 inst->opcode = BRW_OPCODE_MOV;
1949 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
1950 inst->src[1] = reg_undef;
1951 progress = true;
1952 break;
1953 }
1954 break;
1955 case BRW_OPCODE_OR:
1956 if (inst->src[0].equals(inst->src[1])) {
1957 inst->opcode = BRW_OPCODE_MOV;
1958 inst->src[1] = reg_undef;
1959 progress = true;
1960 break;
1961 }
1962 break;
1963 case BRW_OPCODE_LRP:
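/* If both blend inputs are equal, the interpolation factor no longer
 * matters: a*v + (1-a)*v == v, so the LRP reduces to a MOV of that value.
 */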
1964 if (inst->src[1].equals(inst->src[2])) {
1965 inst->opcode = BRW_OPCODE_MOV;
1966 inst->src[0] = inst->src[1];
1967 inst->src[1] = reg_undef;
1968 inst->src[2] = reg_undef;
1969 progress = true;
1970 break;
1971 }
1972 break;
1973 case BRW_OPCODE_CMP:
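/* cmp.ge of -|x| against 0 can only pass when |x| == 0, i.e. when x == 0,
 * so it is equivalent to cmp.z of x against 0 with the source modifiers
 * dropped.
 */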
1974 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
1975 inst->src[0].abs &&
1976 inst->src[0].negate &&
1977 inst->src[1].is_zero()) {
1978 inst->src[0].abs = false;
1979 inst->src[0].negate = false;
1980 inst->conditional_mod = BRW_CONDITIONAL_Z;
1981 progress = true;
1982 break;
1983 }
1984 break;
1985 case BRW_OPCODE_SEL:
1986 if (inst->src[0].equals(inst->src[1])) {
1987 inst->opcode = BRW_OPCODE_MOV;
1988 inst->src[1] = reg_undef;
1989 inst->predicate = BRW_PREDICATE_NONE;
1990 inst->predicate_inverse = false;
1991 progress = true;
1992 } else if (inst->saturate && inst->src[1].file == IMM) {
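/* With saturate, a SEL against an immediate that lies outside the [0, 1]
 * clamp range on the kept side is redundant: sat(min(x, imm)) == sat(x)
 * for imm >= 1.0 and sat(max(x, imm)) == sat(x) for imm <= 0.0, so the
 * SEL can become a saturated MOV.
 */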
1993 switch (inst->conditional_mod) {
1994 case BRW_CONDITIONAL_LE:
1995 case BRW_CONDITIONAL_L:
1996 switch (inst->src[1].type) {
1997 case BRW_REGISTER_TYPE_F:
1998 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
1999 inst->opcode = BRW_OPCODE_MOV;
2000 inst->src[1] = reg_undef;
2001 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2002 progress = true;
2003 }
2004 break;
2005 default:
2006 break;
2007 }
2008 break;
2009 case BRW_CONDITIONAL_GE:
2010 case BRW_CONDITIONAL_G:
2011 switch (inst->src[1].type) {
2012 case BRW_REGISTER_TYPE_F:
2013 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2014 inst->opcode = BRW_OPCODE_MOV;
2015 inst->src[1] = reg_undef;
2016 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2017 progress = true;
2018 }
2019 break;
2020 default:
2021 break;
2022 }
2023 default:
2024 break;
2025 }
2026 }
2027 break;
2028 case BRW_OPCODE_MAD:
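/* MAD computes src1 * src2 + src0, so a zero multiplicand reduces it to a
 * MOV of the addend, a zero addend reduces it to a MUL, and a multiplicand
 * of 1.0 reduces it to an ADD.
 */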
2029 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2030 inst->opcode = BRW_OPCODE_MOV;
2031 inst->src[1] = reg_undef;
2032 inst->src[2] = reg_undef;
2033 progress = true;
2034 } else if (inst->src[0].is_zero()) {
2035 inst->opcode = BRW_OPCODE_MUL;
2036 inst->src[0] = inst->src[2];
2037 inst->src[2] = reg_undef;
2038 progress = true;
2039 } else if (inst->src[1].is_one()) {
2040 inst->opcode = BRW_OPCODE_ADD;
2041 inst->src[1] = inst->src[2];
2042 inst->src[2] = reg_undef;
2043 progress = true;
2044 } else if (inst->src[2].is_one()) {
2045 inst->opcode = BRW_OPCODE_ADD;
2046 inst->src[2] = reg_undef;
2047 progress = true;
2048 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2049 inst->opcode = BRW_OPCODE_ADD;
2050 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2051 inst->src[2] = reg_undef;
2052 progress = true;
2053 }
2054 break;
2055 case SHADER_OPCODE_RCP: {
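/* rcp(sqrt(x)) == rsq(x): if the previous instruction computed the sqrt
 * whose reciprocal we are taking, fuse the pair into a single RSQ of the
 * original operand.
 */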
2056 fs_inst *prev = (fs_inst *)inst->prev;
2057 if (prev->opcode == SHADER_OPCODE_SQRT) {
2058 if (inst->src[0].equals(prev->dst)) {
2059 inst->opcode = SHADER_OPCODE_RSQ;
2060 inst->src[0] = prev->src[0];
2061 progress = true;
2062 }
2063 }
2064 break;
2065 }
2066 case SHADER_OPCODE_BROADCAST:
2067 if (is_uniform(inst->src[0])) {
2068 inst->opcode = BRW_OPCODE_MOV;
2069 inst->sources = 1;
2070 inst->force_writemask_all = true;
2071 progress = true;
2072 } else if (inst->src[1].file == IMM) {
2073 inst->opcode = BRW_OPCODE_MOV;
2074 inst->src[0] = component(inst->src[0],
2075 inst->src[1].fixed_hw_reg.dw1.ud);
2076 inst->sources = 1;
2077 inst->force_writemask_all = true;
2078 progress = true;
2079 }
2080 break;
2081
2082 default:
2083 break;
2084 }
2085
2086 /* Swap if src[0] is immediate. */
2087 if (progress && inst->is_commutative()) {
2088 if (inst->src[0].file == IMM) {
2089 fs_reg tmp = inst->src[1];
2090 inst->src[1] = inst->src[0];
2091 inst->src[0] = tmp;
2092 }
2093 }
2094 }
2095 return progress;
2096 }
2097
2098 /**
2099 * Optimize sample messages that have constant zero values for the trailing
2100 * texture coordinates. We can just reduce the message length for these
2101 * instructions instead of reserving a register for it. Trailing parameters
2102 * that aren't sent default to zero anyway. This will cause the dead code
2103 * eliminator to remove the now-unused MOV instruction that was emitted to
2104 * set up the zero value.
2105 */
2106 bool
2107 fs_visitor::opt_zero_samples()
2108 {
2109 /* Gen4 infers the texturing opcode based on the message length so we can't
2110 * change it.
2111 */
2112 if (devinfo->gen < 5)
2113 return false;
2114
2115 bool progress = false;
2116
2117 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2118 if (!inst->is_tex())
2119 continue;
2120
2121 fs_inst *load_payload = (fs_inst *) inst->prev;
2122
2123 if (load_payload->is_head_sentinel() ||
2124 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2125 continue;
2126
2127 /* We don't want to remove the message header or the first parameter.
2128 * Removing the first parameter is not allowed; see the Haswell PRM
2129 * volume 7, page 149:
2130 *
2131 * "Parameter 0 is required except for the sampleinfo message, which
2132 * has no parameter 0"
2133 */
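/* Each texturing parameter occupies dispatch_width / 8 registers of the
 * payload, so (mlen - header_size) / (dispatch_width / 8) is the number of
 * parameters currently sent, and the LOAD_PAYLOAD source indexed below is
 * the last of them.
 */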
2134 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2135 load_payload->src[(inst->mlen - inst->header_size) /
2136 (dispatch_width / 8) +
2137 inst->header_size - 1].is_zero()) {
2138 inst->mlen -= dispatch_width / 8;
2139 progress = true;
2140 }
2141 }
2142
2143 if (progress)
2144 invalidate_live_intervals();
2145
2146 return progress;
2147 }
2148
2149 /**
2150 * Optimize sample messages which are followed by the final RT write.
2151 *
2152 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2153 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2154 * final texturing results copied to the framebuffer write payload and modify
2155 * them to write to the framebuffer directly.
2156 */
2157 bool
2158 fs_visitor::opt_sampler_eot()
2159 {
2160 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2161
2162 if (stage != MESA_SHADER_FRAGMENT)
2163 return false;
2164
2165 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2166 return false;
2167
2168 /* FINISHME: It should be possible to implement this optimization when there
2169 * are multiple drawbuffers.
2170 */
2171 if (key->nr_color_regions != 1)
2172 return false;
2173
2174 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2175 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2176 assert(fb_write->eot);
2177 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2178
2179 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2180
2181 /* There wasn't one; nothing to do. */
2182 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2183 return false;
2184
2185 /* This optimization doesn't seem to work for textureGather for some
2186 * reason. I can't find any documentation or known workarounds indicating
2187 * that this is expected, but considering that it is probably pretty
2188 * unlikely that a shader would directly write out the results from
2189 * textureGather, we might as well just disable it.
2190 */
2191 if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2192 tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2193 return false;
2194
2195 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2196 * It's very likely to be the previous instruction.
2197 */
2198 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2199 if (load_payload->is_head_sentinel() ||
2200 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2201 return false;
2202
2203 assert(!tex_inst->eot); /* We can't get here twice */
2204 assert((tex_inst->offset & (0xff << 24)) == 0);
2205
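/* Fold the FB write's render target index into the top byte of the
 * texturing message offset (the assert above verified that byte was still
 * clear) so the EOT'd sample writes to the correct render target.
 */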
2206 tex_inst->offset |= fb_write->target << 24;
2207 tex_inst->eot = true;
2208 tex_inst->dst = bld.null_reg_ud();
2209 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2210
2211 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2212 * to create a new LOAD_PAYLOAD command with the same sources and a space
2213 * saved for the header. Using a new destination register not only makes sure
2214 * we have enough space, but also ensures that the dead code eliminator kills
2215 * the instruction that this replaces.
2216 */
2217 if (tex_inst->header_size != 0)
2218 return true;
2219
2220 fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
2221 load_payload->sources + 1);
2222 fs_reg *new_sources =
2223 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2224
2225 new_sources[0] = fs_reg();
2226 for (int i = 0; i < load_payload->sources; i++)
2227 new_sources[i+1] = load_payload->src[i];
2228
2229 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2230 * requires a lot of information about the sources to appropriately figure
2231 * out the number of registers that need to be used. Given this stage in our
2232 * optimization, we may not have the appropriate GRFs required by
2233 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2234 * manually emit the instruction.
2235 */
2236 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2237 load_payload->exec_size,
2238 send_header,
2239 new_sources,
2240 load_payload->sources + 1);
2241
2242 new_load_payload->regs_written = load_payload->regs_written + 1;
2243 new_load_payload->header_size = 1;
2244 tex_inst->mlen++;
2245 tex_inst->header_size = 1;
2246 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2247 tex_inst->src[0] = send_header;
2248
2249 return true;
2250 }
2251
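/* Rename the destinations of complete (non-partial) VGRF writes that occur
 * outside of control flow to fresh registers.  This splits unrelated live
 * ranges that happen to reuse the same virtual register, which can help
 * later coalescing and register allocation.
 */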
2252 bool
2253 fs_visitor::opt_register_renaming()
2254 {
2255 bool progress = false;
2256 int depth = 0;
2257
2258 int remap[alloc.count];
2259 memset(remap, -1, sizeof(int) * alloc.count);
2260
2261 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2262 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2263 depth++;
2264 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2265 inst->opcode == BRW_OPCODE_WHILE) {
2266 depth--;
2267 }
2268
2269 /* Rewrite instruction sources. */
2270 for (int i = 0; i < inst->sources; i++) {
2271 if (inst->src[i].file == GRF &&
2272 remap[inst->src[i].reg] != -1 &&
2273 remap[inst->src[i].reg] != inst->src[i].reg) {
2274 inst->src[i].reg = remap[inst->src[i].reg];
2275 progress = true;
2276 }
2277 }
2278
2279 const int dst = inst->dst.reg;
2280
2281 if (depth == 0 &&
2282 inst->dst.file == GRF &&
2283 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2284 !inst->is_partial_write()) {
2285 if (remap[dst] == -1) {
2286 remap[dst] = dst;
2287 } else {
2288 remap[dst] = alloc.allocate(inst->dst.width / 8);
2289 inst->dst.reg = remap[dst];
2290 progress = true;
2291 }
2292 } else if (inst->dst.file == GRF &&
2293 remap[dst] != -1 &&
2294 remap[dst] != dst) {
2295 inst->dst.reg = remap[dst];
2296 progress = true;
2297 }
2298 }
2299
2300 if (progress) {
2301 invalidate_live_intervals();
2302
2303 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2304 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2305 delta_xy[i].reg = remap[delta_xy[i].reg];
2306 }
2307 }
2308 }
2309
2310 return progress;
2311 }
2312
2313 /**
2314 * Remove redundant or useless discard jumps.
2315 *
2316 * For example, we can eliminate jumps in the following sequence:
2317 *
2318 * discard-jump (redundant with the next jump)
2319 * discard-jump (useless; jumps to the next instruction)
2320 * placeholder-halt
2321 */
2322 bool
2323 fs_visitor::opt_redundant_discard_jumps()
2324 {
2325 bool progress = false;
2326
2327 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2328
2329 fs_inst *placeholder_halt = NULL;
2330 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2331 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2332 placeholder_halt = inst;
2333 break;
2334 }
2335 }
2336
2337 if (!placeholder_halt)
2338 return false;
2339
2340 /* Delete any discard jumps (HALTs) immediately before the placeholder halt. */
2341 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2342 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2343 prev = (fs_inst *) placeholder_halt->prev) {
2344 prev->remove(last_bblock);
2345 progress = true;
2346 }
2347
2348 if (progress)
2349 invalidate_live_intervals();
2350
2351 return progress;
2352 }
2353
2354 bool
2355 fs_visitor::compute_to_mrf()
2356 {
2357 bool progress = false;
2358 int next_ip = 0;
2359
2360 /* No MRFs on Gen >= 7. */
2361 if (devinfo->gen >= 7)
2362 return false;
2363
2364 calculate_live_intervals();
2365
2366 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2367 int ip = next_ip;
2368 next_ip++;
2369
2370 if (inst->opcode != BRW_OPCODE_MOV ||
2371 inst->is_partial_write() ||
2372 inst->dst.file != MRF || inst->src[0].file != GRF ||
2373 inst->dst.type != inst->src[0].type ||
2374 inst->src[0].abs || inst->src[0].negate ||
2375 !inst->src[0].is_contiguous() ||
2376 inst->src[0].subreg_offset)
2377 continue;
2378
2379 /* Work out which hardware MRF registers are written by this
2380 * instruction.
2381 */
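/* A COMPR4 write places its second half four MRFs above the first, a plain
 * SIMD16 write covers two consecutive MRFs, and anything else touches a
 * single MRF.
 */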
2382 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2383 int mrf_high;
2384 if (inst->dst.reg & BRW_MRF_COMPR4) {
2385 mrf_high = mrf_low + 4;
2386 } else if (inst->exec_size == 16) {
2387 mrf_high = mrf_low + 1;
2388 } else {
2389 mrf_high = mrf_low;
2390 }
2391
2392 /* Can't compute-to-MRF this GRF if someone else was going to
2393 * read it later.
2394 */
2395 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2396 continue;
2397
2398 /* Found a move of a GRF to a MRF. Let's see if we can go
2399 * rewrite the thing that made this GRF to write into the MRF.
2400 */
2401 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2402 if (scan_inst->dst.file == GRF &&
2403 scan_inst->dst.reg == inst->src[0].reg) {
2404 /* Found the last thing to write our reg we want to turn
2405 * into a compute-to-MRF.
2406 */
2407
2408 /* If this one instruction didn't populate all the
2409 * channels, bail. We might be able to rewrite everything
2410 * that writes that reg, but it would require smarter
2411 * tracking to delay the rewriting until complete success.
2412 */
2413 if (scan_inst->is_partial_write())
2414 break;
2415
2416 /* Things returning more than one register would need us to
2417 * understand coalescing out more than one MOV at a time.
2418 */
2419 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2420 break;
2421
2422 /* SEND instructions can't have MRF as a destination. */
2423 if (scan_inst->mlen)
2424 break;
2425
2426 if (devinfo->gen == 6) {
2427 /* gen6 math instructions must have the destination be
2428 * GRF, so no compute-to-MRF for them.
2429 */
2430 if (scan_inst->is_math()) {
2431 break;
2432 }
2433 }
2434
2435 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2436 /* Found the creator of our MRF's source value. */
2437 scan_inst->dst.file = MRF;
2438 scan_inst->dst.reg = inst->dst.reg;
2439 scan_inst->saturate |= inst->saturate;
2440 inst->remove(block);
2441 progress = true;
2442 }
2443 break;
2444 }
2445
2446 /* We don't handle control flow here. Most computation of
2447 * values that end up in MRFs are shortly before the MRF
2448 * write anyway.
2449 */
2450 if (block->start() == scan_inst)
2451 break;
2452
2453 /* You can't read from an MRF, so if someone else reads our
2454 * MRF's source GRF that we wanted to rewrite, that stops us.
2455 */
2456 bool interfered = false;
2457 for (int i = 0; i < scan_inst->sources; i++) {
2458 if (scan_inst->src[i].file == GRF &&
2459 scan_inst->src[i].reg == inst->src[0].reg &&
2460 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2461 interfered = true;
2462 }
2463 }
2464 if (interfered)
2465 break;
2466
2467 if (scan_inst->dst.file == MRF) {
2468 /* If somebody else writes our MRF here, we can't
2469 * compute-to-MRF before that.
2470 */
2471 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2472 int scan_mrf_high;
2473
2474 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2475 scan_mrf_high = scan_mrf_low + 4;
2476 } else if (scan_inst->exec_size == 16) {
2477 scan_mrf_high = scan_mrf_low + 1;
2478 } else {
2479 scan_mrf_high = scan_mrf_low;
2480 }
2481
2482 if (mrf_low == scan_mrf_low ||
2483 mrf_low == scan_mrf_high ||
2484 mrf_high == scan_mrf_low ||
2485 mrf_high == scan_mrf_high) {
2486 break;
2487 }
2488 }
2489
2490 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2491 /* Found a SEND instruction, which means that there are
2492 * live values in MRFs from base_mrf to base_mrf +
2493 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2494 * above it.
2495 */
2496 if (mrf_low >= scan_inst->base_mrf &&
2497 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2498 break;
2499 }
2500 if (mrf_high >= scan_inst->base_mrf &&
2501 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2502 break;
2503 }
2504 }
2505 }
2506 }
2507
2508 if (progress)
2509 invalidate_live_intervals();
2510
2511 return progress;
2512 }
2513
2514 /**
2515 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2516 * flow. We could probably do better here with some form of divergence
2517 * analysis.
2518 */
2519 bool
2520 fs_visitor::eliminate_find_live_channel()
2521 {
2522 bool progress = false;
2523 unsigned depth = 0;
2524
2525 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2526 switch (inst->opcode) {
2527 case BRW_OPCODE_IF:
2528 case BRW_OPCODE_DO:
2529 depth++;
2530 break;
2531
2532 case BRW_OPCODE_ENDIF:
2533 case BRW_OPCODE_WHILE:
2534 depth--;
2535 break;
2536
2537 case FS_OPCODE_DISCARD_JUMP:
2538 /* This can potentially make control flow non-uniform until the end
2539 * of the program.
2540 */
2541 return progress;
2542
2543 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
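/* Outside of any control flow (and before any discard jump), every
 * dispatched channel is still enabled, so the first live channel is
 * channel 0 and the result can be folded to an immediate 0.
 */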
2544 if (depth == 0) {
2545 inst->opcode = BRW_OPCODE_MOV;
2546 inst->src[0] = fs_reg(0);
2547 inst->sources = 1;
2548 inst->force_writemask_all = true;
2549 progress = true;
2550 }
2551 break;
2552
2553 default:
2554 break;
2555 }
2556 }
2557
2558 return progress;
2559 }
2560
2561 /**
2562 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2563 * instructions to FS_OPCODE_REP_FB_WRITE.
2564 */
2565 void
2566 fs_visitor::emit_repclear_shader()
2567 {
2568 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2569 int base_mrf = 1;
2570 int color_mrf = base_mrf + 2;
2571
2572 fs_inst *mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)),
2573 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
2574
2575 fs_inst *write;
2576 if (key->nr_color_regions == 1) {
2577 write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2578 write->saturate = key->clamp_fragment_color;
2579 write->base_mrf = color_mrf;
2580 write->target = 0;
2581 write->header_size = 0;
2582 write->mlen = 1;
2583 } else {
2584 assume(key->nr_color_regions > 0);
2585 for (int i = 0; i < key->nr_color_regions; ++i) {
2586 write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2587 write->saturate = key->clamp_fragment_color;
2588 write->base_mrf = base_mrf;
2589 write->target = i;
2590 write->header_size = 2;
2591 write->mlen = 3;
2592 }
2593 }
2594 write->eot = true;
2595
2596 calculate_cfg();
2597
2598 assign_constant_locations();
2599 assign_curb_setup();
2600
2601 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2602 assert(mov->src[0].file == HW_REG);
2603 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2604 }
2605
2606 /**
2607 * Walks through basic blocks, looking for repeated MRF writes and
2608 * removing the later ones.
2609 */
2610 bool
2611 fs_visitor::remove_duplicate_mrf_writes()
2612 {
2613 fs_inst *last_mrf_move[16];
2614 bool progress = false;
2615
2616 /* We'd need to update the MRF tracking to handle compressed (SIMD16) instructions; bail for now. */
2617 if (dispatch_width == 16)
2618 return false;
2619
2620 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2621
2622 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2623 if (inst->is_control_flow()) {
2624 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2625 }
2626
2627 if (inst->opcode == BRW_OPCODE_MOV &&
2628 inst->dst.file == MRF) {
2629 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2630 if (prev_inst && inst->equals(prev_inst)) {
2631 inst->remove(block);
2632 progress = true;
2633 continue;
2634 }
2635 }
2636
2637 /* Clear out the last-write records for MRFs that were overwritten. */
2638 if (inst->dst.file == MRF) {
2639 last_mrf_move[inst->dst.reg] = NULL;
2640 }
2641
2642 if (inst->mlen > 0 && inst->base_mrf != -1) {
2643 /* Found a SEND instruction, which will include two or fewer
2644 * implied MRF writes. We could do better here.
2645 */
2646 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2647 last_mrf_move[inst->base_mrf + i] = NULL;
2648 }
2649 }
2650
2651 /* Clear out any MRF move records whose sources got overwritten. */
2652 if (inst->dst.file == GRF) {
2653 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2654 if (last_mrf_move[i] &&
2655 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2656 last_mrf_move[i] = NULL;
2657 }
2658 }
2659 }
2660
2661 if (inst->opcode == BRW_OPCODE_MOV &&
2662 inst->dst.file == MRF &&
2663 inst->src[0].file == GRF &&
2664 !inst->is_partial_write()) {
2665 last_mrf_move[inst->dst.reg] = inst;
2666 }
2667 }
2668
2669 if (progress)
2670 invalidate_live_intervals();
2671
2672 return progress;
2673 }
2674
2675 static void
2676 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2677 {
2678 /* Clear the flag for registers that actually got read (as expected). */
2679 for (int i = 0; i < inst->sources; i++) {
2680 int grf;
2681 if (inst->src[i].file == GRF) {
2682 grf = inst->src[i].reg;
2683 } else if (inst->src[i].file == HW_REG &&
2684 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2685 grf = inst->src[i].fixed_hw_reg.nr;
2686 } else {
2687 continue;
2688 }
2689
2690 if (grf >= first_grf &&
2691 grf < first_grf + grf_len) {
2692 deps[grf - first_grf] = false;
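/* A SIMD16 source spans two consecutive GRFs, so clear the next one too. */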
2693 if (inst->exec_size == 16)
2694 deps[grf - first_grf + 1] = false;
2695 }
2696 }
2697 }
2698
2699 /**
2700 * Implements this workaround for the original 965:
2701 *
2702 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2703 * check for post destination dependencies on this instruction, software
2704 * must ensure that there is no destination hazard for the case of ‘write
2705 * followed by a posted write’ shown in the following example.
2706 *
2707 * 1. mov r3 0
2708 * 2. send r3.xy <rest of send instruction>
2709 * 3. mov r2 r3
2710 *
2711 * Due to no post-destination dependency check on the ‘send’, the above
2712 * code sequence could have two instructions (1 and 2) in flight at the
2713 * same time that both consider ‘r3’ as the target of their final writes.
2714 */
2715 void
2716 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2717 fs_inst *inst)
2718 {
2719 int write_len = inst->regs_written;
2720 int first_write_grf = inst->dst.reg;
2721 bool needs_dep[BRW_MAX_MRF];
2722 assert(write_len < (int)sizeof(needs_dep) - 1);
2723
2724 memset(needs_dep, false, sizeof(needs_dep));
2725 memset(needs_dep, true, write_len);
2726
2727 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2728
2729 /* Walk backwards looking for writes to registers we're writing which
2730 * aren't read since being written. If we hit the start of the program,
2731 * we assume that there are no outstanding dependencies on entry to the
2732 * program.
2733 */
2734 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2735 /* If we hit control flow, assume that there *are* outstanding
2736 * dependencies, and force their cleanup before our instruction.
2737 */
2738 if (block->start() == scan_inst) {
2739 for (int i = 0; i < write_len; i++) {
2740 if (needs_dep[i])
2741 DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
2742 }
2743 return;
2744 }
2745
2746 /* We insert our reads as late as possible on the assumption that any
2747 * instruction but a MOV that might have left us an outstanding
2748 * dependency has more latency than a MOV.
2749 */
2750 if (scan_inst->dst.file == GRF) {
2751 for (int i = 0; i < scan_inst->regs_written; i++) {
2752 int reg = scan_inst->dst.reg + i;
2753
2754 if (reg >= first_write_grf &&
2755 reg < first_write_grf + write_len &&
2756 needs_dep[reg - first_write_grf]) {
2757 DEP_RESOLVE_MOV(bld.at(block, inst), reg);
2758 needs_dep[reg - first_write_grf] = false;
2759 if (scan_inst->exec_size == 16)
2760 needs_dep[reg - first_write_grf + 1] = false;
2761 }
2762 }
2763 }
2764
2765 /* Clear the flag for registers that actually got read (as expected). */
2766 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2767
2768 /* Continue the loop only if we haven't resolved all the dependencies */
2769 int i;
2770 for (i = 0; i < write_len; i++) {
2771 if (needs_dep[i])
2772 break;
2773 }
2774 if (i == write_len)
2775 return;
2776 }
2777 }
2778
2779 /**
2780 * Implements this workaround for the original 965:
2781 *
2782 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2783 * used as a destination register until after it has been sourced by an
2784 * instruction with a different destination register."
2785 */
2786 void
2787 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2788 {
2789 int write_len = inst->regs_written;
2790 int first_write_grf = inst->dst.reg;
2791 bool needs_dep[BRW_MAX_MRF];
2792 assert(write_len < (int)sizeof(needs_dep) - 1);
2793
2794 memset(needs_dep, false, sizeof(needs_dep));
2795 memset(needs_dep, true, write_len);
2796 /* Walk forwards looking for writes to registers we're writing which aren't
2797 * read before being written.
2798 */
2799 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2800 /* If we hit control flow, force resolve all remaining dependencies. */
2801 if (block->end() == scan_inst) {
2802 for (int i = 0; i < write_len; i++) {
2803 if (needs_dep[i])
2804 DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
2805 }
2806 return;
2807 }
2808
2809 /* Clear the flag for registers that actually got read (as expected). */
2810 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2811
2812 /* We insert our reads as late as possible since they're reading the
2813 * result of a SEND, which has massive latency.
2814 */
2815 if (scan_inst->dst.file == GRF &&
2816 scan_inst->dst.reg >= first_write_grf &&
2817 scan_inst->dst.reg < first_write_grf + write_len &&
2818 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2819 DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
2820 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2821 }
2822
2823 /* Continue the loop only if we haven't resolved all the dependencies */
2824 int i;
2825 for (i = 0; i < write_len; i++) {
2826 if (needs_dep[i])
2827 break;
2828 }
2829 if (i == write_len)
2830 return;
2831 }
2832 }
2833
2834 void
2835 fs_visitor::insert_gen4_send_dependency_workarounds()
2836 {
2837 if (devinfo->gen != 4 || devinfo->is_g4x)
2838 return;
2839
2840 bool progress = false;
2841
2842 /* Note that we're done with register allocation, so GRF fs_regs always
2843 * have a .reg_offset of 0.
2844 */
2845
2846 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2847 if (inst->mlen != 0 && inst->dst.file == GRF) {
2848 insert_gen4_pre_send_dependency_workarounds(block, inst);
2849 insert_gen4_post_send_dependency_workarounds(block, inst);
2850 progress = true;
2851 }
2852 }
2853
2854 if (progress)
2855 invalidate_live_intervals();
2856 }
2857
2858 /**
2859 * Turns the generic expression-style uniform pull constant load instruction
2860 * into a hardware-specific series of instructions for loading a pull
2861 * constant.
2862 *
2863 * The expression style allows the CSE pass before this to optimize out
2864 * repeated loads from the same offset, and gives the pre-register-allocation
2865 * scheduling full flexibility, while the conversion to native instructions
2866 * allows the post-register-allocation scheduler the best information
2867 * possible.
2868 *
2869 * Note that execution masking for setting up pull constant loads is special:
2870 * the channels that need to be written are unrelated to the current execution
2871 * mask, since a later instruction will use one of the result channels as a
2872 * source operand for all 8 or 16 of its channels.
2873 */
2874 void
2875 fs_visitor::lower_uniform_pull_constant_loads()
2876 {
2877 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2878 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2879 continue;
2880
2881 if (devinfo->gen >= 7) {
2882 /* Up to this point the offset arg has been a vec4-aligned byte offset.
2883 * We need to turn it into a dword offset.
2884 */
2885 fs_reg const_offset_reg = inst->src[1];
2886 assert(const_offset_reg.file == IMM &&
2887 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2888 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2889
2890 fs_reg payload, offset;
2891 if (devinfo->gen >= 9) {
2892 /* We have to use a message header on Skylake to get SIMD4x2
2893 * mode. Reserve space for the register.
2894 */
2895 offset = payload = fs_reg(GRF, alloc.allocate(2));
2896 offset.reg_offset++;
2897 inst->mlen = 2;
2898 } else {
2899 offset = payload = fs_reg(GRF, alloc.allocate(1));
2900 inst->mlen = 1;
2901 }
2902
2903 /* This is actually going to be a MOV, but since only the first dword
2904 * is accessed, we have a special opcode to do just that one. Note
2905 * that this needs to be an operation that will be considered a def
2906 * by live variable analysis, or register allocation will explode.
2907 */
2908 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2909 8, offset, const_offset_reg);
2910 setup->force_writemask_all = true;
2911
2912 setup->ir = inst->ir;
2913 setup->annotation = inst->annotation;
2914 inst->insert_before(block, setup);
2915
2916 /* Similarly, this will only populate the first 4 channels of the
2917 * result register (since we only use smear values from 0-3), but we
2918 * don't tell the optimizer.
2919 */
2920 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2921 inst->src[1] = payload;
2922 inst->base_mrf = -1;
2923
2924 invalidate_live_intervals();
2925 } else {
2926 /* Before register allocation, we didn't tell the scheduler about the
2927 * MRF we use. We know it's safe to use this MRF because nothing
2928 * else does except for register spill/unspill, which generates and
2929 * uses its MRF within a single IR instruction.
2930 */
2931 inst->base_mrf = 14;
2932 inst->mlen = 1;
2933 }
2934 }
2935 }
2936
2937 bool
2938 fs_visitor::lower_load_payload()
2939 {
2940 bool progress = false;
2941
2942 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2943 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2944 continue;
2945
2946 assert(inst->dst.file == MRF || inst->dst.file == GRF);
2947 assert(inst->saturate == false);
2948 fs_reg dst = inst->dst;
2949
2950 /* Get rid of COMPR4. We'll add it back in if we need it */
2951 if (dst.file == MRF)
2952 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
2953
2954 dst.width = 8;
2955 const fs_builder hbld = bld.group(8, 0).exec_all().at(block, inst);
2956
2957 for (uint8_t i = 0; i < inst->header_size; i++) {
2958 if (inst->src[i].file != BAD_FILE) {
2959 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
2960 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
2961 mov_src.width = 8;
2962 hbld.MOV(mov_dst, mov_src);
2963 }
2964 dst = offset(dst, hbld, 1);
2965 }
2966
2967 dst.width = inst->exec_size;
2968 const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
2969 .exec_all(inst->force_writemask_all)
2970 .at(block, inst);
2971
2972 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
2973 inst->exec_size > 8) {
2974 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
2975 * a straightforward copy. Instead, the result of the
2976 * LOAD_PAYLOAD is treated as interleaved and the first four
2977 * non-header sources are unpacked as:
2978 *
2979 * m + 0: r0
2980 * m + 1: g0
2981 * m + 2: b0
2982 * m + 3: a0
2983 * m + 4: r1
2984 * m + 5: g1
2985 * m + 6: b1
2986 * m + 7: a1
2987 *
2988 * This is used for gen <= 5 fb writes.
2989 */
2990 assert(inst->exec_size == 16);
2991 assert(inst->header_size + 4 <= inst->sources);
2992 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
2993 if (inst->src[i].file != BAD_FILE) {
2994 if (devinfo->has_compr4) {
2995 fs_reg compr4_dst = retype(dst, inst->src[i].type);
2996 compr4_dst.reg |= BRW_MRF_COMPR4;
2997 ibld.MOV(compr4_dst, inst->src[i]);
2998 } else {
2999 /* Platform doesn't have COMPR4. We have to fake it */
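/* Emit the two SIMD8 halves by hand, placing the second half four MRFs
 * above the first to match the register layout COMPR4 would have produced.
 */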
3000 fs_reg mov_dst = retype(dst, inst->src[i].type);
3001 mov_dst.width = 8;
3002 ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
3003 mov_dst.reg += 4;
3004 ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
3005 }
3006 }
3007
3008 dst.reg++;
3009 }
3010
3011 /* The loop above only ever incremented us through the first set
3012 * of 4 registers. However, thanks to the magic of COMPR4, we
3013 * actually wrote to the first 8 registers, so we need to take
3014 * that into account now.
3015 */
3016 dst.reg += 4;
3017
3018 /* The COMPR4 code took care of the first 4 sources. We'll let
3019 * the regular path handle any remaining sources. Yes, we are
3020 * modifying the instruction but we're about to delete it so
3021 * this really doesn't hurt anything.
3022 */
3023 inst->header_size += 4;
3024 }
3025
3026 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3027 if (inst->src[i].file != BAD_FILE)
3028 ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
3029 dst = offset(dst, ibld, 1);
3030 }
3031
3032 inst->remove(block);
3033 progress = true;
3034 }
3035
3036 if (progress)
3037 invalidate_live_intervals();
3038
3039 return progress;
3040 }
3041
3042 bool
3043 fs_visitor::lower_integer_multiplication()
3044 {
3045 bool progress = false;
3046
3047 /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3048 * directly, but Cherryview cannot.
3049 */
3050 if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3051 return false;
3052
3053 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3054 if (inst->opcode != BRW_OPCODE_MUL ||
3055 inst->dst.is_accumulator() ||
3056 (inst->dst.type != BRW_REGISTER_TYPE_D &&
3057 inst->dst.type != BRW_REGISTER_TYPE_UD))
3058 continue;
3059
3060 const fs_builder ibld = bld.at(block, inst);
3061
3062 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3063 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3064 * src1 are used.
3065 *
3066 * If multiplying by an immediate value that fits in 16-bits, do a
3067 * single MUL instruction with that value in the proper location.
3068 */
3069 if (inst->src[1].file == IMM &&
3070 inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3071 if (devinfo->gen < 7) {
3072 fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3073 inst->dst.type, dispatch_width);
3074 ibld.MOV(imm, inst->src[1]);
3075 ibld.MUL(inst->dst, imm, inst->src[0]);
3076 } else {
3077 ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
3078 }
3079 } else {
3080 /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3081 * do 32-bit integer multiplication in one instruction, but instead
3082 * must do a sequence (which actually calculates a 64-bit result):
3083 *
3084 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3085 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3086 * mov(8) g2<1>D acc0<8,8,1>D
3087 *
3088 * But on Gen > 6, the ability to use the second accumulator register
3089 * (acc1) for non-float data types was removed, preventing a simple
3090 * implementation in SIMD16. A 16-channel result can be calculated by
3091 * executing the three instructions twice in SIMD8, once with quarter
3092 * control of 1Q for the first eight channels and again with 2Q for
3093 * the second eight channels.
3094 *
3095 * Which accumulator register is implicitly accessed (by AccWrEnable
3096 * for instance) is determined by the quarter control. Unfortunately
3097 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3098 * implicit accumulator access by an instruction with 2Q will access
3099 * acc1 regardless of whether the data type is usable in acc1.
3100 *
3101 * Specifically, the 2Q mach(8) writes acc1, which does not exist for
3102 * integer data types.
3103 *
3104 * Since we only want the low 32-bits of the result, we can do two
3105 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3106 * adjust the high result and add them (like the mach is doing):
3107 *
3108 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3109 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3110 * shl(8) g9<1>D g8<8,8,1>D 16D
3111 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3112 *
3113 * We avoid the shl instruction by realizing that we only want to add
3114 * the low 16-bits of the "high" result to the high 16-bits of the
3115 * "low" result and using proper regioning on the add:
3116 *
3117 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3118 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3119 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3120 *
3121 * Since it does not use the (single) accumulator register, we can
3122 * schedule multi-component multiplications much better.
3123 */
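/* In other words (illustrative example, not part of the original
 * derivation): low32(a * b) == low32(a * (b & 0xffff)) +
 * (low16(a * (b >> 16)) << 16).  E.g. for a = 100000 and b = 70000,
 * a * (b & 0xffff) = 0x1A9B8600 and low16(a * (b >> 16)) << 16 =
 * 0x86A00000, which sum to 0xA13B8600, the low 32 bits of 7,000,000,000.
 */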
3124
3125 if (inst->conditional_mod && inst->dst.is_null()) {
3126 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3127 inst->dst.type, dispatch_width);
3128 }
3129 fs_reg low = inst->dst;
3130 fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3131 inst->dst.type, dispatch_width);
3132
3133 if (devinfo->gen >= 7) {
3134 fs_reg src1_0_w = inst->src[1];
3135 fs_reg src1_1_w = inst->src[1];
3136
3137 if (inst->src[1].file == IMM) {
3138 src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3139 src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3140 } else {
3141 src1_0_w.type = BRW_REGISTER_TYPE_UW;
3142 if (src1_0_w.stride != 0) {
3143 assert(src1_0_w.stride == 1);
3144 src1_0_w.stride = 2;
3145 }
3146
3147 src1_1_w.type = BRW_REGISTER_TYPE_UW;
3148 if (src1_1_w.stride != 0) {
3149 assert(src1_1_w.stride == 1);
3150 src1_1_w.stride = 2;
3151 }
3152 src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3153 }
3154 ibld.MUL(low, inst->src[0], src1_0_w);
3155 ibld.MUL(high, inst->src[0], src1_1_w);
3156 } else {
3157 fs_reg src0_0_w = inst->src[0];
3158 fs_reg src0_1_w = inst->src[0];
3159
3160 src0_0_w.type = BRW_REGISTER_TYPE_UW;
3161 if (src0_0_w.stride != 0) {
3162 assert(src0_0_w.stride == 1);
3163 src0_0_w.stride = 2;
3164 }
3165
3166 src0_1_w.type = BRW_REGISTER_TYPE_UW;
3167 if (src0_1_w.stride != 0) {
3168 assert(src0_1_w.stride == 1);
3169 src0_1_w.stride = 2;
3170 }
3171 src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3172
3173 ibld.MUL(low, src0_0_w, inst->src[1]);
3174 ibld.MUL(high, src0_1_w, inst->src[1]);
3175 }
3176
3177 fs_reg dst = inst->dst;
3178 dst.type = BRW_REGISTER_TYPE_UW;
3179 dst.subreg_offset = 2;
3180 dst.stride = 2;
3181
3182 high.type = BRW_REGISTER_TYPE_UW;
3183 high.stride = 2;
3184
3185 low.type = BRW_REGISTER_TYPE_UW;
3186 low.subreg_offset = 2;
3187 low.stride = 2;
3188
3189 ibld.ADD(dst, low, high);
3190
3191 if (inst->conditional_mod) {
3192 fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
3193 set_condmod(inst->conditional_mod,
3194 ibld.MOV(null, inst->dst));
3195 }
3196 }
3197
3198 inst->remove(block);
3199 progress = true;
3200 }
3201
3202 if (progress)
3203 invalidate_live_intervals();
3204
3205 return progress;
3206 }
3207
3208 void
3209 fs_visitor::dump_instructions()
3210 {
3211 dump_instructions(NULL);
3212 }
3213
3214 void
3215 fs_visitor::dump_instructions(const char *name)
3216 {
3217 FILE *file = stderr;
3218 if (name && geteuid() != 0) {
3219 file = fopen(name, "w");
3220 if (!file)
3221 file = stderr;
3222 }
3223
3224 if (cfg) {
3225 calculate_register_pressure();
3226 int ip = 0, max_pressure = 0;
3227 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3228 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3229 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3230 dump_instruction(inst, file);
3231 ip++;
3232 }
3233 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3234 } else {
3235 int ip = 0;
3236 foreach_in_list(backend_instruction, inst, &instructions) {
3237 fprintf(file, "%4d: ", ip++);
3238 dump_instruction(inst, file);
3239 }
3240 }
3241
3242 if (file != stderr) {
3243 fclose(file);
3244 }
3245 }
3246
3247 void
3248 fs_visitor::dump_instruction(backend_instruction *be_inst)
3249 {
3250 dump_instruction(be_inst, stderr);
3251 }
3252
3253 void
3254 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3255 {
3256 fs_inst *inst = (fs_inst *)be_inst;
3257
3258 if (inst->predicate) {
3259 fprintf(file, "(%cf0.%d) ",
3260 inst->predicate_inverse ? '-' : '+',
3261 inst->flag_subreg);
3262 }
3263
3264 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3265 if (inst->saturate)
3266 fprintf(file, ".sat");
3267 if (inst->conditional_mod) {
3268 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3269 if (!inst->predicate &&
3270 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3271 inst->opcode != BRW_OPCODE_IF &&
3272 inst->opcode != BRW_OPCODE_WHILE))) {
3273 fprintf(file, ".f0.%d", inst->flag_subreg);
3274 }
3275 }
3276 fprintf(file, "(%d) ", inst->exec_size);
3277
3278 if (inst->mlen) {
3279 fprintf(file, "(mlen: %d) ", inst->mlen);
3280 }
3281
3282 switch (inst->dst.file) {
3283 case GRF:
3284 fprintf(file, "vgrf%d", inst->dst.reg);
3285 if (inst->dst.width != dispatch_width)
3286 fprintf(file, "@%d", inst->dst.width);
3287 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3288 inst->dst.subreg_offset)
3289 fprintf(file, "+%d.%d",
3290 inst->dst.reg_offset, inst->dst.subreg_offset);
3291 break;
3292 case MRF:
3293 fprintf(file, "m%d", inst->dst.reg);
3294 break;
3295 case BAD_FILE:
3296 fprintf(file, "(null)");
3297 break;
3298 case UNIFORM:
3299 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3300 break;
3301 case ATTR:
3302 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3303 break;
3304 case HW_REG:
3305 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3306 switch (inst->dst.fixed_hw_reg.nr) {
3307 case BRW_ARF_NULL:
3308 fprintf(file, "null");
3309 break;
3310 case BRW_ARF_ADDRESS:
3311 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3312 break;
3313 case BRW_ARF_ACCUMULATOR:
3314 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3315 break;
3316 case BRW_ARF_FLAG:
3317 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3318 inst->dst.fixed_hw_reg.subnr);
3319 break;
3320 default:
3321 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3322 inst->dst.fixed_hw_reg.subnr);
3323 break;
3324 }
3325 } else {
3326 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3327 }
3328 if (inst->dst.fixed_hw_reg.subnr)
3329 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3330 break;
3331 default:
3332 fprintf(file, "???");
3333 break;
3334 }
3335 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3336
3337 for (int i = 0; i < inst->sources; i++) {
3338 if (inst->src[i].negate)
3339 fprintf(file, "-");
3340 if (inst->src[i].abs)
3341 fprintf(file, "|");
3342 switch (inst->src[i].file) {
3343 case GRF:
3344 fprintf(file, "vgrf%d", inst->src[i].reg);
3345 if (inst->src[i].width != dispatch_width)
3346 fprintf(file, "@%d", inst->src[i].width);
3347 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3348 inst->src[i].subreg_offset)
3349 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3350 inst->src[i].subreg_offset);
3351 break;
3352 case MRF:
3353 fprintf(file, "***m%d***", inst->src[i].reg);
3354 break;
3355 case ATTR:
3356 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3357 break;
3358 case UNIFORM:
3359 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3360 if (inst->src[i].reladdr) {
3361 fprintf(file, "+reladdr");
3362 } else if (inst->src[i].subreg_offset) {
3363 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3364 inst->src[i].subreg_offset);
3365 }
3366 break;
3367 case BAD_FILE:
3368 fprintf(file, "(null)");
3369 break;
3370 case IMM:
3371 switch (inst->src[i].type) {
3372 case BRW_REGISTER_TYPE_F:
3373 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3374 break;
3375 case BRW_REGISTER_TYPE_W:
3376 case BRW_REGISTER_TYPE_D:
3377 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3378 break;
3379 case BRW_REGISTER_TYPE_UW:
3380 case BRW_REGISTER_TYPE_UD:
3381 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3382 break;
3383 case BRW_REGISTER_TYPE_VF:
3384 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3385 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3386 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3387 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3388 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3389 break;
3390 default:
3391 fprintf(file, "???");
3392 break;
3393 }
3394 break;
3395 case HW_REG:
3396 if (inst->src[i].fixed_hw_reg.negate)
3397 fprintf(file, "-");
3398 if (inst->src[i].fixed_hw_reg.abs)
3399 fprintf(file, "|");
3400 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3401 switch (inst->src[i].fixed_hw_reg.nr) {
3402 case BRW_ARF_NULL:
3403 fprintf(file, "null");
3404 break;
3405 case BRW_ARF_ADDRESS:
3406 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3407 break;
3408 case BRW_ARF_ACCUMULATOR:
3409 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3410 break;
3411 case BRW_ARF_FLAG:
3412 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3413 inst->src[i].fixed_hw_reg.subnr);
3414 break;
3415 default:
3416 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3417 inst->src[i].fixed_hw_reg.subnr);
3418 break;
3419 }
3420 } else {
3421 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3422 }
3423 if (inst->src[i].fixed_hw_reg.subnr)
3424 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3425 if (inst->src[i].fixed_hw_reg.abs)
3426 fprintf(file, "|");
3427 break;
3428 default:
3429 fprintf(file, "???");
3430 break;
3431 }
3432 if (inst->src[i].abs)
3433 fprintf(file, "|");
3434
3435 if (inst->src[i].file != IMM) {
3436 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3437 }
3438
3439 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3440 fprintf(file, ", ");
3441 }
3442
3443 fprintf(file, " ");
3444
3445 if (dispatch_width == 16 && inst->exec_size == 8) {
3446 if (inst->force_sechalf)
3447 fprintf(file, "2ndhalf ");
3448 else
3449 fprintf(file, "1sthalf ");
3450 }
3451
3452 fprintf(file, "\n");
3453 }
3454
3455 /**
3456 * Possibly returns an instruction that set up @param reg.
3457 *
3458 * Sometimes we want to take the result of some expression/variable
3459 * dereference tree and rewrite the instruction generating the result
3460 * of the tree. When processing the tree, we know that the
3461 * instructions generated are all writing temporaries that are dead
3462 * outside of this tree. So, if we have some instructions that write
3463 * a temporary, we're free to point that temp write somewhere else.
3464 *
3465 * Note that this doesn't guarantee that the returned instruction only
3466 * generated reg -- it might be the size=4 destination of a texture instruction.
3467 */
3468 fs_inst *
3469 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3470 fs_inst *end,
3471 const fs_reg &reg)
3472 {
3473 if (end == start ||
3474 end->is_partial_write() ||
3475 reg.reladdr ||
3476 !reg.equals(end->dst)) {
3477 return NULL;
3478 } else {
3479 return end;
3480 }
3481 }
3482
3483 void
3484 fs_visitor::setup_payload_gen6()
3485 {
3486 bool uses_depth =
3487 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3488 unsigned barycentric_interp_modes =
3489 (stage == MESA_SHADER_FRAGMENT) ?
3490 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3491
3492 assert(devinfo->gen >= 6);
3493
3494 /* R0-1: masks, pixel X/Y coordinates. */
3495 payload.num_regs = 2;
3496 /* R2: only for 32-pixel dispatch. */
3497
3498 /* R3-26: barycentric interpolation coordinates. These appear in the
3499 * same order that they appear in the brw_wm_barycentric_interp_mode
3500 * enum. Each set of coordinates occupies 2 registers if dispatch width
3501 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3502 * appear if they were enabled using the "Barycentric Interpolation
3503 * Mode" bits in WM_STATE.
3504 */
3505 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3506 if (barycentric_interp_modes & (1 << i)) {
3507 payload.barycentric_coord_reg[i] = payload.num_regs;
3508 payload.num_regs += 2;
3509 if (dispatch_width == 16) {
3510 payload.num_regs += 2;
3511 }
3512 }
3513 }
3514
3515 /* R27: interpolated depth if uses source depth */
3516 if (uses_depth) {
3517 payload.source_depth_reg = payload.num_regs;
3518 payload.num_regs++;
3519 if (dispatch_width == 16) {
3520 /* R28: interpolated depth if not SIMD8. */
3521 payload.num_regs++;
3522 }
3523 }
3524 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3525 if (uses_depth) {
3526 payload.source_w_reg = payload.num_regs;
3527 payload.num_regs++;
3528 if (dispatch_width == 16) {
3529 /* R30: interpolated W if not SIMD8. */
3530 payload.num_regs++;
3531 }
3532 }
3533
3534 if (stage == MESA_SHADER_FRAGMENT) {
3535 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3536 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3537 prog_data->uses_pos_offset = key->compute_pos_offset;
3538 /* R31: MSAA position offsets. */
3539 if (prog_data->uses_pos_offset) {
3540 payload.sample_pos_reg = payload.num_regs;
3541 payload.num_regs++;
3542 }
3543 }
3544
3545 /* R32: MSAA input coverage mask */
3546 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3547 assert(devinfo->gen >= 7);
3548 payload.sample_mask_in_reg = payload.num_regs;
3549 payload.num_regs++;
3550 if (dispatch_width == 16) {
3551 /* R33: input coverage mask if not SIMD8. */
3552 payload.num_regs++;
3553 }
3554 }
3555
3556 /* R34-: bary for 32-pixel. */
3557 /* R58-59: interp W for 32-pixel. */
3558
3559 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3560 source_depth_to_render_target = true;
3561 }
3562 }
3563
3564 void
3565 fs_visitor::setup_vs_payload()
3566 {
3567 /* R0: thread header, R1: urb handles */
3568 payload.num_regs = 2;
3569 }
3570
3571 void
3572 fs_visitor::setup_cs_payload()
3573 {
3574 assert(devinfo->gen >= 7);
3575
3576 payload.num_regs = 1;
3577 }
3578
3579 void
3580 fs_visitor::assign_binding_table_offsets()
3581 {
3582 assert(stage == MESA_SHADER_FRAGMENT);
3583 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3584 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3585 uint32_t next_binding_table_offset = 0;
3586
3587 /* If there are no color regions, we still perform an FB write to a null
3588 * renderbuffer, which we place at surface index 0.
3589 */
3590 prog_data->binding_table.render_target_start = next_binding_table_offset;
3591 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3592
3593 assign_common_binding_table_offsets(next_binding_table_offset);
3594 }
3595
3596 void
3597 fs_visitor::calculate_register_pressure()
3598 {
3599 invalidate_live_intervals();
3600 calculate_live_intervals();
3601
3602 unsigned num_instructions = 0;
3603 foreach_block(block, cfg)
3604 num_instructions += block->instructions.length();
3605
3606 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3607
3608 for (unsigned reg = 0; reg < alloc.count; reg++) {
3609 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3610 regs_live_at_ip[ip] += alloc.sizes[reg];
3611 }
3612 }
3613
3614 void
3615 fs_visitor::optimize()
3616 {
3617 /* bld is the common builder object pointing at the end of the program we
3618 * used to translate it into i965 IR. For the optimization and lowering
3619 * passes coming next, any code added after the end of the program without
3620 * having explicitly called fs_builder::at() clearly points at a mistake.
3621 * Ideally optimization passes wouldn't be part of the visitor so they
3622 * wouldn't have access to bld at all, but they do, so just in case some
3623 * pass forgets to ask for a location explicitly, set it to NULL here to
3624 * make it trip.
3625 */
3626 bld = bld.at(NULL, NULL);
3627
3628 split_virtual_grfs();
3629
3630 move_uniform_array_access_to_pull_constants();
3631 assign_constant_locations();
3632 demote_pull_constants();
3633
3634 #define OPT(pass, args...) ({ \
3635 pass_num++; \
3636 bool this_progress = pass(args); \
3637 \
3638 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3639 char filename[64]; \
3640 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3641 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3642 \
3643 backend_shader::dump_instructions(filename); \
3644 } \
3645 \
3646 progress = progress || this_progress; \
3647 this_progress; \
3648 })
3649
3650 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3651 char filename[64];
3652 snprintf(filename, 64, "%s%d-%04d-00-start",
3653 stage_abbrev, dispatch_width,
3654 shader_prog ? shader_prog->Name : 0);
3655
3656 backend_shader::dump_instructions(filename);
3657 }
3658
3659 bool progress;
3660 int iteration = 0;
3661 int pass_num = 0;
3662 do {
3663 progress = false;
3664 pass_num = 0;
3665 iteration++;
3666
3667 OPT(remove_duplicate_mrf_writes);
3668
3669 OPT(opt_algebraic);
3670 OPT(opt_cse);
3671 OPT(opt_copy_propagate);
3672 OPT(opt_peephole_predicated_break);
3673 OPT(opt_cmod_propagation);
3674 OPT(dead_code_eliminate);
3675 OPT(opt_peephole_sel);
3676 OPT(dead_control_flow_eliminate, this);
3677 OPT(opt_register_renaming);
3678 OPT(opt_redundant_discard_jumps);
3679 OPT(opt_saturate_propagation);
3680 OPT(opt_zero_samples);
3681 OPT(register_coalesce);
3682 OPT(compute_to_mrf);
3683 OPT(eliminate_find_live_channel);
3684
3685 OPT(compact_virtual_grfs);
3686 } while (progress);
3687
3688 pass_num = 0;
3689
3690 OPT(opt_sampler_eot);
3691
3692 if (OPT(lower_load_payload)) {
3693 split_virtual_grfs();
3694 OPT(register_coalesce);
3695 OPT(compute_to_mrf);
3696 OPT(dead_code_eliminate);
3697 }
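   /* The extra passes above clean up after lower_load_payload(): lowering a
    * LOAD_PAYLOAD leaves behind a series of plain MOVs, which register
    * coalescing, compute-to-MRF and dead code elimination can then shrink.
    */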
3698
3699 OPT(opt_combine_constants);
3700 OPT(lower_integer_multiplication);
3701
3702 lower_uniform_pull_constant_loads();
3703 }
3704
3705 /**
3706  * Three-source instructions must have a GRF/MRF destination register.
3707 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3708 */
3709 void
3710 fs_visitor::fixup_3src_null_dest()
3711 {
3712 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3713 if (inst->is_3src() && inst->dst.is_null()) {
3714 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3715 inst->dst.type);
3716 }
3717 }
3718 }
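/* A hypothetical instance of what fixup_3src_null_dest() handles: a MAD kept
 * only for its conditional modifier, e.g.
 *
 *    mad.ge.f0(8)  null  g4  g5  g6
 *
 * is three-source but has had its destination nulled out, so it is given a
 * freshly allocated temporary GRF (dispatch_width / 8 registers wide) to
 * write instead.
 */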
3719
3720 void
3721 fs_visitor::allocate_registers()
3722 {
3723 bool allocated_without_spills;
3724
3725 static const enum instruction_scheduler_mode pre_modes[] = {
3726 SCHEDULE_PRE,
3727 SCHEDULE_PRE_NON_LIFO,
3728 SCHEDULE_PRE_LIFO,
3729 };
3730
3731 /* Try each scheduling heuristic to see if it can successfully register
3732 * allocate without spilling. They should be ordered by decreasing
3733 * performance but increasing likelihood of allocating.
3734 */
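   /* In that ordering SCHEDULE_PRE is expected to give the best schedule but
    * presumably the highest register pressure, and SCHEDULE_PRE_LIFO the
    * opposite, with SCHEDULE_PRE_NON_LIFO in between.
    */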
3735 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3736 schedule_instructions(pre_modes[i]);
3737
3738 if (0) {
3739 assign_regs_trivial();
3740 allocated_without_spills = true;
3741 } else {
3742 allocated_without_spills = assign_regs(false);
3743 }
3744 if (allocated_without_spills)
3745 break;
3746 }
3747
3748 if (!allocated_without_spills) {
3749 /* We assume that any spilling is worse than just dropping back to
3750 * SIMD8. There's probably actually some intermediate point where
3751 * SIMD16 with a couple of spills is still better.
3752 */
3753 if (dispatch_width == 16) {
3754 fail("Failure to register allocate. Reduce number of "
3755 "live scalar values to avoid this.");
3756 } else {
3757 compiler->shader_perf_log(log_data,
3758 "%s shader triggered register spilling. "
3759 "Try reducing the number of live scalar "
3760 "values to improve performance.\n",
3761 stage_name);
3762 }
3763
3764 /* Since we're out of heuristics, just go spill registers until we
3765 * get an allocation.
3766 */
3767 while (!assign_regs(true)) {
3768 if (failed)
3769 break;
3770 }
3771 }
3772
3773 /* This must come after all optimization and register allocation, since
3774 * it inserts dead code that happens to have side effects, and it does
3775 * so based on the actual physical registers in use.
3776 */
3777 insert_gen4_send_dependency_workarounds();
3778
3779 if (failed)
3780 return;
3781
3782 if (!allocated_without_spills)
3783 schedule_instructions(SCHEDULE_POST);
3784
3785 if (last_scratch > 0)
3786 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3787 }
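/* The run_vs(), run_fs() and run_cs() entry points below all follow the same
 * outline: set up the payload and binding tables, emit i965 IR from NIR,
 * build the CFG, run optimize(), then assign the CURB/URB setup that applies
 * to the stage, fix up three-source destinations and call
 * allocate_registers().
 */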
3788
3789 bool
3790 fs_visitor::run_vs(gl_clip_plane *clip_planes)
3791 {
3792 assert(stage == MESA_SHADER_VERTEX);
3793
3794 assign_common_binding_table_offsets(0);
3795 setup_vs_payload();
3796
3797 if (shader_time_index >= 0)
3798 emit_shader_time_begin();
3799
3800 emit_nir_code();
3801
3802 if (failed)
3803 return false;
3804
3805 compute_clip_distance(clip_planes);
3806
3807 emit_urb_writes();
3808
3809 if (shader_time_index >= 0)
3810 emit_shader_time_end();
3811
3812 calculate_cfg();
3813
3814 optimize();
3815
3816 assign_curb_setup();
3817 assign_vs_urb_setup();
3818
3819 fixup_3src_null_dest();
3820 allocate_registers();
3821
3822 return !failed;
3823 }
3824
3825 bool
3826 fs_visitor::run_fs(bool do_rep_send)
3827 {
3828 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3829 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3830
3831 assert(stage == MESA_SHADER_FRAGMENT);
3832
3833 sanity_param_count = prog->Parameters->NumParameters;
3834
3835 assign_binding_table_offsets();
3836
3837 if (devinfo->gen >= 6)
3838 setup_payload_gen6();
3839 else
3840 setup_payload_gen4();
3841
3842 if (0) {
3843 emit_dummy_fs();
3844 } else if (do_rep_send) {
3845 assert(dispatch_width == 16);
3846 emit_repclear_shader();
3847 } else {
3848 if (shader_time_index >= 0)
3849 emit_shader_time_begin();
3850
3851 calculate_urb_setup();
3852 if (prog->InputsRead > 0) {
3853 if (devinfo->gen < 6)
3854 emit_interpolation_setup_gen4();
3855 else
3856 emit_interpolation_setup_gen6();
3857 }
3858
3859 /* We handle discards by keeping track of the still-live pixels in f0.1.
3860 * Initialize it with the dispatched pixels.
3861 */
3862 if (wm_prog_data->uses_kill) {
3863 fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3864 discard_init->flag_subreg = 1;
3865 }
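      /* From here on f0.1 tracks which of the dispatched pixels are still
       * alive; a discard clears the corresponding bits so that killed pixels
       * are masked out by the time the framebuffer writes emitted below
       * execute.
       */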
3866
3867       /* Generate FS IR for main().  (The visitor only descends into
3868        * functions called "main".)
3869 */
3870 emit_nir_code();
3871
3872 if (failed)
3873 return false;
3874
3875 if (wm_prog_data->uses_kill)
3876 bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
3877
3878 if (wm_key->alpha_test_func)
3879 emit_alpha_test();
3880
3881 emit_fb_writes();
3882
3883 if (shader_time_index >= 0)
3884 emit_shader_time_end();
3885
3886 calculate_cfg();
3887
3888 optimize();
3889
3890 assign_curb_setup();
3891 assign_urb_setup();
3892
3893 fixup_3src_null_dest();
3894 allocate_registers();
3895
3896 if (failed)
3897 return false;
3898 }
3899
3900 if (dispatch_width == 8)
3901 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3902 else
3903 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3904
3905 /* If any state parameters were appended, then ParameterValues could have
3906 * been realloced, in which case the driver uniform storage set up by
3907 * _mesa_associate_uniform_storage() would point to freed memory. Make
3908 * sure that didn't happen.
3909 */
3910 assert(sanity_param_count == prog->Parameters->NumParameters);
3911
3912 return !failed;
3913 }
3914
3915 bool
3916 fs_visitor::run_cs()
3917 {
3918 assert(stage == MESA_SHADER_COMPUTE);
3919 assert(shader);
3920
3921 sanity_param_count = prog->Parameters->NumParameters;
3922
3923 assign_common_binding_table_offsets(0);
3924
3925 setup_cs_payload();
3926
3927 if (shader_time_index >= 0)
3928 emit_shader_time_begin();
3929
3930 emit_nir_code();
3931
3932 if (failed)
3933 return false;
3934
3935 emit_cs_terminate();
3936
3937 if (shader_time_index >= 0)
3938 emit_shader_time_end();
3939
3940 calculate_cfg();
3941
3942 optimize();
3943
3944 assign_curb_setup();
3945
3946 fixup_3src_null_dest();
3947 allocate_registers();
3948
3949 if (failed)
3950 return false;
3951
3952 /* If any state parameters were appended, then ParameterValues could have
3953 * been realloced, in which case the driver uniform storage set up by
3954 * _mesa_associate_uniform_storage() would point to freed memory. Make
3955 * sure that didn't happen.
3956 */
3957 assert(sanity_param_count == prog->Parameters->NumParameters);
3958
3959 return !failed;
3960 }
3961
3962 const unsigned *
3963 brw_wm_fs_emit(struct brw_context *brw,
3964 void *mem_ctx,
3965 const struct brw_wm_prog_key *key,
3966 struct brw_wm_prog_data *prog_data,
3967 struct gl_fragment_program *fp,
3968 struct gl_shader_program *prog,
3969 unsigned *final_assembly_size)
3970 {
3971 bool start_busy = false;
3972 double start_time = 0;
3973
3974 if (unlikely(brw->perf_debug)) {
3975 start_busy = (brw->batch.last_bo &&
3976 drm_intel_bo_busy(brw->batch.last_bo));
3977 start_time = get_time();
3978 }
3979
3980 struct brw_shader *shader = NULL;
3981 if (prog)
3982 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3983
3984 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3985 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3986
3987 int st_index8 = -1, st_index16 = -1;
3988 if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
3989 st_index8 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS8);
3990 st_index16 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS16);
3991 }
3992
3993 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3994 */
3995 fs_visitor v(brw->intelScreen->compiler, brw,
3996 mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
3997 prog, &fp->Base, 8, st_index8);
3998 if (!v.run_fs(false /* do_rep_send */)) {
3999 if (prog) {
4000 prog->LinkStatus = false;
4001 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4002 }
4003
4004 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4005 v.fail_msg);
4006
4007 return NULL;
4008 }
4009
4010 cfg_t *simd16_cfg = NULL;
4011 fs_visitor v2(brw->intelScreen->compiler, brw,
4012 mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4013 prog, &fp->Base, 16, st_index16);
4014 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4015 if (!v.simd16_unsupported) {
4016 /* Try a SIMD16 compile */
4017 v2.import_uniforms(&v);
4018 if (!v2.run_fs(brw->use_rep_send)) {
4019 perf_debug("SIMD16 shader failed to compile: %s", v2.fail_msg);
4020 } else {
4021 simd16_cfg = v2.cfg;
4022 }
4023 }
4024 }
4025
4026 cfg_t *simd8_cfg;
4027 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4028 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4029 simd8_cfg = NULL;
4030 prog_data->no_8 = true;
4031 } else {
4032 simd8_cfg = v.cfg;
4033 prog_data->no_8 = false;
4034 }
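   /* Whatever the debug flags, at least one of simd8_cfg and simd16_cfg is
    * non-NULL here: simd8_cfg is only dropped when a SIMD16 program was
    * successfully compiled above.
    */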
4035
4036 fs_generator g(brw->intelScreen->compiler, brw,
4037 mem_ctx, (void *) key, &prog_data->base,
4038 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4039
4040 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4041 char *name;
4042 if (prog)
4043 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4044 prog->Label ? prog->Label : "unnamed",
4045 prog->Name);
4046 else
4047 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4048
4049 g.enable_debug(name);
4050 }
4051
4052 if (simd8_cfg)
4053 g.generate_code(simd8_cfg, 8);
4054 if (simd16_cfg)
4055 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4056
4057 if (unlikely(brw->perf_debug) && shader) {
4058 if (shader->compiled_once)
4059 brw_wm_debug_recompile(brw, prog, key);
4060 shader->compiled_once = true;
4061
4062 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4063 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4064 (get_time() - start_time) * 1000);
4065 }
4066 }
4067
4068 return g.get_assembly(final_assembly_size);
4069 }
4070
4071 extern "C" bool
4072 brw_fs_precompile(struct gl_context *ctx,
4073 struct gl_shader_program *shader_prog,
4074 struct gl_program *prog)
4075 {
4076 struct brw_context *brw = brw_context(ctx);
4077 struct brw_wm_prog_key key;
4078
4079 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4080 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4081 bool program_uses_dfdy = fp->UsesDFdy;
4082
4083 memset(&key, 0, sizeof(key));
4084
4085 if (brw->gen < 6) {
4086 if (fp->UsesKill)
4087 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4088
4089 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4090 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4091
4092 /* Just assume depth testing. */
4093 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4094 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4095 }
4096
4097 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4098 BRW_FS_VARYING_INPUT_MASK) > 16)
4099 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4100
4101 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4102
4103 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4104 key.drawable_height = ctx->DrawBuffer->Height;
4105 }
4106
4107 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4108 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4109 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
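   /* E.g. a hypothetical shader writing two color outputs plus gl_FragDepth
    * yields nr_color_regions == 2, since depth and sample mask are excluded
    * above.
    */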
4110
4111 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4112 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4113 key.nr_color_regions > 1;
4114 }
4115
4116 key.program_string_id = bfp->id;
4117
4118 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4119 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4120
4121 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4122
4123 brw->wm.base.prog_offset = old_prog_offset;
4124 brw->wm.prog_data = old_prog_data;
4125
4126 return success;
4127 }
4128
4129 void
4130 brw_setup_tex_for_precompile(struct brw_context *brw,
4131 struct brw_sampler_prog_key_data *tex,
4132 struct gl_program *prog)
4133 {
4134 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4135 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4136 for (unsigned i = 0; i < sampler_count; i++) {
4137 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4138 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4139 tex->swizzles[i] =
4140 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4141 } else {
4142 /* Color sampler: assume no swizzling. */
4143 tex->swizzles[i] = SWIZZLE_XYZW;
4144 }
4145 }
4146 }