i965/fs: Remove the width field from fs_reg
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 using namespace brw;
53
54 void
55 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
56 const fs_reg *src, unsigned sources)
57 {
58 memset(this, 0, sizeof(*this));
59
60 this->src = new fs_reg[MAX2(sources, 3)];
61 for (unsigned i = 0; i < sources; i++)
62 this->src[i] = src[i];
63
64 this->opcode = opcode;
65 this->dst = dst;
66 this->sources = sources;
67 this->exec_size = exec_size;
68
69 assert(dst.file != IMM && dst.file != UNIFORM);
70
71 assert(this->exec_size != 0);
72
73 this->conditional_mod = BRW_CONDITIONAL_NONE;
74
75 /* This will be the case for almost all instructions. */
76 switch (dst.file) {
77 case GRF:
78 case HW_REG:
79 case MRF:
80 case ATTR:
81 this->regs_written =
82 DIV_ROUND_UP(MAX2(exec_size * dst.stride, 1) * type_sz(dst.type), 32);
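         /* e.g. a SIMD16 float destination with stride 1 covers
          * 16 * 1 * 4 = 64 bytes, i.e. two 32-byte GRFs.
          */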
83 break;
84 case BAD_FILE:
85 this->regs_written = 0;
86 break;
87 case IMM:
88 case UNIFORM:
89 unreachable("Invalid destination register file");
90 default:
91 unreachable("Invalid register file");
92 }
93
94 this->writes_accumulator = false;
95 }
96
97 fs_inst::fs_inst()
98 {
99 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
100 }
101
102 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
103 {
104 init(opcode, exec_size, reg_undef, NULL, 0);
105 }
106
107 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
108 {
109 init(opcode, exec_size, dst, NULL, 0);
110 }
111
112 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
113 const fs_reg &src0)
114 {
115 const fs_reg src[1] = { src0 };
116 init(opcode, exec_size, dst, src, 1);
117 }
118
119 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
120 const fs_reg &src0, const fs_reg &src1)
121 {
122 const fs_reg src[2] = { src0, src1 };
123 init(opcode, exec_size, dst, src, 2);
124 }
125
126 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
127 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
128 {
129 const fs_reg src[3] = { src0, src1, src2 };
130 init(opcode, exec_size, dst, src, 3);
131 }
132
133 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
134 const fs_reg src[], unsigned sources)
135 {
136 init(opcode, exec_width, dst, src, sources);
137 }
138
139 fs_inst::fs_inst(const fs_inst &that)
140 {
141 memcpy(this, &that, sizeof(that));
142
143 this->src = new fs_reg[MAX2(that.sources, 3)];
144
145 for (unsigned i = 0; i < that.sources; i++)
146 this->src[i] = that.src[i];
147 }
148
149 fs_inst::~fs_inst()
150 {
151 delete[] this->src;
152 }
153
154 void
155 fs_inst::resize_sources(uint8_t num_sources)
156 {
157 if (this->sources != num_sources) {
158 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
159
160 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
161 src[i] = this->src[i];
162
163 delete[] this->src;
164 this->src = src;
165 this->sources = num_sources;
166 }
167 }
168
169 void
170 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
171 const fs_reg &dst,
172 const fs_reg &surf_index,
173 const fs_reg &varying_offset,
174 uint32_t const_offset)
175 {
176 /* We have our constant surface use a pitch of 4 bytes, so our index can
177 * be any component of a vector, and then we load 4 contiguous
178 * components starting from that.
179 *
180    * We break down the const_offset into a portion added to the variable
181    * offset and a portion done using reg_offset, which means that if you
182 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
183 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
184 * CSE can later notice that those loads are all the same and eliminate
185 * the redundant ones.
186 */
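   /* For illustration, with a hypothetical const_offset of 6: the aligned
    * part (6 & ~3 == 4) is folded into vec4_offset below, while the
    * remaining 6 & 3 == 2 picks which of the four loaded components the
    * final MOV at the bottom of this function reads back.
    */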
187 fs_reg vec4_offset = vgrf(glsl_type::int_type);
188 bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
189
190 int scale = 1;
191 if (devinfo->gen == 4 && bld.dispatch_width() == 8) {
192 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
193 * u, v, r) as parameters, or we can just use the SIMD16 message
194 * consisting of (header, u). We choose the second, at the cost of a
195 * longer return length.
196 */
197 scale = 2;
198 }
199
200 enum opcode op;
201 if (devinfo->gen >= 7)
202 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
203 else
204 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
205
206 int regs_written = 4 * (bld.dispatch_width() / 8) * scale;
207 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written), dst.type);
208 fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
209 inst->regs_written = regs_written;
210
211 if (devinfo->gen < 7) {
212 inst->base_mrf = 13;
213 inst->header_size = 1;
214 if (devinfo->gen == 4)
215 inst->mlen = 3;
216 else
217 inst->mlen = 1 + bld.dispatch_width() / 8;
218 }
219
220 bld.MOV(dst, offset(vec4_result, bld, (const_offset & 3) * scale));
221 }
222
223 /**
224 * A helper for MOV generation for fixing up broken hardware SEND dependency
225 * handling.
226 */
227 void
228 fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
229 {
230    /* The caller always wants this uncompressed, to emit the minimal extra
231     * dependencies and to avoid having to deal with aligning its regs to 2.
232 */
233 const fs_builder ubld = bld.annotate("send dependency resolve")
234 .half(0);
235
236 ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
237 }
238
239 bool
240 fs_inst::equals(fs_inst *inst) const
241 {
242 return (opcode == inst->opcode &&
243 dst.equals(inst->dst) &&
244 src[0].equals(inst->src[0]) &&
245 src[1].equals(inst->src[1]) &&
246 src[2].equals(inst->src[2]) &&
247 saturate == inst->saturate &&
248 predicate == inst->predicate &&
249 conditional_mod == inst->conditional_mod &&
250 mlen == inst->mlen &&
251 base_mrf == inst->base_mrf &&
252 target == inst->target &&
253 eot == inst->eot &&
254 header_size == inst->header_size &&
255 shadow_compare == inst->shadow_compare &&
256 exec_size == inst->exec_size &&
257 offset == inst->offset);
258 }
259
260 bool
261 fs_inst::overwrites_reg(const fs_reg &reg) const
262 {
263 return reg.in_range(dst, regs_written);
264 }
265
266 bool
267 fs_inst::is_send_from_grf() const
268 {
269 switch (opcode) {
270 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
271 case SHADER_OPCODE_SHADER_TIME_ADD:
272 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
273 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
274 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
275 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
276 case SHADER_OPCODE_UNTYPED_ATOMIC:
277 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
278 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
279 case SHADER_OPCODE_TYPED_ATOMIC:
280 case SHADER_OPCODE_TYPED_SURFACE_READ:
281 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
282 case SHADER_OPCODE_URB_WRITE_SIMD8:
283 return true;
284 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
285 return src[1].file == GRF;
286 case FS_OPCODE_FB_WRITE:
287 return src[0].file == GRF;
288 default:
289 if (is_tex())
290 return src[0].file == GRF;
291
292 return false;
293 }
294 }
295
296 bool
297 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
298 {
299 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
300 return false;
301
302 fs_reg reg = this->src[0];
303 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
304 return false;
305
306 if (grf_alloc.sizes[reg.reg] != this->regs_written)
307 return false;
308
309 for (int i = 0; i < this->sources; i++) {
310 reg.type = this->src[i].type;
311 if (!this->src[i].equals(reg))
312 return false;
313
314 if (i < this->header_size) {
315 reg.reg_offset += 1;
316 } else {
317 reg.reg_offset += this->exec_size / 8;
318 }
319 }
320
321 return true;
322 }
323
324 bool
325 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
326 {
327 if (devinfo->gen == 6 && is_math())
328 return false;
329
330 if (is_send_from_grf())
331 return false;
332
333 if (!backend_instruction::can_do_source_mods())
334 return false;
335
336 return true;
337 }
338
339 bool
340 fs_inst::has_side_effects() const
341 {
342 return this->eot || backend_instruction::has_side_effects();
343 }
344
345 void
346 fs_reg::init()
347 {
348 memset(this, 0, sizeof(*this));
349 stride = 1;
350 }
351
352 /** Generic unset register constructor. */
353 fs_reg::fs_reg()
354 {
355 init();
356 this->file = BAD_FILE;
357 }
358
359 /** Immediate value constructor. */
360 fs_reg::fs_reg(float f)
361 {
362 init();
363 this->file = IMM;
364 this->type = BRW_REGISTER_TYPE_F;
365 this->fixed_hw_reg.dw1.f = f;
366 }
367
368 /** Immediate value constructor. */
369 fs_reg::fs_reg(int32_t i)
370 {
371 init();
372 this->file = IMM;
373 this->type = BRW_REGISTER_TYPE_D;
374 this->fixed_hw_reg.dw1.d = i;
375 }
376
377 /** Immediate value constructor. */
378 fs_reg::fs_reg(uint32_t u)
379 {
380 init();
381 this->file = IMM;
382 this->type = BRW_REGISTER_TYPE_UD;
383 this->fixed_hw_reg.dw1.ud = u;
384 }
385
386 /** Vector float immediate value constructor. */
387 fs_reg::fs_reg(uint8_t vf[4])
388 {
389 init();
390 this->file = IMM;
391 this->type = BRW_REGISTER_TYPE_VF;
392 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
393 }
394
395 /** Vector float immediate value constructor. */
396 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
397 {
398 init();
399 this->file = IMM;
400 this->type = BRW_REGISTER_TYPE_VF;
401 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
402 (vf1 << 8) |
403 (vf2 << 16) |
404 (vf3 << 24);
405 }
406
407 /** Fixed brw_reg. */
408 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
409 {
410 init();
411 this->file = HW_REG;
412 this->fixed_hw_reg = fixed_hw_reg;
413 this->type = fixed_hw_reg.type;
414 }
415
416 bool
417 fs_reg::equals(const fs_reg &r) const
418 {
419 return (file == r.file &&
420 reg == r.reg &&
421 reg_offset == r.reg_offset &&
422 subreg_offset == r.subreg_offset &&
423 type == r.type &&
424 negate == r.negate &&
425 abs == r.abs &&
426 !reladdr && !r.reladdr &&
427 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
428 stride == r.stride);
429 }
430
431 fs_reg &
432 fs_reg::set_smear(unsigned subreg)
433 {
434 assert(file != HW_REG && file != IMM);
435 subreg_offset = subreg * type_sz(type);
436 stride = 0;
437 return *this;
438 }
439
440 bool
441 fs_reg::is_contiguous() const
442 {
443 return stride == 1;
444 }
445
446 int
447 fs_visitor::type_size(const struct glsl_type *type)
448 {
449 unsigned int size, i;
450
451 switch (type->base_type) {
452 case GLSL_TYPE_UINT:
453 case GLSL_TYPE_INT:
454 case GLSL_TYPE_FLOAT:
455 case GLSL_TYPE_BOOL:
456 return type->components();
457 case GLSL_TYPE_ARRAY:
458 return type_size(type->fields.array) * type->length;
459 case GLSL_TYPE_STRUCT:
460 size = 0;
461 for (i = 0; i < type->length; i++) {
462 size += type_size(type->fields.structure[i].type);
463 }
464 return size;
465 case GLSL_TYPE_SAMPLER:
466 /* Samplers take up no register space, since they're baked in at
467 * link time.
468 */
469 return 0;
470 case GLSL_TYPE_ATOMIC_UINT:
471 return 0;
472 case GLSL_TYPE_IMAGE:
473 case GLSL_TYPE_VOID:
474 case GLSL_TYPE_ERROR:
475 case GLSL_TYPE_INTERFACE:
476 case GLSL_TYPE_DOUBLE:
477 unreachable("not reached");
478 }
479
480 return 0;
481 }
482
483 /**
484 * Create a MOV to read the timestamp register.
485 *
486 * The caller is responsible for emitting the MOV. The return value is
487 * the destination of the MOV, with extra parameters set.
488 */
489 fs_reg
490 fs_visitor::get_timestamp(const fs_builder &bld)
491 {
492 assert(devinfo->gen >= 7);
493
494 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
495 BRW_ARF_TIMESTAMP,
496 0),
497 BRW_REGISTER_TYPE_UD));
498
499 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
500
501    /* We want to read the 3 fields we care about even if their channels
502     * aren't enabled in the dispatch.
503 */
504 bld.group(4, 0).exec_all().MOV(dst, ts);
505
506 /* The caller wants the low 32 bits of the timestamp. Since it's running
507     * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
508 * which is plenty of time for our purposes. It is identical across the
509 * EUs, but since it's tracking GPU core speed it will increment at a
510 * varying rate as render P-states change.
511 *
512 * The caller could also check if render P-states have changed (or anything
513 * else that might disrupt timing) by setting smear to 2 and checking if
514 * that field is != 0.
515 */
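   /* Rough arithmetic, assuming the ~1.2GHz figure above: a 32-bit counter
    * rolls over after about 2^32 / 1.2e9 =~ 3.6 seconds.
    */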
516 dst.set_smear(0);
517
518 return dst;
519 }
520
521 void
522 fs_visitor::emit_shader_time_begin()
523 {
524 shader_start_time = get_timestamp(bld.annotate("shader time start"));
525 }
526
527 void
528 fs_visitor::emit_shader_time_end()
529 {
530 /* Insert our code just before the final SEND with EOT. */
531 exec_node *end = this->instructions.get_tail();
532 assert(end && ((fs_inst *) end)->eot);
533 const fs_builder ibld = bld.annotate("shader time end")
534 .exec_all().at(NULL, end);
535
536 fs_reg shader_end_time = get_timestamp(ibld);
537
538 /* Check that there weren't any timestamp reset events (assuming these
539 * were the only two timestamp reads that happened).
540 */
541 fs_reg reset = shader_end_time;
542 reset.set_smear(2);
543 set_condmod(BRW_CONDITIONAL_Z,
544 ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
545 ibld.IF(BRW_PREDICATE_NORMAL);
546
547 fs_reg start = shader_start_time;
548 start.negate = true;
549 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
550 diff.set_smear(0);
551
552 const fs_builder cbld = ibld.group(1, 0);
553 cbld.group(1, 0).ADD(diff, start, shader_end_time);
554
555 /* If there were no instructions between the two timestamp gets, the diff
556     * is 2 cycles. Remove that overhead, so we can ignore it when trying
557     * to determine the time taken by single instructions.
558 */
559 cbld.ADD(diff, diff, fs_reg(-2u));
560 SHADER_TIME_ADD(cbld, 0, diff);
561 SHADER_TIME_ADD(cbld, 1, fs_reg(1u));
562 ibld.emit(BRW_OPCODE_ELSE);
563 SHADER_TIME_ADD(cbld, 2, fs_reg(1u));
564 ibld.emit(BRW_OPCODE_ENDIF);
565 }
566
567 void
568 fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
569 int shader_time_subindex,
570 fs_reg value)
571 {
572 int index = shader_time_index * 3 + shader_time_subindex;
573 fs_reg offset = fs_reg(index * SHADER_TIME_STRIDE);
574
575 fs_reg payload;
576 if (dispatch_width == 8)
577 payload = vgrf(glsl_type::uvec2_type);
578 else
579 payload = vgrf(glsl_type::uint_type);
580
581 bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
582 }
583
584 void
585 fs_visitor::vfail(const char *format, va_list va)
586 {
587 char *msg;
588
589 if (failed)
590 return;
591
592 failed = true;
593
594 msg = ralloc_vasprintf(mem_ctx, format, va);
595 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
596
597 this->fail_msg = msg;
598
599 if (debug_enabled) {
600 fprintf(stderr, "%s", msg);
601 }
602 }
603
604 void
605 fs_visitor::fail(const char *format, ...)
606 {
607 va_list va;
608
609 va_start(va, format);
610 vfail(format, va);
611 va_end(va);
612 }
613
614 /**
615 * Mark this program as impossible to compile in SIMD16 mode.
616 *
617 * During the SIMD8 compile (which happens first), we can detect and flag
618 * things that are unsupported in SIMD16 mode, so the compiler can skip
619 * the SIMD16 compile altogether.
620 *
621 * During a SIMD16 compile (if one happens anyway), this just calls fail().
622 */
623 void
624 fs_visitor::no16(const char *msg)
625 {
626 if (dispatch_width == 16) {
627 fail("%s", msg);
628 } else {
629 simd16_unsupported = true;
630
631 compiler->shader_perf_log(log_data,
632 "SIMD16 shader failed to compile: %s", msg);
633 }
634 }
635
636 /**
637 * Returns true if the instruction has a flag that means it won't
638 * update an entire destination register.
639 *
640 * For example, dead code elimination and live variable analysis want to know
641 * when a write to a variable screens off any preceding values that were in
642 * it.
643 */
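/* For example (illustrative numbers): a SIMD8 MOV to a contiguous float
 * destination writes 8 * 4 = 32 bytes and covers a whole register, while the
 * same MOV to a 2-byte UW destination writes only 16 bytes and therefore
 * counts as a partial write.
 */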
644 bool
645 fs_inst::is_partial_write() const
646 {
647 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
648 (this->exec_size * type_sz(this->dst.type)) < 32 ||
649 !this->dst.is_contiguous());
650 }
651
652 int
653 fs_inst::regs_read(int arg) const
654 {
655 unsigned components = 1;
656 switch (opcode) {
657 case FS_OPCODE_FB_WRITE:
658 case SHADER_OPCODE_URB_WRITE_SIMD8:
659 case SHADER_OPCODE_UNTYPED_ATOMIC:
660 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
661 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
662 case SHADER_OPCODE_TYPED_ATOMIC:
663 case SHADER_OPCODE_TYPED_SURFACE_READ:
664 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
665 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
666 if (arg == 0)
667 return mlen;
668 break;
669
670 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
671 /* The payload is actually stored in src1 */
672 if (arg == 1)
673 return mlen;
674 break;
675
676 case FS_OPCODE_LINTERP:
677 if (arg == 0)
678 return exec_size / 4;
679 break;
680
681 case FS_OPCODE_PIXEL_X:
682 case FS_OPCODE_PIXEL_Y:
683 if (arg == 0)
684 components = 1;
685 break;
686
687 case SHADER_OPCODE_LOAD_PAYLOAD:
688 if (arg < this->header_size)
689 return 1;
690 break;
691
692 default:
693 if (is_tex() && arg == 0 && src[0].file == GRF)
694 return mlen;
695 break;
696 }
697
698 switch (src[arg].file) {
699 case BAD_FILE:
700 case UNIFORM:
701 case IMM:
702 return 1;
703 case GRF:
704 case HW_REG:
705 if (src[arg].stride == 0) {
706 return 1;
707 } else {
708 int size = components * this->exec_size * type_sz(src[arg].type);
709 return DIV_ROUND_UP(size * src[arg].stride, 32);
710 }
711 case MRF:
712 unreachable("MRF registers are not allowed as sources");
713 default:
714 unreachable("Invalid register file");
715 }
716 }
717
718 bool
719 fs_inst::reads_flag() const
720 {
721 return predicate;
722 }
723
724 bool
725 fs_inst::writes_flag() const
726 {
727 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
728 opcode != BRW_OPCODE_IF &&
729 opcode != BRW_OPCODE_WHILE)) ||
730 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
731 }
732
733 /**
734 * Returns how many MRFs an FS opcode will write over.
735 *
736 * Note that this is not the 0 or 1 implied writes in an actual gen
737 * instruction -- the FS opcodes often generate MOVs in addition.
738 */
739 int
740 fs_visitor::implied_mrf_writes(fs_inst *inst)
741 {
742 if (inst->mlen == 0)
743 return 0;
744
745 if (inst->base_mrf == -1)
746 return 0;
747
748 switch (inst->opcode) {
749 case SHADER_OPCODE_RCP:
750 case SHADER_OPCODE_RSQ:
751 case SHADER_OPCODE_SQRT:
752 case SHADER_OPCODE_EXP2:
753 case SHADER_OPCODE_LOG2:
754 case SHADER_OPCODE_SIN:
755 case SHADER_OPCODE_COS:
756 return 1 * dispatch_width / 8;
757 case SHADER_OPCODE_POW:
758 case SHADER_OPCODE_INT_QUOTIENT:
759 case SHADER_OPCODE_INT_REMAINDER:
760 return 2 * dispatch_width / 8;
761 case SHADER_OPCODE_TEX:
762 case FS_OPCODE_TXB:
763 case SHADER_OPCODE_TXD:
764 case SHADER_OPCODE_TXF:
765 case SHADER_OPCODE_TXF_CMS:
766 case SHADER_OPCODE_TXF_MCS:
767 case SHADER_OPCODE_TG4:
768 case SHADER_OPCODE_TG4_OFFSET:
769 case SHADER_OPCODE_TXL:
770 case SHADER_OPCODE_TXS:
771 case SHADER_OPCODE_LOD:
772 return 1;
773 case FS_OPCODE_FB_WRITE:
774 return 2;
775 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
776 case SHADER_OPCODE_GEN4_SCRATCH_READ:
777 return 1;
778 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
779 return inst->mlen;
780 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
781 return inst->mlen;
782 case SHADER_OPCODE_UNTYPED_ATOMIC:
783 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
784 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
785 case SHADER_OPCODE_TYPED_ATOMIC:
786 case SHADER_OPCODE_TYPED_SURFACE_READ:
787 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
788 case SHADER_OPCODE_URB_WRITE_SIMD8:
789 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
790 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
791 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
792 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
793 return 0;
794 default:
795 unreachable("not reached");
796 }
797 }
798
799 fs_reg
800 fs_visitor::vgrf(const glsl_type *const type)
801 {
802 int reg_width = dispatch_width / 8;
803 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
804 brw_type_for_base_type(type));
805 }
806
807 /** Fixed HW reg constructor. */
808 fs_reg::fs_reg(enum register_file file, int reg)
809 {
810 init();
811 this->file = file;
812 this->reg = reg;
813 this->type = BRW_REGISTER_TYPE_F;
814 }
815
816 /** Fixed HW reg constructor. */
817 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
818 {
819 init();
820 this->file = file;
821 this->reg = reg;
822 this->type = type;
823 }
824
825 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
826  * This brings in those uniform definitions.
827 */
828 void
829 fs_visitor::import_uniforms(fs_visitor *v)
830 {
831 this->push_constant_loc = v->push_constant_loc;
832 this->pull_constant_loc = v->pull_constant_loc;
833 this->uniforms = v->uniforms;
834 this->param_size = v->param_size;
835 }
836
837 fs_reg *
838 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
839 bool origin_upper_left)
840 {
841 assert(stage == MESA_SHADER_FRAGMENT);
842 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
843 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
844 fs_reg wpos = *reg;
845 bool flip = !origin_upper_left ^ key->render_to_fbo;
846
847 /* gl_FragCoord.x */
848 if (pixel_center_integer) {
849 bld.MOV(wpos, this->pixel_x);
850 } else {
851 bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
852 }
853 wpos = offset(wpos, bld, 1);
854
855 /* gl_FragCoord.y */
856 if (!flip && pixel_center_integer) {
857 bld.MOV(wpos, this->pixel_y);
858 } else {
859 fs_reg pixel_y = this->pixel_y;
860 float offset = (pixel_center_integer ? 0.0 : 0.5);
861
862 if (flip) {
863 pixel_y.negate = true;
864 offset += key->drawable_height - 1.0;
865 }
866
867 bld.ADD(wpos, pixel_y, fs_reg(offset));
868 }
869 wpos = offset(wpos, bld, 1);
870
871 /* gl_FragCoord.z */
872 if (devinfo->gen >= 6) {
873 bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
874 } else {
875 bld.emit(FS_OPCODE_LINTERP, wpos,
876 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
877 interp_reg(VARYING_SLOT_POS, 2));
878 }
879 wpos = offset(wpos, bld, 1);
880
881 /* gl_FragCoord.w: Already set up in emit_interpolation */
882 bld.MOV(wpos, this->wpos_w);
883
884 return reg;
885 }
886
887 fs_inst *
888 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
889 glsl_interp_qualifier interpolation_mode,
890 bool is_centroid, bool is_sample)
891 {
892 brw_wm_barycentric_interp_mode barycoord_mode;
893 if (devinfo->gen >= 6) {
894 if (is_centroid) {
895 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
896 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
897 else
898 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
899 } else if (is_sample) {
900 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
901 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
902 else
903 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
904 } else {
905 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
906 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
907 else
908 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
909 }
910 } else {
911 /* On Ironlake and below, there is only one interpolation mode.
912 * Centroid interpolation doesn't mean anything on this hardware --
913 * there is no multisampling.
914 */
915 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
916 }
917 return bld.emit(FS_OPCODE_LINTERP, attr,
918 this->delta_xy[barycoord_mode], interp);
919 }
920
921 void
922 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
923 const glsl_type *type,
924 glsl_interp_qualifier interpolation_mode,
925 int location, bool mod_centroid,
926 bool mod_sample)
927 {
928 attr.type = brw_type_for_base_type(type->get_scalar_type());
929
930 assert(stage == MESA_SHADER_FRAGMENT);
931 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
932 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
933
934 unsigned int array_elements;
935
936 if (type->is_array()) {
937 array_elements = type->length;
938 if (array_elements == 0) {
939 fail("dereferenced array '%s' has length 0\n", name);
940 }
941 type = type->fields.array;
942 } else {
943 array_elements = 1;
944 }
945
946 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
947 bool is_gl_Color =
948 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
949 if (key->flat_shade && is_gl_Color) {
950 interpolation_mode = INTERP_QUALIFIER_FLAT;
951 } else {
952 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
953 }
954 }
955
956 for (unsigned int i = 0; i < array_elements; i++) {
957 for (unsigned int j = 0; j < type->matrix_columns; j++) {
958 if (prog_data->urb_setup[location] == -1) {
959 /* If there's no incoming setup data for this slot, don't
960 * emit interpolation for it.
961 */
962 attr = offset(attr, bld, type->vector_elements);
963 location++;
964 continue;
965 }
966
967 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
968 /* Constant interpolation (flat shading) case. The SF has
969 * handed us defined values in only the constant offset
970 * field of the setup reg.
971 */
972 for (unsigned int k = 0; k < type->vector_elements; k++) {
973 struct brw_reg interp = interp_reg(location, k);
974 interp = suboffset(interp, 3);
975 interp.type = attr.type;
976 bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
977 attr = offset(attr, bld, 1);
978 }
979 } else {
980 /* Smooth/noperspective interpolation case. */
981 for (unsigned int k = 0; k < type->vector_elements; k++) {
982 struct brw_reg interp = interp_reg(location, k);
983 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
984 /* Get the pixel/sample mask into f0 so that we know
985 * which pixels are lit. Then, for each channel that is
986 * unlit, replace the centroid data with non-centroid
987 * data.
988 */
989 bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
990
991 fs_inst *inst;
992 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
993 false, false);
994 inst->predicate = BRW_PREDICATE_NORMAL;
995 inst->predicate_inverse = true;
996 if (devinfo->has_pln)
997 inst->no_dd_clear = true;
998
999 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1000 mod_centroid && !key->persample_shading,
1001 mod_sample || key->persample_shading);
1002 inst->predicate = BRW_PREDICATE_NORMAL;
1003 inst->predicate_inverse = false;
1004 if (devinfo->has_pln)
1005 inst->no_dd_check = true;
1006
1007 } else {
1008 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1009 mod_centroid && !key->persample_shading,
1010 mod_sample || key->persample_shading);
1011 }
1012 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1013 bld.MUL(attr, attr, this->pixel_w);
1014 }
1015 attr = offset(attr, bld, 1);
1016 }
1017
1018 }
1019 location++;
1020 }
1021 }
1022 }
1023
1024 fs_reg *
1025 fs_visitor::emit_frontfacing_interpolation()
1026 {
1027 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1028
1029 if (devinfo->gen >= 6) {
1030 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1031 * a boolean result from this (~0/true or 0/false).
1032 *
1033 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1034 * this task in only one instruction:
1035 * - a negation source modifier will flip the bit; and
1036 * - a W -> D type conversion will sign extend the bit into the high
1037 * word of the destination.
1038 *
1039 * An ASR 15 fills the low word of the destination.
1040 */
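      /* As a worked example with a made-up payload value: if g0.0:W reads
       * 0x1234 (bit 15 clear, front facing), negation yields -0x1234, whose
       * sign bit is set; the W -> D conversion sign-extends that bit into
       * the high word and the ASR by 15 smears it across the low word,
       * producing ~0 (true).  A back-facing value with bit 15 set ends up
       * as 0 instead.
       */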
1041 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1042 g0.negate = true;
1043
1044 bld.ASR(*reg, g0, fs_reg(15));
1045 } else {
1046 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1047 * a boolean result from this (1/true or 0/false).
1048 *
1049 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1050 * the negation source modifier to flip it. Unfortunately the SHR
1051 * instruction only operates on UD (or D with an abs source modifier)
1052 * sources without negation.
1053 *
1054 * Instead, use ASR (which will give ~0/true or 0/false).
1055 */
1056 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1057 g1_6.negate = true;
1058
1059 bld.ASR(*reg, g1_6, fs_reg(31));
1060 }
1061
1062 return reg;
1063 }
1064
1065 void
1066 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1067 {
1068 assert(stage == MESA_SHADER_FRAGMENT);
1069 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1070 assert(dst.type == BRW_REGISTER_TYPE_F);
1071
1072 if (key->compute_pos_offset) {
1073 /* Convert int_sample_pos to floating point */
1074 bld.MOV(dst, int_sample_pos);
1075 /* Scale to the range [0, 1] */
1076 bld.MUL(dst, dst, fs_reg(1 / 16.0f));
1077 }
1078 else {
1079 /* From ARB_sample_shading specification:
1080 * "When rendering to a non-multisample buffer, or if multisample
1081 * rasterization is disabled, gl_SamplePosition will always be
1082     * (0.5, 0.5)."
1083 */
1084 bld.MOV(dst, fs_reg(0.5f));
1085 }
1086 }
1087
1088 fs_reg *
1089 fs_visitor::emit_samplepos_setup()
1090 {
1091 assert(devinfo->gen >= 6);
1092
1093 const fs_builder abld = bld.annotate("compute sample position");
1094 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1095 fs_reg pos = *reg;
1096 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1097 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1098
1099 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1100 * mode will be enabled.
1101 *
1102 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1103 * R31.1:0 Position Offset X/Y for Slot[3:0]
1104 * R31.3:2 Position Offset X/Y for Slot[7:4]
1105 * .....
1106 *
1107 * The X, Y sample positions come in as bytes in thread payload. So, read
1108 * the positions using vstride=16, width=8, hstride=2.
1109 */
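   /* Put differently (illustrative layout): the payload register holds byte
    * pairs X0,Y0,X1,Y1,...; the <16;8,2>:B region below steps over every
    * other byte to gather the X offsets, and suboffset 1 gathers the Y
    * offsets the same way.
    */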
1110 struct brw_reg sample_pos_reg =
1111 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1112 BRW_REGISTER_TYPE_B), 16, 8, 2);
1113
1114 if (dispatch_width == 8) {
1115 abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
1116 } else {
1117 abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
1118 abld.half(1).MOV(half(int_sample_x, 1),
1119 fs_reg(suboffset(sample_pos_reg, 16)));
1120 }
1121 /* Compute gl_SamplePosition.x */
1122 compute_sample_position(pos, int_sample_x);
1123 pos = offset(pos, abld, 1);
1124 if (dispatch_width == 8) {
1125 abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
1126 } else {
1127 abld.half(0).MOV(half(int_sample_y, 0),
1128 fs_reg(suboffset(sample_pos_reg, 1)));
1129 abld.half(1).MOV(half(int_sample_y, 1),
1130 fs_reg(suboffset(sample_pos_reg, 17)));
1131 }
1132 /* Compute gl_SamplePosition.y */
1133 compute_sample_position(pos, int_sample_y);
1134 return reg;
1135 }
1136
1137 fs_reg *
1138 fs_visitor::emit_sampleid_setup()
1139 {
1140 assert(stage == MESA_SHADER_FRAGMENT);
1141 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1142 assert(devinfo->gen >= 6);
1143
1144 const fs_builder abld = bld.annotate("compute sample id");
1145 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1146
1147 if (key->compute_sample_id) {
1148 fs_reg t1 = vgrf(glsl_type::int_type);
1149 fs_reg t2 = vgrf(glsl_type::int_type);
1150 t2.type = BRW_REGISTER_TYPE_UW;
1151
1152 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1153 * 8x multisampling, subspan 0 will represent sample N (where N
1154 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1155 * 7. We can find the value of N by looking at R0.0 bits 7:6
1156 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1157 * (since samples are always delivered in pairs). That is, we
1158 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1159 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1160 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1161 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1162 * populating a temporary variable with the sequence (0, 1, 2, 3),
1163 * and then reading from it using vstride=1, width=4, hstride=0.
1164 * These computations hold good for 4x multisampling as well.
1165 *
1166 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1167 * the first four slots are sample 0 of subspan 0; the next four
1168 * are sample 1 of subspan 0; the third group is sample 0 of
1169 * subspan 1, and finally sample 1 of subspan 1.
1170 */
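      /* A concrete example with made-up payload bits: if R0.0 bits 7:6
       * (SSPI) read 0b10, then (R0.0 & 0xc0) >> 5 == 4, so a SIMD8
       * dispatch adds 4 to the (0, 0, 0, 0, 1, 1, 1, 1) sequence and the
       * resulting gl_SampleID values are 4,4,4,4,5,5,5,5.
       */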
1171 abld.exec_all()
1172 .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1173 fs_reg(0xc0));
1174 abld.exec_all().SHR(t1, t1, fs_reg(5));
1175
1176 /* This works for both SIMD8 and SIMD16 */
1177 abld.exec_all()
1178 .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
1179
1180 /* This special instruction takes care of setting vstride=1,
1181 * width=4, hstride=0 of t2 during an ADD instruction.
1182 */
1183 abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1184 } else {
1185 /* As per GL_ARB_sample_shading specification:
1186 * "When rendering to a non-multisample buffer, or if multisample
1187 * rasterization is disabled, gl_SampleID will always be zero."
1188 */
1189 abld.MOV(*reg, fs_reg(0));
1190 }
1191
1192 return reg;
1193 }
1194
1195 void
1196 fs_visitor::resolve_source_modifiers(fs_reg *src)
1197 {
1198 if (!src->abs && !src->negate)
1199 return;
1200
1201 fs_reg temp = bld.vgrf(src->type);
1202 bld.MOV(temp, *src);
1203 *src = temp;
1204 }
1205
1206 void
1207 fs_visitor::emit_discard_jump()
1208 {
1209 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1210
1211 /* For performance, after a discard, jump to the end of the
1212 * shader if all relevant channels have been discarded.
1213 */
1214 fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
1215 discard_jump->flag_subreg = 1;
1216
1217 discard_jump->predicate = (dispatch_width == 8)
1218 ? BRW_PREDICATE_ALIGN1_ANY8H
1219 : BRW_PREDICATE_ALIGN1_ANY16H;
1220 discard_jump->predicate_inverse = true;
1221 }
1222
1223 void
1224 fs_visitor::assign_curb_setup()
1225 {
1226 if (dispatch_width == 8) {
1227 prog_data->dispatch_grf_start_reg = payload.num_regs;
1228 } else {
1229 if (stage == MESA_SHADER_FRAGMENT) {
1230 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1231 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1232 } else if (stage == MESA_SHADER_COMPUTE) {
1233 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1234 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1235 } else {
1236 unreachable("Unsupported shader type!");
1237 }
1238 }
1239
1240 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1241
1242 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1243 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1244 for (unsigned int i = 0; i < inst->sources; i++) {
1245 if (inst->src[i].file == UNIFORM) {
1246 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1247 int constant_nr;
1248 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1249 constant_nr = push_constant_loc[uniform_nr];
1250 } else {
1251 /* Section 5.11 of the OpenGL 4.1 spec says:
1252 * "Out-of-bounds reads return undefined values, which include
1253 * values from other variables of the active program or zero."
1254 * Just return the first push constant.
1255 */
1256 constant_nr = 0;
1257 }
1258
1259 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1260 constant_nr / 8,
1261 constant_nr % 8);
1262
1263 inst->src[i].file = HW_REG;
1264 inst->src[i].fixed_hw_reg = byte_offset(
1265 retype(brw_reg, inst->src[i].type),
1266 inst->src[i].subreg_offset);
1267 }
1268 }
1269 }
1270 }
1271
1272 void
1273 fs_visitor::calculate_urb_setup()
1274 {
1275 assert(stage == MESA_SHADER_FRAGMENT);
1276 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1277 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1278
1279 memset(prog_data->urb_setup, -1,
1280 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1281
1282 int urb_next = 0;
1283 /* Figure out where each of the incoming setup attributes lands. */
1284 if (devinfo->gen >= 6) {
1285 if (_mesa_bitcount_64(prog->InputsRead &
1286 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1287 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1288 * first 16 varying inputs, so we can put them wherever we want.
1289 * Just put them in order.
1290 *
1291 * This is useful because it means that (a) inputs not used by the
1292 * fragment shader won't take up valuable register space, and (b) we
1293 * won't have to recompile the fragment shader if it gets paired with
1294 * a different vertex (or geometry) shader.
1295 */
1296 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1297 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1298 BITFIELD64_BIT(i)) {
1299 prog_data->urb_setup[i] = urb_next++;
1300 }
1301 }
1302 } else {
1303 /* We have enough input varyings that the SF/SBE pipeline stage can't
1304 * arbitrarily rearrange them to suit our whim; we have to put them
1305 * in an order that matches the output of the previous pipeline stage
1306 * (geometry or vertex shader).
1307 */
1308 struct brw_vue_map prev_stage_vue_map;
1309 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1310 key->input_slots_valid);
1311 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1312 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1313 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1314 slot++) {
1315 int varying = prev_stage_vue_map.slot_to_varying[slot];
1316 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1317 * unused.
1318 */
1319 if (varying != BRW_VARYING_SLOT_COUNT &&
1320 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1321 BITFIELD64_BIT(varying))) {
1322 prog_data->urb_setup[varying] = slot - first_slot;
1323 }
1324 }
1325 urb_next = prev_stage_vue_map.num_slots - first_slot;
1326 }
1327 } else {
1328 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1329 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1330 /* Point size is packed into the header, not as a general attribute */
1331 if (i == VARYING_SLOT_PSIZ)
1332 continue;
1333
1334 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1335 /* The back color slot is skipped when the front color is
1336 * also written to. In addition, some slots can be
1337 * written in the vertex shader and not read in the
1338 * fragment shader. So the register number must always be
1339 * incremented, mapped or not.
1340 */
1341 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1342 prog_data->urb_setup[i] = urb_next;
1343 urb_next++;
1344 }
1345 }
1346
1347 /*
1348     * It's an FS-only attribute, and we did interpolation for this attribute
1349     * in the SF thread. So, count it here, too.
1350 *
1351 * See compile_sf_prog() for more info.
1352 */
1353 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1354 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1355 }
1356
1357 prog_data->num_varying_inputs = urb_next;
1358 }
1359
1360 void
1361 fs_visitor::assign_urb_setup()
1362 {
1363 assert(stage == MESA_SHADER_FRAGMENT);
1364 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1365
1366 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1367
1368    /* Offset all the urb_setup[] indices by the actual position of the
1369 * setup regs, now that the location of the constants has been chosen.
1370 */
1371 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1372 if (inst->opcode == FS_OPCODE_LINTERP) {
1373 assert(inst->src[1].file == HW_REG);
1374 inst->src[1].fixed_hw_reg.nr += urb_start;
1375 }
1376
1377 if (inst->opcode == FS_OPCODE_CINTERP) {
1378 assert(inst->src[0].file == HW_REG);
1379 inst->src[0].fixed_hw_reg.nr += urb_start;
1380 }
1381 }
1382
1383 /* Each attribute is 4 setup channels, each of which is half a reg. */
1384 this->first_non_payload_grf =
1385 urb_start + prog_data->num_varying_inputs * 2;
1386 }
1387
1388 void
1389 fs_visitor::assign_vs_urb_setup()
1390 {
1391 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1392 int grf, count, slot, channel, attr;
1393
1394 assert(stage == MESA_SHADER_VERTEX);
1395 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1396 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1397 count++;
1398
1399 /* Each attribute is 4 regs. */
1400 this->first_non_payload_grf =
1401 payload.num_regs + prog_data->curb_read_length + count * 4;
1402
1403 unsigned vue_entries =
1404 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1405
1406 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1407 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1408
1409 assert(vs_prog_data->base.urb_read_length <= 15);
1410
1411 /* Rewrite all ATTR file references to the hw grf that they land in. */
1412 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1413 for (int i = 0; i < inst->sources; i++) {
1414 if (inst->src[i].file == ATTR) {
1415
1416 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1417 slot = count - 1;
1418 } else {
1419                /* Attributes come in as a contiguous block, ordered by their
1420 * gl_vert_attrib value. That means we can compute the slot
1421 * number for an attribute by masking out the enabled
1422 * attributes before it and counting the bits.
1423 */
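               /* e.g. with a hypothetical inputs_read mask of 0b10011 and
                * attr == 4, BITFIELD64_MASK(4) == 0b1111 masks it down to
                * 0b0011, whose popcount of 2 places the attribute in slot 2.
                */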
1424 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1425 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1426 BITFIELD64_MASK(attr));
1427 }
1428
1429 channel = inst->src[i].reg_offset & 3;
1430
1431 grf = payload.num_regs +
1432 prog_data->curb_read_length +
1433 slot * 4 + channel;
1434
1435 inst->src[i].file = HW_REG;
1436 inst->src[i].fixed_hw_reg =
1437 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1438 }
1439 }
1440 }
1441 }
1442
1443 /**
1444 * Split large virtual GRFs into separate components if we can.
1445 *
1446 * This is mostly duplicated with what brw_fs_vector_splitting does,
1447 * but that's really conservative because it's afraid of doing
1448 * splitting that doesn't result in real progress after the rest of
1449 * the optimization phases, which would cause infinite looping in
1450 * optimization. We can do it once here, safely. This also has the
1451 * opportunity to split interpolated values, or maybe even uniforms,
1452 * which we don't have at the IR level.
1453 *
1454 * We want to split, because virtual GRFs are what we register
1455 * allocate and spill (due to contiguousness requirements for some
1456 * instructions), and they're what we naturally generate in the
1457 * codegen process, but most virtual GRFs don't actually need to be
1458 * contiguous sets of GRFs. If we split, we'll end up with reduced
1459 * live intervals and better dead code elimination and coalescing.
1460 */
1461 void
1462 fs_visitor::split_virtual_grfs()
1463 {
1464 int num_vars = this->alloc.count;
1465
1466 /* Count the total number of registers */
1467 int reg_count = 0;
1468 int vgrf_to_reg[num_vars];
1469 for (int i = 0; i < num_vars; i++) {
1470 vgrf_to_reg[i] = reg_count;
1471 reg_count += alloc.sizes[i];
1472 }
1473
1474 /* An array of "split points". For each register slot, this indicates
1475 * if this slot can be separated from the previous slot. Every time an
1476 * instruction uses multiple elements of a register (as a source or
1477 * destination), we mark the used slots as inseparable. Then we go
1478 * through and split the registers into the smallest pieces we can.
1479 */
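   /* For instance (hypothetical sizes): a 4-slot VGRF that is only ever
    * accessed by two 2-register operations at offsets 0 and 2 keeps its
    * split point at slot 2 but loses the ones at slots 1 and 3, so it ends
    * up split into two 2-register VGRFs.
    */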
1480 bool split_points[reg_count];
1481 memset(split_points, 0, sizeof(split_points));
1482
1483 /* Mark all used registers as fully splittable */
1484 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1485 if (inst->dst.file == GRF) {
1486 int reg = vgrf_to_reg[inst->dst.reg];
1487 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1488 split_points[reg + j] = true;
1489 }
1490
1491 for (int i = 0; i < inst->sources; i++) {
1492 if (inst->src[i].file == GRF) {
1493 int reg = vgrf_to_reg[inst->src[i].reg];
1494 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1495 split_points[reg + j] = true;
1496 }
1497 }
1498 }
1499
1500 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1501 if (inst->dst.file == GRF) {
1502 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1503 for (int j = 1; j < inst->regs_written; j++)
1504 split_points[reg + j] = false;
1505 }
1506 for (int i = 0; i < inst->sources; i++) {
1507 if (inst->src[i].file == GRF) {
1508 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1509 for (int j = 1; j < inst->regs_read(i); j++)
1510 split_points[reg + j] = false;
1511 }
1512 }
1513 }
1514
1515 int new_virtual_grf[reg_count];
1516 int new_reg_offset[reg_count];
1517
1518 int reg = 0;
1519 for (int i = 0; i < num_vars; i++) {
1520 /* The first one should always be 0 as a quick sanity check. */
1521 assert(split_points[reg] == false);
1522
1523 /* j = 0 case */
1524 new_reg_offset[reg] = 0;
1525 reg++;
1526 int offset = 1;
1527
1528 /* j > 0 case */
1529 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1530 /* If this is a split point, reset the offset to 0 and allocate a
1531           * new virtual GRF to cover the preceding 'offset' registers.
1532 */
1533 if (split_points[reg]) {
1534 assert(offset <= MAX_VGRF_SIZE);
1535 int grf = alloc.allocate(offset);
1536 for (int k = reg - offset; k < reg; k++)
1537 new_virtual_grf[k] = grf;
1538 offset = 0;
1539 }
1540 new_reg_offset[reg] = offset;
1541 offset++;
1542 reg++;
1543 }
1544
1545 /* The last one gets the original register number */
1546 assert(offset <= MAX_VGRF_SIZE);
1547 alloc.sizes[i] = offset;
1548 for (int k = reg - offset; k < reg; k++)
1549 new_virtual_grf[k] = i;
1550 }
1551 assert(reg == reg_count);
1552
1553 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1554 if (inst->dst.file == GRF) {
1555 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1556 inst->dst.reg = new_virtual_grf[reg];
1557 inst->dst.reg_offset = new_reg_offset[reg];
1558 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1559 }
1560 for (int i = 0; i < inst->sources; i++) {
1561 if (inst->src[i].file == GRF) {
1562 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1563 inst->src[i].reg = new_virtual_grf[reg];
1564 inst->src[i].reg_offset = new_reg_offset[reg];
1565 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1566 }
1567 }
1568 }
1569 invalidate_live_intervals();
1570 }
1571
1572 /**
1573 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1574 *
1575 * During code generation, we create tons of temporary variables, many of
1576 * which get immediately killed and are never used again. Yet, in later
1577 * optimization and analysis passes, such as compute_live_intervals, we need
1578 * to loop over all the virtual GRFs. Compacting them can save a lot of
1579 * overhead.
1580 */
1581 bool
1582 fs_visitor::compact_virtual_grfs()
1583 {
1584 bool progress = false;
1585 int remap_table[this->alloc.count];
1586 memset(remap_table, -1, sizeof(remap_table));
1587
1588 /* Mark which virtual GRFs are used. */
1589 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1590 if (inst->dst.file == GRF)
1591 remap_table[inst->dst.reg] = 0;
1592
1593 for (int i = 0; i < inst->sources; i++) {
1594 if (inst->src[i].file == GRF)
1595 remap_table[inst->src[i].reg] = 0;
1596 }
1597 }
1598
1599 /* Compact the GRF arrays. */
1600 int new_index = 0;
1601 for (unsigned i = 0; i < this->alloc.count; i++) {
1602 if (remap_table[i] == -1) {
1603 /* We just found an unused register. This means that we are
1604 * actually going to compact something.
1605 */
1606 progress = true;
1607 } else {
1608 remap_table[i] = new_index;
1609 alloc.sizes[new_index] = alloc.sizes[i];
1610 invalidate_live_intervals();
1611 ++new_index;
1612 }
1613 }
1614
1615 this->alloc.count = new_index;
1616
1617 /* Patch all the instructions to use the newly renumbered registers */
1618 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1619 if (inst->dst.file == GRF)
1620 inst->dst.reg = remap_table[inst->dst.reg];
1621
1622 for (int i = 0; i < inst->sources; i++) {
1623 if (inst->src[i].file == GRF)
1624 inst->src[i].reg = remap_table[inst->src[i].reg];
1625 }
1626 }
1627
1628 /* Patch all the references to delta_xy, since they're used in register
1629 * allocation. If they're unused, switch them to BAD_FILE so we don't
1630 * think some random VGRF is delta_xy.
1631 */
1632 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
1633 if (delta_xy[i].file == GRF) {
1634 if (remap_table[delta_xy[i].reg] != -1) {
1635 delta_xy[i].reg = remap_table[delta_xy[i].reg];
1636 } else {
1637 delta_xy[i].file = BAD_FILE;
1638 }
1639 }
1640 }
1641
1642 return progress;
1643 }
1644
1645 /*
1646 * Implements array access of uniforms by inserting a
1647 * PULL_CONSTANT_LOAD instruction.
1648 *
1649 * Unlike temporary GRF array access (where we don't support it due to
1650 * the difficulty of doing relative addressing on instruction
1651 * destinations), we could potentially do array access of uniforms
1652 * that were loaded in GRF space as push constants. In real-world
1653 * usage we've seen, though, the arrays being used are always larger
1654 * than we could load as push constants, so just always move all
1655 * uniform array access out to a pull constant buffer.
1656 */
1657 void
1658 fs_visitor::move_uniform_array_access_to_pull_constants()
1659 {
1660 if (dispatch_width != 8)
1661 return;
1662
1663 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1664 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
1665
1666 /* Walk through and find array access of uniforms. Put a copy of that
1667 * uniform in the pull constant buffer.
1668 *
1669 * Note that we don't move constant-indexed accesses to arrays. No
1670 * testing has been done of the performance impact of this choice.
1671 */
1672 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
1673 for (int i = 0 ; i < inst->sources; i++) {
1674 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1675 continue;
1676
1677 int uniform = inst->src[i].reg;
1678
1679 /* If this array isn't already present in the pull constant buffer,
1680 * add it.
1681 */
1682 if (pull_constant_loc[uniform] == -1) {
1683 const gl_constant_value **values = &stage_prog_data->param[uniform];
1684
1685 assert(param_size[uniform]);
1686
1687 for (int j = 0; j < param_size[uniform]; j++) {
1688 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1689
1690 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1691 values[j];
1692 }
1693 }
1694 }
1695 }
1696 }
1697
1698 /**
1699 * Assign UNIFORM file registers to either push constants or pull constants.
1700 *
1701 * We allow a fragment shader to have more than the specified minimum
1702 * maximum number of fragment shader uniform components (64). If
1703 * there are too many of these, they'd fill up all of register space.
1704 * So, this will push some of them out to the pull constant buffer and
1705 * update the program to load them.
1706 */
1707 void
1708 fs_visitor::assign_constant_locations()
1709 {
1710 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1711 if (dispatch_width != 8)
1712 return;
1713
1714 /* Find which UNIFORM registers are still in use. */
1715 bool is_live[uniforms];
1716 for (unsigned int i = 0; i < uniforms; i++) {
1717 is_live[i] = false;
1718 }
1719
1720 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1721 for (int i = 0; i < inst->sources; i++) {
1722 if (inst->src[i].file != UNIFORM)
1723 continue;
1724
1725 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1726 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1727 is_live[constant_nr] = true;
1728 }
1729 }
1730
1731 /* Only allow 16 registers (128 uniform components) as push constants.
1732 *
1733 * Just demote the end of the list. We could probably do better
1734 * here, demoting things that are rarely used in the program first.
1735 *
1736 * If changing this value, note the limitation about total_regs in
1737 * brw_curbe.c.
1738 */
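   /* That is, 16 GRFs * 8 dwords per 32-byte GRF = 128 scalar components. */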
1739 unsigned int max_push_components = 16 * 8;
1740 unsigned int num_push_constants = 0;
1741
1742 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1743
1744 for (unsigned int i = 0; i < uniforms; i++) {
1745 if (!is_live[i] || pull_constant_loc[i] != -1) {
1746 /* This UNIFORM register is either dead, or has already been demoted
1747 * to a pull const. Mark it as no longer living in the param[] array.
1748 */
1749 push_constant_loc[i] = -1;
1750 continue;
1751 }
1752
1753 if (num_push_constants < max_push_components) {
1754 /* Retain as a push constant. Record the location in the params[]
1755 * array.
1756 */
1757 push_constant_loc[i] = num_push_constants++;
1758 } else {
1759 /* Demote to a pull constant. */
1760 push_constant_loc[i] = -1;
1761
1762 int pull_index = stage_prog_data->nr_pull_params++;
1763 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1764 pull_constant_loc[i] = pull_index;
1765 }
1766 }
1767
1768 stage_prog_data->nr_params = num_push_constants;
1769
1770 /* Up until now, the param[] array has been indexed by reg + reg_offset
1771 * of UNIFORM registers. Condense it to only contain the uniforms we
1772 * chose to upload as push constants.
1773 */
1774 for (unsigned int i = 0; i < uniforms; i++) {
1775 int remapped = push_constant_loc[i];
1776
1777 if (remapped == -1)
1778 continue;
1779
1780 assert(remapped <= (int)i);
1781 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1782 }
1783 }
1784
1785 /**
1786 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1787 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1788 */
1789 void
1790 fs_visitor::demote_pull_constants()
1791 {
1792 foreach_block_and_inst (block, fs_inst, inst, cfg) {
1793 for (int i = 0; i < inst->sources; i++) {
1794 if (inst->src[i].file != UNIFORM)
1795 continue;
1796
1797 int pull_index;
1798 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
1799 if (location >= uniforms) /* Out of bounds access */
1800 pull_index = -1;
1801 else
1802 pull_index = pull_constant_loc[location];
1803
1804 if (pull_index == -1)
1805 continue;
1806
1807          /* Set up the annotation tracking for newly generated instructions. */
1808 const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
1809 .at(block, inst);
1810 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1811 fs_reg dst = vgrf(glsl_type::float_type);
1812
1813 /* Generate a pull load into dst. */
1814 if (inst->src[i].reladdr) {
1815 VARYING_PULL_CONSTANT_LOAD(ibld, dst,
1816 surf_index,
1817 *inst->src[i].reladdr,
1818 pull_index);
1819 inst->src[i].reladdr = NULL;
1820 } else {
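            /* The pull load fetches an aligned vec4, so round the byte offset
             * down to a 16-byte boundary and use a smear on the source to pick
             * the desired dword out of the loaded vec4.
             */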
1821 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1822 ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1823 dst, surf_index, offset);
1824 inst->src[i].set_smear(pull_index & 3);
1825 }
1826
1827 /* Rewrite the instruction to use the temporary VGRF. */
1828 inst->src[i].file = GRF;
1829 inst->src[i].reg = dst.reg;
1830 inst->src[i].reg_offset = 0;
1831 }
1832 }
1833 invalidate_live_intervals();
1834 }
1835
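/**
 * Perform local algebraic simplifications on individual instructions: fold
 * operations on immediates, strip identity operands (a * 1.0, a + 0.0, ...)
 * and rewrite degenerate MUL/ADD/OR/LRP/CMP/SEL/MAD/RCP/BROADCAST forms into
 * simpler opcodes.
 */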
1836 bool
1837 fs_visitor::opt_algebraic()
1838 {
1839 bool progress = false;
1840
1841 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1842 switch (inst->opcode) {
1843 case BRW_OPCODE_MOV:
1844 if (inst->src[0].file != IMM)
1845 break;
1846
1847 if (inst->saturate) {
1848 if (inst->dst.type != inst->src[0].type)
1849 assert(!"unimplemented: saturate mixed types");
1850
1851 if (brw_saturate_immediate(inst->dst.type,
1852 &inst->src[0].fixed_hw_reg)) {
1853 inst->saturate = false;
1854 progress = true;
1855 }
1856 }
1857 break;
1858
1859 case BRW_OPCODE_MUL:
1860 if (inst->src[1].file != IMM)
1861 continue;
1862
1863 /* a * 1.0 = a */
1864 if (inst->src[1].is_one()) {
1865 inst->opcode = BRW_OPCODE_MOV;
1866 inst->src[1] = reg_undef;
1867 progress = true;
1868 break;
1869 }
1870
1871 /* a * -1.0 = -a */
1872 if (inst->src[1].is_negative_one()) {
1873 inst->opcode = BRW_OPCODE_MOV;
1874 inst->src[0].negate = !inst->src[0].negate;
1875 inst->src[1] = reg_undef;
1876 progress = true;
1877 break;
1878 }
1879
1880 /* a * 0.0 = 0.0 */
1881 if (inst->src[1].is_zero()) {
1882 inst->opcode = BRW_OPCODE_MOV;
1883 inst->src[0] = inst->src[1];
1884 inst->src[1] = reg_undef;
1885 progress = true;
1886 break;
1887 }
1888
1889 if (inst->src[0].file == IMM) {
1890 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
1891 inst->opcode = BRW_OPCODE_MOV;
1892 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
1893 inst->src[1] = reg_undef;
1894 progress = true;
1895 break;
1896 }
1897 break;
1898 case BRW_OPCODE_ADD:
1899 if (inst->src[1].file != IMM)
1900 continue;
1901
1902 /* a + 0.0 = a */
1903 if (inst->src[1].is_zero()) {
1904 inst->opcode = BRW_OPCODE_MOV;
1905 inst->src[1] = reg_undef;
1906 progress = true;
1907 break;
1908 }
1909
1910 if (inst->src[0].file == IMM) {
1911 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
1912 inst->opcode = BRW_OPCODE_MOV;
1913 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
1914 inst->src[1] = reg_undef;
1915 progress = true;
1916 break;
1917 }
1918 break;
1919 case BRW_OPCODE_OR:
1920 if (inst->src[0].equals(inst->src[1])) {
1921 inst->opcode = BRW_OPCODE_MOV;
1922 inst->src[1] = reg_undef;
1923 progress = true;
1924 break;
1925 }
1926 break;
1927 case BRW_OPCODE_LRP:
1928 if (inst->src[1].equals(inst->src[2])) {
1929 inst->opcode = BRW_OPCODE_MOV;
1930 inst->src[0] = inst->src[1];
1931 inst->src[1] = reg_undef;
1932 inst->src[2] = reg_undef;
1933 progress = true;
1934 break;
1935 }
1936 break;
1937 case BRW_OPCODE_CMP:
1938 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
1939 inst->src[0].abs &&
1940 inst->src[0].negate &&
1941 inst->src[1].is_zero()) {
1942 inst->src[0].abs = false;
1943 inst->src[0].negate = false;
1944 inst->conditional_mod = BRW_CONDITIONAL_Z;
1945 progress = true;
1946 break;
1947 }
1948 break;
1949 case BRW_OPCODE_SEL:
1950 if (inst->src[0].equals(inst->src[1])) {
1951 inst->opcode = BRW_OPCODE_MOV;
1952 inst->src[1] = reg_undef;
1953 inst->predicate = BRW_PREDICATE_NONE;
1954 inst->predicate_inverse = false;
1955 progress = true;
1956 } else if (inst->saturate && inst->src[1].file == IMM) {
1957 switch (inst->conditional_mod) {
1958 case BRW_CONDITIONAL_LE:
1959 case BRW_CONDITIONAL_L:
1960 switch (inst->src[1].type) {
1961 case BRW_REGISTER_TYPE_F:
1962 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
1963 inst->opcode = BRW_OPCODE_MOV;
1964 inst->src[1] = reg_undef;
1965 inst->conditional_mod = BRW_CONDITIONAL_NONE;
1966 progress = true;
1967 }
1968 break;
1969 default:
1970 break;
1971 }
1972 break;
1973 case BRW_CONDITIONAL_GE:
1974 case BRW_CONDITIONAL_G:
1975 switch (inst->src[1].type) {
1976 case BRW_REGISTER_TYPE_F:
1977 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
1978 inst->opcode = BRW_OPCODE_MOV;
1979 inst->src[1] = reg_undef;
1980 inst->conditional_mod = BRW_CONDITIONAL_NONE;
1981 progress = true;
1982 }
1983 break;
1984 default:
1985 break;
1986 }
1987 default:
1988 break;
1989 }
1990 }
1991 break;
1992 case BRW_OPCODE_MAD:
1993 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
1994 inst->opcode = BRW_OPCODE_MOV;
1995 inst->src[1] = reg_undef;
1996 inst->src[2] = reg_undef;
1997 progress = true;
1998 } else if (inst->src[0].is_zero()) {
1999 inst->opcode = BRW_OPCODE_MUL;
2000 inst->src[0] = inst->src[2];
2001 inst->src[2] = reg_undef;
2002 progress = true;
2003 } else if (inst->src[1].is_one()) {
2004 inst->opcode = BRW_OPCODE_ADD;
2005 inst->src[1] = inst->src[2];
2006 inst->src[2] = reg_undef;
2007 progress = true;
2008 } else if (inst->src[2].is_one()) {
2009 inst->opcode = BRW_OPCODE_ADD;
2010 inst->src[2] = reg_undef;
2011 progress = true;
2012 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2013 inst->opcode = BRW_OPCODE_ADD;
2014 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2015 inst->src[2] = reg_undef;
2016 progress = true;
2017 }
2018 break;
2019 case SHADER_OPCODE_RCP: {
2020 fs_inst *prev = (fs_inst *)inst->prev;
2021 if (prev->opcode == SHADER_OPCODE_SQRT) {
2022 if (inst->src[0].equals(prev->dst)) {
2023 inst->opcode = SHADER_OPCODE_RSQ;
2024 inst->src[0] = prev->src[0];
2025 progress = true;
2026 }
2027 }
2028 break;
2029 }
2030 case SHADER_OPCODE_BROADCAST:
2031 if (is_uniform(inst->src[0])) {
2032 inst->opcode = BRW_OPCODE_MOV;
2033 inst->sources = 1;
2034 inst->force_writemask_all = true;
2035 progress = true;
2036 } else if (inst->src[1].file == IMM) {
2037 inst->opcode = BRW_OPCODE_MOV;
2038 inst->src[0] = component(inst->src[0],
2039 inst->src[1].fixed_hw_reg.dw1.ud);
2040 inst->sources = 1;
2041 inst->force_writemask_all = true;
2042 progress = true;
2043 }
2044 break;
2045
2046 default:
2047 break;
2048 }
2049
2050 /* Swap if src[0] is immediate. */
2051 if (progress && inst->is_commutative()) {
2052 if (inst->src[0].file == IMM) {
2053 fs_reg tmp = inst->src[1];
2054 inst->src[1] = inst->src[0];
2055 inst->src[0] = tmp;
2056 }
2057 }
2058 }
2059 return progress;
2060 }
2061
2062 /**
2063 * Optimize sample messages that have constant zero values for the trailing
2064 * texture coordinates. We can just reduce the message length for these
2065 * instructions instead of reserving a register for it. Trailing parameters
2066 * that aren't sent default to zero anyway. This will cause the dead code
2067 * eliminator to remove the MOV instruction that would otherwise be emitted to
2068 * set up the zero value.
2069 */
2070 bool
2071 fs_visitor::opt_zero_samples()
2072 {
2073 /* Gen4 infers the texturing opcode based on the message length so we can't
2074 * change it.
2075 */
2076 if (devinfo->gen < 5)
2077 return false;
2078
2079 bool progress = false;
2080
2081 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2082 if (!inst->is_tex())
2083 continue;
2084
2085 fs_inst *load_payload = (fs_inst *) inst->prev;
2086
2087 if (load_payload->is_head_sentinel() ||
2088 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2089 continue;
2090
2091 /* We don't want to remove the message header or the first parameter.
2092        * Removing the first parameter is not allowed; see the Haswell PRM
2093 * volume 7, page 149:
2094 *
2095 * "Parameter 0 is required except for the sampleinfo message, which
2096 * has no parameter 0"
2097 */
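      /* Each trailing parameter occupies dispatch_width / 8 registers of the
       * payload, so the LOAD_PAYLOAD source feeding the last parameter sits at
       * index (mlen - header_size) / (dispatch_width / 8) + header_size - 1.
       * Keep trimming whole parameters while that source is a constant zero.
       */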
2098 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2099 load_payload->src[(inst->mlen - inst->header_size) /
2100 (dispatch_width / 8) +
2101 inst->header_size - 1].is_zero()) {
2102 inst->mlen -= dispatch_width / 8;
2103 progress = true;
2104 }
2105 }
2106
2107 if (progress)
2108 invalidate_live_intervals();
2109
2110 return progress;
2111 }
2112
2113 /**
2114 * Optimize sample messages which are followed by the final RT write.
2115 *
2116  * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2117 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2118 * final texturing results copied to the framebuffer write payload and modify
2119 * them to write to the framebuffer directly.
2120 */
2121 bool
2122 fs_visitor::opt_sampler_eot()
2123 {
2124 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2125
2126 if (stage != MESA_SHADER_FRAGMENT)
2127 return false;
2128
2129 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2130 return false;
2131
2132 /* FINISHME: It should be possible to implement this optimization when there
2133 * are multiple drawbuffers.
2134 */
2135 if (key->nr_color_regions != 1)
2136 return false;
2137
2138 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2139 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2140 assert(fb_write->eot);
2141 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2142
2143 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2144
2145 /* There wasn't one; nothing to do. */
2146 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2147 return false;
2148
2149    /* This optimization doesn't seem to work for textureGather for some
2150     * reason. I can't find any documentation or known workarounds to indicate
2151     * that this is expected, but considering that it is probably pretty
2152     * unlikely that a shader would directly write out the results from
2153     * textureGather, we might as well just disable it.
2154 */
2155 if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2156 tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2157 return false;
2158
2159 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2160 * It's very likely to be the previous instruction.
2161 */
2162 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2163 if (load_payload->is_head_sentinel() ||
2164 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2165 return false;
2166
2167 assert(!tex_inst->eot); /* We can't get here twice */
2168 assert((tex_inst->offset & (0xff << 24)) == 0);
2169
2170 tex_inst->offset |= fb_write->target << 24;
2171 tex_inst->eot = true;
2172 tex_inst->dst = bld.null_reg_ud();
2173 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2174
2175 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2176 * to create a new LOAD_PAYLOAD command with the same sources and a space
2177 * saved for the header. Using a new destination register not only makes sure
2178 * we have enough space, but it will make sure the dead code eliminator kills
2179 * the instruction that this will replace.
2180 */
2181 if (tex_inst->header_size != 0)
2182 return true;
2183
2184 fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
2185 load_payload->sources + 1);
2186 fs_reg *new_sources =
2187 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2188
2189 new_sources[0] = fs_reg();
2190 for (int i = 0; i < load_payload->sources; i++)
2191 new_sources[i+1] = load_payload->src[i];
2192
2193 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2194 * requires a lot of information about the sources to appropriately figure
2195     * out the number of registers that need to be used. Given this stage in our
2196 * optimization, we may not have the appropriate GRFs required by
2197 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2198 * manually emit the instruction.
2199 */
2200 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2201 load_payload->exec_size,
2202 send_header,
2203 new_sources,
2204 load_payload->sources + 1);
2205
2206 new_load_payload->regs_written = load_payload->regs_written + 1;
2207 new_load_payload->header_size = 1;
2208 tex_inst->mlen++;
2209 tex_inst->header_size = 1;
2210 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2211 tex_inst->src[0] = send_header;
2212
2213 return true;
2214 }
2215
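/**
 * When a virtual GRF is completely redefined outside of control flow, give
 * the new definition a fresh register and rewrite later uses to match.
 * Splitting the live ranges of unrelated values that happen to share a VGRF
 * helps the other optimization passes and register allocation.
 */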
2216 bool
2217 fs_visitor::opt_register_renaming()
2218 {
2219 bool progress = false;
2220 int depth = 0;
2221
2222 int remap[alloc.count];
2223 memset(remap, -1, sizeof(int) * alloc.count);
2224
2225 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2226 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2227 depth++;
2228 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2229 inst->opcode == BRW_OPCODE_WHILE) {
2230 depth--;
2231 }
2232
2233 /* Rewrite instruction sources. */
2234 for (int i = 0; i < inst->sources; i++) {
2235 if (inst->src[i].file == GRF &&
2236 remap[inst->src[i].reg] != -1 &&
2237 remap[inst->src[i].reg] != inst->src[i].reg) {
2238 inst->src[i].reg = remap[inst->src[i].reg];
2239 progress = true;
2240 }
2241 }
2242
2243 const int dst = inst->dst.reg;
2244
2245 if (depth == 0 &&
2246 inst->dst.file == GRF &&
2247 alloc.sizes[inst->dst.reg] == inst->exec_size / 8 &&
2248 !inst->is_partial_write()) {
2249 if (remap[dst] == -1) {
2250 remap[dst] = dst;
2251 } else {
2252 remap[dst] = alloc.allocate(inst->exec_size / 8);
2253 inst->dst.reg = remap[dst];
2254 progress = true;
2255 }
2256 } else if (inst->dst.file == GRF &&
2257 remap[dst] != -1 &&
2258 remap[dst] != dst) {
2259 inst->dst.reg = remap[dst];
2260 progress = true;
2261 }
2262 }
2263
2264 if (progress) {
2265 invalidate_live_intervals();
2266
2267 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2268 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2269 delta_xy[i].reg = remap[delta_xy[i].reg];
2270 }
2271 }
2272 }
2273
2274 return progress;
2275 }
2276
2277 /**
2278 * Remove redundant or useless discard jumps.
2279 *
2280 * For example, we can eliminate jumps in the following sequence:
2281 *
2282 * discard-jump (redundant with the next jump)
2283 * discard-jump (useless; jumps to the next instruction)
2284 * placeholder-halt
2285 */
2286 bool
2287 fs_visitor::opt_redundant_discard_jumps()
2288 {
2289 bool progress = false;
2290
2291 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2292
2293 fs_inst *placeholder_halt = NULL;
2294 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2295 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2296 placeholder_halt = inst;
2297 break;
2298 }
2299 }
2300
2301 if (!placeholder_halt)
2302 return false;
2303
2304    /* Delete any discard jumps (HALTs) immediately before the placeholder halt. */
2305 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2306 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2307 prev = (fs_inst *) placeholder_halt->prev) {
2308 prev->remove(last_bblock);
2309 progress = true;
2310 }
2311
2312 if (progress)
2313 invalidate_live_intervals();
2314
2315 return progress;
2316 }
2317
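/**
 * On Gen < 7, look for a MOV from a GRF to an MRF where the GRF is not read
 * again afterwards, and try to make the instruction that computed the GRF
 * write directly into the MRF instead, eliminating the copy
 * (compute-to-MRF).
 */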
2318 bool
2319 fs_visitor::compute_to_mrf()
2320 {
2321 bool progress = false;
2322 int next_ip = 0;
2323
2324 /* No MRFs on Gen >= 7. */
2325 if (devinfo->gen >= 7)
2326 return false;
2327
2328 calculate_live_intervals();
2329
2330 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2331 int ip = next_ip;
2332 next_ip++;
2333
2334 if (inst->opcode != BRW_OPCODE_MOV ||
2335 inst->is_partial_write() ||
2336 inst->dst.file != MRF || inst->src[0].file != GRF ||
2337 inst->dst.type != inst->src[0].type ||
2338 inst->src[0].abs || inst->src[0].negate ||
2339 !inst->src[0].is_contiguous() ||
2340 inst->src[0].subreg_offset)
2341 continue;
2342
2343 /* Work out which hardware MRF registers are written by this
2344 * instruction.
2345 */
2346 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2347 int mrf_high;
2348 if (inst->dst.reg & BRW_MRF_COMPR4) {
2349 mrf_high = mrf_low + 4;
2350 } else if (inst->exec_size == 16) {
2351 mrf_high = mrf_low + 1;
2352 } else {
2353 mrf_high = mrf_low;
2354 }
2355
2356 /* Can't compute-to-MRF this GRF if someone else was going to
2357 * read it later.
2358 */
2359 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2360 continue;
2361
2362       /* Found a move of a GRF to an MRF.  Let's see if we can
2363        * rewrite the instruction that produced this GRF to write into the MRF.
2364 */
2365 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2366 if (scan_inst->dst.file == GRF &&
2367 scan_inst->dst.reg == inst->src[0].reg) {
2368             /* Found the last write of the register we want to turn
2369              * into a compute-to-MRF.
2370 */
2371
2372 /* If this one instruction didn't populate all the
2373 * channels, bail. We might be able to rewrite everything
2374 * that writes that reg, but it would require smarter
2375 * tracking to delay the rewriting until complete success.
2376 */
2377 if (scan_inst->is_partial_write())
2378 break;
2379
2380             /* Instructions that write more than one register would require
2381              * coalescing more than one MOV at a time, which we don't handle.
2382 */
2383 if (scan_inst->regs_written > scan_inst->exec_size / 8)
2384 break;
2385
2386 /* SEND instructions can't have MRF as a destination. */
2387 if (scan_inst->mlen)
2388 break;
2389
2390 if (devinfo->gen == 6) {
2391 /* gen6 math instructions must have the destination be
2392 * GRF, so no compute-to-MRF for them.
2393 */
2394 if (scan_inst->is_math()) {
2395 break;
2396 }
2397 }
2398
2399 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2400 /* Found the creator of our MRF's source value. */
2401 scan_inst->dst.file = MRF;
2402 scan_inst->dst.reg = inst->dst.reg;
2403 scan_inst->saturate |= inst->saturate;
2404 inst->remove(block);
2405 progress = true;
2406 }
2407 break;
2408 }
2409
2410          /* We don't handle control flow here.  Most computations of
2411           * values that end up in MRFs happen shortly before the MRF
2412           * write anyway.
2413 */
2414 if (block->start() == scan_inst)
2415 break;
2416
2417 /* You can't read from an MRF, so if someone else reads our
2418 * MRF's source GRF that we wanted to rewrite, that stops us.
2419 */
2420 bool interfered = false;
2421 for (int i = 0; i < scan_inst->sources; i++) {
2422 if (scan_inst->src[i].file == GRF &&
2423 scan_inst->src[i].reg == inst->src[0].reg &&
2424 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2425 interfered = true;
2426 }
2427 }
2428 if (interfered)
2429 break;
2430
2431 if (scan_inst->dst.file == MRF) {
2432 /* If somebody else writes our MRF here, we can't
2433 * compute-to-MRF before that.
2434 */
2435 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2436 int scan_mrf_high;
2437
2438 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2439 scan_mrf_high = scan_mrf_low + 4;
2440 } else if (scan_inst->exec_size == 16) {
2441 scan_mrf_high = scan_mrf_low + 1;
2442 } else {
2443 scan_mrf_high = scan_mrf_low;
2444 }
2445
2446 if (mrf_low == scan_mrf_low ||
2447 mrf_low == scan_mrf_high ||
2448 mrf_high == scan_mrf_low ||
2449 mrf_high == scan_mrf_high) {
2450 break;
2451 }
2452 }
2453
2454 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2455 /* Found a SEND instruction, which means that there are
2456 * live values in MRFs from base_mrf to base_mrf +
2457 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2458 * above it.
2459 */
2460 if (mrf_low >= scan_inst->base_mrf &&
2461 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2462 break;
2463 }
2464 if (mrf_high >= scan_inst->base_mrf &&
2465 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2466 break;
2467 }
2468 }
2469 }
2470 }
2471
2472 if (progress)
2473 invalidate_live_intervals();
2474
2475 return progress;
2476 }
2477
2478 /**
2479 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2480 * flow. We could probably do better here with some form of divergence
2481 * analysis.
2482 */
2483 bool
2484 fs_visitor::eliminate_find_live_channel()
2485 {
2486 bool progress = false;
2487 unsigned depth = 0;
2488
2489 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2490 switch (inst->opcode) {
2491 case BRW_OPCODE_IF:
2492 case BRW_OPCODE_DO:
2493 depth++;
2494 break;
2495
2496 case BRW_OPCODE_ENDIF:
2497 case BRW_OPCODE_WHILE:
2498 depth--;
2499 break;
2500
2501 case FS_OPCODE_DISCARD_JUMP:
2502 /* This can potentially make control flow non-uniform until the end
2503 * of the program.
2504 */
2505 return progress;
2506
2507 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2508 if (depth == 0) {
2509 inst->opcode = BRW_OPCODE_MOV;
2510 inst->src[0] = fs_reg(0);
2511 inst->sources = 1;
2512 inst->force_writemask_all = true;
2513 progress = true;
2514 }
2515 break;
2516
2517 default:
2518 break;
2519 }
2520 }
2521
2522 return progress;
2523 }
2524
2525 /**
2526 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2527 * instructions to FS_OPCODE_REP_FB_WRITE.
2528 */
2529 void
2530 fs_visitor::emit_repclear_shader()
2531 {
2532 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2533 int base_mrf = 1;
2534 int color_mrf = base_mrf + 2;
2535
2536 fs_inst *mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)),
2537 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
2538
2539 fs_inst *write;
2540 if (key->nr_color_regions == 1) {
2541 write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2542 write->saturate = key->clamp_fragment_color;
2543 write->base_mrf = color_mrf;
2544 write->target = 0;
2545 write->header_size = 0;
2546 write->mlen = 1;
2547 } else {
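      /* With multiple color regions, each FB write carries a two-register
       * message header in front of the replicated color, so the message
       * starts at base_mrf and is three registers long.
       */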
2548 assume(key->nr_color_regions > 0);
2549 for (int i = 0; i < key->nr_color_regions; ++i) {
2550 write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2551 write->saturate = key->clamp_fragment_color;
2552 write->base_mrf = base_mrf;
2553 write->target = i;
2554 write->header_size = 2;
2555 write->mlen = 3;
2556 }
2557 }
2558 write->eot = true;
2559
2560 calculate_cfg();
2561
2562 assign_constant_locations();
2563 assign_curb_setup();
2564
2565 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2566 assert(mov->src[0].file == HW_REG);
2567 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2568 }
2569
2570 /**
2571 * Walks through basic blocks, looking for repeated MRF writes and
2572 * removing the later ones.
2573 */
2574 bool
2575 fs_visitor::remove_duplicate_mrf_writes()
2576 {
2577 fs_inst *last_mrf_move[16];
2578 bool progress = false;
2579
2580 /* Need to update the MRF tracking for compressed instructions. */
2581 if (dispatch_width == 16)
2582 return false;
2583
2584 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2585
2586 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2587 if (inst->is_control_flow()) {
2588 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2589 }
2590
2591 if (inst->opcode == BRW_OPCODE_MOV &&
2592 inst->dst.file == MRF) {
2593 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2594 if (prev_inst && inst->equals(prev_inst)) {
2595 inst->remove(block);
2596 progress = true;
2597 continue;
2598 }
2599 }
2600
2601 /* Clear out the last-write records for MRFs that were overwritten. */
2602 if (inst->dst.file == MRF) {
2603 last_mrf_move[inst->dst.reg] = NULL;
2604 }
2605
2606 if (inst->mlen > 0 && inst->base_mrf != -1) {
2607 /* Found a SEND instruction, which will include two or fewer
2608 * implied MRF writes. We could do better here.
2609 */
2610 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2611 last_mrf_move[inst->base_mrf + i] = NULL;
2612 }
2613 }
2614
2615 /* Clear out any MRF move records whose sources got overwritten. */
2616 if (inst->dst.file == GRF) {
2617 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2618 if (last_mrf_move[i] &&
2619 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2620 last_mrf_move[i] = NULL;
2621 }
2622 }
2623 }
2624
2625 if (inst->opcode == BRW_OPCODE_MOV &&
2626 inst->dst.file == MRF &&
2627 inst->src[0].file == GRF &&
2628 !inst->is_partial_write()) {
2629 last_mrf_move[inst->dst.reg] = inst;
2630 }
2631 }
2632
2633 if (progress)
2634 invalidate_live_intervals();
2635
2636 return progress;
2637 }
2638
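/**
 * Clear the needs-dependency flags for any registers in the range
 * [first_grf, first_grf + grf_len) that @inst reads as a source, covering
 * both halves for SIMD16 execution.
 */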
2639 static void
2640 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2641 {
2642 /* Clear the flag for registers that actually got read (as expected). */
2643 for (int i = 0; i < inst->sources; i++) {
2644 int grf;
2645 if (inst->src[i].file == GRF) {
2646 grf = inst->src[i].reg;
2647 } else if (inst->src[i].file == HW_REG &&
2648 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2649 grf = inst->src[i].fixed_hw_reg.nr;
2650 } else {
2651 continue;
2652 }
2653
2654 if (grf >= first_grf &&
2655 grf < first_grf + grf_len) {
2656 deps[grf - first_grf] = false;
2657 if (inst->exec_size == 16)
2658 deps[grf - first_grf + 1] = false;
2659 }
2660 }
2661 }
2662
2663 /**
2664 * Implements this workaround for the original 965:
2665 *
2666 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2667 * check for post destination dependencies on this instruction, software
2668 * must ensure that there is no destination hazard for the case of ‘write
2669 * followed by a posted write’ shown in the following example.
2670 *
2671 * 1. mov r3 0
2672 * 2. send r3.xy <rest of send instruction>
2673 * 3. mov r2 r3
2674 *
2675 * Due to no post-destination dependency check on the ‘send’, the above
2676 * code sequence could have two instructions (1 and 2) in flight at the
2677 * same time that both consider ‘r3’ as the target of their final writes.
2678 */
2679 void
2680 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2681 fs_inst *inst)
2682 {
2683 int write_len = inst->regs_written;
2684 int first_write_grf = inst->dst.reg;
2685 bool needs_dep[BRW_MAX_MRF];
2686 assert(write_len < (int)sizeof(needs_dep) - 1);
2687
2688 memset(needs_dep, false, sizeof(needs_dep));
2689 memset(needs_dep, true, write_len);
2690
2691 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2692
2693 /* Walk backwards looking for writes to registers we're writing which
2694 * aren't read since being written. If we hit the start of the program,
2695 * we assume that there are no outstanding dependencies on entry to the
2696 * program.
2697 */
2698 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2699 /* If we hit control flow, assume that there *are* outstanding
2700 * dependencies, and force their cleanup before our instruction.
2701 */
2702 if (block->start() == scan_inst) {
2703 for (int i = 0; i < write_len; i++) {
2704 if (needs_dep[i])
2705 DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
2706 }
2707 return;
2708 }
2709
2710 /* We insert our reads as late as possible on the assumption that any
2711 * instruction but a MOV that might have left us an outstanding
2712 * dependency has more latency than a MOV.
2713 */
2714 if (scan_inst->dst.file == GRF) {
2715 for (int i = 0; i < scan_inst->regs_written; i++) {
2716 int reg = scan_inst->dst.reg + i;
2717
2718 if (reg >= first_write_grf &&
2719 reg < first_write_grf + write_len &&
2720 needs_dep[reg - first_write_grf]) {
2721 DEP_RESOLVE_MOV(bld.at(block, inst), reg);
2722 needs_dep[reg - first_write_grf] = false;
2723 if (scan_inst->exec_size == 16)
2724 needs_dep[reg - first_write_grf + 1] = false;
2725 }
2726 }
2727 }
2728
2729 /* Clear the flag for registers that actually got read (as expected). */
2730 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2731
2732 /* Continue the loop only if we haven't resolved all the dependencies */
2733 int i;
2734 for (i = 0; i < write_len; i++) {
2735 if (needs_dep[i])
2736 break;
2737 }
2738 if (i == write_len)
2739 return;
2740 }
2741 }
2742
2743 /**
2744 * Implements this workaround for the original 965:
2745 *
2746 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2747 * used as a destination register until after it has been sourced by an
2748  *    instruction with a different destination register."
2749 */
2750 void
2751 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2752 {
2753 int write_len = inst->regs_written;
2754 int first_write_grf = inst->dst.reg;
2755 bool needs_dep[BRW_MAX_MRF];
2756 assert(write_len < (int)sizeof(needs_dep) - 1);
2757
2758 memset(needs_dep, false, sizeof(needs_dep));
2759 memset(needs_dep, true, write_len);
2760 /* Walk forwards looking for writes to registers we're writing which aren't
2761 * read before being written.
2762 */
2763 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2764 /* If we hit control flow, force resolve all remaining dependencies. */
2765 if (block->end() == scan_inst) {
2766 for (int i = 0; i < write_len; i++) {
2767 if (needs_dep[i])
2768 DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
2769 }
2770 return;
2771 }
2772
2773 /* Clear the flag for registers that actually got read (as expected). */
2774 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2775
2776 /* We insert our reads as late as possible since they're reading the
2777 * result of a SEND, which has massive latency.
2778 */
2779 if (scan_inst->dst.file == GRF &&
2780 scan_inst->dst.reg >= first_write_grf &&
2781 scan_inst->dst.reg < first_write_grf + write_len &&
2782 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2783 DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
2784 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2785 }
2786
2787 /* Continue the loop only if we haven't resolved all the dependencies */
2788 int i;
2789 for (i = 0; i < write_len; i++) {
2790 if (needs_dep[i])
2791 break;
2792 }
2793 if (i == write_len)
2794 return;
2795 }
2796 }
2797
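/**
 * On original Gen4 (but not G4x), apply both the pre- and post-SEND
 * dependency workarounds to every send-like instruction (mlen != 0) that
 * writes a GRF destination.
 */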
2798 void
2799 fs_visitor::insert_gen4_send_dependency_workarounds()
2800 {
2801 if (devinfo->gen != 4 || devinfo->is_g4x)
2802 return;
2803
2804 bool progress = false;
2805
2806 /* Note that we're done with register allocation, so GRF fs_regs always
2807 * have a .reg_offset of 0.
2808 */
2809
2810 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2811 if (inst->mlen != 0 && inst->dst.file == GRF) {
2812 insert_gen4_pre_send_dependency_workarounds(block, inst);
2813 insert_gen4_post_send_dependency_workarounds(block, inst);
2814 progress = true;
2815 }
2816 }
2817
2818 if (progress)
2819 invalidate_live_intervals();
2820 }
2821
2822 /**
2823 * Turns the generic expression-style uniform pull constant load instruction
2824 * into a hardware-specific series of instructions for loading a pull
2825 * constant.
2826 *
2827 * The expression style allows the CSE pass before this to optimize out
2828 * repeated loads from the same offset, and gives the pre-register-allocation
2829 * scheduling full flexibility, while the conversion to native instructions
2830 * allows the post-register-allocation scheduler the best information
2831 * possible.
2832 *
2833 * Note that execution masking for setting up pull constant loads is special:
2834 * the channels that need to be written are unrelated to the current execution
2835 * mask, since a later instruction will use one of the result channels as a
2836 * source operand for all 8 or 16 of its channels.
2837 */
2838 void
2839 fs_visitor::lower_uniform_pull_constant_loads()
2840 {
2841 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2842 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2843 continue;
2844
2845 if (devinfo->gen >= 7) {
2846 /* The offset arg before was a vec4-aligned byte offset. We need to
2847 * turn it into a dword offset.
2848 */
2849 fs_reg const_offset_reg = inst->src[1];
2850 assert(const_offset_reg.file == IMM &&
2851 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2852 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2853
2854 fs_reg payload, offset;
2855 if (devinfo->gen >= 9) {
2856 /* We have to use a message header on Skylake to get SIMD4x2
2857 * mode. Reserve space for the register.
2858 */
2859 offset = payload = fs_reg(GRF, alloc.allocate(2));
2860 offset.reg_offset++;
2861 inst->mlen = 2;
2862 } else {
2863 offset = payload = fs_reg(GRF, alloc.allocate(1));
2864 inst->mlen = 1;
2865 }
2866
2867 /* This is actually going to be a MOV, but since only the first dword
2868 * is accessed, we have a special opcode to do just that one. Note
2869 * that this needs to be an operation that will be considered a def
2870 * by live variable analysis, or register allocation will explode.
2871 */
2872 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2873 8, offset, const_offset_reg);
2874 setup->force_writemask_all = true;
2875
2876 setup->ir = inst->ir;
2877 setup->annotation = inst->annotation;
2878 inst->insert_before(block, setup);
2879
2880 /* Similarly, this will only populate the first 4 channels of the
2881 * result register (since we only use smear values from 0-3), but we
2882 * don't tell the optimizer.
2883 */
2884 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2885 inst->src[1] = payload;
2886 inst->base_mrf = -1;
2887
2888 invalidate_live_intervals();
2889 } else {
2890 /* Before register allocation, we didn't tell the scheduler about the
2891 * MRF we use. We know it's safe to use this MRF because nothing
2892 * else does except for register spill/unspill, which generates and
2893 * uses its MRF within a single IR instruction.
2894 */
2895 inst->base_mrf = 14;
2896 inst->mlen = 1;
2897 }
2898 }
2899 }
2900
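/**
 * Lower SHADER_OPCODE_LOAD_PAYLOAD into a series of MOVs that assemble the
 * message payload in place: header sources are copied with SIMD8
 * force_writemask_all MOVs, and the remaining sources are copied at the
 * instruction's execution size, including the interleaved COMPR4 layout
 * used for gen <= 5 framebuffer writes.
 */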
2901 bool
2902 fs_visitor::lower_load_payload()
2903 {
2904 bool progress = false;
2905
2906 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2907 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2908 continue;
2909
2910 assert(inst->dst.file == MRF || inst->dst.file == GRF);
2911 assert(inst->saturate == false);
2912 fs_reg dst = inst->dst;
2913
2914 /* Get rid of COMPR4. We'll add it back in if we need it */
2915 if (dst.file == MRF)
2916 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
2917
2918 const fs_builder hbld = bld.group(8, 0).exec_all().at(block, inst);
2919
2920 for (uint8_t i = 0; i < inst->header_size; i++) {
2921 if (inst->src[i].file != BAD_FILE) {
2922 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
2923 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
2924 hbld.MOV(mov_dst, mov_src);
2925 }
2926 dst = offset(dst, hbld, 1);
2927 }
2928
2929 const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
2930 .exec_all(inst->force_writemask_all)
2931 .at(block, inst);
2932
2933 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
2934 inst->exec_size > 8) {
2935 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
2936 * a straightforward copy. Instead, the result of the
2937 * LOAD_PAYLOAD is treated as interleaved and the first four
2938 * non-header sources are unpacked as:
2939 *
2940 * m + 0: r0
2941 * m + 1: g0
2942 * m + 2: b0
2943 * m + 3: a0
2944 * m + 4: r1
2945 * m + 5: g1
2946 * m + 6: b1
2947 * m + 7: a1
2948 *
2949 * This is used for gen <= 5 fb writes.
2950 */
2951 assert(inst->exec_size == 16);
2952 assert(inst->header_size + 4 <= inst->sources);
2953 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
2954 if (inst->src[i].file != BAD_FILE) {
2955 if (devinfo->has_compr4) {
2956 fs_reg compr4_dst = retype(dst, inst->src[i].type);
2957 compr4_dst.reg |= BRW_MRF_COMPR4;
2958 ibld.MOV(compr4_dst, inst->src[i]);
2959 } else {
2960 /* Platform doesn't have COMPR4. We have to fake it */
2961 fs_reg mov_dst = retype(dst, inst->src[i].type);
2962 ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
2963 mov_dst.reg += 4;
2964 ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
2965 }
2966 }
2967
2968 dst.reg++;
2969 }
2970
2971 /* The loop above only ever incremented us through the first set
2972 * of 4 registers. However, thanks to the magic of COMPR4, we
2973 * actually wrote to the first 8 registers, so we need to take
2974 * that into account now.
2975 */
2976 dst.reg += 4;
2977
2978 /* The COMPR4 code took care of the first 4 sources. We'll let
2979 * the regular path handle any remaining sources. Yes, we are
2980 * modifying the instruction but we're about to delete it so
2981 * this really doesn't hurt anything.
2982 */
2983 inst->header_size += 4;
2984 }
2985
2986 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
2987 if (inst->src[i].file != BAD_FILE)
2988 ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
2989 dst = offset(dst, ibld, 1);
2990 }
2991
2992 inst->remove(block);
2993 progress = true;
2994 }
2995
2996 if (progress)
2997 invalidate_live_intervals();
2998
2999 return progress;
3000 }
3001
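/**
 * Lower 32-bit integer MULs on hardware that can't do a full
 * 32-bit x 32-bit -> 32-bit multiply in one instruction (Gen < 8 and
 * Cherryview): multiplies by a small immediate become a single MUL with the
 * operands in the right order, and the general case becomes a pair of
 * 32-bit x 16-bit multiplies whose results are combined with an ADD.
 */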
3002 bool
3003 fs_visitor::lower_integer_multiplication()
3004 {
3005 bool progress = false;
3006
3007 /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3008 * directly, but Cherryview cannot.
3009 */
3010 if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3011 return false;
3012
3013 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3014 if (inst->opcode != BRW_OPCODE_MUL ||
3015 inst->dst.is_accumulator() ||
3016 (inst->dst.type != BRW_REGISTER_TYPE_D &&
3017 inst->dst.type != BRW_REGISTER_TYPE_UD))
3018 continue;
3019
3020 const fs_builder ibld = bld.at(block, inst);
3021
3022 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3023 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3024 * src1 are used.
3025 *
3026 * If multiplying by an immediate value that fits in 16-bits, do a
3027 * single MUL instruction with that value in the proper location.
3028 */
3029 if (inst->src[1].file == IMM &&
3030 inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3031 if (devinfo->gen < 7) {
3032 fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3033 inst->dst.type);
3034 ibld.MOV(imm, inst->src[1]);
3035 ibld.MUL(inst->dst, imm, inst->src[0]);
3036 } else {
3037 ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
3038 }
3039 } else {
3040 /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3041 * do 32-bit integer multiplication in one instruction, but instead
3042 * must do a sequence (which actually calculates a 64-bit result):
3043 *
3044 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3045 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3046 * mov(8) g2<1>D acc0<8,8,1>D
3047 *
3048          * But on Gen > 6, the ability to use the second accumulator register
3049 * (acc1) for non-float data types was removed, preventing a simple
3050 * implementation in SIMD16. A 16-channel result can be calculated by
3051 * executing the three instructions twice in SIMD8, once with quarter
3052 * control of 1Q for the first eight channels and again with 2Q for
3053 * the second eight channels.
3054 *
3055 * Which accumulator register is implicitly accessed (by AccWrEnable
3056 * for instance) is determined by the quarter control. Unfortunately
3057 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3058 * implicit accumulator access by an instruction with 2Q will access
3059 * acc1 regardless of whether the data type is usable in acc1.
3060 *
3061 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3062 * integer data types.
3063 *
3064 * Since we only want the low 32-bits of the result, we can do two
3065 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3066 * adjust the high result and add them (like the mach is doing):
3067 *
3068 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3069 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3070 * shl(8) g9<1>D g8<8,8,1>D 16D
3071 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3072 *
3073 * We avoid the shl instruction by realizing that we only want to add
3074 * the low 16-bits of the "high" result to the high 16-bits of the
3075 * "low" result and using proper regioning on the add:
3076 *
3077 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3078 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3079 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3080 *
3081 * Since it does not use the (single) accumulator register, we can
3082 * schedule multi-component multiplications much better.
3083 */
3084
3085 if (inst->conditional_mod && inst->dst.is_null()) {
3086 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3087 inst->dst.type);
3088 }
3089 fs_reg low = inst->dst;
3090 fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3091 inst->dst.type);
3092
3093 if (devinfo->gen >= 7) {
3094 fs_reg src1_0_w = inst->src[1];
3095 fs_reg src1_1_w = inst->src[1];
3096
3097 if (inst->src[1].file == IMM) {
3098 src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3099 src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3100 } else {
3101 src1_0_w.type = BRW_REGISTER_TYPE_UW;
3102 if (src1_0_w.stride != 0) {
3103 assert(src1_0_w.stride == 1);
3104 src1_0_w.stride = 2;
3105 }
3106
3107 src1_1_w.type = BRW_REGISTER_TYPE_UW;
3108 if (src1_1_w.stride != 0) {
3109 assert(src1_1_w.stride == 1);
3110 src1_1_w.stride = 2;
3111 }
3112 src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3113 }
3114 ibld.MUL(low, inst->src[0], src1_0_w);
3115 ibld.MUL(high, inst->src[0], src1_1_w);
3116 } else {
3117 fs_reg src0_0_w = inst->src[0];
3118 fs_reg src0_1_w = inst->src[0];
3119
3120 src0_0_w.type = BRW_REGISTER_TYPE_UW;
3121 if (src0_0_w.stride != 0) {
3122 assert(src0_0_w.stride == 1);
3123 src0_0_w.stride = 2;
3124 }
3125
3126 src0_1_w.type = BRW_REGISTER_TYPE_UW;
3127 if (src0_1_w.stride != 0) {
3128 assert(src0_1_w.stride == 1);
3129 src0_1_w.stride = 2;
3130 }
3131 src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3132
3133 ibld.MUL(low, src0_0_w, inst->src[1]);
3134 ibld.MUL(high, src0_1_w, inst->src[1]);
3135 }
3136
3137 fs_reg dst = inst->dst;
3138 dst.type = BRW_REGISTER_TYPE_UW;
3139 dst.subreg_offset = 2;
3140 dst.stride = 2;
3141
3142 high.type = BRW_REGISTER_TYPE_UW;
3143 high.stride = 2;
3144
3145 low.type = BRW_REGISTER_TYPE_UW;
3146 low.subreg_offset = 2;
3147 low.stride = 2;
3148
3149 ibld.ADD(dst, low, high);
3150
3151 if (inst->conditional_mod) {
3152 fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
3153 set_condmod(inst->conditional_mod,
3154 ibld.MOV(null, inst->dst));
3155 }
3156 }
3157
3158 inst->remove(block);
3159 progress = true;
3160 }
3161
3162 if (progress)
3163 invalidate_live_intervals();
3164
3165 return progress;
3166 }
3167
3168 void
3169 fs_visitor::dump_instructions()
3170 {
3171 dump_instructions(NULL);
3172 }
3173
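/**
 * Dump the instruction list to the file named by @name (when given and not
 * running as root; otherwise stderr).  When a CFG is available, each
 * instruction is annotated with the register pressure at its IP.
 */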
3174 void
3175 fs_visitor::dump_instructions(const char *name)
3176 {
3177 FILE *file = stderr;
3178 if (name && geteuid() != 0) {
3179 file = fopen(name, "w");
3180 if (!file)
3181 file = stderr;
3182 }
3183
3184 if (cfg) {
3185 calculate_register_pressure();
3186 int ip = 0, max_pressure = 0;
3187 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3188 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3189 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3190 dump_instruction(inst, file);
3191 ip++;
3192 }
3193 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3194 } else {
3195 int ip = 0;
3196 foreach_in_list(backend_instruction, inst, &instructions) {
3197 fprintf(file, "%4d: ", ip++);
3198 dump_instruction(inst, file);
3199 }
3200 }
3201
3202 if (file != stderr) {
3203 fclose(file);
3204 }
3205 }
3206
3207 void
3208 fs_visitor::dump_instruction(backend_instruction *be_inst)
3209 {
3210 dump_instruction(be_inst, stderr);
3211 }
3212
3213 void
3214 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3215 {
3216 fs_inst *inst = (fs_inst *)be_inst;
3217
3218 if (inst->predicate) {
3219 fprintf(file, "(%cf0.%d) ",
3220 inst->predicate_inverse ? '-' : '+',
3221 inst->flag_subreg);
3222 }
3223
3224 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3225 if (inst->saturate)
3226 fprintf(file, ".sat");
3227 if (inst->conditional_mod) {
3228 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3229 if (!inst->predicate &&
3230 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3231 inst->opcode != BRW_OPCODE_IF &&
3232 inst->opcode != BRW_OPCODE_WHILE))) {
3233 fprintf(file, ".f0.%d", inst->flag_subreg);
3234 }
3235 }
3236 fprintf(file, "(%d) ", inst->exec_size);
3237
3238 if (inst->mlen) {
3239 fprintf(file, "(mlen: %d) ", inst->mlen);
3240 }
3241
3242 switch (inst->dst.file) {
3243 case GRF:
3244 fprintf(file, "vgrf%d", inst->dst.reg);
3245 if (alloc.sizes[inst->dst.reg] != inst->regs_written ||
3246 inst->dst.subreg_offset)
3247 fprintf(file, "+%d.%d",
3248 inst->dst.reg_offset, inst->dst.subreg_offset);
3249 break;
3250 case MRF:
3251 fprintf(file, "m%d", inst->dst.reg);
3252 break;
3253 case BAD_FILE:
3254 fprintf(file, "(null)");
3255 break;
3256 case UNIFORM:
3257 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3258 break;
3259 case ATTR:
3260 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3261 break;
3262 case HW_REG:
3263 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3264 switch (inst->dst.fixed_hw_reg.nr) {
3265 case BRW_ARF_NULL:
3266 fprintf(file, "null");
3267 break;
3268 case BRW_ARF_ADDRESS:
3269 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3270 break;
3271 case BRW_ARF_ACCUMULATOR:
3272 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3273 break;
3274 case BRW_ARF_FLAG:
3275 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3276 inst->dst.fixed_hw_reg.subnr);
3277 break;
3278 default:
3279 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3280 inst->dst.fixed_hw_reg.subnr);
3281 break;
3282 }
3283 } else {
3284 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3285 }
3286 if (inst->dst.fixed_hw_reg.subnr)
3287 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3288 break;
3289 default:
3290 fprintf(file, "???");
3291 break;
3292 }
3293 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3294
3295 for (int i = 0; i < inst->sources; i++) {
3296 if (inst->src[i].negate)
3297 fprintf(file, "-");
3298 if (inst->src[i].abs)
3299 fprintf(file, "|");
3300 switch (inst->src[i].file) {
3301 case GRF:
3302 fprintf(file, "vgrf%d", inst->src[i].reg);
3303 if (alloc.sizes[inst->src[i].reg] != (unsigned)inst->regs_read(i) ||
3304 inst->src[i].subreg_offset)
3305 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3306 inst->src[i].subreg_offset);
3307 break;
3308 case MRF:
3309 fprintf(file, "***m%d***", inst->src[i].reg);
3310 break;
3311 case ATTR:
3312 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3313 break;
3314 case UNIFORM:
3315 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3316 if (inst->src[i].reladdr) {
3317 fprintf(file, "+reladdr");
3318 } else if (inst->src[i].subreg_offset) {
3319 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3320 inst->src[i].subreg_offset);
3321 }
3322 break;
3323 case BAD_FILE:
3324 fprintf(file, "(null)");
3325 break;
3326 case IMM:
3327 switch (inst->src[i].type) {
3328 case BRW_REGISTER_TYPE_F:
3329 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3330 break;
3331 case BRW_REGISTER_TYPE_W:
3332 case BRW_REGISTER_TYPE_D:
3333 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3334 break;
3335 case BRW_REGISTER_TYPE_UW:
3336 case BRW_REGISTER_TYPE_UD:
3337 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3338 break;
3339 case BRW_REGISTER_TYPE_VF:
3340 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3341 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3342 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3343 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3344 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3345 break;
3346 default:
3347 fprintf(file, "???");
3348 break;
3349 }
3350 break;
3351 case HW_REG:
3352 if (inst->src[i].fixed_hw_reg.negate)
3353 fprintf(file, "-");
3354 if (inst->src[i].fixed_hw_reg.abs)
3355 fprintf(file, "|");
3356 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3357 switch (inst->src[i].fixed_hw_reg.nr) {
3358 case BRW_ARF_NULL:
3359 fprintf(file, "null");
3360 break;
3361 case BRW_ARF_ADDRESS:
3362 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3363 break;
3364 case BRW_ARF_ACCUMULATOR:
3365 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3366 break;
3367 case BRW_ARF_FLAG:
3368 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3369 inst->src[i].fixed_hw_reg.subnr);
3370 break;
3371 default:
3372 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3373 inst->src[i].fixed_hw_reg.subnr);
3374 break;
3375 }
3376 } else {
3377 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3378 }
3379 if (inst->src[i].fixed_hw_reg.subnr)
3380 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3381 if (inst->src[i].fixed_hw_reg.abs)
3382 fprintf(file, "|");
3383 break;
3384 default:
3385 fprintf(file, "???");
3386 break;
3387 }
3388 if (inst->src[i].abs)
3389 fprintf(file, "|");
3390
3391 if (inst->src[i].file != IMM) {
3392 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3393 }
3394
3395 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3396 fprintf(file, ", ");
3397 }
3398
3399 fprintf(file, " ");
3400
3401 if (dispatch_width == 16 && inst->exec_size == 8) {
3402 if (inst->force_sechalf)
3403 fprintf(file, "2ndhalf ");
3404 else
3405 fprintf(file, "1sthalf ");
3406 }
3407
3408 fprintf(file, "\n");
3409 }
3410
3411 /**
3412 * Possibly returns an instruction that set up @param reg.
3413 *
3414 * Sometimes we want to take the result of some expression/variable
3415 * dereference tree and rewrite the instruction generating the result
3416 * of the tree. When processing the tree, we know that the
3417 * instructions generated are all writing temporaries that are dead
3418 * outside of this tree. So, if we have some instructions that write
3419 * a temporary, we're free to point that temp write somewhere else.
3420 *
3421  * Note that this doesn't guarantee that the returned instruction wrote
3422  * only reg -- it might be the size=4 destination of a texture instruction.
3423 */
3424 fs_inst *
3425 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3426 fs_inst *end,
3427 const fs_reg &reg)
3428 {
3429 if (end == start ||
3430 end->is_partial_write() ||
3431 reg.reladdr ||
3432 !reg.equals(end->dst)) {
3433 return NULL;
3434 } else {
3435 return end;
3436 }
3437 }
3438
3439 void
3440 fs_visitor::setup_payload_gen6()
3441 {
3442 bool uses_depth =
3443 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3444 unsigned barycentric_interp_modes =
3445 (stage == MESA_SHADER_FRAGMENT) ?
3446 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3447
3448 assert(devinfo->gen >= 6);
3449
3450 /* R0-1: masks, pixel X/Y coordinates. */
3451 payload.num_regs = 2;
3452    /* R2: only for 32-pixel dispatch. */
3453
3454 /* R3-26: barycentric interpolation coordinates. These appear in the
3455 * same order that they appear in the brw_wm_barycentric_interp_mode
3456 * enum. Each set of coordinates occupies 2 registers if dispatch width
3457 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3458 * appear if they were enabled using the "Barycentric Interpolation
3459 * Mode" bits in WM_STATE.
3460 */
3461 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3462 if (barycentric_interp_modes & (1 << i)) {
3463 payload.barycentric_coord_reg[i] = payload.num_regs;
3464 payload.num_regs += 2;
3465 if (dispatch_width == 16) {
3466 payload.num_regs += 2;
3467 }
3468 }
3469 }
3470
3471 /* R27: interpolated depth if uses source depth */
3472 if (uses_depth) {
3473 payload.source_depth_reg = payload.num_regs;
3474 payload.num_regs++;
3475 if (dispatch_width == 16) {
3476 /* R28: interpolated depth if not SIMD8. */
3477 payload.num_regs++;
3478 }
3479 }
3480 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3481 if (uses_depth) {
3482 payload.source_w_reg = payload.num_regs;
3483 payload.num_regs++;
3484 if (dispatch_width == 16) {
3485 /* R30: interpolated W if not SIMD8. */
3486 payload.num_regs++;
3487 }
3488 }
3489
3490 if (stage == MESA_SHADER_FRAGMENT) {
3491 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3492 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3493 prog_data->uses_pos_offset = key->compute_pos_offset;
3494 /* R31: MSAA position offsets. */
3495 if (prog_data->uses_pos_offset) {
3496 payload.sample_pos_reg = payload.num_regs;
3497 payload.num_regs++;
3498 }
3499 }
3500
3501 /* R32: MSAA input coverage mask */
3502 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3503 assert(devinfo->gen >= 7);
3504 payload.sample_mask_in_reg = payload.num_regs;
3505 payload.num_regs++;
3506 if (dispatch_width == 16) {
3507 /* R33: input coverage mask if not SIMD8. */
3508 payload.num_regs++;
3509 }
3510 }
3511
3512 /* R34-: bary for 32-pixel. */
3513 /* R58-59: interp W for 32-pixel. */
3514
3515 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3516 source_depth_to_render_target = true;
3517 }
3518 }
3519
3520 void
3521 fs_visitor::setup_vs_payload()
3522 {
3523 /* R0: thread header, R1: urb handles */
3524 payload.num_regs = 2;
3525 }
3526
3527 void
3528 fs_visitor::setup_cs_payload()
3529 {
3530 assert(devinfo->gen >= 7);
3531
3532 payload.num_regs = 1;
3533 }
3534
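/**
 * Lay out the fragment shader's binding table: render targets come first
 * (at least one entry, even when there are no color regions), followed by
 * the common per-stage surfaces.
 */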
3535 void
3536 fs_visitor::assign_binding_table_offsets()
3537 {
3538 assert(stage == MESA_SHADER_FRAGMENT);
3539 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3540 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3541 uint32_t next_binding_table_offset = 0;
3542
3543 /* If there are no color regions, we still perform an FB write to a null
3544 * renderbuffer, which we place at surface index 0.
3545 */
3546 prog_data->binding_table.render_target_start = next_binding_table_offset;
3547 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3548
3549 assign_common_binding_table_offsets(next_binding_table_offset);
3550 }
3551
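/**
 * Compute regs_live_at_ip[]: for each instruction IP, the number of GRF
 * registers live at that point, summed over all VGRFs whose live interval
 * covers the IP.
 */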
3552 void
3553 fs_visitor::calculate_register_pressure()
3554 {
3555 invalidate_live_intervals();
3556 calculate_live_intervals();
3557
3558 unsigned num_instructions = 0;
3559 foreach_block(block, cfg)
3560 num_instructions += block->instructions.length();
3561
3562 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3563
3564 for (unsigned reg = 0; reg < alloc.count; reg++) {
3565 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3566 regs_live_at_ip[ip] += alloc.sizes[reg];
3567 }
3568 }
3569
3570 void
3571 fs_visitor::optimize()
3572 {
3573    /* bld is the common builder object we used to translate the program into
3574     * i965 IR, and it is still pointing at the end of the program.  For the
3575     * optimization and lowering passes coming next, any code added after the
3576     * end of the program without having explicitly called fs_builder::at()
3577     * clearly points at a mistake.  Ideally optimization passes wouldn't be
3578     * part of the visitor so they wouldn't have access to bld at all, but they
3579     * do, so just in case some pass forgets to ask for a location explicitly,
3580     * set it to NULL here to make it trip.
3581 */
3582 bld = bld.at(NULL, NULL);
3583
3584 split_virtual_grfs();
3585
3586 move_uniform_array_access_to_pull_constants();
3587 assign_constant_locations();
3588 demote_pull_constants();
3589
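/* Run an optimization pass.  When INTEL_DEBUG=optimizer is set and the pass
 * made progress, dump the instruction list to a file named after the stage,
 * dispatch width, shader, iteration and pass number.  The per-pass progress
 * is accumulated into the do/while loop condition below.
 */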
3590 #define OPT(pass, args...) ({ \
3591 pass_num++; \
3592 bool this_progress = pass(args); \
3593 \
3594 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3595 char filename[64]; \
3596 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3597 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3598 \
3599 backend_shader::dump_instructions(filename); \
3600 } \
3601 \
3602 progress = progress || this_progress; \
3603 this_progress; \
3604 })
3605
3606 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3607 char filename[64];
3608 snprintf(filename, 64, "%s%d-%04d-00-start",
3609 stage_abbrev, dispatch_width,
3610 shader_prog ? shader_prog->Name : 0);
3611
3612 backend_shader::dump_instructions(filename);
3613 }
3614
3615 bool progress;
3616 int iteration = 0;
3617 int pass_num = 0;
3618 do {
3619 progress = false;
3620 pass_num = 0;
3621 iteration++;
3622
3623 OPT(remove_duplicate_mrf_writes);
3624
3625 OPT(opt_algebraic);
3626 OPT(opt_cse);
3627 OPT(opt_copy_propagate);
3628 OPT(opt_peephole_predicated_break);
3629 OPT(opt_cmod_propagation);
3630 OPT(dead_code_eliminate);
3631 OPT(opt_peephole_sel);
3632 OPT(dead_control_flow_eliminate, this);
3633 OPT(opt_register_renaming);
3634 OPT(opt_redundant_discard_jumps);
3635 OPT(opt_saturate_propagation);
3636 OPT(opt_zero_samples);
3637 OPT(register_coalesce);
3638 OPT(compute_to_mrf);
3639 OPT(eliminate_find_live_channel);
3640
3641 OPT(compact_virtual_grfs);
3642 } while (progress);
3643
3644 pass_num = 0;
3645
3646 OPT(opt_sampler_eot);
3647
3648 if (OPT(lower_load_payload)) {
3649 split_virtual_grfs();
3650 OPT(register_coalesce);
3651 OPT(compute_to_mrf);
3652 OPT(dead_code_eliminate);
3653 }
3654
3655 OPT(opt_combine_constants);
3656 OPT(lower_integer_multiplication);
3657
3658 lower_uniform_pull_constant_loads();
3659 }
3660
3661 /**
3662  * Three-source instructions must have a GRF/MRF destination register.
3663 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3664 */
3665 void
3666 fs_visitor::fixup_3src_null_dest()
3667 {
3668 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3669 if (inst->is_3src() && inst->dst.is_null()) {
3670 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3671 inst->dst.type);
3672 }
3673 }
3674 }
3675
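/**
 * Schedule and register-allocate the program.  Each pre-RA scheduling mode
 * is tried in turn until one allows allocation without spilling; if none
 * does, a SIMD16 compile is failed outright (the caller falls back to
 * SIMD8), while a SIMD8 compile spills registers until allocation succeeds
 * and then gets a post-RA scheduling pass.
 */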
3676 void
3677 fs_visitor::allocate_registers()
3678 {
3679 bool allocated_without_spills;
3680
3681 static const enum instruction_scheduler_mode pre_modes[] = {
3682 SCHEDULE_PRE,
3683 SCHEDULE_PRE_NON_LIFO,
3684 SCHEDULE_PRE_LIFO,
3685 };
3686
3687 /* Try each scheduling heuristic to see if it can successfully register
3688 * allocate without spilling. They should be ordered by decreasing
3689 * performance but increasing likelihood of allocating.
3690 */
3691 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3692 schedule_instructions(pre_modes[i]);
3693
3694 if (0) {
3695 assign_regs_trivial();
3696 allocated_without_spills = true;
3697 } else {
3698 allocated_without_spills = assign_regs(false);
3699 }
3700 if (allocated_without_spills)
3701 break;
3702 }
3703
3704 if (!allocated_without_spills) {
3705 /* We assume that any spilling is worse than just dropping back to
3706 * SIMD8.  There is probably some intermediate point where SIMD16 with
3707 * a couple of spills is still better, but we don't try to find it.
3708 */
3709 if (dispatch_width == 16) {
3710 fail("Failure to register allocate. Reduce number of "
3711 "live scalar values to avoid this.");
3712 } else {
3713 compiler->shader_perf_log(log_data,
3714 "%s shader triggered register spilling. "
3715 "Try reducing the number of live scalar "
3716 "values to improve performance.\n",
3717 stage_name);
3718 }
3719
3720 /* Since we're out of heuristics, just go spill registers until we
3721 * get an allocation.
3722 */
3723 while (!assign_regs(true)) {
3724 if (failed)
3725 break;
3726 }
3727 }
3728
3729 /* This must come after all optimization and register allocation, since
3730 * it inserts dead code that happens to have side effects, and it does
3731 * so based on the actual physical registers in use.
3732 */
3733 insert_gen4_send_dependency_workarounds();
3734
3735 if (failed)
3736 return;
3737
3738 if (!allocated_without_spills)
3739 schedule_instructions(SCHEDULE_POST);
3740
3741 if (last_scratch > 0)
3742 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3743 }
3744
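/**
 * Compile a vertex shader through the scalar backend: set up the VS
 * payload, translate the NIR code, compute clip distances for any user
 * clip planes, emit the URB writes, then run the shared optimization,
 * setup and register allocation pipeline.
 */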
3745 bool
3746 fs_visitor::run_vs(gl_clip_plane *clip_planes)
3747 {
3748 assert(stage == MESA_SHADER_VERTEX);
3749
3750 assign_common_binding_table_offsets(0);
3751 setup_vs_payload();
3752
3753 if (shader_time_index >= 0)
3754 emit_shader_time_begin();
3755
3756 emit_nir_code();
3757
3758 if (failed)
3759 return false;
3760
3761 compute_clip_distance(clip_planes);
3762
3763 emit_urb_writes();
3764
3765 if (shader_time_index >= 0)
3766 emit_shader_time_end();
3767
3768 calculate_cfg();
3769
3770 optimize();
3771
3772 assign_curb_setup();
3773 assign_vs_urb_setup();
3774
3775 fixup_3src_null_dest();
3776 allocate_registers();
3777
3778 return !failed;
3779 }
3780
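/**
 * Compile one SIMD width of a fragment shader: set up the payload and
 * interpolation, translate the NIR code, emit the alpha test and
 * framebuffer writes, then optimize and register-allocate.  This is run
 * once for SIMD8 and, when possible, again for SIMD16 (see brw_wm_fs_emit
 * below).
 */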
3781 bool
3782 fs_visitor::run_fs(bool do_rep_send)
3783 {
3784 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3785 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3786
3787 assert(stage == MESA_SHADER_FRAGMENT);
3788
3789 sanity_param_count = prog->Parameters->NumParameters;
3790
3791 assign_binding_table_offsets();
3792
3793 if (devinfo->gen >= 6)
3794 setup_payload_gen6();
3795 else
3796 setup_payload_gen4();
3797
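/* Either emit a dummy shader (a debugging aid, normally disabled), emit
 * the specialized replicated-data clear shader, or translate the actual
 * application shader below.
 */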
3798 if (0) {
3799 emit_dummy_fs();
3800 } else if (do_rep_send) {
3801 assert(dispatch_width == 16);
3802 emit_repclear_shader();
3803 } else {
3804 if (shader_time_index >= 0)
3805 emit_shader_time_begin();
3806
3807 calculate_urb_setup();
3808 if (prog->InputsRead > 0) {
3809 if (devinfo->gen < 6)
3810 emit_interpolation_setup_gen4();
3811 else
3812 emit_interpolation_setup_gen6();
3813 }
3814
3815 /* We handle discards by keeping track of the still-live pixels in f0.1.
3816 * Initialize it with the dispatched pixels.
3817 */
3818 if (wm_prog_data->uses_kill) {
3819 fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3820 discard_init->flag_subreg = 1;
3821 }
3822
3823 /* Generate FS IR for main().  (The visitor only descends into
3824 * functions called "main".)
3825 */
3826 emit_nir_code();
3827
3828 if (failed)
3829 return false;
3830
3831 if (wm_prog_data->uses_kill)
3832 bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
3833
3834 if (wm_key->alpha_test_func)
3835 emit_alpha_test();
3836
3837 emit_fb_writes();
3838
3839 if (shader_time_index >= 0)
3840 emit_shader_time_end();
3841
3842 calculate_cfg();
3843
3844 optimize();
3845
3846 assign_curb_setup();
3847 assign_urb_setup();
3848
3849 fixup_3src_null_dest();
3850 allocate_registers();
3851
3852 if (failed)
3853 return false;
3854 }
3855
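/* Record how many register blocks this variant used; separate counts are
 * kept for the SIMD8 and SIMD16 programs.
 */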
3856 if (dispatch_width == 8)
3857 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3858 else
3859 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3860
3861 /* If any state parameters were appended, then ParameterValues could have
3862 * been realloced, in which case the driver uniform storage set up by
3863 * _mesa_associate_uniform_storage() would point to freed memory. Make
3864 * sure that didn't happen.
3865 */
3866 assert(sanity_param_count == prog->Parameters->NumParameters);
3867
3868 return !failed;
3869 }
3870
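/**
 * Compile a compute shader through the scalar backend: set up the CS
 * payload, translate the NIR code, emit the thread terminate message, then
 * run the shared optimization and register allocation pipeline.
 */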
3871 bool
3872 fs_visitor::run_cs()
3873 {
3874 assert(stage == MESA_SHADER_COMPUTE);
3875 assert(shader);
3876
3877 sanity_param_count = prog->Parameters->NumParameters;
3878
3879 assign_common_binding_table_offsets(0);
3880
3881 setup_cs_payload();
3882
3883 if (shader_time_index >= 0)
3884 emit_shader_time_begin();
3885
3886 emit_nir_code();
3887
3888 if (failed)
3889 return false;
3890
3891 emit_cs_terminate();
3892
3893 if (shader_time_index >= 0)
3894 emit_shader_time_end();
3895
3896 calculate_cfg();
3897
3898 optimize();
3899
3900 assign_curb_setup();
3901
3902 fixup_3src_null_dest();
3903 allocate_registers();
3904
3905 if (failed)
3906 return false;
3907
3908 /* If any state parameters were appended, then ParameterValues could have
3909 * been realloced, in which case the driver uniform storage set up by
3910 * _mesa_associate_uniform_storage() would point to freed memory. Make
3911 * sure that didn't happen.
3912 */
3913 assert(sanity_param_count == prog->Parameters->NumParameters);
3914
3915 return !failed;
3916 }
3917
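/**
 * Top-level entry point for fragment shader compilation: build a SIMD8
 * program, attempt a SIMD16 program as well unless it has been disabled or
 * the SIMD8 compile flagged it as unsupported, and hand the resulting CFGs
 * to fs_generator to produce the final native code.
 */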
3918 const unsigned *
3919 brw_wm_fs_emit(struct brw_context *brw,
3920 void *mem_ctx,
3921 const struct brw_wm_prog_key *key,
3922 struct brw_wm_prog_data *prog_data,
3923 struct gl_fragment_program *fp,
3924 struct gl_shader_program *prog,
3925 unsigned *final_assembly_size)
3926 {
3927 bool start_busy = false;
3928 double start_time = 0;
3929
3930 if (unlikely(brw->perf_debug)) {
3931 start_busy = (brw->batch.last_bo &&
3932 drm_intel_bo_busy(brw->batch.last_bo));
3933 start_time = get_time();
3934 }
3935
3936 struct brw_shader *shader = NULL;
3937 if (prog)
3938 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3939
3940 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3941 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3942
3943 int st_index8 = -1, st_index16 = -1;
3944 if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
3945 st_index8 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS8);
3946 st_index16 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS16);
3947 }
3948
3949 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3950 */
3951 fs_visitor v(brw->intelScreen->compiler, brw,
3952 mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
3953 prog, &fp->Base, 8, st_index8);
3954 if (!v.run_fs(false /* do_rep_send */)) {
3955 if (prog) {
3956 prog->LinkStatus = false;
3957 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3958 }
3959
3960 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3961 v.fail_msg);
3962
3963 return NULL;
3964 }
3965
3966 cfg_t *simd16_cfg = NULL;
3967 fs_visitor v2(brw->intelScreen->compiler, brw,
3968 mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
3969 prog, &fp->Base, 16, st_index16);
3970 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
3971 if (!v.simd16_unsupported) {
3972 /* Try a SIMD16 compile */
3973 v2.import_uniforms(&v);
3974 if (!v2.run_fs(brw->use_rep_send)) {
3975 perf_debug("SIMD16 shader failed to compile: %s", v2.fail_msg);
3976 } else {
3977 simd16_cfg = v2.cfg;
3978 }
3979 }
3980 }
3981
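/* Decide which programs to emit: when a SIMD16 program is available and
 * either SIMD8 has been disabled (via debug flag or the no_simd8
 * workaround) or this is Gen4 hardware, skip the SIMD8 program entirely.
 */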
3982 cfg_t *simd8_cfg;
3983 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3984 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
3985 simd8_cfg = NULL;
3986 prog_data->no_8 = true;
3987 } else {
3988 simd8_cfg = v.cfg;
3989 prog_data->no_8 = false;
3990 }
3991
3992 fs_generator g(brw->intelScreen->compiler, brw,
3993 mem_ctx, (void *) key, &prog_data->base,
3994 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
3995
3996 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3997 char *name;
3998 if (prog)
3999 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4000 prog->Label ? prog->Label : "unnamed",
4001 prog->Name);
4002 else
4003 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4004
4005 g.enable_debug(name);
4006 }
4007
4008 if (simd8_cfg)
4009 g.generate_code(simd8_cfg, 8);
4010 if (simd16_cfg)
4011 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4012
4013 if (unlikely(brw->perf_debug) && shader) {
4014 if (shader->compiled_once)
4015 brw_wm_debug_recompile(brw, prog, key);
4016 shader->compiled_once = true;
4017
4018 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4019 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4020 (get_time() - start_time) * 1000);
4021 }
4022 }
4023
4024 return g.get_assembly(final_assembly_size);
4025 }
4026
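/**
 * Precompile entry point, used at link time before draw-time state is
 * known: fill in a best-guess brw_wm_prog_key from the program alone,
 * compile with it, and then restore the previous program offset and prog
 * data so the precompile doesn't disturb the current state.
 */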
4027 extern "C" bool
4028 brw_fs_precompile(struct gl_context *ctx,
4029 struct gl_shader_program *shader_prog,
4030 struct gl_program *prog)
4031 {
4032 struct brw_context *brw = brw_context(ctx);
4033 struct brw_wm_prog_key key;
4034
4035 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4036 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4037 bool program_uses_dfdy = fp->UsesDFdy;
4038
4039 memset(&key, 0, sizeof(key));
4040
4041 if (brw->gen < 6) {
4042 if (fp->UsesKill)
4043 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4044
4045 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4046 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4047
4048 /* Just assume depth testing. */
4049 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4050 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4051 }
4052
4053 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4054 BRW_FS_VARYING_INPUT_MASK) > 16)
4055 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4056
4057 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4058
4059 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4060 key.drawable_height = ctx->DrawBuffer->Height;
4061 }
4062
4063 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4064 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4065 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4066
4067 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4068 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4069 key.nr_color_regions > 1;
4070 }
4071
4072 key.program_string_id = bfp->id;
4073
4074 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4075 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4076
4077 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4078
4079 brw->wm.base.prog_offset = old_prog_offset;
4080 brw->wm.prog_data = old_prog_data;
4081
4082 return success;
4083 }
4084
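/**
 * Fill in the sampler portion of a precompile key: on hardware without
 * shader channel select (neither Haswell nor Gen8+), shadow samplers are
 * assumed to use the X,X,X,1 DEPTH_TEXTURE_MODE swizzle; everything else
 * is assumed to be unswizzled.
 */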
4085 void
4086 brw_setup_tex_for_precompile(struct brw_context *brw,
4087 struct brw_sampler_prog_key_data *tex,
4088 struct gl_program *prog)
4089 {
4090 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4091 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4092 for (unsigned i = 0; i < sampler_count; i++) {
4093 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4094 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4095 tex->swizzles[i] =
4096 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4097 } else {
4098 /* Color sampler: assume no swizzling. */
4099 tex->swizzles[i] = SWIZZLE_XYZW;
4100 }
4101 }
4102 }