i965/fs: Fix stride for immediate registers.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 using namespace brw;
53
54 void
55 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
56 const fs_reg *src, unsigned sources)
57 {
58 memset(this, 0, sizeof(*this));
59
60 this->src = new fs_reg[MAX2(sources, 3)];
61 for (unsigned i = 0; i < sources; i++)
62 this->src[i] = src[i];
63
64 this->opcode = opcode;
65 this->dst = dst;
66 this->sources = sources;
67 this->exec_size = exec_size;
68
69 assert(dst.file != IMM && dst.file != UNIFORM);
70
71 assert(this->exec_size != 0);
72
73 this->conditional_mod = BRW_CONDITIONAL_NONE;
74
75 /* This will be the case for almost all instructions. */
76 switch (dst.file) {
77 case GRF:
78 case HW_REG:
79 case MRF:
80 case ATTR:
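              /* e.g. a SIMD16 float destination with stride 1 covers
               * 16 * 4 = 64 bytes, i.e. DIV_ROUND_UP(64, 32) = two GRFs.
               */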
81 this->regs_written = DIV_ROUND_UP(dst.component_size(exec_size),
82 REG_SIZE);
83 break;
84 case BAD_FILE:
85 this->regs_written = 0;
86 break;
87 case IMM:
88 case UNIFORM:
89 unreachable("Invalid destination register file");
90 default:
91 unreachable("Invalid register file");
92 }
93
94 this->writes_accumulator = false;
95 }
96
97 fs_inst::fs_inst()
98 {
99 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
100 }
101
102 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
103 {
104 init(opcode, exec_size, reg_undef, NULL, 0);
105 }
106
107 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
108 {
109 init(opcode, exec_size, dst, NULL, 0);
110 }
111
112 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
113 const fs_reg &src0)
114 {
115 const fs_reg src[1] = { src0 };
116 init(opcode, exec_size, dst, src, 1);
117 }
118
119 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
120 const fs_reg &src0, const fs_reg &src1)
121 {
122 const fs_reg src[2] = { src0, src1 };
123 init(opcode, exec_size, dst, src, 2);
124 }
125
126 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
127 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
128 {
129 const fs_reg src[3] = { src0, src1, src2 };
130 init(opcode, exec_size, dst, src, 3);
131 }
132
133 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
134 const fs_reg src[], unsigned sources)
135 {
136 init(opcode, exec_width, dst, src, sources);
137 }
138
139 fs_inst::fs_inst(const fs_inst &that)
140 {
141 memcpy(this, &that, sizeof(that));
142
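       /* The memcpy above also copied the src pointer; allocate a fresh array
        * so this copy owns its own sources and the destructor doesn't free
        * the original instruction's storage.
        */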
143 this->src = new fs_reg[MAX2(that.sources, 3)];
144
145 for (unsigned i = 0; i < that.sources; i++)
146 this->src[i] = that.src[i];
147 }
148
149 fs_inst::~fs_inst()
150 {
151 delete[] this->src;
152 }
153
154 void
155 fs_inst::resize_sources(uint8_t num_sources)
156 {
157 if (this->sources != num_sources) {
158 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
159
160 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
161 src[i] = this->src[i];
162
163 delete[] this->src;
164 this->src = src;
165 this->sources = num_sources;
166 }
167 }
168
169 void
170 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
171 const fs_reg &dst,
172 const fs_reg &surf_index,
173 const fs_reg &varying_offset,
174 uint32_t const_offset)
175 {
176 /* We have our constant surface use a pitch of 4 bytes, so our index can
177 * be any component of a vector, and then we load 4 contiguous
178 * components starting from that.
179 *
180 * We break down the const_offset to a portion added to the variable
181 * offset and a portion done using reg_offset, which means that if you
182 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
183 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
184 * CSE can later notice that those loads are all the same and eliminate
185 * the redundant ones.
186 */
187 fs_reg vec4_offset = vgrf(glsl_type::int_type);
188 bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
189
190 int scale = 1;
191 if (devinfo->gen == 4 && bld.dispatch_width() == 8) {
192 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
193 * u, v, r) as parameters, or we can just use the SIMD16 message
194 * consisting of (header, u). We choose the second, at the cost of a
195 * longer return length.
196 */
197 scale = 2;
198 }
199
200 enum opcode op;
201 if (devinfo->gen >= 7)
202 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
203 else
204 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
205
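       /* The load returns four dword components per channel: each component
        * occupies dispatch_width / 8 registers, doubled when the gen4 SIMD16
        * message was chosen above (scale == 2).
        */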
206 int regs_written = 4 * (bld.dispatch_width() / 8) * scale;
207 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written), dst.type);
208 fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
209 inst->regs_written = regs_written;
210
211 if (devinfo->gen < 7) {
212 inst->base_mrf = 13;
213 inst->header_size = 1;
214 if (devinfo->gen == 4)
215 inst->mlen = 3;
216 else
217 inst->mlen = 1 + bld.dispatch_width() / 8;
218 }
219
220 bld.MOV(dst, offset(vec4_result, bld, (const_offset & 3) * scale));
221 }
222
223 /**
224 * A helper for MOV generation for fixing up broken hardware SEND dependency
225 * handling.
226 */
227 void
228 fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
229 {
   230       /* The caller always wants an uncompressed (SIMD8) MOV, both to emit the
   231        * minimal extra dependencies and to avoid having to align its registers to 2.
232 */
233 const fs_builder ubld = bld.annotate("send dependency resolve")
234 .half(0);
235
236 ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
237 }
238
239 bool
240 fs_inst::equals(fs_inst *inst) const
241 {
242 return (opcode == inst->opcode &&
243 dst.equals(inst->dst) &&
244 src[0].equals(inst->src[0]) &&
245 src[1].equals(inst->src[1]) &&
246 src[2].equals(inst->src[2]) &&
247 saturate == inst->saturate &&
248 predicate == inst->predicate &&
249 conditional_mod == inst->conditional_mod &&
250 mlen == inst->mlen &&
251 base_mrf == inst->base_mrf &&
252 target == inst->target &&
253 eot == inst->eot &&
254 header_size == inst->header_size &&
255 shadow_compare == inst->shadow_compare &&
256 exec_size == inst->exec_size &&
257 offset == inst->offset);
258 }
259
260 bool
261 fs_inst::overwrites_reg(const fs_reg &reg) const
262 {
263 return reg.in_range(dst, regs_written);
264 }
265
266 bool
267 fs_inst::is_send_from_grf() const
268 {
269 switch (opcode) {
270 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
271 case SHADER_OPCODE_SHADER_TIME_ADD:
272 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
273 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
274 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
275 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
276 case SHADER_OPCODE_UNTYPED_ATOMIC:
277 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
278 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
279 case SHADER_OPCODE_TYPED_ATOMIC:
280 case SHADER_OPCODE_TYPED_SURFACE_READ:
281 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
282 case SHADER_OPCODE_URB_WRITE_SIMD8:
283 return true;
284 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
285 return src[1].file == GRF;
286 case FS_OPCODE_FB_WRITE:
287 return src[0].file == GRF;
288 default:
289 if (is_tex())
290 return src[0].file == GRF;
291
292 return false;
293 }
294 }
295
296 bool
297 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
298 {
299 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
300 return false;
301
302 fs_reg reg = this->src[0];
303 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
304 return false;
305
306 if (grf_alloc.sizes[reg.reg] != this->regs_written)
307 return false;
308
309 for (int i = 0; i < this->sources; i++) {
310 reg.type = this->src[i].type;
311 if (!this->src[i].equals(reg))
312 return false;
313
314 if (i < this->header_size) {
315 reg.reg_offset += 1;
316 } else {
317 reg.reg_offset += this->exec_size / 8;
318 }
319 }
320
321 return true;
322 }
323
324 bool
325 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
326 {
327 if (devinfo->gen == 6 && is_math())
328 return false;
329
330 if (is_send_from_grf())
331 return false;
332
333 if (!backend_instruction::can_do_source_mods())
334 return false;
335
336 return true;
337 }
338
339 bool
340 fs_inst::has_side_effects() const
341 {
342 return this->eot || backend_instruction::has_side_effects();
343 }
344
345 void
346 fs_reg::init()
347 {
348 memset(this, 0, sizeof(*this));
349 stride = 1;
350 }
351
352 /** Generic unset register constructor. */
353 fs_reg::fs_reg()
354 {
355 init();
356 this->file = BAD_FILE;
357 }
358
359 /** Immediate value constructor. */
360 fs_reg::fs_reg(float f)
361 {
362 init();
363 this->file = IMM;
364 this->type = BRW_REGISTER_TYPE_F;
365 this->stride = 0;
366 this->fixed_hw_reg.dw1.f = f;
367 }
368
369 /** Immediate value constructor. */
370 fs_reg::fs_reg(int32_t i)
371 {
372 init();
373 this->file = IMM;
374 this->type = BRW_REGISTER_TYPE_D;
375 this->stride = 0;
376 this->fixed_hw_reg.dw1.d = i;
377 }
378
379 /** Immediate value constructor. */
380 fs_reg::fs_reg(uint32_t u)
381 {
382 init();
383 this->file = IMM;
384 this->type = BRW_REGISTER_TYPE_UD;
385 this->stride = 0;
386 this->fixed_hw_reg.dw1.ud = u;
387 }
388
389 /** Vector float immediate value constructor. */
390 fs_reg::fs_reg(uint8_t vf[4])
391 {
392 init();
393 this->file = IMM;
394 this->type = BRW_REGISTER_TYPE_VF;
395 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
396 }
397
398 /** Vector float immediate value constructor. */
399 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
400 {
401 init();
402 this->file = IMM;
403 this->type = BRW_REGISTER_TYPE_VF;
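       /* Pack the four 8-bit restricted-float components into a single dword,
        * with vf0 in the least significant byte.
        */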
404 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
405 (vf1 << 8) |
406 (vf2 << 16) |
407 (vf3 << 24);
408 }
409
410 /** Fixed brw_reg. */
411 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
412 {
413 init();
414 this->file = HW_REG;
415 this->fixed_hw_reg = fixed_hw_reg;
416 this->type = fixed_hw_reg.type;
417 }
418
419 bool
420 fs_reg::equals(const fs_reg &r) const
421 {
422 return (file == r.file &&
423 reg == r.reg &&
424 reg_offset == r.reg_offset &&
425 subreg_offset == r.subreg_offset &&
426 type == r.type &&
427 negate == r.negate &&
428 abs == r.abs &&
429 !reladdr && !r.reladdr &&
430 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
431 stride == r.stride);
432 }
433
434 fs_reg &
435 fs_reg::set_smear(unsigned subreg)
436 {
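       /* Select a single component of the register and broadcast it:
        * subreg_offset picks the element and stride 0 makes every channel
        * read that same element.
        */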
437 assert(file != HW_REG && file != IMM);
438 subreg_offset = subreg * type_sz(type);
439 stride = 0;
440 return *this;
441 }
442
443 bool
444 fs_reg::is_contiguous() const
445 {
446 return stride == 1;
447 }
448
449 unsigned
450 fs_reg::component_size(unsigned width) const
451 {
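       /* Fixed HW registers encode the stride logarithmically: hstride 0 means
        * stride 0, otherwise the effective stride is 1 << (hstride - 1).
        */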
452 const unsigned stride = (file != HW_REG ? this->stride :
453 fixed_hw_reg.hstride == 0 ? 0 :
454 1 << (fixed_hw_reg.hstride - 1));
455 return MAX2(width * stride, 1) * type_sz(type);
456 }
457
458 int
459 fs_visitor::type_size(const struct glsl_type *type)
460 {
461 unsigned int size, i;
462
463 switch (type->base_type) {
464 case GLSL_TYPE_UINT:
465 case GLSL_TYPE_INT:
466 case GLSL_TYPE_FLOAT:
467 case GLSL_TYPE_BOOL:
468 return type->components();
469 case GLSL_TYPE_ARRAY:
470 return type_size(type->fields.array) * type->length;
471 case GLSL_TYPE_STRUCT:
472 size = 0;
473 for (i = 0; i < type->length; i++) {
474 size += type_size(type->fields.structure[i].type);
475 }
476 return size;
477 case GLSL_TYPE_SAMPLER:
478 /* Samplers take up no register space, since they're baked in at
479 * link time.
480 */
481 return 0;
482 case GLSL_TYPE_ATOMIC_UINT:
483 return 0;
484 case GLSL_TYPE_IMAGE:
485 case GLSL_TYPE_VOID:
486 case GLSL_TYPE_ERROR:
487 case GLSL_TYPE_INTERFACE:
488 case GLSL_TYPE_DOUBLE:
489 unreachable("not reached");
490 }
491
492 return 0;
493 }
494
495 /**
496 * Create a MOV to read the timestamp register.
497 *
498 * The caller is responsible for emitting the MOV. The return value is
499 * the destination of the MOV, with extra parameters set.
500 */
501 fs_reg
502 fs_visitor::get_timestamp(const fs_builder &bld)
503 {
504 assert(devinfo->gen >= 7);
505
506 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
507 BRW_ARF_TIMESTAMP,
508 0),
509 BRW_REGISTER_TYPE_UD));
510
511 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
512
513 /* We want to read the 3 fields we care about even if it's not enabled in
514 * the dispatch.
515 */
516 bld.group(4, 0).exec_all().MOV(dst, ts);
517
518 /* The caller wants the low 32 bits of the timestamp. Since it's running
   519     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
520 * which is plenty of time for our purposes. It is identical across the
521 * EUs, but since it's tracking GPU core speed it will increment at a
522 * varying rate as render P-states change.
523 *
524 * The caller could also check if render P-states have changed (or anything
525 * else that might disrupt timing) by setting smear to 2 and checking if
526 * that field is != 0.
527 */
528 dst.set_smear(0);
529
530 return dst;
531 }
532
533 void
534 fs_visitor::emit_shader_time_begin()
535 {
536 shader_start_time = get_timestamp(bld.annotate("shader time start"));
537 }
538
539 void
540 fs_visitor::emit_shader_time_end()
541 {
542 /* Insert our code just before the final SEND with EOT. */
543 exec_node *end = this->instructions.get_tail();
544 assert(end && ((fs_inst *) end)->eot);
545 const fs_builder ibld = bld.annotate("shader time end")
546 .exec_all().at(NULL, end);
547
548 fs_reg shader_end_time = get_timestamp(ibld);
549
550 /* Check that there weren't any timestamp reset events (assuming these
551 * were the only two timestamp reads that happened).
552 */
553 fs_reg reset = shader_end_time;
554 reset.set_smear(2);
555 set_condmod(BRW_CONDITIONAL_Z,
556 ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
557 ibld.IF(BRW_PREDICATE_NORMAL);
558
559 fs_reg start = shader_start_time;
560 start.negate = true;
561 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
562 diff.set_smear(0);
563
564 const fs_builder cbld = ibld.group(1, 0);
565 cbld.group(1, 0).ADD(diff, start, shader_end_time);
566
567 /* If there were no instructions between the two timestamp gets, the diff
568 * is 2 cycles. Remove that overhead, so I can forget about that when
569 * trying to determine the time taken for single instructions.
570 */
571 cbld.ADD(diff, diff, fs_reg(-2u));
572 SHADER_TIME_ADD(cbld, 0, diff);
573 SHADER_TIME_ADD(cbld, 1, fs_reg(1u));
574 ibld.emit(BRW_OPCODE_ELSE);
575 SHADER_TIME_ADD(cbld, 2, fs_reg(1u));
576 ibld.emit(BRW_OPCODE_ENDIF);
577 }
578
579 void
580 fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
581 int shader_time_subindex,
582 fs_reg value)
583 {
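       /* Three counters are kept per shader_time_index: subindex 0 accumulates
        * cycles, subindex 1 counts runs whose time was recorded, and subindex 2
        * counts runs discarded because the timestamp reset (see
        * emit_shader_time_end above).
        */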
584 int index = shader_time_index * 3 + shader_time_subindex;
585 fs_reg offset = fs_reg(index * SHADER_TIME_STRIDE);
586
587 fs_reg payload;
588 if (dispatch_width == 8)
589 payload = vgrf(glsl_type::uvec2_type);
590 else
591 payload = vgrf(glsl_type::uint_type);
592
593 bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
594 }
595
596 void
597 fs_visitor::vfail(const char *format, va_list va)
598 {
599 char *msg;
600
601 if (failed)
602 return;
603
604 failed = true;
605
606 msg = ralloc_vasprintf(mem_ctx, format, va);
607 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
608
609 this->fail_msg = msg;
610
611 if (debug_enabled) {
612 fprintf(stderr, "%s", msg);
613 }
614 }
615
616 void
617 fs_visitor::fail(const char *format, ...)
618 {
619 va_list va;
620
621 va_start(va, format);
622 vfail(format, va);
623 va_end(va);
624 }
625
626 /**
627 * Mark this program as impossible to compile in SIMD16 mode.
628 *
629 * During the SIMD8 compile (which happens first), we can detect and flag
630 * things that are unsupported in SIMD16 mode, so the compiler can skip
631 * the SIMD16 compile altogether.
632 *
633 * During a SIMD16 compile (if one happens anyway), this just calls fail().
634 */
635 void
636 fs_visitor::no16(const char *msg)
637 {
638 if (dispatch_width == 16) {
639 fail("%s", msg);
640 } else {
641 simd16_unsupported = true;
642
643 compiler->shader_perf_log(log_data,
644 "SIMD16 shader failed to compile: %s", msg);
645 }
646 }
647
648 /**
649 * Returns true if the instruction has a flag that means it won't
650 * update an entire destination register.
651 *
652 * For example, dead code elimination and live variable analysis want to know
653 * when a write to a variable screens off any preceding values that were in
654 * it.
655 */
656 bool
657 fs_inst::is_partial_write() const
658 {
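       /* Predicated writes (other than SEL), writes narrower than a full
        * 32-byte register, and non-contiguous destinations all leave part of
        * the destination register untouched.
        */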
659 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
660 (this->exec_size * type_sz(this->dst.type)) < 32 ||
661 !this->dst.is_contiguous());
662 }
663
664 int
665 fs_inst::regs_read(int arg) const
666 {
667 unsigned components = 1;
668 switch (opcode) {
669 case FS_OPCODE_FB_WRITE:
670 case SHADER_OPCODE_URB_WRITE_SIMD8:
671 case SHADER_OPCODE_UNTYPED_ATOMIC:
672 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
673 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
674 case SHADER_OPCODE_TYPED_ATOMIC:
675 case SHADER_OPCODE_TYPED_SURFACE_READ:
676 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
677 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
678 if (arg == 0)
679 return mlen;
680 break;
681
682 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
683 /* The payload is actually stored in src1 */
684 if (arg == 1)
685 return mlen;
686 break;
687
688 case FS_OPCODE_LINTERP:
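          /* src[0] is the barycentric delta_xy payload: two registers per
           * SIMD8 half, i.e. exec_size / 4 registers in total.
           */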
689 if (arg == 0)
690 return exec_size / 4;
691 else
692 return 1;
693
694 case FS_OPCODE_PIXEL_X:
695 case FS_OPCODE_PIXEL_Y:
696 if (arg == 0)
697 components = 2;
698 break;
699
700 case SHADER_OPCODE_LOAD_PAYLOAD:
701 if (arg < this->header_size)
702 return 1;
703 break;
704
705 case CS_OPCODE_CS_TERMINATE:
706 return 1;
707
708 default:
709 if (is_tex() && arg == 0 && src[0].file == GRF)
710 return mlen;
711 break;
712 }
713
714 switch (src[arg].file) {
715 case BAD_FILE:
716 case UNIFORM:
717 case IMM:
718 return 1;
719 case GRF:
720 case HW_REG:
721 return DIV_ROUND_UP(components * src[arg].component_size(exec_size),
722 REG_SIZE);
723 case MRF:
724 unreachable("MRF registers are not allowed as sources");
725 default:
726 unreachable("Invalid register file");
727 }
728 }
729
730 bool
731 fs_inst::reads_flag() const
732 {
733 return predicate;
734 }
735
736 bool
737 fs_inst::writes_flag() const
738 {
739 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
740 opcode != BRW_OPCODE_IF &&
741 opcode != BRW_OPCODE_WHILE)) ||
742 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
743 }
744
745 /**
746 * Returns how many MRFs an FS opcode will write over.
747 *
748 * Note that this is not the 0 or 1 implied writes in an actual gen
749 * instruction -- the FS opcodes often generate MOVs in addition.
750 */
751 int
752 fs_visitor::implied_mrf_writes(fs_inst *inst)
753 {
754 if (inst->mlen == 0)
755 return 0;
756
757 if (inst->base_mrf == -1)
758 return 0;
759
760 switch (inst->opcode) {
761 case SHADER_OPCODE_RCP:
762 case SHADER_OPCODE_RSQ:
763 case SHADER_OPCODE_SQRT:
764 case SHADER_OPCODE_EXP2:
765 case SHADER_OPCODE_LOG2:
766 case SHADER_OPCODE_SIN:
767 case SHADER_OPCODE_COS:
768 return 1 * dispatch_width / 8;
769 case SHADER_OPCODE_POW:
770 case SHADER_OPCODE_INT_QUOTIENT:
771 case SHADER_OPCODE_INT_REMAINDER:
772 return 2 * dispatch_width / 8;
773 case SHADER_OPCODE_TEX:
774 case FS_OPCODE_TXB:
775 case SHADER_OPCODE_TXD:
776 case SHADER_OPCODE_TXF:
777 case SHADER_OPCODE_TXF_CMS:
778 case SHADER_OPCODE_TXF_MCS:
779 case SHADER_OPCODE_TG4:
780 case SHADER_OPCODE_TG4_OFFSET:
781 case SHADER_OPCODE_TXL:
782 case SHADER_OPCODE_TXS:
783 case SHADER_OPCODE_LOD:
784 return 1;
785 case FS_OPCODE_FB_WRITE:
786 return 2;
787 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
788 case SHADER_OPCODE_GEN4_SCRATCH_READ:
789 return 1;
790 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
791 return inst->mlen;
792 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
793 return inst->mlen;
794 case SHADER_OPCODE_UNTYPED_ATOMIC:
795 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
796 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
797 case SHADER_OPCODE_TYPED_ATOMIC:
798 case SHADER_OPCODE_TYPED_SURFACE_READ:
799 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
800 case SHADER_OPCODE_URB_WRITE_SIMD8:
801 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
802 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
803 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
804 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
805 return 0;
806 default:
807 unreachable("not reached");
808 }
809 }
810
811 fs_reg
812 fs_visitor::vgrf(const glsl_type *const type)
813 {
814 int reg_width = dispatch_width / 8;
815 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
816 brw_type_for_base_type(type));
817 }
818
819 /** Fixed HW reg constructor. */
820 fs_reg::fs_reg(enum register_file file, int reg)
821 {
822 init();
823 this->file = file;
824 this->reg = reg;
825 this->type = BRW_REGISTER_TYPE_F;
826 }
827
828 /** Fixed HW reg constructor. */
829 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
830 {
831 init();
832 this->file = file;
833 this->reg = reg;
834 this->type = type;
835 }
836
   837    /* For SIMD16, we need to reuse the uniform layout set up by the SIMD8
   838     * dispatch. This brings in those uniform definitions.
839 */
840 void
841 fs_visitor::import_uniforms(fs_visitor *v)
842 {
843 this->push_constant_loc = v->push_constant_loc;
844 this->pull_constant_loc = v->pull_constant_loc;
845 this->uniforms = v->uniforms;
846 this->param_size = v->param_size;
847 }
848
849 fs_reg *
850 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
851 bool origin_upper_left)
852 {
853 assert(stage == MESA_SHADER_FRAGMENT);
854 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
855 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
856 fs_reg wpos = *reg;
857 bool flip = !origin_upper_left ^ key->render_to_fbo;
858
859 /* gl_FragCoord.x */
860 if (pixel_center_integer) {
861 bld.MOV(wpos, this->pixel_x);
862 } else {
863 bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
864 }
865 wpos = offset(wpos, bld, 1);
866
867 /* gl_FragCoord.y */
868 if (!flip && pixel_center_integer) {
869 bld.MOV(wpos, this->pixel_y);
870 } else {
871 fs_reg pixel_y = this->pixel_y;
872 float offset = (pixel_center_integer ? 0.0 : 0.5);
873
874 if (flip) {
875 pixel_y.negate = true;
876 offset += key->drawable_height - 1.0;
877 }
878
879 bld.ADD(wpos, pixel_y, fs_reg(offset));
880 }
881 wpos = offset(wpos, bld, 1);
882
883 /* gl_FragCoord.z */
884 if (devinfo->gen >= 6) {
885 bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
886 } else {
887 bld.emit(FS_OPCODE_LINTERP, wpos,
888 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
889 interp_reg(VARYING_SLOT_POS, 2));
890 }
891 wpos = offset(wpos, bld, 1);
892
893 /* gl_FragCoord.w: Already set up in emit_interpolation */
894 bld.MOV(wpos, this->wpos_w);
895
896 return reg;
897 }
898
899 fs_inst *
900 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
901 glsl_interp_qualifier interpolation_mode,
902 bool is_centroid, bool is_sample)
903 {
904 brw_wm_barycentric_interp_mode barycoord_mode;
905 if (devinfo->gen >= 6) {
906 if (is_centroid) {
907 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
908 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
909 else
910 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
911 } else if (is_sample) {
912 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
913 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
914 else
915 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
916 } else {
917 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
918 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
919 else
920 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
921 }
922 } else {
923 /* On Ironlake and below, there is only one interpolation mode.
924 * Centroid interpolation doesn't mean anything on this hardware --
925 * there is no multisampling.
926 */
927 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
928 }
929 return bld.emit(FS_OPCODE_LINTERP, attr,
930 this->delta_xy[barycoord_mode], interp);
931 }
932
933 void
934 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
935 const glsl_type *type,
936 glsl_interp_qualifier interpolation_mode,
937 int location, bool mod_centroid,
938 bool mod_sample)
939 {
940 attr.type = brw_type_for_base_type(type->get_scalar_type());
941
942 assert(stage == MESA_SHADER_FRAGMENT);
943 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
944 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
945
946 unsigned int array_elements;
947
948 if (type->is_array()) {
949 array_elements = type->length;
950 if (array_elements == 0) {
951 fail("dereferenced array '%s' has length 0\n", name);
952 }
953 type = type->fields.array;
954 } else {
955 array_elements = 1;
956 }
957
958 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
959 bool is_gl_Color =
960 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
961 if (key->flat_shade && is_gl_Color) {
962 interpolation_mode = INTERP_QUALIFIER_FLAT;
963 } else {
964 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
965 }
966 }
967
968 for (unsigned int i = 0; i < array_elements; i++) {
969 for (unsigned int j = 0; j < type->matrix_columns; j++) {
970 if (prog_data->urb_setup[location] == -1) {
971 /* If there's no incoming setup data for this slot, don't
972 * emit interpolation for it.
973 */
974 attr = offset(attr, bld, type->vector_elements);
975 location++;
976 continue;
977 }
978
979 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
980 /* Constant interpolation (flat shading) case. The SF has
981 * handed us defined values in only the constant offset
982 * field of the setup reg.
983 */
984 for (unsigned int k = 0; k < type->vector_elements; k++) {
985 struct brw_reg interp = interp_reg(location, k);
986 interp = suboffset(interp, 3);
987 interp.type = attr.type;
988 bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
989 attr = offset(attr, bld, 1);
990 }
991 } else {
992 /* Smooth/noperspective interpolation case. */
993 for (unsigned int k = 0; k < type->vector_elements; k++) {
994 struct brw_reg interp = interp_reg(location, k);
995 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
996 /* Get the pixel/sample mask into f0 so that we know
997 * which pixels are lit. Then, for each channel that is
998 * unlit, replace the centroid data with non-centroid
999 * data.
1000 */
1001 bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1002
1003 fs_inst *inst;
1004 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1005 false, false);
1006 inst->predicate = BRW_PREDICATE_NORMAL;
1007 inst->predicate_inverse = true;
1008 if (devinfo->has_pln)
1009 inst->no_dd_clear = true;
1010
1011 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1012 mod_centroid && !key->persample_shading,
1013 mod_sample || key->persample_shading);
1014 inst->predicate = BRW_PREDICATE_NORMAL;
1015 inst->predicate_inverse = false;
1016 if (devinfo->has_pln)
1017 inst->no_dd_check = true;
1018
1019 } else {
1020 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1021 mod_centroid && !key->persample_shading,
1022 mod_sample || key->persample_shading);
1023 }
1024 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1025 bld.MUL(attr, attr, this->pixel_w);
1026 }
1027 attr = offset(attr, bld, 1);
1028 }
1029
1030 }
1031 location++;
1032 }
1033 }
1034 }
1035
1036 fs_reg *
1037 fs_visitor::emit_frontfacing_interpolation()
1038 {
1039 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1040
1041 if (devinfo->gen >= 6) {
1042 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1043 * a boolean result from this (~0/true or 0/false).
1044 *
1045 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1046 * this task in only one instruction:
1047 * - a negation source modifier will flip the bit; and
1048 * - a W -> D type conversion will sign extend the bit into the high
1049 * word of the destination.
1050 *
1051 * An ASR 15 fills the low word of the destination.
1052 */
1053 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1054 g0.negate = true;
1055
1056 bld.ASR(*reg, g0, fs_reg(15));
1057 } else {
1058 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1059 * a boolean result from this (1/true or 0/false).
1060 *
1061 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1062 * the negation source modifier to flip it. Unfortunately the SHR
1063 * instruction only operates on UD (or D with an abs source modifier)
1064 * sources without negation.
1065 *
1066 * Instead, use ASR (which will give ~0/true or 0/false).
1067 */
1068 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1069 g1_6.negate = true;
1070
1071 bld.ASR(*reg, g1_6, fs_reg(31));
1072 }
1073
1074 return reg;
1075 }
1076
1077 void
1078 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1079 {
1080 assert(stage == MESA_SHADER_FRAGMENT);
1081 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1082 assert(dst.type == BRW_REGISTER_TYPE_F);
1083
1084 if (key->compute_pos_offset) {
1085 /* Convert int_sample_pos to floating point */
1086 bld.MOV(dst, int_sample_pos);
1087 /* Scale to the range [0, 1] */
1088 bld.MUL(dst, dst, fs_reg(1 / 16.0f));
1089 }
1090 else {
1091 /* From ARB_sample_shading specification:
1092 * "When rendering to a non-multisample buffer, or if multisample
1093 * rasterization is disabled, gl_SamplePosition will always be
  1094     *  (0.5, 0.5)."
1095 */
1096 bld.MOV(dst, fs_reg(0.5f));
1097 }
1098 }
1099
1100 fs_reg *
1101 fs_visitor::emit_samplepos_setup()
1102 {
1103 assert(devinfo->gen >= 6);
1104
1105 const fs_builder abld = bld.annotate("compute sample position");
1106 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1107 fs_reg pos = *reg;
1108 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1109 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1110
1111 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1112 * mode will be enabled.
1113 *
1114 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1115 * R31.1:0 Position Offset X/Y for Slot[3:0]
1116 * R31.3:2 Position Offset X/Y for Slot[7:4]
1117 * .....
1118 *
1119 * The X, Y sample positions come in as bytes in thread payload. So, read
1120 * the positions using vstride=16, width=8, hstride=2.
1121 */
1122 struct brw_reg sample_pos_reg =
1123 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1124 BRW_REGISTER_TYPE_B), 16, 8, 2);
1125
1126 if (dispatch_width == 8) {
1127 abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
1128 } else {
1129 abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
1130 abld.half(1).MOV(half(int_sample_x, 1),
1131 fs_reg(suboffset(sample_pos_reg, 16)));
1132 }
1133 /* Compute gl_SamplePosition.x */
1134 compute_sample_position(pos, int_sample_x);
1135 pos = offset(pos, abld, 1);
1136 if (dispatch_width == 8) {
1137 abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
1138 } else {
1139 abld.half(0).MOV(half(int_sample_y, 0),
1140 fs_reg(suboffset(sample_pos_reg, 1)));
1141 abld.half(1).MOV(half(int_sample_y, 1),
1142 fs_reg(suboffset(sample_pos_reg, 17)));
1143 }
1144 /* Compute gl_SamplePosition.y */
1145 compute_sample_position(pos, int_sample_y);
1146 return reg;
1147 }
1148
1149 fs_reg *
1150 fs_visitor::emit_sampleid_setup()
1151 {
1152 assert(stage == MESA_SHADER_FRAGMENT);
1153 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1154 assert(devinfo->gen >= 6);
1155
1156 const fs_builder abld = bld.annotate("compute sample id");
1157 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1158
1159 if (key->compute_sample_id) {
1160 fs_reg t1 = vgrf(glsl_type::int_type);
1161 fs_reg t2 = vgrf(glsl_type::int_type);
1162 t2.type = BRW_REGISTER_TYPE_UW;
1163
1164 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1165 * 8x multisampling, subspan 0 will represent sample N (where N
1166 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1167 * 7. We can find the value of N by looking at R0.0 bits 7:6
1168 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1169 * (since samples are always delivered in pairs). That is, we
1170 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1171 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1172 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1173 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1174 * populating a temporary variable with the sequence (0, 1, 2, 3),
1175 * and then reading from it using vstride=1, width=4, hstride=0.
1176 * These computations hold good for 4x multisampling as well.
1177 *
1178 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1179 * the first four slots are sample 0 of subspan 0; the next four
1180 * are sample 1 of subspan 0; the third group is sample 0 of
1181 * subspan 1, and finally sample 1 of subspan 1.
1182 */
1183 abld.exec_all()
1184 .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1185 fs_reg(0xc0));
1186 abld.exec_all().SHR(t1, t1, fs_reg(5));
1187
1188 /* This works for both SIMD8 and SIMD16 */
1189 abld.exec_all()
1190 .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
1191
1192 /* This special instruction takes care of setting vstride=1,
1193 * width=4, hstride=0 of t2 during an ADD instruction.
1194 */
1195 abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1196 } else {
1197 /* As per GL_ARB_sample_shading specification:
1198 * "When rendering to a non-multisample buffer, or if multisample
1199 * rasterization is disabled, gl_SampleID will always be zero."
1200 */
1201 abld.MOV(*reg, fs_reg(0));
1202 }
1203
1204 return reg;
1205 }
1206
1207 void
1208 fs_visitor::resolve_source_modifiers(fs_reg *src)
1209 {
1210 if (!src->abs && !src->negate)
1211 return;
1212
1213 fs_reg temp = bld.vgrf(src->type);
1214 bld.MOV(temp, *src);
1215 *src = temp;
1216 }
1217
1218 void
1219 fs_visitor::emit_discard_jump()
1220 {
1221 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1222
1223 /* For performance, after a discard, jump to the end of the
1224 * shader if all relevant channels have been discarded.
1225 */
1226 fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
1227 discard_jump->flag_subreg = 1;
1228
1229 discard_jump->predicate = (dispatch_width == 8)
1230 ? BRW_PREDICATE_ALIGN1_ANY8H
1231 : BRW_PREDICATE_ALIGN1_ANY16H;
1232 discard_jump->predicate_inverse = true;
1233 }
1234
1235 void
1236 fs_visitor::assign_curb_setup()
1237 {
1238 if (dispatch_width == 8) {
1239 prog_data->dispatch_grf_start_reg = payload.num_regs;
1240 } else {
1241 if (stage == MESA_SHADER_FRAGMENT) {
1242 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1243 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1244 } else if (stage == MESA_SHADER_COMPUTE) {
1245 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1246 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1247 } else {
1248 unreachable("Unsupported shader type!");
1249 }
1250 }
1251
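      /* curb_read_length is measured in whole registers; each register holds
       * eight push-constant dwords, hence the ALIGN to 8 and the divide by 8.
       */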
1252 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1253
1254 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1255 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1256 for (unsigned int i = 0; i < inst->sources; i++) {
1257 if (inst->src[i].file == UNIFORM) {
1258 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1259 int constant_nr;
1260 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1261 constant_nr = push_constant_loc[uniform_nr];
1262 } else {
1263 /* Section 5.11 of the OpenGL 4.1 spec says:
1264 * "Out-of-bounds reads return undefined values, which include
1265 * values from other variables of the active program or zero."
1266 * Just return the first push constant.
1267 */
1268 constant_nr = 0;
1269 }
1270
1271 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1272 constant_nr / 8,
1273 constant_nr % 8);
1274
1275 inst->src[i].file = HW_REG;
1276 inst->src[i].fixed_hw_reg = byte_offset(
1277 retype(brw_reg, inst->src[i].type),
1278 inst->src[i].subreg_offset);
1279 }
1280 }
1281 }
1282 }
1283
1284 void
1285 fs_visitor::calculate_urb_setup()
1286 {
1287 assert(stage == MESA_SHADER_FRAGMENT);
1288 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1289 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1290
1291 memset(prog_data->urb_setup, -1,
1292 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1293
1294 int urb_next = 0;
1295 /* Figure out where each of the incoming setup attributes lands. */
1296 if (devinfo->gen >= 6) {
1297 if (_mesa_bitcount_64(prog->InputsRead &
1298 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1299 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1300 * first 16 varying inputs, so we can put them wherever we want.
1301 * Just put them in order.
1302 *
1303 * This is useful because it means that (a) inputs not used by the
1304 * fragment shader won't take up valuable register space, and (b) we
1305 * won't have to recompile the fragment shader if it gets paired with
1306 * a different vertex (or geometry) shader.
1307 */
1308 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1309 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1310 BITFIELD64_BIT(i)) {
1311 prog_data->urb_setup[i] = urb_next++;
1312 }
1313 }
1314 } else {
1315 /* We have enough input varyings that the SF/SBE pipeline stage can't
1316 * arbitrarily rearrange them to suit our whim; we have to put them
1317 * in an order that matches the output of the previous pipeline stage
1318 * (geometry or vertex shader).
1319 */
1320 struct brw_vue_map prev_stage_vue_map;
1321 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1322 key->input_slots_valid);
1323 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1324 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1325 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1326 slot++) {
1327 int varying = prev_stage_vue_map.slot_to_varying[slot];
1328 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1329 * unused.
1330 */
1331 if (varying != BRW_VARYING_SLOT_COUNT &&
1332 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1333 BITFIELD64_BIT(varying))) {
1334 prog_data->urb_setup[varying] = slot - first_slot;
1335 }
1336 }
1337 urb_next = prev_stage_vue_map.num_slots - first_slot;
1338 }
1339 } else {
1340 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1341 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1342 /* Point size is packed into the header, not as a general attribute */
1343 if (i == VARYING_SLOT_PSIZ)
1344 continue;
1345
1346 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1347 /* The back color slot is skipped when the front color is
1348 * also written to. In addition, some slots can be
1349 * written in the vertex shader and not read in the
1350 * fragment shader. So the register number must always be
1351 * incremented, mapped or not.
1352 */
1353 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1354 prog_data->urb_setup[i] = urb_next;
1355 urb_next++;
1356 }
1357 }
1358
1359 /*
  1360      * gl_PointCoord (PNTC) is an FS-only attribute, and the SF thread did
  1361      * the interpolation for it. So, count it here, too.
1362 *
1363 * See compile_sf_prog() for more info.
1364 */
1365 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1366 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1367 }
1368
1369 prog_data->num_varying_inputs = urb_next;
1370 }
1371
1372 void
1373 fs_visitor::assign_urb_setup()
1374 {
1375 assert(stage == MESA_SHADER_FRAGMENT);
1376 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1377
1378 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1379
1380 /* Offset all the urb_setup[] index by the actual position of the
1381 * setup regs, now that the location of the constants has been chosen.
1382 */
1383 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1384 if (inst->opcode == FS_OPCODE_LINTERP) {
1385 assert(inst->src[1].file == HW_REG);
1386 inst->src[1].fixed_hw_reg.nr += urb_start;
1387 }
1388
1389 if (inst->opcode == FS_OPCODE_CINTERP) {
1390 assert(inst->src[0].file == HW_REG);
1391 inst->src[0].fixed_hw_reg.nr += urb_start;
1392 }
1393 }
1394
1395 /* Each attribute is 4 setup channels, each of which is half a reg. */
1396 this->first_non_payload_grf =
1397 urb_start + prog_data->num_varying_inputs * 2;
1398 }
1399
1400 void
1401 fs_visitor::assign_vs_urb_setup()
1402 {
1403 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1404 int grf, count, slot, channel, attr;
1405
1406 assert(stage == MESA_SHADER_VERTEX);
1407 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1408 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1409 count++;
1410
1411 /* Each attribute is 4 regs. */
1412 this->first_non_payload_grf =
1413 payload.num_regs + prog_data->curb_read_length + count * 4;
1414
1415 unsigned vue_entries =
1416 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1417
1418 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1419 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1420
1421 assert(vs_prog_data->base.urb_read_length <= 15);
1422
1423 /* Rewrite all ATTR file references to the hw grf that they land in. */
1424 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1425 for (int i = 0; i < inst->sources; i++) {
1426 if (inst->src[i].file == ATTR) {
1427
1428 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1429 slot = count - 1;
1430 } else {
  1431            * Attributes come in a contiguous block, ordered by their
1432 * gl_vert_attrib value. That means we can compute the slot
1433 * number for an attribute by masking out the enabled
1434 * attributes before it and counting the bits.
1435 */
1436 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1437 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1438 BITFIELD64_MASK(attr));
1439 }
1440
1441 channel = inst->src[i].reg_offset & 3;
1442
1443 grf = payload.num_regs +
1444 prog_data->curb_read_length +
1445 slot * 4 + channel;
1446
1447 inst->src[i].file = HW_REG;
1448 inst->src[i].fixed_hw_reg =
1449 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1450 }
1451 }
1452 }
1453 }
1454
1455 /**
1456 * Split large virtual GRFs into separate components if we can.
1457 *
1458 * This is mostly duplicated with what brw_fs_vector_splitting does,
1459 * but that's really conservative because it's afraid of doing
1460 * splitting that doesn't result in real progress after the rest of
1461 * the optimization phases, which would cause infinite looping in
1462 * optimization. We can do it once here, safely. This also has the
1463 * opportunity to split interpolated values, or maybe even uniforms,
1464 * which we don't have at the IR level.
1465 *
1466 * We want to split, because virtual GRFs are what we register
1467 * allocate and spill (due to contiguousness requirements for some
1468 * instructions), and they're what we naturally generate in the
1469 * codegen process, but most virtual GRFs don't actually need to be
1470 * contiguous sets of GRFs. If we split, we'll end up with reduced
1471 * live intervals and better dead code elimination and coalescing.
1472 */
1473 void
1474 fs_visitor::split_virtual_grfs()
1475 {
1476 int num_vars = this->alloc.count;
1477
1478 /* Count the total number of registers */
1479 int reg_count = 0;
1480 int vgrf_to_reg[num_vars];
1481 for (int i = 0; i < num_vars; i++) {
1482 vgrf_to_reg[i] = reg_count;
1483 reg_count += alloc.sizes[i];
1484 }
1485
1486 /* An array of "split points". For each register slot, this indicates
1487 * if this slot can be separated from the previous slot. Every time an
1488 * instruction uses multiple elements of a register (as a source or
1489 * destination), we mark the used slots as inseparable. Then we go
1490 * through and split the registers into the smallest pieces we can.
1491 */
1492 bool split_points[reg_count];
1493 memset(split_points, 0, sizeof(split_points));
1494
1495 /* Mark all used registers as fully splittable */
1496 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1497 if (inst->dst.file == GRF) {
1498 int reg = vgrf_to_reg[inst->dst.reg];
1499 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1500 split_points[reg + j] = true;
1501 }
1502
1503 for (int i = 0; i < inst->sources; i++) {
1504 if (inst->src[i].file == GRF) {
1505 int reg = vgrf_to_reg[inst->src[i].reg];
1506 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1507 split_points[reg + j] = true;
1508 }
1509 }
1510 }
1511
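      /* Any instruction that reads or writes more than one register of a VGRF
       * needs those registers to stay contiguous, so clear the corresponding
       * split points again.
       */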
1512 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1513 if (inst->dst.file == GRF) {
1514 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1515 for (int j = 1; j < inst->regs_written; j++)
1516 split_points[reg + j] = false;
1517 }
1518 for (int i = 0; i < inst->sources; i++) {
1519 if (inst->src[i].file == GRF) {
1520 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1521 for (int j = 1; j < inst->regs_read(i); j++)
1522 split_points[reg + j] = false;
1523 }
1524 }
1525 }
1526
1527 int new_virtual_grf[reg_count];
1528 int new_reg_offset[reg_count];
1529
1530 int reg = 0;
1531 for (int i = 0; i < num_vars; i++) {
  1532       /* As a quick sanity check, the first slot should never be a split point. */
1533 assert(split_points[reg] == false);
1534
1535 /* j = 0 case */
1536 new_reg_offset[reg] = 0;
1537 reg++;
1538 int offset = 1;
1539
1540 /* j > 0 case */
1541 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1542 /* If this is a split point, reset the offset to 0 and allocate a
1543 * new virtual GRF for the previous offset many registers
1544 */
1545 if (split_points[reg]) {
1546 assert(offset <= MAX_VGRF_SIZE);
1547 int grf = alloc.allocate(offset);
1548 for (int k = reg - offset; k < reg; k++)
1549 new_virtual_grf[k] = grf;
1550 offset = 0;
1551 }
1552 new_reg_offset[reg] = offset;
1553 offset++;
1554 reg++;
1555 }
1556
1557 /* The last one gets the original register number */
1558 assert(offset <= MAX_VGRF_SIZE);
1559 alloc.sizes[i] = offset;
1560 for (int k = reg - offset; k < reg; k++)
1561 new_virtual_grf[k] = i;
1562 }
1563 assert(reg == reg_count);
1564
1565 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1566 if (inst->dst.file == GRF) {
1567 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1568 inst->dst.reg = new_virtual_grf[reg];
1569 inst->dst.reg_offset = new_reg_offset[reg];
1570 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1571 }
1572 for (int i = 0; i < inst->sources; i++) {
1573 if (inst->src[i].file == GRF) {
1574 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1575 inst->src[i].reg = new_virtual_grf[reg];
1576 inst->src[i].reg_offset = new_reg_offset[reg];
1577 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1578 }
1579 }
1580 }
1581 invalidate_live_intervals();
1582 }
1583
1584 /**
1585 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1586 *
1587 * During code generation, we create tons of temporary variables, many of
1588 * which get immediately killed and are never used again. Yet, in later
1589 * optimization and analysis passes, such as compute_live_intervals, we need
1590 * to loop over all the virtual GRFs. Compacting them can save a lot of
1591 * overhead.
1592 */
1593 bool
1594 fs_visitor::compact_virtual_grfs()
1595 {
1596 bool progress = false;
1597 int remap_table[this->alloc.count];
1598 memset(remap_table, -1, sizeof(remap_table));
1599
1600 /* Mark which virtual GRFs are used. */
1601 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1602 if (inst->dst.file == GRF)
1603 remap_table[inst->dst.reg] = 0;
1604
1605 for (int i = 0; i < inst->sources; i++) {
1606 if (inst->src[i].file == GRF)
1607 remap_table[inst->src[i].reg] = 0;
1608 }
1609 }
1610
1611 /* Compact the GRF arrays. */
1612 int new_index = 0;
1613 for (unsigned i = 0; i < this->alloc.count; i++) {
1614 if (remap_table[i] == -1) {
1615 /* We just found an unused register. This means that we are
1616 * actually going to compact something.
1617 */
1618 progress = true;
1619 } else {
1620 remap_table[i] = new_index;
1621 alloc.sizes[new_index] = alloc.sizes[i];
1622 invalidate_live_intervals();
1623 ++new_index;
1624 }
1625 }
1626
1627 this->alloc.count = new_index;
1628
1629 /* Patch all the instructions to use the newly renumbered registers */
1630 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1631 if (inst->dst.file == GRF)
1632 inst->dst.reg = remap_table[inst->dst.reg];
1633
1634 for (int i = 0; i < inst->sources; i++) {
1635 if (inst->src[i].file == GRF)
1636 inst->src[i].reg = remap_table[inst->src[i].reg];
1637 }
1638 }
1639
1640 /* Patch all the references to delta_xy, since they're used in register
1641 * allocation. If they're unused, switch them to BAD_FILE so we don't
1642 * think some random VGRF is delta_xy.
1643 */
1644 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
1645 if (delta_xy[i].file == GRF) {
1646 if (remap_table[delta_xy[i].reg] != -1) {
1647 delta_xy[i].reg = remap_table[delta_xy[i].reg];
1648 } else {
1649 delta_xy[i].file = BAD_FILE;
1650 }
1651 }
1652 }
1653
1654 return progress;
1655 }
1656
1657 /*
1658 * Implements array access of uniforms by inserting a
1659 * PULL_CONSTANT_LOAD instruction.
1660 *
1661 * Unlike temporary GRF array access (where we don't support it due to
1662 * the difficulty of doing relative addressing on instruction
1663 * destinations), we could potentially do array access of uniforms
1664 * that were loaded in GRF space as push constants. In real-world
1665 * usage we've seen, though, the arrays being used are always larger
1666 * than we could load as push constants, so just always move all
1667 * uniform array access out to a pull constant buffer.
1668 */
1669 void
1670 fs_visitor::move_uniform_array_access_to_pull_constants()
1671 {
1672 if (dispatch_width != 8)
1673 return;
1674
1675 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1676 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
1677
1678 /* Walk through and find array access of uniforms. Put a copy of that
1679 * uniform in the pull constant buffer.
1680 *
1681 * Note that we don't move constant-indexed accesses to arrays. No
1682 * testing has been done of the performance impact of this choice.
1683 */
1684 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
1685 for (int i = 0 ; i < inst->sources; i++) {
1686 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1687 continue;
1688
1689 int uniform = inst->src[i].reg;
1690
1691 /* If this array isn't already present in the pull constant buffer,
1692 * add it.
1693 */
1694 if (pull_constant_loc[uniform] == -1) {
1695 const gl_constant_value **values = &stage_prog_data->param[uniform];
1696
1697 assert(param_size[uniform]);
1698
1699 for (int j = 0; j < param_size[uniform]; j++) {
1700 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1701
1702 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1703 values[j];
1704 }
1705 }
1706 }
1707 }
1708 }
1709
1710 /**
1711 * Assign UNIFORM file registers to either push constants or pull constants.
1712 *
  1713  * We allow a fragment shader to have more than the GL-specified minimum
  1714  * for the maximum number of fragment shader uniform components (64). If
1715 * there are too many of these, they'd fill up all of register space.
1716 * So, this will push some of them out to the pull constant buffer and
1717 * update the program to load them.
1718 */
1719 void
1720 fs_visitor::assign_constant_locations()
1721 {
1722 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1723 if (dispatch_width != 8)
1724 return;
1725
1726 /* Find which UNIFORM registers are still in use. */
1727 bool is_live[uniforms];
1728 for (unsigned int i = 0; i < uniforms; i++) {
1729 is_live[i] = false;
1730 }
1731
1732 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1733 for (int i = 0; i < inst->sources; i++) {
1734 if (inst->src[i].file != UNIFORM)
1735 continue;
1736
1737 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1738 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1739 is_live[constant_nr] = true;
1740 }
1741 }
1742
1743 /* Only allow 16 registers (128 uniform components) as push constants.
1744 *
1745 * Just demote the end of the list. We could probably do better
1746 * here, demoting things that are rarely used in the program first.
1747 *
1748 * If changing this value, note the limitation about total_regs in
1749 * brw_curbe.c.
1750 */
1751 unsigned int max_push_components = 16 * 8;
1752 unsigned int num_push_constants = 0;
1753
1754 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1755
1756 for (unsigned int i = 0; i < uniforms; i++) {
1757 if (!is_live[i] || pull_constant_loc[i] != -1) {
1758 /* This UNIFORM register is either dead, or has already been demoted
1759 * to a pull const. Mark it as no longer living in the param[] array.
1760 */
1761 push_constant_loc[i] = -1;
1762 continue;
1763 }
1764
1765 if (num_push_constants < max_push_components) {
1766 /* Retain as a push constant. Record the location in the params[]
1767 * array.
1768 */
1769 push_constant_loc[i] = num_push_constants++;
1770 } else {
1771 /* Demote to a pull constant. */
1772 push_constant_loc[i] = -1;
1773
1774 int pull_index = stage_prog_data->nr_pull_params++;
1775 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1776 pull_constant_loc[i] = pull_index;
1777 }
1778 }
1779
1780 stage_prog_data->nr_params = num_push_constants;
1781
1782 /* Up until now, the param[] array has been indexed by reg + reg_offset
1783 * of UNIFORM registers. Condense it to only contain the uniforms we
1784 * chose to upload as push constants.
1785 */
1786 for (unsigned int i = 0; i < uniforms; i++) {
1787 int remapped = push_constant_loc[i];
1788
1789 if (remapped == -1)
1790 continue;
1791
1792 assert(remapped <= (int)i);
1793 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1794 }
1795 }
1796
1797 /**
1798 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1799 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1800 */
1801 void
1802 fs_visitor::demote_pull_constants()
1803 {
1804 foreach_block_and_inst (block, fs_inst, inst, cfg) {
1805 for (int i = 0; i < inst->sources; i++) {
1806 if (inst->src[i].file != UNIFORM)
1807 continue;
1808
1809 int pull_index;
1810 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
1811 if (location >= uniforms) /* Out of bounds access */
1812 pull_index = -1;
1813 else
1814 pull_index = pull_constant_loc[location];
1815
1816 if (pull_index == -1)
1817 continue;
1818
1819 /* Set up the annotation tracking for new generated instructions. */
1820 const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
1821 .at(block, inst);
1822 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1823 fs_reg dst = vgrf(glsl_type::float_type);
1824
1825 /* Generate a pull load into dst. */
1826 if (inst->src[i].reladdr) {
1827 VARYING_PULL_CONSTANT_LOAD(ibld, dst,
1828 surf_index,
1829 *inst->src[i].reladdr,
1830 pull_index);
1831 inst->src[i].reladdr = NULL;
1832 } else {
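            /* The pull constant buffer is fetched in vec4 (16 byte) blocks, so
             * round the byte offset down to a 16 byte boundary and use the
             * smear to pick the wanted component out of the loaded vec4.  For
             * example, pull_index 6 gives byte offset 24 & ~15 = 16 and smear
             * 6 & 3 = 2, i.e. the third component of that block.
             */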
1833 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1834 ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1835 dst, surf_index, offset);
1836 inst->src[i].set_smear(pull_index & 3);
1837 }
1838
1839 /* Rewrite the instruction to use the temporary VGRF. */
1840 inst->src[i].file = GRF;
1841 inst->src[i].reg = dst.reg;
1842 inst->src[i].reg_offset = 0;
1843 }
1844 }
1845 invalidate_live_intervals();
1846 }
1847
1848 bool
1849 fs_visitor::opt_algebraic()
1850 {
1851 bool progress = false;
1852
1853 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1854 switch (inst->opcode) {
1855 case BRW_OPCODE_MOV:
1856 if (inst->src[0].file != IMM)
1857 break;
1858
1859 if (inst->saturate) {
1860 if (inst->dst.type != inst->src[0].type)
1861 assert(!"unimplemented: saturate mixed types");
1862
1863 if (brw_saturate_immediate(inst->dst.type,
1864 &inst->src[0].fixed_hw_reg)) {
1865 inst->saturate = false;
1866 progress = true;
1867 }
1868 }
1869 break;
1870
1871 case BRW_OPCODE_MUL:
1872 if (inst->src[1].file != IMM)
1873 continue;
1874
1875 /* a * 1.0 = a */
1876 if (inst->src[1].is_one()) {
1877 inst->opcode = BRW_OPCODE_MOV;
1878 inst->src[1] = reg_undef;
1879 progress = true;
1880 break;
1881 }
1882
1883 /* a * -1.0 = -a */
1884 if (inst->src[1].is_negative_one()) {
1885 inst->opcode = BRW_OPCODE_MOV;
1886 inst->src[0].negate = !inst->src[0].negate;
1887 inst->src[1] = reg_undef;
1888 progress = true;
1889 break;
1890 }
1891
1892 /* a * 0.0 = 0.0 */
1893 if (inst->src[1].is_zero()) {
1894 inst->opcode = BRW_OPCODE_MOV;
1895 inst->src[0] = inst->src[1];
1896 inst->src[1] = reg_undef;
1897 progress = true;
1898 break;
1899 }
1900
1901 if (inst->src[0].file == IMM) {
1902 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
1903 inst->opcode = BRW_OPCODE_MOV;
1904 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
1905 inst->src[1] = reg_undef;
1906 progress = true;
1907 break;
1908 }
1909 break;
1910 case BRW_OPCODE_ADD:
1911 if (inst->src[1].file != IMM)
1912 continue;
1913
1914 /* a + 0.0 = a */
1915 if (inst->src[1].is_zero()) {
1916 inst->opcode = BRW_OPCODE_MOV;
1917 inst->src[1] = reg_undef;
1918 progress = true;
1919 break;
1920 }
1921
1922 if (inst->src[0].file == IMM) {
1923 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
1924 inst->opcode = BRW_OPCODE_MOV;
1925 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
1926 inst->src[1] = reg_undef;
1927 progress = true;
1928 break;
1929 }
1930 break;
1931 case BRW_OPCODE_OR:
1932 if (inst->src[0].equals(inst->src[1])) {
1933 inst->opcode = BRW_OPCODE_MOV;
1934 inst->src[1] = reg_undef;
1935 progress = true;
1936 break;
1937 }
1938 break;
1939 case BRW_OPCODE_LRP:
1940 if (inst->src[1].equals(inst->src[2])) {
1941 inst->opcode = BRW_OPCODE_MOV;
1942 inst->src[0] = inst->src[1];
1943 inst->src[1] = reg_undef;
1944 inst->src[2] = reg_undef;
1945 progress = true;
1946 break;
1947 }
1948 break;
1949 case BRW_OPCODE_CMP:
1950 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
1951 inst->src[0].abs &&
1952 inst->src[0].negate &&
1953 inst->src[1].is_zero()) {
1954 inst->src[0].abs = false;
1955 inst->src[0].negate = false;
1956 inst->conditional_mod = BRW_CONDITIONAL_Z;
1957 progress = true;
1958 break;
1959 }
1960 break;
1961 case BRW_OPCODE_SEL:
1962 if (inst->src[0].equals(inst->src[1])) {
1963 inst->opcode = BRW_OPCODE_MOV;
1964 inst->src[1] = reg_undef;
1965 inst->predicate = BRW_PREDICATE_NONE;
1966 inst->predicate_inverse = false;
1967 progress = true;
1968 } else if (inst->saturate && inst->src[1].file == IMM) {
1969 switch (inst->conditional_mod) {
1970 case BRW_CONDITIONAL_LE:
1971 case BRW_CONDITIONAL_L:
1972 switch (inst->src[1].type) {
1973 case BRW_REGISTER_TYPE_F:
1974 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
1975 inst->opcode = BRW_OPCODE_MOV;
1976 inst->src[1] = reg_undef;
1977 inst->conditional_mod = BRW_CONDITIONAL_NONE;
1978 progress = true;
1979 }
1980 break;
1981 default:
1982 break;
1983 }
1984 break;
1985 case BRW_CONDITIONAL_GE:
1986 case BRW_CONDITIONAL_G:
1987 switch (inst->src[1].type) {
1988 case BRW_REGISTER_TYPE_F:
1989 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
1990 inst->opcode = BRW_OPCODE_MOV;
1991 inst->src[1] = reg_undef;
1992 inst->conditional_mod = BRW_CONDITIONAL_NONE;
1993 progress = true;
1994 }
1995 break;
1996 default:
1997 break;
1998 }
1999 default:
2000 break;
2001 }
2002 }
2003 break;
2004 case BRW_OPCODE_MAD:
2005 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2006 inst->opcode = BRW_OPCODE_MOV;
2007 inst->src[1] = reg_undef;
2008 inst->src[2] = reg_undef;
2009 progress = true;
2010 } else if (inst->src[0].is_zero()) {
2011 inst->opcode = BRW_OPCODE_MUL;
2012 inst->src[0] = inst->src[2];
2013 inst->src[2] = reg_undef;
2014 progress = true;
2015 } else if (inst->src[1].is_one()) {
2016 inst->opcode = BRW_OPCODE_ADD;
2017 inst->src[1] = inst->src[2];
2018 inst->src[2] = reg_undef;
2019 progress = true;
2020 } else if (inst->src[2].is_one()) {
2021 inst->opcode = BRW_OPCODE_ADD;
2022 inst->src[2] = reg_undef;
2023 progress = true;
2024 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2025 inst->opcode = BRW_OPCODE_ADD;
2026 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2027 inst->src[2] = reg_undef;
2028 progress = true;
2029 }
2030 break;
2031 case SHADER_OPCODE_RCP: {
2032 fs_inst *prev = (fs_inst *)inst->prev;
2033 if (prev->opcode == SHADER_OPCODE_SQRT) {
2034 if (inst->src[0].equals(prev->dst)) {
2035 inst->opcode = SHADER_OPCODE_RSQ;
2036 inst->src[0] = prev->src[0];
2037 progress = true;
2038 }
2039 }
2040 break;
2041 }
2042 case SHADER_OPCODE_BROADCAST:
2043 if (is_uniform(inst->src[0])) {
2044 inst->opcode = BRW_OPCODE_MOV;
2045 inst->sources = 1;
2046 inst->force_writemask_all = true;
2047 progress = true;
2048 } else if (inst->src[1].file == IMM) {
2049 inst->opcode = BRW_OPCODE_MOV;
2050 inst->src[0] = component(inst->src[0],
2051 inst->src[1].fixed_hw_reg.dw1.ud);
2052 inst->sources = 1;
2053 inst->force_writemask_all = true;
2054 progress = true;
2055 }
2056 break;
2057
2058 default:
2059 break;
2060 }
2061
2062 /* Swap if src[0] is immediate, so any immediate ends up in src1, where the checks above (and the hardware) expect it. */
2063 if (progress && inst->is_commutative()) {
2064 if (inst->src[0].file == IMM) {
2065 fs_reg tmp = inst->src[1];
2066 inst->src[1] = inst->src[0];
2067 inst->src[0] = tmp;
2068 }
2069 }
2070 }
2071 return progress;
2072 }
2073
2074 /**
2075 * Optimize sample messages that have constant zero values for the trailing
2076 * texture coordinates. We can just reduce the message length for these
2077 * instructions instead of reserving a register for it. Trailing parameters
2078 * that aren't sent default to zero anyway. This will cause the dead code
2079 * eliminator to remove the MOV instruction that would otherwise be emitted to
2080 * set up the zero value.
2081 */
2082 bool
2083 fs_visitor::opt_zero_samples()
2084 {
2085 /* Gen4 infers the texturing opcode based on the message length so we can't
2086 * change it.
2087 */
2088 if (devinfo->gen < 5)
2089 return false;
2090
2091 bool progress = false;
2092
2093 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2094 if (!inst->is_tex())
2095 continue;
2096
2097 fs_inst *load_payload = (fs_inst *) inst->prev;
2098
2099 if (load_payload->is_head_sentinel() ||
2100 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2101 continue;
2102
2103 /* We don't want to remove the message header or the first parameter.
2104 * Removing the first parameter is not allowed, see the Haswell PRM
2105 * volume 7, page 149:
2106 *
2107 * "Parameter 0 is required except for the sampleinfo message, which
2108 * has no parameter 0"
2109 */
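      /* A rough worked example of the indexing below: in SIMD8 (one register
       * per parameter) with a one-register header and mlen == 4, the source
       * checked is (4 - 1) / 1 + 1 - 1 = 3, i.e. the last coordinate.  Each
       * time that source is an immediate zero, mlen shrinks by one register
       * and the next-to-last parameter is checked, but mlen never drops below
       * the header plus parameter 0.
       */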
2110 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2111 load_payload->src[(inst->mlen - inst->header_size) /
2112 (dispatch_width / 8) +
2113 inst->header_size - 1].is_zero()) {
2114 inst->mlen -= dispatch_width / 8;
2115 progress = true;
2116 }
2117 }
2118
2119 if (progress)
2120 invalidate_live_intervals();
2121
2122 return progress;
2123 }
2124
2125 /**
2126 * Optimize sample messages which are followed by the final RT write.
2127 *
2128 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2129 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2130 * final texturing results copied to the framebuffer write payload and modify
2131 * them to write to the framebuffer directly.
2132 */
2133 bool
2134 fs_visitor::opt_sampler_eot()
2135 {
2136 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2137
2138 if (stage != MESA_SHADER_FRAGMENT)
2139 return false;
2140
2141 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2142 return false;
2143
2144 /* FINISHME: It should be possible to implement this optimization when there
2145 * are multiple drawbuffers.
2146 */
2147 if (key->nr_color_regions != 1)
2148 return false;
2149
2150 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2151 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2152 assert(fb_write->eot);
2153 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2154
2155 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2156
2157 /* There wasn't one; nothing to do. */
2158 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2159 return false;
2160
2161 /* This optimization doesn't seem to work for textureGather for some
2162 * reason. I can't find any documentation or known workarounds to indicate
2163 * that this is expected, but considering that it is probably pretty
2164 * unlikely that a shader would directly write out the results from
2165 * textureGather we might as well just disable it.
2166 */
2167 if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2168 tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2169 return false;
2170
2171 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2172 * It's very likely to be the previous instruction.
2173 */
2174 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2175 if (load_payload->is_head_sentinel() ||
2176 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2177 return false;
2178
2179 assert(!tex_inst->eot); /* We can't get here twice */
2180 assert((tex_inst->offset & (0xff << 24)) == 0);
2181
2182 tex_inst->offset |= fb_write->target << 24;
2183 tex_inst->eot = true;
2184 tex_inst->dst = bld.null_reg_ud();
2185 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2186
2187 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2188 * to create a new LOAD_PAYLOAD command with the same sources and a space
2189 * saved for the header. Using a new destination register not only makes sure
2190 * we have enough space, but it will make sure the dead code eliminator kills
2191 * the instruction that this will replace.
2192 */
2193 if (tex_inst->header_size != 0)
2194 return true;
2195
2196 fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
2197 load_payload->sources + 1);
2198 fs_reg *new_sources =
2199 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2200
2201 new_sources[0] = fs_reg();
2202 for (int i = 0; i < load_payload->sources; i++)
2203 new_sources[i+1] = load_payload->src[i];
2204
2205 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2206 * requires a lot of information about the sources to figure out how
2207 * many registers need to be used.
2208 * optimization, we may not have the appropriate GRFs required by
2209 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2210 * manually emit the instruction.
2211 */
2212 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2213 load_payload->exec_size,
2214 send_header,
2215 new_sources,
2216 load_payload->sources + 1);
2217
2218 new_load_payload->regs_written = load_payload->regs_written + 1;
2219 new_load_payload->header_size = 1;
2220 tex_inst->mlen++;
2221 tex_inst->header_size = 1;
2222 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2223 tex_inst->src[0] = send_header;
2224
2225 return true;
2226 }
2227
2228 bool
2229 fs_visitor::opt_register_renaming()
2230 {
2231 bool progress = false;
2232 int depth = 0;
2233
2234 int remap[alloc.count];
2235 memset(remap, -1, sizeof(int) * alloc.count);
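   /* remap[r] == -1 means vgrf r has not yet had a whole-register def in this
    * walk; the first such def keeps its number (remap[r] = r), and any later
    * complete def at depth 0 is redirected to a freshly allocated register,
    * splitting independent live ranges that happened to share a vgrf.
    */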
2236
2237 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2238 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2239 depth++;
2240 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2241 inst->opcode == BRW_OPCODE_WHILE) {
2242 depth--;
2243 }
2244
2245 /* Rewrite instruction sources. */
2246 for (int i = 0; i < inst->sources; i++) {
2247 if (inst->src[i].file == GRF &&
2248 remap[inst->src[i].reg] != -1 &&
2249 remap[inst->src[i].reg] != inst->src[i].reg) {
2250 inst->src[i].reg = remap[inst->src[i].reg];
2251 progress = true;
2252 }
2253 }
2254
2255 const int dst = inst->dst.reg;
2256
2257 if (depth == 0 &&
2258 inst->dst.file == GRF &&
2259 alloc.sizes[inst->dst.reg] == inst->exec_size / 8 &&
2260 !inst->is_partial_write()) {
2261 if (remap[dst] == -1) {
2262 remap[dst] = dst;
2263 } else {
2264 remap[dst] = alloc.allocate(inst->exec_size / 8);
2265 inst->dst.reg = remap[dst];
2266 progress = true;
2267 }
2268 } else if (inst->dst.file == GRF &&
2269 remap[dst] != -1 &&
2270 remap[dst] != dst) {
2271 inst->dst.reg = remap[dst];
2272 progress = true;
2273 }
2274 }
2275
2276 if (progress) {
2277 invalidate_live_intervals();
2278
2279 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2280 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2281 delta_xy[i].reg = remap[delta_xy[i].reg];
2282 }
2283 }
2284 }
2285
2286 return progress;
2287 }
2288
2289 /**
2290 * Remove redundant or useless discard jumps.
2291 *
2292 * For example, we can eliminate jumps in the following sequence:
2293 *
2294 * discard-jump (redundant with the next jump)
2295 * discard-jump (useless; jumps to the next instruction)
2296 * placeholder-halt
2297 */
2298 bool
2299 fs_visitor::opt_redundant_discard_jumps()
2300 {
2301 bool progress = false;
2302
2303 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2304
2305 fs_inst *placeholder_halt = NULL;
2306 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2307 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2308 placeholder_halt = inst;
2309 break;
2310 }
2311 }
2312
2313 if (!placeholder_halt)
2314 return false;
2315
2316 /* Delete any discard jumps (HALTs) immediately before the placeholder halt. */
2317 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2318 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2319 prev = (fs_inst *) placeholder_halt->prev) {
2320 prev->remove(last_bblock);
2321 progress = true;
2322 }
2323
2324 if (progress)
2325 invalidate_live_intervals();
2326
2327 return progress;
2328 }
2329
2330 bool
2331 fs_visitor::compute_to_mrf()
2332 {
2333 bool progress = false;
2334 int next_ip = 0;
2335
2336 /* No MRFs on Gen >= 7. */
2337 if (devinfo->gen >= 7)
2338 return false;
2339
2340 calculate_live_intervals();
2341
2342 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2343 int ip = next_ip;
2344 next_ip++;
2345
2346 if (inst->opcode != BRW_OPCODE_MOV ||
2347 inst->is_partial_write() ||
2348 inst->dst.file != MRF || inst->src[0].file != GRF ||
2349 inst->dst.type != inst->src[0].type ||
2350 inst->src[0].abs || inst->src[0].negate ||
2351 !inst->src[0].is_contiguous() ||
2352 inst->src[0].subreg_offset)
2353 continue;
2354
2355 /* Work out which hardware MRF registers are written by this
2356 * instruction.
2357 */
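      /* For example: a plain SIMD16 write to m2 covers m2 and m3, so
       * mrf_high = mrf_low + 1, while a COMPR4 write to m2 puts its two SIMD8
       * halves in m2 and m6, so mrf_high = mrf_low + 4.  In each case
       * mrf_low/mrf_high are exactly the MRFs written, which is what the
       * interference checks further down compare against.
       */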
2358 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2359 int mrf_high;
2360 if (inst->dst.reg & BRW_MRF_COMPR4) {
2361 mrf_high = mrf_low + 4;
2362 } else if (inst->exec_size == 16) {
2363 mrf_high = mrf_low + 1;
2364 } else {
2365 mrf_high = mrf_low;
2366 }
2367
2368 /* Can't compute-to-MRF this GRF if someone else was going to
2369 * read it later.
2370 */
2371 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2372 continue;
2373
2374 /* Found a move of a GRF to a MRF. Let's see if we can go
2375 * rewrite the thing that made this GRF to write into the MRF.
2376 */
2377 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2378 if (scan_inst->dst.file == GRF &&
2379 scan_inst->dst.reg == inst->src[0].reg) {
2380 /* Found the last thing to write our reg we want to turn
2381 * into a compute-to-MRF.
2382 */
2383
2384 /* If this one instruction didn't populate all the
2385 * channels, bail. We might be able to rewrite everything
2386 * that writes that reg, but it would require smarter
2387 * tracking to delay the rewriting until complete success.
2388 */
2389 if (scan_inst->is_partial_write())
2390 break;
2391
2392 /* Things returning more than one register would need us to
2393 * understand coalescing out more than one MOV at a time.
2394 */
2395 if (scan_inst->regs_written > scan_inst->exec_size / 8)
2396 break;
2397
2398 /* SEND instructions can't have MRF as a destination. */
2399 if (scan_inst->mlen)
2400 break;
2401
2402 if (devinfo->gen == 6) {
2403 /* gen6 math instructions must have the destination be
2404 * GRF, so no compute-to-MRF for them.
2405 */
2406 if (scan_inst->is_math()) {
2407 break;
2408 }
2409 }
2410
2411 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2412 /* Found the creator of our MRF's source value. */
2413 scan_inst->dst.file = MRF;
2414 scan_inst->dst.reg = inst->dst.reg;
2415 scan_inst->saturate |= inst->saturate;
2416 inst->remove(block);
2417 progress = true;
2418 }
2419 break;
2420 }
2421
2422 /* We don't handle control flow here. Most computation of
2423 * values that end up in MRFs happens shortly before the MRF
2424 * write anyway.
2425 */
2426 if (block->start() == scan_inst)
2427 break;
2428
2429 /* You can't read from an MRF, so if someone else reads our
2430 * MRF's source GRF that we wanted to rewrite, that stops us.
2431 */
2432 bool interfered = false;
2433 for (int i = 0; i < scan_inst->sources; i++) {
2434 if (scan_inst->src[i].file == GRF &&
2435 scan_inst->src[i].reg == inst->src[0].reg &&
2436 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2437 interfered = true;
2438 }
2439 }
2440 if (interfered)
2441 break;
2442
2443 if (scan_inst->dst.file == MRF) {
2444 /* If somebody else writes our MRF here, we can't
2445 * compute-to-MRF before that.
2446 */
2447 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2448 int scan_mrf_high;
2449
2450 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2451 scan_mrf_high = scan_mrf_low + 4;
2452 } else if (scan_inst->exec_size == 16) {
2453 scan_mrf_high = scan_mrf_low + 1;
2454 } else {
2455 scan_mrf_high = scan_mrf_low;
2456 }
2457
2458 if (mrf_low == scan_mrf_low ||
2459 mrf_low == scan_mrf_high ||
2460 mrf_high == scan_mrf_low ||
2461 mrf_high == scan_mrf_high) {
2462 break;
2463 }
2464 }
2465
2466 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2467 /* Found a SEND instruction, which means that there are
2468 * live values in MRFs from base_mrf to base_mrf +
2469 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2470 * above it.
2471 */
2472 if (mrf_low >= scan_inst->base_mrf &&
2473 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2474 break;
2475 }
2476 if (mrf_high >= scan_inst->base_mrf &&
2477 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2478 break;
2479 }
2480 }
2481 }
2482 }
2483
2484 if (progress)
2485 invalidate_live_intervals();
2486
2487 return progress;
2488 }
2489
2490 /**
2491 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2492 * flow. We could probably do better here with some form of divergence
2493 * analysis.
2494 */
2495 bool
2496 fs_visitor::eliminate_find_live_channel()
2497 {
2498 bool progress = false;
2499 unsigned depth = 0;
2500
2501 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2502 switch (inst->opcode) {
2503 case BRW_OPCODE_IF:
2504 case BRW_OPCODE_DO:
2505 depth++;
2506 break;
2507
2508 case BRW_OPCODE_ENDIF:
2509 case BRW_OPCODE_WHILE:
2510 depth--;
2511 break;
2512
2513 case FS_OPCODE_DISCARD_JUMP:
2514 /* This can potentially make control flow non-uniform until the end
2515 * of the program.
2516 */
2517 return progress;
2518
2519 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
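      /* Outside of all control flow (and before any discard jump, which the
       * pass gives up on above), every dispatched channel is still enabled,
       * so the first live channel is always channel 0 and the scan can be
       * folded to a constant.
       */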
2520 if (depth == 0) {
2521 inst->opcode = BRW_OPCODE_MOV;
2522 inst->src[0] = fs_reg(0);
2523 inst->sources = 1;
2524 inst->force_writemask_all = true;
2525 progress = true;
2526 }
2527 break;
2528
2529 default:
2530 break;
2531 }
2532 }
2533
2534 return progress;
2535 }
2536
2537 /**
2538 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2539 * instructions to FS_OPCODE_REP_FB_WRITE.
2540 */
2541 void
2542 fs_visitor::emit_repclear_shader()
2543 {
2544 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2545 int base_mrf = 1;
2546 int color_mrf = base_mrf + 2;
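   /* Message layout implied by the mlen/header_size values below: m1-m2 are
    * reserved for a header (only used by the multiple-render-target path) and
    * the replicated clear color is loaded into m3, so the single-RT message is
    * just the color (mlen 1) while each multi-RT message spans m1-m3 (mlen 3).
    */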
2547
2548 fs_inst *mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)),
2549 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
2550
2551 fs_inst *write;
2552 if (key->nr_color_regions == 1) {
2553 write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2554 write->saturate = key->clamp_fragment_color;
2555 write->base_mrf = color_mrf;
2556 write->target = 0;
2557 write->header_size = 0;
2558 write->mlen = 1;
2559 } else {
2560 assume(key->nr_color_regions > 0);
2561 for (int i = 0; i < key->nr_color_regions; ++i) {
2562 write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2563 write->saturate = key->clamp_fragment_color;
2564 write->base_mrf = base_mrf;
2565 write->target = i;
2566 write->header_size = 2;
2567 write->mlen = 3;
2568 }
2569 }
2570 write->eot = true;
2571
2572 calculate_cfg();
2573
2574 assign_constant_locations();
2575 assign_curb_setup();
2576
2577 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2578 assert(mov->src[0].file == HW_REG);
2579 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2580 }
2581
2582 /**
2583 * Walks through basic blocks, looking for repeated MRF writes and
2584 * removing the later ones.
2585 */
2586 bool
2587 fs_visitor::remove_duplicate_mrf_writes()
2588 {
2589 fs_inst *last_mrf_move[16];
2590 bool progress = false;
2591
2592 /* Need to update the MRF tracking for compressed instructions. */
2593 if (dispatch_width == 16)
2594 return false;
2595
2596 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2597
2598 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2599 if (inst->is_control_flow()) {
2600 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2601 }
2602
2603 if (inst->opcode == BRW_OPCODE_MOV &&
2604 inst->dst.file == MRF) {
2605 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2606 if (prev_inst && inst->equals(prev_inst)) {
2607 inst->remove(block);
2608 progress = true;
2609 continue;
2610 }
2611 }
2612
2613 /* Clear out the last-write records for MRFs that were overwritten. */
2614 if (inst->dst.file == MRF) {
2615 last_mrf_move[inst->dst.reg] = NULL;
2616 }
2617
2618 if (inst->mlen > 0 && inst->base_mrf != -1) {
2619 /* Found a SEND instruction, which will include two or fewer
2620 * implied MRF writes. We could do better here.
2621 */
2622 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2623 last_mrf_move[inst->base_mrf + i] = NULL;
2624 }
2625 }
2626
2627 /* Clear out any MRF move records whose sources got overwritten. */
2628 if (inst->dst.file == GRF) {
2629 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2630 if (last_mrf_move[i] &&
2631 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2632 last_mrf_move[i] = NULL;
2633 }
2634 }
2635 }
2636
2637 if (inst->opcode == BRW_OPCODE_MOV &&
2638 inst->dst.file == MRF &&
2639 inst->src[0].file == GRF &&
2640 !inst->is_partial_write()) {
2641 last_mrf_move[inst->dst.reg] = inst;
2642 }
2643 }
2644
2645 if (progress)
2646 invalidate_live_intervals();
2647
2648 return progress;
2649 }
2650
2651 static void
2652 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2653 {
2654 /* Clear the flag for registers that actually got read (as expected). */
2655 for (int i = 0; i < inst->sources; i++) {
2656 int grf;
2657 if (inst->src[i].file == GRF) {
2658 grf = inst->src[i].reg;
2659 } else if (inst->src[i].file == HW_REG &&
2660 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2661 grf = inst->src[i].fixed_hw_reg.nr;
2662 } else {
2663 continue;
2664 }
2665
2666 if (grf >= first_grf &&
2667 grf < first_grf + grf_len) {
2668 deps[grf - first_grf] = false;
2669 if (inst->exec_size == 16)
2670 deps[grf - first_grf + 1] = false;
2671 }
2672 }
2673 }
2674
2675 /**
2676 * Implements this workaround for the original 965:
2677 *
2678 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2679 * check for post destination dependencies on this instruction, software
2680 * must ensure that there is no destination hazard for the case of ‘write
2681 * followed by a posted write’ shown in the following example.
2682 *
2683 * 1. mov r3 0
2684 * 2. send r3.xy <rest of send instruction>
2685 * 3. mov r2 r3
2686 *
2687 * Due to no post-destination dependency check on the ‘send’, the above
2688 * code sequence could have two instructions (1 and 2) in flight at the
2689 * same time that both consider ‘r3’ as the target of their final writes.
2690 */
2691 void
2692 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2693 fs_inst *inst)
2694 {
2695 int write_len = inst->regs_written;
2696 int first_write_grf = inst->dst.reg;
2697 bool needs_dep[BRW_MAX_MRF];
2698 assert(write_len < (int)sizeof(needs_dep) - 1);
2699
2700 memset(needs_dep, false, sizeof(needs_dep));
2701 memset(needs_dep, true, write_len);
2702
2703 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2704
2705 /* Walk backwards looking for writes to registers we're writing which
2706 * aren't read since being written. If we hit the start of the program,
2707 * we assume that there are no outstanding dependencies on entry to the
2708 * program.
2709 */
2710 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2711 /* If we hit control flow, assume that there *are* outstanding
2712 * dependencies, and force their cleanup before our instruction.
2713 */
2714 if (block->start() == scan_inst) {
2715 for (int i = 0; i < write_len; i++) {
2716 if (needs_dep[i])
2717 DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
2718 }
2719 return;
2720 }
2721
2722 /* We insert our reads as late as possible on the assumption that any
2723 * instruction but a MOV that might have left us an outstanding
2724 * dependency has more latency than a MOV.
2725 */
2726 if (scan_inst->dst.file == GRF) {
2727 for (int i = 0; i < scan_inst->regs_written; i++) {
2728 int reg = scan_inst->dst.reg + i;
2729
2730 if (reg >= first_write_grf &&
2731 reg < first_write_grf + write_len &&
2732 needs_dep[reg - first_write_grf]) {
2733 DEP_RESOLVE_MOV(bld.at(block, inst), reg);
2734 needs_dep[reg - first_write_grf] = false;
2735 if (scan_inst->exec_size == 16)
2736 needs_dep[reg - first_write_grf + 1] = false;
2737 }
2738 }
2739 }
2740
2741 /* Clear the flag for registers that actually got read (as expected). */
2742 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2743
2744 /* Continue the loop only if we haven't resolved all the dependencies */
2745 int i;
2746 for (i = 0; i < write_len; i++) {
2747 if (needs_dep[i])
2748 break;
2749 }
2750 if (i == write_len)
2751 return;
2752 }
2753 }
2754
2755 /**
2756 * Implements this workaround for the original 965:
2757 *
2758 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2759 * used as a destination register until after it has been sourced by an
2760 * instruction with a different destination register.
2761 */
2762 void
2763 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2764 {
2765 int write_len = inst->regs_written;
2766 int first_write_grf = inst->dst.reg;
2767 bool needs_dep[BRW_MAX_MRF];
2768 assert(write_len < (int)sizeof(needs_dep) - 1);
2769
2770 memset(needs_dep, false, sizeof(needs_dep));
2771 memset(needs_dep, true, write_len);
2772 /* Walk forwards looking for writes to registers we're writing which aren't
2773 * read before being written.
2774 */
2775 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2776 /* If we hit control flow, force resolve all remaining dependencies. */
2777 if (block->end() == scan_inst) {
2778 for (int i = 0; i < write_len; i++) {
2779 if (needs_dep[i])
2780 DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
2781 }
2782 return;
2783 }
2784
2785 /* Clear the flag for registers that actually got read (as expected). */
2786 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2787
2788 /* We insert our reads as late as possible since they're reading the
2789 * result of a SEND, which has massive latency.
2790 */
2791 if (scan_inst->dst.file == GRF &&
2792 scan_inst->dst.reg >= first_write_grf &&
2793 scan_inst->dst.reg < first_write_grf + write_len &&
2794 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2795 DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
2796 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2797 }
2798
2799 /* Continue the loop only if we haven't resolved all the dependencies */
2800 int i;
2801 for (i = 0; i < write_len; i++) {
2802 if (needs_dep[i])
2803 break;
2804 }
2805 if (i == write_len)
2806 return;
2807 }
2808 }
2809
2810 void
2811 fs_visitor::insert_gen4_send_dependency_workarounds()
2812 {
2813 if (devinfo->gen != 4 || devinfo->is_g4x)
2814 return;
2815
2816 bool progress = false;
2817
2818 /* Note that we're done with register allocation, so GRF fs_regs always
2819 * have a .reg_offset of 0.
2820 */
2821
2822 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2823 if (inst->mlen != 0 && inst->dst.file == GRF) {
2824 insert_gen4_pre_send_dependency_workarounds(block, inst);
2825 insert_gen4_post_send_dependency_workarounds(block, inst);
2826 progress = true;
2827 }
2828 }
2829
2830 if (progress)
2831 invalidate_live_intervals();
2832 }
2833
2834 /**
2835 * Turns the generic expression-style uniform pull constant load instruction
2836 * into a hardware-specific series of instructions for loading a pull
2837 * constant.
2838 *
2839 * The expression style allows the CSE pass before this to optimize out
2840 * repeated loads from the same offset, and gives the pre-register-allocation
2841 * scheduling full flexibility, while the conversion to native instructions
2842 * allows the post-register-allocation scheduler the best information
2843 * possible.
2844 *
2845 * Note that execution masking for setting up pull constant loads is special:
2846 * the channels that need to be written are unrelated to the current execution
2847 * mask, since a later instruction will use one of the result channels as a
2848 * source operand for all 8 or 16 of its channels.
2849 */
2850 void
2851 fs_visitor::lower_uniform_pull_constant_loads()
2852 {
2853 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2854 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2855 continue;
2856
2857 if (devinfo->gen >= 7) {
2858 /* Up to this point the offset argument has been a vec4-aligned byte
2859 * offset; the gen7+ message wants a dword offset, so convert it.
2860 */
2861 fs_reg const_offset_reg = inst->src[1];
2862 assert(const_offset_reg.file == IMM &&
2863 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2864 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2865
2866 fs_reg payload, offset;
2867 if (devinfo->gen >= 9) {
2868 /* We have to use a message header on Skylake to get SIMD4x2
2869 * mode. Reserve space for the register.
2870 */
2871 offset = payload = fs_reg(GRF, alloc.allocate(2));
2872 offset.reg_offset++;
2873 inst->mlen = 2;
2874 } else {
2875 offset = payload = fs_reg(GRF, alloc.allocate(1));
2876 inst->mlen = 1;
2877 }
2878
2879 /* This is actually going to be a MOV, but since only the first dword
2880 * is accessed, we have a special opcode to do just that one. Note
2881 * that this needs to be an operation that will be considered a def
2882 * by live variable analysis, or register allocation will explode.
2883 */
2884 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2885 8, offset, const_offset_reg);
2886 setup->force_writemask_all = true;
2887
2888 setup->ir = inst->ir;
2889 setup->annotation = inst->annotation;
2890 inst->insert_before(block, setup);
2891
2892 /* Similarly, this will only populate the first 4 channels of the
2893 * result register (since we only use smear values from 0-3), but we
2894 * don't tell the optimizer.
2895 */
2896 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2897 inst->src[1] = payload;
2898 inst->base_mrf = -1;
2899
2900 invalidate_live_intervals();
2901 } else {
2902 /* Before register allocation, we didn't tell the scheduler about the
2903 * MRF we use. We know it's safe to use this MRF because nothing
2904 * else does except for register spill/unspill, which generates and
2905 * uses its MRF within a single IR instruction.
2906 */
2907 inst->base_mrf = 14;
2908 inst->mlen = 1;
2909 }
2910 }
2911 }
2912
2913 bool
2914 fs_visitor::lower_load_payload()
2915 {
2916 bool progress = false;
2917
2918 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2919 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2920 continue;
2921
2922 assert(inst->dst.file == MRF || inst->dst.file == GRF);
2923 assert(inst->saturate == false);
2924 fs_reg dst = inst->dst;
2925
2926 /* Get rid of COMPR4. We'll add it back in if we need it */
2927 if (dst.file == MRF)
2928 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
2929
2930 const fs_builder hbld = bld.exec_all().group(8, 0).at(block, inst);
2931
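      /* Header sources hold per-message rather than per-channel data, so they
       * are copied below one register at a time with exec-all SIMD8 MOVs,
       * retyped to UD.
       */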
2932 for (uint8_t i = 0; i < inst->header_size; i++) {
2933 if (inst->src[i].file != BAD_FILE) {
2934 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
2935 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
2936 hbld.MOV(mov_dst, mov_src);
2937 }
2938 dst = offset(dst, hbld, 1);
2939 }
2940
2941 const fs_builder ibld = bld.exec_all(inst->force_writemask_all)
2942 .group(inst->exec_size, inst->force_sechalf)
2943 .at(block, inst);
2944
2945 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
2946 inst->exec_size > 8) {
2947 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
2948 * a straightforward copy. Instead, the result of the
2949 * LOAD_PAYLOAD is treated as interleaved and the first four
2950 * non-header sources are unpacked as:
2951 *
2952 * m + 0: r0
2953 * m + 1: g0
2954 * m + 2: b0
2955 * m + 3: a0
2956 * m + 4: r1
2957 * m + 5: g1
2958 * m + 6: b1
2959 * m + 7: a1
2960 *
2961 * This is used for gen <= 5 fb writes.
2962 */
2963 assert(inst->exec_size == 16);
2964 assert(inst->header_size + 4 <= inst->sources);
2965 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
2966 if (inst->src[i].file != BAD_FILE) {
2967 if (devinfo->has_compr4) {
2968 fs_reg compr4_dst = retype(dst, inst->src[i].type);
2969 compr4_dst.reg |= BRW_MRF_COMPR4;
2970 ibld.MOV(compr4_dst, inst->src[i]);
2971 } else {
2972 /* Platform doesn't have COMPR4. We have to fake it */
2973 fs_reg mov_dst = retype(dst, inst->src[i].type);
2974 ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
2975 mov_dst.reg += 4;
2976 ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
2977 }
2978 }
2979
2980 dst.reg++;
2981 }
2982
2983 /* The loop above only ever incremented us through the first set
2984 * of 4 registers. However, thanks to the magic of COMPR4, we
2985 * actually wrote to the first 8 registers, so we need to take
2986 * that into account now.
2987 */
2988 dst.reg += 4;
2989
2990 /* The COMPR4 code took care of the first 4 sources. We'll let
2991 * the regular path handle any remaining sources. Yes, we are
2992 * modifying the instruction but we're about to delete it so
2993 * this really doesn't hurt anything.
2994 */
2995 inst->header_size += 4;
2996 }
2997
2998 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
2999 if (inst->src[i].file != BAD_FILE)
3000 ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
3001 dst = offset(dst, ibld, 1);
3002 }
3003
3004 inst->remove(block);
3005 progress = true;
3006 }
3007
3008 if (progress)
3009 invalidate_live_intervals();
3010
3011 return progress;
3012 }
3013
3014 bool
3015 fs_visitor::lower_integer_multiplication()
3016 {
3017 bool progress = false;
3018
3019 /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3020 * directly, but Cherryview cannot.
3021 */
3022 if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3023 return false;
3024
3025 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3026 if (inst->opcode != BRW_OPCODE_MUL ||
3027 inst->dst.is_accumulator() ||
3028 (inst->dst.type != BRW_REGISTER_TYPE_D &&
3029 inst->dst.type != BRW_REGISTER_TYPE_UD))
3030 continue;
3031
3032 const fs_builder ibld = bld.at(block, inst);
3033
3034 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3035 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3036 * src1 are used.
3037 *
3038 * If multiplying by an immediate value that fits in 16-bits, do a
3039 * single MUL instruction with that value in the proper location.
3040 */
3041 if (inst->src[1].file == IMM &&
3042 inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3043 if (devinfo->gen < 7) {
3044 fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3045 inst->dst.type);
3046 ibld.MOV(imm, inst->src[1]);
3047 ibld.MUL(inst->dst, imm, inst->src[0]);
3048 } else {
3049 ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
3050 }
3051 } else {
3052 /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3053 * do 32-bit integer multiplication in one instruction, but instead
3054 * must do a sequence (which actually calculates a 64-bit result):
3055 *
3056 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3057 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3058 * mov(8) g2<1>D acc0<8,8,1>D
3059 *
3060 * But on Gen > 6, the ability to use second accumulator register
3061 * (acc1) for non-float data types was removed, preventing a simple
3062 * implementation in SIMD16. A 16-channel result can be calculated by
3063 * executing the three instructions twice in SIMD8, once with quarter
3064 * control of 1Q for the first eight channels and again with 2Q for
3065 * the second eight channels.
3066 *
3067 * Which accumulator register is implicitly accessed (by AccWrEnable
3068 * for instance) is determined by the quarter control. Unfortunately
3069 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3070 * implicit accumulator access by an instruction with 2Q will access
3071 * acc1 regardless of whether the data type is usable in acc1.
3072 *
3073 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3074 * integer data types.
3075 *
3076 * Since we only want the low 32-bits of the result, we can do two
3077 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3078 * adjust the high result and add them (like the mach is doing):
3079 *
3080 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3081 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3082 * shl(8) g9<1>D g8<8,8,1>D 16D
3083 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3084 *
3085 * We avoid the shl instruction by realizing that we only want to add
3086 * the low 16-bits of the "high" result to the high 16-bits of the
3087 * "low" result and using proper regioning on the add:
3088 *
3089 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3090 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3091 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3092 *
3093 * Since it does not use the (single) accumulator register, we can
3094 * schedule multi-component multiplications much better.
3095 */
3096
3097 if (inst->conditional_mod && inst->dst.is_null()) {
3098 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3099 inst->dst.type);
3100 }
3101 fs_reg low = inst->dst;
3102 fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3103 inst->dst.type);
3104
3105 if (devinfo->gen >= 7) {
3106 fs_reg src1_0_w = inst->src[1];
3107 fs_reg src1_1_w = inst->src[1];
3108
3109 if (inst->src[1].file == IMM) {
3110 src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3111 src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3112 } else {
3113 src1_0_w.type = BRW_REGISTER_TYPE_UW;
3114 if (src1_0_w.stride != 0) {
3115 assert(src1_0_w.stride == 1);
3116 src1_0_w.stride = 2;
3117 }
3118
3119 src1_1_w.type = BRW_REGISTER_TYPE_UW;
3120 if (src1_1_w.stride != 0) {
3121 assert(src1_1_w.stride == 1);
3122 src1_1_w.stride = 2;
3123 }
3124 src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3125 }
3126 ibld.MUL(low, inst->src[0], src1_0_w);
3127 ibld.MUL(high, inst->src[0], src1_1_w);
3128 } else {
3129 fs_reg src0_0_w = inst->src[0];
3130 fs_reg src0_1_w = inst->src[0];
3131
3132 src0_0_w.type = BRW_REGISTER_TYPE_UW;
3133 if (src0_0_w.stride != 0) {
3134 assert(src0_0_w.stride == 1);
3135 src0_0_w.stride = 2;
3136 }
3137
3138 src0_1_w.type = BRW_REGISTER_TYPE_UW;
3139 if (src0_1_w.stride != 0) {
3140 assert(src0_1_w.stride == 1);
3141 src0_1_w.stride = 2;
3142 }
3143 src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3144
3145 ibld.MUL(low, src0_0_w, inst->src[1]);
3146 ibld.MUL(high, src0_1_w, inst->src[1]);
3147 }
3148
3149 fs_reg dst = inst->dst;
3150 dst.type = BRW_REGISTER_TYPE_UW;
3151 dst.subreg_offset = 2;
3152 dst.stride = 2;
3153
3154 high.type = BRW_REGISTER_TYPE_UW;
3155 high.stride = 2;
3156
3157 low.type = BRW_REGISTER_TYPE_UW;
3158 low.subreg_offset = 2;
3159 low.stride = 2;
3160
3161 ibld.ADD(dst, low, high);
3162
3163 if (inst->conditional_mod) {
3164 fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
3165 set_condmod(inst->conditional_mod,
3166 ibld.MOV(null, inst->dst));
3167 }
3168 }
3169
3170 inst->remove(block);
3171 progress = true;
3172 }
3173
3174 if (progress)
3175 invalidate_live_intervals();
3176
3177 return progress;
3178 }
3179
3180 void
3181 fs_visitor::dump_instructions()
3182 {
3183 dump_instructions(NULL);
3184 }
3185
3186 void
3187 fs_visitor::dump_instructions(const char *name)
3188 {
3189 FILE *file = stderr;
3190 if (name && geteuid() != 0) {
3191 file = fopen(name, "w");
3192 if (!file)
3193 file = stderr;
3194 }
3195
3196 if (cfg) {
3197 calculate_register_pressure();
3198 int ip = 0, max_pressure = 0;
3199 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3200 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3201 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3202 dump_instruction(inst, file);
3203 ip++;
3204 }
3205 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3206 } else {
3207 int ip = 0;
3208 foreach_in_list(backend_instruction, inst, &instructions) {
3209 fprintf(file, "%4d: ", ip++);
3210 dump_instruction(inst, file);
3211 }
3212 }
3213
3214 if (file != stderr) {
3215 fclose(file);
3216 }
3217 }
3218
3219 void
3220 fs_visitor::dump_instruction(backend_instruction *be_inst)
3221 {
3222 dump_instruction(be_inst, stderr);
3223 }
3224
3225 void
3226 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3227 {
3228 fs_inst *inst = (fs_inst *)be_inst;
3229
3230 if (inst->predicate) {
3231 fprintf(file, "(%cf0.%d) ",
3232 inst->predicate_inverse ? '-' : '+',
3233 inst->flag_subreg);
3234 }
3235
3236 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3237 if (inst->saturate)
3238 fprintf(file, ".sat");
3239 if (inst->conditional_mod) {
3240 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3241 if (!inst->predicate &&
3242 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3243 inst->opcode != BRW_OPCODE_IF &&
3244 inst->opcode != BRW_OPCODE_WHILE))) {
3245 fprintf(file, ".f0.%d", inst->flag_subreg);
3246 }
3247 }
3248 fprintf(file, "(%d) ", inst->exec_size);
3249
3250 if (inst->mlen) {
3251 fprintf(file, "(mlen: %d) ", inst->mlen);
3252 }
3253
3254 switch (inst->dst.file) {
3255 case GRF:
3256 fprintf(file, "vgrf%d", inst->dst.reg);
3257 if (alloc.sizes[inst->dst.reg] != inst->regs_written ||
3258 inst->dst.subreg_offset)
3259 fprintf(file, "+%d.%d",
3260 inst->dst.reg_offset, inst->dst.subreg_offset);
3261 break;
3262 case MRF:
3263 fprintf(file, "m%d", inst->dst.reg);
3264 break;
3265 case BAD_FILE:
3266 fprintf(file, "(null)");
3267 break;
3268 case UNIFORM:
3269 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3270 break;
3271 case ATTR:
3272 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3273 break;
3274 case HW_REG:
3275 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3276 switch (inst->dst.fixed_hw_reg.nr) {
3277 case BRW_ARF_NULL:
3278 fprintf(file, "null");
3279 break;
3280 case BRW_ARF_ADDRESS:
3281 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3282 break;
3283 case BRW_ARF_ACCUMULATOR:
3284 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3285 break;
3286 case BRW_ARF_FLAG:
3287 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3288 inst->dst.fixed_hw_reg.subnr);
3289 break;
3290 default:
3291 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3292 inst->dst.fixed_hw_reg.subnr);
3293 break;
3294 }
3295 } else {
3296 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3297 }
3298 if (inst->dst.fixed_hw_reg.subnr)
3299 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3300 break;
3301 default:
3302 fprintf(file, "???");
3303 break;
3304 }
3305 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3306
3307 for (int i = 0; i < inst->sources; i++) {
3308 if (inst->src[i].negate)
3309 fprintf(file, "-");
3310 if (inst->src[i].abs)
3311 fprintf(file, "|");
3312 switch (inst->src[i].file) {
3313 case GRF:
3314 fprintf(file, "vgrf%d", inst->src[i].reg);
3315 if (alloc.sizes[inst->src[i].reg] != (unsigned)inst->regs_read(i) ||
3316 inst->src[i].subreg_offset)
3317 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3318 inst->src[i].subreg_offset);
3319 break;
3320 case MRF:
3321 fprintf(file, "***m%d***", inst->src[i].reg);
3322 break;
3323 case ATTR:
3324 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3325 break;
3326 case UNIFORM:
3327 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3328 if (inst->src[i].reladdr) {
3329 fprintf(file, "+reladdr");
3330 } else if (inst->src[i].subreg_offset) {
3331 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3332 inst->src[i].subreg_offset);
3333 }
3334 break;
3335 case BAD_FILE:
3336 fprintf(file, "(null)");
3337 break;
3338 case IMM:
3339 switch (inst->src[i].type) {
3340 case BRW_REGISTER_TYPE_F:
3341 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3342 break;
3343 case BRW_REGISTER_TYPE_W:
3344 case BRW_REGISTER_TYPE_D:
3345 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3346 break;
3347 case BRW_REGISTER_TYPE_UW:
3348 case BRW_REGISTER_TYPE_UD:
3349 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3350 break;
3351 case BRW_REGISTER_TYPE_VF:
3352 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3353 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3354 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3355 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3356 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3357 break;
3358 default:
3359 fprintf(file, "???");
3360 break;
3361 }
3362 break;
3363 case HW_REG:
3364 if (inst->src[i].fixed_hw_reg.negate)
3365 fprintf(file, "-");
3366 if (inst->src[i].fixed_hw_reg.abs)
3367 fprintf(file, "|");
3368 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3369 switch (inst->src[i].fixed_hw_reg.nr) {
3370 case BRW_ARF_NULL:
3371 fprintf(file, "null");
3372 break;
3373 case BRW_ARF_ADDRESS:
3374 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3375 break;
3376 case BRW_ARF_ACCUMULATOR:
3377 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3378 break;
3379 case BRW_ARF_FLAG:
3380 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3381 inst->src[i].fixed_hw_reg.subnr);
3382 break;
3383 default:
3384 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3385 inst->src[i].fixed_hw_reg.subnr);
3386 break;
3387 }
3388 } else {
3389 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3390 }
3391 if (inst->src[i].fixed_hw_reg.subnr)
3392 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3393 if (inst->src[i].fixed_hw_reg.abs)
3394 fprintf(file, "|");
3395 break;
3396 default:
3397 fprintf(file, "???");
3398 break;
3399 }
3400 if (inst->src[i].abs)
3401 fprintf(file, "|");
3402
3403 if (inst->src[i].file != IMM) {
3404 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3405 }
3406
3407 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3408 fprintf(file, ", ");
3409 }
3410
3411 fprintf(file, " ");
3412
3413 if (dispatch_width == 16 && inst->exec_size == 8) {
3414 if (inst->force_sechalf)
3415 fprintf(file, "2ndhalf ");
3416 else
3417 fprintf(file, "1sthalf ");
3418 }
3419
3420 fprintf(file, "\n");
3421 }
3422
3423 /**
3424 * Possibly returns an instruction that set up @param reg.
3425 *
3426 * Sometimes we want to take the result of some expression/variable
3427 * dereference tree and rewrite the instruction generating the result
3428 * of the tree. When processing the tree, we know that the
3429 * instructions generated are all writing temporaries that are dead
3430 * outside of this tree. So, if we have some instructions that write
3431 * a temporary, we're free to point that temp write somewhere else.
3432 *
3433 * Note that this doesn't guarantee that the instruction generated
3434 * only reg -- it might be the size=4 destination of a texture instruction.
3435 */
3436 fs_inst *
3437 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3438 fs_inst *end,
3439 const fs_reg &reg)
3440 {
3441 if (end == start ||
3442 end->is_partial_write() ||
3443 reg.reladdr ||
3444 !reg.equals(end->dst)) {
3445 return NULL;
3446 } else {
3447 return end;
3448 }
3449 }
3450
3451 void
3452 fs_visitor::setup_payload_gen6()
3453 {
3454 bool uses_depth =
3455 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3456 unsigned barycentric_interp_modes =
3457 (stage == MESA_SHADER_FRAGMENT) ?
3458 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3459
3460 assert(devinfo->gen >= 6);
3461
3462 /* R0-1: masks, pixel X/Y coordinates. */
3463 payload.num_regs = 2;
3464 /* R2: only for 32-pixel dispatch. */
3465
3466 /* R3-26: barycentric interpolation coordinates. These appear in the
3467 * same order that they appear in the brw_wm_barycentric_interp_mode
3468 * enum. Each set of coordinates occupies 2 registers if dispatch width
3469 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3470 * appear if they were enabled using the "Barycentric Interpolation
3471 * Mode" bits in WM_STATE.
3472 */
3473 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3474 if (barycentric_interp_modes & (1 << i)) {
3475 payload.barycentric_coord_reg[i] = payload.num_regs;
3476 payload.num_regs += 2;
3477 if (dispatch_width == 16) {
3478 payload.num_regs += 2;
3479 }
3480 }
3481 }
3482
3483 /* R27: interpolated depth if uses source depth */
3484 if (uses_depth) {
3485 payload.source_depth_reg = payload.num_regs;
3486 payload.num_regs++;
3487 if (dispatch_width == 16) {
3488 /* R28: interpolated depth if not SIMD8. */
3489 payload.num_regs++;
3490 }
3491 }
3492 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3493 if (uses_depth) {
3494 payload.source_w_reg = payload.num_regs;
3495 payload.num_regs++;
3496 if (dispatch_width == 16) {
3497 /* R30: interpolated W if not SIMD8. */
3498 payload.num_regs++;
3499 }
3500 }
3501
3502 if (stage == MESA_SHADER_FRAGMENT) {
3503 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3504 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3505 prog_data->uses_pos_offset = key->compute_pos_offset;
3506 /* R31: MSAA position offsets. */
3507 if (prog_data->uses_pos_offset) {
3508 payload.sample_pos_reg = payload.num_regs;
3509 payload.num_regs++;
3510 }
3511 }
3512
3513 /* R32: MSAA input coverage mask */
3514 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3515 assert(devinfo->gen >= 7);
3516 payload.sample_mask_in_reg = payload.num_regs;
3517 payload.num_regs++;
3518 if (dispatch_width == 16) {
3519 /* R33: input coverage mask if not SIMD8. */
3520 payload.num_regs++;
3521 }
3522 }
3523
3524 /* R34-: bary for 32-pixel. */
3525 /* R58-59: interp W for 32-pixel. */
3526
3527 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3528 source_depth_to_render_target = true;
3529 }
3530 }
3531
3532 void
3533 fs_visitor::setup_vs_payload()
3534 {
3535 /* R0: thread header, R1: urb handles */
3536 payload.num_regs = 2;
3537 }
3538
3539 void
3540 fs_visitor::setup_cs_payload()
3541 {
3542 assert(devinfo->gen >= 7);
3543
3544 payload.num_regs = 1;
3545 }
3546
3547 void
3548 fs_visitor::assign_binding_table_offsets()
3549 {
3550 assert(stage == MESA_SHADER_FRAGMENT);
3551 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3552 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3553 uint32_t next_binding_table_offset = 0;
3554
3555 /* If there are no color regions, we still perform an FB write to a null
3556 * renderbuffer, which we place at surface index 0.
3557 */
3558 prog_data->binding_table.render_target_start = next_binding_table_offset;
3559 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3560
3561 assign_common_binding_table_offsets(next_binding_table_offset);
3562 }
3563
3564 void
3565 fs_visitor::calculate_register_pressure()
3566 {
3567 invalidate_live_intervals();
3568 calculate_live_intervals();
3569
3570 unsigned num_instructions = 0;
3571 foreach_block(block, cfg)
3572 num_instructions += block->instructions.length();
3573
3574 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3575
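   /* For each virtual GRF, add its size at every instruction position inside
    * its live range; regs_live_at_ip[ip] then estimates how many registers
    * are simultaneously live at instruction ip.
    */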
3576 for (unsigned reg = 0; reg < alloc.count; reg++) {
3577 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3578 regs_live_at_ip[ip] += alloc.sizes[reg];
3579 }
3580 }
3581
3582 void
3583 fs_visitor::optimize()
3584 {
3585 /* bld is the common builder object pointing at the end of the program we
3586 * used to translate it into i965 IR. For the optimization and lowering
3587 * passes coming next, any code added after the end of the program without
3588 * having explicitly called fs_builder::at() clearly points at a mistake.
3589 * Ideally optimization passes wouldn't be part of the visitor so they
3590 * wouldn't have access to bld at all, but they do, so just in case some
3591 * pass forgets to ask for a location explicitly set it to NULL here to
3592 * make it trip.
3593 */
3594 bld = bld.at(NULL, NULL);
3595
3596 split_virtual_grfs();
3597
3598 move_uniform_array_access_to_pull_constants();
3599 assign_constant_locations();
3600 demote_pull_constants();
3601
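   /* OPT() wraps each pass invocation below: it bumps the per-iteration pass
    * counter, runs the pass, dumps the IR to a per-pass file when the
    * INTEL_DEBUG optimizer flag is set and the pass reported progress, and
    * folds the result into the loop's overall progress flag.
    */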
3602 #define OPT(pass, args...) ({ \
3603 pass_num++; \
3604 bool this_progress = pass(args); \
3605 \
3606 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3607 char filename[64]; \
3608 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3609 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3610 \
3611 backend_shader::dump_instructions(filename); \
3612 } \
3613 \
3614 progress = progress || this_progress; \
3615 this_progress; \
3616 })
3617
3618 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3619 char filename[64];
3620 snprintf(filename, 64, "%s%d-%04d-00-start",
3621 stage_abbrev, dispatch_width,
3622 shader_prog ? shader_prog->Name : 0);
3623
3624 backend_shader::dump_instructions(filename);
3625 }
3626
3627 bool progress;
3628 int iteration = 0;
3629 int pass_num = 0;
3630 do {
3631 progress = false;
3632 pass_num = 0;
3633 iteration++;
3634
3635 OPT(remove_duplicate_mrf_writes);
3636
3637 OPT(opt_algebraic);
3638 OPT(opt_cse);
3639 OPT(opt_copy_propagate);
3640 OPT(opt_peephole_predicated_break);
3641 OPT(opt_cmod_propagation);
3642 OPT(dead_code_eliminate);
3643 OPT(opt_peephole_sel);
3644 OPT(dead_control_flow_eliminate, this);
3645 OPT(opt_register_renaming);
3646 OPT(opt_redundant_discard_jumps);
3647 OPT(opt_saturate_propagation);
3648 OPT(opt_zero_samples);
3649 OPT(register_coalesce);
3650 OPT(compute_to_mrf);
3651 OPT(eliminate_find_live_channel);
3652
3653 OPT(compact_virtual_grfs);
3654 } while (progress);
3655
3656 pass_num = 0;
3657
3658 OPT(opt_sampler_eot);
3659
3660 if (OPT(lower_load_payload)) {
3661 split_virtual_grfs();
3662 OPT(register_coalesce);
3663 OPT(compute_to_mrf);
3664 OPT(dead_code_eliminate);
3665 }
3666
3667 OPT(opt_combine_constants);
3668 OPT(lower_integer_multiplication);
3669
3670 lower_uniform_pull_constant_loads();
3671 }
3672
3673 /**
3674 * Three-source instructions must have a GRF/MRF destination register.
3675 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3676 */
3677 void
3678 fs_visitor::fixup_3src_null_dest()
3679 {
3680 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3681 if (inst->is_3src() && inst->dst.is_null()) {
3682 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3683 inst->dst.type);
3684 }
3685 }
3686 }
3687
3688 void
3689 fs_visitor::allocate_registers()
3690 {
3691 bool allocated_without_spills;
3692
3693 static const enum instruction_scheduler_mode pre_modes[] = {
3694 SCHEDULE_PRE,
3695 SCHEDULE_PRE_NON_LIFO,
3696 SCHEDULE_PRE_LIFO,
3697 };
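/* Roughly, SCHEDULE_PRE schedules purely for latency, while the NON_LIFO
 * and LIFO variants progressively sacrifice latency hiding in order to
 * reduce register pressure.
 */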
3698
3699 /* Try each scheduling heuristic to see if it can successfully register
3700 * allocate without spilling. They should be ordered by decreasing
3701 * performance but increasing likelihood of allocating.
3702 */
3703 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3704 schedule_instructions(pre_modes[i]);
3705
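/* The "if (0)" below is a developer escape hatch: flip it on to bypass the
 * graph-coloring allocator and hand out registers trivially when debugging
 * register allocation problems.
 */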
3706 if (0) {
3707 assign_regs_trivial();
3708 allocated_without_spills = true;
3709 } else {
3710 allocated_without_spills = assign_regs(false);
3711 }
3712 if (allocated_without_spills)
3713 break;
3714 }
3715
3716 if (!allocated_without_spills) {
3717 /* We assume that any spilling is worse than just dropping back to
3718 * SIMD8. There is probably some intermediate point where
3719 * SIMD16 with a couple of spills is still better.
3720 */
3721 if (dispatch_width == 16) {
3722 fail("Failure to register allocate. Reduce number of "
3723 "live scalar values to avoid this.");
3724 } else {
3725 compiler->shader_perf_log(log_data,
3726 "%s shader triggered register spilling. "
3727 "Try reducing the number of live scalar "
3728 "values to improve performance.\n",
3729 stage_name);
3730 }
3731
3732 /* Since we're out of heuristics, just go spill registers until we
3733 * get an allocation.
3734 */
3735 while (!assign_regs(true)) {
3736 if (failed)
3737 break;
3738 }
3739 }
3740
3741 /* This must come after all optimization and register allocation, since
3742 * it inserts dead code that happens to have side effects, and it does
3743 * so based on the actual physical registers in use.
3744 */
3745 insert_gen4_send_dependency_workarounds();
3746
3747 if (failed)
3748 return;
3749
3750 if (!allocated_without_spills)
3751 schedule_instructions(SCHEDULE_POST);
3752
3753 if (last_scratch > 0)
3754 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3755 }
3756
3757 bool
3758 fs_visitor::run_vs(gl_clip_plane *clip_planes)
3759 {
3760 assert(stage == MESA_SHADER_VERTEX);
3761
3762 assign_common_binding_table_offsets(0);
3763 setup_vs_payload();
3764
3765 if (shader_time_index >= 0)
3766 emit_shader_time_begin();
3767
3768 emit_nir_code();
3769
3770 if (failed)
3771 return false;
3772
3773 compute_clip_distance(clip_planes);
3774
3775 emit_urb_writes();
3776
3777 if (shader_time_index >= 0)
3778 emit_shader_time_end();
3779
3780 calculate_cfg();
3781
3782 optimize();
3783
3784 assign_curb_setup();
3785 assign_vs_urb_setup();
3786
3787 fixup_3src_null_dest();
3788 allocate_registers();
3789
3790 return !failed;
3791 }
3792
3793 bool
3794 fs_visitor::run_fs(bool do_rep_send)
3795 {
3796 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3797 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3798
3799 assert(stage == MESA_SHADER_FRAGMENT);
3800
3801 sanity_param_count = prog->Parameters->NumParameters;
3802
3803 assign_binding_table_offsets();
3804
3805 if (devinfo->gen >= 6)
3806 setup_payload_gen6();
3807 else
3808 setup_payload_gen4();
3809
3810 if (0) {
3811 emit_dummy_fs();
3812 } else if (do_rep_send) {
3813 assert(dispatch_width == 16);
3814 emit_repclear_shader();
3815 } else {
3816 if (shader_time_index >= 0)
3817 emit_shader_time_begin();
3818
3819 calculate_urb_setup();
3820 if (prog->InputsRead > 0) {
3821 if (devinfo->gen < 6)
3822 emit_interpolation_setup_gen4();
3823 else
3824 emit_interpolation_setup_gen6();
3825 }
3826
3827 /* We handle discards by keeping track of the still-live pixels in f0.1.
3828 * Initialize it with the dispatched pixels.
3829 */
3830 if (wm_prog_data->uses_kill) {
3831 fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3832 discard_init->flag_subreg = 1;
3833 }
3834
3835 /* Generate FS IR for main(). (The visitor only descends into
3836 * functions called "main".)
3837 */
3838 emit_nir_code();
3839
3840 if (failed)
3841 return false;
3842
3843 if (wm_prog_data->uses_kill)
3844 bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
3845
3846 if (wm_key->alpha_test_func)
3847 emit_alpha_test();
3848
3849 emit_fb_writes();
3850
3851 if (shader_time_index >= 0)
3852 emit_shader_time_end();
3853
3854 calculate_cfg();
3855
3856 optimize();
3857
3858 assign_curb_setup();
3859 assign_urb_setup();
3860
3861 fixup_3src_null_dest();
3862 allocate_registers();
3863
3864 if (failed)
3865 return false;
3866 }
3867
3868 if (dispatch_width == 8)
3869 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3870 else
3871 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3872
3873 /* If any state parameters were appended, then ParameterValues could have
3874 * been realloced, in which case the driver uniform storage set up by
3875 * _mesa_associate_uniform_storage() would point to freed memory. Make
3876 * sure that didn't happen.
3877 */
3878 assert(sanity_param_count == prog->Parameters->NumParameters);
3879
3880 return !failed;
3881 }
3882
3883 bool
3884 fs_visitor::run_cs()
3885 {
3886 assert(stage == MESA_SHADER_COMPUTE);
3887 assert(shader);
3888
3889 sanity_param_count = prog->Parameters->NumParameters;
3890
3891 assign_common_binding_table_offsets(0);
3892
3893 setup_cs_payload();
3894
3895 if (shader_time_index >= 0)
3896 emit_shader_time_begin();
3897
3898 emit_nir_code();
3899
3900 if (failed)
3901 return false;
3902
3903 emit_cs_terminate();
3904
3905 if (shader_time_index >= 0)
3906 emit_shader_time_end();
3907
3908 calculate_cfg();
3909
3910 optimize();
3911
3912 assign_curb_setup();
3913
3914 fixup_3src_null_dest();
3915 allocate_registers();
3916
3917 if (failed)
3918 return false;
3919
3920 /* If any state parameters were appended, then ParameterValues could have
3921 * been realloced, in which case the driver uniform storage set up by
3922 * _mesa_associate_uniform_storage() would point to freed memory. Make
3923 * sure that didn't happen.
3924 */
3925 assert(sanity_param_count == prog->Parameters->NumParameters);
3926
3927 return !failed;
3928 }
3929
3930 const unsigned *
3931 brw_wm_fs_emit(struct brw_context *brw,
3932 void *mem_ctx,
3933 const struct brw_wm_prog_key *key,
3934 struct brw_wm_prog_data *prog_data,
3935 struct gl_fragment_program *fp,
3936 struct gl_shader_program *prog,
3937 unsigned *final_assembly_size)
3938 {
3939 bool start_busy = false;
3940 double start_time = 0;
3941
3942 if (unlikely(brw->perf_debug)) {
3943 start_busy = (brw->batch.last_bo &&
3944 drm_intel_bo_busy(brw->batch.last_bo));
3945 start_time = get_time();
3946 }
3947
3948 struct brw_shader *shader = NULL;
3949 if (prog)
3950 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3951
3952 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3953 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3954
3955 int st_index8 = -1, st_index16 = -1;
3956 if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
3957 st_index8 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS8);
3958 st_index16 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS16);
3959 }
3960
3961 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3962 */
3963 fs_visitor v(brw->intelScreen->compiler, brw,
3964 mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
3965 prog, &fp->Base, 8, st_index8);
3966 if (!v.run_fs(false /* do_rep_send */)) {
3967 if (prog) {
3968 prog->LinkStatus = false;
3969 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3970 }
3971
3972 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3973 v.fail_msg);
3974
3975 return NULL;
3976 }
3977
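/* The SIMD8 compile above is mandatory; the SIMD16 compile below is merely
 * opportunistic.  If it fails (for example due to register spilling) we
 * just report it with perf_debug and fall back to the SIMD8 program.
 */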
3978 cfg_t *simd16_cfg = NULL;
3979 fs_visitor v2(brw->intelScreen->compiler, brw,
3980 mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
3981 prog, &fp->Base, 16, st_index16);
3982 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
3983 if (!v.simd16_unsupported) {
3984 /* Try a SIMD16 compile */
3985 v2.import_uniforms(&v);
3986 if (!v2.run_fs(brw->use_rep_send)) {
3987 perf_debug("SIMD16 shader failed to compile: %s", v2.fail_msg);
3988 } else {
3989 simd16_cfg = v2.cfg;
3990 }
3991 }
3992 }
3993
3994 cfg_t *simd8_cfg;
3995 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3996 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
3997 simd8_cfg = NULL;
3998 prog_data->no_8 = true;
3999 } else {
4000 simd8_cfg = v.cfg;
4001 prog_data->no_8 = false;
4002 }
4003
4004 fs_generator g(brw->intelScreen->compiler, brw,
4005 mem_ctx, (void *) key, &prog_data->base,
4006 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4007
4008 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4009 char *name;
4010 if (prog)
4011 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4012 prog->Label ? prog->Label : "unnamed",
4013 prog->Name);
4014 else
4015 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4016
4017 g.enable_debug(name);
4018 }
4019
4020 if (simd8_cfg)
4021 g.generate_code(simd8_cfg, 8);
4022 if (simd16_cfg)
4023 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4024
4025 if (unlikely(brw->perf_debug) && shader) {
4026 if (shader->compiled_once)
4027 brw_wm_debug_recompile(brw, prog, key);
4028 shader->compiled_once = true;
4029
4030 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4031 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4032 (get_time() - start_time) * 1000);
4033 }
4034 }
4035
4036 return g.get_assembly(final_assembly_size);
4037 }
4038
4039 extern "C" bool
4040 brw_fs_precompile(struct gl_context *ctx,
4041 struct gl_shader_program *shader_prog,
4042 struct gl_program *prog)
4043 {
4044 struct brw_context *brw = brw_context(ctx);
4045 struct brw_wm_prog_key key;
4046
4047 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4048 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4049 bool program_uses_dfdy = fp->UsesDFdy;
4050
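/* Construct a guess at the program key that the most likely GL state would
 * produce, so the shader is compiled at link time and a draw-time lookup
 * with the real key will usually hit the program cache.
 */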
4051 memset(&key, 0, sizeof(key));
4052
4053 if (brw->gen < 6) {
4054 if (fp->UsesKill)
4055 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4056
4057 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4058 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4059
4060 /* Just assume depth testing. */
4061 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4062 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4063 }
4064
4065 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4066 BRW_FS_VARYING_INPUT_MASK) > 16)
4067 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4068
4069 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4070
4071 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4072 key.drawable_height = ctx->DrawBuffer->Height;
4073 }
4074
4075 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4076 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4077 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4078
4079 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4080 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4081 key.nr_color_regions > 1;
4082 }
4083
4084 key.program_string_id = bfp->id;
4085
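/* brw_codegen_wm_prog() updates brw->wm.base.prog_offset and
 * brw->wm.prog_data as a side effect; save and restore them so the
 * precompile does not disturb the currently bound program state.
 */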
4086 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4087 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4088
4089 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4090
4091 brw->wm.base.prog_offset = old_prog_offset;
4092 brw->wm.prog_data = old_prog_data;
4093
4094 return success;
4095 }
4096
4097 void
4098 brw_setup_tex_for_precompile(struct brw_context *brw,
4099 struct brw_sampler_prog_key_data *tex,
4100 struct gl_program *prog)
4101 {
4102 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4103 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
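/* _mesa_fls() returns one past the index of the highest sampler in use, so
 * the loop below covers every sampler unit up to and including the highest
 * one the program references.
 */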
4104 for (unsigned i = 0; i < sampler_count; i++) {
4105 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4106 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4107 tex->swizzles[i] =
4108 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4109 } else {
4110 /* Color sampler: assume no swizzling. */
4111 tex->swizzles[i] = SWIZZLE_XYZW;
4112 }
4113 }
4114 }