i965/fs: Factor out source components calculation to a separate method.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 using namespace brw;
53
54 void
55 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
56 const fs_reg *src, unsigned sources)
57 {
58 memset(this, 0, sizeof(*this));
59
60 this->src = new fs_reg[MAX2(sources, 3)];
61 for (unsigned i = 0; i < sources; i++)
62 this->src[i] = src[i];
63
64 this->opcode = opcode;
65 this->dst = dst;
66 this->sources = sources;
67 this->exec_size = exec_size;
68
69 assert(dst.file != IMM && dst.file != UNIFORM);
70
71 assert(this->exec_size != 0);
72
73 this->conditional_mod = BRW_CONDITIONAL_NONE;
74
75 /* This will be the case for almost all instructions. */
76 switch (dst.file) {
77 case GRF:
78 case HW_REG:
79 case MRF:
80 case ATTR:
81 this->regs_written = DIV_ROUND_UP(dst.component_size(exec_size),
82 REG_SIZE);
83 break;
84 case BAD_FILE:
85 this->regs_written = 0;
86 break;
87 case IMM:
88 case UNIFORM:
89 unreachable("Invalid destination register file");
90 default:
91 unreachable("Invalid register file");
92 }
93
94 this->writes_accumulator = false;
95 }
96
97 fs_inst::fs_inst()
98 {
99 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
100 }
101
102 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
103 {
104 init(opcode, exec_size, reg_undef, NULL, 0);
105 }
106
107 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
108 {
109 init(opcode, exec_size, dst, NULL, 0);
110 }
111
112 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
113 const fs_reg &src0)
114 {
115 const fs_reg src[1] = { src0 };
116 init(opcode, exec_size, dst, src, 1);
117 }
118
119 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
120 const fs_reg &src0, const fs_reg &src1)
121 {
122 const fs_reg src[2] = { src0, src1 };
123 init(opcode, exec_size, dst, src, 2);
124 }
125
126 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
127 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
128 {
129 const fs_reg src[3] = { src0, src1, src2 };
130 init(opcode, exec_size, dst, src, 3);
131 }
132
133 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
134 const fs_reg src[], unsigned sources)
135 {
136 init(opcode, exec_width, dst, src, sources);
137 }
138
139 fs_inst::fs_inst(const fs_inst &that)
140 {
141 memcpy(this, &that, sizeof(that));
142
143 this->src = new fs_reg[MAX2(that.sources, 3)];
144
145 for (unsigned i = 0; i < that.sources; i++)
146 this->src[i] = that.src[i];
147 }
148
149 fs_inst::~fs_inst()
150 {
151 delete[] this->src;
152 }
153
154 void
155 fs_inst::resize_sources(uint8_t num_sources)
156 {
157 if (this->sources != num_sources) {
158 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
159
160 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
161 src[i] = this->src[i];
162
163 delete[] this->src;
164 this->src = src;
165 this->sources = num_sources;
166 }
167 }
168
169 void
170 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
171 const fs_reg &dst,
172 const fs_reg &surf_index,
173 const fs_reg &varying_offset,
174 uint32_t const_offset)
175 {
176 /* We have our constant surface use a pitch of 4 bytes, so our index can
177 * be any component of a vector, and then we load 4 contiguous
178 * components starting from that.
179 *
180 * We break down the const_offset to a portion added to the variable
181 * offset and a portion done using reg_offset, which means that if you
182 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
183 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
184 * CSE can later notice that those loads are all the same and eliminate
185 * the redundant ones.
186 */
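      /* A worked example (illustrative, not from the original source): with
       * const_offset == 6, the ADD below folds the aligned part (6 & ~3 == 4)
       * into the variable offset, the message returns the vec4 holding
       * components 4..7, and the final MOV selects component (6 & 3) == 2 of
       * that result.
       */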
187 fs_reg vec4_offset = vgrf(glsl_type::int_type);
188 bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
189
190 int scale = 1;
191 if (devinfo->gen == 4 && bld.dispatch_width() == 8) {
192 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
193 * u, v, r) as parameters, or we can just use the SIMD16 message
194 * consisting of (header, u). We choose the second, at the cost of a
195 * longer return length.
196 */
197 scale = 2;
198 }
199
200 enum opcode op;
201 if (devinfo->gen >= 7)
202 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
203 else
204 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
205
206 int regs_written = 4 * (bld.dispatch_width() / 8) * scale;
207 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written), dst.type);
208 fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
209 inst->regs_written = regs_written;
210
211 if (devinfo->gen < 7) {
212 inst->base_mrf = 13;
213 inst->header_size = 1;
214 if (devinfo->gen == 4)
215 inst->mlen = 3;
216 else
217 inst->mlen = 1 + bld.dispatch_width() / 8;
218 }
219
220 bld.MOV(dst, offset(vec4_result, bld, (const_offset & 3) * scale));
221 }
222
223 /**
224 * A helper for MOV generation for fixing up broken hardware SEND dependency
225 * handling.
226 */
227 void
228 fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
229 {
230 /* The caller always wants an uncompressed (SIMD8) MOV here, to emit the
231 * minimal extra dependencies and to avoid having to align its registers to 2.
232 */
233 const fs_builder ubld = bld.annotate("send dependency resolve")
234 .half(0);
235
236 ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
237 }
238
239 bool
240 fs_inst::equals(fs_inst *inst) const
241 {
242 return (opcode == inst->opcode &&
243 dst.equals(inst->dst) &&
244 src[0].equals(inst->src[0]) &&
245 src[1].equals(inst->src[1]) &&
246 src[2].equals(inst->src[2]) &&
247 saturate == inst->saturate &&
248 predicate == inst->predicate &&
249 conditional_mod == inst->conditional_mod &&
250 mlen == inst->mlen &&
251 base_mrf == inst->base_mrf &&
252 target == inst->target &&
253 eot == inst->eot &&
254 header_size == inst->header_size &&
255 shadow_compare == inst->shadow_compare &&
256 exec_size == inst->exec_size &&
257 offset == inst->offset);
258 }
259
260 bool
261 fs_inst::overwrites_reg(const fs_reg &reg) const
262 {
263 return reg.in_range(dst, regs_written);
264 }
265
266 bool
267 fs_inst::is_send_from_grf() const
268 {
269 switch (opcode) {
270 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
271 case SHADER_OPCODE_SHADER_TIME_ADD:
272 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
273 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
274 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
275 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
276 case SHADER_OPCODE_UNTYPED_ATOMIC:
277 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
278 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
279 case SHADER_OPCODE_TYPED_ATOMIC:
280 case SHADER_OPCODE_TYPED_SURFACE_READ:
281 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
282 case SHADER_OPCODE_URB_WRITE_SIMD8:
283 return true;
284 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
285 return src[1].file == GRF;
286 case FS_OPCODE_FB_WRITE:
287 return src[0].file == GRF;
288 default:
289 if (is_tex())
290 return src[0].file == GRF;
291
292 return false;
293 }
294 }
295
296 bool
297 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
298 {
299 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
300 return false;
301
302 fs_reg reg = this->src[0];
303 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
304 return false;
305
306 if (grf_alloc.sizes[reg.reg] != this->regs_written)
307 return false;
308
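      /* The header sources each occupy a single register, while the remaining
       * payload sources each occupy exec_size / 8 registers, matching how
       * LOAD_PAYLOAD packs its sources into the destination.
       */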
309 for (int i = 0; i < this->sources; i++) {
310 reg.type = this->src[i].type;
311 if (!this->src[i].equals(reg))
312 return false;
313
314 if (i < this->header_size) {
315 reg.reg_offset += 1;
316 } else {
317 reg.reg_offset += this->exec_size / 8;
318 }
319 }
320
321 return true;
322 }
323
324 bool
325 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
326 {
327 if (devinfo->gen == 6 && is_math())
328 return false;
329
330 if (is_send_from_grf())
331 return false;
332
333 if (!backend_instruction::can_do_source_mods())
334 return false;
335
336 return true;
337 }
338
339 bool
340 fs_inst::has_side_effects() const
341 {
342 return this->eot || backend_instruction::has_side_effects();
343 }
344
345 void
346 fs_reg::init()
347 {
348 memset(this, 0, sizeof(*this));
349 stride = 1;
350 }
351
352 /** Generic unset register constructor. */
353 fs_reg::fs_reg()
354 {
355 init();
356 this->file = BAD_FILE;
357 }
358
359 /** Immediate value constructor. */
360 fs_reg::fs_reg(float f)
361 {
362 init();
363 this->file = IMM;
364 this->type = BRW_REGISTER_TYPE_F;
365 this->stride = 0;
366 this->fixed_hw_reg.dw1.f = f;
367 }
368
369 /** Immediate value constructor. */
370 fs_reg::fs_reg(int32_t i)
371 {
372 init();
373 this->file = IMM;
374 this->type = BRW_REGISTER_TYPE_D;
375 this->stride = 0;
376 this->fixed_hw_reg.dw1.d = i;
377 }
378
379 /** Immediate value constructor. */
380 fs_reg::fs_reg(uint32_t u)
381 {
382 init();
383 this->file = IMM;
384 this->type = BRW_REGISTER_TYPE_UD;
385 this->stride = 0;
386 this->fixed_hw_reg.dw1.ud = u;
387 }
388
389 /** Vector float immediate value constructor. */
390 fs_reg::fs_reg(uint8_t vf[4])
391 {
392 init();
393 this->file = IMM;
394 this->type = BRW_REGISTER_TYPE_VF;
395 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
396 }
397
398 /** Vector float immediate value constructor. */
399 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
400 {
401 init();
402 this->file = IMM;
403 this->type = BRW_REGISTER_TYPE_VF;
404 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
405 (vf1 << 8) |
406 (vf2 << 16) |
407 (vf3 << 24);
408 }
409
410 /** Fixed brw_reg. */
411 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
412 {
413 init();
414 this->file = HW_REG;
415 this->fixed_hw_reg = fixed_hw_reg;
416 this->type = fixed_hw_reg.type;
417 }
418
419 bool
420 fs_reg::equals(const fs_reg &r) const
421 {
422 return (file == r.file &&
423 reg == r.reg &&
424 reg_offset == r.reg_offset &&
425 subreg_offset == r.subreg_offset &&
426 type == r.type &&
427 negate == r.negate &&
428 abs == r.abs &&
429 !reladdr && !r.reladdr &&
430 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
431 stride == r.stride);
432 }
433
434 fs_reg &
435 fs_reg::set_smear(unsigned subreg)
436 {
437 assert(file != HW_REG && file != IMM);
438 subreg_offset = subreg * type_sz(type);
439 stride = 0;
440 return *this;
441 }
442
443 bool
444 fs_reg::is_contiguous() const
445 {
446 return stride == 1;
447 }
448
449 unsigned
450 fs_reg::component_size(unsigned width) const
451 {
452 const unsigned stride = (file != HW_REG ? this->stride :
453 fixed_hw_reg.hstride == 0 ? 0 :
454 1 << (fixed_hw_reg.hstride - 1));
455 return MAX2(width * stride, 1) * type_sz(type);
456 }
457
458 int
459 fs_visitor::type_size(const struct glsl_type *type)
460 {
461 unsigned int size, i;
462
463 switch (type->base_type) {
464 case GLSL_TYPE_UINT:
465 case GLSL_TYPE_INT:
466 case GLSL_TYPE_FLOAT:
467 case GLSL_TYPE_BOOL:
468 return type->components();
469 case GLSL_TYPE_ARRAY:
470 return type_size(type->fields.array) * type->length;
471 case GLSL_TYPE_STRUCT:
472 size = 0;
473 for (i = 0; i < type->length; i++) {
474 size += type_size(type->fields.structure[i].type);
475 }
476 return size;
477 case GLSL_TYPE_SAMPLER:
478 /* Samplers take up no register space, since they're baked in at
479 * link time.
480 */
481 return 0;
482 case GLSL_TYPE_ATOMIC_UINT:
483 return 0;
484 case GLSL_TYPE_SUBROUTINE:
485 return 1;
486 case GLSL_TYPE_IMAGE:
487 case GLSL_TYPE_VOID:
488 case GLSL_TYPE_ERROR:
489 case GLSL_TYPE_INTERFACE:
490 case GLSL_TYPE_DOUBLE:
491 unreachable("not reached");
492 }
493
494 return 0;
495 }
496
497 /**
498 * Create a MOV to read the timestamp register.
499 *
500 * The caller is responsible for emitting the MOV. The return value is
501 * the destination of the MOV, with extra parameters set.
502 */
503 fs_reg
504 fs_visitor::get_timestamp(const fs_builder &bld)
505 {
506 assert(devinfo->gen >= 7);
507
508 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
509 BRW_ARF_TIMESTAMP,
510 0),
511 BRW_REGISTER_TYPE_UD));
512
513 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
514
515 /* We want to read the 3 fields we care about even if it's not enabled in
516 * the dispatch.
517 */
518 bld.group(4, 0).exec_all().MOV(dst, ts);
519
520 /* The caller wants the low 32 bits of the timestamp. Since it's running
521 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
522 * which is plenty of time for our purposes. It is identical across the
523 * EUs, but since it's tracking GPU core speed it will increment at a
524 * varying rate as render P-states change.
525 *
526 * The caller could also check if render P-states have changed (or anything
527 * else that might disrupt timing) by setting smear to 2 and checking if
528 * that field is != 0.
529 */
530 dst.set_smear(0);
531
532 return dst;
533 }
534
535 void
536 fs_visitor::emit_shader_time_begin()
537 {
538 shader_start_time = get_timestamp(bld.annotate("shader time start"));
539 }
540
541 void
542 fs_visitor::emit_shader_time_end()
543 {
544 /* Insert our code just before the final SEND with EOT. */
545 exec_node *end = this->instructions.get_tail();
546 assert(end && ((fs_inst *) end)->eot);
547 const fs_builder ibld = bld.annotate("shader time end")
548 .exec_all().at(NULL, end);
549
550 fs_reg shader_end_time = get_timestamp(ibld);
551
552 /* Check that there weren't any timestamp reset events (assuming these
553 * were the only two timestamp reads that happened).
554 */
555 fs_reg reset = shader_end_time;
556 reset.set_smear(2);
557 set_condmod(BRW_CONDITIONAL_Z,
558 ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
559 ibld.IF(BRW_PREDICATE_NORMAL);
560
561 fs_reg start = shader_start_time;
562 start.negate = true;
563 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
564 diff.set_smear(0);
565
566 const fs_builder cbld = ibld.group(1, 0);
567 cbld.group(1, 0).ADD(diff, start, shader_end_time);
568
569 /* If there were no instructions between the two timestamp gets, the diff
570 * is 2 cycles. Remove that overhead, so I can forget about that when
571 * trying to determine the time taken for single instructions.
572 */
573 cbld.ADD(diff, diff, fs_reg(-2u));
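      /* Accumulate into three consecutive slots: subindex 0 gets the cycle
       * delta and subindex 1 counts executions where the timestamps were
       * usable; otherwise subindex 2 counts executions discarded because a
       * timestamp reset was detected.
       */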
574 SHADER_TIME_ADD(cbld, 0, diff);
575 SHADER_TIME_ADD(cbld, 1, fs_reg(1u));
576 ibld.emit(BRW_OPCODE_ELSE);
577 SHADER_TIME_ADD(cbld, 2, fs_reg(1u));
578 ibld.emit(BRW_OPCODE_ENDIF);
579 }
580
581 void
582 fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
583 int shader_time_subindex,
584 fs_reg value)
585 {
586 int index = shader_time_index * 3 + shader_time_subindex;
587 fs_reg offset = fs_reg(index * SHADER_TIME_STRIDE);
588
589 fs_reg payload;
590 if (dispatch_width == 8)
591 payload = vgrf(glsl_type::uvec2_type);
592 else
593 payload = vgrf(glsl_type::uint_type);
594
595 bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
596 }
597
598 void
599 fs_visitor::vfail(const char *format, va_list va)
600 {
601 char *msg;
602
603 if (failed)
604 return;
605
606 failed = true;
607
608 msg = ralloc_vasprintf(mem_ctx, format, va);
609 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
610
611 this->fail_msg = msg;
612
613 if (debug_enabled) {
614 fprintf(stderr, "%s", msg);
615 }
616 }
617
618 void
619 fs_visitor::fail(const char *format, ...)
620 {
621 va_list va;
622
623 va_start(va, format);
624 vfail(format, va);
625 va_end(va);
626 }
627
628 /**
629 * Mark this program as impossible to compile in SIMD16 mode.
630 *
631 * During the SIMD8 compile (which happens first), we can detect and flag
632 * things that are unsupported in SIMD16 mode, so the compiler can skip
633 * the SIMD16 compile altogether.
634 *
635 * During a SIMD16 compile (if one happens anyway), this just calls fail().
636 */
637 void
638 fs_visitor::no16(const char *msg)
639 {
640 if (dispatch_width == 16) {
641 fail("%s", msg);
642 } else {
643 simd16_unsupported = true;
644
645 compiler->shader_perf_log(log_data,
646 "SIMD16 shader failed to compile: %s", msg);
647 }
648 }
649
650 /**
651 * Returns true if the instruction has a flag that means it won't
652 * update an entire destination register.
653 *
654 * For example, dead code elimination and live variable analysis want to know
655 * when a write to a variable screens off any preceding values that were in
656 * it.
657 */
658 bool
659 fs_inst::is_partial_write() const
660 {
661 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
662 (this->exec_size * type_sz(this->dst.type)) < 32 ||
663 !this->dst.is_contiguous());
664 }
665
666 unsigned
667 fs_inst::components_read(unsigned i) const
668 {
669 switch (opcode) {
670 case FS_OPCODE_LINTERP:
671 if (i == 0)
672 return 2;
673 else
674 return 1;
675
676 case FS_OPCODE_PIXEL_X:
677 case FS_OPCODE_PIXEL_Y:
678 assert(i == 0);
679 return 2;
680
681 default:
682 return 1;
683 }
684 }
685
686 int
687 fs_inst::regs_read(int arg) const
688 {
689 switch (opcode) {
690 case FS_OPCODE_FB_WRITE:
691 case SHADER_OPCODE_URB_WRITE_SIMD8:
692 case SHADER_OPCODE_UNTYPED_ATOMIC:
693 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
694 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
695 case SHADER_OPCODE_TYPED_ATOMIC:
696 case SHADER_OPCODE_TYPED_SURFACE_READ:
697 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
698 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
699 if (arg == 0)
700 return mlen;
701 break;
702
703 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
704 /* The payload is actually stored in src1 */
705 if (arg == 1)
706 return mlen;
707 break;
708
709 case FS_OPCODE_LINTERP:
710 if (arg == 1)
711 return 1;
712 break;
713
714 case SHADER_OPCODE_LOAD_PAYLOAD:
715 if (arg < this->header_size)
716 return 1;
717 break;
718
719 case CS_OPCODE_CS_TERMINATE:
720 return 1;
721
722 default:
723 if (is_tex() && arg == 0 && src[0].file == GRF)
724 return mlen;
725 break;
726 }
727
728 switch (src[arg].file) {
729 case BAD_FILE:
730 case UNIFORM:
731 case IMM:
732 return 1;
733 case GRF:
734 case HW_REG:
735 return DIV_ROUND_UP(components_read(arg) *
736 src[arg].component_size(exec_size),
737 REG_SIZE);
738 case MRF:
739 unreachable("MRF registers are not allowed as sources");
740 default:
741 unreachable("Invalid register file");
742 }
743 }
744
745 bool
746 fs_inst::reads_flag() const
747 {
748 return predicate;
749 }
750
751 bool
752 fs_inst::writes_flag() const
753 {
754 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
755 opcode != BRW_OPCODE_IF &&
756 opcode != BRW_OPCODE_WHILE)) ||
757 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
758 }
759
760 /**
761 * Returns how many MRFs an FS opcode will write over.
762 *
763 * Note that this is not the 0 or 1 implied writes in an actual gen
764 * instruction -- the FS opcodes often generate MOVs in addition.
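 *
 * For example (illustrative): a SIMD16 SHADER_OPCODE_POW takes the
 * two-operand math path below and so implies 2 * 16 / 8 == 4 MRF writes.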
765 */
766 int
767 fs_visitor::implied_mrf_writes(fs_inst *inst)
768 {
769 if (inst->mlen == 0)
770 return 0;
771
772 if (inst->base_mrf == -1)
773 return 0;
774
775 switch (inst->opcode) {
776 case SHADER_OPCODE_RCP:
777 case SHADER_OPCODE_RSQ:
778 case SHADER_OPCODE_SQRT:
779 case SHADER_OPCODE_EXP2:
780 case SHADER_OPCODE_LOG2:
781 case SHADER_OPCODE_SIN:
782 case SHADER_OPCODE_COS:
783 return 1 * dispatch_width / 8;
784 case SHADER_OPCODE_POW:
785 case SHADER_OPCODE_INT_QUOTIENT:
786 case SHADER_OPCODE_INT_REMAINDER:
787 return 2 * dispatch_width / 8;
788 case SHADER_OPCODE_TEX:
789 case FS_OPCODE_TXB:
790 case SHADER_OPCODE_TXD:
791 case SHADER_OPCODE_TXF:
792 case SHADER_OPCODE_TXF_CMS:
793 case SHADER_OPCODE_TXF_MCS:
794 case SHADER_OPCODE_TG4:
795 case SHADER_OPCODE_TG4_OFFSET:
796 case SHADER_OPCODE_TXL:
797 case SHADER_OPCODE_TXS:
798 case SHADER_OPCODE_LOD:
799 return 1;
800 case FS_OPCODE_FB_WRITE:
801 return 2;
802 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
803 case SHADER_OPCODE_GEN4_SCRATCH_READ:
804 return 1;
805 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
806 return inst->mlen;
807 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
808 return inst->mlen;
809 case SHADER_OPCODE_UNTYPED_ATOMIC:
810 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
811 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
812 case SHADER_OPCODE_TYPED_ATOMIC:
813 case SHADER_OPCODE_TYPED_SURFACE_READ:
814 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
815 case SHADER_OPCODE_URB_WRITE_SIMD8:
816 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
817 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
818 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
819 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
820 return 0;
821 default:
822 unreachable("not reached");
823 }
824 }
825
826 fs_reg
827 fs_visitor::vgrf(const glsl_type *const type)
828 {
829 int reg_width = dispatch_width / 8;
830 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
831 brw_type_for_base_type(type));
832 }
833
834 /** Fixed HW reg constructor. */
835 fs_reg::fs_reg(enum register_file file, int reg)
836 {
837 init();
838 this->file = file;
839 this->reg = reg;
840 this->type = BRW_REGISTER_TYPE_F;
841 this->stride = (file == UNIFORM ? 0 : 1);
842 }
843
844 /** Fixed HW reg constructor. */
845 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
846 {
847 init();
848 this->file = file;
849 this->reg = reg;
850 this->type = type;
851 this->stride = (file == UNIFORM ? 0 : 1);
852 }
853
854 /* For SIMD16, we need to follow the uniform setup done for the SIMD8
855 * dispatch. This brings in those uniform definitions.
856 */
857 void
858 fs_visitor::import_uniforms(fs_visitor *v)
859 {
860 this->push_constant_loc = v->push_constant_loc;
861 this->pull_constant_loc = v->pull_constant_loc;
862 this->uniforms = v->uniforms;
863 this->param_size = v->param_size;
864 }
865
866 fs_reg *
867 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
868 bool origin_upper_left)
869 {
870 assert(stage == MESA_SHADER_FRAGMENT);
871 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
872 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
873 fs_reg wpos = *reg;
874 bool flip = !origin_upper_left ^ key->render_to_fbo;
875
876 /* gl_FragCoord.x */
877 if (pixel_center_integer) {
878 bld.MOV(wpos, this->pixel_x);
879 } else {
880 bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
881 }
882 wpos = offset(wpos, bld, 1);
883
884 /* gl_FragCoord.y */
885 if (!flip && pixel_center_integer) {
886 bld.MOV(wpos, this->pixel_y);
887 } else {
888 fs_reg pixel_y = this->pixel_y;
889 float offset = (pixel_center_integer ? 0.0 : 0.5);
890
891 if (flip) {
892 pixel_y.negate = true;
893 offset += key->drawable_height - 1.0;
894 }
895
896 bld.ADD(wpos, pixel_y, fs_reg(offset));
897 }
898 wpos = offset(wpos, bld, 1);
899
900 /* gl_FragCoord.z */
901 if (devinfo->gen >= 6) {
902 bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
903 } else {
904 bld.emit(FS_OPCODE_LINTERP, wpos,
905 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
906 interp_reg(VARYING_SLOT_POS, 2));
907 }
908 wpos = offset(wpos, bld, 1);
909
910 /* gl_FragCoord.w: Already set up in emit_interpolation */
911 bld.MOV(wpos, this->wpos_w);
912
913 return reg;
914 }
915
916 fs_inst *
917 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
918 glsl_interp_qualifier interpolation_mode,
919 bool is_centroid, bool is_sample)
920 {
921 brw_wm_barycentric_interp_mode barycoord_mode;
922 if (devinfo->gen >= 6) {
923 if (is_centroid) {
924 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
925 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
926 else
927 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
928 } else if (is_sample) {
929 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
930 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
931 else
932 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
933 } else {
934 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
935 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
936 else
937 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
938 }
939 } else {
940 /* On Ironlake and below, there is only one interpolation mode.
941 * Centroid interpolation doesn't mean anything on this hardware --
942 * there is no multisampling.
943 */
944 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
945 }
946 return bld.emit(FS_OPCODE_LINTERP, attr,
947 this->delta_xy[barycoord_mode], interp);
948 }
949
950 void
951 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
952 const glsl_type *type,
953 glsl_interp_qualifier interpolation_mode,
954 int location, bool mod_centroid,
955 bool mod_sample)
956 {
957 attr.type = brw_type_for_base_type(type->get_scalar_type());
958
959 assert(stage == MESA_SHADER_FRAGMENT);
960 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
961 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
962
963 unsigned int array_elements;
964
965 if (type->is_array()) {
966 array_elements = type->length;
967 if (array_elements == 0) {
968 fail("dereferenced array '%s' has length 0\n", name);
969 }
970 type = type->fields.array;
971 } else {
972 array_elements = 1;
973 }
974
975 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
976 bool is_gl_Color =
977 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
978 if (key->flat_shade && is_gl_Color) {
979 interpolation_mode = INTERP_QUALIFIER_FLAT;
980 } else {
981 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
982 }
983 }
984
985 for (unsigned int i = 0; i < array_elements; i++) {
986 for (unsigned int j = 0; j < type->matrix_columns; j++) {
987 if (prog_data->urb_setup[location] == -1) {
988 /* If there's no incoming setup data for this slot, don't
989 * emit interpolation for it.
990 */
991 attr = offset(attr, bld, type->vector_elements);
992 location++;
993 continue;
994 }
995
996 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
997 /* Constant interpolation (flat shading) case. The SF has
998 * handed us defined values in only the constant offset
999 * field of the setup reg.
1000 */
1001 for (unsigned int k = 0; k < type->vector_elements; k++) {
1002 struct brw_reg interp = interp_reg(location, k);
1003 interp = suboffset(interp, 3);
1004 interp.type = attr.type;
1005 bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1006 attr = offset(attr, bld, 1);
1007 }
1008 } else {
1009 /* Smooth/noperspective interpolation case. */
1010 for (unsigned int k = 0; k < type->vector_elements; k++) {
1011 struct brw_reg interp = interp_reg(location, k);
1012 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1013 /* Get the pixel/sample mask into f0 so that we know
1014 * which pixels are lit. Then, for each channel that is
1015 * unlit, replace the centroid data with non-centroid
1016 * data.
1017 */
1018 bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1019
1020 fs_inst *inst;
1021 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1022 false, false);
1023 inst->predicate = BRW_PREDICATE_NORMAL;
1024 inst->predicate_inverse = true;
1025 if (devinfo->has_pln)
1026 inst->no_dd_clear = true;
1027
1028 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1029 mod_centroid && !key->persample_shading,
1030 mod_sample || key->persample_shading);
1031 inst->predicate = BRW_PREDICATE_NORMAL;
1032 inst->predicate_inverse = false;
1033 if (devinfo->has_pln)
1034 inst->no_dd_check = true;
1035
1036 } else {
1037 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1038 mod_centroid && !key->persample_shading,
1039 mod_sample || key->persample_shading);
1040 }
1041 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1042 bld.MUL(attr, attr, this->pixel_w);
1043 }
1044 attr = offset(attr, bld, 1);
1045 }
1046
1047 }
1048 location++;
1049 }
1050 }
1051 }
1052
1053 fs_reg *
1054 fs_visitor::emit_frontfacing_interpolation()
1055 {
1056 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1057
1058 if (devinfo->gen >= 6) {
1059 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1060 * a boolean result from this (~0/true or 0/false).
1061 *
1062 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1063 * this task in only one instruction:
1064 * - a negation source modifier will flip the bit; and
1065 * - a W -> D type conversion will sign extend the bit into the high
1066 * word of the destination.
1067 *
1068 * An ASR 15 fills the low word of the destination.
1069 */
1070 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1071 g0.negate = true;
1072
1073 bld.ASR(*reg, g0, fs_reg(15));
1074 } else {
1075 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1076 * a boolean result from this (1/true or 0/false).
1077 *
1078 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1079 * the negation source modifier to flip it. Unfortunately the SHR
1080 * instruction only operates on UD (or D with an abs source modifier)
1081 * sources without negation.
1082 *
1083 * Instead, use ASR (which will give ~0/true or 0/false).
1084 */
1085 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1086 g1_6.negate = true;
1087
1088 bld.ASR(*reg, g1_6, fs_reg(31));
1089 }
1090
1091 return reg;
1092 }
1093
1094 void
1095 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1096 {
1097 assert(stage == MESA_SHADER_FRAGMENT);
1098 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1099 assert(dst.type == BRW_REGISTER_TYPE_F);
1100
1101 if (key->compute_pos_offset) {
1102 /* Convert int_sample_pos to floating point */
1103 bld.MOV(dst, int_sample_pos);
1104 /* Scale to the range [0, 1] */
1105 bld.MUL(dst, dst, fs_reg(1 / 16.0f));
1106 }
1107 else {
1108 /* From the ARB_sample_shading specification:
1109 * "When rendering to a non-multisample buffer, or if multisample
1110 * rasterization is disabled, gl_SamplePosition will always be
1111 * (0.5, 0.5)."
1112 */
1113 bld.MOV(dst, fs_reg(0.5f));
1114 }
1115 }
1116
1117 fs_reg *
1118 fs_visitor::emit_samplepos_setup()
1119 {
1120 assert(devinfo->gen >= 6);
1121
1122 const fs_builder abld = bld.annotate("compute sample position");
1123 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1124 fs_reg pos = *reg;
1125 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1126 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1127
1128 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1129 * mode will be enabled.
1130 *
1131 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1132 * R31.1:0 Position Offset X/Y for Slot[3:0]
1133 * R31.3:2 Position Offset X/Y for Slot[7:4]
1134 * .....
1135 *
1136 * The X, Y sample positions come in as bytes in thread payload. So, read
1137 * the positions using vstride=16, width=8, hstride=2.
1138 */
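      /* The <16;8,2>:B region below reads every other byte, i.e. the eight X
       * offsets of the packed (X, Y) byte pairs; the Y reads further down use
       * suboffset 1 (and 17 for the upper SIMD16 half) to pick up the Y bytes.
       */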
1139 struct brw_reg sample_pos_reg =
1140 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1141 BRW_REGISTER_TYPE_B), 16, 8, 2);
1142
1143 if (dispatch_width == 8) {
1144 abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
1145 } else {
1146 abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
1147 abld.half(1).MOV(half(int_sample_x, 1),
1148 fs_reg(suboffset(sample_pos_reg, 16)));
1149 }
1150 /* Compute gl_SamplePosition.x */
1151 compute_sample_position(pos, int_sample_x);
1152 pos = offset(pos, abld, 1);
1153 if (dispatch_width == 8) {
1154 abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
1155 } else {
1156 abld.half(0).MOV(half(int_sample_y, 0),
1157 fs_reg(suboffset(sample_pos_reg, 1)));
1158 abld.half(1).MOV(half(int_sample_y, 1),
1159 fs_reg(suboffset(sample_pos_reg, 17)));
1160 }
1161 /* Compute gl_SamplePosition.y */
1162 compute_sample_position(pos, int_sample_y);
1163 return reg;
1164 }
1165
1166 fs_reg *
1167 fs_visitor::emit_sampleid_setup()
1168 {
1169 assert(stage == MESA_SHADER_FRAGMENT);
1170 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1171 assert(devinfo->gen >= 6);
1172
1173 const fs_builder abld = bld.annotate("compute sample id");
1174 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1175
1176 if (key->compute_sample_id) {
1177 fs_reg t1 = vgrf(glsl_type::int_type);
1178 fs_reg t2 = vgrf(glsl_type::int_type);
1179 t2.type = BRW_REGISTER_TYPE_UW;
1180
1181 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1182 * 8x multisampling, subspan 0 will represent sample N (where N
1183 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1184 * 7. We can find the value of N by looking at R0.0 bits 7:6
1185 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1186 * (since samples are always delivered in pairs). That is, we
1187 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1188 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1189 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1190 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1191 * populating a temporary variable with the sequence (0, 1, 2, 3),
1192 * and then reading from it using vstride=1, width=4, hstride=0.
1193 * These computations also hold for 4x multisampling.
1194 *
1195 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1196 * the first four slots are sample 0 of subspan 0; the next four
1197 * are sample 1 of subspan 0; the third group is sample 0 of
1198 * subspan 1, and finally sample 1 of subspan 1.
1199 */
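      /* Worked example (illustrative): if R0.0 bits 7:6 read 0b10, then
       * (R0.0 & 0xc0) >> 5 == 0x80 >> 5 == 4, and adding the SIMD8 sequence
       * (0, 0, 0, 0, 1, 1, 1, 1) yields sample IDs 4, 4, 4, 4, 5, 5, 5, 5.
       */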
1200 abld.exec_all()
1201 .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1202 fs_reg(0xc0));
1203 abld.exec_all().SHR(t1, t1, fs_reg(5));
1204
1205 /* This works for both SIMD8 and SIMD16 */
1206 abld.exec_all()
1207 .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
1208
1209 /* This special instruction takes care of setting vstride=1,
1210 * width=4, hstride=0 of t2 during an ADD instruction.
1211 */
1212 abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1213 } else {
1214 /* As per GL_ARB_sample_shading specification:
1215 * "When rendering to a non-multisample buffer, or if multisample
1216 * rasterization is disabled, gl_SampleID will always be zero."
1217 */
1218 abld.MOV(*reg, fs_reg(0));
1219 }
1220
1221 return reg;
1222 }
1223
1224 void
1225 fs_visitor::resolve_source_modifiers(fs_reg *src)
1226 {
1227 if (!src->abs && !src->negate)
1228 return;
1229
1230 fs_reg temp = bld.vgrf(src->type);
1231 bld.MOV(temp, *src);
1232 *src = temp;
1233 }
1234
1235 void
1236 fs_visitor::emit_discard_jump()
1237 {
1238 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1239
1240 /* For performance, after a discard, jump to the end of the
1241 * shader if all relevant channels have been discarded.
1242 */
1243 fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
1244 discard_jump->flag_subreg = 1;
1245
1246 discard_jump->predicate = (dispatch_width == 8)
1247 ? BRW_PREDICATE_ALIGN1_ANY8H
1248 : BRW_PREDICATE_ALIGN1_ANY16H;
1249 discard_jump->predicate_inverse = true;
1250 }
1251
1252 void
1253 fs_visitor::assign_curb_setup()
1254 {
1255 if (dispatch_width == 8) {
1256 prog_data->dispatch_grf_start_reg = payload.num_regs;
1257 } else {
1258 if (stage == MESA_SHADER_FRAGMENT) {
1259 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1260 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1261 } else if (stage == MESA_SHADER_COMPUTE) {
1262 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1263 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1264 } else {
1265 unreachable("Unsupported shader type!");
1266 }
1267 }
1268
1269 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1270
1271 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1272 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1273 for (unsigned int i = 0; i < inst->sources; i++) {
1274 if (inst->src[i].file == UNIFORM) {
1275 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1276 int constant_nr;
1277 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1278 constant_nr = push_constant_loc[uniform_nr];
1279 } else {
1280 /* Section 5.11 of the OpenGL 4.1 spec says:
1281 * "Out-of-bounds reads return undefined values, which include
1282 * values from other variables of the active program or zero."
1283 * Just return the first push constant.
1284 */
1285 constant_nr = 0;
1286 }
1287
1288 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1289 constant_nr / 8,
1290 constant_nr % 8);
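         /* E.g. push constant 11 lands in dword 3 of the second CURBE register,
          * i.e. brw_vec1_grf(payload.num_regs + 1, 3).
          */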
1291
1292 assert(inst->src[i].stride == 0);
1293 inst->src[i].file = HW_REG;
1294 inst->src[i].fixed_hw_reg = byte_offset(
1295 retype(brw_reg, inst->src[i].type),
1296 inst->src[i].subreg_offset);
1297 }
1298 }
1299 }
1300 }
1301
1302 void
1303 fs_visitor::calculate_urb_setup()
1304 {
1305 assert(stage == MESA_SHADER_FRAGMENT);
1306 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1307 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1308
1309 memset(prog_data->urb_setup, -1,
1310 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1311
1312 int urb_next = 0;
1313 /* Figure out where each of the incoming setup attributes lands. */
1314 if (devinfo->gen >= 6) {
1315 if (_mesa_bitcount_64(prog->InputsRead &
1316 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1317 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1318 * first 16 varying inputs, so we can put them wherever we want.
1319 * Just put them in order.
1320 *
1321 * This is useful because it means that (a) inputs not used by the
1322 * fragment shader won't take up valuable register space, and (b) we
1323 * won't have to recompile the fragment shader if it gets paired with
1324 * a different vertex (or geometry) shader.
1325 */
1326 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1327 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1328 BITFIELD64_BIT(i)) {
1329 prog_data->urb_setup[i] = urb_next++;
1330 }
1331 }
1332 } else {
1333 /* We have enough input varyings that the SF/SBE pipeline stage can't
1334 * arbitrarily rearrange them to suit our whim; we have to put them
1335 * in an order that matches the output of the previous pipeline stage
1336 * (geometry or vertex shader).
1337 */
1338 struct brw_vue_map prev_stage_vue_map;
1339 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1340 key->input_slots_valid);
1341 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1342 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1343 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1344 slot++) {
1345 int varying = prev_stage_vue_map.slot_to_varying[slot];
1346 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1347 * unused.
1348 */
1349 if (varying != BRW_VARYING_SLOT_COUNT &&
1350 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1351 BITFIELD64_BIT(varying))) {
1352 prog_data->urb_setup[varying] = slot - first_slot;
1353 }
1354 }
1355 urb_next = prev_stage_vue_map.num_slots - first_slot;
1356 }
1357 } else {
1358 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1359 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1360 /* Point size is packed into the header, not as a general attribute */
1361 if (i == VARYING_SLOT_PSIZ)
1362 continue;
1363
1364 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1365 /* The back color slot is skipped when the front color is
1366 * also written to. In addition, some slots can be
1367 * written in the vertex shader and not read in the
1368 * fragment shader. So the register number must always be
1369 * incremented, mapped or not.
1370 */
1371 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1372 prog_data->urb_setup[i] = urb_next;
1373 urb_next++;
1374 }
1375 }
1376
1377 /*
1378 * It's an FS-only attribute, and we did interpolation for this attribute
1379 * in the SF thread. So count it here, too.
1380 *
1381 * See compile_sf_prog() for more info.
1382 */
1383 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1384 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1385 }
1386
1387 prog_data->num_varying_inputs = urb_next;
1388 }
1389
1390 void
1391 fs_visitor::assign_urb_setup()
1392 {
1393 assert(stage == MESA_SHADER_FRAGMENT);
1394 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1395
1396 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1397
1398 /* Offset all the urb_setup[] index by the actual position of the
1399 * setup regs, now that the location of the constants has been chosen.
1400 */
1401 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1402 if (inst->opcode == FS_OPCODE_LINTERP) {
1403 assert(inst->src[1].file == HW_REG);
1404 inst->src[1].fixed_hw_reg.nr += urb_start;
1405 }
1406
1407 if (inst->opcode == FS_OPCODE_CINTERP) {
1408 assert(inst->src[0].file == HW_REG);
1409 inst->src[0].fixed_hw_reg.nr += urb_start;
1410 }
1411 }
1412
1413 /* Each attribute is 4 setup channels, each of which is half a reg. */
1414 this->first_non_payload_grf =
1415 urb_start + prog_data->num_varying_inputs * 2;
1416 }
1417
1418 void
1419 fs_visitor::assign_vs_urb_setup()
1420 {
1421 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1422 int grf, count, slot, channel, attr;
1423
1424 assert(stage == MESA_SHADER_VERTEX);
1425 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1426 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1427 count++;
1428
1429 /* Each attribute is 4 regs. */
1430 this->first_non_payload_grf =
1431 payload.num_regs + prog_data->curb_read_length + count * 4;
1432
1433 unsigned vue_entries =
1434 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1435
1436 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1437 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1438
1439 assert(vs_prog_data->base.urb_read_length <= 15);
1440
1441 /* Rewrite all ATTR file references to the hw grf that they land in. */
1442 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1443 for (int i = 0; i < inst->sources; i++) {
1444 if (inst->src[i].file == ATTR) {
1445
1446 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1447 slot = count - 1;
1448 } else {
1449 /* Attributes arrive in a contiguous block, ordered by their
1450 * gl_vert_attrib value. That means we can compute the slot
1451 * number for an attribute by masking out the enabled
1452 * attributes before it and counting the bits.
1453 */
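               /* E.g. if inputs_read has bits 0, 3 and 7 set, attribute 7 maps
                * to slot _mesa_bitcount_64(inputs_read & BITFIELD64_MASK(7)) == 2.
                */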
1454 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1455 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1456 BITFIELD64_MASK(attr));
1457 }
1458
1459 channel = inst->src[i].reg_offset & 3;
1460
1461 grf = payload.num_regs +
1462 prog_data->curb_read_length +
1463 slot * 4 + channel;
1464
1465 inst->src[i].file = HW_REG;
1466 inst->src[i].fixed_hw_reg =
1467 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1468 }
1469 }
1470 }
1471 }
1472
1473 /**
1474 * Split large virtual GRFs into separate components if we can.
1475 *
1476 * This is mostly duplicated with what brw_fs_vector_splitting does,
1477 * but that's really conservative because it's afraid of doing
1478 * splitting that doesn't result in real progress after the rest of
1479 * the optimization phases, which would cause infinite looping in
1480 * optimization. We can do it once here, safely. This also has the
1481 * opportunity to split interpolated values, or maybe even uniforms,
1482 * which we don't have at the IR level.
1483 *
1484 * We want to split, because virtual GRFs are what we register
1485 * allocate and spill (due to contiguousness requirements for some
1486 * instructions), and they're what we naturally generate in the
1487 * codegen process, but most virtual GRFs don't actually need to be
1488 * contiguous sets of GRFs. If we split, we'll end up with reduced
1489 * live intervals and better dead code elimination and coalescing.
1490 */
1491 void
1492 fs_visitor::split_virtual_grfs()
1493 {
1494 int num_vars = this->alloc.count;
1495
1496 /* Count the total number of registers */
1497 int reg_count = 0;
1498 int vgrf_to_reg[num_vars];
1499 for (int i = 0; i < num_vars; i++) {
1500 vgrf_to_reg[i] = reg_count;
1501 reg_count += alloc.sizes[i];
1502 }
1503
1504 /* An array of "split points". For each register slot, this indicates
1505 * if this slot can be separated from the previous slot. Every time an
1506 * instruction uses multiple elements of a register (as a source or
1507 * destination), we mark the used slots as inseparable. Then we go
1508 * through and split the registers into the smallest pieces we can.
1509 */
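   /* For example (illustrative): a 4-register VGRF written as two 2-register
    * chunks (reg_offsets 0 and 2) keeps its split point at slot 2, so it is
    * split into two independent 2-register VGRFs below.
    */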
1510 bool split_points[reg_count];
1511 memset(split_points, 0, sizeof(split_points));
1512
1513 /* Mark all used registers as fully splittable */
1514 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1515 if (inst->dst.file == GRF) {
1516 int reg = vgrf_to_reg[inst->dst.reg];
1517 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1518 split_points[reg + j] = true;
1519 }
1520
1521 for (int i = 0; i < inst->sources; i++) {
1522 if (inst->src[i].file == GRF) {
1523 int reg = vgrf_to_reg[inst->src[i].reg];
1524 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1525 split_points[reg + j] = true;
1526 }
1527 }
1528 }
1529
1530 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1531 if (inst->dst.file == GRF) {
1532 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1533 for (int j = 1; j < inst->regs_written; j++)
1534 split_points[reg + j] = false;
1535 }
1536 for (int i = 0; i < inst->sources; i++) {
1537 if (inst->src[i].file == GRF) {
1538 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1539 for (int j = 1; j < inst->regs_read(i); j++)
1540 split_points[reg + j] = false;
1541 }
1542 }
1543 }
1544
1545 int new_virtual_grf[reg_count];
1546 int new_reg_offset[reg_count];
1547
1548 int reg = 0;
1549 for (int i = 0; i < num_vars; i++) {
1550 /* The first slot should never be a split point; quick sanity check. */
1551 assert(split_points[reg] == false);
1552
1553 /* j = 0 case */
1554 new_reg_offset[reg] = 0;
1555 reg++;
1556 int offset = 1;
1557
1558 /* j > 0 case */
1559 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1560 /* If this is a split point, reset the offset to 0 and allocate a
1561 * new virtual GRF for the previous offset many registers
1562 */
1563 if (split_points[reg]) {
1564 assert(offset <= MAX_VGRF_SIZE);
1565 int grf = alloc.allocate(offset);
1566 for (int k = reg - offset; k < reg; k++)
1567 new_virtual_grf[k] = grf;
1568 offset = 0;
1569 }
1570 new_reg_offset[reg] = offset;
1571 offset++;
1572 reg++;
1573 }
1574
1575 /* The last one gets the original register number */
1576 assert(offset <= MAX_VGRF_SIZE);
1577 alloc.sizes[i] = offset;
1578 for (int k = reg - offset; k < reg; k++)
1579 new_virtual_grf[k] = i;
1580 }
1581 assert(reg == reg_count);
1582
1583 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1584 if (inst->dst.file == GRF) {
1585 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1586 inst->dst.reg = new_virtual_grf[reg];
1587 inst->dst.reg_offset = new_reg_offset[reg];
1588 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1589 }
1590 for (int i = 0; i < inst->sources; i++) {
1591 if (inst->src[i].file == GRF) {
1592 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1593 inst->src[i].reg = new_virtual_grf[reg];
1594 inst->src[i].reg_offset = new_reg_offset[reg];
1595 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1596 }
1597 }
1598 }
1599 invalidate_live_intervals();
1600 }
1601
1602 /**
1603 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1604 *
1605 * During code generation, we create tons of temporary variables, many of
1606 * which get immediately killed and are never used again. Yet, in later
1607 * optimization and analysis passes, such as compute_live_intervals, we need
1608 * to loop over all the virtual GRFs. Compacting them can save a lot of
1609 * overhead.
1610 */
1611 bool
1612 fs_visitor::compact_virtual_grfs()
1613 {
1614 bool progress = false;
1615 int remap_table[this->alloc.count];
1616 memset(remap_table, -1, sizeof(remap_table));
1617
1618 /* Mark which virtual GRFs are used. */
1619 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1620 if (inst->dst.file == GRF)
1621 remap_table[inst->dst.reg] = 0;
1622
1623 for (int i = 0; i < inst->sources; i++) {
1624 if (inst->src[i].file == GRF)
1625 remap_table[inst->src[i].reg] = 0;
1626 }
1627 }
1628
1629 /* Compact the GRF arrays. */
1630 int new_index = 0;
1631 for (unsigned i = 0; i < this->alloc.count; i++) {
1632 if (remap_table[i] == -1) {
1633 /* We just found an unused register. This means that we are
1634 * actually going to compact something.
1635 */
1636 progress = true;
1637 } else {
1638 remap_table[i] = new_index;
1639 alloc.sizes[new_index] = alloc.sizes[i];
1640 invalidate_live_intervals();
1641 ++new_index;
1642 }
1643 }
1644
1645 this->alloc.count = new_index;
1646
1647 /* Patch all the instructions to use the newly renumbered registers */
1648 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1649 if (inst->dst.file == GRF)
1650 inst->dst.reg = remap_table[inst->dst.reg];
1651
1652 for (int i = 0; i < inst->sources; i++) {
1653 if (inst->src[i].file == GRF)
1654 inst->src[i].reg = remap_table[inst->src[i].reg];
1655 }
1656 }
1657
1658 /* Patch all the references to delta_xy, since they're used in register
1659 * allocation. If they're unused, switch them to BAD_FILE so we don't
1660 * think some random VGRF is delta_xy.
1661 */
1662 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
1663 if (delta_xy[i].file == GRF) {
1664 if (remap_table[delta_xy[i].reg] != -1) {
1665 delta_xy[i].reg = remap_table[delta_xy[i].reg];
1666 } else {
1667 delta_xy[i].file = BAD_FILE;
1668 }
1669 }
1670 }
1671
1672 return progress;
1673 }
1674
1675 /*
1676 * Implements array access of uniforms by inserting a
1677 * PULL_CONSTANT_LOAD instruction.
1678 *
1679 * Unlike temporary GRF array access (where we don't support it due to
1680 * the difficulty of doing relative addressing on instruction
1681 * destinations), we could potentially do array access of uniforms
1682 * that were loaded in GRF space as push constants. In real-world
1683 * usage we've seen, though, the arrays being used are always larger
1684 * than we could load as push constants, so just always move all
1685 * uniform array access out to a pull constant buffer.
1686 */
1687 void
1688 fs_visitor::move_uniform_array_access_to_pull_constants()
1689 {
1690 if (dispatch_width != 8)
1691 return;
1692
1693 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1694 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
1695
1696 /* Walk through and find array access of uniforms. Put a copy of that
1697 * uniform in the pull constant buffer.
1698 *
1699 * Note that we don't move constant-indexed accesses to arrays. No
1700 * testing has been done of the performance impact of this choice.
1701 */
1702 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
1703 for (int i = 0 ; i < inst->sources; i++) {
1704 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1705 continue;
1706
1707 int uniform = inst->src[i].reg;
1708
1709 /* If this array isn't already present in the pull constant buffer,
1710 * add it.
1711 */
1712 if (pull_constant_loc[uniform] == -1) {
1713 const gl_constant_value **values = &stage_prog_data->param[uniform];
1714
1715 assert(param_size[uniform]);
1716
1717 for (int j = 0; j < param_size[uniform]; j++) {
1718 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1719
1720 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1721 values[j];
1722 }
1723 }
1724 }
1725 }
1726 }
1727
1728 /**
1729 * Assign UNIFORM file registers to either push constants or pull constants.
1730 *
1731 * We allow a fragment shader to have more than the specified minimum
1732 * maximum number of fragment shader uniform components (64). If
1733 * there are too many of these, they'd fill up all of register space.
1734 * So, this will push some of them out to the pull constant buffer and
1735 * update the program to load them.
1736 */
1737 void
1738 fs_visitor::assign_constant_locations()
1739 {
1740 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1741 if (dispatch_width != 8)
1742 return;
1743
1744 /* Find which UNIFORM registers are still in use. */
1745 bool is_live[uniforms];
1746 for (unsigned int i = 0; i < uniforms; i++) {
1747 is_live[i] = false;
1748 }
1749
1750 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1751 for (int i = 0; i < inst->sources; i++) {
1752 if (inst->src[i].file != UNIFORM)
1753 continue;
1754
1755 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1756 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1757 is_live[constant_nr] = true;
1758 }
1759 }
1760
1761 /* Only allow 16 registers (128 uniform components) as push constants.
1762 *
1763 * Just demote the end of the list. We could probably do better
1764 * here, demoting things that are rarely used in the program first.
1765 *
1766 * If changing this value, note the limitation about total_regs in
1767 * brw_curbe.c.
1768 */
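   /* 16 GRFs at 8 dwords each, i.e. 128 scalar components. */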
1769 unsigned int max_push_components = 16 * 8;
1770 unsigned int num_push_constants = 0;
1771
1772 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1773
1774 for (unsigned int i = 0; i < uniforms; i++) {
1775 if (!is_live[i] || pull_constant_loc[i] != -1) {
1776 /* This UNIFORM register is either dead, or has already been demoted
1777 * to a pull const. Mark it as no longer living in the param[] array.
1778 */
1779 push_constant_loc[i] = -1;
1780 continue;
1781 }
1782
1783 if (num_push_constants < max_push_components) {
1784 /* Retain as a push constant. Record the location in the params[]
1785 * array.
1786 */
1787 push_constant_loc[i] = num_push_constants++;
1788 } else {
1789 /* Demote to a pull constant. */
1790 push_constant_loc[i] = -1;
1791
1792 int pull_index = stage_prog_data->nr_pull_params++;
1793 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1794 pull_constant_loc[i] = pull_index;
1795 }
1796 }
1797
1798 stage_prog_data->nr_params = num_push_constants;
1799
1800 /* Up until now, the param[] array has been indexed by reg + reg_offset
1801 * of UNIFORM registers. Condense it to only contain the uniforms we
1802 * chose to upload as push constants.
1803 */
1804 for (unsigned int i = 0; i < uniforms; i++) {
1805 int remapped = push_constant_loc[i];
1806
1807 if (remapped == -1)
1808 continue;
1809
1810 assert(remapped <= (int)i);
1811 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1812 }
1813 }
1814
1815 /**
1816 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1817 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1818 */
1819 void
1820 fs_visitor::demote_pull_constants()
1821 {
1822 foreach_block_and_inst (block, fs_inst, inst, cfg) {
1823 for (int i = 0; i < inst->sources; i++) {
1824 if (inst->src[i].file != UNIFORM)
1825 continue;
1826
1827 int pull_index;
1828 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
1829 if (location >= uniforms) /* Out of bounds access */
1830 pull_index = -1;
1831 else
1832 pull_index = pull_constant_loc[location];
1833
1834 if (pull_index == -1)
1835 continue;
1836
1837          /* Set up the annotation tracking for newly generated instructions. */
1838 const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
1839 .at(block, inst);
1840 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1841 fs_reg dst = vgrf(glsl_type::float_type);
1842
1843 assert(inst->src[i].stride == 0);
1844
1845 /* Generate a pull load into dst. */
1846 if (inst->src[i].reladdr) {
1847 VARYING_PULL_CONSTANT_LOAD(ibld, dst,
1848 surf_index,
1849 *inst->src[i].reladdr,
1850 pull_index);
1851 inst->src[i].reladdr = NULL;
1852 inst->src[i].stride = 1;
1853 } else {
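            /* Worked example (editorial, not in the original code): for
             * pull_index == 6, the byte offset below is (6 * 4) & ~15 == 16,
             * i.e. the start of the vec4 holding constants 4..7, and the
             * smear selects channel 6 & 3 == 2 within that vec4.
             */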
1854 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1855 ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1856 dst, surf_index, offset);
1857 inst->src[i].set_smear(pull_index & 3);
1858 }
1859
1860 /* Rewrite the instruction to use the temporary VGRF. */
1861 inst->src[i].file = GRF;
1862 inst->src[i].reg = dst.reg;
1863 inst->src[i].reg_offset = 0;
1864 }
1865 }
1866 invalidate_live_intervals();
1867 }
1868
1869 bool
1870 fs_visitor::opt_algebraic()
1871 {
1872 bool progress = false;
1873
1874 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1875 switch (inst->opcode) {
1876 case BRW_OPCODE_MOV:
1877 if (inst->src[0].file != IMM)
1878 break;
1879
1880 if (inst->saturate) {
1881 if (inst->dst.type != inst->src[0].type)
1882 assert(!"unimplemented: saturate mixed types");
1883
1884 if (brw_saturate_immediate(inst->dst.type,
1885 &inst->src[0].fixed_hw_reg)) {
1886 inst->saturate = false;
1887 progress = true;
1888 }
1889 }
1890 break;
1891
1892 case BRW_OPCODE_MUL:
1893 if (inst->src[1].file != IMM)
1894 continue;
1895
1896 /* a * 1.0 = a */
1897 if (inst->src[1].is_one()) {
1898 inst->opcode = BRW_OPCODE_MOV;
1899 inst->src[1] = reg_undef;
1900 progress = true;
1901 break;
1902 }
1903
1904 /* a * -1.0 = -a */
1905 if (inst->src[1].is_negative_one()) {
1906 inst->opcode = BRW_OPCODE_MOV;
1907 inst->src[0].negate = !inst->src[0].negate;
1908 inst->src[1] = reg_undef;
1909 progress = true;
1910 break;
1911 }
1912
1913 /* a * 0.0 = 0.0 */
1914 if (inst->src[1].is_zero()) {
1915 inst->opcode = BRW_OPCODE_MOV;
1916 inst->src[0] = inst->src[1];
1917 inst->src[1] = reg_undef;
1918 progress = true;
1919 break;
1920 }
1921
1922 if (inst->src[0].file == IMM) {
1923 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
1924 inst->opcode = BRW_OPCODE_MOV;
1925 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
1926 inst->src[1] = reg_undef;
1927 progress = true;
1928 break;
1929 }
1930 break;
1931 case BRW_OPCODE_ADD:
1932 if (inst->src[1].file != IMM)
1933 continue;
1934
1935 /* a + 0.0 = a */
1936 if (inst->src[1].is_zero()) {
1937 inst->opcode = BRW_OPCODE_MOV;
1938 inst->src[1] = reg_undef;
1939 progress = true;
1940 break;
1941 }
1942
1943 if (inst->src[0].file == IMM) {
1944 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
1945 inst->opcode = BRW_OPCODE_MOV;
1946 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
1947 inst->src[1] = reg_undef;
1948 progress = true;
1949 break;
1950 }
1951 break;
1952 case BRW_OPCODE_OR:
1953 if (inst->src[0].equals(inst->src[1])) {
1954 inst->opcode = BRW_OPCODE_MOV;
1955 inst->src[1] = reg_undef;
1956 progress = true;
1957 break;
1958 }
1959 break;
1960 case BRW_OPCODE_LRP:
1961 if (inst->src[1].equals(inst->src[2])) {
1962 inst->opcode = BRW_OPCODE_MOV;
1963 inst->src[0] = inst->src[1];
1964 inst->src[1] = reg_undef;
1965 inst->src[2] = reg_undef;
1966 progress = true;
1967 break;
1968 }
1969 break;
1970 case BRW_OPCODE_CMP:
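         /* Editorial note: cmp.ge(-|a|, 0) can only hold when |a| <= 0, i.e.
          * when a == 0, so the comparison below is rewritten as cmp.z with
          * the abs/negate modifiers dropped.
          */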
1971 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
1972 inst->src[0].abs &&
1973 inst->src[0].negate &&
1974 inst->src[1].is_zero()) {
1975 inst->src[0].abs = false;
1976 inst->src[0].negate = false;
1977 inst->conditional_mod = BRW_CONDITIONAL_Z;
1978 progress = true;
1979 break;
1980 }
1981 break;
1982 case BRW_OPCODE_SEL:
1983 if (inst->src[0].equals(inst->src[1])) {
1984 inst->opcode = BRW_OPCODE_MOV;
1985 inst->src[1] = reg_undef;
1986 inst->predicate = BRW_PREDICATE_NONE;
1987 inst->predicate_inverse = false;
1988 progress = true;
1989 } else if (inst->saturate && inst->src[1].file == IMM) {
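         /* Editorial note: sel.l/.le computes min(src0, src1) and sel.ge/.g
          * computes max(src0, src1).  With saturate the result is clamped to
          * [0, 1], so an immediate src1 >= 1.0 (for min) or <= 0.0 (for max)
          * can never change the saturated result and the SEL collapses to a
          * saturated MOV of src0.
          */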
1990 switch (inst->conditional_mod) {
1991 case BRW_CONDITIONAL_LE:
1992 case BRW_CONDITIONAL_L:
1993 switch (inst->src[1].type) {
1994 case BRW_REGISTER_TYPE_F:
1995 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
1996 inst->opcode = BRW_OPCODE_MOV;
1997 inst->src[1] = reg_undef;
1998 inst->conditional_mod = BRW_CONDITIONAL_NONE;
1999 progress = true;
2000 }
2001 break;
2002 default:
2003 break;
2004 }
2005 break;
2006 case BRW_CONDITIONAL_GE:
2007 case BRW_CONDITIONAL_G:
2008 switch (inst->src[1].type) {
2009 case BRW_REGISTER_TYPE_F:
2010 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2011 inst->opcode = BRW_OPCODE_MOV;
2012 inst->src[1] = reg_undef;
2013 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2014 progress = true;
2015 }
2016 break;
2017 default:
2018 break;
2019 }
2020 default:
2021 break;
2022 }
2023 }
2024 break;
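      /* Editorial note on the MAD cases below: as the simplifications imply,
       * src[0] is treated as the addend and src[1] * src[2] as the product,
       * so zeroing either factor leaves src[0] and zeroing the addend leaves
       * a plain MUL.
       */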
2025 case BRW_OPCODE_MAD:
2026 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2027 inst->opcode = BRW_OPCODE_MOV;
2028 inst->src[1] = reg_undef;
2029 inst->src[2] = reg_undef;
2030 progress = true;
2031 } else if (inst->src[0].is_zero()) {
2032 inst->opcode = BRW_OPCODE_MUL;
2033 inst->src[0] = inst->src[2];
2034 inst->src[2] = reg_undef;
2035 progress = true;
2036 } else if (inst->src[1].is_one()) {
2037 inst->opcode = BRW_OPCODE_ADD;
2038 inst->src[1] = inst->src[2];
2039 inst->src[2] = reg_undef;
2040 progress = true;
2041 } else if (inst->src[2].is_one()) {
2042 inst->opcode = BRW_OPCODE_ADD;
2043 inst->src[2] = reg_undef;
2044 progress = true;
2045 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2046 inst->opcode = BRW_OPCODE_ADD;
2047 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2048 inst->src[2] = reg_undef;
2049 progress = true;
2050 }
2051 break;
2052 case SHADER_OPCODE_RCP: {
2053 fs_inst *prev = (fs_inst *)inst->prev;
2054 if (prev->opcode == SHADER_OPCODE_SQRT) {
2055 if (inst->src[0].equals(prev->dst)) {
2056 inst->opcode = SHADER_OPCODE_RSQ;
2057 inst->src[0] = prev->src[0];
2058 progress = true;
2059 }
2060 }
2061 break;
2062 }
2063 case SHADER_OPCODE_BROADCAST:
2064 if (is_uniform(inst->src[0])) {
2065 inst->opcode = BRW_OPCODE_MOV;
2066 inst->sources = 1;
2067 inst->force_writemask_all = true;
2068 progress = true;
2069 } else if (inst->src[1].file == IMM) {
2070 inst->opcode = BRW_OPCODE_MOV;
2071 inst->src[0] = component(inst->src[0],
2072 inst->src[1].fixed_hw_reg.dw1.ud);
2073 inst->sources = 1;
2074 inst->force_writemask_all = true;
2075 progress = true;
2076 }
2077 break;
2078
2079 default:
2080 break;
2081 }
2082
2083 /* Swap if src[0] is immediate. */
2084 if (progress && inst->is_commutative()) {
2085 if (inst->src[0].file == IMM) {
2086 fs_reg tmp = inst->src[1];
2087 inst->src[1] = inst->src[0];
2088 inst->src[0] = tmp;
2089 }
2090 }
2091 }
2092 return progress;
2093 }
2094
2095 /**
2096 * Optimize sample messages that have constant zero values for the trailing
2097 * texture coordinates. We can just reduce the message length for these
2098  * instructions instead of reserving registers for them. Trailing parameters
2099 * that aren't sent default to zero anyway. This will cause the dead code
2100 * eliminator to remove the MOV instruction that would otherwise be emitted to
2101 * set up the zero value.
2102 */
2103 bool
2104 fs_visitor::opt_zero_samples()
2105 {
2106 /* Gen4 infers the texturing opcode based on the message length so we can't
2107 * change it.
2108 */
2109 if (devinfo->gen < 5)
2110 return false;
2111
2112 bool progress = false;
2113
2114 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2115 if (!inst->is_tex())
2116 continue;
2117
2118 fs_inst *load_payload = (fs_inst *) inst->prev;
2119
2120 if (load_payload->is_head_sentinel() ||
2121 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2122 continue;
2123
2124 /* We don't want to remove the message header or the first parameter.
2125 * Removing the first parameter is not allowed, see the Haswell PRM
2126 * volume 7, page 149:
2127 *
2128 * "Parameter 0 is required except for the sampleinfo message, which
2129 * has no parameter 0"
2130 */
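      /* Worked example (editorial): in SIMD8 with a one-register header and
       * mlen == 5, the source checked below is src[(5 - 1) / 1 + 1 - 1], i.e.
       * src[4]; each iteration that finds a zero there shrinks the message by
       * one register.
       */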
2131 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2132 load_payload->src[(inst->mlen - inst->header_size) /
2133 (dispatch_width / 8) +
2134 inst->header_size - 1].is_zero()) {
2135 inst->mlen -= dispatch_width / 8;
2136 progress = true;
2137 }
2138 }
2139
2140 if (progress)
2141 invalidate_live_intervals();
2142
2143 return progress;
2144 }
2145
2146 /**
2147 * Optimize sample messages which are followed by the final RT write.
2148 *
2149  * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2150 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2151 * final texturing results copied to the framebuffer write payload and modify
2152 * them to write to the framebuffer directly.
2153 */
2154 bool
2155 fs_visitor::opt_sampler_eot()
2156 {
2157 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2158
2159 if (stage != MESA_SHADER_FRAGMENT)
2160 return false;
2161
2162 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2163 return false;
2164
2165 /* FINISHME: It should be possible to implement this optimization when there
2166 * are multiple drawbuffers.
2167 */
2168 if (key->nr_color_regions != 1)
2169 return false;
2170
2171 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2172 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2173 assert(fb_write->eot);
2174 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2175
2176 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2177
2178 /* There wasn't one; nothing to do. */
2179 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2180 return false;
2181
2182    /* This optimization doesn't seem to work for textureGather for some
2183 * reason. I can't find any documentation or known workarounds to indicate
2184 * that this is expected, but considering that it is probably pretty
2185 * unlikely that a shader would directly write out the results from
2186 * textureGather we might as well just disable it.
2187 */
2188 if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2189 tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2190 return false;
2191
2192 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2193 * It's very likely to be the previous instruction.
2194 */
2195 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2196 if (load_payload->is_head_sentinel() ||
2197 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2198 return false;
2199
2200 assert(!tex_inst->eot); /* We can't get here twice */
2201 assert((tex_inst->offset & (0xff << 24)) == 0);
2202
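   /* Editorial note: the assert above guarantees that the top byte of the
    * texture instruction's offset field is free; the FB write's render
    * target index is packed there so it survives after the FB_WRITE is
    * removed below.
    */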
2203 tex_inst->offset |= fb_write->target << 24;
2204 tex_inst->eot = true;
2205 tex_inst->dst = bld.null_reg_ud();
2206 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2207
2208 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2209 * to create a new LOAD_PAYLOAD command with the same sources and a space
2210 * saved for the header. Using a new destination register not only makes sure
2211     * we have enough space, but also ensures that the dead code eliminator kills
2212     * the instruction that this replaces.
2213 */
2214 if (tex_inst->header_size != 0)
2215 return true;
2216
2217 fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
2218 load_payload->sources + 1);
2219 fs_reg *new_sources =
2220 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2221
2222 new_sources[0] = fs_reg();
2223 for (int i = 0; i < load_payload->sources; i++)
2224 new_sources[i+1] = load_payload->src[i];
2225
2226 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2227     * requires a lot of information about the sources to figure out how many
2228     * registers need to be used. Given this stage in our optimization, we may
2229     * not have the appropriate GRFs required by LOAD_PAYLOAD at this point
2230     * (copy propagation). Therefore, we need to
2231 * manually emit the instruction.
2232 */
2233 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2234 load_payload->exec_size,
2235 send_header,
2236 new_sources,
2237 load_payload->sources + 1);
2238
2239 new_load_payload->regs_written = load_payload->regs_written + 1;
2240 new_load_payload->header_size = 1;
2241 tex_inst->mlen++;
2242 tex_inst->header_size = 1;
2243 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2244 tex_inst->src[0] = send_header;
2245
2246 return true;
2247 }
2248
2249 bool
2250 fs_visitor::opt_register_renaming()
2251 {
2252 bool progress = false;
2253 int depth = 0;
2254
2255 int remap[alloc.count];
2256 memset(remap, -1, sizeof(int) * alloc.count);
2257
2258 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2259 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2260 depth++;
2261 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2262 inst->opcode == BRW_OPCODE_WHILE) {
2263 depth--;
2264 }
2265
2266 /* Rewrite instruction sources. */
2267 for (int i = 0; i < inst->sources; i++) {
2268 if (inst->src[i].file == GRF &&
2269 remap[inst->src[i].reg] != -1 &&
2270 remap[inst->src[i].reg] != inst->src[i].reg) {
2271 inst->src[i].reg = remap[inst->src[i].reg];
2272 progress = true;
2273 }
2274 }
2275
2276 const int dst = inst->dst.reg;
2277
2278 if (depth == 0 &&
2279 inst->dst.file == GRF &&
2280 alloc.sizes[inst->dst.reg] == inst->exec_size / 8 &&
2281 !inst->is_partial_write()) {
2282 if (remap[dst] == -1) {
2283 remap[dst] = dst;
2284 } else {
2285 remap[dst] = alloc.allocate(inst->exec_size / 8);
2286 inst->dst.reg = remap[dst];
2287 progress = true;
2288 }
2289 } else if (inst->dst.file == GRF &&
2290 remap[dst] != -1 &&
2291 remap[dst] != dst) {
2292 inst->dst.reg = remap[dst];
2293 progress = true;
2294 }
2295 }
2296
2297 if (progress) {
2298 invalidate_live_intervals();
2299
2300 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2301 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2302 delta_xy[i].reg = remap[delta_xy[i].reg];
2303 }
2304 }
2305 }
2306
2307 return progress;
2308 }
2309
2310 /**
2311 * Remove redundant or useless discard jumps.
2312 *
2313 * For example, we can eliminate jumps in the following sequence:
2314 *
2315 * discard-jump (redundant with the next jump)
2316 * discard-jump (useless; jumps to the next instruction)
2317 * placeholder-halt
2318 */
2319 bool
2320 fs_visitor::opt_redundant_discard_jumps()
2321 {
2322 bool progress = false;
2323
2324 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2325
2326 fs_inst *placeholder_halt = NULL;
2327 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2328 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2329 placeholder_halt = inst;
2330 break;
2331 }
2332 }
2333
2334 if (!placeholder_halt)
2335 return false;
2336
2337    /* Delete any discard jumps immediately before the placeholder halt. */
2338 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2339 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2340 prev = (fs_inst *) placeholder_halt->prev) {
2341 prev->remove(last_bblock);
2342 progress = true;
2343 }
2344
2345 if (progress)
2346 invalidate_live_intervals();
2347
2348 return progress;
2349 }
2350
2351 bool
2352 fs_visitor::compute_to_mrf()
2353 {
2354 bool progress = false;
2355 int next_ip = 0;
2356
2357 /* No MRFs on Gen >= 7. */
2358 if (devinfo->gen >= 7)
2359 return false;
2360
2361 calculate_live_intervals();
2362
2363 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2364 int ip = next_ip;
2365 next_ip++;
2366
2367 if (inst->opcode != BRW_OPCODE_MOV ||
2368 inst->is_partial_write() ||
2369 inst->dst.file != MRF || inst->src[0].file != GRF ||
2370 inst->dst.type != inst->src[0].type ||
2371 inst->src[0].abs || inst->src[0].negate ||
2372 !inst->src[0].is_contiguous() ||
2373 inst->src[0].subreg_offset)
2374 continue;
2375
2376 /* Work out which hardware MRF registers are written by this
2377 * instruction.
2378 */
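      /* Example (editorial): a SIMD16 write to m3 covers m3..m4, giving
       * mrf_low == 3 and mrf_high == 4, while a COMPR4 write to m3 touches
       * m3 and m7, so mrf_high becomes mrf_low + 4.
       */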
2379 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2380 int mrf_high;
2381 if (inst->dst.reg & BRW_MRF_COMPR4) {
2382 mrf_high = mrf_low + 4;
2383 } else if (inst->exec_size == 16) {
2384 mrf_high = mrf_low + 1;
2385 } else {
2386 mrf_high = mrf_low;
2387 }
2388
2389 /* Can't compute-to-MRF this GRF if someone else was going to
2390 * read it later.
2391 */
2392 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2393 continue;
2394
2395 /* Found a move of a GRF to a MRF. Let's see if we can go
2396 * rewrite the thing that made this GRF to write into the MRF.
2397 */
2398 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2399 if (scan_inst->dst.file == GRF &&
2400 scan_inst->dst.reg == inst->src[0].reg) {
2401 /* Found the last thing to write our reg we want to turn
2402 * into a compute-to-MRF.
2403 */
2404
2405 /* If this one instruction didn't populate all the
2406 * channels, bail. We might be able to rewrite everything
2407 * that writes that reg, but it would require smarter
2408 * tracking to delay the rewriting until complete success.
2409 */
2410 if (scan_inst->is_partial_write())
2411 break;
2412
2413             /* Instructions returning more than one register would require us to
2414              * understand coalescing out more than one MOV at a time.
2415 */
2416 if (scan_inst->regs_written > scan_inst->exec_size / 8)
2417 break;
2418
2419 /* SEND instructions can't have MRF as a destination. */
2420 if (scan_inst->mlen)
2421 break;
2422
2423 if (devinfo->gen == 6) {
2424 /* gen6 math instructions must have the destination be
2425 * GRF, so no compute-to-MRF for them.
2426 */
2427 if (scan_inst->is_math()) {
2428 break;
2429 }
2430 }
2431
2432 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2433 /* Found the creator of our MRF's source value. */
2434 scan_inst->dst.file = MRF;
2435 scan_inst->dst.reg = inst->dst.reg;
2436 scan_inst->saturate |= inst->saturate;
2437 inst->remove(block);
2438 progress = true;
2439 }
2440 break;
2441 }
2442
2443 /* We don't handle control flow here. Most computation of
2444           * values that end up in MRFs is done shortly before the MRF
2445 * write anyway.
2446 */
2447 if (block->start() == scan_inst)
2448 break;
2449
2450 /* You can't read from an MRF, so if someone else reads our
2451 * MRF's source GRF that we wanted to rewrite, that stops us.
2452 */
2453 bool interfered = false;
2454 for (int i = 0; i < scan_inst->sources; i++) {
2455 if (scan_inst->src[i].file == GRF &&
2456 scan_inst->src[i].reg == inst->src[0].reg &&
2457 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2458 interfered = true;
2459 }
2460 }
2461 if (interfered)
2462 break;
2463
2464 if (scan_inst->dst.file == MRF) {
2465 /* If somebody else writes our MRF here, we can't
2466 * compute-to-MRF before that.
2467 */
2468 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2469 int scan_mrf_high;
2470
2471 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2472 scan_mrf_high = scan_mrf_low + 4;
2473 } else if (scan_inst->exec_size == 16) {
2474 scan_mrf_high = scan_mrf_low + 1;
2475 } else {
2476 scan_mrf_high = scan_mrf_low;
2477 }
2478
2479 if (mrf_low == scan_mrf_low ||
2480 mrf_low == scan_mrf_high ||
2481 mrf_high == scan_mrf_low ||
2482 mrf_high == scan_mrf_high) {
2483 break;
2484 }
2485 }
2486
2487 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2488 /* Found a SEND instruction, which means that there are
2489 * live values in MRFs from base_mrf to base_mrf +
2490 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2491 * above it.
2492 */
2493 if (mrf_low >= scan_inst->base_mrf &&
2494 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2495 break;
2496 }
2497 if (mrf_high >= scan_inst->base_mrf &&
2498 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2499 break;
2500 }
2501 }
2502 }
2503 }
2504
2505 if (progress)
2506 invalidate_live_intervals();
2507
2508 return progress;
2509 }
2510
2511 /**
2512 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2513 * flow. We could probably do better here with some form of divergence
2514 * analysis.
2515 */
2516 bool
2517 fs_visitor::eliminate_find_live_channel()
2518 {
2519 bool progress = false;
2520 unsigned depth = 0;
2521
2522 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2523 switch (inst->opcode) {
2524 case BRW_OPCODE_IF:
2525 case BRW_OPCODE_DO:
2526 depth++;
2527 break;
2528
2529 case BRW_OPCODE_ENDIF:
2530 case BRW_OPCODE_WHILE:
2531 depth--;
2532 break;
2533
2534 case FS_OPCODE_DISCARD_JUMP:
2535 /* This can potentially make control flow non-uniform until the end
2536 * of the program.
2537 */
2538 return progress;
2539
2540 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2541 if (depth == 0) {
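            /* The pass assumes that outside of control flow (and before any
             * discard jump, handled above) execution is uniform, so channel 0
             * is live and the search can be folded to a constant 0
             * (editorial note).
             */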
2542 inst->opcode = BRW_OPCODE_MOV;
2543 inst->src[0] = fs_reg(0);
2544 inst->sources = 1;
2545 inst->force_writemask_all = true;
2546 progress = true;
2547 }
2548 break;
2549
2550 default:
2551 break;
2552 }
2553 }
2554
2555 return progress;
2556 }
2557
2558 /**
2559 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2560 * instructions to FS_OPCODE_REP_FB_WRITE.
2561 */
2562 void
2563 fs_visitor::emit_repclear_shader()
2564 {
2565 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2566 int base_mrf = 1;
2567 int color_mrf = base_mrf + 2;
2568
2569 fs_inst *mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)),
2570 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
2571
2572 fs_inst *write;
2573 if (key->nr_color_regions == 1) {
2574 write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2575 write->saturate = key->clamp_fragment_color;
2576 write->base_mrf = color_mrf;
2577 write->target = 0;
2578 write->header_size = 0;
2579 write->mlen = 1;
2580 } else {
2581 assume(key->nr_color_regions > 0);
2582 for (int i = 0; i < key->nr_color_regions; ++i) {
2583 write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2584 write->saturate = key->clamp_fragment_color;
2585 write->base_mrf = base_mrf;
2586 write->target = i;
2587 write->header_size = 2;
2588 write->mlen = 3;
2589 }
2590 }
2591 write->eot = true;
2592
2593 calculate_cfg();
2594
2595 assign_constant_locations();
2596 assign_curb_setup();
2597
2598 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2599 assert(mov->src[0].file == HW_REG);
2600 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2601 }
2602
2603 /**
2604 * Walks through basic blocks, looking for repeated MRF writes and
2605 * removing the later ones.
2606 */
2607 bool
2608 fs_visitor::remove_duplicate_mrf_writes()
2609 {
2610 fs_inst *last_mrf_move[16];
2611 bool progress = false;
2612
2613    /* We would need to update the MRF tracking for compressed instructions; skip SIMD16. */
2614 if (dispatch_width == 16)
2615 return false;
2616
2617 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2618
2619 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2620 if (inst->is_control_flow()) {
2621 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2622 }
2623
2624 if (inst->opcode == BRW_OPCODE_MOV &&
2625 inst->dst.file == MRF) {
2626 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2627 if (prev_inst && inst->equals(prev_inst)) {
2628 inst->remove(block);
2629 progress = true;
2630 continue;
2631 }
2632 }
2633
2634 /* Clear out the last-write records for MRFs that were overwritten. */
2635 if (inst->dst.file == MRF) {
2636 last_mrf_move[inst->dst.reg] = NULL;
2637 }
2638
2639 if (inst->mlen > 0 && inst->base_mrf != -1) {
2640 /* Found a SEND instruction, which will include two or fewer
2641 * implied MRF writes. We could do better here.
2642 */
2643 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2644 last_mrf_move[inst->base_mrf + i] = NULL;
2645 }
2646 }
2647
2648 /* Clear out any MRF move records whose sources got overwritten. */
2649 if (inst->dst.file == GRF) {
2650 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2651 if (last_mrf_move[i] &&
2652 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2653 last_mrf_move[i] = NULL;
2654 }
2655 }
2656 }
2657
2658 if (inst->opcode == BRW_OPCODE_MOV &&
2659 inst->dst.file == MRF &&
2660 inst->src[0].file == GRF &&
2661 !inst->is_partial_write()) {
2662 last_mrf_move[inst->dst.reg] = inst;
2663 }
2664 }
2665
2666 if (progress)
2667 invalidate_live_intervals();
2668
2669 return progress;
2670 }
2671
2672 static void
2673 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2674 {
2675 /* Clear the flag for registers that actually got read (as expected). */
2676 for (int i = 0; i < inst->sources; i++) {
2677 int grf;
2678 if (inst->src[i].file == GRF) {
2679 grf = inst->src[i].reg;
2680 } else if (inst->src[i].file == HW_REG &&
2681 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2682 grf = inst->src[i].fixed_hw_reg.nr;
2683 } else {
2684 continue;
2685 }
2686
2687 if (grf >= first_grf &&
2688 grf < first_grf + grf_len) {
2689 deps[grf - first_grf] = false;
2690 if (inst->exec_size == 16)
2691 deps[grf - first_grf + 1] = false;
2692 }
2693 }
2694 }
2695
2696 /**
2697 * Implements this workaround for the original 965:
2698 *
2699 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2700 * check for post destination dependencies on this instruction, software
2701 * must ensure that there is no destination hazard for the case of ‘write
2702 * followed by a posted write’ shown in the following example.
2703 *
2704 * 1. mov r3 0
2705 * 2. send r3.xy <rest of send instruction>
2706 * 3. mov r2 r3
2707 *
2708 * Due to no post-destination dependency check on the ‘send’, the above
2709 * code sequence could have two instructions (1 and 2) in flight at the
2710 * same time that both consider ‘r3’ as the target of their final writes.
2711 */
2712 void
2713 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2714 fs_inst *inst)
2715 {
2716 int write_len = inst->regs_written;
2717 int first_write_grf = inst->dst.reg;
2718 bool needs_dep[BRW_MAX_MRF];
2719 assert(write_len < (int)sizeof(needs_dep) - 1);
2720
2721 memset(needs_dep, false, sizeof(needs_dep));
2722 memset(needs_dep, true, write_len);
2723
2724 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2725
2726 /* Walk backwards looking for writes to registers we're writing which
2727 * aren't read since being written. If we hit the start of the program,
2728 * we assume that there are no outstanding dependencies on entry to the
2729 * program.
2730 */
2731 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2732 /* If we hit control flow, assume that there *are* outstanding
2733 * dependencies, and force their cleanup before our instruction.
2734 */
2735 if (block->start() == scan_inst) {
2736 for (int i = 0; i < write_len; i++) {
2737 if (needs_dep[i])
2738 DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
2739 }
2740 return;
2741 }
2742
2743 /* We insert our reads as late as possible on the assumption that any
2744 * instruction but a MOV that might have left us an outstanding
2745 * dependency has more latency than a MOV.
2746 */
2747 if (scan_inst->dst.file == GRF) {
2748 for (int i = 0; i < scan_inst->regs_written; i++) {
2749 int reg = scan_inst->dst.reg + i;
2750
2751 if (reg >= first_write_grf &&
2752 reg < first_write_grf + write_len &&
2753 needs_dep[reg - first_write_grf]) {
2754 DEP_RESOLVE_MOV(bld.at(block, inst), reg);
2755 needs_dep[reg - first_write_grf] = false;
2756 if (scan_inst->exec_size == 16)
2757 needs_dep[reg - first_write_grf + 1] = false;
2758 }
2759 }
2760 }
2761
2762 /* Clear the flag for registers that actually got read (as expected). */
2763 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2764
2765 /* Continue the loop only if we haven't resolved all the dependencies */
2766 int i;
2767 for (i = 0; i < write_len; i++) {
2768 if (needs_dep[i])
2769 break;
2770 }
2771 if (i == write_len)
2772 return;
2773 }
2774 }
2775
2776 /**
2777 * Implements this workaround for the original 965:
2778 *
2779 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2780 * used as a destination register until after it has been sourced by an
2781 * instruction with a different destination register.
2782  * instruction with a different destination register."
2783 void
2784 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2785 {
2786 int write_len = inst->regs_written;
2787 int first_write_grf = inst->dst.reg;
2788 bool needs_dep[BRW_MAX_MRF];
2789 assert(write_len < (int)sizeof(needs_dep) - 1);
2790
2791 memset(needs_dep, false, sizeof(needs_dep));
2792 memset(needs_dep, true, write_len);
2793 /* Walk forwards looking for writes to registers we're writing which aren't
2794 * read before being written.
2795 */
2796 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2797 /* If we hit control flow, force resolve all remaining dependencies. */
2798 if (block->end() == scan_inst) {
2799 for (int i = 0; i < write_len; i++) {
2800 if (needs_dep[i])
2801 DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
2802 }
2803 return;
2804 }
2805
2806 /* Clear the flag for registers that actually got read (as expected). */
2807 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2808
2809 /* We insert our reads as late as possible since they're reading the
2810 * result of a SEND, which has massive latency.
2811 */
2812 if (scan_inst->dst.file == GRF &&
2813 scan_inst->dst.reg >= first_write_grf &&
2814 scan_inst->dst.reg < first_write_grf + write_len &&
2815 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2816 DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
2817 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2818 }
2819
2820 /* Continue the loop only if we haven't resolved all the dependencies */
2821 int i;
2822 for (i = 0; i < write_len; i++) {
2823 if (needs_dep[i])
2824 break;
2825 }
2826 if (i == write_len)
2827 return;
2828 }
2829 }
2830
2831 void
2832 fs_visitor::insert_gen4_send_dependency_workarounds()
2833 {
2834 if (devinfo->gen != 4 || devinfo->is_g4x)
2835 return;
2836
2837 bool progress = false;
2838
2839 /* Note that we're done with register allocation, so GRF fs_regs always
2840 * have a .reg_offset of 0.
2841 */
2842
2843 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2844 if (inst->mlen != 0 && inst->dst.file == GRF) {
2845 insert_gen4_pre_send_dependency_workarounds(block, inst);
2846 insert_gen4_post_send_dependency_workarounds(block, inst);
2847 progress = true;
2848 }
2849 }
2850
2851 if (progress)
2852 invalidate_live_intervals();
2853 }
2854
2855 /**
2856 * Turns the generic expression-style uniform pull constant load instruction
2857 * into a hardware-specific series of instructions for loading a pull
2858 * constant.
2859 *
2860 * The expression style allows the CSE pass before this to optimize out
2861 * repeated loads from the same offset, and gives the pre-register-allocation
2862 * scheduling full flexibility, while the conversion to native instructions
2863 * allows the post-register-allocation scheduler the best information
2864 * possible.
2865 *
2866 * Note that execution masking for setting up pull constant loads is special:
2867 * the channels that need to be written are unrelated to the current execution
2868 * mask, since a later instruction will use one of the result channels as a
2869 * source operand for all 8 or 16 of its channels.
2870 */
2871 void
2872 fs_visitor::lower_uniform_pull_constant_loads()
2873 {
2874 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2875 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2876 continue;
2877
2878 if (devinfo->gen >= 7) {
2879 /* The offset arg before was a vec4-aligned byte offset. We need to
2880 * turn it into a dword offset.
2881 */
2882 fs_reg const_offset_reg = inst->src[1];
2883 assert(const_offset_reg.file == IMM &&
2884 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2885 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
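         /* e.g. a vec4-aligned byte offset of 48 becomes dword offset 12
          * (editorial example).
          */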
2886
2887 fs_reg payload, offset;
2888 if (devinfo->gen >= 9) {
2889 /* We have to use a message header on Skylake to get SIMD4x2
2890 * mode. Reserve space for the register.
2891 */
2892 offset = payload = fs_reg(GRF, alloc.allocate(2));
2893 offset.reg_offset++;
2894 inst->mlen = 2;
2895 } else {
2896 offset = payload = fs_reg(GRF, alloc.allocate(1));
2897 inst->mlen = 1;
2898 }
2899
2900 /* This is actually going to be a MOV, but since only the first dword
2901 * is accessed, we have a special opcode to do just that one. Note
2902 * that this needs to be an operation that will be considered a def
2903 * by live variable analysis, or register allocation will explode.
2904 */
2905 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2906 8, offset, const_offset_reg);
2907 setup->force_writemask_all = true;
2908
2909 setup->ir = inst->ir;
2910 setup->annotation = inst->annotation;
2911 inst->insert_before(block, setup);
2912
2913 /* Similarly, this will only populate the first 4 channels of the
2914 * result register (since we only use smear values from 0-3), but we
2915 * don't tell the optimizer.
2916 */
2917 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2918 inst->src[1] = payload;
2919 inst->base_mrf = -1;
2920
2921 invalidate_live_intervals();
2922 } else {
2923 /* Before register allocation, we didn't tell the scheduler about the
2924 * MRF we use. We know it's safe to use this MRF because nothing
2925 * else does except for register spill/unspill, which generates and
2926 * uses its MRF within a single IR instruction.
2927 */
2928 inst->base_mrf = 14;
2929 inst->mlen = 1;
2930 }
2931 }
2932 }
2933
2934 bool
2935 fs_visitor::lower_load_payload()
2936 {
2937 bool progress = false;
2938
2939 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2940 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2941 continue;
2942
2943 assert(inst->dst.file == MRF || inst->dst.file == GRF);
2944 assert(inst->saturate == false);
2945 fs_reg dst = inst->dst;
2946
2947 /* Get rid of COMPR4. We'll add it back in if we need it */
2948 if (dst.file == MRF)
2949 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
2950
2951 const fs_builder hbld = bld.exec_all().group(8, 0).at(block, inst);
2952
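      /* Editorial note: header sources are copied with a SIMD8,
       * force-writemask-all builder because a message header occupies a
       * single register regardless of the dispatch width of the surrounding
       * instruction.
       */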
2953 for (uint8_t i = 0; i < inst->header_size; i++) {
2954 if (inst->src[i].file != BAD_FILE) {
2955 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
2956 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
2957 hbld.MOV(mov_dst, mov_src);
2958 }
2959 dst = offset(dst, hbld, 1);
2960 }
2961
2962 const fs_builder ibld = bld.exec_all(inst->force_writemask_all)
2963 .group(inst->exec_size, inst->force_sechalf)
2964 .at(block, inst);
2965
2966 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
2967 inst->exec_size > 8) {
2968 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
2969 * a straightforward copy. Instead, the result of the
2970 * LOAD_PAYLOAD is treated as interleaved and the first four
2971 * non-header sources are unpacked as:
2972 *
2973 * m + 0: r0
2974 * m + 1: g0
2975 * m + 2: b0
2976 * m + 3: a0
2977 * m + 4: r1
2978 * m + 5: g1
2979 * m + 6: b1
2980 * m + 7: a1
2981 *
2982 * This is used for gen <= 5 fb writes.
2983 */
2984 assert(inst->exec_size == 16);
2985 assert(inst->header_size + 4 <= inst->sources);
2986 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
2987 if (inst->src[i].file != BAD_FILE) {
2988 if (devinfo->has_compr4) {
2989 fs_reg compr4_dst = retype(dst, inst->src[i].type);
2990 compr4_dst.reg |= BRW_MRF_COMPR4;
2991 ibld.MOV(compr4_dst, inst->src[i]);
2992 } else {
2993 /* Platform doesn't have COMPR4. We have to fake it */
2994 fs_reg mov_dst = retype(dst, inst->src[i].type);
2995 ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
2996 mov_dst.reg += 4;
2997 ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
2998 }
2999 }
3000
3001 dst.reg++;
3002 }
3003
3004 /* The loop above only ever incremented us through the first set
3005 * of 4 registers. However, thanks to the magic of COMPR4, we
3006 * actually wrote to the first 8 registers, so we need to take
3007 * that into account now.
3008 */
3009 dst.reg += 4;
3010
3011 /* The COMPR4 code took care of the first 4 sources. We'll let
3012 * the regular path handle any remaining sources. Yes, we are
3013 * modifying the instruction but we're about to delete it so
3014 * this really doesn't hurt anything.
3015 */
3016 inst->header_size += 4;
3017 }
3018
3019 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3020 if (inst->src[i].file != BAD_FILE)
3021 ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
3022 dst = offset(dst, ibld, 1);
3023 }
3024
3025 inst->remove(block);
3026 progress = true;
3027 }
3028
3029 if (progress)
3030 invalidate_live_intervals();
3031
3032 return progress;
3033 }
3034
3035 bool
3036 fs_visitor::lower_integer_multiplication()
3037 {
3038 bool progress = false;
3039
3040 /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3041 * directly, but Cherryview cannot.
3042 */
3043 if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3044 return false;
3045
3046 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3047 if (inst->opcode != BRW_OPCODE_MUL ||
3048 inst->dst.is_accumulator() ||
3049 (inst->dst.type != BRW_REGISTER_TYPE_D &&
3050 inst->dst.type != BRW_REGISTER_TYPE_UD))
3051 continue;
3052
3053 const fs_builder ibld = bld.at(block, inst);
3054
3055 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3056 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3057 * src1 are used.
3058 *
3059 * If multiplying by an immediate value that fits in 16-bits, do a
3060 * single MUL instruction with that value in the proper location.
3061 */
3062 if (inst->src[1].file == IMM &&
3063 inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3064 if (devinfo->gen < 7) {
3065 fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3066 inst->dst.type);
3067 ibld.MOV(imm, inst->src[1]);
3068 ibld.MUL(inst->dst, imm, inst->src[0]);
3069 } else {
3070 ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
3071 }
3072 } else {
3073 /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3074 * do 32-bit integer multiplication in one instruction, but instead
3075 * must do a sequence (which actually calculates a 64-bit result):
3076 *
3077 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3078 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3079 * mov(8) g2<1>D acc0<8,8,1>D
3080 *
3081 * But on Gen > 6, the ability to use second accumulator register
3082 * (acc1) for non-float data types was removed, preventing a simple
3083 * implementation in SIMD16. A 16-channel result can be calculated by
3084 * executing the three instructions twice in SIMD8, once with quarter
3085 * control of 1Q for the first eight channels and again with 2Q for
3086 * the second eight channels.
3087 *
3088 * Which accumulator register is implicitly accessed (by AccWrEnable
3089 * for instance) is determined by the quarter control. Unfortunately
3090 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3091 * implicit accumulator access by an instruction with 2Q will access
3092 * acc1 regardless of whether the data type is usable in acc1.
3093 *
3094 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3095 * integer data types.
3096 *
3097 * Since we only want the low 32-bits of the result, we can do two
3098 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3099 * adjust the high result and add them (like the mach is doing):
3100 *
3101 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3102 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3103 * shl(8) g9<1>D g8<8,8,1>D 16D
3104 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3105 *
3106 * We avoid the shl instruction by realizing that we only want to add
3107 * the low 16-bits of the "high" result to the high 16-bits of the
3108 * "low" result and using proper regioning on the add:
3109 *
3110 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3111 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3112 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3113 *
3114 * Since it does not use the (single) accumulator register, we can
3115 * schedule multi-component multiplications much better.
3116 */
3117
3118 if (inst->conditional_mod && inst->dst.is_null()) {
3119 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3120 inst->dst.type);
3121 }
3122 fs_reg low = inst->dst;
3123 fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3124 inst->dst.type);
3125
3126 if (devinfo->gen >= 7) {
3127 fs_reg src1_0_w = inst->src[1];
3128 fs_reg src1_1_w = inst->src[1];
3129
3130 if (inst->src[1].file == IMM) {
3131 src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3132 src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3133 } else {
3134 src1_0_w.type = BRW_REGISTER_TYPE_UW;
3135 if (src1_0_w.stride != 0) {
3136 assert(src1_0_w.stride == 1);
3137 src1_0_w.stride = 2;
3138 }
3139
3140 src1_1_w.type = BRW_REGISTER_TYPE_UW;
3141 if (src1_1_w.stride != 0) {
3142 assert(src1_1_w.stride == 1);
3143 src1_1_w.stride = 2;
3144 }
3145 src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3146 }
3147 ibld.MUL(low, inst->src[0], src1_0_w);
3148 ibld.MUL(high, inst->src[0], src1_1_w);
3149 } else {
3150 fs_reg src0_0_w = inst->src[0];
3151 fs_reg src0_1_w = inst->src[0];
3152
3153 src0_0_w.type = BRW_REGISTER_TYPE_UW;
3154 if (src0_0_w.stride != 0) {
3155 assert(src0_0_w.stride == 1);
3156 src0_0_w.stride = 2;
3157 }
3158
3159 src0_1_w.type = BRW_REGISTER_TYPE_UW;
3160 if (src0_1_w.stride != 0) {
3161 assert(src0_1_w.stride == 1);
3162 src0_1_w.stride = 2;
3163 }
3164 src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3165
3166 ibld.MUL(low, src0_0_w, inst->src[1]);
3167 ibld.MUL(high, src0_1_w, inst->src[1]);
3168 }
3169
3170 fs_reg dst = inst->dst;
3171 dst.type = BRW_REGISTER_TYPE_UW;
3172 dst.subreg_offset = 2;
3173 dst.stride = 2;
3174
3175 high.type = BRW_REGISTER_TYPE_UW;
3176 high.stride = 2;
3177
3178 low.type = BRW_REGISTER_TYPE_UW;
3179 low.subreg_offset = 2;
3180 low.stride = 2;
3181
3182 ibld.ADD(dst, low, high);
3183
3184 if (inst->conditional_mod) {
3185 fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
3186 set_condmod(inst->conditional_mod,
3187 ibld.MOV(null, inst->dst));
3188 }
3189 }
3190
3191 inst->remove(block);
3192 progress = true;
3193 }
3194
3195 if (progress)
3196 invalidate_live_intervals();
3197
3198 return progress;
3199 }
3200
3201 void
3202 fs_visitor::dump_instructions()
3203 {
3204 dump_instructions(NULL);
3205 }
3206
3207 void
3208 fs_visitor::dump_instructions(const char *name)
3209 {
3210 FILE *file = stderr;
3211 if (name && geteuid() != 0) {
3212 file = fopen(name, "w");
3213 if (!file)
3214 file = stderr;
3215 }
3216
3217 if (cfg) {
3218 calculate_register_pressure();
3219 int ip = 0, max_pressure = 0;
3220 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3221 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3222 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3223 dump_instruction(inst, file);
3224 ip++;
3225 }
3226 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3227 } else {
3228 int ip = 0;
3229 foreach_in_list(backend_instruction, inst, &instructions) {
3230 fprintf(file, "%4d: ", ip++);
3231 dump_instruction(inst, file);
3232 }
3233 }
3234
3235 if (file != stderr) {
3236 fclose(file);
3237 }
3238 }
3239
3240 void
3241 fs_visitor::dump_instruction(backend_instruction *be_inst)
3242 {
3243 dump_instruction(be_inst, stderr);
3244 }
3245
3246 void
3247 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3248 {
3249 fs_inst *inst = (fs_inst *)be_inst;
3250
3251 if (inst->predicate) {
3252 fprintf(file, "(%cf0.%d) ",
3253 inst->predicate_inverse ? '-' : '+',
3254 inst->flag_subreg);
3255 }
3256
3257 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3258 if (inst->saturate)
3259 fprintf(file, ".sat");
3260 if (inst->conditional_mod) {
3261 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3262 if (!inst->predicate &&
3263 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3264 inst->opcode != BRW_OPCODE_IF &&
3265 inst->opcode != BRW_OPCODE_WHILE))) {
3266 fprintf(file, ".f0.%d", inst->flag_subreg);
3267 }
3268 }
3269 fprintf(file, "(%d) ", inst->exec_size);
3270
3271 if (inst->mlen) {
3272 fprintf(file, "(mlen: %d) ", inst->mlen);
3273 }
3274
3275 switch (inst->dst.file) {
3276 case GRF:
3277 fprintf(file, "vgrf%d", inst->dst.reg);
3278 if (alloc.sizes[inst->dst.reg] != inst->regs_written ||
3279 inst->dst.subreg_offset)
3280 fprintf(file, "+%d.%d",
3281 inst->dst.reg_offset, inst->dst.subreg_offset);
3282 break;
3283 case MRF:
3284 fprintf(file, "m%d", inst->dst.reg);
3285 break;
3286 case BAD_FILE:
3287 fprintf(file, "(null)");
3288 break;
3289 case UNIFORM:
3290 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3291 break;
3292 case ATTR:
3293 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3294 break;
3295 case HW_REG:
3296 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3297 switch (inst->dst.fixed_hw_reg.nr) {
3298 case BRW_ARF_NULL:
3299 fprintf(file, "null");
3300 break;
3301 case BRW_ARF_ADDRESS:
3302 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3303 break;
3304 case BRW_ARF_ACCUMULATOR:
3305 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3306 break;
3307 case BRW_ARF_FLAG:
3308 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3309 inst->dst.fixed_hw_reg.subnr);
3310 break;
3311 default:
3312 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3313 inst->dst.fixed_hw_reg.subnr);
3314 break;
3315 }
3316 } else {
3317 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3318 }
3319 if (inst->dst.fixed_hw_reg.subnr)
3320 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3321 break;
3322 default:
3323 fprintf(file, "???");
3324 break;
3325 }
3326 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3327
3328 for (int i = 0; i < inst->sources; i++) {
3329 if (inst->src[i].negate)
3330 fprintf(file, "-");
3331 if (inst->src[i].abs)
3332 fprintf(file, "|");
3333 switch (inst->src[i].file) {
3334 case GRF:
3335 fprintf(file, "vgrf%d", inst->src[i].reg);
3336 if (alloc.sizes[inst->src[i].reg] != (unsigned)inst->regs_read(i) ||
3337 inst->src[i].subreg_offset)
3338 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3339 inst->src[i].subreg_offset);
3340 break;
3341 case MRF:
3342 fprintf(file, "***m%d***", inst->src[i].reg);
3343 break;
3344 case ATTR:
3345 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3346 break;
3347 case UNIFORM:
3348 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3349 if (inst->src[i].reladdr) {
3350 fprintf(file, "+reladdr");
3351 } else if (inst->src[i].subreg_offset) {
3352 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3353 inst->src[i].subreg_offset);
3354 }
3355 break;
3356 case BAD_FILE:
3357 fprintf(file, "(null)");
3358 break;
3359 case IMM:
3360 switch (inst->src[i].type) {
3361 case BRW_REGISTER_TYPE_F:
3362 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3363 break;
3364 case BRW_REGISTER_TYPE_W:
3365 case BRW_REGISTER_TYPE_D:
3366 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3367 break;
3368 case BRW_REGISTER_TYPE_UW:
3369 case BRW_REGISTER_TYPE_UD:
3370 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3371 break;
3372 case BRW_REGISTER_TYPE_VF:
3373 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3374 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3375 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3376 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3377 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3378 break;
3379 default:
3380 fprintf(file, "???");
3381 break;
3382 }
3383 break;
3384 case HW_REG:
3385 if (inst->src[i].fixed_hw_reg.negate)
3386 fprintf(file, "-");
3387 if (inst->src[i].fixed_hw_reg.abs)
3388 fprintf(file, "|");
3389 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3390 switch (inst->src[i].fixed_hw_reg.nr) {
3391 case BRW_ARF_NULL:
3392 fprintf(file, "null");
3393 break;
3394 case BRW_ARF_ADDRESS:
3395 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3396 break;
3397 case BRW_ARF_ACCUMULATOR:
3398 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3399 break;
3400 case BRW_ARF_FLAG:
3401 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3402 inst->src[i].fixed_hw_reg.subnr);
3403 break;
3404 default:
3405 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3406 inst->src[i].fixed_hw_reg.subnr);
3407 break;
3408 }
3409 } else {
3410 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3411 }
3412 if (inst->src[i].fixed_hw_reg.subnr)
3413 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3414 if (inst->src[i].fixed_hw_reg.abs)
3415 fprintf(file, "|");
3416 break;
3417 default:
3418 fprintf(file, "???");
3419 break;
3420 }
3421 if (inst->src[i].abs)
3422 fprintf(file, "|");
3423
3424 if (inst->src[i].file != IMM) {
3425 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3426 }
3427
3428 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3429 fprintf(file, ", ");
3430 }
3431
3432 fprintf(file, " ");
3433
3434 if (dispatch_width == 16 && inst->exec_size == 8) {
3435 if (inst->force_sechalf)
3436 fprintf(file, "2ndhalf ");
3437 else
3438 fprintf(file, "1sthalf ");
3439 }
3440
3441 fprintf(file, "\n");
3442 }
3443
3444 /**
3445 * Possibly returns an instruction that set up @param reg.
3446 *
3447 * Sometimes we want to take the result of some expression/variable
3448 * dereference tree and rewrite the instruction generating the result
3449 * of the tree. When processing the tree, we know that the
3450 * instructions generated are all writing temporaries that are dead
3451 * outside of this tree. So, if we have some instructions that write
3452 * a temporary, we're free to point that temp write somewhere else.
3453 *
3454  * Note that this doesn't guarantee that the returned instruction wrote only
3455  * reg -- it might be the size=4 destination of a texture instruction.
3456 */
3457 fs_inst *
3458 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3459 fs_inst *end,
3460 const fs_reg &reg)
3461 {
3462 if (end == start ||
3463 end->is_partial_write() ||
3464 reg.reladdr ||
3465 !reg.equals(end->dst)) {
3466 return NULL;
3467 } else {
3468 return end;
3469 }
3470 }
3471
3472 void
3473 fs_visitor::setup_payload_gen6()
3474 {
3475 bool uses_depth =
3476 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3477 unsigned barycentric_interp_modes =
3478 (stage == MESA_SHADER_FRAGMENT) ?
3479 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3480
3481 assert(devinfo->gen >= 6);
3482
3483 /* R0-1: masks, pixel X/Y coordinates. */
3484 payload.num_regs = 2;
3485    /* R2: only for 32-pixel dispatch. */
3486
3487 /* R3-26: barycentric interpolation coordinates. These appear in the
3488 * same order that they appear in the brw_wm_barycentric_interp_mode
3489 * enum. Each set of coordinates occupies 2 registers if dispatch width
3490 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3491 * appear if they were enabled using the "Barycentric Interpolation
3492 * Mode" bits in WM_STATE.
3493 */
3494 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3495 if (barycentric_interp_modes & (1 << i)) {
3496 payload.barycentric_coord_reg[i] = payload.num_regs;
3497 payload.num_regs += 2;
3498 if (dispatch_width == 16) {
3499 payload.num_regs += 2;
3500 }
3501 }
3502 }
3503
3504 /* R27: interpolated depth if uses source depth */
3505 if (uses_depth) {
3506 payload.source_depth_reg = payload.num_regs;
3507 payload.num_regs++;
3508 if (dispatch_width == 16) {
3509 /* R28: interpolated depth if not SIMD8. */
3510 payload.num_regs++;
3511 }
3512 }
3513 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3514 if (uses_depth) {
3515 payload.source_w_reg = payload.num_regs;
3516 payload.num_regs++;
3517 if (dispatch_width == 16) {
3518 /* R30: interpolated W if not SIMD8. */
3519 payload.num_regs++;
3520 }
3521 }
3522
3523 if (stage == MESA_SHADER_FRAGMENT) {
3524 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3525 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3526 prog_data->uses_pos_offset = key->compute_pos_offset;
3527 /* R31: MSAA position offsets. */
3528 if (prog_data->uses_pos_offset) {
3529 payload.sample_pos_reg = payload.num_regs;
3530 payload.num_regs++;
3531 }
3532 }
3533
3534 /* R32: MSAA input coverage mask */
3535 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3536 assert(devinfo->gen >= 7);
3537 payload.sample_mask_in_reg = payload.num_regs;
3538 payload.num_regs++;
3539 if (dispatch_width == 16) {
3540 /* R33: input coverage mask if not SIMD8. */
3541 payload.num_regs++;
3542 }
3543 }
3544
3545 /* R34-: bary for 32-pixel. */
3546 /* R58-59: interp W for 32-pixel. */
3547
3548 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3549 source_depth_to_render_target = true;
3550 }
3551 }
3552
3553 void
3554 fs_visitor::setup_vs_payload()
3555 {
3556 /* R0: thread header, R1: urb handles */
3557 payload.num_regs = 2;
3558 }
3559
3560 void
3561 fs_visitor::setup_cs_payload()
3562 {
3563 assert(devinfo->gen >= 7);
3564
3565 payload.num_regs = 1;
3566 }
3567
3568 void
3569 fs_visitor::assign_binding_table_offsets()
3570 {
3571 assert(stage == MESA_SHADER_FRAGMENT);
3572 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3573 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3574 uint32_t next_binding_table_offset = 0;
3575
3576 /* If there are no color regions, we still perform an FB write to a null
3577 * renderbuffer, which we place at surface index 0.
3578 */
3579 prog_data->binding_table.render_target_start = next_binding_table_offset;
3580 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3581
3582 assign_common_binding_table_offsets(next_binding_table_offset);
3583 }
3584
3585 void
3586 fs_visitor::calculate_register_pressure()
3587 {
3588 invalidate_live_intervals();
3589 calculate_live_intervals();
3590
3591 unsigned num_instructions = 0;
3592 foreach_block(block, cfg)
3593 num_instructions += block->instructions.length();
3594
3595 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3596
3597 for (unsigned reg = 0; reg < alloc.count; reg++) {
3598 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3599 regs_live_at_ip[ip] += alloc.sizes[reg];
3600 }
3601 }
3602
3603 void
3604 fs_visitor::optimize()
3605 {
3606 /* bld is the common builder object pointing at the end of the program we
3607 * used to translate it into i965 IR. For the optimization and lowering
3608 * passes coming next, any code added after the end of the program without
3609 * having explicitly called fs_builder::at() clearly points at a mistake.
3610 * Ideally optimization passes wouldn't be part of the visitor so they
3611 * wouldn't have access to bld at all, but they do, so just in case some
3612 * pass forgets to ask for a location explicitly set it to NULL here to
3613 * make it trip.
3614 */
3615 bld = bld.at(NULL, NULL);
3616
3617 split_virtual_grfs();
3618
3619 move_uniform_array_access_to_pull_constants();
3620 assign_constant_locations();
3621 demote_pull_constants();
3622
3623 #define OPT(pass, args...) ({ \
3624 pass_num++; \
3625 bool this_progress = pass(args); \
3626 \
3627 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3628 char filename[64]; \
3629 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3630 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3631 \
3632 backend_shader::dump_instructions(filename); \
3633 } \
3634 \
3635 progress = progress || this_progress; \
3636 this_progress; \
3637 })
3638
3639 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3640 char filename[64];
3641 snprintf(filename, 64, "%s%d-%04d-00-start",
3642 stage_abbrev, dispatch_width,
3643 shader_prog ? shader_prog->Name : 0);
3644
3645 backend_shader::dump_instructions(filename);
3646 }
3647
3648 bool progress;
3649 int iteration = 0;
3650 int pass_num = 0;
3651 do {
3652 progress = false;
3653 pass_num = 0;
3654 iteration++;
3655
3656 OPT(remove_duplicate_mrf_writes);
3657
3658 OPT(opt_algebraic);
3659 OPT(opt_cse);
3660 OPT(opt_copy_propagate);
3661 OPT(opt_peephole_predicated_break);
3662 OPT(opt_cmod_propagation);
3663 OPT(dead_code_eliminate);
3664 OPT(opt_peephole_sel);
3665 OPT(dead_control_flow_eliminate, this);
3666 OPT(opt_register_renaming);
3667 OPT(opt_redundant_discard_jumps);
3668 OPT(opt_saturate_propagation);
3669 OPT(opt_zero_samples);
3670 OPT(register_coalesce);
3671 OPT(compute_to_mrf);
3672 OPT(eliminate_find_live_channel);
3673
3674 OPT(compact_virtual_grfs);
3675 } while (progress);
3676
3677 pass_num = 0;
3678
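   /* The passes below only need to run once, after the loop above has
    * converged.
    */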
3679 OPT(opt_sampler_eot);
3680
3681 if (OPT(lower_load_payload)) {
3682 split_virtual_grfs();
3683 OPT(register_coalesce);
3684 OPT(compute_to_mrf);
3685 OPT(dead_code_eliminate);
3686 }
3687
3688 OPT(opt_combine_constants);
3689 OPT(lower_integer_multiplication);
3690
3691 lower_uniform_pull_constant_loads();
3692 }
3693
3694 /**
3695  * Three-source instructions must have a GRF/MRF destination register.
3696 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3697 */
3698 void
3699 fs_visitor::fixup_3src_null_dest()
3700 {
3701 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3702 if (inst->is_3src() && inst->dst.is_null()) {
3703 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3704 inst->dst.type);
3705 }
3706 }
3707 }
3708
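/**
 * Schedule the instructions and assign hardware registers, retrying with
 * progressively less aggressive pre-RA scheduling heuristics and finally
 * spilling to scratch space if no heuristic allocates without spills.
 */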
3709 void
3710 fs_visitor::allocate_registers()
3711 {
3712 bool allocated_without_spills;
3713
3714 static const enum instruction_scheduler_mode pre_modes[] = {
3715 SCHEDULE_PRE,
3716 SCHEDULE_PRE_NON_LIFO,
3717 SCHEDULE_PRE_LIFO,
3718 };
3719
3720 /* Try each scheduling heuristic to see if it can successfully register
3721 * allocate without spilling. They should be ordered by decreasing
3722 * performance but increasing likelihood of allocating.
3723 */
3724 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3725 schedule_instructions(pre_modes[i]);
3726
3727 if (0) {
3728 assign_regs_trivial();
3729 allocated_without_spills = true;
3730 } else {
3731 allocated_without_spills = assign_regs(false);
3732 }
3733 if (allocated_without_spills)
3734 break;
3735 }
3736
3737 if (!allocated_without_spills) {
3738       /* We assume that any spilling is worse than just dropping back to
3739        * SIMD8.  There is probably some intermediate point where SIMD16
3740        * with a couple of spills is still better.
3741        */
3742 if (dispatch_width == 16) {
3743 fail("Failure to register allocate. Reduce number of "
3744 "live scalar values to avoid this.");
3745 } else {
3746 compiler->shader_perf_log(log_data,
3747 "%s shader triggered register spilling. "
3748 "Try reducing the number of live scalar "
3749 "values to improve performance.\n",
3750 stage_name);
3751 }
3752
3753 /* Since we're out of heuristics, just go spill registers until we
3754 * get an allocation.
3755 */
3756 while (!assign_regs(true)) {
3757 if (failed)
3758 break;
3759 }
3760 }
3761
3762 /* This must come after all optimization and register allocation, since
3763 * it inserts dead code that happens to have side effects, and it does
3764 * so based on the actual physical registers in use.
3765 */
3766 insert_gen4_send_dependency_workarounds();
3767
3768 if (failed)
3769 return;
3770
3771 if (!allocated_without_spills)
3772 schedule_instructions(SCHEDULE_POST);
3773
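   /* If anything was spilled, record the amount of scratch space required. */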
3774 if (last_scratch > 0)
3775 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3776 }
3777
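/**
 * Generate, optimize, and register-allocate the code for a vertex shader.
 * Returns true on success, false if compilation failed.
 */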
3778 bool
3779 fs_visitor::run_vs(gl_clip_plane *clip_planes)
3780 {
3781 assert(stage == MESA_SHADER_VERTEX);
3782
3783 assign_common_binding_table_offsets(0);
3784 setup_vs_payload();
3785
3786 if (shader_time_index >= 0)
3787 emit_shader_time_begin();
3788
3789 emit_nir_code();
3790
3791 if (failed)
3792 return false;
3793
3794 compute_clip_distance(clip_planes);
3795
3796 emit_urb_writes();
3797
3798 if (shader_time_index >= 0)
3799 emit_shader_time_end();
3800
3801 calculate_cfg();
3802
3803 optimize();
3804
3805 assign_curb_setup();
3806 assign_vs_urb_setup();
3807
3808 fixup_3src_null_dest();
3809 allocate_registers();
3810
3811 return !failed;
3812 }
3813
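/**
 * Generate, optimize, and register-allocate the code for a fragment shader
 * at the current dispatch width.  When do_rep_send is set, a SIMD16
 * replicated-data clear shader is emitted instead of the full program.
 * Returns true on success, false if compilation failed.
 */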
3814 bool
3815 fs_visitor::run_fs(bool do_rep_send)
3816 {
3817 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3818 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3819
3820 assert(stage == MESA_SHADER_FRAGMENT);
3821
3822 sanity_param_count = prog->Parameters->NumParameters;
3823
3824 assign_binding_table_offsets();
3825
3826 if (devinfo->gen >= 6)
3827 setup_payload_gen6();
3828 else
3829 setup_payload_gen4();
3830
3831 if (0) {
3832 emit_dummy_fs();
3833 } else if (do_rep_send) {
3834 assert(dispatch_width == 16);
3835 emit_repclear_shader();
3836 } else {
3837 if (shader_time_index >= 0)
3838 emit_shader_time_begin();
3839
3840 calculate_urb_setup();
3841 if (prog->InputsRead > 0) {
3842 if (devinfo->gen < 6)
3843 emit_interpolation_setup_gen4();
3844 else
3845 emit_interpolation_setup_gen6();
3846 }
3847
3848 /* We handle discards by keeping track of the still-live pixels in f0.1.
3849 * Initialize it with the dispatched pixels.
3850 */
3851 if (wm_prog_data->uses_kill) {
3852 fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3853 discard_init->flag_subreg = 1;
3854 }
3855
3856       /* Generate FS IR for main().  (The visitor only descends into
3857        * functions called "main".)
3858        */
3859 emit_nir_code();
3860
3861 if (failed)
3862 return false;
3863
3864 if (wm_prog_data->uses_kill)
3865 bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
3866
3867 if (wm_key->alpha_test_func)
3868 emit_alpha_test();
3869
3870 emit_fb_writes();
3871
3872 if (shader_time_index >= 0)
3873 emit_shader_time_end();
3874
3875 calculate_cfg();
3876
3877 optimize();
3878
3879 assign_curb_setup();
3880 assign_urb_setup();
3881
3882 fixup_3src_null_dest();
3883 allocate_registers();
3884
3885 if (failed)
3886 return false;
3887 }
3888
3889 if (dispatch_width == 8)
3890 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3891 else
3892 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3893
3894 /* If any state parameters were appended, then ParameterValues could have
3895 * been realloced, in which case the driver uniform storage set up by
3896 * _mesa_associate_uniform_storage() would point to freed memory. Make
3897 * sure that didn't happen.
3898 */
3899 assert(sanity_param_count == prog->Parameters->NumParameters);
3900
3901 return !failed;
3902 }
3903
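/**
 * Generate, optimize, and register-allocate the code for a compute shader.
 * Returns true on success, false if compilation failed.
 */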
3904 bool
3905 fs_visitor::run_cs()
3906 {
3907 assert(stage == MESA_SHADER_COMPUTE);
3908 assert(shader);
3909
3910 sanity_param_count = prog->Parameters->NumParameters;
3911
3912 assign_common_binding_table_offsets(0);
3913
3914 setup_cs_payload();
3915
3916 if (shader_time_index >= 0)
3917 emit_shader_time_begin();
3918
3919 emit_nir_code();
3920
3921 if (failed)
3922 return false;
3923
3924 emit_cs_terminate();
3925
3926 if (shader_time_index >= 0)
3927 emit_shader_time_end();
3928
3929 calculate_cfg();
3930
3931 optimize();
3932
3933 assign_curb_setup();
3934
3935 fixup_3src_null_dest();
3936 allocate_registers();
3937
3938 if (failed)
3939 return false;
3940
3941 /* If any state parameters were appended, then ParameterValues could have
3942 * been realloced, in which case the driver uniform storage set up by
3943 * _mesa_associate_uniform_storage() would point to freed memory. Make
3944 * sure that didn't happen.
3945 */
3946 assert(sanity_param_count == prog->Parameters->NumParameters);
3947
3948 return !failed;
3949 }
3950
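/**
 * Compile a fragment shader to native code: build a SIMD8 program, attempt a
 * SIMD16 program when permitted, and return the generated assembly for
 * whichever dispatch widths succeeded.
 */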
3951 const unsigned *
3952 brw_wm_fs_emit(struct brw_context *brw,
3953 void *mem_ctx,
3954 const struct brw_wm_prog_key *key,
3955 struct brw_wm_prog_data *prog_data,
3956 struct gl_fragment_program *fp,
3957 struct gl_shader_program *prog,
3958 unsigned *final_assembly_size)
3959 {
3960 bool start_busy = false;
3961 double start_time = 0;
3962
3963 if (unlikely(brw->perf_debug)) {
3964 start_busy = (brw->batch.last_bo &&
3965 drm_intel_bo_busy(brw->batch.last_bo));
3966 start_time = get_time();
3967 }
3968
3969 struct brw_shader *shader = NULL;
3970 if (prog)
3971 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3972
3973 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3974 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3975
3976 int st_index8 = -1, st_index16 = -1;
3977 if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
3978 st_index8 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS8);
3979 st_index16 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS16);
3980 }
3981
3982 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3983 */
3984 fs_visitor v(brw->intelScreen->compiler, brw,
3985 mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
3986 prog, &fp->Base, 8, st_index8);
3987 if (!v.run_fs(false /* do_rep_send */)) {
3988 if (prog) {
3989 prog->LinkStatus = false;
3990 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3991 }
3992
3993 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3994 v.fail_msg);
3995
3996 return NULL;
3997 }
3998
3999 cfg_t *simd16_cfg = NULL;
4000 fs_visitor v2(brw->intelScreen->compiler, brw,
4001 mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4002 prog, &fp->Base, 16, st_index16);
4003 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4004 if (!v.simd16_unsupported) {
4005 /* Try a SIMD16 compile */
4006 v2.import_uniforms(&v);
4007 if (!v2.run_fs(brw->use_rep_send)) {
4008 perf_debug("SIMD16 shader failed to compile: %s", v2.fail_msg);
4009 } else {
4010 simd16_cfg = v2.cfg;
4011 }
4012 }
4013 }
4014
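   /* Drop the SIMD8 program only when it has been disabled (or on Gen4) and a
    * SIMD16 program is available.
    */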
4015 cfg_t *simd8_cfg;
4016 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4017 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4018 simd8_cfg = NULL;
4019 prog_data->no_8 = true;
4020 } else {
4021 simd8_cfg = v.cfg;
4022 prog_data->no_8 = false;
4023 }
4024
4025 fs_generator g(brw->intelScreen->compiler, brw,
4026 mem_ctx, (void *) key, &prog_data->base,
4027 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4028
4029 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4030 char *name;
4031 if (prog)
4032 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4033 prog->Label ? prog->Label : "unnamed",
4034 prog->Name);
4035 else
4036 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4037
4038 g.enable_debug(name);
4039 }
4040
4041 if (simd8_cfg)
4042 g.generate_code(simd8_cfg, 8);
4043 if (simd16_cfg)
4044 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4045
4046 if (unlikely(brw->perf_debug) && shader) {
4047 if (shader->compiled_once)
4048 brw_wm_debug_recompile(brw, prog, key);
4049 shader->compiled_once = true;
4050
4051 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4052 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4053 (get_time() - start_time) * 1000);
4054 }
4055 }
4056
4057 return g.get_assembly(final_assembly_size);
4058 }
4059
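/**
 * Precompile the fragment program with a guessed program key so that a
 * plausible variant is ready before it is first used.  The previously
 * compiled program state in the context is restored afterwards.
 */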
4060 extern "C" bool
4061 brw_fs_precompile(struct gl_context *ctx,
4062 struct gl_shader_program *shader_prog,
4063 struct gl_program *prog)
4064 {
4065 struct brw_context *brw = brw_context(ctx);
4066 struct brw_wm_prog_key key;
4067
4068 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4069 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4070 bool program_uses_dfdy = fp->UsesDFdy;
4071
4072 memset(&key, 0, sizeof(key));
4073
4074 if (brw->gen < 6) {
4075 if (fp->UsesKill)
4076 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4077
4078 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4079 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4080
4081 /* Just assume depth testing. */
4082 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4083 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4084 }
4085
4086 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4087 BRW_FS_VARYING_INPUT_MASK) > 16)
4088 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4089
4090 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4091
4092 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4093 key.drawable_height = ctx->DrawBuffer->Height;
4094 }
4095
4096 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4097 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4098 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4099
4100 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4101 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4102 key.nr_color_regions > 1;
4103 }
4104
4105 key.program_string_id = bfp->id;
4106
4107 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4108 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4109
4110 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4111
4112 brw->wm.base.prog_offset = old_prog_offset;
4113 brw->wm.prog_data = old_prog_data;
4114
4115 return success;
4116 }
4117
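/**
 * Fill in conservative texture swizzle state for a precompile: on hardware
 * without shader channel select, shadow samplers get the (X, X, X, 1)
 * DEPTH_TEXTURE_MODE swizzle; all other samplers are assumed unswizzled.
 */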
4118 void
4119 brw_setup_tex_for_precompile(struct brw_context *brw,
4120 struct brw_sampler_prog_key_data *tex,
4121 struct gl_program *prog)
4122 {
4123 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4124 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4125 for (unsigned i = 0; i < sampler_count; i++) {
4126 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4127 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4128 tex->swizzles[i] =
4129 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4130 } else {
4131 /* Color sampler: assume no swizzling. */
4132 tex->swizzles[i] = SWIZZLE_XYZW;
4133 }
4134 }
4135 }