1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->conditional_mod = BRW_CONDITIONAL_NONE;
59
60 this->dst = reg_undef;
61 this->src[0] = reg_undef;
62 this->src[1] = reg_undef;
63 this->src[2] = reg_undef;
64
65 /* This will be the case for almost all instructions. */
66 this->regs_written = 1;
67 }
68
69 fs_inst::fs_inst()
70 {
71 init();
72 this->opcode = BRW_OPCODE_NOP;
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
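/* Each ALUn(op) invocation below defines a small helper (e.g. ADD(), MUL(),
 * MAD()) that only constructs an fs_inst with the corresponding BRW opcode;
 * the caller is still responsible for passing it to emit().
 */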
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187 ALU2(SEL)
188
189 /** Gen4 predicated IF. */
190 fs_inst *
191 fs_visitor::IF(uint32_t predicate)
192 {
193 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195 return inst;
196 }
197
198 /** Gen6 IF with embedded comparison. */
199 fs_inst *
200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
201 {
202 assert(brw->gen == 6);
203 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
204 reg_null_d, src0, src1);
205 inst->conditional_mod = condition;
206 return inst;
207 }
208
209 /**
210 * CMP: Sets the low bit of the destination channels with the result
211 * of the comparison, while the upper bits are undefined, and updates
212 * the flag register with the packed 16 bits of the result.
213 */
214 fs_inst *
215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
216 {
217 fs_inst *inst;
218
219 /* Take the instruction:
220 *
221 * CMP null<d> src0<f> src1<f>
222 *
223 * Original gen4 does type conversion to the destination type before
224 * comparison, producing garbage results for floating point comparisons.
225 * gen5 does the comparison on the execution type (resolved source types),
226 * so dst type doesn't matter. gen6 does comparison and then uses the
227 * result as if it was the dst type with no conversion, which happens to
228 * mostly work out for float-interpreted-as-int since our comparisons are
229 * for >0, =0, <0.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 exec_list
247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
248 const fs_reg &surf_index,
249 const fs_reg &varying_offset,
250 uint32_t const_offset)
251 {
252 exec_list instructions;
253 fs_inst *inst;
254
255 /* We have our constant surface use a pitch of 4 bytes, so our index can
256 * be any component of a vector, and then we load 4 contiguous
257 * components starting from that.
258 *
259 * We break down the const_offset to a portion added to the variable
260 * offset and a portion done using reg_offset, which means that if you
261 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
262 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
263 * CSE can later notice that those loads are all the same and eliminate
264 * the redundant ones.
265 */
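/* E.g., with const_offset == 9 the variable offset gets 8 added to it
 * (const_offset & ~3), and reg_offset below selects component 1 of the
 * returned vec4 ((const_offset & 3) * scale).
 */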
266 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
267 instructions.push_tail(ADD(vec4_offset,
268 varying_offset, const_offset & ~3));
269
270 int scale = 1;
271 if (brw->gen == 4 && dispatch_width == 8) {
272 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
273 * u, v, r) as parameters, or we can just use the SIMD16 message
274 * consisting of (header, u). We choose the second, at the cost of a
275 * longer return length.
276 */
277 scale = 2;
278 }
279
280 enum opcode op;
281 if (brw->gen >= 7)
282 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
283 else
284 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
285 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
286 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
287 inst->regs_written = 4 * scale;
288 instructions.push_tail(inst);
289
290 if (brw->gen < 7) {
291 inst->base_mrf = 13;
292 inst->header_present = true;
293 if (brw->gen == 4)
294 inst->mlen = 3;
295 else
296 inst->mlen = 1 + dispatch_width / 8;
297 }
298
299 vec4_result.reg_offset += (const_offset & 3) * scale;
300 instructions.push_tail(MOV(dst, vec4_result));
301
302 return instructions;
303 }
304
305 /**
306 * A helper for MOV generation for fixing up broken hardware SEND dependency
307 * handling.
308 */
309 fs_inst *
310 fs_visitor::DEP_RESOLVE_MOV(int grf)
311 {
312 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
313
314 inst->ir = NULL;
315 inst->annotation = "send dependency resolve";
316
317 /* The caller always wants uncompressed to emit the minimal extra
318 * dependencies, and to avoid having to deal with aligning its regs to 2.
319 */
320 inst->force_uncompressed = true;
321
322 return inst;
323 }
324
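/** Field-by-field comparison of two instructions: true only when every
 * property that affects the generated code (opcode, operands, predication,
 * message parameters, etc.) matches.
 */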
325 bool
326 fs_inst::equals(fs_inst *inst)
327 {
328 return (opcode == inst->opcode &&
329 dst.equals(inst->dst) &&
330 src[0].equals(inst->src[0]) &&
331 src[1].equals(inst->src[1]) &&
332 src[2].equals(inst->src[2]) &&
333 saturate == inst->saturate &&
334 predicate == inst->predicate &&
335 conditional_mod == inst->conditional_mod &&
336 mlen == inst->mlen &&
337 base_mrf == inst->base_mrf &&
338 sampler == inst->sampler &&
339 target == inst->target &&
340 eot == inst->eot &&
341 header_present == inst->header_present &&
342 shadow_compare == inst->shadow_compare &&
343 offset == inst->offset);
344 }
345
346 bool
347 fs_inst::overwrites_reg(const fs_reg &reg)
348 {
349 return (reg.file == dst.file &&
350 reg.reg == dst.reg &&
351 reg.reg_offset >= dst.reg_offset &&
352 reg.reg_offset < dst.reg_offset + regs_written);
353 }
354
355 bool
356 fs_inst::is_send_from_grf()
357 {
358 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
359 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
360 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
361 src[1].file == GRF) ||
362 (is_tex() && src[0].file == GRF));
363 }
364
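/** True if source modifiers (negate/abs) may be used on this instruction's
 * operands; Gen6 math instructions and sends from a GRF don't support them.
 */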
365 bool
366 fs_visitor::can_do_source_mods(fs_inst *inst)
367 {
368 if (brw->gen == 6 && inst->is_math())
369 return false;
370
371 if (inst->is_send_from_grf())
372 return false;
373
374 if (!inst->can_do_source_mods())
375 return false;
376
377 return true;
378 }
379
380 void
381 fs_reg::init()
382 {
383 memset(this, 0, sizeof(*this));
384 stride = 1;
385 }
386
387 /** Generic unset register constructor. */
388 fs_reg::fs_reg()
389 {
390 init();
391 this->file = BAD_FILE;
392 }
393
394 /** Immediate value constructor. */
395 fs_reg::fs_reg(float f)
396 {
397 init();
398 this->file = IMM;
399 this->type = BRW_REGISTER_TYPE_F;
400 this->imm.f = f;
401 }
402
403 /** Immediate value constructor. */
404 fs_reg::fs_reg(int32_t i)
405 {
406 init();
407 this->file = IMM;
408 this->type = BRW_REGISTER_TYPE_D;
409 this->imm.i = i;
410 }
411
412 /** Immediate value constructor. */
413 fs_reg::fs_reg(uint32_t u)
414 {
415 init();
416 this->file = IMM;
417 this->type = BRW_REGISTER_TYPE_UD;
418 this->imm.u = u;
419 }
420
421 /** Fixed brw_reg. */
422 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
423 {
424 init();
425 this->file = HW_REG;
426 this->fixed_hw_reg = fixed_hw_reg;
427 this->type = fixed_hw_reg.type;
428 }
429
430 bool
431 fs_reg::equals(const fs_reg &r) const
432 {
433 return (file == r.file &&
434 reg == r.reg &&
435 reg_offset == r.reg_offset &&
436 subreg_offset == r.subreg_offset &&
437 type == r.type &&
438 negate == r.negate &&
439 abs == r.abs &&
440 !reladdr && !r.reladdr &&
441 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
442 sizeof(fixed_hw_reg)) == 0 &&
443 stride == r.stride &&
444 imm.u == r.imm.u);
445 }
446
447 fs_reg &
448 fs_reg::apply_stride(unsigned stride)
449 {
450 assert((this->stride * stride) <= 4 &&
451 (is_power_of_two(stride) || stride == 0) &&
452 file != HW_REG && file != IMM);
453 this->stride *= stride;
454 return *this;
455 }
456
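/** Reduce the register to a single component: offset to the given
 * subregister and use a zero stride so every channel reads that one value.
 */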
457 fs_reg &
458 fs_reg::set_smear(unsigned subreg)
459 {
460 assert(file != HW_REG && file != IMM);
461 subreg_offset = subreg * type_sz(type);
462 stride = 0;
463 return *this;
464 }
465
466 bool
467 fs_reg::is_contiguous() const
468 {
469 return stride == 1;
470 }
471
472 bool
473 fs_reg::is_zero() const
474 {
475 if (file != IMM)
476 return false;
477
478 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
479 }
480
481 bool
482 fs_reg::is_one() const
483 {
484 if (file != IMM)
485 return false;
486
487 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
488 }
489
490 bool
491 fs_reg::is_null() const
492 {
493 return file == HW_REG &&
494 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
495 fixed_hw_reg.nr == BRW_ARF_NULL;
496 }
497
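/** Whether this register is allowed as an operand of a three-source
 * instruction such as MAD or LRP.
 */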
498 bool
499 fs_reg::is_valid_3src() const
500 {
501 return file == GRF || file == UNIFORM;
502 }
503
504 int
505 fs_visitor::type_size(const struct glsl_type *type)
506 {
507 unsigned int size, i;
508
509 switch (type->base_type) {
510 case GLSL_TYPE_UINT:
511 case GLSL_TYPE_INT:
512 case GLSL_TYPE_FLOAT:
513 case GLSL_TYPE_BOOL:
514 return type->components();
515 case GLSL_TYPE_ARRAY:
516 return type_size(type->fields.array) * type->length;
517 case GLSL_TYPE_STRUCT:
518 size = 0;
519 for (i = 0; i < type->length; i++) {
520 size += type_size(type->fields.structure[i].type);
521 }
522 return size;
523 case GLSL_TYPE_SAMPLER:
524 /* Samplers take up no register space, since they're baked in at
525 * link time.
526 */
527 return 0;
528 case GLSL_TYPE_ATOMIC_UINT:
529 return 0;
530 case GLSL_TYPE_IMAGE:
531 case GLSL_TYPE_VOID:
532 case GLSL_TYPE_ERROR:
533 case GLSL_TYPE_INTERFACE:
534 assert(!"not reached");
535 break;
536 }
537
538 return 0;
539 }
540
541 fs_reg
542 fs_visitor::get_timestamp()
543 {
544 assert(brw->gen >= 7);
545
546 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
547 BRW_ARF_TIMESTAMP,
548 0),
549 BRW_REGISTER_TYPE_UD));
550
551 fs_reg dst = fs_reg(this, glsl_type::uint_type);
552
553 fs_inst *mov = emit(MOV(dst, ts));
554 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
555 * even if it's not enabled in the dispatch.
556 */
557 mov->force_writemask_all = true;
558 mov->force_uncompressed = true;
559
560 /* The caller wants the low 32 bits of the timestamp. Since it's running
561 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
562 * which is plenty of time for our purposes. It is identical across the
563 * EUs, but since it's tracking GPU core speed it will increment at a
564 * varying rate as render P-states change.
565 *
566 * The caller could also check if render P-states have changed (or anything
567 * else that might disrupt timing) by setting smear to 2 and checking if
568 * that field is != 0.
569 */
570 dst.set_smear(0);
571
572 return dst;
573 }
574
575 void
576 fs_visitor::emit_shader_time_begin()
577 {
578 current_annotation = "shader time start";
579 shader_start_time = get_timestamp();
580 }
581
582 void
583 fs_visitor::emit_shader_time_end()
584 {
585 current_annotation = "shader time end";
586
587 enum shader_time_shader_type type, written_type, reset_type;
588 if (dispatch_width == 8) {
589 type = ST_FS8;
590 written_type = ST_FS8_WRITTEN;
591 reset_type = ST_FS8_RESET;
592 } else {
593 assert(dispatch_width == 16);
594 type = ST_FS16;
595 written_type = ST_FS16_WRITTEN;
596 reset_type = ST_FS16_RESET;
597 }
598
599 fs_reg shader_end_time = get_timestamp();
600
601 /* Check that there weren't any timestamp reset events (assuming these
602 * were the only two timestamp reads that happened).
603 */
604 fs_reg reset = shader_end_time;
605 reset.set_smear(2);
606 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
607 test->conditional_mod = BRW_CONDITIONAL_Z;
608 emit(IF(BRW_PREDICATE_NORMAL));
609
610 push_force_uncompressed();
611 fs_reg start = shader_start_time;
612 start.negate = true;
613 fs_reg diff = fs_reg(this, glsl_type::uint_type);
614 emit(ADD(diff, start, shader_end_time));
615
616 /* If there were no instructions between the two timestamp gets, the diff
617 * is 2 cycles. Remove that overhead, so I can forget about that when
618 * trying to determine the time taken for single instructions.
619 */
620 emit(ADD(diff, diff, fs_reg(-2u)));
621
622 emit_shader_time_write(type, diff);
623 emit_shader_time_write(written_type, fs_reg(1u));
624 emit(BRW_OPCODE_ELSE);
625 emit_shader_time_write(reset_type, fs_reg(1u));
626 emit(BRW_OPCODE_ENDIF);
627
628 pop_force_uncompressed();
629 }
630
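/** Accumulate 'value' into the shader-time buffer slot for 'type', at byte
 * offset shader_time_index * SHADER_TIME_STRIDE, via the
 * SHADER_OPCODE_SHADER_TIME_ADD virtual opcode.
 */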
631 void
632 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
633 fs_reg value)
634 {
635 int shader_time_index =
636 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
637 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
638
639 fs_reg payload;
640 if (dispatch_width == 8)
641 payload = fs_reg(this, glsl_type::uvec2_type);
642 else
643 payload = fs_reg(this, glsl_type::uint_type);
644
645 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
646 fs_reg(), payload, offset, value));
647 }
648
649 void
650 fs_visitor::fail(const char *format, ...)
651 {
652 va_list va;
653 char *msg;
654
655 if (failed)
656 return;
657
658 failed = true;
659
660 va_start(va, format);
661 msg = ralloc_vasprintf(mem_ctx, format, va);
662 va_end(va);
663 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
664
665 this->fail_msg = msg;
666
667 if (INTEL_DEBUG & DEBUG_WM) {
668 fprintf(stderr, "%s", msg);
669 }
670 }
671
672 fs_inst *
673 fs_visitor::emit(enum opcode opcode)
674 {
675 return emit(new(mem_ctx) fs_inst(opcode));
676 }
677
678 fs_inst *
679 fs_visitor::emit(enum opcode opcode, fs_reg dst)
680 {
681 return emit(new(mem_ctx) fs_inst(opcode, dst));
682 }
683
684 fs_inst *
685 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
686 {
687 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
688 }
689
690 fs_inst *
691 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
692 {
693 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
694 }
695
696 fs_inst *
697 fs_visitor::emit(enum opcode opcode, fs_reg dst,
698 fs_reg src0, fs_reg src1, fs_reg src2)
699 {
700 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
701 }
702
703 void
704 fs_visitor::push_force_uncompressed()
705 {
706 force_uncompressed_stack++;
707 }
708
709 void
710 fs_visitor::pop_force_uncompressed()
711 {
712 force_uncompressed_stack--;
713 assert(force_uncompressed_stack >= 0);
714 }
715
716 /**
717 * Returns true if the instruction has a flag that means it won't
718 * update an entire destination register.
719 *
720 * For example, dead code elimination and live variable analysis want to know
721 * when a write to a variable screens off any preceding values that were in
722 * it.
723 */
724 bool
725 fs_inst::is_partial_write()
726 {
727 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
728 this->force_uncompressed ||
729 this->force_sechalf || !this->dst.is_contiguous());
730 }
731
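/** Number of virtual GRF registers read by source 'arg'. Texture messages
 * sent from a GRF carry mlen registers of payload in src[0]; in SIMD16 each
 * virtual GRF spans two hardware registers, so halve that (rounding up).
 * Every other source reads a single register.
 */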
732 int
733 fs_inst::regs_read(fs_visitor *v, int arg)
734 {
735 if (is_tex() && arg == 0 && src[0].file == GRF) {
736 if (v->dispatch_width == 16)
737 return (mlen + 1) / 2;
738 else
739 return mlen;
740 }
741 return 1;
742 }
743
744 bool
745 fs_inst::reads_flag()
746 {
747 return predicate;
748 }
749
750 bool
751 fs_inst::writes_flag()
752 {
753 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
754 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
755 }
756
757 /**
758 * Returns how many MRFs an FS opcode will write over.
759 *
760 * Note that this is not the 0 or 1 implied writes in an actual gen
761 * instruction -- the FS opcodes often generate MOVs in addition.
762 */
763 int
764 fs_visitor::implied_mrf_writes(fs_inst *inst)
765 {
766 if (inst->mlen == 0)
767 return 0;
768
769 if (inst->base_mrf == -1)
770 return 0;
771
772 switch (inst->opcode) {
773 case SHADER_OPCODE_RCP:
774 case SHADER_OPCODE_RSQ:
775 case SHADER_OPCODE_SQRT:
776 case SHADER_OPCODE_EXP2:
777 case SHADER_OPCODE_LOG2:
778 case SHADER_OPCODE_SIN:
779 case SHADER_OPCODE_COS:
780 return 1 * dispatch_width / 8;
781 case SHADER_OPCODE_POW:
782 case SHADER_OPCODE_INT_QUOTIENT:
783 case SHADER_OPCODE_INT_REMAINDER:
784 return 2 * dispatch_width / 8;
785 case SHADER_OPCODE_TEX:
786 case FS_OPCODE_TXB:
787 case SHADER_OPCODE_TXD:
788 case SHADER_OPCODE_TXF:
789 case SHADER_OPCODE_TXF_CMS:
790 case SHADER_OPCODE_TXF_MCS:
791 case SHADER_OPCODE_TG4:
792 case SHADER_OPCODE_TG4_OFFSET:
793 case SHADER_OPCODE_TXL:
794 case SHADER_OPCODE_TXS:
795 case SHADER_OPCODE_LOD:
796 return 1;
797 case FS_OPCODE_FB_WRITE:
798 return 2;
799 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
800 case SHADER_OPCODE_GEN4_SCRATCH_READ:
801 return 1;
802 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
803 return inst->mlen;
804 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
805 return 2;
806 case SHADER_OPCODE_UNTYPED_ATOMIC:
807 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
808 return 0;
809 default:
810 assert(!"not reached");
811 return inst->mlen;
812 }
813 }
814
815 int
816 fs_visitor::virtual_grf_alloc(int size)
817 {
818 if (virtual_grf_array_size <= virtual_grf_count) {
819 if (virtual_grf_array_size == 0)
820 virtual_grf_array_size = 16;
821 else
822 virtual_grf_array_size *= 2;
823 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
824 virtual_grf_array_size);
825 }
826 virtual_grf_sizes[virtual_grf_count] = size;
827 return virtual_grf_count++;
828 }
829
830 /** Constructor for a register in a given file with a fixed register number. */
831 fs_reg::fs_reg(enum register_file file, int reg)
832 {
833 init();
834 this->file = file;
835 this->reg = reg;
836 this->type = BRW_REGISTER_TYPE_F;
837 }
838
839 /** Constructor for a register in a given file with a fixed register number and type. */
840 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
841 {
842 init();
843 this->file = file;
844 this->reg = reg;
845 this->type = type;
846 }
847
848 /** Automatic reg constructor. */
849 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
850 {
851 init();
852
853 this->file = GRF;
854 this->reg = v->virtual_grf_alloc(v->type_size(type));
855 this->reg_offset = 0;
856 this->type = brw_type_for_base_type(type);
857 }
858
859 fs_reg *
860 fs_visitor::variable_storage(ir_variable *var)
861 {
862 return (fs_reg *)hash_table_find(this->variable_ht, var);
863 }
864
865 void
866 import_uniforms_callback(const void *key,
867 void *data,
868 void *closure)
869 {
870 struct hash_table *dst_ht = (struct hash_table *)closure;
871 const fs_reg *reg = (const fs_reg *)data;
872
873 if (reg->file != UNIFORM)
874 return;
875
876 hash_table_insert(dst_ht, data, key);
877 }
878
879 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
880 * This brings in those uniform definitions.
881 */
882 void
883 fs_visitor::import_uniforms(fs_visitor *v)
884 {
885 hash_table_call_foreach(v->variable_ht,
886 import_uniforms_callback,
887 variable_ht);
888 this->params_remap = v->params_remap;
889 this->nr_params_remap = v->nr_params_remap;
890 }
891
892 /* Our support for uniforms is piggy-backed on the struct
893 * gl_fragment_program, because that's where the values actually
894 * get stored, rather than in some global gl_shader_program uniform
895 * store.
896 */
897 void
898 fs_visitor::setup_uniform_values(ir_variable *ir)
899 {
900 int namelen = strlen(ir->name);
901
902 /* The data for our (non-builtin) uniforms is stored in a series of
903 * gl_uniform_driver_storage structs for each subcomponent that
904 * glGetUniformLocation() could name. We know it's been set up in the same
905 * order we'd walk the type, so walk the list of storage and find anything
906 * with our name, or the prefix of a component that starts with our name.
907 */
908 unsigned params_before = uniforms;
909 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
910 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
911
912 if (strncmp(ir->name, storage->name, namelen) != 0 ||
913 (storage->name[namelen] != 0 &&
914 storage->name[namelen] != '.' &&
915 storage->name[namelen] != '[')) {
916 continue;
917 }
918
919 unsigned slots = storage->type->component_slots();
920 if (storage->array_elements)
921 slots *= storage->array_elements;
922
923 for (unsigned i = 0; i < slots; i++) {
924 stage_prog_data->param[uniforms++] = &storage->storage[i].f;
925 }
926 }
927
928 /* Make sure we actually initialized the right amount of stuff here. */
929 assert(params_before + ir->type->component_slots() == uniforms);
930 (void)params_before;
931 }
932
933
934 /* Our support for builtin uniforms is even scarier than non-builtin.
935 * It sits on top of the PROG_STATE_VAR parameters that are
936 * automatically updated from GL context state.
937 */
938 void
939 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
940 {
941 const ir_state_slot *const slots = ir->state_slots;
942 assert(ir->state_slots != NULL);
943
944 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
945 /* This state reference has already been setup by ir_to_mesa, but we'll
946 * get the same index back here.
947 */
948 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
949 (gl_state_index *)slots[i].tokens);
950
951 /* Add each of the unique swizzles of the element as a parameter.
952 * This'll end up matching the expected layout of the
953 * array/matrix/structure we're trying to fill in.
954 */
955 int last_swiz = -1;
956 for (unsigned int j = 0; j < 4; j++) {
957 int swiz = GET_SWZ(slots[i].swizzle, j);
958 if (swiz == last_swiz)
959 break;
960 last_swiz = swiz;
961
962 stage_prog_data->param[uniforms++] =
963 &fp->Base.Parameters->ParameterValues[index][swiz].f;
964 }
965 }
966 }
967
968 fs_reg *
969 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
970 {
971 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
972 fs_reg wpos = *reg;
973 bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
974
975 /* gl_FragCoord.x */
976 if (ir->data.pixel_center_integer) {
977 emit(MOV(wpos, this->pixel_x));
978 } else {
979 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
980 }
981 wpos.reg_offset++;
982
983 /* gl_FragCoord.y */
984 if (!flip && ir->data.pixel_center_integer) {
985 emit(MOV(wpos, this->pixel_y));
986 } else {
987 fs_reg pixel_y = this->pixel_y;
988 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
989
990 if (flip) {
991 pixel_y.negate = true;
992 offset += c->key.drawable_height - 1.0;
993 }
994
995 emit(ADD(wpos, pixel_y, fs_reg(offset)));
996 }
997 wpos.reg_offset++;
998
999 /* gl_FragCoord.z */
1000 if (brw->gen >= 6) {
1001 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
1002 } else {
1003 emit(FS_OPCODE_LINTERP, wpos,
1004 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1005 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1006 interp_reg(VARYING_SLOT_POS, 2));
1007 }
1008 wpos.reg_offset++;
1009
1010 /* gl_FragCoord.w: Already set up in emit_interpolation */
1011 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1012
1013 return reg;
1014 }
1015
1016 fs_inst *
1017 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1018 glsl_interp_qualifier interpolation_mode,
1019 bool is_centroid, bool is_sample)
1020 {
1021 brw_wm_barycentric_interp_mode barycoord_mode;
1022 if (brw->gen >= 6) {
1023 if (is_centroid) {
1024 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1025 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1026 else
1027 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1028 } else if (is_sample) {
1029 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1030 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1031 else
1032 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1033 } else {
1034 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1035 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1036 else
1037 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1038 }
1039 } else {
1040 /* On Ironlake and below, there is only one interpolation mode.
1041 * Centroid interpolation doesn't mean anything on this hardware --
1042 * there is no multisampling.
1043 */
1044 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1045 }
1046 return emit(FS_OPCODE_LINTERP, attr,
1047 this->delta_x[barycoord_mode],
1048 this->delta_y[barycoord_mode], interp);
1049 }
1050
1051 fs_reg *
1052 fs_visitor::emit_general_interpolation(ir_variable *ir)
1053 {
1054 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1055 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1056 fs_reg attr = *reg;
1057
1058 unsigned int array_elements;
1059 const glsl_type *type;
1060
1061 if (ir->type->is_array()) {
1062 array_elements = ir->type->length;
1063 if (array_elements == 0) {
1064 fail("dereferenced array '%s' has length 0\n", ir->name);
1065 }
1066 type = ir->type->fields.array;
1067 } else {
1068 array_elements = 1;
1069 type = ir->type;
1070 }
1071
1072 glsl_interp_qualifier interpolation_mode =
1073 ir->determine_interpolation_mode(c->key.flat_shade);
1074
1075 int location = ir->data.location;
1076 for (unsigned int i = 0; i < array_elements; i++) {
1077 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1078 if (c->prog_data.urb_setup[location] == -1) {
1079 /* If there's no incoming setup data for this slot, don't
1080 * emit interpolation for it.
1081 */
1082 attr.reg_offset += type->vector_elements;
1083 location++;
1084 continue;
1085 }
1086
1087 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1088 /* Constant interpolation (flat shading) case. The SF has
1089 * handed us defined values in only the constant offset
1090 * field of the setup reg.
1091 */
1092 for (unsigned int k = 0; k < type->vector_elements; k++) {
1093 struct brw_reg interp = interp_reg(location, k);
1094 interp = suboffset(interp, 3);
1095 interp.type = reg->type;
1096 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1097 attr.reg_offset++;
1098 }
1099 } else {
1100 /* Smooth/noperspective interpolation case. */
1101 for (unsigned int k = 0; k < type->vector_elements; k++) {
1102 struct brw_reg interp = interp_reg(location, k);
1103 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1104 ir->data.centroid && !c->key.persample_shading,
1105 ir->data.sample || c->key.persample_shading);
1106 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1107 /* Get the pixel/sample mask into f0 so that we know
1108 * which pixels are lit. Then, for each channel that is
1109 * unlit, replace the centroid data with non-centroid
1110 * data.
1111 */
1112 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1113 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1114 interpolation_mode,
1115 false, false);
1116 inst->predicate = BRW_PREDICATE_NORMAL;
1117 inst->predicate_inverse = true;
1118 }
1119 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1120 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1121 }
1122 attr.reg_offset++;
1123 }
1124
1125 }
1126 location++;
1127 }
1128 }
1129
1130 return reg;
1131 }
1132
1133 fs_reg *
1134 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1135 {
1136 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1137
1138 /* The frontfacing comes in as a bit in the thread payload. */
1139 if (brw->gen >= 6) {
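/* Bit 15 of g0.0 is set for back-facing primitives. The ASR by 15 shifts
 * that bit into bit 0, and the NOT + AND with 1 turn it into 1 for
 * front-facing and 0 for back-facing.
 */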
1140 emit(BRW_OPCODE_ASR, *reg,
1141 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1142 fs_reg(15));
1143 emit(BRW_OPCODE_NOT, *reg, *reg);
1144 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1145 } else {
1146 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1147 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1148 * us front face.
1149 */
1150 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1151 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1152 }
1153
1154 return reg;
1155 }
1156
1157 void
1158 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1159 {
1160 assert(dst.type == BRW_REGISTER_TYPE_F);
1161
1162 if (c->key.compute_pos_offset) {
1163 /* Convert int_sample_pos to floating point */
1164 emit(MOV(dst, int_sample_pos));
1165 /* Scale to the range [0, 1] */
1166 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1167 }
1168 else {
1169 /* From the ARB_sample_shading specification:
1170 * "When rendering to a non-multisample buffer, or if multisample
1171 * rasterization is disabled, gl_SamplePosition will always be
1172 * (0.5, 0.5)."
1173 */
1174 emit(MOV(dst, fs_reg(0.5f)));
1175 }
1176 }
1177
1178 fs_reg *
1179 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1180 {
1181 assert(brw->gen >= 6);
1182 assert(ir->type == glsl_type::vec2_type);
1183
1184 this->current_annotation = "compute sample position";
1185 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1186 fs_reg pos = *reg;
1187 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1188 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1189
1190 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1191 * mode will be enabled.
1192 *
1193 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1194 * R31.1:0 Position Offset X/Y for Slot[3:0]
1195 * R31.3:2 Position Offset X/Y for Slot[7:4]
1196 * .....
1197 *
1198 * The X, Y sample positions come in as bytes in thread payload. So, read
1199 * the positions using vstride=16, width=8, hstride=2.
1200 */
1201 struct brw_reg sample_pos_reg =
1202 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1203 BRW_REGISTER_TYPE_B), 16, 8, 2);
1204
1205 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1206 if (dispatch_width == 16) {
1207 fs_inst *inst = emit(MOV(half(int_sample_x, 1),
1208 fs_reg(suboffset(sample_pos_reg, 16))));
1209 inst->force_sechalf = true;
1210 }
1211 /* Compute gl_SamplePosition.x */
1212 compute_sample_position(pos, int_sample_x);
1213 pos.reg_offset++;
1214 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1215 if (dispatch_width == 16) {
1216 fs_inst *inst = emit(MOV(half(int_sample_y, 1),
1217 fs_reg(suboffset(sample_pos_reg, 17))));
1218 inst->force_sechalf = true;
1219 }
1220 /* Compute gl_SamplePosition.y */
1221 compute_sample_position(pos, int_sample_y);
1222 return reg;
1223 }
1224
1225 fs_reg *
1226 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1227 {
1228 assert(brw->gen >= 6);
1229
1230 this->current_annotation = "compute sample id";
1231 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1232
1233 if (c->key.compute_sample_id) {
1234 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1235 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1236 t2.type = BRW_REGISTER_TYPE_UW;
1237
1238 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1239 * 8x multisampling, subspan 0 will represent sample N (where N
1240 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1241 * 7. We can find the value of N by looking at R0.0 bits 7:6
1242 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1243 * (since samples are always delivered in pairs). That is, we
1244 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1245 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1246 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1247 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1248 * populating a temporary variable with the sequence (0, 1, 2, 3),
1249 * and then reading from it using vstride=1, width=4, hstride=0.
1250 * These computations hold good for 4x multisampling as well.
1251 */
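/* For example, if R0.0 bits 7:6 read 01b (SSPI == 1), t1 below becomes 2,
 * and the SIMD8 sample IDs come out as 2, 2, 2, 2, 3, 3, 3, 3.
 */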
1252 emit(BRW_OPCODE_AND, t1,
1253 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1254 fs_reg(brw_imm_d(0xc0)));
1255 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1256 /* This works for both SIMD8 and SIMD16 */
1257 emit(MOV(t2, brw_imm_v(0x3210)));
1258 /* This special instruction takes care of setting vstride=1,
1259 * width=4, hstride=0 of t2 during an ADD instruction.
1260 */
1261 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1262 } else {
1263 /* As per GL_ARB_sample_shading specification:
1264 * "When rendering to a non-multisample buffer, or if multisample
1265 * rasterization is disabled, gl_SampleID will always be zero."
1266 */
1267 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1268 }
1269
1270 return reg;
1271 }
1272
1273 fs_reg *
1274 fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
1275 {
1276 assert(brw->gen >= 7);
1277 this->current_annotation = "compute gl_SampleMaskIn";
1278 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1279 emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
1280 return reg;
1281 }
1282
1283 fs_reg
1284 fs_visitor::fix_math_operand(fs_reg src)
1285 {
1286 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1287 * might be able to do better by doing execsize = 1 math and then
1288 * expanding that result out, but we would need to be careful with
1289 * masking.
1290 *
1291 * The hardware ignores source modifiers (negate and abs) on math
1292 * instructions, so we also move to a temp to set those up.
1293 */
1294 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1295 !src.abs && !src.negate)
1296 return src;
1297
1298 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1299 * operands to math
1300 */
1301 if (brw->gen >= 7 && src.file != IMM)
1302 return src;
1303
1304 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1305 expanded.type = src.type;
1306 emit(BRW_OPCODE_MOV, expanded, src);
1307 return expanded;
1308 }
1309
1310 fs_inst *
1311 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1312 {
1313 switch (opcode) {
1314 case SHADER_OPCODE_RCP:
1315 case SHADER_OPCODE_RSQ:
1316 case SHADER_OPCODE_SQRT:
1317 case SHADER_OPCODE_EXP2:
1318 case SHADER_OPCODE_LOG2:
1319 case SHADER_OPCODE_SIN:
1320 case SHADER_OPCODE_COS:
1321 break;
1322 default:
1323 assert(!"not reached: bad math opcode");
1324 return NULL;
1325 }
1326
1327 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1328 * might be able to do better by doing execsize = 1 math and then
1329 * expanding that result out, but we would need to be careful with
1330 * masking.
1331 *
1332 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1333 * instructions, so we also move to a temp to set those up.
1334 */
1335 if (brw->gen >= 6)
1336 src = fix_math_operand(src);
1337
1338 fs_inst *inst = emit(opcode, dst, src);
1339
1340 if (brw->gen < 6) {
1341 inst->base_mrf = 2;
1342 inst->mlen = dispatch_width / 8;
1343 }
1344
1345 return inst;
1346 }
1347
1348 fs_inst *
1349 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1350 {
1351 int base_mrf = 2;
1352 fs_inst *inst;
1353
1354 switch (opcode) {
1355 case SHADER_OPCODE_INT_QUOTIENT:
1356 case SHADER_OPCODE_INT_REMAINDER:
1357 if (brw->gen >= 7 && dispatch_width == 16)
1358 fail("SIMD16 INTDIV unsupported\n");
1359 break;
1360 case SHADER_OPCODE_POW:
1361 break;
1362 default:
1363 assert(!"not reached: unsupported binary math opcode.");
1364 return NULL;
1365 }
1366
1367 if (brw->gen >= 6) {
1368 src0 = fix_math_operand(src0);
1369 src1 = fix_math_operand(src1);
1370
1371 inst = emit(opcode, dst, src0, src1);
1372 } else {
1373 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1374 * "Message Payload":
1375 *
1376 * "Operand0[7]. For the INT DIV functions, this operand is the
1377 * denominator."
1378 * ...
1379 * "Operand1[7]. For the INT DIV functions, this operand is the
1380 * numerator."
1381 */
1382 bool is_int_div = opcode != SHADER_OPCODE_POW;
1383 fs_reg &op0 = is_int_div ? src1 : src0;
1384 fs_reg &op1 = is_int_div ? src0 : src1;
1385
1386 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1387 inst = emit(opcode, dst, op0, reg_null_f);
1388
1389 inst->base_mrf = base_mrf;
1390 inst->mlen = 2 * dispatch_width / 8;
1391 }
1392 return inst;
1393 }
1394
1395 void
1396 fs_visitor::assign_curb_setup()
1397 {
1398 if (dispatch_width == 8) {
1399 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1400 stage_prog_data->nr_params = uniforms;
1401 } else {
1402 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1403 /* Make sure we didn't try to sneak in an extra uniform */
1404 assert(uniforms == 0);
1405 }
1406
1407 c->prog_data.curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1408
1409 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1410 foreach_list(node, &this->instructions) {
1411 fs_inst *inst = (fs_inst *)node;
1412
1413 for (unsigned int i = 0; i < 3; i++) {
1414 if (inst->src[i].file == UNIFORM) {
1415 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1416 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1417 constant_nr / 8,
1418 constant_nr % 8);
1419
1420 inst->src[i].file = HW_REG;
1421 inst->src[i].fixed_hw_reg = byte_offset(
1422 retype(brw_reg, inst->src[i].type),
1423 inst->src[i].subreg_offset);
1424 }
1425 }
1426 }
1427 }
1428
1429 void
1430 fs_visitor::calculate_urb_setup()
1431 {
1432 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1433 c->prog_data.urb_setup[i] = -1;
1434 }
1435
1436 int urb_next = 0;
1437 /* Figure out where each of the incoming setup attributes lands. */
1438 if (brw->gen >= 6) {
1439 if (_mesa_bitcount_64(fp->Base.InputsRead &
1440 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1441 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1442 * first 16 varying inputs, so we can put them wherever we want.
1443 * Just put them in order.
1444 *
1445 * This is useful because it means that (a) inputs not used by the
1446 * fragment shader won't take up valuable register space, and (b) we
1447 * won't have to recompile the fragment shader if it gets paired with
1448 * a different vertex (or geometry) shader.
1449 */
1450 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1451 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1452 BITFIELD64_BIT(i)) {
1453 c->prog_data.urb_setup[i] = urb_next++;
1454 }
1455 }
1456 } else {
1457 /* We have enough input varyings that the SF/SBE pipeline stage can't
1458 * arbitrarily rearrange them to suit our whim; we have to put them
1459 * in an order that matches the output of the previous pipeline stage
1460 * (geometry or vertex shader).
1461 */
1462 struct brw_vue_map prev_stage_vue_map;
1463 brw_compute_vue_map(brw, &prev_stage_vue_map,
1464 c->key.input_slots_valid);
1465 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1466 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1467 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1468 slot++) {
1469 int varying = prev_stage_vue_map.slot_to_varying[slot];
1470 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1471 * unused.
1472 */
1473 if (varying != BRW_VARYING_SLOT_COUNT &&
1474 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1475 BITFIELD64_BIT(varying))) {
1476 c->prog_data.urb_setup[varying] = slot - first_slot;
1477 }
1478 }
1479 urb_next = prev_stage_vue_map.num_slots - first_slot;
1480 }
1481 } else {
1482 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1483 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1484 /* Point size is packed into the header, not as a general attribute */
1485 if (i == VARYING_SLOT_PSIZ)
1486 continue;
1487
1488 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1489 /* The back color slot is skipped when the front color is
1490 * also written to. In addition, some slots can be
1491 * written in the vertex shader and not read in the
1492 * fragment shader. So the register number must always be
1493 * incremented, mapped or not.
1494 */
1495 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1496 c->prog_data.urb_setup[i] = urb_next;
1497 urb_next++;
1498 }
1499 }
1500
1501 /*
1502 * It's an FS-only attribute, and we did the interpolation for this
1503 * attribute in the SF thread. So, count it here, too.
1504 *
1505 * See compile_sf_prog() for more info.
1506 */
1507 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1508 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1509 }
1510
1511 c->prog_data.num_varying_inputs = urb_next;
1512 }
1513
1514 void
1515 fs_visitor::assign_urb_setup()
1516 {
1517 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1518
1519 /* Offset all the urb_setup[] index by the actual position of the
1520 * setup regs, now that the location of the constants has been chosen.
1521 */
1522 foreach_list(node, &this->instructions) {
1523 fs_inst *inst = (fs_inst *)node;
1524
1525 if (inst->opcode == FS_OPCODE_LINTERP) {
1526 assert(inst->src[2].file == HW_REG);
1527 inst->src[2].fixed_hw_reg.nr += urb_start;
1528 }
1529
1530 if (inst->opcode == FS_OPCODE_CINTERP) {
1531 assert(inst->src[0].file == HW_REG);
1532 inst->src[0].fixed_hw_reg.nr += urb_start;
1533 }
1534 }
1535
1536 /* Each attribute is 4 setup channels, each of which is half a reg. */
1537 this->first_non_payload_grf =
1538 urb_start + c->prog_data.num_varying_inputs * 2;
1539 }
1540
1541 /**
1542 * Split large virtual GRFs into separate components if we can.
1543 *
1544 * This is mostly duplicated with what brw_fs_vector_splitting does,
1545 * but that's really conservative because it's afraid of doing
1546 * splitting that doesn't result in real progress after the rest of
1547 * the optimization phases, which would cause infinite looping in
1548 * optimization. We can do it once here, safely. This also has the
1549 * opportunity to split interpolated values, or maybe even uniforms,
1550 * which we don't have at the IR level.
1551 *
1552 * We want to split, because virtual GRFs are what we register
1553 * allocate and spill (due to contiguousness requirements for some
1554 * instructions), and they're what we naturally generate in the
1555 * codegen process, but most virtual GRFs don't actually need to be
1556 * contiguous sets of GRFs. If we split, we'll end up with reduced
1557 * live intervals and better dead code elimination and coalescing.
1558 */
1559 void
1560 fs_visitor::split_virtual_grfs()
1561 {
1562 int num_vars = this->virtual_grf_count;
1563 bool split_grf[num_vars];
1564 int new_virtual_grf[num_vars];
1565
1566 /* Try to split anything larger than a single register. */
1567 for (int i = 0; i < num_vars; i++) {
1568 if (this->virtual_grf_sizes[i] != 1)
1569 split_grf[i] = true;
1570 else
1571 split_grf[i] = false;
1572 }
1573
1574 if (brw->has_pln &&
1575 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1576 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1577 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1578 * Gen6, that was the only supported interpolation mode, and since Gen6,
1579 * delta_x and delta_y are in fixed hardware registers.
1580 */
1581 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1582 false;
1583 }
1584
1585 foreach_list(node, &this->instructions) {
1586 fs_inst *inst = (fs_inst *)node;
1587
1588 /* If there's a SEND message that requires contiguous destination
1589 * registers, no splitting is allowed.
1590 */
1591 if (inst->regs_written > 1) {
1592 split_grf[inst->dst.reg] = false;
1593 }
1594
1595 /* If we're sending from a GRF, don't split it, on the assumption that
1596 * the send is reading the whole thing.
1597 */
1598 if (inst->is_send_from_grf()) {
1599 for (int i = 0; i < 3; i++) {
1600 if (inst->src[i].file == GRF) {
1601 split_grf[inst->src[i].reg] = false;
1602 }
1603 }
1604 }
1605 }
1606
1607 /* Allocate new space for split regs. Note that the virtual
1608 * numbers will be contiguous.
1609 */
1610 for (int i = 0; i < num_vars; i++) {
1611 if (split_grf[i]) {
1612 new_virtual_grf[i] = virtual_grf_alloc(1);
1613 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1614 int reg = virtual_grf_alloc(1);
1615 assert(reg == new_virtual_grf[i] + j - 1);
1616 (void) reg;
1617 }
1618 this->virtual_grf_sizes[i] = 1;
1619 }
1620 }
1621
1622 foreach_list(node, &this->instructions) {
1623 fs_inst *inst = (fs_inst *)node;
1624
1625 if (inst->dst.file == GRF &&
1626 split_grf[inst->dst.reg] &&
1627 inst->dst.reg_offset != 0) {
1628 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1629 inst->dst.reg_offset - 1);
1630 inst->dst.reg_offset = 0;
1631 }
1632 for (int i = 0; i < 3; i++) {
1633 if (inst->src[i].file == GRF &&
1634 split_grf[inst->src[i].reg] &&
1635 inst->src[i].reg_offset != 0) {
1636 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1637 inst->src[i].reg_offset - 1);
1638 inst->src[i].reg_offset = 0;
1639 }
1640 }
1641 }
1642 invalidate_live_intervals();
1643 }
1644
1645 /**
1646 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1647 *
1648 * During code generation, we create tons of temporary variables, many of
1649 * which get immediately killed and are never used again. Yet, in later
1650 * optimization and analysis passes, such as compute_live_intervals, we need
1651 * to loop over all the virtual GRFs. Compacting them can save a lot of
1652 * overhead.
1653 */
1654 void
1655 fs_visitor::compact_virtual_grfs()
1656 {
1657 /* Mark which virtual GRFs are used, and count how many. */
1658 int remap_table[this->virtual_grf_count];
1659 memset(remap_table, -1, sizeof(remap_table));
1660
1661 foreach_list(node, &this->instructions) {
1662 const fs_inst *inst = (const fs_inst *) node;
1663
1664 if (inst->dst.file == GRF)
1665 remap_table[inst->dst.reg] = 0;
1666
1667 for (int i = 0; i < 3; i++) {
1668 if (inst->src[i].file == GRF)
1669 remap_table[inst->src[i].reg] = 0;
1670 }
1671 }
1672
1673 /* In addition to registers used in instructions, fs_visitor keeps
1674 * direct references to certain special values which must be patched:
1675 */
1676 fs_reg *special[] = {
1677 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1678 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1679 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1680 &delta_x[0], &delta_x[1], &delta_x[2],
1681 &delta_x[3], &delta_x[4], &delta_x[5],
1682 &delta_y[0], &delta_y[1], &delta_y[2],
1683 &delta_y[3], &delta_y[4], &delta_y[5],
1684 };
1685 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1686 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1687
1688 /* Treat all special values as used, to be conservative */
1689 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1690 if (special[i]->file == GRF)
1691 remap_table[special[i]->reg] = 0;
1692 }
1693
1694 /* Compact the GRF arrays. */
1695 int new_index = 0;
1696 for (int i = 0; i < this->virtual_grf_count; i++) {
1697 if (remap_table[i] != -1) {
1698 remap_table[i] = new_index;
1699 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1700 invalidate_live_intervals();
1701 ++new_index;
1702 }
1703 }
1704
1705 this->virtual_grf_count = new_index;
1706
1707 /* Patch all the instructions to use the newly renumbered registers */
1708 foreach_list(node, &this->instructions) {
1709 fs_inst *inst = (fs_inst *) node;
1710
1711 if (inst->dst.file == GRF)
1712 inst->dst.reg = remap_table[inst->dst.reg];
1713
1714 for (int i = 0; i < 3; i++) {
1715 if (inst->src[i].file == GRF)
1716 inst->src[i].reg = remap_table[inst->src[i].reg];
1717 }
1718 }
1719
1720 /* Patch all the references to special values */
1721 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1722 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1723 special[i]->reg = remap_table[special[i]->reg];
1724 }
1725 }
1726
1727 bool
1728 fs_visitor::remove_dead_constants()
1729 {
1730 if (dispatch_width == 8) {
1731 this->params_remap = ralloc_array(mem_ctx, int, uniforms);
1732 this->nr_params_remap = uniforms;
1733
1734 for (unsigned int i = 0; i < uniforms; i++)
1735 this->params_remap[i] = -1;
1736
1737 /* Find which params are still in use. */
1738 foreach_list(node, &this->instructions) {
1739 fs_inst *inst = (fs_inst *)node;
1740
1741 for (int i = 0; i < 3; i++) {
1742 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1743
1744 if (inst->src[i].file != UNIFORM)
1745 continue;
1746
1747 /* Section 5.11 of the OpenGL 4.3 spec says:
1748 *
1749 * "Out-of-bounds reads return undefined values, which include
1750 * values from other variables of the active program or zero."
1751 */
1752 if (constant_nr < 0 || constant_nr >= (int)uniforms) {
1753 constant_nr = 0;
1754 }
1755
1756 /* For now, set this to non-negative. We'll give it the
1757 * actual new number in a moment, in order to keep the
1758 * register numbers nicely ordered.
1759 */
1760 this->params_remap[constant_nr] = 0;
1761 }
1762 }
1763
1764 /* Figure out what the new numbers for the params will be. At some
1765 * point when we're doing uniform array access, we're going to want
1766 * to keep the distinction between .reg and .reg_offset, but for
1767 * now we don't care.
1768 */
1769 unsigned int new_nr_params = 0;
1770 for (unsigned int i = 0; i < uniforms; i++) {
1771 if (this->params_remap[i] != -1) {
1772 this->params_remap[i] = new_nr_params++;
1773 }
1774 }
1775
1776 /* Update the list of params to be uploaded to match our new numbering. */
1777 for (unsigned int i = 0; i < uniforms; i++) {
1778 int remapped = this->params_remap[i];
1779
1780 if (remapped == -1)
1781 continue;
1782
1783 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1784 }
1785
1786 uniforms = new_nr_params;
1787 } else {
1788 /* This should have been generated in the SIMD8 pass already. */
1789 assert(this->params_remap);
1790 }
1791
1792 /* Now do the renumbering of the shader to remove unused params. */
1793 foreach_list(node, &this->instructions) {
1794 fs_inst *inst = (fs_inst *)node;
1795
1796 for (int i = 0; i < 3; i++) {
1797 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1798
1799 if (inst->src[i].file != UNIFORM)
1800 continue;
1801
1802 /* As above, redirect out-of-bounds reads to uniform 0. */
1803 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1804 constant_nr = 0;
1805 }
1806 assert(this->params_remap[constant_nr] != -1);
1807 inst->src[i].reg = this->params_remap[constant_nr];
1808 inst->src[i].reg_offset = 0;
1809 }
1810 }
1811
1812 return true;
1813 }
1814
1815 /*
1816 * Implements array access of uniforms by inserting a
1817 * PULL_CONSTANT_LOAD instruction.
1818 *
1819 * Unlike temporary GRF array access (where we don't support it due to
1820 * the difficulty of doing relative addressing on instruction
1821 * destinations), we could potentially do array access of uniforms
1822 * that were loaded in GRF space as push constants. In real-world
1823 * usage we've seen, though, the arrays being used are always larger
1824 * than we could load as push constants, so just always move all
1825 * uniform array access out to a pull constant buffer.
1826 */
1827 void
1828 fs_visitor::move_uniform_array_access_to_pull_constants()
1829 {
1830 int pull_constant_loc[uniforms];
1831
1832 for (unsigned int i = 0; i < uniforms; i++) {
1833 pull_constant_loc[i] = -1;
1834 }
1835
1836 /* Walk through and find array access of uniforms. Put a copy of that
1837 * uniform in the pull constant buffer.
1838 *
1839 * Note that we don't move constant-indexed accesses to arrays. No
1840 * testing has been done of the performance impact of this choice.
1841 */
1842 foreach_list_safe(node, &this->instructions) {
1843 fs_inst *inst = (fs_inst *)node;
1844
1845 for (int i = 0 ; i < 3; i++) {
1846 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1847 continue;
1848
1849 int uniform = inst->src[i].reg;
1850
1851 /* If this array isn't already present in the pull constant buffer,
1852 * add it.
1853 */
1854 if (pull_constant_loc[uniform] == -1) {
1855 const float **values = &stage_prog_data->param[uniform];
1856
1857 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params;
1858
1859 assert(param_size[uniform]);
1860
1861 for (int j = 0; j < param_size[uniform]; j++) {
1862 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1863 values[j];
1864 }
1865 }
1866
1867 /* Set up the annotation tracking for new generated instructions. */
1868 base_ir = inst->ir;
1869 current_annotation = inst->annotation;
1870
1871 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1872 fs_reg temp = fs_reg(this, glsl_type::float_type);
1873 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1874 surf_index,
1875 *inst->src[i].reladdr,
1876 pull_constant_loc[uniform] +
1877 inst->src[i].reg_offset);
1878 inst->insert_before(&list);
1879
1880 inst->src[i].file = temp.file;
1881 inst->src[i].reg = temp.reg;
1882 inst->src[i].reg_offset = temp.reg_offset;
1883 inst->src[i].reladdr = NULL;
1884 }
1885 }
1886 }
1887
1888 /**
1889 * Choose accesses from the UNIFORM file to demote to using the pull
1890 * constant buffer.
1891 *
1892  * We allow a fragment shader to use more than the spec-mandated minimum
1893  * maximum number of fragment shader uniform components (64).  If
1894  * there are too many of them, they'd fill up all of the register space.
1895 * So, this will push some of them out to the pull constant buffer and
1896 * update the program to load them.
1897 */
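/* Worked example of the indexing math used below (the values are
 * hypothetical): with the 128-component push limit, the uniform at
 * location 130 gets demoted.  If it ends up at pull_index 9, the pull load
 * reads from the 16-byte-aligned offset (9 * 4) & ~15 = 32, and
 * set_smear(9 & 3) selects component 1 of the vec4 that comes back.
 */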
1898 void
1899 fs_visitor::setup_pull_constants()
1900 {
1901 /* Only allow 16 registers (128 uniform components) as push constants. */
1902 unsigned int max_uniform_components = 16 * 8;
1903 if (uniforms <= max_uniform_components)
1904 return;
1905
1906 if (dispatch_width == 16) {
1907 fail("Pull constants not supported in SIMD16\n");
1908 return;
1909 }
1910
1911 /* Just demote the end of the list. We could probably do better
1912 * here, demoting things that are rarely used in the program first.
1913 */
1914 unsigned int pull_uniform_base = max_uniform_components;
1915
1916 int pull_constant_loc[uniforms];
1917 for (unsigned int i = 0; i < uniforms; i++) {
1918 if (i < pull_uniform_base) {
1919 pull_constant_loc[i] = -1;
1920 } else {
1921 pull_constant_loc[i] = -1;
1922 /* If our constant is already being uploaded for reladdr purposes,
1923 * reuse it.
1924 */
1925 for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j++) {
1926 if (stage_prog_data->pull_param[j] == stage_prog_data->param[i]) {
1927 pull_constant_loc[i] = j;
1928 break;
1929 }
1930 }
1931 if (pull_constant_loc[i] == -1) {
1932 int pull_index = stage_prog_data->nr_pull_params++;
1933 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1934 pull_constant_loc[i] = pull_index;
1935 }
1936 }
1937 }
1938 uniforms = pull_uniform_base;
1939
1940 foreach_list(node, &this->instructions) {
1941 fs_inst *inst = (fs_inst *)node;
1942
1943 for (int i = 0; i < 3; i++) {
1944 if (inst->src[i].file != UNIFORM)
1945 continue;
1946
1947 int pull_index = pull_constant_loc[inst->src[i].reg +
1948 inst->src[i].reg_offset];
1949 if (pull_index == -1)
1950 continue;
1951
1952 assert(!inst->src[i].reladdr);
1953
1954 fs_reg dst = fs_reg(this, glsl_type::float_type);
1955 fs_reg index(stage_prog_data->binding_table.pull_constants_start);
1956 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1957 fs_inst *pull =
1958 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1959 dst, index, offset);
1960 pull->ir = inst->ir;
1961 pull->annotation = inst->annotation;
1962
1963 inst->insert_before(pull);
1964
1965 inst->src[i].file = GRF;
1966 inst->src[i].reg = dst.reg;
1967 inst->src[i].reg_offset = 0;
1968 inst->src[i].set_smear(pull_index & 3);
1969 }
1970 }
1971 }
1972
1973 bool
1974 fs_visitor::opt_algebraic()
1975 {
1976 bool progress = false;
1977
1978 foreach_list(node, &this->instructions) {
1979 fs_inst *inst = (fs_inst *)node;
1980
1981 switch (inst->opcode) {
1982 case BRW_OPCODE_MUL:
1983 if (inst->src[1].file != IMM)
1984 continue;
1985
1986 /* a * 1.0 = a */
1987 if (inst->src[1].is_one()) {
1988 inst->opcode = BRW_OPCODE_MOV;
1989 inst->src[1] = reg_undef;
1990 progress = true;
1991 break;
1992 }
1993
1994 /* a * 0.0 = 0.0 */
1995 if (inst->src[1].is_zero()) {
1996 inst->opcode = BRW_OPCODE_MOV;
1997 inst->src[0] = inst->src[1];
1998 inst->src[1] = reg_undef;
1999 progress = true;
2000 break;
2001 }
2002
2003 break;
2004 case BRW_OPCODE_ADD:
2005 if (inst->src[1].file != IMM)
2006 continue;
2007
2008 /* a + 0.0 = a */
2009 if (inst->src[1].is_zero()) {
2010 inst->opcode = BRW_OPCODE_MOV;
2011 inst->src[1] = reg_undef;
2012 progress = true;
2013 break;
2014 }
2015 break;
2016 case BRW_OPCODE_OR:
2017 if (inst->src[0].equals(inst->src[1])) {
2018 inst->opcode = BRW_OPCODE_MOV;
2019 inst->src[1] = reg_undef;
2020 progress = true;
2021 break;
2022 }
2023 break;
2024 case BRW_OPCODE_LRP:
2025 if (inst->src[1].equals(inst->src[2])) {
2026 inst->opcode = BRW_OPCODE_MOV;
2027 inst->src[0] = inst->src[1];
2028 inst->src[1] = reg_undef;
2029 inst->src[2] = reg_undef;
2030 progress = true;
2031 break;
2032 }
2033 break;
2034 case BRW_OPCODE_SEL:
2035 if (inst->saturate && inst->src[1].file == IMM) {
2036 switch (inst->conditional_mod) {
2037 case BRW_CONDITIONAL_LE:
2038 case BRW_CONDITIONAL_L:
2039 switch (inst->src[1].type) {
2040 case BRW_REGISTER_TYPE_F:
2041 if (inst->src[1].imm.f >= 1.0f) {
2042 inst->opcode = BRW_OPCODE_MOV;
2043                   inst->src[1] = reg_undef;
                       inst->conditional_mod = BRW_CONDITIONAL_NONE;
2044 progress = true;
2045 }
2046 break;
2047 default:
2048 break;
2049 }
2050 break;
2051 case BRW_CONDITIONAL_GE:
2052 case BRW_CONDITIONAL_G:
2053 switch (inst->src[1].type) {
2054 case BRW_REGISTER_TYPE_F:
2055 if (inst->src[1].imm.f <= 0.0f) {
2056 inst->opcode = BRW_OPCODE_MOV;
2057 inst->src[1] = reg_undef;
2058 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2059 progress = true;
2060 }
2061 break;
2062 default:
2063 break;
2064 }
2065 default:
2066 break;
2067 }
2068 }
2069 break;
2070 default:
2071 break;
2072 }
2073 }
2074
2075 return progress;
2076 }
2077
2078 /**
2079 * Removes any instructions writing a VGRF where that VGRF is not used by any
2080 * later instruction.
2081 */
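/* For example (hypothetical IR, with vgrf5 never read afterwards):
 *
 *    add vgrf5:F, vgrf1:F, vgrf2:F
 *    mov m2:F, vgrf1:F
 *
 * vgrf5's live interval ends at the ADD itself, so the ADD is removed.
 * An ADDC, SUBB, or MACH in the same position would instead just have its
 * destination replaced with null, to preserve its accumulator write.
 */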
2082 bool
2083 fs_visitor::dead_code_eliminate()
2084 {
2085 bool progress = false;
2086 int pc = 0;
2087
2088 calculate_live_intervals();
2089
2090 foreach_list_safe(node, &this->instructions) {
2091 fs_inst *inst = (fs_inst *)node;
2092
2093 if (inst->dst.file == GRF && !inst->has_side_effects()) {
2094 bool dead = true;
2095
2096 for (int i = 0; i < inst->regs_written; i++) {
2097 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2098 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2099 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2100 dead = false;
2101 break;
2102 }
2103 }
2104
2105 if (dead) {
2106 /* Don't dead code eliminate instructions that write to the
2107 * accumulator as a side-effect. Instead just set the destination
2108 * to the null register to free it.
2109 */
2110 switch (inst->opcode) {
2111 case BRW_OPCODE_ADDC:
2112 case BRW_OPCODE_SUBB:
2113 case BRW_OPCODE_MACH:
2114 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2115 break;
2116 default:
2117 inst->remove();
2118 progress = true;
2119 break;
2120 }
2121 }
2122 }
2123
2124 pc++;
2125 }
2126
2127 if (progress)
2128 invalidate_live_intervals();
2129
2130 return progress;
2131 }
2132
2133 struct dead_code_hash_key
2134 {
2135 int vgrf;
2136 int reg_offset;
2137 };
2138
2139 static bool
2140 dead_code_hash_compare(const void *a, const void *b)
2141 {
2142 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2143 }
2144
2145 static void
2146 clear_dead_code_hash(struct hash_table *ht)
2147 {
2148 struct hash_entry *entry;
2149
2150 hash_table_foreach(ht, entry) {
2151 _mesa_hash_table_remove(ht, entry);
2152 }
2153 }
2154
2155 static void
2156 insert_dead_code_hash(struct hash_table *ht,
2157 int vgrf, int reg_offset, fs_inst *inst)
2158 {
2159    /* We don't bother freeing keys; they're ralloc'ed off the ht, so they're freed with it. */
2160 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2161
2162 key->vgrf = vgrf;
2163 key->reg_offset = reg_offset;
2164
2165 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2166 }
2167
2168 static struct hash_entry *
2169 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2170 {
2171 struct dead_code_hash_key key;
2172
2173 key.vgrf = vgrf;
2174 key.reg_offset = reg_offset;
2175
2176 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2177 }
2178
2179 static void
2180 remove_dead_code_hash(struct hash_table *ht,
2181 int vgrf, int reg_offset)
2182 {
2183 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2184 if (!entry)
2185 return;
2186
2187 _mesa_hash_table_remove(ht, entry);
2188 }
2189
2190 /**
2191 * Walks basic blocks, removing any regs that are written but not read before
2192 * being redefined.
2193 *
2194 * The dead_code_eliminate() function implements a global dead code
2195  * elimination, but it only handles removing the last write to a register
2196 * if it's never read. This one can handle intermediate writes, but only
2197 * within a basic block.
2198 */
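/* A small sketch with invented registers, all within one basic block:
 *
 *    mov vgrf3:F, vgrf1:F
 *    mov vgrf3:F, vgrf2:F
 *    mov m2:F, vgrf3:F
 *
 * The second MOV completely overwrites vgrf3 before the first write is
 * ever read, so the first MOV is deleted even though vgrf3 stays live.
 */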
2199 bool
2200 fs_visitor::dead_code_eliminate_local()
2201 {
2202 struct hash_table *ht;
2203 bool progress = false;
2204
2205 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2206
2207 if (ht == NULL) {
2208 return false;
2209 }
2210
2211 foreach_list_safe(node, &this->instructions) {
2212 fs_inst *inst = (fs_inst *)node;
2213
2214       /* At a basic block boundary (any control flow instruction), empty the
2215        * HT, since we don't track dataflow across blocks.
2216 */
2217 if (inst->is_control_flow()) {
2218 clear_dead_code_hash(ht);
2219 continue;
2220 }
2221
2222 /* Clear the HT of any instructions that got read. */
2223 for (int i = 0; i < 3; i++) {
2224 fs_reg src = inst->src[i];
2225 if (src.file != GRF)
2226 continue;
2227
2228 int read = 1;
2229 if (inst->is_send_from_grf())
2230 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2231
2232 for (int reg_offset = src.reg_offset;
2233 reg_offset < src.reg_offset + read;
2234 reg_offset++) {
2235 remove_dead_code_hash(ht, src.reg, reg_offset);
2236 }
2237 }
2238
2239 /* Add any update of a GRF to the HT, removing a previous write if it
2240 * wasn't read.
2241 */
2242 if (inst->dst.file == GRF) {
2243 if (inst->regs_written > 1) {
2244 /* We don't know how to trim channels from an instruction's
2245 * writes, so we can't incrementally remove unread channels from
2246              * it.  Just remove whatever it overwrites from the table.
2247 */
2248 for (int i = 0; i < inst->regs_written; i++) {
2249 remove_dead_code_hash(ht,
2250 inst->dst.reg,
2251 inst->dst.reg_offset + i);
2252 }
2253 } else {
2254 struct hash_entry *entry =
2255 get_dead_code_hash_entry(ht, inst->dst.reg,
2256 inst->dst.reg_offset);
2257
2258 if (entry) {
2259 if (inst->is_partial_write()) {
2260 /* For a partial write, we can't remove any previous dead code
2261 * candidate, since we're just modifying their result.
2262 */
2263 } else {
2264 /* We're completely updating a channel, and there was a
2265 * previous write to the channel that wasn't read. Kill it!
2266 */
2267 fs_inst *inst = (fs_inst *)entry->data;
2268 inst->remove();
2269 progress = true;
2270 }
2271
2272 _mesa_hash_table_remove(ht, entry);
2273 }
2274
2275 if (!inst->has_side_effects())
2276 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2277 inst);
2278 }
2279 }
2280 }
2281
2282 _mesa_hash_table_destroy(ht, NULL);
2283
2284 if (progress)
2285 invalidate_live_intervals();
2286
2287 return progress;
2288 }
2289
2290 /**
2291 * Implements register coalescing: Checks if the two registers involved in a
2292 * raw move don't interfere, in which case they can both be stored in the same
2293 * place and the MOV removed.
2294 *
2295 * To do this, all uses of the source of the MOV in the shader are replaced
2296 * with the destination of the MOV. For example:
2297 *
2298 * add vgrf3:F, vgrf1:F, vgrf2:F
2299 * mov vgrf4:F, vgrf3:F
2300 * mul vgrf5:F, vgrf5:F, vgrf4:F
2301 *
2302 * becomes
2303 *
2304 * add vgrf4:F, vgrf1:F, vgrf2:F
2305 * mul vgrf5:F, vgrf5:F, vgrf4:F
2306 */
2307 bool
2308 fs_visitor::register_coalesce()
2309 {
2310 bool progress = false;
2311
2312 calculate_live_intervals();
2313
2314 int src_size = 0;
2315 int channels_remaining = 0;
2316 int reg_from = -1, reg_to = -1;
2317 int reg_to_offset[MAX_SAMPLER_MESSAGE_SIZE];
2318 fs_inst *mov[MAX_SAMPLER_MESSAGE_SIZE];
2319
2320 foreach_list(node, &this->instructions) {
2321 fs_inst *inst = (fs_inst *)node;
2322
2323 if (inst->opcode != BRW_OPCODE_MOV ||
2324 inst->is_partial_write() ||
2325 inst->saturate ||
2326 inst->src[0].file != GRF ||
2327 inst->src[0].negate ||
2328 inst->src[0].abs ||
2329 !inst->src[0].is_contiguous() ||
2330 inst->dst.file != GRF ||
2331 inst->dst.type != inst->src[0].type) {
2332 continue;
2333 }
2334
2335 if (virtual_grf_sizes[inst->src[0].reg] >
2336 virtual_grf_sizes[inst->dst.reg])
2337 continue;
2338
2339 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2340 int var_to = live_intervals->var_from_reg(&inst->dst);
2341
2342 if (live_intervals->vars_interfere(var_from, var_to) &&
2343 !inst->dst.equals(inst->src[0])) {
2344
2345 /* We know that the live ranges of A (var_from) and B (var_to)
2346 * interfere because of the ->vars_interfere() call above. If the end
2347 * of B's live range is after the end of A's range, then we know two
2348 * things:
2349 * - the start of B's live range must be in A's live range (since we
2350 * already know the two ranges interfere, this is the only remaining
2351 * possibility)
2352 * - the interference isn't of the form we're looking for (where B is
2353 * entirely inside A)
2354 */
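         /* A numeric illustration (the ip values are invented): if A's live
          * range is [10, 30] and B's is [20, 40], then end[B] > end[A] and
          * we give up on this MOV.  If B's range were [20, 28] instead, the
          * scan below walks ip 21..28 to prove that neither register is
          * overwritten in that span.
          */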
2355 if (live_intervals->end[var_to] > live_intervals->end[var_from])
2356 continue;
2357
2358 bool overwritten = false;
2359 int scan_ip = -1;
2360
2361 foreach_list(n, &this->instructions) {
2362 fs_inst *scan_inst = (fs_inst *)n;
2363 scan_ip++;
2364
2365 if (scan_inst->is_control_flow()) {
2366 overwritten = true;
2367 break;
2368 }
2369
2370 if (scan_ip <= live_intervals->start[var_to])
2371 continue;
2372
2373 if (scan_ip > live_intervals->end[var_to])
2374 break;
2375
2376 if (scan_inst->dst.equals(inst->dst) ||
2377 scan_inst->dst.equals(inst->src[0])) {
2378 overwritten = true;
2379 break;
2380 }
2381 }
2382
2383 if (overwritten)
2384 continue;
2385 }
2386
2387 if (reg_from != inst->src[0].reg) {
2388 reg_from = inst->src[0].reg;
2389
2390 src_size = virtual_grf_sizes[inst->src[0].reg];
2391 assert(src_size <= MAX_SAMPLER_MESSAGE_SIZE);
2392
2393 channels_remaining = src_size;
2394 memset(mov, 0, sizeof(mov));
2395
2396 reg_to = inst->dst.reg;
2397 }
2398
2399 if (reg_to != inst->dst.reg)
2400 continue;
2401
2402 const int offset = inst->src[0].reg_offset;
2403 reg_to_offset[offset] = inst->dst.reg_offset;
2404 mov[offset] = inst;
2405 channels_remaining--;
2406
2407 if (channels_remaining)
2408 continue;
2409
2410 bool removed = false;
2411 for (int i = 0; i < src_size; i++) {
2412 if (mov[i]) {
2413 removed = true;
2414
2415 mov[i]->opcode = BRW_OPCODE_NOP;
2416 mov[i]->conditional_mod = BRW_CONDITIONAL_NONE;
2417 mov[i]->dst = reg_undef;
2418 mov[i]->src[0] = reg_undef;
2419 mov[i]->src[1] = reg_undef;
2420 mov[i]->src[2] = reg_undef;
2421 }
2422 }
2423
2424 foreach_list(node, &this->instructions) {
2425 fs_inst *scan_inst = (fs_inst *)node;
2426
2427 for (int i = 0; i < src_size; i++) {
2428 if (mov[i]) {
2429 if (scan_inst->dst.file == GRF &&
2430 scan_inst->dst.reg == reg_from &&
2431 scan_inst->dst.reg_offset == i) {
2432 scan_inst->dst.reg = reg_to;
2433 scan_inst->dst.reg_offset = reg_to_offset[i];
2434 }
2435 for (int j = 0; j < 3; j++) {
2436 if (scan_inst->src[j].file == GRF &&
2437 scan_inst->src[j].reg == reg_from &&
2438 scan_inst->src[j].reg_offset == i) {
2439 scan_inst->src[j].reg = reg_to;
2440 scan_inst->src[j].reg_offset = reg_to_offset[i];
2441 }
2442 }
2443 }
2444 }
2445 }
2446
2447 if (removed) {
2448 live_intervals->start[var_to] = MIN2(live_intervals->start[var_to],
2449 live_intervals->start[var_from]);
2450 live_intervals->end[var_to] = MAX2(live_intervals->end[var_to],
2451 live_intervals->end[var_from]);
2452 reg_from = -1;
2453 }
2454 }
2455
2456 foreach_list_safe(node, &this->instructions) {
2457 fs_inst *inst = (fs_inst *)node;
2458
2459 if (inst->opcode == BRW_OPCODE_NOP) {
2460 inst->remove();
2461 progress = true;
2462 }
2463 }
2464
2465 if (progress)
2466 invalidate_live_intervals();
2467
2468 return progress;
2469 }
2470
2471 bool
2472 fs_visitor::compute_to_mrf()
2473 {
2474 bool progress = false;
2475 int next_ip = 0;
2476
2477 calculate_live_intervals();
2478
2479 foreach_list_safe(node, &this->instructions) {
2480 fs_inst *inst = (fs_inst *)node;
2481
2482 int ip = next_ip;
2483 next_ip++;
2484
2485 if (inst->opcode != BRW_OPCODE_MOV ||
2486 inst->is_partial_write() ||
2487 inst->dst.file != MRF || inst->src[0].file != GRF ||
2488 inst->dst.type != inst->src[0].type ||
2489 inst->src[0].abs || inst->src[0].negate ||
2490 !inst->src[0].is_contiguous() ||
2491 inst->src[0].subreg_offset)
2492 continue;
2493
2494 /* Work out which hardware MRF registers are written by this
2495 * instruction.
2496 */
2497 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2498 int mrf_high;
2499 if (inst->dst.reg & BRW_MRF_COMPR4) {
2500 mrf_high = mrf_low + 4;
2501 } else if (dispatch_width == 16 &&
2502 (!inst->force_uncompressed && !inst->force_sechalf)) {
2503 mrf_high = mrf_low + 1;
2504 } else {
2505 mrf_high = mrf_low;
2506 }
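      /* Worked example of the ranges computed above (register numbers are
       * illustrative): a COMPR4 write to m2 is tracked as touching m2 and
       * m6 (mrf_low + 4); a compressed SIMD16 write to m2 touches m2 and
       * m3; a SIMD8 or force_uncompressed/force_sechalf write touches m2
       * only.
       */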
2507
2508 /* Can't compute-to-MRF this GRF if someone else was going to
2509 * read it later.
2510 */
2511 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2512 continue;
2513
2514 /* Found a move of a GRF to a MRF. Let's see if we can go
2515 * rewrite the thing that made this GRF to write into the MRF.
2516 */
2517 fs_inst *scan_inst;
2518 for (scan_inst = (fs_inst *)inst->prev;
2519 scan_inst->prev != NULL;
2520 scan_inst = (fs_inst *)scan_inst->prev) {
2521 if (scan_inst->dst.file == GRF &&
2522 scan_inst->dst.reg == inst->src[0].reg) {
2523 /* Found the last thing to write our reg we want to turn
2524 * into a compute-to-MRF.
2525 */
2526
2527 /* If this one instruction didn't populate all the
2528 * channels, bail. We might be able to rewrite everything
2529 * that writes that reg, but it would require smarter
2530 * tracking to delay the rewriting until complete success.
2531 */
2532 if (scan_inst->is_partial_write())
2533 break;
2534
2535             /* Instructions writing more than one register would require us to
2536              * understand coalescing more than one MOV at a time.
2537 */
2538 if (scan_inst->regs_written > 1)
2539 break;
2540
2541 /* SEND instructions can't have MRF as a destination. */
2542 if (scan_inst->mlen)
2543 break;
2544
2545 if (brw->gen == 6) {
2546 /* gen6 math instructions must have the destination be
2547 * GRF, so no compute-to-MRF for them.
2548 */
2549 if (scan_inst->is_math()) {
2550 break;
2551 }
2552 }
2553
2554 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2555 /* Found the creator of our MRF's source value. */
2556 scan_inst->dst.file = MRF;
2557 scan_inst->dst.reg = inst->dst.reg;
2558 scan_inst->saturate |= inst->saturate;
2559 inst->remove();
2560 progress = true;
2561 }
2562 break;
2563 }
2564
2565 /* We don't handle control flow here. Most computation of
2566 * values that end up in MRFs are shortly before the MRF
2567 * write anyway.
2568 */
2569 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2570 break;
2571
2572 /* You can't read from an MRF, so if someone else reads our
2573 * MRF's source GRF that we wanted to rewrite, that stops us.
2574 */
2575 bool interfered = false;
2576 for (int i = 0; i < 3; i++) {
2577 if (scan_inst->src[i].file == GRF &&
2578 scan_inst->src[i].reg == inst->src[0].reg &&
2579 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2580 interfered = true;
2581 }
2582 }
2583 if (interfered)
2584 break;
2585
2586 if (scan_inst->dst.file == MRF) {
2587 /* If somebody else writes our MRF here, we can't
2588 * compute-to-MRF before that.
2589 */
2590 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2591 int scan_mrf_high;
2592
2593 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2594 scan_mrf_high = scan_mrf_low + 4;
2595 } else if (dispatch_width == 16 &&
2596 (!scan_inst->force_uncompressed &&
2597 !scan_inst->force_sechalf)) {
2598 scan_mrf_high = scan_mrf_low + 1;
2599 } else {
2600 scan_mrf_high = scan_mrf_low;
2601 }
2602
2603 if (mrf_low == scan_mrf_low ||
2604 mrf_low == scan_mrf_high ||
2605 mrf_high == scan_mrf_low ||
2606 mrf_high == scan_mrf_high) {
2607 break;
2608 }
2609 }
2610
2611 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2612 /* Found a SEND instruction, which means that there are
2613 * live values in MRFs from base_mrf to base_mrf +
2614 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2615 * above it.
2616 */
2617 if (mrf_low >= scan_inst->base_mrf &&
2618 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2619 break;
2620 }
2621 if (mrf_high >= scan_inst->base_mrf &&
2622 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2623 break;
2624 }
2625 }
2626 }
2627 }
2628
2629 if (progress)
2630 invalidate_live_intervals();
2631
2632 return progress;
2633 }
2634
2635 /**
2636 * Walks through basic blocks, looking for repeated MRF writes and
2637 * removing the later ones.
2638 */
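/* For instance (made-up registers), in a block containing
 *
 *    mov m3:F, vgrf2:F
 *    ...no intervening write to m3 or to vgrf2...
 *    mov m3:F, vgrf2:F
 *
 * the second MOV matches the tracked last write to m3 and is removed.
 */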
2639 bool
2640 fs_visitor::remove_duplicate_mrf_writes()
2641 {
2642 fs_inst *last_mrf_move[16];
2643 bool progress = false;
2644
2645 /* Need to update the MRF tracking for compressed instructions. */
2646 if (dispatch_width == 16)
2647 return false;
2648
2649 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2650
2651 foreach_list_safe(node, &this->instructions) {
2652 fs_inst *inst = (fs_inst *)node;
2653
2654 if (inst->is_control_flow()) {
2655 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2656 }
2657
2658 if (inst->opcode == BRW_OPCODE_MOV &&
2659 inst->dst.file == MRF) {
2660 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2661 if (prev_inst && inst->equals(prev_inst)) {
2662 inst->remove();
2663 progress = true;
2664 continue;
2665 }
2666 }
2667
2668 /* Clear out the last-write records for MRFs that were overwritten. */
2669 if (inst->dst.file == MRF) {
2670 last_mrf_move[inst->dst.reg] = NULL;
2671 }
2672
2673 if (inst->mlen > 0 && inst->base_mrf != -1) {
2674 /* Found a SEND instruction, which will include two or fewer
2675 * implied MRF writes. We could do better here.
2676 */
2677 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2678 last_mrf_move[inst->base_mrf + i] = NULL;
2679 }
2680 }
2681
2682 /* Clear out any MRF move records whose sources got overwritten. */
2683 if (inst->dst.file == GRF) {
2684 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2685 if (last_mrf_move[i] &&
2686 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2687 last_mrf_move[i] = NULL;
2688 }
2689 }
2690 }
2691
2692 if (inst->opcode == BRW_OPCODE_MOV &&
2693 inst->dst.file == MRF &&
2694 inst->src[0].file == GRF &&
2695 !inst->is_partial_write()) {
2696 last_mrf_move[inst->dst.reg] = inst;
2697 }
2698 }
2699
2700 if (progress)
2701 invalidate_live_intervals();
2702
2703 return progress;
2704 }
2705
2706 static void
2707 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2708 int first_grf, int grf_len)
2709 {
2710 bool inst_simd16 = (dispatch_width > 8 &&
2711 !inst->force_uncompressed &&
2712 !inst->force_sechalf);
2713
2714 /* Clear the flag for registers that actually got read (as expected). */
2715 for (int i = 0; i < 3; i++) {
2716 int grf;
2717 if (inst->src[i].file == GRF) {
2718 grf = inst->src[i].reg;
2719 } else if (inst->src[i].file == HW_REG &&
2720 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2721 grf = inst->src[i].fixed_hw_reg.nr;
2722 } else {
2723 continue;
2724 }
2725
2726 if (grf >= first_grf &&
2727 grf < first_grf + grf_len) {
2728 deps[grf - first_grf] = false;
2729 if (inst_simd16)
2730 deps[grf - first_grf + 1] = false;
2731 }
2732 }
2733 }
2734
2735 /**
2736 * Implements this workaround for the original 965:
2737 *
2738 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2739 * check for post destination dependencies on this instruction, software
2740 * must ensure that there is no destination hazard for the case of ‘write
2741 * followed by a posted write’ shown in the following example.
2742 *
2743 * 1. mov r3 0
2744 * 2. send r3.xy <rest of send instruction>
2745 * 3. mov r2 r3
2746 *
2747 * Due to no post-destination dependency check on the ‘send’, the above
2748 * code sequence could have two instructions (1 and 2) in flight at the
2749 * same time that both consider ‘r3’ as the target of their final writes.
2750 */
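/* The pass below resolves the hazard by inserting a dependency-resolving
 * MOV (DEP_RESOLVE_MOV) on the affected register ahead of the send, so the
 * sketch above roughly becomes:
 *
 *    1. mov r3 0
 *       DEP_RESOLVE_MOV(r3)
 *    2. send r3.xy <rest of send instruction>
 *    3. mov r2 r3
 *
 * (Illustrative only; the exact insertion points come from the backwards
 * scan in the function.)
 */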
2751 void
2752 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2753 {
2754 int reg_size = dispatch_width / 8;
2755 int write_len = inst->regs_written * reg_size;
2756 int first_write_grf = inst->dst.reg;
2757 bool needs_dep[BRW_MAX_MRF];
2758 assert(write_len < (int)sizeof(needs_dep) - 1);
2759
2760 memset(needs_dep, false, sizeof(needs_dep));
2761 memset(needs_dep, true, write_len);
2762
2763 clear_deps_for_inst_src(inst, dispatch_width,
2764 needs_dep, first_write_grf, write_len);
2765
2766 /* Walk backwards looking for writes to registers we're writing which
2767 * aren't read since being written. If we hit the start of the program,
2768 * we assume that there are no outstanding dependencies on entry to the
2769 * program.
2770 */
2771 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2772 scan_inst != NULL;
2773 scan_inst = (fs_inst *)scan_inst->prev) {
2774
2775 /* If we hit control flow, assume that there *are* outstanding
2776 * dependencies, and force their cleanup before our instruction.
2777 */
2778 if (scan_inst->is_control_flow()) {
2779 for (int i = 0; i < write_len; i++) {
2780 if (needs_dep[i]) {
2781 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2782 }
2783 }
2784 return;
2785 }
2786
2787 bool scan_inst_simd16 = (dispatch_width > 8 &&
2788 !scan_inst->force_uncompressed &&
2789 !scan_inst->force_sechalf);
2790
2791       /* We insert our reads as late as possible on the assumption that any
2792        * instruction that might have left us an outstanding dependency (other
2793        * than a MOV) has more latency than a MOV.
2794 */
2795 if (scan_inst->dst.file == GRF) {
2796 for (int i = 0; i < scan_inst->regs_written; i++) {
2797 int reg = scan_inst->dst.reg + i * reg_size;
2798
2799 if (reg >= first_write_grf &&
2800 reg < first_write_grf + write_len &&
2801 needs_dep[reg - first_write_grf]) {
2802 inst->insert_before(DEP_RESOLVE_MOV(reg));
2803 needs_dep[reg - first_write_grf] = false;
2804 if (scan_inst_simd16)
2805 needs_dep[reg - first_write_grf + 1] = false;
2806 }
2807 }
2808 }
2809
2810 /* Clear the flag for registers that actually got read (as expected). */
2811 clear_deps_for_inst_src(scan_inst, dispatch_width,
2812 needs_dep, first_write_grf, write_len);
2813
2814 /* Continue the loop only if we haven't resolved all the dependencies */
2815 int i;
2816 for (i = 0; i < write_len; i++) {
2817 if (needs_dep[i])
2818 break;
2819 }
2820 if (i == write_len)
2821 return;
2822 }
2823 }
2824
2825 /**
2826 * Implements this workaround for the original 965:
2827 *
2828 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2829 * used as a destination register until after it has been sourced by an
2830 * instruction with a different destination register.
2831 */
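/* Sketch of the hazard and the fix (register numbers invented):
 *
 *    send r3 <...>             (posted write to r3)
 *    DEP_RESOLVE_MOV(r3)       (inserted: r3 gets sourced first)
 *    mov  r3 r4                (now r3 may be used as a destination)
 *
 * The forward scan below decides where such resolves are actually needed.
 */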
2832 void
2833 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2834 {
2835 int write_len = inst->regs_written * dispatch_width / 8;
2836 int first_write_grf = inst->dst.reg;
2837 bool needs_dep[BRW_MAX_MRF];
2838 assert(write_len < (int)sizeof(needs_dep) - 1);
2839
2840 memset(needs_dep, false, sizeof(needs_dep));
2841 memset(needs_dep, true, write_len);
2842 /* Walk forwards looking for writes to registers we're writing which aren't
2843 * read before being written.
2844 */
2845 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2846 !scan_inst->is_tail_sentinel();
2847 scan_inst = (fs_inst *)scan_inst->next) {
2848 /* If we hit control flow, force resolve all remaining dependencies. */
2849 if (scan_inst->is_control_flow()) {
2850 for (int i = 0; i < write_len; i++) {
2851 if (needs_dep[i])
2852 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2853 }
2854 return;
2855 }
2856
2857 /* Clear the flag for registers that actually got read (as expected). */
2858 clear_deps_for_inst_src(scan_inst, dispatch_width,
2859 needs_dep, first_write_grf, write_len);
2860
2861 /* We insert our reads as late as possible since they're reading the
2862 * result of a SEND, which has massive latency.
2863 */
2864 if (scan_inst->dst.file == GRF &&
2865 scan_inst->dst.reg >= first_write_grf &&
2866 scan_inst->dst.reg < first_write_grf + write_len &&
2867 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2868 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2869 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2870 }
2871
2872 /* Continue the loop only if we haven't resolved all the dependencies */
2873 int i;
2874 for (i = 0; i < write_len; i++) {
2875 if (needs_dep[i])
2876 break;
2877 }
2878 if (i == write_len)
2879 return;
2880 }
2881
2882 /* If we hit the end of the program, resolve all remaining dependencies out
2883 * of paranoia.
2884 */
2885 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2886 assert(last_inst->eot);
2887 for (int i = 0; i < write_len; i++) {
2888 if (needs_dep[i])
2889 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2890 }
2891 }
2892
2893 void
2894 fs_visitor::insert_gen4_send_dependency_workarounds()
2895 {
2896 if (brw->gen != 4 || brw->is_g4x)
2897 return;
2898
2899 /* Note that we're done with register allocation, so GRF fs_regs always
2900 * have a .reg_offset of 0.
2901 */
2902
2903 foreach_list_safe(node, &this->instructions) {
2904 fs_inst *inst = (fs_inst *)node;
2905
2906 if (inst->mlen != 0 && inst->dst.file == GRF) {
2907 insert_gen4_pre_send_dependency_workarounds(inst);
2908 insert_gen4_post_send_dependency_workarounds(inst);
2909 }
2910 }
2911 }
2912
2913 /**
2914 * Turns the generic expression-style uniform pull constant load instruction
2915 * into a hardware-specific series of instructions for loading a pull
2916 * constant.
2917 *
2918 * The expression style allows the CSE pass before this to optimize out
2919 * repeated loads from the same offset, and gives the pre-register-allocation
2920 * scheduling full flexibility, while the conversion to native instructions
2921 * allows the post-register-allocation scheduler the best information
2922 * possible.
2923 *
2924 * Note that execution masking for setting up pull constant loads is special:
2925 * the channels that need to be written are unrelated to the current execution
2926 * mask, since a later instruction will use one of the result channels as a
2927 * source operand for all 8 or 16 of its channels.
2928 */
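/* On Gen7+, the lowering below turns (registers are illustrative)
 *
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD vgrf6:F, <pull surface>, 32u
 *
 * into
 *
 *    FS_OPCODE_SET_SIMD4X2_OFFSET vgrf7:UD, 8u   (byte offset / 4)
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7 vgrf6:F, <pull surface>, vgrf7:UD
 *
 * On older generations it instead just picks base_mrf/mlen so the
 * generator can emit the pre-Gen7 message.
 */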
2929 void
2930 fs_visitor::lower_uniform_pull_constant_loads()
2931 {
2932 foreach_list(node, &this->instructions) {
2933 fs_inst *inst = (fs_inst *)node;
2934
2935 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2936 continue;
2937
2938 if (brw->gen >= 7) {
2939 /* The offset arg before was a vec4-aligned byte offset. We need to
2940 * turn it into a dword offset.
2941 */
2942 fs_reg const_offset_reg = inst->src[1];
2943 assert(const_offset_reg.file == IMM &&
2944 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2945 const_offset_reg.imm.u /= 4;
2946 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2947
2948 /* This is actually going to be a MOV, but since only the first dword
2949 * is accessed, we have a special opcode to do just that one. Note
2950 * that this needs to be an operation that will be considered a def
2951 * by live variable analysis, or register allocation will explode.
2952 */
2953 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2954 payload, const_offset_reg);
2955 setup->force_writemask_all = true;
2956
2957 setup->ir = inst->ir;
2958 setup->annotation = inst->annotation;
2959 inst->insert_before(setup);
2960
2961 /* Similarly, this will only populate the first 4 channels of the
2962 * result register (since we only use smear values from 0-3), but we
2963 * don't tell the optimizer.
2964 */
2965 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2966 inst->src[1] = payload;
2967
2968 invalidate_live_intervals();
2969 } else {
2970 /* Before register allocation, we didn't tell the scheduler about the
2971 * MRF we use. We know it's safe to use this MRF because nothing
2972 * else does except for register spill/unspill, which generates and
2973 * uses its MRF within a single IR instruction.
2974 */
2975 inst->base_mrf = 14;
2976 inst->mlen = 1;
2977 }
2978 }
2979 }
2980
2981 void
2982 fs_visitor::dump_instructions()
2983 {
2984 calculate_register_pressure();
2985
2986 int ip = 0, max_pressure = 0;
2987 foreach_list(node, &this->instructions) {
2988 backend_instruction *inst = (backend_instruction *)node;
2989 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2990 fprintf(stderr, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2991 dump_instruction(inst);
2992 ++ip;
2993 }
2994 fprintf(stderr, "Maximum %3d registers live at once.\n", max_pressure);
2995 }
2996
2997 void
2998 fs_visitor::dump_instruction(backend_instruction *be_inst)
2999 {
3000 fs_inst *inst = (fs_inst *)be_inst;
3001
3002 if (inst->predicate) {
3003 fprintf(stderr, "(%cf0.%d) ",
3004 inst->predicate_inverse ? '-' : '+',
3005 inst->flag_subreg);
3006 }
3007
3008 fprintf(stderr, "%s", brw_instruction_name(inst->opcode));
3009 if (inst->saturate)
3010 fprintf(stderr, ".sat");
3011 if (inst->conditional_mod) {
3012 fprintf(stderr, "%s", conditional_modifier[inst->conditional_mod]);
3013 if (!inst->predicate &&
3014 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3015 inst->opcode != BRW_OPCODE_IF &&
3016 inst->opcode != BRW_OPCODE_WHILE))) {
3017 fprintf(stderr, ".f0.%d", inst->flag_subreg);
3018 }
3019 }
3020 fprintf(stderr, " ");
3021
3022
3023 switch (inst->dst.file) {
3024 case GRF:
3025 fprintf(stderr, "vgrf%d", inst->dst.reg);
3026 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
3027 inst->dst.subreg_offset)
3028 fprintf(stderr, "+%d.%d",
3029 inst->dst.reg_offset, inst->dst.subreg_offset);
3030 break;
3031 case MRF:
3032 fprintf(stderr, "m%d", inst->dst.reg);
3033 break;
3034 case BAD_FILE:
3035 fprintf(stderr, "(null)");
3036 break;
3037 case UNIFORM:
3038 fprintf(stderr, "***u%d***", inst->dst.reg);
3039 break;
3040 case HW_REG:
3041 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3042 switch (inst->dst.fixed_hw_reg.nr) {
3043 case BRW_ARF_NULL:
3044 fprintf(stderr, "null");
3045 break;
3046 case BRW_ARF_ADDRESS:
3047 fprintf(stderr, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3048 break;
3049 case BRW_ARF_ACCUMULATOR:
3050 fprintf(stderr, "acc%d", inst->dst.fixed_hw_reg.subnr);
3051 break;
3052 case BRW_ARF_FLAG:
3053 fprintf(stderr, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3054 inst->dst.fixed_hw_reg.subnr);
3055 break;
3056 default:
3057 fprintf(stderr, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3058 inst->dst.fixed_hw_reg.subnr);
3059 break;
3060 }
3061 } else {
3062 fprintf(stderr, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3063 }
3064 if (inst->dst.fixed_hw_reg.subnr)
3065 fprintf(stderr, "+%d", inst->dst.fixed_hw_reg.subnr);
3066 break;
3067 default:
3068 fprintf(stderr, "???");
3069 break;
3070 }
3071 fprintf(stderr, ":%s, ", reg_encoding[inst->dst.type]);
3072
3073 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
3074 if (inst->src[i].negate)
3075 fprintf(stderr, "-");
3076 if (inst->src[i].abs)
3077 fprintf(stderr, "|");
3078 switch (inst->src[i].file) {
3079 case GRF:
3080 fprintf(stderr, "vgrf%d", inst->src[i].reg);
3081 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3082 inst->src[i].subreg_offset)
3083 fprintf(stderr, "+%d.%d", inst->src[i].reg_offset,
3084 inst->src[i].subreg_offset);
3085 break;
3086 case MRF:
3087 fprintf(stderr, "***m%d***", inst->src[i].reg);
3088 break;
3089 case UNIFORM:
3090 fprintf(stderr, "u%d", inst->src[i].reg);
3091 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3092 inst->src[i].subreg_offset)
3093 fprintf(stderr, "+%d.%d", inst->src[i].reg_offset,
3094 inst->src[i].subreg_offset);
3095 break;
3096 case BAD_FILE:
3097 fprintf(stderr, "(null)");
3098 break;
3099 case IMM:
3100 switch (inst->src[i].type) {
3101 case BRW_REGISTER_TYPE_F:
3102 fprintf(stderr, "%ff", inst->src[i].imm.f);
3103 break;
3104 case BRW_REGISTER_TYPE_D:
3105 fprintf(stderr, "%dd", inst->src[i].imm.i);
3106 break;
3107 case BRW_REGISTER_TYPE_UD:
3108 fprintf(stderr, "%uu", inst->src[i].imm.u);
3109 break;
3110 default:
3111 fprintf(stderr, "???");
3112 break;
3113 }
3114 break;
3115 case HW_REG:
3116 if (inst->src[i].fixed_hw_reg.negate)
3117 fprintf(stderr, "-");
3118 if (inst->src[i].fixed_hw_reg.abs)
3119 fprintf(stderr, "|");
3120 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3121 switch (inst->src[i].fixed_hw_reg.nr) {
3122 case BRW_ARF_NULL:
3123 fprintf(stderr, "null");
3124 break;
3125 case BRW_ARF_ADDRESS:
3126 fprintf(stderr, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3127 break;
3128 case BRW_ARF_ACCUMULATOR:
3129 fprintf(stderr, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3130 break;
3131 case BRW_ARF_FLAG:
3132 fprintf(stderr, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3133 inst->src[i].fixed_hw_reg.subnr);
3134 break;
3135 default:
3136 fprintf(stderr, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3137 inst->src[i].fixed_hw_reg.subnr);
3138 break;
3139 }
3140 } else {
3141 fprintf(stderr, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3142 }
3143 if (inst->src[i].fixed_hw_reg.subnr)
3144 fprintf(stderr, "+%d", inst->src[i].fixed_hw_reg.subnr);
3145 if (inst->src[i].fixed_hw_reg.abs)
3146 fprintf(stderr, "|");
3147 break;
3148 default:
3149 fprintf(stderr, "???");
3150 break;
3151 }
3152 if (inst->src[i].abs)
3153 fprintf(stderr, "|");
3154
3155 if (inst->src[i].file != IMM) {
3156 fprintf(stderr, ":%s", brw_reg_type_letters(inst->src[i].type));
3157 }
3158
3159 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
3160 fprintf(stderr, ", ");
3161 }
3162
3163 fprintf(stderr, " ");
3164
3165 if (inst->force_uncompressed)
3166 fprintf(stderr, "1sthalf ");
3167
3168 if (inst->force_sechalf)
3169 fprintf(stderr, "2ndhalf ");
3170
3171 fprintf(stderr, "\n");
3172 }
3173
3174 /**
3175 * Possibly returns an instruction that set up @param reg.
3176 *
3177 * Sometimes we want to take the result of some expression/variable
3178 * dereference tree and rewrite the instruction generating the result
3179 * of the tree. When processing the tree, we know that the
3180 * instructions generated are all writing temporaries that are dead
3181 * outside of this tree. So, if we have some instructions that write
3182 * a temporary, we're free to point that temp write somewhere else.
3183 *
3184  * Note that this doesn't guarantee that the returned instruction wrote
3185  * only @param reg -- it might be the size=4 destination of a texture instruction.
3186 */
3187 fs_inst *
3188 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3189 fs_inst *end,
3190 const fs_reg &reg)
3191 {
3192 if (end == start ||
3193 end->is_partial_write() ||
3194 reg.reladdr ||
3195 !reg.equals(end->dst)) {
3196 return NULL;
3197 } else {
3198 return end;
3199 }
3200 }
3201
3202 void
3203 fs_visitor::setup_payload_gen6()
3204 {
3205 bool uses_depth =
3206 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3207 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3208
3209 assert(brw->gen >= 6);
3210
3211 /* R0-1: masks, pixel X/Y coordinates. */
3212 c->nr_payload_regs = 2;
3213    /* R2: only for 32-pixel dispatch. */
3214
3215 /* R3-26: barycentric interpolation coordinates. These appear in the
3216 * same order that they appear in the brw_wm_barycentric_interp_mode
3217 * enum. Each set of coordinates occupies 2 registers if dispatch width
3218 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3219 * appear if they were enabled using the "Barycentric Interpolation
3220 * Mode" bits in WM_STATE.
3221 */
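   /* Worked example of the counting below (the enabled modes are chosen
    * hypothetically): with two barycentric modes enabled, nr_payload_regs
    * grows from 2 to 6 in SIMD8 (2 registers per mode) and to 10 in SIMD16
    * (4 per mode), and barycentric_coord_reg[] records where each set
    * starts.
    */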
3222 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3223 if (barycentric_interp_modes & (1 << i)) {
3224 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3225 c->nr_payload_regs += 2;
3226 if (dispatch_width == 16) {
3227 c->nr_payload_regs += 2;
3228 }
3229 }
3230 }
3231
3232 /* R27: interpolated depth if uses source depth */
3233 if (uses_depth) {
3234 c->source_depth_reg = c->nr_payload_regs;
3235 c->nr_payload_regs++;
3236 if (dispatch_width == 16) {
3237 /* R28: interpolated depth if not SIMD8. */
3238 c->nr_payload_regs++;
3239 }
3240 }
3241 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3242 if (uses_depth) {
3243 c->source_w_reg = c->nr_payload_regs;
3244 c->nr_payload_regs++;
3245 if (dispatch_width == 16) {
3246 /* R30: interpolated W if not SIMD8. */
3247 c->nr_payload_regs++;
3248 }
3249 }
3250
3251 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3252 /* R31: MSAA position offsets. */
3253 if (c->prog_data.uses_pos_offset) {
3254 c->sample_pos_reg = c->nr_payload_regs;
3255 c->nr_payload_regs++;
3256 }
3257
3258 /* R32: MSAA input coverage mask */
3259 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3260 assert(brw->gen >= 7);
3261 c->sample_mask_reg = c->nr_payload_regs;
3262 c->nr_payload_regs++;
3263 if (dispatch_width == 16) {
3264 /* R33: input coverage mask if not SIMD8. */
3265 c->nr_payload_regs++;
3266 }
3267 }
3268
3269 /* R34-: bary for 32-pixel. */
3270 /* R58-59: interp W for 32-pixel. */
3271
3272 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3273 c->source_depth_to_render_target = true;
3274 }
3275 }
3276
3277 void
3278 fs_visitor::assign_binding_table_offsets()
3279 {
3280 uint32_t next_binding_table_offset = 0;
3281
3282 /* If there are no color regions, we still perform an FB write to a null
3283 * renderbuffer, which we place at surface index 0.
3284 */
3285 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3286 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3287
3288 assign_common_binding_table_offsets(next_binding_table_offset);
3289 }
3290
3291 void
3292 fs_visitor::calculate_register_pressure()
3293 {
3294 calculate_live_intervals();
3295
3296 int num_instructions = 0;
3297 foreach_list(node, &this->instructions) {
3298 ++num_instructions;
3299 }
3300
3301 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3302
3303 for (int reg = 0; reg < virtual_grf_count; reg++) {
3304 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3305 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3306 }
3307 }
3308
3309 /**
3310 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3311 *
3312 * The needs_unlit_centroid_workaround ends up producing one of these per
3313 * channel of centroid input, so it's good to clean them up.
3314 *
3315 * An assumption here is that nothing ever modifies the dispatched pixels
3316 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3317 * dictates that anyway.
3318 */
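/* Sketch: within one basic block,
 *
 *    FS_OPCODE_MOV_DISPATCH_TO_FLAGS (f0.1)
 *    ...no intervening flag write...
 *    FS_OPCODE_MOV_DISPATCH_TO_FLAGS (f0.1)
 *
 * the second instruction is dropped.  Control flow, or any other write to
 * that flag, resets the tracking and the duplicate is kept.
 */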
3319 void
3320 fs_visitor::opt_drop_redundant_mov_to_flags()
3321 {
3322 bool flag_mov_found[2] = {false};
3323
3324 foreach_list_safe(node, &this->instructions) {
3325 fs_inst *inst = (fs_inst *)node;
3326
3327 if (inst->is_control_flow()) {
3328 memset(flag_mov_found, 0, sizeof(flag_mov_found));
3329 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3330 if (!flag_mov_found[inst->flag_subreg])
3331 flag_mov_found[inst->flag_subreg] = true;
3332 else
3333 inst->remove();
3334 } else if (inst->writes_flag()) {
3335 flag_mov_found[inst->flag_subreg] = false;
3336 }
3337 }
3338 }
3339
3340 bool
3341 fs_visitor::run()
3342 {
3343 sanity_param_count = fp->Base.Parameters->NumParameters;
3344 bool allocated_without_spills;
3345
3346 assign_binding_table_offsets();
3347
3348 if (brw->gen >= 6)
3349 setup_payload_gen6();
3350 else
3351 setup_payload_gen4();
3352
3353 if (0) {
3354 emit_dummy_fs();
3355 } else {
3356 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3357 emit_shader_time_begin();
3358
3359 calculate_urb_setup();
3360 if (fp->Base.InputsRead > 0) {
3361 if (brw->gen < 6)
3362 emit_interpolation_setup_gen4();
3363 else
3364 emit_interpolation_setup_gen6();
3365 }
3366
3367 /* We handle discards by keeping track of the still-live pixels in f0.1.
3368 * Initialize it with the dispatched pixels.
3369 */
3370 if (fp->UsesKill || c->key.alpha_test_func) {
3371 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3372 discard_init->flag_subreg = 1;
3373 }
3374
3375       /* Generate FS IR for main().  (The visitor only descends into
3376        * functions called "main".)
3377 */
3378 if (shader) {
3379 foreach_list(node, &*shader->base.ir) {
3380 ir_instruction *ir = (ir_instruction *)node;
3381 base_ir = ir;
3382 this->result = reg_undef;
3383 ir->accept(this);
3384 }
3385 } else {
3386 emit_fragment_program_code();
3387 }
3388 base_ir = NULL;
3389 if (failed)
3390 return false;
3391
3392 emit(FS_OPCODE_PLACEHOLDER_HALT);
3393
3394 if (c->key.alpha_test_func)
3395 emit_alpha_test();
3396
3397 emit_fb_writes();
3398
3399 split_virtual_grfs();
3400
3401 move_uniform_array_access_to_pull_constants();
3402 remove_dead_constants();
3403 setup_pull_constants();
3404
3405 opt_drop_redundant_mov_to_flags();
3406
3407 bool progress;
3408 do {
3409 progress = false;
3410
3411 compact_virtual_grfs();
3412
3413 progress = remove_duplicate_mrf_writes() || progress;
3414
3415 progress = opt_algebraic() || progress;
3416 progress = opt_cse() || progress;
3417 progress = opt_copy_propagate() || progress;
3418 progress = opt_peephole_predicated_break() || progress;
3419 progress = dead_code_eliminate() || progress;
3420 progress = dead_code_eliminate_local() || progress;
3421 progress = opt_peephole_sel() || progress;
3422 progress = dead_control_flow_eliminate(this) || progress;
3423 progress = opt_saturate_propagation() || progress;
3424 progress = register_coalesce() || progress;
3425 progress = compute_to_mrf() || progress;
3426 } while (progress);
3427
3428 lower_uniform_pull_constant_loads();
3429
3430 assign_curb_setup();
3431 assign_urb_setup();
3432
3433 static enum instruction_scheduler_mode pre_modes[] = {
3434 SCHEDULE_PRE,
3435 SCHEDULE_PRE_NON_LIFO,
3436 SCHEDULE_PRE_LIFO,
3437 };
3438
3439 /* Try each scheduling heuristic to see if it can successfully register
3440 * allocate without spilling. They should be ordered by decreasing
3441 * performance but increasing likelihood of allocating.
3442 */
3443 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3444 schedule_instructions(pre_modes[i]);
3445
3446 if (0) {
3447 assign_regs_trivial();
3448 allocated_without_spills = true;
3449 } else {
3450 allocated_without_spills = assign_regs(false);
3451 }
3452 if (allocated_without_spills)
3453 break;
3454 }
3455
3456 if (!allocated_without_spills) {
3457 /* We assume that any spilling is worse than just dropping back to
3458 * SIMD8. There's probably actually some intermediate point where
3459 * SIMD16 with a couple of spills is still better.
3460 */
3461 if (dispatch_width == 16) {
3462 fail("Failure to register allocate. Reduce number of "
3463 "live scalar values to avoid this.");
3464 }
3465
3466 /* Since we're out of heuristics, just go spill registers until we
3467 * get an allocation.
3468 */
3469 while (!assign_regs(true)) {
3470 if (failed)
3471 break;
3472 }
3473 }
3474 }
3475 assert(force_uncompressed_stack == 0);
3476
3477 /* This must come after all optimization and register allocation, since
3478 * it inserts dead code that happens to have side effects, and it does
3479 * so based on the actual physical registers in use.
3480 */
3481 insert_gen4_send_dependency_workarounds();
3482
3483 if (failed)
3484 return false;
3485
3486 if (!allocated_without_spills)
3487 schedule_instructions(SCHEDULE_POST);
3488
3489 if (dispatch_width == 8)
3490 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3491 else
3492 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3493
3494 /* If any state parameters were appended, then ParameterValues could have
3495 * been realloced, in which case the driver uniform storage set up by
3496 * _mesa_associate_uniform_storage() would point to freed memory. Make
3497 * sure that didn't happen.
3498 */
3499 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3500
3501 return !failed;
3502 }
3503
3504 const unsigned *
3505 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3506 struct gl_fragment_program *fp,
3507 struct gl_shader_program *prog,
3508 unsigned *final_assembly_size)
3509 {
3510 bool start_busy = false;
3511 double start_time = 0;
3512
3513 if (unlikely(brw->perf_debug)) {
3514 start_busy = (brw->batch.last_bo &&
3515 drm_intel_bo_busy(brw->batch.last_bo));
3516 start_time = get_time();
3517 }
3518
3519 struct brw_shader *shader = NULL;
3520 if (prog)
3521 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3522
3523 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3524 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3525
3526 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3527 */
3528 fs_visitor v(brw, c, prog, fp, 8);
3529 if (!v.run()) {
3530 if (prog) {
3531 prog->LinkStatus = false;
3532 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3533 }
3534
3535 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3536 v.fail_msg);
3537
3538 return NULL;
3539 }
3540
3541 exec_list *simd16_instructions = NULL;
3542 fs_visitor v2(brw, c, prog, fp, 16);
3543 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3544 if (c->prog_data.base.nr_pull_params == 0) {
3545 /* Try a SIMD16 compile */
3546 v2.import_uniforms(&v);
3547 if (!v2.run()) {
3548 perf_debug("SIMD16 shader failed to compile, falling back to "
3549 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3550 } else {
3551 simd16_instructions = &v2.instructions;
3552 }
3553 } else {
3554 perf_debug("Skipping SIMD16 due to pull parameters.\n");
3555 }
3556 }
3557
3558 const unsigned *assembly = NULL;
3559 if (brw->gen >= 8) {
3560 gen8_fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3561 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3562 final_assembly_size);
3563 } else {
3564 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3565 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3566 final_assembly_size);
3567 }
3568
3569 if (unlikely(brw->perf_debug) && shader) {
3570 if (shader->compiled_once)
3571 brw_wm_debug_recompile(brw, prog, &c->key);
3572 shader->compiled_once = true;
3573
3574 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3575 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3576 (get_time() - start_time) * 1000);
3577 }
3578 }
3579
3580 return assembly;
3581 }
3582
3583 bool
3584 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3585 {
3586 struct brw_context *brw = brw_context(ctx);
3587 struct brw_wm_prog_key key;
3588
3589 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3590 return true;
3591
3592 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3593 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3594 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3595 bool program_uses_dfdy = fp->UsesDFdy;
3596
3597 memset(&key, 0, sizeof(key));
3598
3599 if (brw->gen < 6) {
3600 if (fp->UsesKill)
3601 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3602
3603 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3604 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3605
3606 /* Just assume depth testing. */
3607 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3608 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3609 }
3610
3611 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3612 BRW_FS_VARYING_INPUT_MASK) > 16)
3613 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3614
3615 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3616
3617 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3618 for (unsigned i = 0; i < sampler_count; i++) {
3619 if (fp->Base.ShadowSamplers & (1 << i)) {
3620 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3621 key.tex.swizzles[i] =
3622 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3623 } else {
3624 /* Color sampler: assume no swizzling. */
3625 key.tex.swizzles[i] = SWIZZLE_XYZW;
3626 }
3627 }
3628
3629 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3630 key.drawable_height = ctx->DrawBuffer->Height;
3631 }
3632
3633 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3634 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3635 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3636
3637 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3638 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3639 key.nr_color_regions > 1;
3640 }
3641
3642 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3643 * quality of the derivatives is likely to be determined by the driconf
3644 * option.
3645 */
3646 key.high_quality_derivatives = brw->disable_derivative_optimization;
3647
3648 key.program_string_id = bfp->id;
3649
3650 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3651 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3652
3653 bool success = do_wm_prog(brw, prog, bfp, &key);
3654
3655 brw->wm.base.prog_offset = old_prog_offset;
3656 brw->wm.prog_data = old_prog_data;
3657
3658 return success;
3659 }