i965/fs: Split fs_visitor::register_coalesce() into its own file.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->conditional_mod = BRW_CONDITIONAL_NONE;
59
60 this->dst = reg_undef;
61 this->src[0] = reg_undef;
62 this->src[1] = reg_undef;
63 this->src[2] = reg_undef;
64
65 /* This will be the case for almost all instructions. */
66 this->regs_written = 1;
67 }
68
69 fs_inst::fs_inst()
70 {
71 init();
72 this->opcode = BRW_OPCODE_NOP;
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187 ALU2(SEL)
188
189 /** Gen4 predicated IF. */
190 fs_inst *
191 fs_visitor::IF(uint32_t predicate)
192 {
193 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195 return inst;
196 }
197
198 /** Gen6 IF with embedded comparison. */
199 fs_inst *
200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
201 {
202 assert(brw->gen == 6);
203 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
204 reg_null_d, src0, src1);
205 inst->conditional_mod = condition;
206 return inst;
207 }
208
209 /**
210 * CMP: Sets the low bit of the destination channels with the result
211 * of the comparison, while the upper bits are undefined, and updates
212 * the flag register with the packed 16 bits of the result.
213 */
214 fs_inst *
215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
216 {
217 fs_inst *inst;
218
219 /* Take the instruction:
220 *
221 * CMP null<d> src0<f> src1<f>
222 *
223 * Original gen4 does type conversion to the destination type before
224 * comparison, producing garbage results for floating point comparisons.
225 * gen5 does the comparison on the execution type (resolved source types),
226 * so dst type doesn't matter. gen6 does comparison and then uses the
227 * result as if it was the dst type with no conversion, which happens to
228 * mostly work out for float-interpreted-as-int since our comparisons are
229 * for >0, =0, <0.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 exec_list
247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
248 const fs_reg &surf_index,
249 const fs_reg &varying_offset,
250 uint32_t const_offset)
251 {
252 exec_list instructions;
253 fs_inst *inst;
254
255 /* We have our constant surface use a pitch of 4 bytes, so our index can
256 * be any component of a vector, and then we load 4 contiguous
257 * components starting from that.
258 *
259 * We break down the const_offset to a portion added to the variable
260 * offset and a portion done using reg_offset, which means that if you
261 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
262 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
263 * CSE can later notice that those loads are all the same and eliminate
264 * the redundant ones.
265 */
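   /* Worked example (illustrative): with const_offset == 14, the ADD below
    * computes vec4_offset = varying_offset + 12, the pull load fetches the
    * four components starting there, and the reg_offset adjustment further
    * down selects component 14 & 3 == 2 of the result.
    */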
266 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
267 instructions.push_tail(ADD(vec4_offset,
268 varying_offset, const_offset & ~3));
269
270 int scale = 1;
271 if (brw->gen == 4 && dispatch_width == 8) {
272 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
273 * u, v, r) as parameters, or we can just use the SIMD16 message
274 * consisting of (header, u). We choose the second, at the cost of a
275 * longer return length.
276 */
277 scale = 2;
278 }
279
280 enum opcode op;
281 if (brw->gen >= 7)
282 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
283 else
284 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
285 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
286 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
287 inst->regs_written = 4 * scale;
288 instructions.push_tail(inst);
289
290 if (brw->gen < 7) {
291 inst->base_mrf = 13;
292 inst->header_present = true;
293 if (brw->gen == 4)
294 inst->mlen = 3;
295 else
296 inst->mlen = 1 + dispatch_width / 8;
297 }
298
299 vec4_result.reg_offset += (const_offset & 3) * scale;
300 instructions.push_tail(MOV(dst, vec4_result));
301
302 return instructions;
303 }
304
305 /**
306 * A helper for MOV generation for fixing up broken hardware SEND dependency
307 * handling.
308 */
309 fs_inst *
310 fs_visitor::DEP_RESOLVE_MOV(int grf)
311 {
312 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
313
314 inst->ir = NULL;
315 inst->annotation = "send dependency resolve";
316
317 /* The caller always wants uncompressed to emit the minimal extra
318 * dependencies, and to avoid having to deal with aligning its regs to 2.
319 */
320 inst->force_uncompressed = true;
321
322 return inst;
323 }
324
325 bool
326 fs_inst::equals(fs_inst *inst) const
327 {
328 return (opcode == inst->opcode &&
329 dst.equals(inst->dst) &&
330 src[0].equals(inst->src[0]) &&
331 src[1].equals(inst->src[1]) &&
332 src[2].equals(inst->src[2]) &&
333 saturate == inst->saturate &&
334 predicate == inst->predicate &&
335 conditional_mod == inst->conditional_mod &&
336 mlen == inst->mlen &&
337 base_mrf == inst->base_mrf &&
338 sampler == inst->sampler &&
339 target == inst->target &&
340 eot == inst->eot &&
341 header_present == inst->header_present &&
342 shadow_compare == inst->shadow_compare &&
343 offset == inst->offset);
344 }
345
346 bool
347 fs_inst::overwrites_reg(const fs_reg &reg) const
348 {
349 return (reg.file == dst.file &&
350 reg.reg == dst.reg &&
351 reg.reg_offset >= dst.reg_offset &&
352 reg.reg_offset < dst.reg_offset + regs_written);
353 }
354
355 bool
356 fs_inst::is_send_from_grf() const
357 {
358 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
359 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
360 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
361 src[1].file == GRF) ||
362 (is_tex() && src[0].file == GRF));
363 }
364
365 bool
366 fs_visitor::can_do_source_mods(fs_inst *inst)
367 {
368 if (brw->gen == 6 && inst->is_math())
369 return false;
370
371 if (inst->is_send_from_grf())
372 return false;
373
374 if (!inst->can_do_source_mods())
375 return false;
376
377 return true;
378 }
379
380 void
381 fs_reg::init()
382 {
383 memset(this, 0, sizeof(*this));
384 stride = 1;
385 }
386
387 /** Generic unset register constructor. */
388 fs_reg::fs_reg()
389 {
390 init();
391 this->file = BAD_FILE;
392 }
393
394 /** Immediate value constructor. */
395 fs_reg::fs_reg(float f)
396 {
397 init();
398 this->file = IMM;
399 this->type = BRW_REGISTER_TYPE_F;
400 this->imm.f = f;
401 }
402
403 /** Immediate value constructor. */
404 fs_reg::fs_reg(int32_t i)
405 {
406 init();
407 this->file = IMM;
408 this->type = BRW_REGISTER_TYPE_D;
409 this->imm.i = i;
410 }
411
412 /** Immediate value constructor. */
413 fs_reg::fs_reg(uint32_t u)
414 {
415 init();
416 this->file = IMM;
417 this->type = BRW_REGISTER_TYPE_UD;
418 this->imm.u = u;
419 }
420
421 /** Fixed brw_reg. */
422 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
423 {
424 init();
425 this->file = HW_REG;
426 this->fixed_hw_reg = fixed_hw_reg;
427 this->type = fixed_hw_reg.type;
428 }
429
430 bool
431 fs_reg::equals(const fs_reg &r) const
432 {
433 return (file == r.file &&
434 reg == r.reg &&
435 reg_offset == r.reg_offset &&
436 subreg_offset == r.subreg_offset &&
437 type == r.type &&
438 negate == r.negate &&
439 abs == r.abs &&
440 !reladdr && !r.reladdr &&
441 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
442 sizeof(fixed_hw_reg)) == 0 &&
443 stride == r.stride &&
444 imm.u == r.imm.u);
445 }
446
447 fs_reg &
448 fs_reg::apply_stride(unsigned stride)
449 {
450 assert((this->stride * stride) <= 4 &&
451 (is_power_of_two(stride) || stride == 0) &&
452 file != HW_REG && file != IMM);
453 this->stride *= stride;
454 return *this;
455 }
456
457 fs_reg &
458 fs_reg::set_smear(unsigned subreg)
459 {
460 assert(file != HW_REG && file != IMM);
461 subreg_offset = subreg * type_sz(type);
462 stride = 0;
463 return *this;
464 }
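/* Illustrative use of the helpers above: on a 32-bit register, set_smear(2)
 * gives subreg_offset == 8 and stride == 0, so every channel reads the same
 * (third) dword, while apply_stride(2) instead doubles the spacing between
 * consecutive elements.
 */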
465
466 bool
467 fs_reg::is_contiguous() const
468 {
469 return stride == 1;
470 }
471
472 bool
473 fs_reg::is_zero() const
474 {
475 if (file != IMM)
476 return false;
477
478 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
479 }
480
481 bool
482 fs_reg::is_one() const
483 {
484 if (file != IMM)
485 return false;
486
487 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
488 }
489
490 bool
491 fs_reg::is_null() const
492 {
493 return file == HW_REG &&
494 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
495 fixed_hw_reg.nr == BRW_ARF_NULL;
496 }
497
498 bool
499 fs_reg::is_valid_3src() const
500 {
501 return file == GRF || file == UNIFORM;
502 }
503
504 int
505 fs_visitor::type_size(const struct glsl_type *type)
506 {
507 unsigned int size, i;
508
509 switch (type->base_type) {
510 case GLSL_TYPE_UINT:
511 case GLSL_TYPE_INT:
512 case GLSL_TYPE_FLOAT:
513 case GLSL_TYPE_BOOL:
514 return type->components();
515 case GLSL_TYPE_ARRAY:
516 return type_size(type->fields.array) * type->length;
517 case GLSL_TYPE_STRUCT:
518 size = 0;
519 for (i = 0; i < type->length; i++) {
520 size += type_size(type->fields.structure[i].type);
521 }
522 return size;
523 case GLSL_TYPE_SAMPLER:
524 /* Samplers take up no register space, since they're baked in at
525 * link time.
526 */
527 return 0;
528 case GLSL_TYPE_ATOMIC_UINT:
529 return 0;
530 case GLSL_TYPE_IMAGE:
531 case GLSL_TYPE_VOID:
532 case GLSL_TYPE_ERROR:
533 case GLSL_TYPE_INTERFACE:
534 assert(!"not reached");
535 break;
536 }
537
538 return 0;
539 }
540
541 fs_reg
542 fs_visitor::get_timestamp()
543 {
544 assert(brw->gen >= 7);
545
546 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
547 BRW_ARF_TIMESTAMP,
548 0),
549 BRW_REGISTER_TYPE_UD));
550
551 fs_reg dst = fs_reg(this, glsl_type::uint_type);
552
553 fs_inst *mov = emit(MOV(dst, ts));
554 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
555 * even if it's not enabled in the dispatch.
556 */
557 mov->force_writemask_all = true;
558 mov->force_uncompressed = true;
559
560 /* The caller wants the low 32 bits of the timestamp. Since it's running
561    * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
562 * which is plenty of time for our purposes. It is identical across the
563 * EUs, but since it's tracking GPU core speed it will increment at a
564 * varying rate as render P-states change.
565 *
566 * The caller could also check if render P-states have changed (or anything
567 * else that might disrupt timing) by setting smear to 2 and checking if
568 * that field is != 0.
569 */
570 dst.set_smear(0);
571
572 return dst;
573 }
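/* Note (editorial illustration): a 32-bit counter ticking at ~1.2 GHz wraps
 * after roughly 2^32 / 1.2e9 ~= 3.6 seconds, which is where the ~3 second
 * figure in the comment above comes from.
 */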
574
575 void
576 fs_visitor::emit_shader_time_begin()
577 {
578 current_annotation = "shader time start";
579 shader_start_time = get_timestamp();
580 }
581
582 void
583 fs_visitor::emit_shader_time_end()
584 {
585 current_annotation = "shader time end";
586
587 enum shader_time_shader_type type, written_type, reset_type;
588 if (dispatch_width == 8) {
589 type = ST_FS8;
590 written_type = ST_FS8_WRITTEN;
591 reset_type = ST_FS8_RESET;
592 } else {
593 assert(dispatch_width == 16);
594 type = ST_FS16;
595 written_type = ST_FS16_WRITTEN;
596 reset_type = ST_FS16_RESET;
597 }
598
599 fs_reg shader_end_time = get_timestamp();
600
601 /* Check that there weren't any timestamp reset events (assuming these
602 * were the only two timestamp reads that happened).
603 */
604 fs_reg reset = shader_end_time;
605 reset.set_smear(2);
606 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
607 test->conditional_mod = BRW_CONDITIONAL_Z;
608 emit(IF(BRW_PREDICATE_NORMAL));
609
610 push_force_uncompressed();
611 fs_reg start = shader_start_time;
612 start.negate = true;
613 fs_reg diff = fs_reg(this, glsl_type::uint_type);
614 emit(ADD(diff, start, shader_end_time));
615
616 /* If there were no instructions between the two timestamp gets, the diff
617 * is 2 cycles. Remove that overhead, so I can forget about that when
618 * trying to determine the time taken for single instructions.
619 */
620 emit(ADD(diff, diff, fs_reg(-2u)));
621
622 emit_shader_time_write(type, diff);
623 emit_shader_time_write(written_type, fs_reg(1u));
624 emit(BRW_OPCODE_ELSE);
625 emit_shader_time_write(reset_type, fs_reg(1u));
626 emit(BRW_OPCODE_ENDIF);
627
628 pop_force_uncompressed();
629 }
630
631 void
632 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
633 fs_reg value)
634 {
635 int shader_time_index =
636 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
637 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
638
639 fs_reg payload;
640 if (dispatch_width == 8)
641 payload = fs_reg(this, glsl_type::uvec2_type);
642 else
643 payload = fs_reg(this, glsl_type::uint_type);
644
645 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
646 fs_reg(), payload, offset, value));
647 }
648
649 void
650 fs_visitor::vfail(const char *format, va_list va)
651 {
652 char *msg;
653
654 if (failed)
655 return;
656
657 failed = true;
658
659 msg = ralloc_vasprintf(mem_ctx, format, va);
660 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
661
662 this->fail_msg = msg;
663
664 if (INTEL_DEBUG & DEBUG_WM) {
665 fprintf(stderr, "%s", msg);
666 }
667 }
668
669 void
670 fs_visitor::fail(const char *format, ...)
671 {
672 va_list va;
673
674 va_start(va, format);
675 vfail(format, va);
676 va_end(va);
677 }
678
679 /**
680 * Mark this program as impossible to compile in SIMD16 mode.
681 *
682 * During the SIMD8 compile (which happens first), we can detect and flag
683 * things that are unsupported in SIMD16 mode, so the compiler can skip
684 * the SIMD16 compile altogether.
685 *
686 * During a SIMD16 compile (if one happens anyway), this just calls fail().
687 */
688 void
689 fs_visitor::no16(const char *format, ...)
690 {
691 va_list va;
692
693 va_start(va, format);
694
695 if (dispatch_width == 16) {
696 vfail(format, va);
697 } else {
698 simd16_unsupported = true;
699
700 if (INTEL_DEBUG & DEBUG_PERF) {
701 if (no16_msg)
702 ralloc_vasprintf_append(&no16_msg, format, va);
703 else
704 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
705 }
706 }
707
708 va_end(va);
709 }
710
711 fs_inst *
712 fs_visitor::emit(enum opcode opcode)
713 {
714 return emit(new(mem_ctx) fs_inst(opcode));
715 }
716
717 fs_inst *
718 fs_visitor::emit(enum opcode opcode, fs_reg dst)
719 {
720 return emit(new(mem_ctx) fs_inst(opcode, dst));
721 }
722
723 fs_inst *
724 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
725 {
726 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
727 }
728
729 fs_inst *
730 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
731 {
732 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
733 }
734
735 fs_inst *
736 fs_visitor::emit(enum opcode opcode, fs_reg dst,
737 fs_reg src0, fs_reg src1, fs_reg src2)
738 {
739 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
740 }
741
742 void
743 fs_visitor::push_force_uncompressed()
744 {
745 force_uncompressed_stack++;
746 }
747
748 void
749 fs_visitor::pop_force_uncompressed()
750 {
751 force_uncompressed_stack--;
752 assert(force_uncompressed_stack >= 0);
753 }
754
755 /**
756 * Returns true if the instruction has a flag that means it won't
757 * update an entire destination register.
758 *
759 * For example, dead code elimination and live variable analysis want to know
760 * when a write to a variable screens off any preceding values that were in
761 * it.
762 */
763 bool
764 fs_inst::is_partial_write() const
765 {
766 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
767 this->force_uncompressed ||
768 this->force_sechalf || !this->dst.is_contiguous());
769 }
770
771 int
772 fs_inst::regs_read(fs_visitor *v, int arg) const
773 {
774 if (is_tex() && arg == 0 && src[0].file == GRF) {
775 if (v->dispatch_width == 16)
776 return (mlen + 1) / 2;
777 else
778 return mlen;
779 }
780 return 1;
781 }
782
783 bool
784 fs_inst::reads_flag() const
785 {
786 return predicate;
787 }
788
789 bool
790 fs_inst::writes_flag() const
791 {
792 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
793 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
794 }
795
796 /**
797 * Returns how many MRFs an FS opcode will write over.
798 *
799 * Note that this is not the 0 or 1 implied writes in an actual gen
800 * instruction -- the FS opcodes often generate MOVs in addition.
801 */
802 int
803 fs_visitor::implied_mrf_writes(fs_inst *inst)
804 {
805 if (inst->mlen == 0)
806 return 0;
807
808 if (inst->base_mrf == -1)
809 return 0;
810
811 switch (inst->opcode) {
812 case SHADER_OPCODE_RCP:
813 case SHADER_OPCODE_RSQ:
814 case SHADER_OPCODE_SQRT:
815 case SHADER_OPCODE_EXP2:
816 case SHADER_OPCODE_LOG2:
817 case SHADER_OPCODE_SIN:
818 case SHADER_OPCODE_COS:
819 return 1 * dispatch_width / 8;
820 case SHADER_OPCODE_POW:
821 case SHADER_OPCODE_INT_QUOTIENT:
822 case SHADER_OPCODE_INT_REMAINDER:
823 return 2 * dispatch_width / 8;
824 case SHADER_OPCODE_TEX:
825 case FS_OPCODE_TXB:
826 case SHADER_OPCODE_TXD:
827 case SHADER_OPCODE_TXF:
828 case SHADER_OPCODE_TXF_CMS:
829 case SHADER_OPCODE_TXF_MCS:
830 case SHADER_OPCODE_TG4:
831 case SHADER_OPCODE_TG4_OFFSET:
832 case SHADER_OPCODE_TXL:
833 case SHADER_OPCODE_TXS:
834 case SHADER_OPCODE_LOD:
835 return 1;
836 case FS_OPCODE_FB_WRITE:
837 return 2;
838 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
839 case SHADER_OPCODE_GEN4_SCRATCH_READ:
840 return 1;
841 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
842 return inst->mlen;
843 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
844 return 2;
845 case SHADER_OPCODE_UNTYPED_ATOMIC:
846 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
847 return 0;
848 default:
849 assert(!"not reached");
850 return inst->mlen;
851 }
852 }
853
854 int
855 fs_visitor::virtual_grf_alloc(int size)
856 {
857 if (virtual_grf_array_size <= virtual_grf_count) {
858 if (virtual_grf_array_size == 0)
859 virtual_grf_array_size = 16;
860 else
861 virtual_grf_array_size *= 2;
862 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
863 virtual_grf_array_size);
864 }
865 virtual_grf_sizes[virtual_grf_count] = size;
866 return virtual_grf_count++;
867 }
868
869 /** Fixed HW reg constructor. */
870 fs_reg::fs_reg(enum register_file file, int reg)
871 {
872 init();
873 this->file = file;
874 this->reg = reg;
875 this->type = BRW_REGISTER_TYPE_F;
876 }
877
878 /** Fixed HW reg constructor. */
879 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
880 {
881 init();
882 this->file = file;
883 this->reg = reg;
884 this->type = type;
885 }
886
887 /** Automatic reg constructor. */
888 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
889 {
890 init();
891
892 this->file = GRF;
893 this->reg = v->virtual_grf_alloc(v->type_size(type));
894 this->reg_offset = 0;
895 this->type = brw_type_for_base_type(type);
896 }
897
898 fs_reg *
899 fs_visitor::variable_storage(ir_variable *var)
900 {
901 return (fs_reg *)hash_table_find(this->variable_ht, var);
902 }
903
904 void
905 import_uniforms_callback(const void *key,
906 void *data,
907 void *closure)
908 {
909 struct hash_table *dst_ht = (struct hash_table *)closure;
910 const fs_reg *reg = (const fs_reg *)data;
911
912 if (reg->file != UNIFORM)
913 return;
914
915 hash_table_insert(dst_ht, data, key);
916 }
917
918 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
919  * This brings in those uniform definitions.
920 */
921 void
922 fs_visitor::import_uniforms(fs_visitor *v)
923 {
924 hash_table_call_foreach(v->variable_ht,
925 import_uniforms_callback,
926 variable_ht);
927 this->push_constant_loc = v->push_constant_loc;
928 this->pull_constant_loc = v->pull_constant_loc;
929 this->uniforms = v->uniforms;
930 this->param_size = v->param_size;
931 }
932
933 /* Our support for uniforms is piggy-backed on the struct
934 * gl_fragment_program, because that's where the values actually
935 * get stored, rather than in some global gl_shader_program uniform
936 * store.
937 */
938 void
939 fs_visitor::setup_uniform_values(ir_variable *ir)
940 {
941 int namelen = strlen(ir->name);
942
943 /* The data for our (non-builtin) uniforms is stored in a series of
944 * gl_uniform_driver_storage structs for each subcomponent that
945 * glGetUniformLocation() could name. We know it's been set up in the same
946 * order we'd walk the type, so walk the list of storage and find anything
947 * with our name, or the prefix of a component that starts with our name.
948 */
949 unsigned params_before = uniforms;
950 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
951 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
952
953 if (strncmp(ir->name, storage->name, namelen) != 0 ||
954 (storage->name[namelen] != 0 &&
955 storage->name[namelen] != '.' &&
956 storage->name[namelen] != '[')) {
957 continue;
958 }
959
960 unsigned slots = storage->type->component_slots();
961 if (storage->array_elements)
962 slots *= storage->array_elements;
963
964 for (unsigned i = 0; i < slots; i++) {
965 stage_prog_data->param[uniforms++] = &storage->storage[i].f;
966 }
967 }
968
969 /* Make sure we actually initialized the right amount of stuff here. */
970 assert(params_before + ir->type->component_slots() == uniforms);
971 (void)params_before;
972 }
973
974
975 /* Our support for builtin uniforms is even scarier than non-builtin.
976 * It sits on top of the PROG_STATE_VAR parameters that are
977 * automatically updated from GL context state.
978 */
979 void
980 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
981 {
982 const ir_state_slot *const slots = ir->state_slots;
983 assert(ir->state_slots != NULL);
984
985 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
986 /* This state reference has already been setup by ir_to_mesa, but we'll
987 * get the same index back here.
988 */
989 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
990 (gl_state_index *)slots[i].tokens);
991
992 /* Add each of the unique swizzles of the element as a parameter.
993 * This'll end up matching the expected layout of the
994 * array/matrix/structure we're trying to fill in.
995 */
996 int last_swiz = -1;
997 for (unsigned int j = 0; j < 4; j++) {
998 int swiz = GET_SWZ(slots[i].swizzle, j);
999 if (swiz == last_swiz)
1000 break;
1001 last_swiz = swiz;
1002
1003 stage_prog_data->param[uniforms++] =
1004 &fp->Base.Parameters->ParameterValues[index][swiz].f;
1005 }
1006 }
1007 }
1008
1009 fs_reg *
1010 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1011 {
1012 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1013 fs_reg wpos = *reg;
1014 bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
1015
1016 /* gl_FragCoord.x */
1017 if (ir->data.pixel_center_integer) {
1018 emit(MOV(wpos, this->pixel_x));
1019 } else {
1020 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1021 }
1022 wpos.reg_offset++;
1023
1024 /* gl_FragCoord.y */
1025 if (!flip && ir->data.pixel_center_integer) {
1026 emit(MOV(wpos, this->pixel_y));
1027 } else {
1028 fs_reg pixel_y = this->pixel_y;
1029 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1030
1031 if (flip) {
1032 pixel_y.negate = true;
1033 offset += c->key.drawable_height - 1.0;
1034 }
1035
1036 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1037 }
1038 wpos.reg_offset++;
1039
1040 /* gl_FragCoord.z */
1041 if (brw->gen >= 6) {
1042 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
1043 } else {
1044 emit(FS_OPCODE_LINTERP, wpos,
1045 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1046 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1047 interp_reg(VARYING_SLOT_POS, 2));
1048 }
1049 wpos.reg_offset++;
1050
1051 /* gl_FragCoord.w: Already set up in emit_interpolation */
1052 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1053
1054 return reg;
1055 }
1056
1057 fs_inst *
1058 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1059 glsl_interp_qualifier interpolation_mode,
1060 bool is_centroid, bool is_sample)
1061 {
1062 brw_wm_barycentric_interp_mode barycoord_mode;
1063 if (brw->gen >= 6) {
1064 if (is_centroid) {
1065 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1066 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1067 else
1068 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1069 } else if (is_sample) {
1070 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1071 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1072 else
1073 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1074 } else {
1075 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1076 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1077 else
1078 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1079 }
1080 } else {
1081 /* On Ironlake and below, there is only one interpolation mode.
1082 * Centroid interpolation doesn't mean anything on this hardware --
1083 * there is no multisampling.
1084 */
1085 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1086 }
1087 return emit(FS_OPCODE_LINTERP, attr,
1088 this->delta_x[barycoord_mode],
1089 this->delta_y[barycoord_mode], interp);
1090 }
1091
1092 fs_reg *
1093 fs_visitor::emit_general_interpolation(ir_variable *ir)
1094 {
1095 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1096 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1097 fs_reg attr = *reg;
1098
1099 unsigned int array_elements;
1100 const glsl_type *type;
1101
1102 if (ir->type->is_array()) {
1103 array_elements = ir->type->length;
1104 if (array_elements == 0) {
1105 fail("dereferenced array '%s' has length 0\n", ir->name);
1106 }
1107 type = ir->type->fields.array;
1108 } else {
1109 array_elements = 1;
1110 type = ir->type;
1111 }
1112
1113 glsl_interp_qualifier interpolation_mode =
1114 ir->determine_interpolation_mode(c->key.flat_shade);
1115
1116 int location = ir->data.location;
1117 for (unsigned int i = 0; i < array_elements; i++) {
1118 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1119 if (c->prog_data.urb_setup[location] == -1) {
1120 /* If there's no incoming setup data for this slot, don't
1121 * emit interpolation for it.
1122 */
1123 attr.reg_offset += type->vector_elements;
1124 location++;
1125 continue;
1126 }
1127
1128 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1129 /* Constant interpolation (flat shading) case. The SF has
1130 * handed us defined values in only the constant offset
1131 * field of the setup reg.
1132 */
1133 for (unsigned int k = 0; k < type->vector_elements; k++) {
1134 struct brw_reg interp = interp_reg(location, k);
1135 interp = suboffset(interp, 3);
1136 interp.type = reg->type;
1137 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1138 attr.reg_offset++;
1139 }
1140 } else {
1141 /* Smooth/noperspective interpolation case. */
1142 for (unsigned int k = 0; k < type->vector_elements; k++) {
1143 struct brw_reg interp = interp_reg(location, k);
1144 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1145 ir->data.centroid && !c->key.persample_shading,
1146 ir->data.sample || c->key.persample_shading);
1147 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1148 /* Get the pixel/sample mask into f0 so that we know
1149 * which pixels are lit. Then, for each channel that is
1150 * unlit, replace the centroid data with non-centroid
1151 * data.
1152 */
1153 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1154 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1155 interpolation_mode,
1156 false, false);
1157 inst->predicate = BRW_PREDICATE_NORMAL;
1158 inst->predicate_inverse = true;
1159 }
1160 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1161 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1162 }
1163 attr.reg_offset++;
1164 }
1165
1166 }
1167 location++;
1168 }
1169 }
1170
1171 return reg;
1172 }
1173
1174 fs_reg *
1175 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1176 {
1177 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1178
1179 /* The frontfacing comes in as a bit in the thread payload. */
1180 if (brw->gen >= 6) {
1181 emit(BRW_OPCODE_ASR, *reg,
1182 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1183 fs_reg(15));
1184 emit(BRW_OPCODE_NOT, *reg, *reg);
1185 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1186 } else {
1187 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1188 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1189 * us front face
1190 */
1191 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1192 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1193 }
1194
1195 return reg;
1196 }
1197
1198 void
1199 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1200 {
1201 assert(dst.type == BRW_REGISTER_TYPE_F);
1202
1203 if (c->key.compute_pos_offset) {
1204 /* Convert int_sample_pos to floating point */
1205 emit(MOV(dst, int_sample_pos));
1206 /* Scale to the range [0, 1] */
1207 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1208 }
1209 else {
1210 /* From ARB_sample_shading specification:
1211 * "When rendering to a non-multisample buffer, or if multisample
1212 * rasterization is disabled, gl_SamplePosition will always be
1213 * (0.5, 0.5).
1214 */
1215 emit(MOV(dst, fs_reg(0.5f)));
1216 }
1217 }
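/* For example: the incoming integer sample positions are in 1/16ths of a
 * pixel, so a payload value of 8 becomes 8 * (1 / 16.0) == 0.5 after the MUL
 * above, i.e. a sample at the pixel center.
 */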
1218
1219 fs_reg *
1220 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1221 {
1222 assert(brw->gen >= 6);
1223 assert(ir->type == glsl_type::vec2_type);
1224
1225 this->current_annotation = "compute sample position";
1226 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1227 fs_reg pos = *reg;
1228 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1229 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1230
1231 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1232 * mode will be enabled.
1233 *
1234 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1235 * R31.1:0 Position Offset X/Y for Slot[3:0]
1236 * R31.3:2 Position Offset X/Y for Slot[7:4]
1237 * .....
1238 *
1239 * The X, Y sample positions come in as bytes in thread payload. So, read
1240 * the positions using vstride=16, width=8, hstride=2.
1241 */
1242 struct brw_reg sample_pos_reg =
1243 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1244 BRW_REGISTER_TYPE_B), 16, 8, 2);
1245
1246 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1247 if (dispatch_width == 16) {
1248 fs_inst *inst = emit(MOV(half(int_sample_x, 1),
1249 fs_reg(suboffset(sample_pos_reg, 16))));
1250 inst->force_sechalf = true;
1251 }
1252 /* Compute gl_SamplePosition.x */
1253 compute_sample_position(pos, int_sample_x);
1254 pos.reg_offset++;
1255 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1256 if (dispatch_width == 16) {
1257 fs_inst *inst = emit(MOV(half(int_sample_y, 1),
1258 fs_reg(suboffset(sample_pos_reg, 17))));
1259 inst->force_sechalf = true;
1260 }
1261 /* Compute gl_SamplePosition.y */
1262 compute_sample_position(pos, int_sample_y);
1263 return reg;
1264 }
1265
1266 fs_reg *
1267 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1268 {
1269 assert(brw->gen >= 6);
1270
1271 this->current_annotation = "compute sample id";
1272 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1273
1274 if (c->key.compute_sample_id) {
1275 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1276 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1277 t2.type = BRW_REGISTER_TYPE_UW;
1278
1279 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1280 * 8x multisampling, subspan 0 will represent sample N (where N
1281 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1282 * 7. We can find the value of N by looking at R0.0 bits 7:6
1283 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1284 * (since samples are always delivered in pairs). That is, we
1285 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1286 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1287 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1288 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1289 * populating a temporary variable with the sequence (0, 1, 2, 3),
1290 * and then reading from it using vstride=1, width=4, hstride=0.
1291 * These computations hold good for 4x multisampling as well.
1292 */
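      /* Worked example of the math above: if R0.0 bits 7:6 hold SSPI == 2
       * (R0.0 & 0xc0 == 0x80), then (R0.0 & 0xc0) >> 5 == 4, and adding the
       * (0, 0, 0, 0, 1, 1, 1, 1) sequence gives sample IDs 4 and 5 for the
       * two SIMD8 subspans.
       */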
1293 emit(BRW_OPCODE_AND, t1,
1294 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1295 fs_reg(brw_imm_d(0xc0)));
1296 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1297 /* This works for both SIMD8 and SIMD16 */
1298 emit(MOV(t2, brw_imm_v(0x3210)));
1299 /* This special instruction takes care of setting vstride=1,
1300 * width=4, hstride=0 of t2 during an ADD instruction.
1301 */
1302 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1303 } else {
1304 /* As per GL_ARB_sample_shading specification:
1305 * "When rendering to a non-multisample buffer, or if multisample
1306 * rasterization is disabled, gl_SampleID will always be zero."
1307 */
1308 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1309 }
1310
1311 return reg;
1312 }
1313
1314 fs_reg *
1315 fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
1316 {
1317 assert(brw->gen >= 7);
1318 this->current_annotation = "compute gl_SampleMaskIn";
1319 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1320 emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
1321 return reg;
1322 }
1323
1324 fs_reg
1325 fs_visitor::fix_math_operand(fs_reg src)
1326 {
1327 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1328 * might be able to do better by doing execsize = 1 math and then
1329 * expanding that result out, but we would need to be careful with
1330 * masking.
1331 *
1332 * The hardware ignores source modifiers (negate and abs) on math
1333 * instructions, so we also move to a temp to set those up.
1334 */
1335 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1336 !src.abs && !src.negate)
1337 return src;
1338
1339 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1340 * operands to math
1341 */
1342 if (brw->gen >= 7 && src.file != IMM)
1343 return src;
1344
1345 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1346 expanded.type = src.type;
1347 emit(BRW_OPCODE_MOV, expanded, src);
1348 return expanded;
1349 }
1350
1351 fs_inst *
1352 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1353 {
1354 switch (opcode) {
1355 case SHADER_OPCODE_RCP:
1356 case SHADER_OPCODE_RSQ:
1357 case SHADER_OPCODE_SQRT:
1358 case SHADER_OPCODE_EXP2:
1359 case SHADER_OPCODE_LOG2:
1360 case SHADER_OPCODE_SIN:
1361 case SHADER_OPCODE_COS:
1362 break;
1363 default:
1364 assert(!"not reached: bad math opcode");
1365 return NULL;
1366 }
1367
1368 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1369 * might be able to do better by doing execsize = 1 math and then
1370 * expanding that result out, but we would need to be careful with
1371 * masking.
1372 *
1373 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1374 * instructions, so we also move to a temp to set those up.
1375 */
1376 if (brw->gen >= 6)
1377 src = fix_math_operand(src);
1378
1379 fs_inst *inst = emit(opcode, dst, src);
1380
1381 if (brw->gen < 6) {
1382 inst->base_mrf = 2;
1383 inst->mlen = dispatch_width / 8;
1384 }
1385
1386 return inst;
1387 }
1388
1389 fs_inst *
1390 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1391 {
1392 int base_mrf = 2;
1393 fs_inst *inst;
1394
1395 switch (opcode) {
1396 case SHADER_OPCODE_INT_QUOTIENT:
1397 case SHADER_OPCODE_INT_REMAINDER:
1398 if (brw->gen >= 7)
1399 no16("SIMD16 INTDIV unsupported\n");
1400 break;
1401 case SHADER_OPCODE_POW:
1402 break;
1403 default:
1404 assert(!"not reached: unsupported binary math opcode.");
1405 return NULL;
1406 }
1407
1408 if (brw->gen >= 6) {
1409 src0 = fix_math_operand(src0);
1410 src1 = fix_math_operand(src1);
1411
1412 inst = emit(opcode, dst, src0, src1);
1413 } else {
1414 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1415 * "Message Payload":
1416 *
1417 * "Operand0[7]. For the INT DIV functions, this operand is the
1418 * denominator."
1419 * ...
1420 * "Operand1[7]. For the INT DIV functions, this operand is the
1421 * numerator."
1422 */
1423 bool is_int_div = opcode != SHADER_OPCODE_POW;
1424 fs_reg &op0 = is_int_div ? src1 : src0;
1425 fs_reg &op1 = is_int_div ? src0 : src1;
1426
1427 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1428 inst = emit(opcode, dst, op0, reg_null_f);
1429
1430 inst->base_mrf = base_mrf;
1431 inst->mlen = 2 * dispatch_width / 8;
1432 }
1433 return inst;
1434 }
1435
1436 void
1437 fs_visitor::assign_curb_setup()
1438 {
1439 if (dispatch_width == 8) {
1440 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1441 } else {
1442 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1443 }
1444
1445 c->prog_data.curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1446
1447 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1448 foreach_list(node, &this->instructions) {
1449 fs_inst *inst = (fs_inst *)node;
1450
1451 for (unsigned int i = 0; i < 3; i++) {
1452 if (inst->src[i].file == UNIFORM) {
1453 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1454 int constant_nr;
1455 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1456 constant_nr = push_constant_loc[uniform_nr];
1457 } else {
1458 /* Section 5.11 of the OpenGL 4.1 spec says:
1459 * "Out-of-bounds reads return undefined values, which include
1460 * values from other variables of the active program or zero."
1461 * Just return the first push constant.
1462 */
1463 constant_nr = 0;
1464 }
1465
1466 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1467 constant_nr / 8,
1468 constant_nr % 8);
1469
1470 inst->src[i].file = HW_REG;
1471 inst->src[i].fixed_hw_reg = byte_offset(
1472 retype(brw_reg, inst->src[i].type),
1473 inst->src[i].subreg_offset);
1474 }
1475 }
1476 }
1477 }
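/* Illustrative mapping for the loop above: with c->nr_payload_regs == 2,
 * push constant number 10 is fetched from brw_vec1_grf(2 + 10 / 8, 10 % 8),
 * i.e. dword 2 of g3.
 */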
1478
1479 void
1480 fs_visitor::calculate_urb_setup()
1481 {
1482 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1483 c->prog_data.urb_setup[i] = -1;
1484 }
1485
1486 int urb_next = 0;
1487 /* Figure out where each of the incoming setup attributes lands. */
1488 if (brw->gen >= 6) {
1489 if (_mesa_bitcount_64(fp->Base.InputsRead &
1490 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1491 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1492 * first 16 varying inputs, so we can put them wherever we want.
1493 * Just put them in order.
1494 *
1495 * This is useful because it means that (a) inputs not used by the
1496 * fragment shader won't take up valuable register space, and (b) we
1497 * won't have to recompile the fragment shader if it gets paired with
1498 * a different vertex (or geometry) shader.
1499 */
1500 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1501 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1502 BITFIELD64_BIT(i)) {
1503 c->prog_data.urb_setup[i] = urb_next++;
1504 }
1505 }
1506 } else {
1507 /* We have enough input varyings that the SF/SBE pipeline stage can't
1508 * arbitrarily rearrange them to suit our whim; we have to put them
1509 * in an order that matches the output of the previous pipeline stage
1510 * (geometry or vertex shader).
1511 */
1512 struct brw_vue_map prev_stage_vue_map;
1513 brw_compute_vue_map(brw, &prev_stage_vue_map,
1514 c->key.input_slots_valid);
1515 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1516 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1517 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1518 slot++) {
1519 int varying = prev_stage_vue_map.slot_to_varying[slot];
1520 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1521 * unused.
1522 */
1523 if (varying != BRW_VARYING_SLOT_COUNT &&
1524 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1525 BITFIELD64_BIT(varying))) {
1526 c->prog_data.urb_setup[varying] = slot - first_slot;
1527 }
1528 }
1529 urb_next = prev_stage_vue_map.num_slots - first_slot;
1530 }
1531 } else {
1532 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1533 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1534 /* Point size is packed into the header, not as a general attribute */
1535 if (i == VARYING_SLOT_PSIZ)
1536 continue;
1537
1538 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1539 /* The back color slot is skipped when the front color is
1540 * also written to. In addition, some slots can be
1541 * written in the vertex shader and not read in the
1542 * fragment shader. So the register number must always be
1543 * incremented, mapped or not.
1544 */
1545 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1546 c->prog_data.urb_setup[i] = urb_next;
1547 urb_next++;
1548 }
1549 }
1550
1551 /*
1552     * It's an FS-only attribute, and we did interpolation for this attribute
1553     * in the SF thread. So, count it here, too.
1554 *
1555 * See compile_sf_prog() for more info.
1556 */
1557 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1558 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1559 }
1560
1561 c->prog_data.num_varying_inputs = urb_next;
1562 }
1563
1564 void
1565 fs_visitor::assign_urb_setup()
1566 {
1567 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1568
1569 /* Offset all the urb_setup[] index by the actual position of the
1570 * setup regs, now that the location of the constants has been chosen.
1571 */
1572 foreach_list(node, &this->instructions) {
1573 fs_inst *inst = (fs_inst *)node;
1574
1575 if (inst->opcode == FS_OPCODE_LINTERP) {
1576 assert(inst->src[2].file == HW_REG);
1577 inst->src[2].fixed_hw_reg.nr += urb_start;
1578 }
1579
1580 if (inst->opcode == FS_OPCODE_CINTERP) {
1581 assert(inst->src[0].file == HW_REG);
1582 inst->src[0].fixed_hw_reg.nr += urb_start;
1583 }
1584 }
1585
1586 /* Each attribute is 4 setup channels, each of which is half a reg. */
1587 this->first_non_payload_grf =
1588 urb_start + c->prog_data.num_varying_inputs * 2;
1589 }
1590
1591 /**
1592 * Split large virtual GRFs into separate components if we can.
1593 *
1594 * This is mostly duplicated with what brw_fs_vector_splitting does,
1595 * but that's really conservative because it's afraid of doing
1596 * splitting that doesn't result in real progress after the rest of
1597 * the optimization phases, which would cause infinite looping in
1598 * optimization. We can do it once here, safely. This also has the
1599 * opportunity to split interpolated values, or maybe even uniforms,
1600 * which we don't have at the IR level.
1601 *
1602 * We want to split, because virtual GRFs are what we register
1603 * allocate and spill (due to contiguousness requirements for some
1604 * instructions), and they're what we naturally generate in the
1605 * codegen process, but most virtual GRFs don't actually need to be
1606 * contiguous sets of GRFs. If we split, we'll end up with reduced
1607 * live intervals and better dead code elimination and coalescing.
1608 */
1609 void
1610 fs_visitor::split_virtual_grfs()
1611 {
1612 int num_vars = this->virtual_grf_count;
1613 bool split_grf[num_vars];
1614 int new_virtual_grf[num_vars];
1615
1616    /* Try to split anything larger than one register. */
1617 for (int i = 0; i < num_vars; i++) {
1618 if (this->virtual_grf_sizes[i] != 1)
1619 split_grf[i] = true;
1620 else
1621 split_grf[i] = false;
1622 }
1623
1624 if (brw->has_pln &&
1625 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1626 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1627 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1628 * Gen6, that was the only supported interpolation mode, and since Gen6,
1629 * delta_x and delta_y are in fixed hardware registers.
1630 */
1631 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1632 false;
1633 }
1634
1635 foreach_list(node, &this->instructions) {
1636 fs_inst *inst = (fs_inst *)node;
1637
1638 /* If there's a SEND message that requires contiguous destination
1639 * registers, no splitting is allowed.
1640 */
1641 if (inst->regs_written > 1) {
1642 split_grf[inst->dst.reg] = false;
1643 }
1644
1645 /* If we're sending from a GRF, don't split it, on the assumption that
1646 * the send is reading the whole thing.
1647 */
1648 if (inst->is_send_from_grf()) {
1649 for (int i = 0; i < 3; i++) {
1650 if (inst->src[i].file == GRF) {
1651 split_grf[inst->src[i].reg] = false;
1652 }
1653 }
1654 }
1655 }
1656
1657 /* Allocate new space for split regs. Note that the virtual
1658 * numbers will be contiguous.
1659 */
1660 for (int i = 0; i < num_vars; i++) {
1661 if (split_grf[i]) {
1662 new_virtual_grf[i] = virtual_grf_alloc(1);
1663 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1664 int reg = virtual_grf_alloc(1);
1665 assert(reg == new_virtual_grf[i] + j - 1);
1666 (void) reg;
1667 }
1668 this->virtual_grf_sizes[i] = 1;
1669 }
1670 }
1671
1672 foreach_list(node, &this->instructions) {
1673 fs_inst *inst = (fs_inst *)node;
1674
1675 if (inst->dst.file == GRF &&
1676 split_grf[inst->dst.reg] &&
1677 inst->dst.reg_offset != 0) {
1678 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1679 inst->dst.reg_offset - 1);
1680 inst->dst.reg_offset = 0;
1681 }
1682 for (int i = 0; i < 3; i++) {
1683 if (inst->src[i].file == GRF &&
1684 split_grf[inst->src[i].reg] &&
1685 inst->src[i].reg_offset != 0) {
1686 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1687 inst->src[i].reg_offset - 1);
1688 inst->src[i].reg_offset = 0;
1689 }
1690 }
1691 }
1692 invalidate_live_intervals();
1693 }
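/* Example of the remapping above: a size-3 virtual GRF n is split so that
 * reg_offset 0 stays in register n (now size 1), while reg_offsets 1 and 2
 * move to the freshly allocated single-register GRFs new_virtual_grf[n] and
 * new_virtual_grf[n] + 1.
 */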
1694
1695 /**
1696 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1697 *
1698 * During code generation, we create tons of temporary variables, many of
1699 * which get immediately killed and are never used again. Yet, in later
1700 * optimization and analysis passes, such as compute_live_intervals, we need
1701 * to loop over all the virtual GRFs. Compacting them can save a lot of
1702 * overhead.
1703 */
1704 void
1705 fs_visitor::compact_virtual_grfs()
1706 {
1707 /* Mark which virtual GRFs are used, and count how many. */
1708 int remap_table[this->virtual_grf_count];
1709 memset(remap_table, -1, sizeof(remap_table));
1710
1711 foreach_list(node, &this->instructions) {
1712 const fs_inst *inst = (const fs_inst *) node;
1713
1714 if (inst->dst.file == GRF)
1715 remap_table[inst->dst.reg] = 0;
1716
1717 for (int i = 0; i < 3; i++) {
1718 if (inst->src[i].file == GRF)
1719 remap_table[inst->src[i].reg] = 0;
1720 }
1721 }
1722
1723 /* In addition to registers used in instructions, fs_visitor keeps
1724 * direct references to certain special values which must be patched:
1725 */
1726 fs_reg *special[] = {
1727 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1728 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1729 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1730 &delta_x[0], &delta_x[1], &delta_x[2],
1731 &delta_x[3], &delta_x[4], &delta_x[5],
1732 &delta_y[0], &delta_y[1], &delta_y[2],
1733 &delta_y[3], &delta_y[4], &delta_y[5],
1734 };
1735 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1736 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1737
1738 /* Treat all special values as used, to be conservative */
1739 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1740 if (special[i]->file == GRF)
1741 remap_table[special[i]->reg] = 0;
1742 }
1743
1744 /* Compact the GRF arrays. */
1745 int new_index = 0;
1746 for (int i = 0; i < this->virtual_grf_count; i++) {
1747 if (remap_table[i] != -1) {
1748 remap_table[i] = new_index;
1749 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1750 invalidate_live_intervals();
1751 ++new_index;
1752 }
1753 }
1754
1755 this->virtual_grf_count = new_index;
1756
1757 /* Patch all the instructions to use the newly renumbered registers */
1758 foreach_list(node, &this->instructions) {
1759 fs_inst *inst = (fs_inst *) node;
1760
1761 if (inst->dst.file == GRF)
1762 inst->dst.reg = remap_table[inst->dst.reg];
1763
1764 for (int i = 0; i < 3; i++) {
1765 if (inst->src[i].file == GRF)
1766 inst->src[i].reg = remap_table[inst->src[i].reg];
1767 }
1768 }
1769
1770 /* Patch all the references to special values */
1771 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1772 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1773 special[i]->reg = remap_table[special[i]->reg];
1774 }
1775 }
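/* Example: if virtual GRFs 0..3 exist but only 0, 2 and 3 are referenced,
 * remap_table ends up as {0, -1, 1, 2} and virtual_grf_count drops from 4
 * to 3.
 */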
1776
1777 /*
1778 * Implements array access of uniforms by inserting a
1779 * PULL_CONSTANT_LOAD instruction.
1780 *
1781 * Unlike temporary GRF array access (where we don't support it due to
1782 * the difficulty of doing relative addressing on instruction
1783 * destinations), we could potentially do array access of uniforms
1784 * that were loaded in GRF space as push constants. In real-world
1785 * usage we've seen, though, the arrays being used are always larger
1786 * than we could load as push constants, so just always move all
1787 * uniform array access out to a pull constant buffer.
1788 */
1789 void
1790 fs_visitor::move_uniform_array_access_to_pull_constants()
1791 {
1792 if (dispatch_width != 8)
1793 return;
1794
1795 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1796
1797 for (unsigned int i = 0; i < uniforms; i++) {
1798 pull_constant_loc[i] = -1;
1799 }
1800
1801 /* Walk through and find array access of uniforms. Put a copy of that
1802 * uniform in the pull constant buffer.
1803 *
1804 * Note that we don't move constant-indexed accesses to arrays. No
1805 * testing has been done of the performance impact of this choice.
1806 */
1807 foreach_list_safe(node, &this->instructions) {
1808 fs_inst *inst = (fs_inst *)node;
1809
1810 for (int i = 0 ; i < 3; i++) {
1811 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1812 continue;
1813
1814 int uniform = inst->src[i].reg;
1815
1816 /* If this array isn't already present in the pull constant buffer,
1817 * add it.
1818 */
1819 if (pull_constant_loc[uniform] == -1) {
1820 const float **values = &stage_prog_data->param[uniform];
1821
1822 assert(param_size[uniform]);
1823
1824 for (int j = 0; j < param_size[uniform]; j++) {
1825 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1826
1827 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1828 values[j];
1829 }
1830 }
1831 }
1832 }
1833 }
1834
1835 /**
1836 * Assign UNIFORM file registers to either push constants or pull constants.
1837 *
1838 * We allow a fragment shader to have more than the specified minimum
1839 * maximum number of fragment shader uniform components (64). If
1840 * there are too many of these, they'd fill up all of register space.
1841 * So, this will push some of them out to the pull constant buffer and
1842 * update the program to load them.
1843 */
1844 void
1845 fs_visitor::assign_constant_locations()
1846 {
1847 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1848 if (dispatch_width != 8)
1849 return;
1850
1851 /* Find which UNIFORM registers are still in use. */
1852 bool is_live[uniforms];
1853 for (unsigned int i = 0; i < uniforms; i++) {
1854 is_live[i] = false;
1855 }
1856
1857 foreach_list(node, &this->instructions) {
1858 fs_inst *inst = (fs_inst *) node;
1859
1860 for (int i = 0; i < 3; i++) {
1861 if (inst->src[i].file != UNIFORM)
1862 continue;
1863
1864 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1865 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1866 is_live[constant_nr] = true;
1867 }
1868 }
1869
1870 /* Only allow 16 registers (128 uniform components) as push constants.
1871 *
1872 * Just demote the end of the list. We could probably do better
1873 * here, demoting things that are rarely used in the program first.
1874 */
1875 unsigned int max_push_components = 16 * 8;
1876 unsigned int num_push_constants = 0;
1877
1878 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1879
1880 for (unsigned int i = 0; i < uniforms; i++) {
1881 if (!is_live[i] || pull_constant_loc[i] != -1) {
1882 /* This UNIFORM register is either dead, or has already been demoted
1883 * to a pull const. Mark it as no longer living in the param[] array.
1884 */
1885 push_constant_loc[i] = -1;
1886 continue;
1887 }
1888
1889 if (num_push_constants < max_push_components) {
1890 /* Retain as a push constant. Record the location in the params[]
1891 * array.
1892 */
1893 push_constant_loc[i] = num_push_constants++;
1894 } else {
1895 /* Demote to a pull constant. */
1896 push_constant_loc[i] = -1;
1897
1898 int pull_index = stage_prog_data->nr_pull_params++;
1899 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1900 pull_constant_loc[i] = pull_index;
1901 }
1902 }
1903
1904 stage_prog_data->nr_params = num_push_constants;
1905
1906 /* Up until now, the param[] array has been indexed by reg + reg_offset
1907 * of UNIFORM registers. Condense it to only contain the uniforms we
1908 * chose to upload as push constants.
1909 */
1910 for (unsigned int i = 0; i < uniforms; i++) {
1911 int remapped = push_constant_loc[i];
1912
1913 if (remapped == -1)
1914 continue;
1915
1916 assert(remapped <= (int)i);
1917 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1918 }
1919 }
1920
1921 /**
1922 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1923 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1924 */
1925 void
1926 fs_visitor::demote_pull_constants()
1927 {
1928 foreach_list(node, &this->instructions) {
1929 fs_inst *inst = (fs_inst *)node;
1930
1931 for (int i = 0; i < 3; i++) {
1932 if (inst->src[i].file != UNIFORM)
1933 continue;
1934
1935 int pull_index = pull_constant_loc[inst->src[i].reg +
1936 inst->src[i].reg_offset];
1937 if (pull_index == -1)
1938 continue;
1939
1940 /* Set up the annotation tracking for newly generated instructions. */
1941 base_ir = inst->ir;
1942 current_annotation = inst->annotation;
1943
1944 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1945 fs_reg dst = fs_reg(this, glsl_type::float_type);
1946
1947 /* Generate a pull load into dst. */
1948 if (inst->src[i].reladdr) {
1949 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
1950 surf_index,
1951 *inst->src[i].reladdr,
1952 pull_index);
1953 inst->insert_before(&list);
1954 inst->src[i].reladdr = NULL;
1955 } else {
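/* A uniform pull constant load fetches an aligned vec4, so round the
 * byte offset down to a 16-byte boundary and smear the component that
 * holds the value we actually want.
 */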
1956 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1957 fs_inst *pull =
1958 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1959 dst, surf_index, offset);
1960 inst->insert_before(pull);
1961 inst->src[i].set_smear(pull_index & 3);
1962 }
1963
1964 /* Rewrite the instruction to use the temporary VGRF. */
1965 inst->src[i].file = GRF;
1966 inst->src[i].reg = dst.reg;
1967 inst->src[i].reg_offset = 0;
1968 }
1969 }
1970 invalidate_live_intervals();
1971 }
1972
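/**
 * Performs simple algebraic simplifications: a * 1.0 becomes a, a * 0.0
 * becomes 0.0, a + 0.0 becomes a, a | a becomes a, an LRP whose two blend
 * inputs are equal becomes a MOV of that input, and a saturated SEL against
 * an immediate becomes a MOV when the saturate already provides the clamp.
 */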
1973 bool
1974 fs_visitor::opt_algebraic()
1975 {
1976 bool progress = false;
1977
1978 foreach_list(node, &this->instructions) {
1979 fs_inst *inst = (fs_inst *)node;
1980
1981 switch (inst->opcode) {
1982 case BRW_OPCODE_MUL:
1983 if (inst->src[1].file != IMM)
1984 continue;
1985
1986 /* a * 1.0 = a */
1987 if (inst->src[1].is_one()) {
1988 inst->opcode = BRW_OPCODE_MOV;
1989 inst->src[1] = reg_undef;
1990 progress = true;
1991 break;
1992 }
1993
1994 /* a * 0.0 = 0.0 */
1995 if (inst->src[1].is_zero()) {
1996 inst->opcode = BRW_OPCODE_MOV;
1997 inst->src[0] = inst->src[1];
1998 inst->src[1] = reg_undef;
1999 progress = true;
2000 break;
2001 }
2002
2003 break;
2004 case BRW_OPCODE_ADD:
2005 if (inst->src[1].file != IMM)
2006 continue;
2007
2008 /* a + 0.0 = a */
2009 if (inst->src[1].is_zero()) {
2010 inst->opcode = BRW_OPCODE_MOV;
2011 inst->src[1] = reg_undef;
2012 progress = true;
2013 break;
2014 }
2015 break;
2016 case BRW_OPCODE_OR:
2017 if (inst->src[0].equals(inst->src[1])) {
2018 inst->opcode = BRW_OPCODE_MOV;
2019 inst->src[1] = reg_undef;
2020 progress = true;
2021 break;
2022 }
2023 break;
2024 case BRW_OPCODE_LRP:
2025 if (inst->src[1].equals(inst->src[2])) {
2026 inst->opcode = BRW_OPCODE_MOV;
2027 inst->src[0] = inst->src[1];
2028 inst->src[1] = reg_undef;
2029 inst->src[2] = reg_undef;
2030 progress = true;
2031 break;
2032 }
2033 break;
2034 case BRW_OPCODE_SEL:
2035 if (inst->saturate && inst->src[1].file == IMM) {
2036 switch (inst->conditional_mod) {
2037 case BRW_CONDITIONAL_LE:
2038 case BRW_CONDITIONAL_L:
2039 switch (inst->src[1].type) {
2040 case BRW_REGISTER_TYPE_F:
2041 if (inst->src[1].imm.f >= 1.0f) {
2042 inst->opcode = BRW_OPCODE_MOV;
2043 inst->src[1] = reg_undef;
2044 progress = true;
2045 }
2046 break;
2047 default:
2048 break;
2049 }
2050 break;
2051 case BRW_CONDITIONAL_GE:
2052 case BRW_CONDITIONAL_G:
2053 switch (inst->src[1].type) {
2054 case BRW_REGISTER_TYPE_F:
2055 if (inst->src[1].imm.f <= 0.0f) {
2056 inst->opcode = BRW_OPCODE_MOV;
2057 inst->src[1] = reg_undef;
2058 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2059 progress = true;
2060 }
2061 break;
2062 default:
2063 break;
2064 }
2065 default:
2066 break;
2067 }
2068 }
2069 break;
2070 default:
2071 break;
2072 }
2073 }
2074
2075 return progress;
2076 }
2077
2078 /**
2079 * Removes any instructions writing a VGRF where that VGRF is not used by any
2080 * later instruction.
2081 */
2082 bool
2083 fs_visitor::dead_code_eliminate()
2084 {
2085 bool progress = false;
2086 int pc = 0;
2087
2088 calculate_live_intervals();
2089
2090 foreach_list_safe(node, &this->instructions) {
2091 fs_inst *inst = (fs_inst *)node;
2092
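/* A side-effect-free GRF write is dead if the live range of every
 * register it writes ends at this instruction, i.e. nothing reads the
 * result later.
 */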
2093 if (inst->dst.file == GRF && !inst->has_side_effects()) {
2094 bool dead = true;
2095
2096 for (int i = 0; i < inst->regs_written; i++) {
2097 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2098 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2099 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2100 dead = false;
2101 break;
2102 }
2103 }
2104
2105 if (dead) {
2106 /* Don't dead code eliminate instructions that write to the
2107 * accumulator as a side-effect. Instead just set the destination
2108 * to the null register to free it.
2109 */
2110 switch (inst->opcode) {
2111 case BRW_OPCODE_ADDC:
2112 case BRW_OPCODE_SUBB:
2113 case BRW_OPCODE_MACH:
2114 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2115 break;
2116 default:
2117 inst->remove();
2118 progress = true;
2119 break;
2120 }
2121 }
2122 }
2123
2124 pc++;
2125 }
2126
2127 if (progress)
2128 invalidate_live_intervals();
2129
2130 return progress;
2131 }
2132
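/* Key for the local dead code hash table: identifies one (VGRF, reg_offset)
 * whose most recent write has not yet been read in this basic block.
 */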
2133 struct dead_code_hash_key
2134 {
2135 int vgrf;
2136 int reg_offset;
2137 };
2138
2139 static bool
2140 dead_code_hash_compare(const void *a, const void *b)
2141 {
2142 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2143 }
2144
2145 static void
2146 clear_dead_code_hash(struct hash_table *ht)
2147 {
2148 struct hash_entry *entry;
2149
2150 hash_table_foreach(ht, entry) {
2151 _mesa_hash_table_remove(ht, entry);
2152 }
2153 }
2154
2155 static void
2156 insert_dead_code_hash(struct hash_table *ht,
2157 int vgrf, int reg_offset, fs_inst *inst)
2158 {
2159 /* We don't bother freeing keys, because they'll be GCed with the ht. */
2160 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2161
2162 key->vgrf = vgrf;
2163 key->reg_offset = reg_offset;
2164
2165 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2166 }
2167
2168 static struct hash_entry *
2169 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2170 {
2171 struct dead_code_hash_key key;
2172
2173 key.vgrf = vgrf;
2174 key.reg_offset = reg_offset;
2175
2176 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2177 }
2178
2179 static void
2180 remove_dead_code_hash(struct hash_table *ht,
2181 int vgrf, int reg_offset)
2182 {
2183 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2184 if (!entry)
2185 return;
2186
2187 _mesa_hash_table_remove(ht, entry);
2188 }
2189
2190 /**
2191 * Walks basic blocks, removing any regs that are written but not read before
2192 * being redefined.
2193 *
2194 * The dead_code_eliminate() function implements a global dead code
2195 * elimination, but it only handles removing the last write to a register
2196 * if it's never read. This one can handle intermediate writes, but only
2197 * within a basic block.
2198 */
2199 bool
2200 fs_visitor::dead_code_eliminate_local()
2201 {
2202 struct hash_table *ht;
2203 bool progress = false;
2204
2205 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2206
2207 if (ht == NULL) {
2208 return false;
2209 }
2210
2211 foreach_list_safe(node, &this->instructions) {
2212 fs_inst *inst = (fs_inst *)node;
2213
2214 /* At a basic block boundary, empty the HT, since we don't track
2215 * dataflow across blocks here.
2216 */
2217 if (inst->is_control_flow()) {
2218 clear_dead_code_hash(ht);
2219 continue;
2220 }
2221
2222 /* Clear from the HT any pending writes whose registers got read. */
2223 for (int i = 0; i < 3; i++) {
2224 fs_reg src = inst->src[i];
2225 if (src.file != GRF)
2226 continue;
2227
2228 int read = 1;
2229 if (inst->is_send_from_grf())
2230 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2231
2232 for (int reg_offset = src.reg_offset;
2233 reg_offset < src.reg_offset + read;
2234 reg_offset++) {
2235 remove_dead_code_hash(ht, src.reg, reg_offset);
2236 }
2237 }
2238
2239 /* Add any update of a GRF to the HT, removing a previous write if it
2240 * wasn't read.
2241 */
2242 if (inst->dst.file == GRF) {
2243 if (inst->regs_written > 1) {
2244 /* We don't know how to trim channels from an instruction's
2245 * writes, so we can't incrementally remove unread channels from
2246 * it. Just remove whatever it overwrites from the table.
2247 */
2248 for (int i = 0; i < inst->regs_written; i++) {
2249 remove_dead_code_hash(ht,
2250 inst->dst.reg,
2251 inst->dst.reg_offset + i);
2252 }
2253 } else {
2254 struct hash_entry *entry =
2255 get_dead_code_hash_entry(ht, inst->dst.reg,
2256 inst->dst.reg_offset);
2257
2258 if (entry) {
2259 if (inst->is_partial_write()) {
2260 /* For a partial write, we can't remove any previous dead code
2261 * candidate, since we're just modifying its result.
2262 */
2263 } else {
2264 /* We're completely updating a channel, and there was a
2265 * previous write to the channel that wasn't read. Kill it!
2266 */
2267 fs_inst *inst = (fs_inst *)entry->data;
2268 inst->remove();
2269 progress = true;
2270 }
2271
2272 _mesa_hash_table_remove(ht, entry);
2273 }
2274
2275 if (!inst->has_side_effects())
2276 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2277 inst);
2278 }
2279 }
2280 }
2281
2282 _mesa_hash_table_destroy(ht, NULL);
2283
2284 if (progress)
2285 invalidate_live_intervals();
2286
2287 return progress;
2288 }
2289
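/**
 * Looks for MOVs from a GRF to an MRF and tries to rewrite the instruction
 * that produced the GRF value to write directly into the MRF instead,
 * eliminating the intermediate MOV.
 */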
2290 bool
2291 fs_visitor::compute_to_mrf()
2292 {
2293 bool progress = false;
2294 int next_ip = 0;
2295
2296 calculate_live_intervals();
2297
2298 foreach_list_safe(node, &this->instructions) {
2299 fs_inst *inst = (fs_inst *)node;
2300
2301 int ip = next_ip;
2302 next_ip++;
2303
2304 if (inst->opcode != BRW_OPCODE_MOV ||
2305 inst->is_partial_write() ||
2306 inst->dst.file != MRF || inst->src[0].file != GRF ||
2307 inst->dst.type != inst->src[0].type ||
2308 inst->src[0].abs || inst->src[0].negate ||
2309 !inst->src[0].is_contiguous() ||
2310 inst->src[0].subreg_offset)
2311 continue;
2312
2313 /* Work out which hardware MRF registers are written by this
2314 * instruction.
2315 */
2316 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2317 int mrf_high;
2318 if (inst->dst.reg & BRW_MRF_COMPR4) {
2319 mrf_high = mrf_low + 4;
2320 } else if (dispatch_width == 16 &&
2321 (!inst->force_uncompressed && !inst->force_sechalf)) {
2322 mrf_high = mrf_low + 1;
2323 } else {
2324 mrf_high = mrf_low;
2325 }
2326
2327 /* Can't compute-to-MRF this GRF if someone else was going to
2328 * read it later.
2329 */
2330 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2331 continue;
2332
2333 /* Found a move of a GRF to an MRF. Let's see if we can rewrite
2334 * the instruction that produced this GRF to write into the MRF directly.
2335 */
2336 fs_inst *scan_inst;
2337 for (scan_inst = (fs_inst *)inst->prev;
2338 scan_inst->prev != NULL;
2339 scan_inst = (fs_inst *)scan_inst->prev) {
2340 if (scan_inst->dst.file == GRF &&
2341 scan_inst->dst.reg == inst->src[0].reg) {
2342 /* Found the last instruction to write the reg we want to turn
2343 * into a compute-to-MRF.
2344 */
2345
2346 /* If this one instruction didn't populate all the
2347 * channels, bail. We might be able to rewrite everything
2348 * that writes that reg, but it would require smarter
2349 * tracking to delay the rewriting until complete success.
2350 */
2351 if (scan_inst->is_partial_write())
2352 break;
2353
2354 /* Things returning more than one register would need us to
2355 * understand coalescing out more than one MOV at a time.
2356 */
2357 if (scan_inst->regs_written > 1)
2358 break;
2359
2360 /* SEND instructions can't have MRF as a destination. */
2361 if (scan_inst->mlen)
2362 break;
2363
2364 if (brw->gen == 6) {
2365 /* gen6 math instructions must have a GRF destination, so no
2366 * compute-to-MRF for them.
2367 */
2368 if (scan_inst->is_math()) {
2369 break;
2370 }
2371 }
2372
2373 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2374 /* Found the creator of our MRF's source value. */
2375 scan_inst->dst.file = MRF;
2376 scan_inst->dst.reg = inst->dst.reg;
2377 scan_inst->saturate |= inst->saturate;
2378 inst->remove();
2379 progress = true;
2380 }
2381 break;
2382 }
2383
2384 /* We don't handle control flow here. Most computation of
2385 * values that end up in MRFs happens shortly before the MRF
2386 * write anyway.
2387 */
2388 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2389 break;
2390
2391 /* You can't read from an MRF, so if someone else reads our
2392 * MRF's source GRF that we wanted to rewrite, that stops us.
2393 */
2394 bool interfered = false;
2395 for (int i = 0; i < 3; i++) {
2396 if (scan_inst->src[i].file == GRF &&
2397 scan_inst->src[i].reg == inst->src[0].reg &&
2398 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2399 interfered = true;
2400 }
2401 }
2402 if (interfered)
2403 break;
2404
2405 if (scan_inst->dst.file == MRF) {
2406 /* If somebody else writes our MRF here, we can't
2407 * compute-to-MRF before that.
2408 */
2409 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2410 int scan_mrf_high;
2411
2412 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2413 scan_mrf_high = scan_mrf_low + 4;
2414 } else if (dispatch_width == 16 &&
2415 (!scan_inst->force_uncompressed &&
2416 !scan_inst->force_sechalf)) {
2417 scan_mrf_high = scan_mrf_low + 1;
2418 } else {
2419 scan_mrf_high = scan_mrf_low;
2420 }
2421
2422 if (mrf_low == scan_mrf_low ||
2423 mrf_low == scan_mrf_high ||
2424 mrf_high == scan_mrf_low ||
2425 mrf_high == scan_mrf_high) {
2426 break;
2427 }
2428 }
2429
2430 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2431 /* Found a SEND instruction, which means that there are
2432 * live values in MRFs from base_mrf to base_mrf +
2433 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2434 * above it.
2435 */
2436 if (mrf_low >= scan_inst->base_mrf &&
2437 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2438 break;
2439 }
2440 if (mrf_high >= scan_inst->base_mrf &&
2441 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2442 break;
2443 }
2444 }
2445 }
2446 }
2447
2448 if (progress)
2449 invalidate_live_intervals();
2450
2451 return progress;
2452 }
2453
2454 /**
2455 * Walks through basic blocks, looking for repeated MRF writes and
2456 * removing the later ones.
2457 */
2458 bool
2459 fs_visitor::remove_duplicate_mrf_writes()
2460 {
2461 fs_inst *last_mrf_move[16];
2462 bool progress = false;
2463
2464 /* We'd need to update the MRF tracking for compressed (SIMD16) instructions, so skip this pass for them. */
2465 if (dispatch_width == 16)
2466 return false;
2467
2468 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2469
2470 foreach_list_safe(node, &this->instructions) {
2471 fs_inst *inst = (fs_inst *)node;
2472
2473 if (inst->is_control_flow()) {
2474 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2475 }
2476
2477 if (inst->opcode == BRW_OPCODE_MOV &&
2478 inst->dst.file == MRF) {
2479 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2480 if (prev_inst && inst->equals(prev_inst)) {
2481 inst->remove();
2482 progress = true;
2483 continue;
2484 }
2485 }
2486
2487 /* Clear out the last-write records for MRFs that were overwritten. */
2488 if (inst->dst.file == MRF) {
2489 last_mrf_move[inst->dst.reg] = NULL;
2490 }
2491
2492 if (inst->mlen > 0 && inst->base_mrf != -1) {
2493 /* Found a SEND instruction, which will include two or fewer
2494 * implied MRF writes. We could do better here.
2495 */
2496 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2497 last_mrf_move[inst->base_mrf + i] = NULL;
2498 }
2499 }
2500
2501 /* Clear out any MRF move records whose sources got overwritten. */
2502 if (inst->dst.file == GRF) {
2503 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2504 if (last_mrf_move[i] &&
2505 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2506 last_mrf_move[i] = NULL;
2507 }
2508 }
2509 }
2510
2511 if (inst->opcode == BRW_OPCODE_MOV &&
2512 inst->dst.file == MRF &&
2513 inst->src[0].file == GRF &&
2514 !inst->is_partial_write()) {
2515 last_mrf_move[inst->dst.reg] = inst;
2516 }
2517 }
2518
2519 if (progress)
2520 invalidate_live_intervals();
2521
2522 return progress;
2523 }
2524
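/* Helper for the gen4 SEND dependency workarounds below: clears the
 * dependency flag for any register in [first_grf, first_grf + grf_len)
 * that this instruction reads, since the read itself resolves the hazard.
 */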
2525 static void
2526 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2527 int first_grf, int grf_len)
2528 {
2529 bool inst_simd16 = (dispatch_width > 8 &&
2530 !inst->force_uncompressed &&
2531 !inst->force_sechalf);
2532
2533 /* Clear the flag for registers that actually got read (as expected). */
2534 for (int i = 0; i < 3; i++) {
2535 int grf;
2536 if (inst->src[i].file == GRF) {
2537 grf = inst->src[i].reg;
2538 } else if (inst->src[i].file == HW_REG &&
2539 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2540 grf = inst->src[i].fixed_hw_reg.nr;
2541 } else {
2542 continue;
2543 }
2544
2545 if (grf >= first_grf &&
2546 grf < first_grf + grf_len) {
2547 deps[grf - first_grf] = false;
2548 if (inst_simd16)
2549 deps[grf - first_grf + 1] = false;
2550 }
2551 }
2552 }
2553
2554 /**
2555 * Implements this workaround for the original 965:
2556 *
2557 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2558 * check for post destination dependencies on this instruction, software
2559 * must ensure that there is no destination hazard for the case of ‘write
2560 * followed by a posted write’ shown in the following example.
2561 *
2562 * 1. mov r3 0
2563 * 2. send r3.xy <rest of send instruction>
2564 * 3. mov r2 r3
2565 *
2566 * Due to no post-destination dependency check on the ‘send’, the above
2567 * code sequence could have two instructions (1 and 2) in flight at the
2568 * same time that both consider ‘r3’ as the target of their final writes.
2569 */
2570 void
2571 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2572 {
2573 int reg_size = dispatch_width / 8;
2574 int write_len = inst->regs_written * reg_size;
2575 int first_write_grf = inst->dst.reg;
2576 bool needs_dep[BRW_MAX_MRF];
2577 assert(write_len < (int)sizeof(needs_dep) - 1);
2578
2579 memset(needs_dep, false, sizeof(needs_dep));
2580 memset(needs_dep, true, write_len);
2581
2582 clear_deps_for_inst_src(inst, dispatch_width,
2583 needs_dep, first_write_grf, write_len);
2584
2585 /* Walk backwards looking for writes to registers we're writing which
2586 * aren't read since being written. If we hit the start of the program,
2587 * we assume that there are no outstanding dependencies on entry to the
2588 * program.
2589 */
2590 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2591 scan_inst != NULL;
2592 scan_inst = (fs_inst *)scan_inst->prev) {
2593
2594 /* If we hit control flow, assume that there *are* outstanding
2595 * dependencies, and force their cleanup before our instruction.
2596 */
2597 if (scan_inst->is_control_flow()) {
2598 for (int i = 0; i < write_len; i++) {
2599 if (needs_dep[i]) {
2600 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2601 }
2602 }
2603 return;
2604 }
2605
2606 bool scan_inst_simd16 = (dispatch_width > 8 &&
2607 !scan_inst->force_uncompressed &&
2608 !scan_inst->force_sechalf);
2609
2610 /* We insert our reads as late as possible on the assumption that any
2611 * instruction but a MOV that might have left us an outstanding
2612 * dependency has more latency than a MOV.
2613 */
2614 if (scan_inst->dst.file == GRF) {
2615 for (int i = 0; i < scan_inst->regs_written; i++) {
2616 int reg = scan_inst->dst.reg + i * reg_size;
2617
2618 if (reg >= first_write_grf &&
2619 reg < first_write_grf + write_len &&
2620 needs_dep[reg - first_write_grf]) {
2621 inst->insert_before(DEP_RESOLVE_MOV(reg));
2622 needs_dep[reg - first_write_grf] = false;
2623 if (scan_inst_simd16)
2624 needs_dep[reg - first_write_grf + 1] = false;
2625 }
2626 }
2627 }
2628
2629 /* Clear the flag for registers that actually got read (as expected). */
2630 clear_deps_for_inst_src(scan_inst, dispatch_width,
2631 needs_dep, first_write_grf, write_len);
2632
2633 /* Continue the loop only if we haven't resolved all the dependencies */
2634 int i;
2635 for (i = 0; i < write_len; i++) {
2636 if (needs_dep[i])
2637 break;
2638 }
2639 if (i == write_len)
2640 return;
2641 }
2642 }
2643
2644 /**
2645 * Implements this workaround for the original 965:
2646 *
2647 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2648 * used as a destination register until after it has been sourced by an
2649 * instruction with a different destination register.
2650 */
2651 void
2652 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2653 {
2654 int write_len = inst->regs_written * dispatch_width / 8;
2655 int first_write_grf = inst->dst.reg;
2656 bool needs_dep[BRW_MAX_MRF];
2657 assert(write_len < (int)sizeof(needs_dep) - 1);
2658
2659 memset(needs_dep, false, sizeof(needs_dep));
2660 memset(needs_dep, true, write_len);
2661 /* Walk forwards looking for writes to registers we're writing which aren't
2662 * read before being written.
2663 */
2664 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2665 !scan_inst->is_tail_sentinel();
2666 scan_inst = (fs_inst *)scan_inst->next) {
2667 /* If we hit control flow, force resolve all remaining dependencies. */
2668 if (scan_inst->is_control_flow()) {
2669 for (int i = 0; i < write_len; i++) {
2670 if (needs_dep[i])
2671 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2672 }
2673 return;
2674 }
2675
2676 /* Clear the flag for registers that actually got read (as expected). */
2677 clear_deps_for_inst_src(scan_inst, dispatch_width,
2678 needs_dep, first_write_grf, write_len);
2679
2680 /* We insert our reads as late as possible since they're reading the
2681 * result of a SEND, which has massive latency.
2682 */
2683 if (scan_inst->dst.file == GRF &&
2684 scan_inst->dst.reg >= first_write_grf &&
2685 scan_inst->dst.reg < first_write_grf + write_len &&
2686 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2687 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2688 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2689 }
2690
2691 /* Continue the loop only if we haven't resolved all the dependencies */
2692 int i;
2693 for (i = 0; i < write_len; i++) {
2694 if (needs_dep[i])
2695 break;
2696 }
2697 if (i == write_len)
2698 return;
2699 }
2700
2701 /* If we hit the end of the program, resolve all remaining dependencies out
2702 * of paranoia.
2703 */
2704 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2705 assert(last_inst->eot);
2706 for (int i = 0; i < write_len; i++) {
2707 if (needs_dep[i])
2708 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2709 }
2710 }
2711
2712 void
2713 fs_visitor::insert_gen4_send_dependency_workarounds()
2714 {
2715 if (brw->gen != 4 || brw->is_g4x)
2716 return;
2717
2718 /* Note that we're done with register allocation, so GRF fs_regs always
2719 * have a .reg_offset of 0.
2720 */
2721
2722 foreach_list_safe(node, &this->instructions) {
2723 fs_inst *inst = (fs_inst *)node;
2724
2725 if (inst->mlen != 0 && inst->dst.file == GRF) {
2726 insert_gen4_pre_send_dependency_workarounds(inst);
2727 insert_gen4_post_send_dependency_workarounds(inst);
2728 }
2729 }
2730 }
2731
2732 /**
2733 * Turns the generic expression-style uniform pull constant load instruction
2734 * into a hardware-specific series of instructions for loading a pull
2735 * constant.
2736 *
2737 * The expression style allows the CSE pass before this to optimize out
2738 * repeated loads from the same offset, and gives the pre-register-allocation
2739 * scheduling full flexibility, while the conversion to native instructions
2740 * allows the post-register-allocation scheduler the best information
2741 * possible.
2742 *
2743 * Note that execution masking for setting up pull constant loads is special:
2744 * the channels that need to be written are unrelated to the current execution
2745 * mask, since a later instruction will use one of the result channels as a
2746 * source operand for all 8 or 16 of its channels.
2747 */
2748 void
2749 fs_visitor::lower_uniform_pull_constant_loads()
2750 {
2751 foreach_list(node, &this->instructions) {
2752 fs_inst *inst = (fs_inst *)node;
2753
2754 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2755 continue;
2756
2757 if (brw->gen >= 7) {
2758 /* The offset arg before was a vec4-aligned byte offset. We need to
2759 * turn it into a dword offset.
2760 */
2761 fs_reg const_offset_reg = inst->src[1];
2762 assert(const_offset_reg.file == IMM &&
2763 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2764 const_offset_reg.imm.u /= 4;
2765 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2766
2767 /* This is actually going to be a MOV, but since only the first dword
2768 * is accessed, we have a special opcode to do just that one. Note
2769 * that this needs to be an operation that will be considered a def
2770 * by live variable analysis, or register allocation will explode.
2771 */
2772 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2773 payload, const_offset_reg);
2774 setup->force_writemask_all = true;
2775
2776 setup->ir = inst->ir;
2777 setup->annotation = inst->annotation;
2778 inst->insert_before(setup);
2779
2780 /* Similarly, this will only populate the first 4 channels of the
2781 * result register (since we only use smear values from 0-3), but we
2782 * don't tell the optimizer.
2783 */
2784 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2785 inst->src[1] = payload;
2786
2787 invalidate_live_intervals();
2788 } else {
2789 /* Before register allocation, we didn't tell the scheduler about the
2790 * MRF we use. We know it's safe to use this MRF because nothing
2791 * else does except for register spill/unspill, which generates and
2792 * uses its MRF within a single IR instruction.
2793 */
2794 inst->base_mrf = 14;
2795 inst->mlen = 1;
2796 }
2797 }
2798 }
2799
2800 void
2801 fs_visitor::dump_instructions()
2802 {
2803 calculate_register_pressure();
2804
2805 int ip = 0, max_pressure = 0;
2806 foreach_list(node, &this->instructions) {
2807 backend_instruction *inst = (backend_instruction *)node;
2808 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2809 fprintf(stderr, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2810 dump_instruction(inst);
2811 ++ip;
2812 }
2813 fprintf(stderr, "Maximum %3d registers live at once.\n", max_pressure);
2814 }
2815
2816 void
2817 fs_visitor::dump_instruction(backend_instruction *be_inst)
2818 {
2819 fs_inst *inst = (fs_inst *)be_inst;
2820
2821 if (inst->predicate) {
2822 fprintf(stderr, "(%cf0.%d) ",
2823 inst->predicate_inverse ? '-' : '+',
2824 inst->flag_subreg);
2825 }
2826
2827 fprintf(stderr, "%s", brw_instruction_name(inst->opcode));
2828 if (inst->saturate)
2829 fprintf(stderr, ".sat");
2830 if (inst->conditional_mod) {
2831 fprintf(stderr, "%s", conditional_modifier[inst->conditional_mod]);
2832 if (!inst->predicate &&
2833 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2834 inst->opcode != BRW_OPCODE_IF &&
2835 inst->opcode != BRW_OPCODE_WHILE))) {
2836 fprintf(stderr, ".f0.%d", inst->flag_subreg);
2837 }
2838 }
2839 fprintf(stderr, " ");
2840
2841
2842 switch (inst->dst.file) {
2843 case GRF:
2844 fprintf(stderr, "vgrf%d", inst->dst.reg);
2845 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
2846 inst->dst.subreg_offset)
2847 fprintf(stderr, "+%d.%d",
2848 inst->dst.reg_offset, inst->dst.subreg_offset);
2849 break;
2850 case MRF:
2851 fprintf(stderr, "m%d", inst->dst.reg);
2852 break;
2853 case BAD_FILE:
2854 fprintf(stderr, "(null)");
2855 break;
2856 case UNIFORM:
2857 fprintf(stderr, "***u%d***", inst->dst.reg);
2858 break;
2859 case HW_REG:
2860 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2861 switch (inst->dst.fixed_hw_reg.nr) {
2862 case BRW_ARF_NULL:
2863 fprintf(stderr, "null");
2864 break;
2865 case BRW_ARF_ADDRESS:
2866 fprintf(stderr, "a0.%d", inst->dst.fixed_hw_reg.subnr);
2867 break;
2868 case BRW_ARF_ACCUMULATOR:
2869 fprintf(stderr, "acc%d", inst->dst.fixed_hw_reg.subnr);
2870 break;
2871 case BRW_ARF_FLAG:
2872 fprintf(stderr, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2873 inst->dst.fixed_hw_reg.subnr);
2874 break;
2875 default:
2876 fprintf(stderr, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2877 inst->dst.fixed_hw_reg.subnr);
2878 break;
2879 }
2880 } else {
2881 fprintf(stderr, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
2882 }
2883 if (inst->dst.fixed_hw_reg.subnr)
2884 fprintf(stderr, "+%d", inst->dst.fixed_hw_reg.subnr);
2885 break;
2886 default:
2887 fprintf(stderr, "???");
2888 break;
2889 }
2890 fprintf(stderr, ":%s, ", brw_reg_type_letters(inst->dst.type));
2891
2892 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
2893 if (inst->src[i].negate)
2894 fprintf(stderr, "-");
2895 if (inst->src[i].abs)
2896 fprintf(stderr, "|");
2897 switch (inst->src[i].file) {
2898 case GRF:
2899 fprintf(stderr, "vgrf%d", inst->src[i].reg);
2900 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2901 inst->src[i].subreg_offset)
2902 fprintf(stderr, "+%d.%d", inst->src[i].reg_offset,
2903 inst->src[i].subreg_offset);
2904 break;
2905 case MRF:
2906 fprintf(stderr, "***m%d***", inst->src[i].reg);
2907 break;
2908 case UNIFORM:
2909 fprintf(stderr, "u%d", inst->src[i].reg);
2910 if (inst->src[i].reladdr) {
2911 fprintf(stderr, "+reladdr");
2912 } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2913 inst->src[i].subreg_offset) {
2914 fprintf(stderr, "+%d.%d", inst->src[i].reg_offset,
2915 inst->src[i].subreg_offset);
2916 }
2917 break;
2918 case BAD_FILE:
2919 fprintf(stderr, "(null)");
2920 break;
2921 case IMM:
2922 switch (inst->src[i].type) {
2923 case BRW_REGISTER_TYPE_F:
2924 fprintf(stderr, "%ff", inst->src[i].imm.f);
2925 break;
2926 case BRW_REGISTER_TYPE_D:
2927 fprintf(stderr, "%dd", inst->src[i].imm.i);
2928 break;
2929 case BRW_REGISTER_TYPE_UD:
2930 fprintf(stderr, "%uu", inst->src[i].imm.u);
2931 break;
2932 default:
2933 fprintf(stderr, "???");
2934 break;
2935 }
2936 break;
2937 case HW_REG:
2938 if (inst->src[i].fixed_hw_reg.negate)
2939 fprintf(stderr, "-");
2940 if (inst->src[i].fixed_hw_reg.abs)
2941 fprintf(stderr, "|");
2942 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2943 switch (inst->src[i].fixed_hw_reg.nr) {
2944 case BRW_ARF_NULL:
2945 fprintf(stderr, "null");
2946 break;
2947 case BRW_ARF_ADDRESS:
2948 fprintf(stderr, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
2949 break;
2950 case BRW_ARF_ACCUMULATOR:
2951 fprintf(stderr, "acc%d", inst->src[i].fixed_hw_reg.subnr);
2952 break;
2953 case BRW_ARF_FLAG:
2954 fprintf(stderr, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2955 inst->src[i].fixed_hw_reg.subnr);
2956 break;
2957 default:
2958 fprintf(stderr, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2959 inst->src[i].fixed_hw_reg.subnr);
2960 break;
2961 }
2962 } else {
2963 fprintf(stderr, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2964 }
2965 if (inst->src[i].fixed_hw_reg.subnr)
2966 fprintf(stderr, "+%d", inst->src[i].fixed_hw_reg.subnr);
2967 if (inst->src[i].fixed_hw_reg.abs)
2968 fprintf(stderr, "|");
2969 break;
2970 default:
2971 fprintf(stderr, "???");
2972 break;
2973 }
2974 if (inst->src[i].abs)
2975 fprintf(stderr, "|");
2976
2977 if (inst->src[i].file != IMM) {
2978 fprintf(stderr, ":%s", brw_reg_type_letters(inst->src[i].type));
2979 }
2980
2981 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
2982 fprintf(stderr, ", ");
2983 }
2984
2985 fprintf(stderr, " ");
2986
2987 if (inst->force_uncompressed)
2988 fprintf(stderr, "1sthalf ");
2989
2990 if (inst->force_sechalf)
2991 fprintf(stderr, "2ndhalf ");
2992
2993 fprintf(stderr, "\n");
2994 }
2995
2996 /**
2997 * Possibly returns an instruction that set up @param reg.
2998 *
2999 * Sometimes we want to take the result of some expression/variable
3000 * dereference tree and rewrite the instruction generating the result
3001 * of the tree. When processing the tree, we know that the
3002 * instructions generated are all writing temporaries that are dead
3003 * outside of this tree. So, if we have some instructions that write
3004 * a temporary, we're free to point that temp write somewhere else.
3005 *
3006 * Note that this doesn't guarantee that the instruction generated
3007 * only reg -- it might be the size=4 destination of a texture instruction.
3008 */
3009 fs_inst *
3010 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3011 fs_inst *end,
3012 const fs_reg &reg)
3013 {
3014 if (end == start ||
3015 end->is_partial_write() ||
3016 reg.reladdr ||
3017 !reg.equals(end->dst)) {
3018 return NULL;
3019 } else {
3020 return end;
3021 }
3022 }
3023
3024 void
3025 fs_visitor::setup_payload_gen6()
3026 {
3027 bool uses_depth =
3028 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3029 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3030
3031 assert(brw->gen >= 6);
3032
3033 /* R0-1: masks, pixel X/Y coordinates. */
3034 c->nr_payload_regs = 2;
3035 /* R2: only for 32-pixel dispatch. */
3036
3037 /* R3-26: barycentric interpolation coordinates. These appear in the
3038 * same order that they appear in the brw_wm_barycentric_interp_mode
3039 * enum. Each set of coordinates occupies 2 registers if dispatch width
3040 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3041 * appear if they were enabled using the "Barycentric Interpolation
3042 * Mode" bits in WM_STATE.
3043 */
3044 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3045 if (barycentric_interp_modes & (1 << i)) {
3046 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3047 c->nr_payload_regs += 2;
3048 if (dispatch_width == 16) {
3049 c->nr_payload_regs += 2;
3050 }
3051 }
3052 }
3053
3054 /* R27: interpolated depth if uses source depth */
3055 if (uses_depth) {
3056 c->source_depth_reg = c->nr_payload_regs;
3057 c->nr_payload_regs++;
3058 if (dispatch_width == 16) {
3059 /* R28: interpolated depth if not SIMD8. */
3060 c->nr_payload_regs++;
3061 }
3062 }
3063 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3064 if (uses_depth) {
3065 c->source_w_reg = c->nr_payload_regs;
3066 c->nr_payload_regs++;
3067 if (dispatch_width == 16) {
3068 /* R30: interpolated W if not SIMD8. */
3069 c->nr_payload_regs++;
3070 }
3071 }
3072
3073 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3074 /* R31: MSAA position offsets. */
3075 if (c->prog_data.uses_pos_offset) {
3076 c->sample_pos_reg = c->nr_payload_regs;
3077 c->nr_payload_regs++;
3078 }
3079
3080 /* R32: MSAA input coverage mask */
3081 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3082 assert(brw->gen >= 7);
3083 c->sample_mask_reg = c->nr_payload_regs;
3084 c->nr_payload_regs++;
3085 if (dispatch_width == 16) {
3086 /* R33: input coverage mask if not SIMD8. */
3087 c->nr_payload_regs++;
3088 }
3089 }
3090
3091 /* R34-: bary for 32-pixel. */
3092 /* R58-59: interp W for 32-pixel. */
3093
3094 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3095 c->source_depth_to_render_target = true;
3096 }
3097 }
3098
3099 void
3100 fs_visitor::assign_binding_table_offsets()
3101 {
3102 uint32_t next_binding_table_offset = 0;
3103
3104 /* If there are no color regions, we still perform an FB write to a null
3105 * renderbuffer, which we place at surface index 0.
3106 */
3107 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3108 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3109
3110 assign_common_binding_table_offsets(next_binding_table_offset);
3111 }
3112
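/**
 * Computes, for each instruction IP, how many GRFs worth of virtual
 * registers are live there, storing the totals in regs_live_at_ip[].
 */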
3113 void
3114 fs_visitor::calculate_register_pressure()
3115 {
3116 invalidate_live_intervals();
3117 calculate_live_intervals();
3118
3119 int num_instructions = 0;
3120 foreach_list(node, &this->instructions) {
3121 ++num_instructions;
3122 }
3123
3124 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3125
3126 for (int reg = 0; reg < virtual_grf_count; reg++) {
3127 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3128 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3129 }
3130 }
3131
3132 /**
3133 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3134 *
3135 * The needs_unlit_centroid_workaround ends up producing one of these per
3136 * channel of centroid input, so it's good to clean them up.
3137 *
3138 * An assumption here is that nothing ever modifies the dispatched pixels
3139 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3140 * dictates that anyway.
3141 */
3142 void
3143 fs_visitor::opt_drop_redundant_mov_to_flags()
3144 {
3145 bool flag_mov_found[2] = {false};
3146
3147 foreach_list_safe(node, &this->instructions) {
3148 fs_inst *inst = (fs_inst *)node;
3149
3150 if (inst->is_control_flow()) {
3151 memset(flag_mov_found, 0, sizeof(flag_mov_found));
3152 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3153 if (!flag_mov_found[inst->flag_subreg])
3154 flag_mov_found[inst->flag_subreg] = true;
3155 else
3156 inst->remove();
3157 } else if (inst->writes_flag()) {
3158 flag_mov_found[inst->flag_subreg] = false;
3159 }
3160 }
3161 }
3162
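/**
 * Drives the compile for one dispatch width: emits the FS IR, runs the
 * optimization loop to a fixed point, and register allocates, trying each
 * scheduling heuristic before falling back to spilling.
 */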
3163 bool
3164 fs_visitor::run()
3165 {
3166 sanity_param_count = fp->Base.Parameters->NumParameters;
3167 bool allocated_without_spills;
3168
3169 assign_binding_table_offsets();
3170
3171 if (brw->gen >= 6)
3172 setup_payload_gen6();
3173 else
3174 setup_payload_gen4();
3175
3176 if (0) {
3177 emit_dummy_fs();
3178 } else {
3179 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3180 emit_shader_time_begin();
3181
3182 calculate_urb_setup();
3183 if (fp->Base.InputsRead > 0) {
3184 if (brw->gen < 6)
3185 emit_interpolation_setup_gen4();
3186 else
3187 emit_interpolation_setup_gen6();
3188 }
3189
3190 /* We handle discards by keeping track of the still-live pixels in f0.1.
3191 * Initialize it with the dispatched pixels.
3192 */
3193 if (fp->UsesKill || c->key.alpha_test_func) {
3194 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3195 discard_init->flag_subreg = 1;
3196 }
3197
3198 /* Generate FS IR for main(). (The visitor only descends into
3199 * functions called "main".)
3200 */
3201 if (shader) {
3202 foreach_list(node, &*shader->base.ir) {
3203 ir_instruction *ir = (ir_instruction *)node;
3204 base_ir = ir;
3205 this->result = reg_undef;
3206 ir->accept(this);
3207 }
3208 } else {
3209 emit_fragment_program_code();
3210 }
3211 base_ir = NULL;
3212 if (failed)
3213 return false;
3214
3215 emit(FS_OPCODE_PLACEHOLDER_HALT);
3216
3217 if (c->key.alpha_test_func)
3218 emit_alpha_test();
3219
3220 emit_fb_writes();
3221
3222 split_virtual_grfs();
3223
3224 move_uniform_array_access_to_pull_constants();
3225 assign_constant_locations();
3226 demote_pull_constants();
3227
3228 opt_drop_redundant_mov_to_flags();
3229
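/* Run the optimization passes in a loop until none of them makes any
 * further progress.
 */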
3230 bool progress;
3231 do {
3232 progress = false;
3233
3234 compact_virtual_grfs();
3235
3236 progress = remove_duplicate_mrf_writes() || progress;
3237
3238 progress = opt_algebraic() || progress;
3239 progress = opt_cse() || progress;
3240 progress = opt_copy_propagate() || progress;
3241 progress = opt_peephole_predicated_break() || progress;
3242 progress = dead_code_eliminate() || progress;
3243 progress = dead_code_eliminate_local() || progress;
3244 progress = opt_peephole_sel() || progress;
3245 progress = dead_control_flow_eliminate(this) || progress;
3246 progress = opt_saturate_propagation() || progress;
3247 progress = register_coalesce() || progress;
3248 progress = compute_to_mrf() || progress;
3249 } while (progress);
3250
3251 lower_uniform_pull_constant_loads();
3252
3253 assign_curb_setup();
3254 assign_urb_setup();
3255
3256 static enum instruction_scheduler_mode pre_modes[] = {
3257 SCHEDULE_PRE,
3258 SCHEDULE_PRE_NON_LIFO,
3259 SCHEDULE_PRE_LIFO,
3260 };
3261
3262 /* Try each scheduling heuristic to see if it can successfully register
3263 * allocate without spilling. They should be ordered by decreasing
3264 * performance but increasing likelihood of allocating.
3265 */
3266 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3267 schedule_instructions(pre_modes[i]);
3268
3269 if (0) {
3270 assign_regs_trivial();
3271 allocated_without_spills = true;
3272 } else {
3273 allocated_without_spills = assign_regs(false);
3274 }
3275 if (allocated_without_spills)
3276 break;
3277 }
3278
3279 if (!allocated_without_spills) {
3280 /* We assume that any spilling is worse than just dropping back to
3281 * SIMD8. There's probably actually some intermediate point where
3282 * SIMD16 with a couple of spills is still better.
3283 */
3284 if (dispatch_width == 16) {
3285 fail("Failure to register allocate. Reduce number of "
3286 "live scalar values to avoid this.");
3287 }
3288
3289 /* Since we're out of heuristics, just go spill registers until we
3290 * get an allocation.
3291 */
3292 while (!assign_regs(true)) {
3293 if (failed)
3294 break;
3295 }
3296 }
3297 }
3298 assert(force_uncompressed_stack == 0);
3299
3300 /* This must come after all optimization and register allocation, since
3301 * it inserts dead code that happens to have side effects, and it does
3302 * so based on the actual physical registers in use.
3303 */
3304 insert_gen4_send_dependency_workarounds();
3305
3306 if (failed)
3307 return false;
3308
3309 if (!allocated_without_spills)
3310 schedule_instructions(SCHEDULE_POST);
3311
3312 if (dispatch_width == 8)
3313 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3314 else
3315 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3316
3317 /* If any state parameters were appended, then ParameterValues could have
3318 * been realloced, in which case the driver uniform storage set up by
3319 * _mesa_associate_uniform_storage() would point to freed memory. Make
3320 * sure that didn't happen.
3321 */
3322 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3323
3324 return !failed;
3325 }
3326
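/**
 * Compiles a fragment shader: always performs a SIMD8 compile, additionally
 * attempts SIMD16 on gen5+ when supported and not disabled, and hands both
 * instruction lists to the generator to produce the final assembly.
 */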
3327 const unsigned *
3328 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3329 struct gl_fragment_program *fp,
3330 struct gl_shader_program *prog,
3331 unsigned *final_assembly_size)
3332 {
3333 bool start_busy = false;
3334 double start_time = 0;
3335
3336 if (unlikely(brw->perf_debug)) {
3337 start_busy = (brw->batch.last_bo &&
3338 drm_intel_bo_busy(brw->batch.last_bo));
3339 start_time = get_time();
3340 }
3341
3342 struct brw_shader *shader = NULL;
3343 if (prog)
3344 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3345
3346 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3347 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3348
3349 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3350 */
3351 fs_visitor v(brw, c, prog, fp, 8);
3352 if (!v.run()) {
3353 if (prog) {
3354 prog->LinkStatus = false;
3355 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3356 }
3357
3358 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3359 v.fail_msg);
3360
3361 return NULL;
3362 }
3363
3364 exec_list *simd16_instructions = NULL;
3365 fs_visitor v2(brw, c, prog, fp, 16);
3366 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3367 if (!v.simd16_unsupported) {
3368 /* Try a SIMD16 compile */
3369 v2.import_uniforms(&v);
3370 if (!v2.run()) {
3371 perf_debug("SIMD16 shader failed to compile, falling back to "
3372 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3373 } else {
3374 simd16_instructions = &v2.instructions;
3375 }
3376 } else {
3377 perf_debug("SIMD16 shader unsupported, falling back to "
3378 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3379 }
3380 }
3381
3382 const unsigned *assembly = NULL;
3383 if (brw->gen >= 8) {
3384 gen8_fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3385 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3386 final_assembly_size);
3387 } else {
3388 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3389 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3390 final_assembly_size);
3391 }
3392
3393 if (unlikely(brw->perf_debug) && shader) {
3394 if (shader->compiled_once)
3395 brw_wm_debug_recompile(brw, prog, &c->key);
3396 shader->compiled_once = true;
3397
3398 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3399 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3400 (get_time() - start_time) * 1000);
3401 }
3402 }
3403
3404 return assembly;
3405 }
3406
3407 bool
3408 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3409 {
3410 struct brw_context *brw = brw_context(ctx);
3411 struct brw_wm_prog_key key;
3412
3413 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3414 return true;
3415
3416 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3417 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3418 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3419 bool program_uses_dfdy = fp->UsesDFdy;
3420
3421 memset(&key, 0, sizeof(key));
3422
3423 if (brw->gen < 6) {
3424 if (fp->UsesKill)
3425 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3426
3427 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3428 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3429
3430 /* Just assume depth testing. */
3431 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3432 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3433 }
3434
3435 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3436 BRW_FS_VARYING_INPUT_MASK) > 16)
3437 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3438
3439 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3440
3441 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3442 for (unsigned i = 0; i < sampler_count; i++) {
3443 if (fp->Base.ShadowSamplers & (1 << i)) {
3444 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3445 key.tex.swizzles[i] =
3446 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3447 } else {
3448 /* Color sampler: assume no swizzling. */
3449 key.tex.swizzles[i] = SWIZZLE_XYZW;
3450 }
3451 }
3452
3453 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3454 key.drawable_height = ctx->DrawBuffer->Height;
3455 }
3456
3457 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3458 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3459 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3460
3461 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3462 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3463 key.nr_color_regions > 1;
3464 }
3465
3466 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3467 * quality of the derivatives is likely to be determined by the driconf
3468 * option.
3469 */
3470 key.high_quality_derivatives = brw->disable_derivative_optimization;
3471
3472 key.program_string_id = bfp->id;
3473
3474 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3475 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3476
3477 bool success = do_wm_prog(brw, prog, bfp, &key);
3478
3479 brw->wm.base.prog_offset = old_prog_offset;
3480 brw->wm.prog_data = old_prog_data;
3481
3482 return success;
3483 }