i965/fs: Track whether we're doing dual source in a more obvious way.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
/* Reset an fs_inst to a known-clean default state.  Called from every
 * constructor before opcode/operand fields are filled in.
 */
void
fs_inst::init()
{
   /* Zero every field; most flags (saturate, predicate, etc.) default to 0. */
   memset(this, 0, sizeof(*this));
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   /* Mark destination and all three sources as unset. */
   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;

   /* This will be the case for almost all instructions. */
   this->regs_written = 1;
}
68
/* Default constructor: a NOP with no operands. */
fs_inst::fs_inst()
{
   init();
   this->opcode = BRW_OPCODE_NOP;
}
74
/* Opcode-only constructor (no destination or sources). */
fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}
80
/* Opcode + destination constructor. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   /* Virtual GRF destinations must have a sane register offset. */
   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}
90
/* One-source instruction constructor. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   /* Sanity-check offsets of any virtual GRF operands. */
   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}
103
/* Two-source instruction constructor. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   /* Sanity-check offsets of any virtual GRF operands. */
   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}
119
/* Three-source instruction constructor (MAD, LRP, BFE, ...). */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   /* Sanity-check offsets of any virtual GRF operands. */
   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}
139
/* Factory helpers: define fs_visitor::<OP>() methods that allocate (but do
 * not emit) an fs_inst for the matching BRW_OPCODE_<OP>, for one-, two- and
 * three-source ALU instructions respectively.
 */
#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2(ADDC)
ALU2(SUBB)
ALU2(SEL)
188
/** Gen4 predicated IF: branches based on an already-computed predicate. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}
197
/** Gen6 IF with embedded comparison: compares src0 to src1 with the given
 * conditional mod and branches on the result in a single instruction.
 * Only gen6 supports this form.
 */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(brw->gen == 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}
208
/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (brw->gen == 4) {
      dst.type = src0.type;
      /* Keep the fixed hardware register's type in sync with the fs_reg. */
      if (dst.file == HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   /* Negation of UD sources doesn't behave as arithmetic negate; resolve it
    * into a separate instruction first.
    */
   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
245
/* Build (but do not emit) the instruction sequence for a pull-constant load
 * whose offset varies at run time.  Returns the list of instructions; the
 * caller decides where to insert them.
 *
 * dst:            where the loaded component ends up
 * surf_index:     surface binding table index of the constant buffer
 * varying_offset: per-channel byte offset into the buffer
 * const_offset:   compile-time constant addend to that offset
 */
exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
                                       const fs_reg &surf_index,
                                       const fs_reg &varying_offset,
                                       uint32_t const_offset)
{
   exec_list instructions;
   fs_inst *inst;

   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable
    * offset and a portion done using reg_offset, which means that if you
    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
    * CSE can later notice that those loads are all the same and eliminate
    * the redundant ones.
    */
   fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
   instructions.push_tail(ADD(vec4_offset,
                              varying_offset, const_offset & ~3));

   int scale = 1;
   if (brw->gen == 4 && dispatch_width == 8) {
      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
       * u, v, r) as parameters, or we can just use the SIMD16 message
       * consisting of (header, u).  We choose the second, at the cost of a
       * longer return length.
       */
      scale = 2;
   }

   /* Gen7+ has a send-from-GRF variant that needs no MRF/header setup. */
   enum opcode op;
   if (brw->gen >= 7)
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   else
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
   fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
   inst->regs_written = 4 * scale;
   instructions.push_tail(inst);

   if (brw->gen < 7) {
      /* Legacy message needs an MRF base and a header; message length
       * depends on gen and dispatch width.
       */
      inst->base_mrf = 13;
      inst->header_present = true;
      if (brw->gen == 4)
         inst->mlen = 3;
      else
         inst->mlen = 1 + dispatch_width / 8;
   }

   /* Pick the requested component out of the 4 loaded ones. */
   vec4_result.reg_offset += (const_offset & 3) * scale;
   instructions.push_tail(MOV(dst, vec4_result));

   return instructions;
}
304
/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 *
 * Reads the given GRF into the null register so the hardware sees a
 * dependency on it; the MOV itself has no other effect.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   /* Not associated with any source IR; annotate for debug dumps. */
   inst->ir = NULL;
   inst->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   inst->force_uncompressed = true;

   return inst;
}
324
/* Field-by-field equality between two instructions, used by passes like CSE.
 * Compares opcode, operands and the message/control fields that affect the
 * generated instruction; fields not listed here are not considered.
 */
bool
fs_inst::equals(fs_inst *inst) const
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}
345
346 bool
347 fs_inst::overwrites_reg(const fs_reg &reg) const
348 {
349 return (reg.file == dst.file &&
350 reg.reg == dst.reg &&
351 reg.reg_offset >= dst.reg_offset &&
352 reg.reg_offset < dst.reg_offset + regs_written);
353 }
354
/* True for send-type instructions whose message payload lives in the GRF
 * rather than in MRFs.
 */
bool
fs_inst::is_send_from_grf() const
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF) ||
           (is_tex() && src[0].file == GRF));
}
364
365 bool
366 fs_visitor::can_do_source_mods(fs_inst *inst)
367 {
368 if (brw->gen == 6 && inst->is_math())
369 return false;
370
371 if (inst->is_send_from_grf())
372 return false;
373
374 if (!inst->can_do_source_mods())
375 return false;
376
377 return true;
378 }
379
/* Reset an fs_reg to its default state; stride defaults to 1 (contiguous). */
void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   stride = 1;
}
386
/** Generic unset register constructor: produces a BAD_FILE register. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}
393
/** Immediate value constructor (32-bit float). */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}
402
/** Immediate value constructor (signed 32-bit integer). */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}
411
/** Immediate value constructor (unsigned 32-bit integer). */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}
420
/** Fixed brw_reg: wraps a concrete hardware register; type follows it. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}
429
/* Structural equality between two registers.  Any register with a reladdr
 * (indirect addressing) is never considered equal to anything.  imm is a
 * union, so comparing imm.u covers all immediate representations.
 */
bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           subreg_offset == r.subreg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           stride == r.stride &&
           imm.u == r.imm.u);
}
446
/* Multiply this register's stride by the given factor.  Only valid on
 * virtual registers (not HW_REG/IMM), and the resulting stride must stay
 * a power of two no greater than 4 (or 0).  Returns *this for chaining.
 */
fs_reg &
fs_reg::apply_stride(unsigned stride)
{
   assert((this->stride * stride) <= 4 &&
          (is_power_of_two(stride) || stride == 0) &&
          file != HW_REG && file != IMM);
   this->stride *= stride;
   return *this;
}
456
/* Make every channel read the single component 'subreg': sets the byte
 * subreg_offset for that component and a stride of 0 (replicate).  Only
 * valid on virtual registers.  Returns *this for chaining.
 */
fs_reg &
fs_reg::set_smear(unsigned subreg)
{
   assert(file != HW_REG && file != IMM);
   subreg_offset = subreg * type_sz(type);
   stride = 0;
   return *this;
}
465
/* Whether consecutive channels occupy consecutive locations (stride 1). */
bool
fs_reg::is_contiguous() const
{
   return stride == 1;
}
471
472 bool
473 fs_reg::is_zero() const
474 {
475 if (file != IMM)
476 return false;
477
478 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
479 }
480
481 bool
482 fs_reg::is_one() const
483 {
484 if (file != IMM)
485 return false;
486
487 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
488 }
489
490 bool
491 fs_reg::is_null() const
492 {
493 return file == HW_REG &&
494 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
495 fixed_hw_reg.nr == BRW_ARF_NULL;
496 }
497
/* Whether this register may be used as a source of a 3-source instruction
 * (only GRF and UNIFORM files qualify).
 */
bool
fs_reg::is_valid_3src() const
{
   return file == GRF || file == UNIFORM;
}
503
/* Returns the number of scalar components ("slots") a GLSL type occupies
 * in our virtual register allocation.
 */
int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      /* Scalars and vectors: one slot per component. */
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      /* Sum of all member sizes. */
      size = 0;
      for (i = 0; i < type->length; i++) {
	 size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_ATOMIC_UINT:
      /* Atomic counters live in buffer objects, not registers. */
      return 0;
   case GLSL_TYPE_IMAGE:
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(!"not reached");
      break;
   }

   return 0;
}
540
/* Read the GPU timestamp ARF into a fresh virtual register and return a
 * register smeared to its low 32 bits.  Gen7+ only.
 */
fs_reg
fs_visitor::get_timestamp()
{
   assert(brw->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.set_smear(0);

   return dst;
}
574
/* Snapshot the timestamp at shader entry; emit_shader_time_end() later
 * subtracts it to compute elapsed time.
 */
void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}
581
/* Emit the end-of-shader timing code: read the timestamp again, verify no
 * timestamp reset happened in between, and accumulate either the elapsed
 * time (plus a "written" count) or a "reset" count into the shader-time
 * buffer.
 */
void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   /* Pick the per-dispatch-width record types so SIMD8 and SIMD16 runs are
    * accumulated separately.
    */
   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.set_smear(2);
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   /* diff = end - start, computed as end + (-start). */
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}
630
/* Emit a SHADER_TIME_ADD that atomically accumulates 'value' into the
 * shader-time buffer slot for the given record type.
 */
void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   int shader_time_index =
      brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
   /* Byte offset of this record within the shader-time buffer. */
   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);

   /* Scratch payload register; SIMD8 needs two registers' worth.
    * NOTE(review): payload sizing presumably matches what the generator
    * expects for SHADER_TIME_ADD -- confirm against the opcode's lowering.
    */
   fs_reg payload;
   if (dispatch_width == 8)
      payload = fs_reg(this, glsl_type::uvec2_type);
   else
      payload = fs_reg(this, glsl_type::uint_type);

   emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
                             fs_reg(), payload, offset, value));
}
648
/* Mark the compile as failed and record a formatted message.  Only the
 * first failure is recorded; subsequent calls are no-ops.
 */
void
fs_visitor::vfail(const char *format, va_list va)
{
   char *msg;

   if (failed)
      return;

   failed = true;

   msg = ralloc_vasprintf(mem_ctx, format, va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   /* Echo to stderr when WM debugging is enabled. */
   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}
668
669 void
670 fs_visitor::fail(const char *format, ...)
671 {
672 va_list va;
673
674 va_start(va, format);
675 vfail(format, va);
676 va_end(va);
677 }
678
/**
 * Mark this program as impossible to compile in SIMD16 mode.
 *
 * During the SIMD8 compile (which happens first), we can detect and flag
 * things that are unsupported in SIMD16 mode, so the compiler can skip
 * the SIMD16 compile altogether.
 *
 * During a SIMD16 compile (if one happens anyway), this just calls fail().
 */
void
fs_visitor::no16(const char *format, ...)
{
   va_list va;

   va_start(va, format);

   if (dispatch_width == 16) {
      vfail(format, va);
   } else {
      simd16_unsupported = true;

      /* Collect the reason(s) for skipping SIMD16 so perf_debug can report
       * them; append if several features are flagged.
       */
      if (brw->perf_debug) {
         if (no16_msg)
            ralloc_vasprintf_append(&no16_msg, format, va);
         else
            no16_msg = ralloc_vasprintf(mem_ctx, format, va);
      }
   }

   va_end(va);
}
710
/* Allocate and emit an instruction with no operands. */
fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(new(mem_ctx) fs_inst(opcode));
}
716
/* Allocate and emit an instruction with only a destination. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst));
}
722
/* Allocate and emit a one-source instruction. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
}
728
/* Allocate and emit a two-source instruction. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
}
734
/* Allocate and emit a three-source instruction. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
}
741
/* Enter a region where all emitted instructions are forced uncompressed
 * (SIMD8).  Nestable; balanced by pop_force_uncompressed().
 */
void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}
747
/* Leave a force-uncompressed region opened by push_force_uncompressed(). */
void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   /* Catch unbalanced push/pop pairs. */
   assert(force_uncompressed_stack >= 0);
}
754
/**
 * Returns true if the instruction has a flag that means it won't
 * update an entire destination register.
 *
 * For example, dead code elimination and live variable analysis want to know
 * when a write to a variable screens off any preceding values that were in
 * it.
 */
bool
fs_inst::is_partial_write() const
{
   /* Predicated writes (other than SEL, which always writes every channel),
    * half-width writes, and strided destinations all leave some of the
    * destination untouched.
    */
   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
           this->force_uncompressed ||
           this->force_sechalf || !this->dst.is_contiguous());
}
770
771 int
772 fs_inst::regs_read(fs_visitor *v, int arg) const
773 {
774 if (is_tex() && arg == 0 && src[0].file == GRF) {
775 if (v->dispatch_width == 16)
776 return (mlen + 1) / 2;
777 else
778 return mlen;
779 }
780 return 1;
781 }
782
/* Whether this instruction reads the flag register (i.e. is predicated). */
bool
fs_inst::reads_flag() const
{
   return predicate;
}
788
789 bool
790 fs_inst::writes_flag() const
791 {
792 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
793 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
794 }
795
/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   /* No message means no MRF payload at all. */
   if (inst->mlen == 0)
      return 0;

   /* base_mrf == -1 means the payload is in the GRF (send-from-GRF). */
   if (inst->base_mrf == -1)
      return 0;

   switch (inst->opcode) {
   /* Unary math: one payload register per SIMD8 slice. */
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   /* Binary math: two payload registers per SIMD8 slice. */
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->mlen;
   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      return 2;
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
      return 0;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}
853
/* Allocate a new virtual GRF of 'size' registers, growing the size table as
 * needed (doubling strategy).  Returns the new virtual register number.
 */
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}
868
/** File + register-number constructor; type defaults to float. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}
877
/** File + register-number constructor with an explicit register type. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}
886
/** Automatic reg constructor: allocates a fresh virtual GRF sized for the
 * given GLSL type, with the matching hardware register type.
 */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}
897
/* Look up the fs_reg previously assigned to an IR variable, or NULL if the
 * variable has no storage yet.
 */
fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}
903
/* hash_table_call_foreach callback: copy a (variable -> fs_reg) entry into
 * the destination table, but only for UNIFORM registers.
 */
void
import_uniforms_callback(const void *key,
			 void *data,
			 void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}
917
/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 * This brings in those uniform definitions, along with the push/pull
 * constant layout decisions already made by the SIMD8 compile.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
			   import_uniforms_callback,
			   variable_ht);
   this->push_constant_loc = v->push_constant_loc;
   this->pull_constant_loc = v->pull_constant_loc;
   this->uniforms = v->uniforms;
   this->param_size = v->param_size;
}
932
/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 *
 * Appends a stage_prog_data->param pointer for every scalar slot of the
 * given uniform variable, in declaration order.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = uniforms;
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      /* Match "name", "name.field" or "name[i]" but not e.g. "nameother". */
      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         stage_prog_data->param[uniforms++] = &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() == uniforms);
   (void)params_before;
}
973
974
/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
					    (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
	 int swiz = GET_SWZ(slots[i].swizzle, j);
	 /* A repeated swizzle marks the end of the unique components. */
	 if (swiz == last_swiz)
	    break;
	 last_swiz = swiz;

	 stage_prog_data->param[uniforms++] =
	    &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}
1008
/* Emit the code that computes gl_FragCoord (x, y, z, w) into a fresh
 * register, handling the pixel-center-integer and origin-upper-left layout
 * qualifiers and window-system vs. FBO Y orientation.
 */
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   /* Y must be flipped when the shader's origin convention disagrees with
    * the render target's.
    */
   bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->data.pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      /* Default convention: pixel centers are at half-integer positions. */
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->data.pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
	 /* y' = (height - 1) - y, folded into a negate plus offset. */
	 pixel_y.negate = true;
	 offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (brw->gen >= 6) {
      /* Gen6+ delivers source depth in the thread payload. */
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      /* Older gens interpolate Z from the position setup data. */
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(VARYING_SLOT_POS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}
1056
/* Emit a LINTERP of the given setup-data register into 'attr', choosing the
 * barycentric coordinate set that matches the interpolation qualifier and
 * centroid/sample mode.  Returns the emitted instruction.
 */
fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid, bool is_sample)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (brw->gen >= 6) {
      if (is_centroid) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
      } else if (is_sample) {
          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
             barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
          else
             barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
      } else {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
      }
   } else {
      /* On Ironlake and below, there is only one interpolation mode.
       * Centroid interpolation doesn't mean anything on this hardware --
       * there is no multisampling.
       */
      barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}
1091
/* Emit interpolation for an arbitrary fragment shader input variable
 * (scalar, vector, matrix or array thereof), walking its components slot by
 * slot.  Returns the register holding the interpolated value.
 */
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
	 fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->data.location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
	 if (c->prog_data.urb_setup[location] == -1) {
	    /* If there's no incoming setup data for this slot, don't
	     * emit interpolation for it.
	     */
	    attr.reg_offset += type->vector_elements;
	    location++;
	    continue;
	 }

	 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
	    /* Constant interpolation (flat shading) case.  The SF has
	     * handed us defined values in only the constant offset
	     * field of the setup reg.
	     */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       struct brw_reg interp = interp_reg(location, k);
	       interp = suboffset(interp, 3);
	       interp.type = reg->type;
	       emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
	       attr.reg_offset++;
	    }
	 } else {
	    /* Smooth/noperspective interpolation case. */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       struct brw_reg interp = interp_reg(location, k);
	       emit_linterp(attr, fs_reg(interp), interpolation_mode,
			    ir->data.centroid && !c->key.persample_shading,
			    ir->data.sample || c->key.persample_shading);
               if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
                  /* Get the pixel/sample mask into f0 so that we know
                   * which pixels are lit.  Then, for each channel that is
                   * unlit, replace the centroid data with non-centroid
                   * data.
                   */
                  emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
                  fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                               interpolation_mode,
                                               false, false);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = true;
               }
	       if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
		  /* Pre-gen6 delivers attributes divided by W; undo that. */
		  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
	       }
	       attr.reg_offset++;
	    }

	 }
	 location++;
      }
   }

   return reg;
}
1173
/**
 * Set up a register holding the gl_FrontFacing built-in (1 for front-facing,
 * 0 for back-facing, per channel).
 */
fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (brw->gen >= 6) {
      /* Arithmetic-shift bit 15 of g0.0 (the back-facing bit per the
       * payload layout) into the low bit, then invert and mask so the
       * result is 1 exactly when that bit was 0.
       */
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}
1197
1198 void
1199 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1200 {
1201 assert(dst.type == BRW_REGISTER_TYPE_F);
1202
1203 if (c->key.compute_pos_offset) {
1204 /* Convert int_sample_pos to floating point */
1205 emit(MOV(dst, int_sample_pos));
1206 /* Scale to the range [0, 1] */
1207 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1208 }
1209 else {
1210 /* From ARB_sample_shading specification:
1211 * "When rendering to a non-multisample buffer, or if multisample
1212 * rasterization is disabled, gl_SamplePosition will always be
1213 * (0.5, 0.5).
1214 */
1215 emit(MOV(dst, fs_reg(0.5f)));
1216 }
1217 }
1218
/**
 * Set up the gl_SamplePosition vec2 built-in.
 *
 * Reads the per-sample X/Y byte offsets out of the thread payload and hands
 * them to compute_sample_position() for conversion to float.
 */
fs_reg *
fs_visitor::emit_samplepos_setup(ir_variable *ir)
{
   assert(brw->gen >= 6);
   assert(ir->type == glsl_type::vec2_type);

   this->current_annotation = "compute sample position";
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg pos = *reg;
   fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
   fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);

   /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
    * mode will be enabled.
    *
    * From the Ivy Bridge PRM, volume 2 part 1, page 344:
    * R31.1:0         Position Offset X/Y for Slot[3:0]
    * R31.3:2         Position Offset X/Y for Slot[7:4]
    * .....
    *
    * The X, Y sample positions come in as bytes in thread payload. So, read
    * the positions using vstride=16, width=8, hstride=2.
    */
   struct brw_reg sample_pos_reg =
      stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
                    BRW_REGISTER_TYPE_B), 16, 8, 2);

   emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
   if (dispatch_width == 16) {
      /* The second 8 channels of a SIMD16 thread read the next 16 payload
       * bytes; force the MOV to execute on the second half.
       */
      fs_inst *inst = emit(MOV(half(int_sample_x, 1),
                               fs_reg(suboffset(sample_pos_reg, 16))));
      inst->force_sechalf = true;
   }
   /* Compute gl_SamplePosition.x */
   compute_sample_position(pos, int_sample_x);
   pos.reg_offset++;
   /* The Y bytes are interleaved immediately after the X bytes, hence the
    * suboffset of 1.
    */
   emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
   if (dispatch_width == 16) {
      fs_inst *inst = emit(MOV(half(int_sample_y, 1),
                               fs_reg(suboffset(sample_pos_reg, 17))));
      inst->force_sechalf = true;
   }
   /* Compute gl_SamplePosition.y */
   compute_sample_position(pos, int_sample_y);
   return reg;
}
1265
/**
 * Set up the gl_SampleID built-in.
 *
 * When the key requests per-sample dispatch (compute_sample_id), derive the
 * ID from the R0.0 payload bits; otherwise it is constant zero as required
 * by ARB_sample_shading.
 */
fs_reg *
fs_visitor::emit_sampleid_setup(ir_variable *ir)
{
   assert(brw->gen >= 6);

   this->current_annotation = "compute sample id";
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   if (c->key.compute_sample_id) {
      fs_reg t1 = fs_reg(this, glsl_type::int_type);
      fs_reg t2 = fs_reg(this, glsl_type::int_type);
      t2.type = BRW_REGISTER_TYPE_UW;

      /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
       * 8x multisampling, subspan 0 will represent sample N (where N
       * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
       * 7. We can find the value of N by looking at R0.0 bits 7:6
       * ("Starting Sample Pair Index (SSPI)") and multiplying by two
       * (since samples are always delivered in pairs). That is, we
       * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
       * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
       * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
       * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
       * populating a temporary variable with the sequence (0, 1, 2, 3),
       * and then reading from it using vstride=1, width=4, hstride=0.
       * These computations hold good for 4x multisampling as well.
       */
      emit(BRW_OPCODE_AND, t1,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(brw_imm_d(0xc0)));
      emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
      /* This works for both SIMD8 and SIMD16 */
      emit(MOV(t2, brw_imm_v(0x3210)));
      /* This special instruction takes care of setting vstride=1,
       * width=4, hstride=0 of t2 during an ADD instruction.
       */
      emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
   } else {
      /* As per GL_ARB_sample_shading specification:
       * "When rendering to a non-multisample buffer, or if multisample
       *  rasterization is disabled, gl_SampleID will always be zero."
       */
      emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
   }

   return reg;
}
1313
/**
 * Set up gl_SampleMaskIn by copying it out of the payload register recorded
 * in c->sample_mask_reg (gen7+ only).
 */
fs_reg *
fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
{
   assert(brw->gen >= 7);
   this->current_annotation = "compute gl_SampleMaskIn";
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
   return reg;
}
1323
1324 fs_reg
1325 fs_visitor::fix_math_operand(fs_reg src)
1326 {
1327 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1328 * might be able to do better by doing execsize = 1 math and then
1329 * expanding that result out, but we would need to be careful with
1330 * masking.
1331 *
1332 * The hardware ignores source modifiers (negate and abs) on math
1333 * instructions, so we also move to a temp to set those up.
1334 */
1335 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1336 !src.abs && !src.negate)
1337 return src;
1338
1339 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1340 * operands to math
1341 */
1342 if (brw->gen >= 7 && src.file != IMM)
1343 return src;
1344
1345 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1346 expanded.type = src.type;
1347 emit(BRW_OPCODE_MOV, expanded, src);
1348 return expanded;
1349 }
1350
/**
 * Emit a single-source math instruction (RCP, RSQ, SQRT, EXP2, LOG2, SIN,
 * COS).
 *
 * Applies the gen6+ operand restrictions via fix_math_operand(), and sets
 * up the MRF message registers (base_mrf/mlen) that pre-gen6 math needs.
 * Returns the emitted instruction, or NULL on an unsupported opcode.
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (brw->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (brw->gen < 6) {
      /* One message register per 8 dispatched channels. */
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}
1388
/**
 * Emit a two-source math instruction (POW, INT_QUOTIENT, INT_REMAINDER).
 *
 * On gen6+ both operands are fixed up and the instruction is emitted
 * directly; pre-gen6 the second operand is staged through an MRF message
 * register.  Returns the emitted instruction, or NULL for an unsupported
 * opcode.
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      /* Gen7+ can't run INTDIV in SIMD16; flag the compile via no16(). */
      if (brw->gen >= 7)
         no16("SIMD16 INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (brw->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      /* Stage the second operand in the MRF; the send reads both message
       * registers starting at base_mrf.
       */
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}
1435
/**
 * Map UNIFORM-file sources to the fixed hardware registers the push
 * constants (CURB) were uploaded into.
 *
 * Also records the first CURB GRF and the CURB read length for this
 * dispatch width in prog_data.
 */
void
fs_visitor::assign_curb_setup()
{
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Push constants are read in units of 8 (one register's worth). */
   c->prog_data.curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
            int constant_nr;
            if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
               constant_nr = push_constant_loc[uniform_nr];
            } else {
               /* Section 5.11 of the OpenGL 4.1 spec says:
                * "Out-of-bounds reads return undefined values, which include
                *  values from other variables of the active program or zero."
                * Just return the first push constant.
                */
               constant_nr = 0;
            }

            /* Eight push constants fit per GRF, starting right after the
             * fixed payload registers.
             */
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = HW_REG;
            inst->src[i].fixed_hw_reg = byte_offset(
               retype(brw_reg, inst->src[i].type),
               inst->src[i].subreg_offset);
         }
      }
   }
}
1478
/**
 * Decide which URB setup slot delivers each fragment shader input varying.
 *
 * Fills c->prog_data.urb_setup[] (-1 meaning "not delivered") and
 * c->prog_data.num_varying_inputs.
 */
void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
      c->prog_data.urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (brw->gen >= 6) {
      if (_mesa_bitcount_64(fp->Base.InputsRead &
                            BRW_FS_VARYING_INPUT_MASK) <= 16) {
         /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
          * first 16 varying inputs, so we can put them wherever we want.
          * Just put them in order.
          *
          * This is useful because it means that (a) inputs not used by the
          * fragment shader won't take up valuable register space, and (b) we
          * won't have to recompile the fragment shader if it gets paired with
          * a different vertex (or geometry) shader.
          */
         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
            if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
                BITFIELD64_BIT(i)) {
               c->prog_data.urb_setup[i] = urb_next++;
            }
         }
      } else {
         /* We have enough input varyings that the SF/SBE pipeline stage can't
          * arbitrarily rearrange them to suit our whim; we have to put them
          * in an order that matches the output of the previous pipeline stage
          * (geometry or vertex shader).
          */
         struct brw_vue_map prev_stage_vue_map;
         brw_compute_vue_map(brw, &prev_stage_vue_map,
                             c->key.input_slots_valid);
         int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
         assert(prev_stage_vue_map.num_slots <= first_slot + 32);
         for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
              slot++) {
            int varying = prev_stage_vue_map.slot_to_varying[slot];
            /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
             * unused.
             */
            if (varying != BRW_VARYING_SLOT_COUNT &&
                (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
                 BITFIELD64_BIT(varying))) {
               c->prog_data.urb_setup[varying] = slot - first_slot;
            }
         }
         urb_next = prev_stage_vue_map.num_slots - first_slot;
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               c->prog_data.urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   c->prog_data.num_varying_inputs = urb_next;
}
1563
1564 void
1565 fs_visitor::assign_urb_setup()
1566 {
1567 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1568
1569 /* Offset all the urb_setup[] index by the actual position of the
1570 * setup regs, now that the location of the constants has been chosen.
1571 */
1572 foreach_list(node, &this->instructions) {
1573 fs_inst *inst = (fs_inst *)node;
1574
1575 if (inst->opcode == FS_OPCODE_LINTERP) {
1576 assert(inst->src[2].file == HW_REG);
1577 inst->src[2].fixed_hw_reg.nr += urb_start;
1578 }
1579
1580 if (inst->opcode == FS_OPCODE_CINTERP) {
1581 assert(inst->src[0].file == HW_REG);
1582 inst->src[0].fixed_hw_reg.nr += urb_start;
1583 }
1584 }
1585
1586 /* Each attribute is 4 setup channels, each of which is half a reg. */
1587 this->first_non_payload_grf =
1588 urb_start + c->prog_data.num_varying_inputs * 2;
1589 }
1590
/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];       /* whether each VGRF may be split */
   int new_virtual_grf[num_vars];  /* first new VGRF for components 1.. */

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written > 1) {
         split_grf[inst->dst.reg] = false;
      }

      /* If we're sending from a GRF, don't split it, on the assumption that
       * the send is reading the whole thing.
       */
      if (inst->is_send_from_grf()) {
         for (int i = 0; i < 3; i++) {
            if (inst->src[i].file == GRF) {
               split_grf[inst->src[i].reg] = false;
            }
         }
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         /* Component 0 stays in the original VGRF (shrunk to size 1 below);
          * components 1..size-1 each get a fresh single-register VGRF.  The
          * first allocation seeds new_virtual_grf[i], and the loop from j=2
          * allocates the remaining size-2, asserting they are contiguous.
          */
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Rewrite reg_offset != 0 accesses to point at the replacement
       * single-register VGRFs; offset 0 keeps the original register.
       */
      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   invalidate_live_intervals();
}
1694
1695 /**
1696 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1697 *
1698 * During code generation, we create tons of temporary variables, many of
1699 * which get immediately killed and are never used again. Yet, in later
1700 * optimization and analysis passes, such as compute_live_intervals, we need
1701 * to loop over all the virtual GRFs. Compacting them can save a lot of
1702 * overhead.
1703 */
1704 void
1705 fs_visitor::compact_virtual_grfs()
1706 {
1707 /* Mark which virtual GRFs are used, and count how many. */
1708 int remap_table[this->virtual_grf_count];
1709 memset(remap_table, -1, sizeof(remap_table));
1710
1711 foreach_list(node, &this->instructions) {
1712 const fs_inst *inst = (const fs_inst *) node;
1713
1714 if (inst->dst.file == GRF)
1715 remap_table[inst->dst.reg] = 0;
1716
1717 for (int i = 0; i < 3; i++) {
1718 if (inst->src[i].file == GRF)
1719 remap_table[inst->src[i].reg] = 0;
1720 }
1721 }
1722
1723 /* In addition to registers used in instructions, fs_visitor keeps
1724 * direct references to certain special values which must be patched:
1725 */
1726 struct {
1727 fs_reg *reg;
1728 unsigned count;
1729 } special[] = {
1730 { &frag_depth, 1 },
1731 { &pixel_x, 1 },
1732 { &pixel_y, 1 },
1733 { &pixel_w, 1 },
1734 { &wpos_w, 1 },
1735 { &dual_src_output, 1 },
1736 { outputs, ARRAY_SIZE(outputs) },
1737 { delta_x, ARRAY_SIZE(delta_x) },
1738 { delta_y, ARRAY_SIZE(delta_y) },
1739 { &sample_mask, 1 },
1740 { &shader_start_time, 1 },
1741 };
1742
1743 /* Treat all special values as used, to be conservative */
1744 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1745 for (unsigned j = 0; j < special[i].count; j++) {
1746 if (special[i].reg[j].file == GRF)
1747 remap_table[special[i].reg[j].reg] = 0;
1748 }
1749 }
1750
1751 /* Compact the GRF arrays. */
1752 int new_index = 0;
1753 for (int i = 0; i < this->virtual_grf_count; i++) {
1754 if (remap_table[i] != -1) {
1755 remap_table[i] = new_index;
1756 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1757 invalidate_live_intervals();
1758 ++new_index;
1759 }
1760 }
1761
1762 this->virtual_grf_count = new_index;
1763
1764 /* Patch all the instructions to use the newly renumbered registers */
1765 foreach_list(node, &this->instructions) {
1766 fs_inst *inst = (fs_inst *) node;
1767
1768 if (inst->dst.file == GRF)
1769 inst->dst.reg = remap_table[inst->dst.reg];
1770
1771 for (int i = 0; i < 3; i++) {
1772 if (inst->src[i].file == GRF)
1773 inst->src[i].reg = remap_table[inst->src[i].reg];
1774 }
1775 }
1776
1777 /* Patch all the references to special values */
1778 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1779 for (unsigned j = 0; j < special[i].count; j++) {
1780 fs_reg *reg = &special[i].reg[j];
1781 if (reg->file == GRF && remap_table[reg->reg] != -1)
1782 reg->reg = remap_table[reg->reg];
1783 }
1784 }
1785 }
1786
/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   /* Only the first compile (SIMD8 mode) decides the constant layout;
    * see assign_constant_locations().
    */
   if (dispatch_width != 8)
      return;

   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);

   /* -1 means "not (yet) in the pull constant buffer". */
   for (unsigned int i = 0; i < uniforms; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0 ; i < 3; i++) {
         /* reladdr marks an indirectly-addressed (array) uniform access. */
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &stage_prog_data->param[uniform];

            assert(param_size[uniform]);

            /* Copy the whole array, element by element, into pull_param. */
            for (int j = 0; j < param_size[uniform]; j++) {
               pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;

               stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
                  values[j];
            }
         }
      }
   }
}
1844
/**
 * Assign UNIFORM file registers to either push constants or pull constants.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::assign_constant_locations()
{
   /* Only the first compile (SIMD8 mode) gets to decide on locations. */
   if (dispatch_width != 8)
      return;

   /* Find which UNIFORM registers are still in use. */
   bool is_live[uniforms];
   for (unsigned int i = 0; i < uniforms; i++) {
      is_live[i] = false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         /* Out-of-range reads are possible (see assign_curb_setup); only
          * mark in-range uniforms live.
          */
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
         if (constant_nr >= 0 && constant_nr < (int) uniforms)
            is_live[constant_nr] = true;
      }
   }

   /* Only allow 16 registers (128 uniform components) as push constants.
    *
    * Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int max_push_components = 16 * 8;
   unsigned int num_push_constants = 0;

   push_constant_loc = ralloc_array(mem_ctx, int, uniforms);

   for (unsigned int i = 0; i < uniforms; i++) {
      if (!is_live[i] || pull_constant_loc[i] != -1) {
         /* This UNIFORM register is either dead, or has already been demoted
          * to a pull const.  Mark it as no longer living in the param[] array.
          */
         push_constant_loc[i] = -1;
         continue;
      }

      if (num_push_constants < max_push_components) {
         /* Retain as a push constant.  Record the location in the params[]
          * array.
          */
         push_constant_loc[i] = num_push_constants++;
      } else {
         /* Demote to a pull constant. */
         push_constant_loc[i] = -1;

         int pull_index = stage_prog_data->nr_pull_params++;
         stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
         pull_constant_loc[i] = pull_index;
      }
   }

   stage_prog_data->nr_params = num_push_constants;

   /* Up until now, the param[] array has been indexed by reg + reg_offset
    * of UNIFORM registers.  Condense it to only contain the uniforms we
    * chose to upload as push constants.
    */
   for (unsigned int i = 0; i < uniforms; i++) {
      int remapped = push_constant_loc[i];

      if (remapped == -1)
         continue;

      /* Entries only move down, so in-place compaction is safe. */
      assert(remapped <= (int)i);
      stage_prog_data->param[remapped] = stage_prog_data->param[i];
   }
}
1930
/**
 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
 */
void
fs_visitor::demote_pull_constants()
{
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         /* Skip uniforms that were chosen as push constants (-1). */
         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
         fs_reg dst = fs_reg(this, glsl_type::float_type);

         /* Generate a pull load into dst. */
         if (inst->src[i].reladdr) {
            /* Indirect (array) access: the offset is computed at runtime. */
            exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
                                                        surf_index,
                                                        *inst->src[i].reladdr,
                                                        pull_index);
            inst->insert_before(&list);
            inst->src[i].reladdr = NULL;
         } else {
            /* Direct access: load the 16-byte-aligned vec4 containing the
             * constant, then smear the wanted component across the reg.
             */
            fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
            fs_inst *pull =
               new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                    dst, surf_index, offset);
            inst->insert_before(pull);
            inst->src[i].set_smear(pull_index & 3);
         }

         /* Rewrite the instruction to use the temporary VGRF. */
         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
      }
   }
   invalidate_live_intervals();
}
1982
1983 bool
1984 fs_visitor::opt_algebraic()
1985 {
1986 bool progress = false;
1987
1988 foreach_list(node, &this->instructions) {
1989 fs_inst *inst = (fs_inst *)node;
1990
1991 switch (inst->opcode) {
1992 case BRW_OPCODE_MUL:
1993 if (inst->src[1].file != IMM)
1994 continue;
1995
1996 /* a * 1.0 = a */
1997 if (inst->src[1].is_one()) {
1998 inst->opcode = BRW_OPCODE_MOV;
1999 inst->src[1] = reg_undef;
2000 progress = true;
2001 break;
2002 }
2003
2004 /* a * 0.0 = 0.0 */
2005 if (inst->src[1].is_zero()) {
2006 inst->opcode = BRW_OPCODE_MOV;
2007 inst->src[0] = inst->src[1];
2008 inst->src[1] = reg_undef;
2009 progress = true;
2010 break;
2011 }
2012
2013 break;
2014 case BRW_OPCODE_ADD:
2015 if (inst->src[1].file != IMM)
2016 continue;
2017
2018 /* a + 0.0 = a */
2019 if (inst->src[1].is_zero()) {
2020 inst->opcode = BRW_OPCODE_MOV;
2021 inst->src[1] = reg_undef;
2022 progress = true;
2023 break;
2024 }
2025 break;
2026 case BRW_OPCODE_OR:
2027 if (inst->src[0].equals(inst->src[1])) {
2028 inst->opcode = BRW_OPCODE_MOV;
2029 inst->src[1] = reg_undef;
2030 progress = true;
2031 break;
2032 }
2033 break;
2034 case BRW_OPCODE_LRP:
2035 if (inst->src[1].equals(inst->src[2])) {
2036 inst->opcode = BRW_OPCODE_MOV;
2037 inst->src[0] = inst->src[1];
2038 inst->src[1] = reg_undef;
2039 inst->src[2] = reg_undef;
2040 progress = true;
2041 break;
2042 }
2043 break;
2044 case BRW_OPCODE_SEL:
2045 if (inst->saturate && inst->src[1].file == IMM) {
2046 switch (inst->conditional_mod) {
2047 case BRW_CONDITIONAL_LE:
2048 case BRW_CONDITIONAL_L:
2049 switch (inst->src[1].type) {
2050 case BRW_REGISTER_TYPE_F:
2051 if (inst->src[1].imm.f >= 1.0f) {
2052 inst->opcode = BRW_OPCODE_MOV;
2053 inst->src[1] = reg_undef;
2054 progress = true;
2055 }
2056 break;
2057 default:
2058 break;
2059 }
2060 break;
2061 case BRW_CONDITIONAL_GE:
2062 case BRW_CONDITIONAL_G:
2063 switch (inst->src[1].type) {
2064 case BRW_REGISTER_TYPE_F:
2065 if (inst->src[1].imm.f <= 0.0f) {
2066 inst->opcode = BRW_OPCODE_MOV;
2067 inst->src[1] = reg_undef;
2068 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2069 progress = true;
2070 }
2071 break;
2072 default:
2073 break;
2074 }
2075 default:
2076 break;
2077 }
2078 }
2079 break;
2080 default:
2081 break;
2082 }
2083 }
2084
2085 return progress;
2086 }
2087
/**
 * Removes any instructions writing a VGRF where that VGRF is not used by any
 * later instruction.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;   /* instruction index, matching the live-interval IPs */

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && !inst->has_side_effects()) {
         bool dead = true;

         /* The write is dead only if every register it writes has its live
          * interval ending exactly at this instruction (i.e. nothing reads
          * it later).
          */
         for (int i = 0; i < inst->regs_written; i++) {
            int var = live_intervals->var_from_vgrf[inst->dst.reg];
            assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
            if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
               dead = false;
               break;
            }
         }

         if (dead) {
            /* Don't dead code eliminate instructions that write to the
             * accumulator as a side-effect. Instead just set the destination
             * to the null register to free it.
             */
            switch (inst->opcode) {
            case BRW_OPCODE_ADDC:
            case BRW_OPCODE_SUBB:
            case BRW_OPCODE_MACH:
               inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
               break;
            default:
               inst->remove();
               progress = true;
               break;
            }
         }
      }

      pc++;
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
2142
/* Key identifying a single (virtual GRF, register offset) write for the
 * local dead-code hash table below.
 */
struct dead_code_hash_key
{
   int vgrf;
   int reg_offset;
};
2148
/* Hash table key-equality callback: two keys match when all their fields
 * (vgrf and reg_offset) compare equal bytewise.
 */
static bool
dead_code_hash_compare(const void *a, const void *b)
{
   return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
}
2154
/* Remove every entry from the table.  Used at basic-block boundaries, where
 * the pass's dataflow assumptions no longer hold.
 */
static void
clear_dead_code_hash(struct hash_table *ht)
{
   struct hash_entry *entry;

   hash_table_foreach(ht, entry) {
      _mesa_hash_table_remove(ht, entry);
   }
}
2164
/* Record inst as the as-yet-unread writer of (vgrf, reg_offset). */
static void
insert_dead_code_hash(struct hash_table *ht,
                      int vgrf, int reg_offset, fs_inst *inst)
{
   /* We don't bother freeing keys, because they'll be GCed with the ht. */
   struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);

   key->vgrf = vgrf;
   key->reg_offset = reg_offset;

   _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
}
2177
/* Look up the pending-write entry for (vgrf, reg_offset); returns NULL if
 * there is none.
 */
static struct hash_entry *
get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
{
   struct dead_code_hash_key key;

   key.vgrf = vgrf;
   key.reg_offset = reg_offset;

   return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
}
2188
2189 static void
2190 remove_dead_code_hash(struct hash_table *ht,
2191 int vgrf, int reg_offset)
2192 {
2193 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2194 if (!entry)
2195 return;
2196
2197 _mesa_hash_table_remove(ht, entry);
2198 }
2199
2200 /**
2201 * Walks basic blocks, removing any regs that are written but not read before
2202 * being redefined.
2203 *
2204 * The dead_code_eliminate() function implements a global dead code
2205 * elimination, but it only handles the removing the last write to a register
2206 * if it's never read. This one can handle intermediate writes, but only
2207 * within a basic block.
2208 */
2209 bool
2210 fs_visitor::dead_code_eliminate_local()
2211 {
2212 struct hash_table *ht;
2213 bool progress = false;
2214
2215 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2216
2217 if (ht == NULL) {
2218 return false;
2219 }
2220
2221 foreach_list_safe(node, &this->instructions) {
2222 fs_inst *inst = (fs_inst *)node;
2223
2224 /* At a basic block, empty the HT since we don't understand dataflow
2225 * here.
2226 */
2227 if (inst->is_control_flow()) {
2228 clear_dead_code_hash(ht);
2229 continue;
2230 }
2231
2232 /* Clear the HT of any instructions that got read. */
2233 for (int i = 0; i < 3; i++) {
2234 fs_reg src = inst->src[i];
2235 if (src.file != GRF)
2236 continue;
2237
2238 int read = 1;
2239 if (inst->is_send_from_grf())
2240 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2241
2242 for (int reg_offset = src.reg_offset;
2243 reg_offset < src.reg_offset + read;
2244 reg_offset++) {
2245 remove_dead_code_hash(ht, src.reg, reg_offset);
2246 }
2247 }
2248
2249 /* Add any update of a GRF to the HT, removing a previous write if it
2250 * wasn't read.
2251 */
2252 if (inst->dst.file == GRF) {
2253 if (inst->regs_written > 1) {
2254 /* We don't know how to trim channels from an instruction's
2255 * writes, so we can't incrementally remove unread channels from
2256 * it. Just remove whatever it overwrites from the table
2257 */
2258 for (int i = 0; i < inst->regs_written; i++) {
2259 remove_dead_code_hash(ht,
2260 inst->dst.reg,
2261 inst->dst.reg_offset + i);
2262 }
2263 } else {
2264 struct hash_entry *entry =
2265 get_dead_code_hash_entry(ht, inst->dst.reg,
2266 inst->dst.reg_offset);
2267
2268 if (entry) {
2269 if (inst->is_partial_write()) {
2270 /* For a partial write, we can't remove any previous dead code
2271 * candidate, since we're just modifying their result.
2272 */
2273 } else {
2274 /* We're completely updating a channel, and there was a
2275 * previous write to the channel that wasn't read. Kill it!
2276 */
2277 fs_inst *inst = (fs_inst *)entry->data;
2278 inst->remove();
2279 progress = true;
2280 }
2281
2282 _mesa_hash_table_remove(ht, entry);
2283 }
2284
2285 if (!inst->has_side_effects())
2286 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2287 inst);
2288 }
2289 }
2290 }
2291
2292 _mesa_hash_table_destroy(ht, NULL);
2293
2294 if (progress)
2295 invalidate_live_intervals();
2296
2297 return progress;
2298 }
2299
/**
 * Rewrites the instruction that computed a GRF temporary so it writes
 * directly into the MRF a following MOV copied it to, then deletes the MOV.
 *
 * Returns true (and invalidates live intervals) on any change.
 */
bool
fs_visitor::compute_to_mrf()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      int ip = next_ip;
      next_ip++;

      /* Only consider a full, unmodified, same-type GRF -> MRF MOV; any
       * source modifier, type conversion, or non-contiguous source would
       * change the value if folded into the producing instruction.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->is_partial_write() ||
          inst->dst.file != MRF || inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate ||
          !inst->src[0].is_contiguous() ||
          inst->src[0].subreg_offset)
         continue;

      /* Work out which hardware MRF registers are written by this
       * instruction.
       */
      int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
      int mrf_high;
      if (inst->dst.reg & BRW_MRF_COMPR4) {
         /* COMPR4 writes touch both m and m+4. */
         mrf_high = mrf_low + 4;
      } else if (dispatch_width == 16 &&
                 (!inst->force_uncompressed && !inst->force_sechalf)) {
         /* A full SIMD16 write covers two adjacent MRFs. */
         mrf_high = mrf_low + 1;
      } else {
         mrf_high = mrf_low;
      }

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_end[inst->src[0].reg] > ip)
         continue;

      /* Found a move of a GRF to a MRF.  Let's see if we can go
       * rewrite the thing that made this GRF to write into the MRF.
       */
      fs_inst *scan_inst;
      for (scan_inst = (fs_inst *)inst->prev;
           scan_inst->prev != NULL;
           scan_inst = (fs_inst *)scan_inst->prev) {
         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->src[0].reg) {
            /* Found the last thing to write our reg we want to turn
             * into a compute-to-MRF.
             */

            /* If this one instruction didn't populate all the
             * channels, bail.  We might be able to rewrite everything
             * that writes that reg, but it would require smarter
             * tracking to delay the rewriting until complete success.
             */
            if (scan_inst->is_partial_write())
               break;

            /* Things returning more than one register would need us to
             * understand coalescing out more than one MOV at a time.
             */
            if (scan_inst->regs_written > 1)
               break;

            /* SEND instructions can't have MRF as a destination. */
            if (scan_inst->mlen)
               break;

            if (brw->gen == 6) {
               /* gen6 math instructions must have the destination be
                * GRF, so no compute-to-MRF for them.
                */
               if (scan_inst->is_math()) {
                  break;
               }
            }

            if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               /* Found the creator of our MRF's source value.  Retarget
                * its destination and fold in the MOV's saturate.
                */
               scan_inst->dst.file = MRF;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->saturate |= inst->saturate;
               inst->remove();
               progress = true;
            }
            break;
         }

         /* We don't handle control flow here.  Most computation of
          * values that end up in MRFs are shortly before the MRF
          * write anyway.
          */
         if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
            break;

         /* You can't read from an MRF, so if someone else reads our
          * MRF's source GRF that we wanted to rewrite, that stops us.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->src[0].reg &&
                scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         if (scan_inst->dst.file == MRF) {
            /* If somebody else writes our MRF here, we can't
             * compute-to-MRF before that.
             */
            int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
            int scan_mrf_high;

            if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
               scan_mrf_high = scan_mrf_low + 4;
            } else if (dispatch_width == 16 &&
                       (!scan_inst->force_uncompressed &&
                        !scan_inst->force_sechalf)) {
               scan_mrf_high = scan_mrf_low + 1;
            } else {
               scan_mrf_high = scan_mrf_low;
            }

            if (mrf_low == scan_mrf_low ||
                mrf_low == scan_mrf_high ||
                mrf_high == scan_mrf_low ||
                mrf_high == scan_mrf_high) {
               break;
            }
         }

         if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
            /* Found a SEND instruction, which means that there are
             * live values in MRFs from base_mrf to base_mrf +
             * scan_inst->mlen - 1.  Don't go pushing our MRF write up
             * above it.
             */
            if (mrf_low >= scan_inst->base_mrf &&
                mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
            if (mrf_high >= scan_inst->base_mrf &&
                mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         }
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
2463
/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 *
 * Tracking is reset at control flow (we only reason within a basic block)
 * and skipped entirely in SIMD16, where compressed instructions write MRF
 * pairs that this simple one-slot-per-MRF table doesn't model.
 *
 * Returns true (and invalidates live intervals) if any MOV was removed.
 */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   /* Most recent full GRF -> MRF MOV per MRF, or NULL if unknown. */
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* Need to update the MRF tracking for compressed instructions. */
   if (dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->is_control_flow()) {
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
      }

      /* If this MOV writes exactly what the last recorded MOV to the same
       * MRF wrote, it's redundant -- drop it.
       */
      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
         if (prev_inst && inst->equals(prev_inst)) {
            inst->remove();
            progress = true;
            continue;
         }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0 && inst->base_mrf != -1) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
            }
         }
      }

      /* Record a full, unmodified GRF -> MRF MOV as the MRF's last known
       * contents.
       */
      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file == GRF &&
          !inst->is_partial_write()) {
         last_mrf_move[inst->dst.reg] = inst;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
2534
2535 static void
2536 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2537 int first_grf, int grf_len)
2538 {
2539 bool inst_simd16 = (dispatch_width > 8 &&
2540 !inst->force_uncompressed &&
2541 !inst->force_sechalf);
2542
2543 /* Clear the flag for registers that actually got read (as expected). */
2544 for (int i = 0; i < 3; i++) {
2545 int grf;
2546 if (inst->src[i].file == GRF) {
2547 grf = inst->src[i].reg;
2548 } else if (inst->src[i].file == HW_REG &&
2549 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2550 grf = inst->src[i].fixed_hw_reg.nr;
2551 } else {
2552 continue;
2553 }
2554
2555 if (grf >= first_grf &&
2556 grf < first_grf + grf_len) {
2557 deps[grf - first_grf] = false;
2558 if (inst_simd16)
2559 deps[grf - first_grf + 1] = false;
2560 }
2561 }
2562 }
2563
/**
 * Implements this workaround for the original 965:
 *
 *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
 *      check for post destination dependencies on this instruction, software
 *      must ensure that there is no destination hazard for the case of ‘write
 *      followed by a posted write’ shown in the following example.
 *
 *      1. mov r3 0
 *      2. send r3.xy <rest of send instruction>
 *      3. mov r2 r3
 *
 *      Due to no post-destination dependency check on the ‘send’, the above
 *      code sequence could have two instructions (1 and 2) in flight at the
 *      same time that both consider ‘r3’ as the target of their final writes.
 *
 * Resolved by inserting DEP_RESOLVE_MOV reads of the hazarded registers
 * immediately before @inst (a SEND writing a GRF destination).
 */
void
fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
{
   /* Post-register-allocation, registers are tracked in hardware-GRF
    * units: one per VGRF slot in SIMD8, two in SIMD16.
    */
   int reg_size = dispatch_width / 8;
   int write_len = inst->regs_written * reg_size;
   int first_write_grf = inst->dst.reg;
   bool needs_dep[BRW_MAX_MRF];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   /* Start by assuming every register the SEND writes has an outstanding
    * prior write we must wait on.
    */
   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);

   clear_deps_for_inst_src(inst, dispatch_width,
                           needs_dep, first_write_grf, write_len);

   /* Walk backwards looking for writes to registers we're writing which
    * aren't read since being written.  If we hit the start of the program,
    * we assume that there are no outstanding dependencies on entry to the
    * program.
    */
   for (fs_inst *scan_inst = (fs_inst *)inst->prev;
        scan_inst != NULL;
        scan_inst = (fs_inst *)scan_inst->prev) {

      /* If we hit control flow, assume that there *are* outstanding
       * dependencies, and force their cleanup before our instruction.
       */
      if (scan_inst->is_control_flow()) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i]) {
               inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
            }
         }
         return;
      }

      bool scan_inst_simd16 = (dispatch_width > 8 &&
                               !scan_inst->force_uncompressed &&
                               !scan_inst->force_sechalf);

      /* We insert our reads as late as possible on the assumption that any
       * instruction but a MOV that might have left us an outstanding
       * dependency has more latency than a MOV.
       */
      if (scan_inst->dst.file == GRF) {
         for (int i = 0; i < scan_inst->regs_written; i++) {
            int reg = scan_inst->dst.reg + i * reg_size;

            if (reg >= first_write_grf &&
                reg < first_write_grf + write_len &&
                needs_dep[reg - first_write_grf]) {
               inst->insert_before(DEP_RESOLVE_MOV(reg));
               needs_dep[reg - first_write_grf] = false;
               /* A SIMD16 write covered reg+1 as well. */
               if (scan_inst_simd16)
                  needs_dep[reg - first_write_grf + 1] = false;
            }
         }
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, dispatch_width,
                              needs_dep, first_write_grf, write_len);

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }
}
2653
/**
 * Implements this workaround for the original 965:
 *
 *     "[DevBW, DevCL] Errata: A destination register from a send can not be
 *      used as a destination register until after it has been sourced by an
 *      instruction with a different destination register.
 *
 * Resolved by inserting DEP_RESOLVE_MOV reads of the SEND's destination
 * registers before any later instruction that would overwrite them without
 * first reading them.
 */
void
fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
{
   /* Size in hardware GRFs (one per VGRF slot in SIMD8, two in SIMD16). */
   int write_len = inst->regs_written * dispatch_width / 8;
   int first_write_grf = inst->dst.reg;
   bool needs_dep[BRW_MAX_MRF];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);
   /* Walk forwards looking for writes to registers we're writing which aren't
    * read before being written.
    */
   for (fs_inst *scan_inst = (fs_inst *)inst->next;
        !scan_inst->is_tail_sentinel();
        scan_inst = (fs_inst *)scan_inst->next) {
      /* If we hit control flow, force resolve all remaining dependencies. */
      if (scan_inst->is_control_flow()) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i])
               scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
         }
         return;
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, dispatch_width,
                              needs_dep, first_write_grf, write_len);

      /* We insert our reads as late as possible since they're reading the
       * result of a SEND, which has massive latency.
       */
      if (scan_inst->dst.file == GRF &&
          scan_inst->dst.reg >= first_write_grf &&
          scan_inst->dst.reg < first_write_grf + write_len &&
          needs_dep[scan_inst->dst.reg - first_write_grf]) {
         scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
         needs_dep[scan_inst->dst.reg - first_write_grf] = false;
      }

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }

   /* If we hit the end of the program, resolve all remaining dependencies out
    * of paranoia.
    */
   fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
   assert(last_inst->eot);
   for (int i = 0; i < write_len; i++) {
      if (needs_dep[i])
         last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
   }
}
2721
2722 void
2723 fs_visitor::insert_gen4_send_dependency_workarounds()
2724 {
2725 if (brw->gen != 4 || brw->is_g4x)
2726 return;
2727
2728 /* Note that we're done with register allocation, so GRF fs_regs always
2729 * have a .reg_offset of 0.
2730 */
2731
2732 foreach_list_safe(node, &this->instructions) {
2733 fs_inst *inst = (fs_inst *)node;
2734
2735 if (inst->mlen != 0 && inst->dst.file == GRF) {
2736 insert_gen4_pre_send_dependency_workarounds(inst);
2737 insert_gen4_post_send_dependency_workarounds(inst);
2738 }
2739 }
2740 }
2741
/**
 * Turns the generic expression-style uniform pull constant load instruction
 * into a hardware-specific series of instructions for loading a pull
 * constant.
 *
 * The expression style allows the CSE pass before this to optimize out
 * repeated loads from the same offset, and gives the pre-register-allocation
 * scheduling full flexibility, while the conversion to native instructions
 * allows the post-register-allocation scheduler the best information
 * possible.
 *
 * Note that execution masking for setting up pull constant loads is special:
 * the channels that need to be written are unrelated to the current execution
 * mask, since a later instruction will use one of the result channels as a
 * source operand for all 8 or 16 of its channels.
 */
void
fs_visitor::lower_uniform_pull_constant_loads()
{
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
         continue;

      if (brw->gen >= 7) {
         /* The offset arg before was a vec4-aligned byte offset.  We need to
          * turn it into a dword offset.
          */
         fs_reg const_offset_reg = inst->src[1];
         assert(const_offset_reg.file == IMM &&
                const_offset_reg.type == BRW_REGISTER_TYPE_UD);
         const_offset_reg.imm.u /= 4;
         fs_reg payload = fs_reg(this, glsl_type::uint_type);

         /* This is actually going to be a MOV, but since only the first dword
          * is accessed, we have a special opcode to do just that one.  Note
          * that this needs to be an operation that will be considered a def
          * by live variable analysis, or register allocation will explode.
          */
         fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
                                               payload, const_offset_reg);
         setup->force_writemask_all = true;

         /* Carry over the debug annotations from the lowered instruction. */
         setup->ir = inst->ir;
         setup->annotation = inst->annotation;
         inst->insert_before(setup);

         /* Similarly, this will only populate the first 4 channels of the
          * result register (since we only use smear values from 0-3), but we
          * don't tell the optimizer.
          */
         inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
         inst->src[1] = payload;

         /* A new VGRF was created, so previous liveness data is stale. */
         invalidate_live_intervals();
      } else {
         /* Before register allocation, we didn't tell the scheduler about the
          * MRF we use.  We know it's safe to use this MRF because nothing
          * else does except for register spill/unspill, which generates and
          * uses its MRF within a single IR instruction.
          */
         inst->base_mrf = 14;
         inst->mlen = 1;
      }
   }
}
2809
2810 void
2811 fs_visitor::dump_instructions()
2812 {
2813 calculate_register_pressure();
2814
2815 int ip = 0, max_pressure = 0;
2816 foreach_list(node, &this->instructions) {
2817 backend_instruction *inst = (backend_instruction *)node;
2818 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2819 fprintf(stderr, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2820 dump_instruction(inst);
2821 ++ip;
2822 }
2823 fprintf(stderr, "Maximum %3d registers live at once.\n", max_pressure);
2824 }
2825
/**
 * Prints a single instruction to stderr in the IR dump format:
 *
 *   (+f0.N) opcode.sat<cmod>.f0.N dst:type, src0:type, src1:type, src2:type
 *
 * followed by "1sthalf"/"2ndhalf" markers for half-execution instructions.
 */
void
fs_visitor::dump_instruction(backend_instruction *be_inst)
{
   fs_inst *inst = (fs_inst *)be_inst;

   /* Predicate: '+' for normal, '-' for inverted. */
   if (inst->predicate) {
      fprintf(stderr, "(%cf0.%d) ",
             inst->predicate_inverse ? '-' : '+',
             inst->flag_subreg);
   }

   fprintf(stderr, "%s", brw_instruction_name(inst->opcode));
   if (inst->saturate)
      fprintf(stderr, ".sat");
   if (inst->conditional_mod) {
      fprintf(stderr, "%s", conditional_modifier[inst->conditional_mod]);
      /* Only print the flag register when it isn't already implied by a
       * predicate and the opcode actually produces a flag result (on gen5+,
       * SEL/IF/WHILE consume a condition rather than write one).
       */
      if (!inst->predicate &&
          (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                            inst->opcode != BRW_OPCODE_IF &&
                            inst->opcode != BRW_OPCODE_WHILE))) {
         fprintf(stderr, ".f0.%d", inst->flag_subreg);
      }
   }
   fprintf(stderr, " ");


   /* Destination operand. */
   switch (inst->dst.file) {
   case GRF:
      fprintf(stderr, "vgrf%d", inst->dst.reg);
      if (virtual_grf_sizes[inst->dst.reg] != 1 ||
          inst->dst.subreg_offset)
         fprintf(stderr, "+%d.%d",
                 inst->dst.reg_offset, inst->dst.subreg_offset);
      break;
   case MRF:
      fprintf(stderr, "m%d", inst->dst.reg);
      break;
   case BAD_FILE:
      fprintf(stderr, "(null)");
      break;
   case UNIFORM:
      /* Uniforms are never legal destinations; flag them loudly. */
      fprintf(stderr, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
      break;
   case HW_REG:
      if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         switch (inst->dst.fixed_hw_reg.nr) {
         case BRW_ARF_NULL:
            fprintf(stderr, "null");
            break;
         case BRW_ARF_ADDRESS:
            fprintf(stderr, "a0.%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_ACCUMULATOR:
            fprintf(stderr, "acc%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_FLAG:
            fprintf(stderr, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                            inst->dst.fixed_hw_reg.subnr);
            break;
         default:
            fprintf(stderr, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                               inst->dst.fixed_hw_reg.subnr);
            break;
         }
      } else {
         fprintf(stderr, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
      }
      if (inst->dst.fixed_hw_reg.subnr)
         fprintf(stderr, "+%d", inst->dst.fixed_hw_reg.subnr);
      break;
   default:
      fprintf(stderr, "???");
      break;
   }
   fprintf(stderr, ":%s, ", brw_reg_type_letters(inst->dst.type));

   /* Source operands, stopping at the first BAD_FILE slot. */
   for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
      if (inst->src[i].negate)
         fprintf(stderr, "-");
      if (inst->src[i].abs)
         fprintf(stderr, "|");
      switch (inst->src[i].file) {
      case GRF:
         fprintf(stderr, "vgrf%d", inst->src[i].reg);
         if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
             inst->src[i].subreg_offset)
            fprintf(stderr, "+%d.%d", inst->src[i].reg_offset,
                    inst->src[i].subreg_offset);
         break;
      case MRF:
         /* MRFs are never legal sources; flag them loudly. */
         fprintf(stderr, "***m%d***", inst->src[i].reg);
         break;
      case UNIFORM:
         fprintf(stderr, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
         if (inst->src[i].reladdr) {
            fprintf(stderr, "+reladdr");
         } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
             inst->src[i].subreg_offset) {
            /* NOTE(review): this indexes virtual_grf_sizes[] with a
             * UNIFORM-file register number, which looks copied from the
             * GRF case above -- confirm this is the intended size array
             * for uniform registers.
             */
            fprintf(stderr, "+%d.%d", inst->src[i].reg_offset,
                    inst->src[i].subreg_offset);
         }
         break;
      case BAD_FILE:
         fprintf(stderr, "(null)");
         break;
      case IMM:
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            fprintf(stderr, "%ff", inst->src[i].imm.f);
            break;
         case BRW_REGISTER_TYPE_D:
            fprintf(stderr, "%dd", inst->src[i].imm.i);
            break;
         case BRW_REGISTER_TYPE_UD:
            fprintf(stderr, "%uu", inst->src[i].imm.u);
            break;
         default:
            fprintf(stderr, "???");
            break;
         }
         break;
      case HW_REG:
         if (inst->src[i].fixed_hw_reg.negate)
            fprintf(stderr, "-");
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(stderr, "|");
         if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            switch (inst->src[i].fixed_hw_reg.nr) {
            case BRW_ARF_NULL:
               fprintf(stderr, "null");
               break;
            case BRW_ARF_ADDRESS:
               fprintf(stderr, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_ACCUMULATOR:
               fprintf(stderr, "acc%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_FLAG:
               fprintf(stderr, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                               inst->src[i].fixed_hw_reg.subnr);
               break;
            default:
               fprintf(stderr, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                                  inst->src[i].fixed_hw_reg.subnr);
               break;
            }
         } else {
            fprintf(stderr, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
         }
         if (inst->src[i].fixed_hw_reg.subnr)
            fprintf(stderr, "+%d", inst->src[i].fixed_hw_reg.subnr);
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(stderr, "|");
         break;
      default:
         fprintf(stderr, "???");
         break;
      }
      if (inst->src[i].abs)
         fprintf(stderr, "|");

      if (inst->src[i].file != IMM) {
         fprintf(stderr, ":%s", brw_reg_type_letters(inst->src[i].type));
      }

      if (i < 2 && inst->src[i + 1].file != BAD_FILE)
         fprintf(stderr, ", ");
   }

   fprintf(stderr, " ");

   if (inst->force_uncompressed)
      fprintf(stderr, "1sthalf ");

   if (inst->force_sechalf)
      fprintf(stderr, "2ndhalf ");

   fprintf(stderr, "\n");
}
3005
3006 /**
3007 * Possibly returns an instruction that set up @param reg.
3008 *
3009 * Sometimes we want to take the result of some expression/variable
3010 * dereference tree and rewrite the instruction generating the result
3011 * of the tree. When processing the tree, we know that the
3012 * instructions generated are all writing temporaries that are dead
3013 * outside of this tree. So, if we have some instructions that write
3014 * a temporary, we're free to point that temp write somewhere else.
3015 *
3016 * Note that this doesn't guarantee that the instruction generated
3017 * only reg -- it might be the size=4 destination of a texture instruction.
3018 */
3019 fs_inst *
3020 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3021 fs_inst *end,
3022 const fs_reg &reg)
3023 {
3024 if (end == start ||
3025 end->is_partial_write() ||
3026 reg.reladdr ||
3027 !reg.equals(end->dst)) {
3028 return NULL;
3029 } else {
3030 return end;
3031 }
3032 }
3033
/**
 * Lays out the gen6+ fragment shader thread payload: assigns a payload
 * register index to each input the shader consumes (barycentrics, source
 * depth/W, sample position offsets, sample mask) and accumulates the total
 * in c->nr_payload_regs.
 */
void
fs_visitor::setup_payload_gen6()
{
   /* The shader reads gl_FragCoord (VARYING_SLOT_POS), which needs the
    * interpolated depth and W payload registers.
    */
   bool uses_depth =
      (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
   unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;

   assert(brw->gen >= 6);

   /* R0-1: masks, pixel X/Y coordinates. */
   c->nr_payload_regs = 2;
   /* R2: only for 32-pixel dispatch.*/

   /* R3-26: barycentric interpolation coordinates.  These appear in the
    * same order that they appear in the brw_wm_barycentric_interp_mode
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
    * appear if they were enabled using the "Barycentric Interpolation
    * Mode" bits in WM_STATE.
    */
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      if (barycentric_interp_modes & (1 << i)) {
         c->barycentric_coord_reg[i] = c->nr_payload_regs;
         c->nr_payload_regs += 2;
         if (dispatch_width == 16) {
            c->nr_payload_regs += 2;
         }
      }
   }

   /* R27: interpolated depth if uses source depth */
   if (uses_depth) {
      c->source_depth_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R28: interpolated depth if not SIMD8. */
         c->nr_payload_regs++;
      }
   }
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W.
    *
    * NOTE(review): this deliberately reuses uses_depth rather than a
    * separate W flag -- depth and W are requested together when
    * gl_FragCoord is read.  Confirm against the WM state setup.
    */
   if (uses_depth) {
      c->source_w_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R30: interpolated W if not SIMD8. */
         c->nr_payload_regs++;
      }
   }

   c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
   /* R31: MSAA position offsets. */
   if (c->prog_data.uses_pos_offset) {
      c->sample_pos_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
   }

   /* R32: MSAA input coverage mask */
   if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
      assert(brw->gen >= 7);
      c->sample_mask_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R33: input coverage mask if not SIMD8. */
         c->nr_payload_regs++;
      }
   }

   /* R34-: bary for 32-pixel. */
   /* R58-59: interp W for 32-pixel. */

   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      c->source_depth_to_render_target = true;
   }
}
3108
3109 void
3110 fs_visitor::assign_binding_table_offsets()
3111 {
3112 uint32_t next_binding_table_offset = 0;
3113
3114 /* If there are no color regions, we still perform an FB write to a null
3115 * renderbuffer, which we place at surface index 0.
3116 */
3117 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3118 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3119
3120 assign_common_binding_table_offsets(next_binding_table_offset);
3121 }
3122
3123 void
3124 fs_visitor::calculate_register_pressure()
3125 {
3126 invalidate_live_intervals();
3127 calculate_live_intervals();
3128
3129 int num_instructions = 0;
3130 foreach_list(node, &this->instructions) {
3131 ++num_instructions;
3132 }
3133
3134 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3135
3136 for (int reg = 0; reg < virtual_grf_count; reg++) {
3137 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3138 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3139 }
3140 }
3141
3142 /**
3143 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3144 *
3145 * The needs_unlit_centroid_workaround ends up producing one of these per
3146 * channel of centroid input, so it's good to clean them up.
3147 *
3148 * An assumption here is that nothing ever modifies the dispatched pixels
3149 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3150 * dictates that anyway.
3151 */
3152 void
3153 fs_visitor::opt_drop_redundant_mov_to_flags()
3154 {
3155 bool flag_mov_found[2] = {false};
3156
3157 foreach_list_safe(node, &this->instructions) {
3158 fs_inst *inst = (fs_inst *)node;
3159
3160 if (inst->is_control_flow()) {
3161 memset(flag_mov_found, 0, sizeof(flag_mov_found));
3162 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3163 if (!flag_mov_found[inst->flag_subreg])
3164 flag_mov_found[inst->flag_subreg] = true;
3165 else
3166 inst->remove();
3167 } else if (inst->writes_flag()) {
3168 flag_mov_found[inst->flag_subreg] = false;
3169 }
3170 }
3171 }
3172
/**
 * Main driver for a single fragment shader compile at this visitor's
 * dispatch width: emits FS IR from the GLSL IR (or fragment program),
 * optimizes it, and register-allocates.
 *
 * Returns false if the compile failed (fail_msg holds the reason);
 * true on success.
 */
bool
fs_visitor::run()
{
   /* Snapshot the parameter count so we can assert at the end that nothing
    * in here appended state parameters (which could realloc ParameterValues
    * out from under the uniform storage set up at link time).
    */
   sanity_param_count = fp->Base.Parameters->NumParameters;
   bool allocated_without_spills;

   assign_binding_table_offsets();

   /* Lay out the fixed payload registers (pixel positions, barycentrics,
    * etc.) delivered by the hardware before the first GRF we can allocate.
    */
   if (brw->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   if (0) {
      /* Debug path: emit a trivial shader instead of the real one. */
      emit_dummy_fs();
   } else {
      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_begin();

      calculate_urb_setup();
      if (fp->Base.InputsRead > 0) {
         if (brw->gen < 6)
            emit_interpolation_setup_gen4();
         else
            emit_interpolation_setup_gen6();
      }

      /* We handle discards by keeping track of the still-live pixels in f0.1.
       * Initialize it with the dispatched pixels.
       */
      if (fp->UsesKill || c->key.alpha_test_func) {
         fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
         discard_init->flag_subreg = 1;
      }

      /* Generate FS IR for main(). (the visitor only descends into
       * functions called "main").
       */
      if (shader) {
         foreach_list(node, &*shader->base.ir) {
            ir_instruction *ir = (ir_instruction *)node;
            base_ir = ir;
            this->result = reg_undef;
            ir->accept(this);
         }
      } else {
         /* No GLSL shader: translate the Mesa IR fragment program instead. */
         emit_fragment_program_code();
      }
      base_ir = NULL;
      if (failed)
         return false;

      /* Placeholder the discard jump target; resolved in a later pass. */
      emit(FS_OPCODE_PLACEHOLDER_HALT);

      if (c->key.alpha_test_func)
         emit_alpha_test();

      emit_fb_writes();

      split_virtual_grfs();

      /* Uniform handling: large/indirect accesses become pull constants,
       * the rest get packed push-constant locations.
       */
      move_uniform_array_access_to_pull_constants();
      assign_constant_locations();
      demote_pull_constants();

      opt_drop_redundant_mov_to_flags();

      /* Optimization loop: each pass can expose opportunities for the
       * others, so iterate to a fixed point.
       */
      bool progress;
      do {
         progress = false;

         compact_virtual_grfs();

         progress = remove_duplicate_mrf_writes() || progress;

         progress = opt_algebraic() || progress;
         progress = opt_cse() || progress;
         progress = opt_copy_propagate() || progress;
         progress = opt_peephole_predicated_break() || progress;
         progress = dead_code_eliminate() || progress;
         progress = dead_code_eliminate_local() || progress;
         progress = opt_peephole_sel() || progress;
         progress = dead_control_flow_eliminate(this) || progress;
         progress = opt_saturate_propagation() || progress;
         progress = register_coalesce() || progress;
         progress = compute_to_mrf() || progress;
      } while (progress);

      lower_uniform_pull_constant_loads();

      assign_curb_setup();
      assign_urb_setup();

      static enum instruction_scheduler_mode pre_modes[] = {
         SCHEDULE_PRE,
         SCHEDULE_PRE_NON_LIFO,
         SCHEDULE_PRE_LIFO,
      };

      /* Try each scheduling heuristic to see if it can successfully register
       * allocate without spilling. They should be ordered by decreasing
       * performance but increasing likelihood of allocating.
       */
      for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
         schedule_instructions(pre_modes[i]);

         if (0) {
            /* Debug path: trivial allocation, never spills. */
            assign_regs_trivial();
            allocated_without_spills = true;
         } else {
            allocated_without_spills = assign_regs(false);
         }
         if (allocated_without_spills)
            break;
      }

      if (!allocated_without_spills) {
         /* We assume that any spilling is worse than just dropping back to
          * SIMD8. There's probably actually some intermediate point where
          * SIMD16 with a couple of spills is still better.
          */
         if (dispatch_width == 16) {
            fail("Failure to register allocate. Reduce number of "
                 "live scalar values to avoid this.");
         }

         /* Since we're out of heuristics, just go spill registers until we
          * get an allocation.
          */
         while (!assign_regs(true)) {
            if (failed)
               break;
         }
      }
   }
   assert(force_uncompressed_stack == 0);

   /* This must come after all optimization and register allocation, since
    * it inserts dead code that happens to have side effects, and it does
    * so based on the actual physical registers in use.
    */
   insert_gen4_send_dependency_workarounds();

   if (failed)
      return false;

   /* A spilling allocation invalidates the pre-RA schedule; re-schedule
    * with the final register assignments.
    */
   if (!allocated_without_spills)
      schedule_instructions(SCHEDULE_POST);

   /* Record GRF usage for the 3DSTATE_WM register-block fields. */
   if (dispatch_width == 8)
      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   else
      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory. Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == fp->Base.Parameters->NumParameters);

   return !failed;
}
3336
3337 const unsigned *
3338 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3339 struct gl_fragment_program *fp,
3340 struct gl_shader_program *prog,
3341 unsigned *final_assembly_size)
3342 {
3343 bool start_busy = false;
3344 double start_time = 0;
3345
3346 if (unlikely(brw->perf_debug)) {
3347 start_busy = (brw->batch.last_bo &&
3348 drm_intel_bo_busy(brw->batch.last_bo));
3349 start_time = get_time();
3350 }
3351
3352 struct brw_shader *shader = NULL;
3353 if (prog)
3354 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3355
3356 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3357 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3358
3359 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3360 */
3361 fs_visitor v(brw, c, prog, fp, 8);
3362 if (!v.run()) {
3363 if (prog) {
3364 prog->LinkStatus = false;
3365 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3366 }
3367
3368 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3369 v.fail_msg);
3370
3371 return NULL;
3372 }
3373
3374 exec_list *simd16_instructions = NULL;
3375 fs_visitor v2(brw, c, prog, fp, 16);
3376 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3377 if (!v.simd16_unsupported) {
3378 /* Try a SIMD16 compile */
3379 v2.import_uniforms(&v);
3380 if (!v2.run()) {
3381 perf_debug("SIMD16 shader failed to compile, falling back to "
3382 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3383 } else {
3384 simd16_instructions = &v2.instructions;
3385 }
3386 } else {
3387 perf_debug("SIMD16 shader unsupported, falling back to "
3388 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3389 }
3390 }
3391
3392 const unsigned *assembly = NULL;
3393 if (brw->gen >= 8) {
3394 gen8_fs_generator g(brw, c, prog, fp, v.do_dual_src);
3395 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3396 final_assembly_size);
3397 } else {
3398 fs_generator g(brw, c, prog, fp, v.do_dual_src);
3399 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3400 final_assembly_size);
3401 }
3402
3403 if (unlikely(brw->perf_debug) && shader) {
3404 if (shader->compiled_once)
3405 brw_wm_debug_recompile(brw, prog, &c->key);
3406 shader->compiled_once = true;
3407
3408 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3409 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3410 (get_time() - start_time) * 1000);
3411 }
3412 }
3413
3414 return assembly;
3415 }
3416
3417 bool
3418 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3419 {
3420 struct brw_context *brw = brw_context(ctx);
3421 struct brw_wm_prog_key key;
3422
3423 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3424 return true;
3425
3426 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3427 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3428 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3429 bool program_uses_dfdy = fp->UsesDFdy;
3430
3431 memset(&key, 0, sizeof(key));
3432
3433 if (brw->gen < 6) {
3434 if (fp->UsesKill)
3435 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3436
3437 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3438 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3439
3440 /* Just assume depth testing. */
3441 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3442 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3443 }
3444
3445 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3446 BRW_FS_VARYING_INPUT_MASK) > 16)
3447 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3448
3449 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3450
3451 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3452 for (unsigned i = 0; i < sampler_count; i++) {
3453 if (fp->Base.ShadowSamplers & (1 << i)) {
3454 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3455 key.tex.swizzles[i] =
3456 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3457 } else {
3458 /* Color sampler: assume no swizzling. */
3459 key.tex.swizzles[i] = SWIZZLE_XYZW;
3460 }
3461 }
3462
3463 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3464 key.drawable_height = ctx->DrawBuffer->Height;
3465 }
3466
3467 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3468 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3469 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3470
3471 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3472 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3473 key.nr_color_regions > 1;
3474 }
3475
3476 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3477 * quality of the derivatives is likely to be determined by the driconf
3478 * option.
3479 */
3480 key.high_quality_derivatives = brw->disable_derivative_optimization;
3481
3482 key.program_string_id = bfp->id;
3483
3484 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3485 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3486
3487 bool success = do_wm_prog(brw, prog, bfp, &key);
3488
3489 brw->wm.base.prog_offset = old_prog_offset;
3490 brw->wm.prog_data = old_prog_data;
3491
3492 return success;
3493 }