i965: Stop throwing away our double precision for time calculations.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->opcode = BRW_OPCODE_NOP;
59 this->conditional_mod = BRW_CONDITIONAL_NONE;
60
61 this->dst = reg_undef;
62 this->src[0] = reg_undef;
63 this->src[1] = reg_undef;
64 this->src[2] = reg_undef;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68 }
69
70 fs_inst::fs_inst()
71 {
72 init();
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187 ALU2(SEL)
188
189 /** Gen4 predicated IF. */
190 fs_inst *
191 fs_visitor::IF(uint32_t predicate)
192 {
193 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195 return inst;
196 }
197
198 /** Gen6 IF with embedded comparison. */
199 fs_inst *
200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
201 {
202 assert(brw->gen == 6);
203 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
204 reg_null_d, src0, src1);
205 inst->conditional_mod = condition;
206 return inst;
207 }
208
209 /**
210 * CMP: Sets the low bit of the destination channels with the result
211 * of the comparison, while the upper bits are undefined, and updates
212 * the flag register with the packed 16 bits of the result.
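 *
 * One common pattern -- a sketch only, with hypothetical operand names --
 * is to compare against the null register and then predicate the
 * instruction that consumes the flag result:
 *
 *    emit(CMP(reg_null_d, x, y, BRW_CONDITIONAL_GE));
 *    fs_inst *sel = emit(SEL(dst, x, y));
 *    sel->predicate = BRW_PREDICATE_NORMAL;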
213 */
214 fs_inst *
215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
216 {
217 fs_inst *inst;
218
219 /* Take the instruction:
220 *
221 * CMP null<d> src0<f> src1<f>
222 *
223 * Original gen4 does type conversion to the destination type before
224 * comparison, producing garbage results for floating point comparisons.
225 * gen5 does the comparison on the execution type (resolved source types),
226 * so dst type doesn't matter. gen6 does comparison and then uses the
227 * result as if it was the dst type with no conversion, which happens to
228 * mostly work out for float-interpreted-as-int since our comparisons are
229 * for >0, =0, <0.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 exec_list
247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
248 fs_reg varying_offset,
249 uint32_t const_offset)
250 {
251 exec_list instructions;
252 fs_inst *inst;
253
254 /* We have our constant surface use a pitch of 4 bytes, so our index can
255 * be any component of a vector, and then we load 4 contiguous
256 * components starting from that.
257 *
258        * We break down the const_offset into a portion added to the variable
259 * offset and a portion done using reg_offset, which means that if you
260 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
261 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
262 * CSE can later notice that those loads are all the same and eliminate
263 * the redundant ones.
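        *
        * Illustrative arithmetic: with const_offset == 6, the ADD below folds
        * (6 & ~3) == 4 into vec4_offset, and the remaining (6 & 3) == 2 is
        * applied to vec4_result.reg_offset further down (scaled by 'scale').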
264 */
265 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
266 instructions.push_tail(ADD(vec4_offset,
267 varying_offset, const_offset & ~3));
268
269 int scale = 1;
270 if (brw->gen == 4 && dispatch_width == 8) {
271 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
272 * u, v, r) as parameters, or we can just use the SIMD16 message
273 * consisting of (header, u). We choose the second, at the cost of a
274 * longer return length.
275 */
276 scale = 2;
277 }
278
279 enum opcode op;
280 if (brw->gen >= 7)
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
282 else
283 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
284 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
285 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
286 inst->regs_written = 4 * scale;
287 instructions.push_tail(inst);
288
289 if (brw->gen < 7) {
290 inst->base_mrf = 13;
291 inst->header_present = true;
292 if (brw->gen == 4)
293 inst->mlen = 3;
294 else
295 inst->mlen = 1 + dispatch_width / 8;
296 }
297
298 vec4_result.reg_offset += (const_offset & 3) * scale;
299 instructions.push_tail(MOV(dst, vec4_result));
300
301 return instructions;
302 }
303
304 /**
305  * A helper that generates the MOV used to work around broken hardware SEND
306  * dependency handling.
307 */
308 fs_inst *
309 fs_visitor::DEP_RESOLVE_MOV(int grf)
310 {
311 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
312
313 inst->ir = NULL;
314 inst->annotation = "send dependency resolve";
315
316 /* The caller always wants uncompressed to emit the minimal extra
317 * dependencies, and to avoid having to deal with aligning its regs to 2.
318 */
319 inst->force_uncompressed = true;
320
321 return inst;
322 }
323
324 bool
325 fs_inst::equals(fs_inst *inst)
326 {
327 return (opcode == inst->opcode &&
328 dst.equals(inst->dst) &&
329 src[0].equals(inst->src[0]) &&
330 src[1].equals(inst->src[1]) &&
331 src[2].equals(inst->src[2]) &&
332 saturate == inst->saturate &&
333 predicate == inst->predicate &&
334 conditional_mod == inst->conditional_mod &&
335 mlen == inst->mlen &&
336 base_mrf == inst->base_mrf &&
337 sampler == inst->sampler &&
338 target == inst->target &&
339 eot == inst->eot &&
340 header_present == inst->header_present &&
341 shadow_compare == inst->shadow_compare &&
342 offset == inst->offset);
343 }
344
345 bool
346 fs_inst::overwrites_reg(const fs_reg &reg)
347 {
348 return (reg.file == dst.file &&
349 reg.reg == dst.reg &&
350 reg.reg_offset >= dst.reg_offset &&
351 reg.reg_offset < dst.reg_offset + regs_written);
352 }
353
354 bool
355 fs_inst::is_send_from_grf()
356 {
357 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
358 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
359 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
360 src[1].file == GRF) ||
361 (is_tex() && src[0].file == GRF));
362 }
363
364 bool
365 fs_visitor::can_do_source_mods(fs_inst *inst)
366 {
367 if (brw->gen == 6 && inst->is_math())
368 return false;
369
370 if (inst->is_send_from_grf())
371 return false;
372
373 if (!inst->can_do_source_mods())
374 return false;
375
376 return true;
377 }
378
379 void
380 fs_reg::init()
381 {
382 memset(this, 0, sizeof(*this));
383 stride = 1;
384 }
385
386 /** Generic unset register constructor. */
387 fs_reg::fs_reg()
388 {
389 init();
390 this->file = BAD_FILE;
391 }
392
393 /** Immediate value constructor. */
394 fs_reg::fs_reg(float f)
395 {
396 init();
397 this->file = IMM;
398 this->type = BRW_REGISTER_TYPE_F;
399 this->imm.f = f;
400 }
401
402 /** Immediate value constructor. */
403 fs_reg::fs_reg(int32_t i)
404 {
405 init();
406 this->file = IMM;
407 this->type = BRW_REGISTER_TYPE_D;
408 this->imm.i = i;
409 }
410
411 /** Immediate value constructor. */
412 fs_reg::fs_reg(uint32_t u)
413 {
414 init();
415 this->file = IMM;
416 this->type = BRW_REGISTER_TYPE_UD;
417 this->imm.u = u;
418 }
419
420 /** Fixed brw_reg. */
421 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
422 {
423 init();
424 this->file = HW_REG;
425 this->fixed_hw_reg = fixed_hw_reg;
426 this->type = fixed_hw_reg.type;
427 }
428
429 bool
430 fs_reg::equals(const fs_reg &r) const
431 {
432 return (file == r.file &&
433 reg == r.reg &&
434 reg_offset == r.reg_offset &&
435 subreg_offset == r.subreg_offset &&
436 type == r.type &&
437 negate == r.negate &&
438 abs == r.abs &&
439 !reladdr && !r.reladdr &&
440 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
441 sizeof(fixed_hw_reg)) == 0 &&
442 stride == r.stride &&
443 imm.u == r.imm.u);
444 }
445
446 fs_reg &
447 fs_reg::apply_stride(unsigned stride)
448 {
449 assert((this->stride * stride) <= 4 &&
450 (is_power_of_two(stride) || stride == 0) &&
451 file != HW_REG && file != IMM);
452 this->stride *= stride;
453 return *this;
454 }
455
456 fs_reg &
457 fs_reg::set_smear(unsigned subreg)
458 {
459 assert(file != HW_REG && file != IMM);
460 subreg_offset = subreg * type_sz(type);
461 stride = 0;
462 return *this;
463 }
464
465 bool
466 fs_reg::is_contiguous() const
467 {
468 return stride == 1;
469 }
470
471 bool
472 fs_reg::is_zero() const
473 {
474 if (file != IMM)
475 return false;
476
477 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
478 }
479
480 bool
481 fs_reg::is_one() const
482 {
483 if (file != IMM)
484 return false;
485
486 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
487 }
488
489 bool
490 fs_reg::is_null() const
491 {
492 return file == HW_REG &&
493 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
494 fixed_hw_reg.nr == BRW_ARF_NULL;
495 }
496
497 bool
498 fs_reg::is_valid_3src() const
499 {
500 return file == GRF || file == UNIFORM;
501 }
502
503 int
504 fs_visitor::type_size(const struct glsl_type *type)
505 {
506 unsigned int size, i;
507
508 switch (type->base_type) {
509 case GLSL_TYPE_UINT:
510 case GLSL_TYPE_INT:
511 case GLSL_TYPE_FLOAT:
512 case GLSL_TYPE_BOOL:
513 return type->components();
514 case GLSL_TYPE_ARRAY:
515 return type_size(type->fields.array) * type->length;
516 case GLSL_TYPE_STRUCT:
517 size = 0;
518 for (i = 0; i < type->length; i++) {
519 size += type_size(type->fields.structure[i].type);
520 }
521 return size;
522 case GLSL_TYPE_SAMPLER:
523 /* Samplers take up no register space, since they're baked in at
524 * link time.
525 */
526 return 0;
527 case GLSL_TYPE_ATOMIC_UINT:
528 return 0;
529 case GLSL_TYPE_IMAGE:
530 case GLSL_TYPE_VOID:
531 case GLSL_TYPE_ERROR:
532 case GLSL_TYPE_INTERFACE:
533 assert(!"not reached");
534 break;
535 }
536
537 return 0;
538 }
539
540 fs_reg
541 fs_visitor::get_timestamp()
542 {
543 assert(brw->gen >= 7);
544
545 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
546 BRW_ARF_TIMESTAMP,
547 0),
548 BRW_REGISTER_TYPE_UD));
549
550 fs_reg dst = fs_reg(this, glsl_type::uint_type);
551
552 fs_inst *mov = emit(MOV(dst, ts));
553    /* We want to read the 3 fields we care about (mostly field 0, but also
554     * field 2) even if some channels aren't enabled in the dispatch.
555 */
556 mov->force_writemask_all = true;
557 mov->force_uncompressed = true;
558
559 /* The caller wants the low 32 bits of the timestamp. Since it's running
560        * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
561 * which is plenty of time for our purposes. It is identical across the
562 * EUs, but since it's tracking GPU core speed it will increment at a
563 * varying rate as render P-states change.
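        * (At the ~1.2 GHz rate noted above, the 32-bit counter wraps after
        * roughly 2^32 / 1.2e9 =~ 3.6 seconds.)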
564 *
565 * The caller could also check if render P-states have changed (or anything
566 * else that might disrupt timing) by setting smear to 2 and checking if
567 * that field is != 0.
568 */
569 dst.set_smear(0);
570
571 return dst;
572 }
573
574 void
575 fs_visitor::emit_shader_time_begin()
576 {
577 current_annotation = "shader time start";
578 shader_start_time = get_timestamp();
579 }
580
581 void
582 fs_visitor::emit_shader_time_end()
583 {
584 current_annotation = "shader time end";
585
586 enum shader_time_shader_type type, written_type, reset_type;
587 if (dispatch_width == 8) {
588 type = ST_FS8;
589 written_type = ST_FS8_WRITTEN;
590 reset_type = ST_FS8_RESET;
591 } else {
592 assert(dispatch_width == 16);
593 type = ST_FS16;
594 written_type = ST_FS16_WRITTEN;
595 reset_type = ST_FS16_RESET;
596 }
597
598 fs_reg shader_end_time = get_timestamp();
599
600 /* Check that there weren't any timestamp reset events (assuming these
601 * were the only two timestamp reads that happened).
602 */
603 fs_reg reset = shader_end_time;
604 reset.set_smear(2);
605 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
606 test->conditional_mod = BRW_CONDITIONAL_Z;
607 emit(IF(BRW_PREDICATE_NORMAL));
608
609 push_force_uncompressed();
610 fs_reg start = shader_start_time;
611 start.negate = true;
612 fs_reg diff = fs_reg(this, glsl_type::uint_type);
613 emit(ADD(diff, start, shader_end_time));
614
615 /* If there were no instructions between the two timestamp gets, the diff
616 * is 2 cycles. Remove that overhead, so I can forget about that when
617 * trying to determine the time taken for single instructions.
618 */
619 emit(ADD(diff, diff, fs_reg(-2u)));
620
621 emit_shader_time_write(type, diff);
622 emit_shader_time_write(written_type, fs_reg(1u));
623 emit(BRW_OPCODE_ELSE);
624 emit_shader_time_write(reset_type, fs_reg(1u));
625 emit(BRW_OPCODE_ENDIF);
626
627 pop_force_uncompressed();
628 }
629
630 void
631 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
632 fs_reg value)
633 {
634 int shader_time_index =
635 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
636 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
637
638 fs_reg payload;
639 if (dispatch_width == 8)
640 payload = fs_reg(this, glsl_type::uvec2_type);
641 else
642 payload = fs_reg(this, glsl_type::uint_type);
643
644 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
645 fs_reg(), payload, offset, value));
646 }
647
648 void
649 fs_visitor::fail(const char *format, ...)
650 {
651 va_list va;
652 char *msg;
653
654 if (failed)
655 return;
656
657 failed = true;
658
659 va_start(va, format);
660 msg = ralloc_vasprintf(mem_ctx, format, va);
661 va_end(va);
662 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
663
664 this->fail_msg = msg;
665
666 if (INTEL_DEBUG & DEBUG_WM) {
667 fprintf(stderr, "%s", msg);
668 }
669 }
670
671 fs_inst *
672 fs_visitor::emit(enum opcode opcode)
673 {
674 return emit(fs_inst(opcode));
675 }
676
677 fs_inst *
678 fs_visitor::emit(enum opcode opcode, fs_reg dst)
679 {
680 return emit(fs_inst(opcode, dst));
681 }
682
683 fs_inst *
684 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
685 {
686 return emit(fs_inst(opcode, dst, src0));
687 }
688
689 fs_inst *
690 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
691 {
692 return emit(fs_inst(opcode, dst, src0, src1));
693 }
694
695 fs_inst *
696 fs_visitor::emit(enum opcode opcode, fs_reg dst,
697 fs_reg src0, fs_reg src1, fs_reg src2)
698 {
699 return emit(fs_inst(opcode, dst, src0, src1, src2));
700 }
701
702 void
703 fs_visitor::push_force_uncompressed()
704 {
705 force_uncompressed_stack++;
706 }
707
708 void
709 fs_visitor::pop_force_uncompressed()
710 {
711 force_uncompressed_stack--;
712 assert(force_uncompressed_stack >= 0);
713 }
714
715 /**
716 * Returns true if the instruction has a flag that means it won't
717 * update an entire destination register.
718 *
719 * For example, dead code elimination and live variable analysis want to know
720 * when a write to a variable screens off any preceding values that were in
721 * it.
722 */
723 bool
724 fs_inst::is_partial_write()
725 {
726 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
727 this->force_uncompressed ||
728 this->force_sechalf || !this->dst.is_contiguous());
729 }
730
731 int
732 fs_inst::regs_read(fs_visitor *v, int arg)
733 {
734 if (is_tex() && arg == 0 && src[0].file == GRF) {
735 if (v->dispatch_width == 16)
736 return (mlen + 1) / 2;
737 else
738 return mlen;
739 }
740 return 1;
741 }
742
743 bool
744 fs_inst::reads_flag()
745 {
746 return predicate;
747 }
748
749 bool
750 fs_inst::writes_flag()
751 {
752 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
753 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
754 }
755
756 /**
757 * Returns how many MRFs an FS opcode will write over.
758 *
759 * Note that this is not the 0 or 1 implied writes in an actual gen
760 * instruction -- the FS opcodes often generate MOVs in addition.
761 */
762 int
763 fs_visitor::implied_mrf_writes(fs_inst *inst)
764 {
765 if (inst->mlen == 0)
766 return 0;
767
768 if (inst->base_mrf == -1)
769 return 0;
770
771 switch (inst->opcode) {
772 case SHADER_OPCODE_RCP:
773 case SHADER_OPCODE_RSQ:
774 case SHADER_OPCODE_SQRT:
775 case SHADER_OPCODE_EXP2:
776 case SHADER_OPCODE_LOG2:
777 case SHADER_OPCODE_SIN:
778 case SHADER_OPCODE_COS:
779 return 1 * dispatch_width / 8;
780 case SHADER_OPCODE_POW:
781 case SHADER_OPCODE_INT_QUOTIENT:
782 case SHADER_OPCODE_INT_REMAINDER:
783 return 2 * dispatch_width / 8;
784 case SHADER_OPCODE_TEX:
785 case FS_OPCODE_TXB:
786 case SHADER_OPCODE_TXD:
787 case SHADER_OPCODE_TXF:
788 case SHADER_OPCODE_TXF_CMS:
789 case SHADER_OPCODE_TXF_MCS:
790 case SHADER_OPCODE_TG4:
791 case SHADER_OPCODE_TG4_OFFSET:
792 case SHADER_OPCODE_TXL:
793 case SHADER_OPCODE_TXS:
794 case SHADER_OPCODE_LOD:
795 return 1;
796 case FS_OPCODE_FB_WRITE:
797 return 2;
798 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
799 case SHADER_OPCODE_GEN4_SCRATCH_READ:
800 return 1;
801 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
802 return inst->mlen;
803 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
804 return 2;
805 case SHADER_OPCODE_UNTYPED_ATOMIC:
806 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
807 return 0;
808 default:
809 assert(!"not reached");
810 return inst->mlen;
811 }
812 }
813
814 int
815 fs_visitor::virtual_grf_alloc(int size)
816 {
817 if (virtual_grf_array_size <= virtual_grf_count) {
818 if (virtual_grf_array_size == 0)
819 virtual_grf_array_size = 16;
820 else
821 virtual_grf_array_size *= 2;
822 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
823 virtual_grf_array_size);
824 }
825 virtual_grf_sizes[virtual_grf_count] = size;
826 return virtual_grf_count++;
827 }
828
829 /** Fixed HW reg constructor. */
830 fs_reg::fs_reg(enum register_file file, int reg)
831 {
832 init();
833 this->file = file;
834 this->reg = reg;
835 this->type = BRW_REGISTER_TYPE_F;
836 }
837
838 /** Fixed HW reg constructor. */
839 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
840 {
841 init();
842 this->file = file;
843 this->reg = reg;
844 this->type = type;
845 }
846
847 /** Automatic reg constructor. */
848 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
849 {
850 init();
851
852 this->file = GRF;
853 this->reg = v->virtual_grf_alloc(v->type_size(type));
854 this->reg_offset = 0;
855 this->type = brw_type_for_base_type(type);
856 }
857
858 fs_reg *
859 fs_visitor::variable_storage(ir_variable *var)
860 {
861 return (fs_reg *)hash_table_find(this->variable_ht, var);
862 }
863
864 void
865 import_uniforms_callback(const void *key,
866 void *data,
867 void *closure)
868 {
869 struct hash_table *dst_ht = (struct hash_table *)closure;
870 const fs_reg *reg = (const fs_reg *)data;
871
872 if (reg->file != UNIFORM)
873 return;
874
875 hash_table_insert(dst_ht, data, key);
876 }
877
878 /* For SIMD16, we need to follow the uniform setup done for the SIMD8
879  * dispatch.  This brings in those uniform definitions.
880 */
881 void
882 fs_visitor::import_uniforms(fs_visitor *v)
883 {
884 hash_table_call_foreach(v->variable_ht,
885 import_uniforms_callback,
886 variable_ht);
887 this->params_remap = v->params_remap;
888 this->nr_params_remap = v->nr_params_remap;
889 }
890
891 /* Our support for uniforms is piggy-backed on the struct
892 * gl_fragment_program, because that's where the values actually
893 * get stored, rather than in some global gl_shader_program uniform
894 * store.
895 */
896 void
897 fs_visitor::setup_uniform_values(ir_variable *ir)
898 {
899 int namelen = strlen(ir->name);
900
901 /* The data for our (non-builtin) uniforms is stored in a series of
902 * gl_uniform_driver_storage structs for each subcomponent that
903 * glGetUniformLocation() could name. We know it's been set up in the same
904 * order we'd walk the type, so walk the list of storage and find anything
905 * with our name, or the prefix of a component that starts with our name.
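    *
    * For example (hypothetical names): a uniform "mat" matches storage
    * entries named "mat", "mat[3]" and "mat.col0", but not "material",
    * because the character following the prefix must be '\0', '.' or '['.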
906 */
907 unsigned params_before = uniforms;
908 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
909 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
910
911 if (strncmp(ir->name, storage->name, namelen) != 0 ||
912 (storage->name[namelen] != 0 &&
913 storage->name[namelen] != '.' &&
914 storage->name[namelen] != '[')) {
915 continue;
916 }
917
918 unsigned slots = storage->type->component_slots();
919 if (storage->array_elements)
920 slots *= storage->array_elements;
921
922 for (unsigned i = 0; i < slots; i++) {
923 stage_prog_data->param[uniforms++] = &storage->storage[i].f;
924 }
925 }
926
927 /* Make sure we actually initialized the right amount of stuff here. */
928 assert(params_before + ir->type->component_slots() == uniforms);
929 (void)params_before;
930 }
931
932
933 /* Our support for builtin uniforms is even scarier than non-builtin.
934 * It sits on top of the PROG_STATE_VAR parameters that are
935 * automatically updated from GL context state.
936 */
937 void
938 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
939 {
940 const ir_state_slot *const slots = ir->state_slots;
941 assert(ir->state_slots != NULL);
942
943 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
944 /* This state reference has already been setup by ir_to_mesa, but we'll
945 * get the same index back here.
946 */
947 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
948 (gl_state_index *)slots[i].tokens);
949
950 /* Add each of the unique swizzles of the element as a parameter.
951 * This'll end up matching the expected layout of the
952 * array/matrix/structure we're trying to fill in.
953 */
954 int last_swiz = -1;
955 for (unsigned int j = 0; j < 4; j++) {
956 int swiz = GET_SWZ(slots[i].swizzle, j);
957 if (swiz == last_swiz)
958 break;
959 last_swiz = swiz;
960
961 stage_prog_data->param[uniforms++] =
962 &fp->Base.Parameters->ParameterValues[index][swiz].f;
963 }
964 }
965 }
966
967 fs_reg *
968 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
969 {
970 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
971 fs_reg wpos = *reg;
972 bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
973
974 /* gl_FragCoord.x */
975 if (ir->data.pixel_center_integer) {
976 emit(MOV(wpos, this->pixel_x));
977 } else {
978 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
979 }
980 wpos.reg_offset++;
981
982 /* gl_FragCoord.y */
983 if (!flip && ir->data.pixel_center_integer) {
984 emit(MOV(wpos, this->pixel_y));
985 } else {
986 fs_reg pixel_y = this->pixel_y;
987 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
988
989 if (flip) {
990 pixel_y.negate = true;
991 offset += c->key.drawable_height - 1.0;
992 }
993
994 emit(ADD(wpos, pixel_y, fs_reg(offset)));
995 }
996 wpos.reg_offset++;
997
998 /* gl_FragCoord.z */
999 if (brw->gen >= 6) {
1000 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
1001 } else {
1002 emit(FS_OPCODE_LINTERP, wpos,
1003 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1004 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1005 interp_reg(VARYING_SLOT_POS, 2));
1006 }
1007 wpos.reg_offset++;
1008
1009 /* gl_FragCoord.w: Already set up in emit_interpolation */
1010 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1011
1012 return reg;
1013 }
1014
1015 fs_inst *
1016 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1017 glsl_interp_qualifier interpolation_mode,
1018 bool is_centroid, bool is_sample)
1019 {
1020 brw_wm_barycentric_interp_mode barycoord_mode;
1021 if (brw->gen >= 6) {
1022 if (is_centroid) {
1023 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1024 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1025 else
1026 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1027 } else if (is_sample) {
1028 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1029 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1030 else
1031 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1032 } else {
1033 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1034 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1035 else
1036 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1037 }
1038 } else {
1039 /* On Ironlake and below, there is only one interpolation mode.
1040 * Centroid interpolation doesn't mean anything on this hardware --
1041 * there is no multisampling.
1042 */
1043 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1044 }
1045 return emit(FS_OPCODE_LINTERP, attr,
1046 this->delta_x[barycoord_mode],
1047 this->delta_y[barycoord_mode], interp);
1048 }
1049
1050 fs_reg *
1051 fs_visitor::emit_general_interpolation(ir_variable *ir)
1052 {
1053 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1054 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1055 fs_reg attr = *reg;
1056
1057 unsigned int array_elements;
1058 const glsl_type *type;
1059
1060 if (ir->type->is_array()) {
1061 array_elements = ir->type->length;
1062 if (array_elements == 0) {
1063 fail("dereferenced array '%s' has length 0\n", ir->name);
1064 }
1065 type = ir->type->fields.array;
1066 } else {
1067 array_elements = 1;
1068 type = ir->type;
1069 }
1070
1071 glsl_interp_qualifier interpolation_mode =
1072 ir->determine_interpolation_mode(c->key.flat_shade);
1073
1074 int location = ir->data.location;
1075 for (unsigned int i = 0; i < array_elements; i++) {
1076 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1077 if (c->prog_data.urb_setup[location] == -1) {
1078 /* If there's no incoming setup data for this slot, don't
1079 * emit interpolation for it.
1080 */
1081 attr.reg_offset += type->vector_elements;
1082 location++;
1083 continue;
1084 }
1085
1086 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1087 /* Constant interpolation (flat shading) case. The SF has
1088 * handed us defined values in only the constant offset
1089 * field of the setup reg.
1090 */
1091 for (unsigned int k = 0; k < type->vector_elements; k++) {
1092 struct brw_reg interp = interp_reg(location, k);
1093 interp = suboffset(interp, 3);
1094 interp.type = reg->type;
1095 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1096 attr.reg_offset++;
1097 }
1098 } else {
1099 /* Smooth/noperspective interpolation case. */
1100 for (unsigned int k = 0; k < type->vector_elements; k++) {
1101 struct brw_reg interp = interp_reg(location, k);
1102 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1103 ir->data.centroid && !c->key.persample_shading,
1104 ir->data.sample || c->key.persample_shading);
1105 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1106 /* Get the pixel/sample mask into f0 so that we know
1107 * which pixels are lit. Then, for each channel that is
1108 * unlit, replace the centroid data with non-centroid
1109 * data.
1110 */
1111 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1112 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1113 interpolation_mode,
1114 false, false);
1115 inst->predicate = BRW_PREDICATE_NORMAL;
1116 inst->predicate_inverse = true;
1117 }
1118 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1119 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1120 }
1121 attr.reg_offset++;
1122 }
1123
1124 }
1125 location++;
1126 }
1127 }
1128
1129 return reg;
1130 }
1131
1132 fs_reg *
1133 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1134 {
1135 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1136
1137 /* The frontfacing comes in as a bit in the thread payload. */
1138 if (brw->gen >= 6) {
1139 emit(BRW_OPCODE_ASR, *reg,
1140 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1141 fs_reg(15));
1142 emit(BRW_OPCODE_NOT, *reg, *reg);
1143 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1144 } else {
1145 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1146 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1147 * us front face
1148 */
1149 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1150 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1151 }
1152
1153 return reg;
1154 }
1155
1156 void
1157 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1158 {
1159 assert(dst.type == BRW_REGISTER_TYPE_F);
1160
1161 if (c->key.compute_pos_offset) {
1162 /* Convert int_sample_pos to floating point */
1163 emit(MOV(dst, int_sample_pos));
1164 /* Scale to the range [0, 1] */
1165 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1166 }
1167 else {
1168 /* From ARB_sample_shading specification:
1169 * "When rendering to a non-multisample buffer, or if multisample
1170 * rasterization is disabled, gl_SamplePosition will always be
1171 * (0.5, 0.5).
1172        *  (0.5, 0.5)."
1173 emit(MOV(dst, fs_reg(0.5f)));
1174 }
1175 }
1176
1177 fs_reg *
1178 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1179 {
1180 assert(brw->gen >= 6);
1181 assert(ir->type == glsl_type::vec2_type);
1182
1183 this->current_annotation = "compute sample position";
1184 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1185 fs_reg pos = *reg;
1186 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1187 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1188
1189 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1190 * mode will be enabled.
1191 *
1192 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1193 * R31.1:0 Position Offset X/Y for Slot[3:0]
1194 * R31.3:2 Position Offset X/Y for Slot[7:4]
1195 * .....
1196 *
1197 * The X, Y sample positions come in as bytes in thread payload. So, read
1198 * the positions using vstride=16, width=8, hstride=2.
1199 */
1200 struct brw_reg sample_pos_reg =
1201 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1202 BRW_REGISTER_TYPE_B), 16, 8, 2);
1203
1204 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1205 if (dispatch_width == 16) {
1206 fs_inst *inst = emit(MOV(half(int_sample_x, 1),
1207 fs_reg(suboffset(sample_pos_reg, 16))));
1208 inst->force_sechalf = true;
1209 }
1210 /* Compute gl_SamplePosition.x */
1211 compute_sample_position(pos, int_sample_x);
1212 pos.reg_offset++;
1213 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1214 if (dispatch_width == 16) {
1215 fs_inst *inst = emit(MOV(half(int_sample_y, 1),
1216 fs_reg(suboffset(sample_pos_reg, 17))));
1217 inst->force_sechalf = true;
1218 }
1219 /* Compute gl_SamplePosition.y */
1220 compute_sample_position(pos, int_sample_y);
1221 return reg;
1222 }
1223
1224 fs_reg *
1225 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1226 {
1227 assert(brw->gen >= 6);
1228
1229 this->current_annotation = "compute sample id";
1230 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1231
1232 if (c->key.compute_sample_id) {
1233 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1234 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1235 t2.type = BRW_REGISTER_TYPE_UW;
1236
1237 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1238 * 8x multisampling, subspan 0 will represent sample N (where N
1239 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1240 * 7. We can find the value of N by looking at R0.0 bits 7:6
1241 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1242 * (since samples are always delivered in pairs). That is, we
1243 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1244 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1245 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1246 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1247 * populating a temporary variable with the sequence (0, 1, 2, 3),
1248 * and then reading from it using vstride=1, width=4, hstride=0.
1249 * These computations hold good for 4x multisampling as well.
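       *
       * Quick sanity check on the arithmetic: if R0.0 bits 7:6 are 0b10
       * (so R0.0 & 0xc0 == 0x80), then (0x80 >> 5) == 4 == 2 * (0x80 >> 6),
       * i.e. starting sample pair index 2 maps to starting sample 4.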
1250 */
1251 emit(BRW_OPCODE_AND, t1,
1252 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1253 fs_reg(brw_imm_d(0xc0)));
1254 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1255 /* This works for both SIMD8 and SIMD16 */
1256 emit(MOV(t2, brw_imm_v(0x3210)));
1257 /* This special instruction takes care of setting vstride=1,
1258 * width=4, hstride=0 of t2 during an ADD instruction.
1259 */
1260 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1261 } else {
1262 /* As per GL_ARB_sample_shading specification:
1263 * "When rendering to a non-multisample buffer, or if multisample
1264 * rasterization is disabled, gl_SampleID will always be zero."
1265 */
1266 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1267 }
1268
1269 return reg;
1270 }
1271
1272 fs_reg *
1273 fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
1274 {
1275 assert(brw->gen >= 7);
1276 this->current_annotation = "compute gl_SampleMaskIn";
1277 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1278 emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
1279 return reg;
1280 }
1281
1282 fs_reg
1283 fs_visitor::fix_math_operand(fs_reg src)
1284 {
1285 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1286 * might be able to do better by doing execsize = 1 math and then
1287 * expanding that result out, but we would need to be careful with
1288 * masking.
1289 *
1290 * The hardware ignores source modifiers (negate and abs) on math
1291 * instructions, so we also move to a temp to set those up.
1292 */
1293 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1294 !src.abs && !src.negate)
1295 return src;
1296
1297 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1298 * operands to math
1299 */
1300 if (brw->gen >= 7 && src.file != IMM)
1301 return src;
1302
1303 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1304 expanded.type = src.type;
1305 emit(BRW_OPCODE_MOV, expanded, src);
1306 return expanded;
1307 }
1308
1309 fs_inst *
1310 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1311 {
1312 switch (opcode) {
1313 case SHADER_OPCODE_RCP:
1314 case SHADER_OPCODE_RSQ:
1315 case SHADER_OPCODE_SQRT:
1316 case SHADER_OPCODE_EXP2:
1317 case SHADER_OPCODE_LOG2:
1318 case SHADER_OPCODE_SIN:
1319 case SHADER_OPCODE_COS:
1320 break;
1321 default:
1322 assert(!"not reached: bad math opcode");
1323 return NULL;
1324 }
1325
1326 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1327 * might be able to do better by doing execsize = 1 math and then
1328 * expanding that result out, but we would need to be careful with
1329 * masking.
1330 *
1331 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1332 * instructions, so we also move to a temp to set those up.
1333 */
1334 if (brw->gen >= 6)
1335 src = fix_math_operand(src);
1336
1337 fs_inst *inst = emit(opcode, dst, src);
1338
1339 if (brw->gen < 6) {
1340 inst->base_mrf = 2;
1341 inst->mlen = dispatch_width / 8;
1342 }
1343
1344 return inst;
1345 }
1346
1347 fs_inst *
1348 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1349 {
1350 int base_mrf = 2;
1351 fs_inst *inst;
1352
1353 switch (opcode) {
1354 case SHADER_OPCODE_INT_QUOTIENT:
1355 case SHADER_OPCODE_INT_REMAINDER:
1356 if (brw->gen >= 7 && dispatch_width == 16)
1357 fail("SIMD16 INTDIV unsupported\n");
1358 break;
1359 case SHADER_OPCODE_POW:
1360 break;
1361 default:
1362 assert(!"not reached: unsupported binary math opcode.");
1363 return NULL;
1364 }
1365
1366 if (brw->gen >= 6) {
1367 src0 = fix_math_operand(src0);
1368 src1 = fix_math_operand(src1);
1369
1370 inst = emit(opcode, dst, src0, src1);
1371 } else {
1372 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1373 * "Message Payload":
1374 *
1375 * "Operand0[7]. For the INT DIV functions, this operand is the
1376 * denominator."
1377 * ...
1378 * "Operand1[7]. For the INT DIV functions, this operand is the
1379 * numerator."
1380 */
1381 bool is_int_div = opcode != SHADER_OPCODE_POW;
1382 fs_reg &op0 = is_int_div ? src1 : src0;
1383 fs_reg &op1 = is_int_div ? src0 : src1;
1384
1385 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1386 inst = emit(opcode, dst, op0, reg_null_f);
1387
1388 inst->base_mrf = base_mrf;
1389 inst->mlen = 2 * dispatch_width / 8;
1390 }
1391 return inst;
1392 }
1393
1394 void
1395 fs_visitor::assign_curb_setup()
1396 {
1397 if (dispatch_width == 8) {
1398 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1399 stage_prog_data->nr_params = uniforms;
1400 } else {
1401 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1402 /* Make sure we didn't try to sneak in an extra uniform */
1403 assert(uniforms == 0);
1404 }
1405
1406 c->prog_data.curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1407
1408 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1409 foreach_list(node, &this->instructions) {
1410 fs_inst *inst = (fs_inst *)node;
1411
1412 for (unsigned int i = 0; i < 3; i++) {
1413 if (inst->src[i].file == UNIFORM) {
1414 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1415 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1416 constant_nr / 8,
1417 constant_nr % 8);
1418
1419 inst->src[i].file = HW_REG;
1420 inst->src[i].fixed_hw_reg = byte_offset(
1421 retype(brw_reg, inst->src[i].type),
1422 inst->src[i].subreg_offset);
1423 }
1424 }
1425 }
1426 }
1427
1428 void
1429 fs_visitor::calculate_urb_setup()
1430 {
1431 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1432 c->prog_data.urb_setup[i] = -1;
1433 }
1434
1435 int urb_next = 0;
1436 /* Figure out where each of the incoming setup attributes lands. */
1437 if (brw->gen >= 6) {
1438 if (_mesa_bitcount_64(fp->Base.InputsRead &
1439 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1440 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1441 * first 16 varying inputs, so we can put them wherever we want.
1442 * Just put them in order.
1443 *
1444 * This is useful because it means that (a) inputs not used by the
1445 * fragment shader won't take up valuable register space, and (b) we
1446 * won't have to recompile the fragment shader if it gets paired with
1447 * a different vertex (or geometry) shader.
1448 */
1449 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1450 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1451 BITFIELD64_BIT(i)) {
1452 c->prog_data.urb_setup[i] = urb_next++;
1453 }
1454 }
1455 } else {
1456 /* We have enough input varyings that the SF/SBE pipeline stage can't
1457 * arbitrarily rearrange them to suit our whim; we have to put them
1458 * in an order that matches the output of the previous pipeline stage
1459 * (geometry or vertex shader).
1460 */
1461 struct brw_vue_map prev_stage_vue_map;
1462 brw_compute_vue_map(brw, &prev_stage_vue_map,
1463 c->key.input_slots_valid);
1464 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1465 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1466 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1467 slot++) {
1468 int varying = prev_stage_vue_map.slot_to_varying[slot];
1469 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1470 * unused.
1471 */
1472 if (varying != BRW_VARYING_SLOT_COUNT &&
1473 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1474 BITFIELD64_BIT(varying))) {
1475 c->prog_data.urb_setup[varying] = slot - first_slot;
1476 }
1477 }
1478 urb_next = prev_stage_vue_map.num_slots - first_slot;
1479 }
1480 } else {
1481 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1482 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1483 /* Point size is packed into the header, not as a general attribute */
1484 if (i == VARYING_SLOT_PSIZ)
1485 continue;
1486
1487 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1488 /* The back color slot is skipped when the front color is
1489 * also written to. In addition, some slots can be
1490 * written in the vertex shader and not read in the
1491 * fragment shader. So the register number must always be
1492 * incremented, mapped or not.
1493 */
1494 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1495 c->prog_data.urb_setup[i] = urb_next;
1496 urb_next++;
1497 }
1498 }
1499
1500 /*
1501     * It's an FS-only attribute, and we did interpolation for this attribute
1502     * in the SF thread.  So, count it here, too.
1503 *
1504 * See compile_sf_prog() for more info.
1505 */
1506 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1507 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1508 }
1509
1510 c->prog_data.num_varying_inputs = urb_next;
1511 }
1512
1513 void
1514 fs_visitor::assign_urb_setup()
1515 {
1516 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1517
1518 /* Offset all the urb_setup[] index by the actual position of the
1519 * setup regs, now that the location of the constants has been chosen.
1520 */
1521 foreach_list(node, &this->instructions) {
1522 fs_inst *inst = (fs_inst *)node;
1523
1524 if (inst->opcode == FS_OPCODE_LINTERP) {
1525 assert(inst->src[2].file == HW_REG);
1526 inst->src[2].fixed_hw_reg.nr += urb_start;
1527 }
1528
1529 if (inst->opcode == FS_OPCODE_CINTERP) {
1530 assert(inst->src[0].file == HW_REG);
1531 inst->src[0].fixed_hw_reg.nr += urb_start;
1532 }
1533 }
1534
1535 /* Each attribute is 4 setup channels, each of which is half a reg. */
1536 this->first_non_payload_grf =
1537 urb_start + c->prog_data.num_varying_inputs * 2;
1538 }
1539
1540 /**
1541 * Split large virtual GRFs into separate components if we can.
1542 *
1543  * This mostly duplicates what brw_fs_vector_splitting does,
1544 * but that's really conservative because it's afraid of doing
1545 * splitting that doesn't result in real progress after the rest of
1546 * the optimization phases, which would cause infinite looping in
1547 * optimization. We can do it once here, safely. This also has the
1548 * opportunity to split interpolated values, or maybe even uniforms,
1549 * which we don't have at the IR level.
1550 *
1551 * We want to split, because virtual GRFs are what we register
1552 * allocate and spill (due to contiguousness requirements for some
1553 * instructions), and they're what we naturally generate in the
1554 * codegen process, but most virtual GRFs don't actually need to be
1555 * contiguous sets of GRFs. If we split, we'll end up with reduced
1556 * live intervals and better dead code elimination and coalescing.
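 *
 * As an illustration (made-up numbering): a size-4 virtual GRF that passes
 * the checks below becomes four size-1 virtual GRFs.  Accesses at
 * reg_offset 0 keep the original register number, while accesses at
 * reg_offset 1..3 are redirected to three newly allocated, consecutively
 * numbered registers, each with reg_offset 0.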
1557 */
1558 void
1559 fs_visitor::split_virtual_grfs()
1560 {
1561 int num_vars = this->virtual_grf_count;
1562 bool split_grf[num_vars];
1563 int new_virtual_grf[num_vars];
1564
1565 /* Try to split anything > 0 sized. */
1566 for (int i = 0; i < num_vars; i++) {
1567 if (this->virtual_grf_sizes[i] != 1)
1568 split_grf[i] = true;
1569 else
1570 split_grf[i] = false;
1571 }
1572
1573 if (brw->has_pln &&
1574 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1575 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1576 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1577 * Gen6, that was the only supported interpolation mode, and since Gen6,
1578 * delta_x and delta_y are in fixed hardware registers.
1579 */
1580 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1581 false;
1582 }
1583
1584 foreach_list(node, &this->instructions) {
1585 fs_inst *inst = (fs_inst *)node;
1586
1587 /* If there's a SEND message that requires contiguous destination
1588 * registers, no splitting is allowed.
1589 */
1590 if (inst->regs_written > 1) {
1591 split_grf[inst->dst.reg] = false;
1592 }
1593
1594 /* If we're sending from a GRF, don't split it, on the assumption that
1595 * the send is reading the whole thing.
1596 */
1597 if (inst->is_send_from_grf()) {
1598 for (int i = 0; i < 3; i++) {
1599 if (inst->src[i].file == GRF) {
1600 split_grf[inst->src[i].reg] = false;
1601 }
1602 }
1603 }
1604 }
1605
1606 /* Allocate new space for split regs. Note that the virtual
1607 * numbers will be contiguous.
1608 */
1609 for (int i = 0; i < num_vars; i++) {
1610 if (split_grf[i]) {
1611 new_virtual_grf[i] = virtual_grf_alloc(1);
1612 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1613 int reg = virtual_grf_alloc(1);
1614 assert(reg == new_virtual_grf[i] + j - 1);
1615 (void) reg;
1616 }
1617 this->virtual_grf_sizes[i] = 1;
1618 }
1619 }
1620
1621 foreach_list(node, &this->instructions) {
1622 fs_inst *inst = (fs_inst *)node;
1623
1624 if (inst->dst.file == GRF &&
1625 split_grf[inst->dst.reg] &&
1626 inst->dst.reg_offset != 0) {
1627 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1628 inst->dst.reg_offset - 1);
1629 inst->dst.reg_offset = 0;
1630 }
1631 for (int i = 0; i < 3; i++) {
1632 if (inst->src[i].file == GRF &&
1633 split_grf[inst->src[i].reg] &&
1634 inst->src[i].reg_offset != 0) {
1635 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1636 inst->src[i].reg_offset - 1);
1637 inst->src[i].reg_offset = 0;
1638 }
1639 }
1640 }
1641 invalidate_live_intervals();
1642 }
1643
1644 /**
1645 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1646 *
1647 * During code generation, we create tons of temporary variables, many of
1648 * which get immediately killed and are never used again. Yet, in later
1649 * optimization and analysis passes, such as compute_live_intervals, we need
1650 * to loop over all the virtual GRFs. Compacting them can save a lot of
1651 * overhead.
1652 */
1653 void
1654 fs_visitor::compact_virtual_grfs()
1655 {
1656 /* Mark which virtual GRFs are used, and count how many. */
1657 int remap_table[this->virtual_grf_count];
1658 memset(remap_table, -1, sizeof(remap_table));
1659
1660 foreach_list(node, &this->instructions) {
1661 const fs_inst *inst = (const fs_inst *) node;
1662
1663 if (inst->dst.file == GRF)
1664 remap_table[inst->dst.reg] = 0;
1665
1666 for (int i = 0; i < 3; i++) {
1667 if (inst->src[i].file == GRF)
1668 remap_table[inst->src[i].reg] = 0;
1669 }
1670 }
1671
1672 /* In addition to registers used in instructions, fs_visitor keeps
1673 * direct references to certain special values which must be patched:
1674 */
1675 fs_reg *special[] = {
1676 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1677 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1678 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1679 &delta_x[0], &delta_x[1], &delta_x[2],
1680 &delta_x[3], &delta_x[4], &delta_x[5],
1681 &delta_y[0], &delta_y[1], &delta_y[2],
1682 &delta_y[3], &delta_y[4], &delta_y[5],
1683 };
1684 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1685 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1686
1687 /* Treat all special values as used, to be conservative */
1688 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1689 if (special[i]->file == GRF)
1690 remap_table[special[i]->reg] = 0;
1691 }
1692
1693 /* Compact the GRF arrays. */
1694 int new_index = 0;
1695 for (int i = 0; i < this->virtual_grf_count; i++) {
1696 if (remap_table[i] != -1) {
1697 remap_table[i] = new_index;
1698 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1699 invalidate_live_intervals();
1700 ++new_index;
1701 }
1702 }
1703
1704 this->virtual_grf_count = new_index;
1705
1706 /* Patch all the instructions to use the newly renumbered registers */
1707 foreach_list(node, &this->instructions) {
1708 fs_inst *inst = (fs_inst *) node;
1709
1710 if (inst->dst.file == GRF)
1711 inst->dst.reg = remap_table[inst->dst.reg];
1712
1713 for (int i = 0; i < 3; i++) {
1714 if (inst->src[i].file == GRF)
1715 inst->src[i].reg = remap_table[inst->src[i].reg];
1716 }
1717 }
1718
1719 /* Patch all the references to special values */
1720 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1721 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1722 special[i]->reg = remap_table[special[i]->reg];
1723 }
1724 }
1725
1726 bool
1727 fs_visitor::remove_dead_constants()
1728 {
1729 if (dispatch_width == 8) {
1730 this->params_remap = ralloc_array(mem_ctx, int, uniforms);
1731 this->nr_params_remap = uniforms;
1732
1733 for (unsigned int i = 0; i < uniforms; i++)
1734 this->params_remap[i] = -1;
1735
1736 /* Find which params are still in use. */
1737 foreach_list(node, &this->instructions) {
1738 fs_inst *inst = (fs_inst *)node;
1739
1740 for (int i = 0; i < 3; i++) {
1741 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1742
1743 if (inst->src[i].file != UNIFORM)
1744 continue;
1745
1746 /* Section 5.11 of the OpenGL 4.3 spec says:
1747 *
1748 * "Out-of-bounds reads return undefined values, which include
1749 * values from other variables of the active program or zero."
1750 */
1751 if (constant_nr < 0 || constant_nr >= (int)uniforms) {
1752 constant_nr = 0;
1753 }
1754
1755 /* For now, set this to non-negative. We'll give it the
1756 * actual new number in a moment, in order to keep the
1757 * register numbers nicely ordered.
1758 */
1759 this->params_remap[constant_nr] = 0;
1760 }
1761 }
1762
1763 /* Figure out what the new numbers for the params will be. At some
1764 * point when we're doing uniform array access, we're going to want
1765 * to keep the distinction between .reg and .reg_offset, but for
1766 * now we don't care.
1767 */
1768 unsigned int new_nr_params = 0;
1769 for (unsigned int i = 0; i < uniforms; i++) {
1770 if (this->params_remap[i] != -1) {
1771 this->params_remap[i] = new_nr_params++;
1772 }
1773 }
1774
1775 /* Update the list of params to be uploaded to match our new numbering. */
1776 for (unsigned int i = 0; i < uniforms; i++) {
1777 int remapped = this->params_remap[i];
1778
1779 if (remapped == -1)
1780 continue;
1781
1782 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1783 }
1784
1785 uniforms = new_nr_params;
1786 } else {
1787 /* This should have been generated in the SIMD8 pass already. */
1788 assert(this->params_remap);
1789 }
1790
1791 /* Now do the renumbering of the shader to remove unused params. */
1792 foreach_list(node, &this->instructions) {
1793 fs_inst *inst = (fs_inst *)node;
1794
1795 for (int i = 0; i < 3; i++) {
1796 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1797
1798 if (inst->src[i].file != UNIFORM)
1799 continue;
1800
1801          /* As above, alias out-of-bounds reads to constant 0. */
1802 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1803 constant_nr = 0;
1804 }
1805 assert(this->params_remap[constant_nr] != -1);
1806 inst->src[i].reg = this->params_remap[constant_nr];
1807 inst->src[i].reg_offset = 0;
1808 }
1809 }
1810
1811 return true;
1812 }
1813
1814 /*
1815 * Implements array access of uniforms by inserting a
1816 * PULL_CONSTANT_LOAD instruction.
1817 *
1818 * Unlike temporary GRF array access (where we don't support it due to
1819 * the difficulty of doing relative addressing on instruction
1820 * destinations), we could potentially do array access of uniforms
1821 * that were loaded in GRF space as push constants. In real-world
1822 * usage we've seen, though, the arrays being used are always larger
1823 * than we could load as push constants, so just always move all
1824 * uniform array access out to a pull constant buffer.
1825 */
1826 void
1827 fs_visitor::move_uniform_array_access_to_pull_constants()
1828 {
1829 int pull_constant_loc[uniforms];
1830
1831 for (unsigned int i = 0; i < uniforms; i++) {
1832 pull_constant_loc[i] = -1;
1833 }
1834
1835 /* Walk through and find array access of uniforms. Put a copy of that
1836 * uniform in the pull constant buffer.
1837 *
1838 * Note that we don't move constant-indexed accesses to arrays. No
1839 * testing has been done of the performance impact of this choice.
1840 */
1841 foreach_list_safe(node, &this->instructions) {
1842 fs_inst *inst = (fs_inst *)node;
1843
1844 for (int i = 0 ; i < 3; i++) {
1845 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1846 continue;
1847
1848 int uniform = inst->src[i].reg;
1849
1850 /* If this array isn't already present in the pull constant buffer,
1851 * add it.
1852 */
1853 if (pull_constant_loc[uniform] == -1) {
1854 const float **values = &stage_prog_data->param[uniform];
1855
1856 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params;
1857
1858 assert(param_size[uniform]);
1859
1860 for (int j = 0; j < param_size[uniform]; j++) {
1861 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1862 values[j];
1863 }
1864 }
1865
1866 /* Set up the annotation tracking for new generated instructions. */
1867 base_ir = inst->ir;
1868 current_annotation = inst->annotation;
1869
1870 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1871 fs_reg temp = fs_reg(this, glsl_type::float_type);
1872 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1873 surf_index,
1874 *inst->src[i].reladdr,
1875 pull_constant_loc[uniform] +
1876 inst->src[i].reg_offset);
1877 inst->insert_before(&list);
1878
1879 inst->src[i].file = temp.file;
1880 inst->src[i].reg = temp.reg;
1881 inst->src[i].reg_offset = temp.reg_offset;
1882 inst->src[i].reladdr = NULL;
1883 }
1884 }
1885 }
1886
1887 /**
1888 * Choose accesses from the UNIFORM file to demote to using the pull
1889 * constant buffer.
1890 *
1891 * We allow a fragment shader to have more than the specified minimum
1892 * maximum number of fragment shader uniform components (64). If
1893 * there are too many of these, they'd fill up all of register space.
1894 * So, this will push some of them out to the pull constant buffer and
1895 * update the program to load them.
1896 */
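/* Worked example (a sketch; the uniform counts are hypothetical): with
 * max_uniform_components = 128 and 160 uniform components in use, components
 * 0..127 stay as push constants and 128..159 are appended to pull_param.  A
 * read of uniform 130 (pull_index 2, assuming no reladdr uploads claimed
 * earlier pull slots) becomes an FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD of the
 * 16-byte block at byte offset (2 * 4) & ~15 = 0, with the source smeared to
 * component 2 & 3 = 2 of the loaded vec4.
 */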
1897 void
1898 fs_visitor::setup_pull_constants()
1899 {
1900 /* Only allow 16 registers (128 uniform components) as push constants. */
1901 unsigned int max_uniform_components = 16 * 8;
1902 if (uniforms <= max_uniform_components)
1903 return;
1904
1905 if (dispatch_width == 16) {
1906 fail("Pull constants not supported in SIMD16\n");
1907 return;
1908 }
1909
1910 /* Just demote the end of the list. We could probably do better
1911 * here, demoting things that are rarely used in the program first.
1912 */
1913 unsigned int pull_uniform_base = max_uniform_components;
1914
1915 int pull_constant_loc[uniforms];
1916 for (unsigned int i = 0; i < uniforms; i++) {
1917 if (i < pull_uniform_base) {
1918 pull_constant_loc[i] = -1;
1919 } else {
1920 pull_constant_loc[i] = -1;
1921 /* If our constant is already being uploaded for reladdr purposes,
1922 * reuse it.
1923 */
1924 for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j++) {
1925 if (stage_prog_data->pull_param[j] == stage_prog_data->param[i]) {
1926 pull_constant_loc[i] = j;
1927 break;
1928 }
1929 }
1930 if (pull_constant_loc[i] == -1) {
1931 int pull_index = stage_prog_data->nr_pull_params++;
1932 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1933 pull_constant_loc[i] = pull_index;
1934 }
1935 }
1936 }
1937 uniforms = pull_uniform_base;
1938
1939 foreach_list(node, &this->instructions) {
1940 fs_inst *inst = (fs_inst *)node;
1941
1942 for (int i = 0; i < 3; i++) {
1943 if (inst->src[i].file != UNIFORM)
1944 continue;
1945
1946 int pull_index = pull_constant_loc[inst->src[i].reg +
1947 inst->src[i].reg_offset];
1948 if (pull_index == -1)
1949 continue;
1950
1951 assert(!inst->src[i].reladdr);
1952
1953 fs_reg dst = fs_reg(this, glsl_type::float_type);
1954 fs_reg index(stage_prog_data->binding_table.pull_constants_start);
1955 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1956 fs_inst *pull =
1957 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1958 dst, index, offset);
1959 pull->ir = inst->ir;
1960 pull->annotation = inst->annotation;
1961
1962 inst->insert_before(pull);
1963
1964 inst->src[i].file = GRF;
1965 inst->src[i].reg = dst.reg;
1966 inst->src[i].reg_offset = 0;
1967 inst->src[i].set_smear(pull_index & 3);
1968 }
1969 }
1970 }
1971
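/**
 * Performs simple algebraic simplifications on the IR: "mul dst, src, 1.0"
 * and "add dst, src, 0.0" become plain MOVs, "mul dst, src, 0.0" becomes a
 * MOV of 0.0, an OR of a register with itself becomes a MOV, an LRP whose
 * two interpolants are equal becomes a MOV of that value, and a saturating
 * SEL against an immediate that cannot change the saturated result is
 * reduced to a MOV.
 */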
1972 bool
1973 fs_visitor::opt_algebraic()
1974 {
1975 bool progress = false;
1976
1977 foreach_list(node, &this->instructions) {
1978 fs_inst *inst = (fs_inst *)node;
1979
1980 switch (inst->opcode) {
1981 case BRW_OPCODE_MUL:
1982 if (inst->src[1].file != IMM)
1983 continue;
1984
1985 /* a * 1.0 = a */
1986 if (inst->src[1].is_one()) {
1987 inst->opcode = BRW_OPCODE_MOV;
1988 inst->src[1] = reg_undef;
1989 progress = true;
1990 break;
1991 }
1992
1993 /* a * 0.0 = 0.0 */
1994 if (inst->src[1].is_zero()) {
1995 inst->opcode = BRW_OPCODE_MOV;
1996 inst->src[0] = inst->src[1];
1997 inst->src[1] = reg_undef;
1998 progress = true;
1999 break;
2000 }
2001
2002 break;
2003 case BRW_OPCODE_ADD:
2004 if (inst->src[1].file != IMM)
2005 continue;
2006
2007 /* a + 0.0 = a */
2008 if (inst->src[1].is_zero()) {
2009 inst->opcode = BRW_OPCODE_MOV;
2010 inst->src[1] = reg_undef;
2011 progress = true;
2012 break;
2013 }
2014 break;
2015 case BRW_OPCODE_OR:
2016 if (inst->src[0].equals(inst->src[1])) {
2017 inst->opcode = BRW_OPCODE_MOV;
2018 inst->src[1] = reg_undef;
2019 progress = true;
2020 break;
2021 }
2022 break;
2023 case BRW_OPCODE_LRP:
2024 if (inst->src[1].equals(inst->src[2])) {
2025 inst->opcode = BRW_OPCODE_MOV;
2026 inst->src[0] = inst->src[1];
2027 inst->src[1] = reg_undef;
2028 inst->src[2] = reg_undef;
2029 progress = true;
2030 break;
2031 }
2032 break;
2033 case BRW_OPCODE_SEL:
2034 if (inst->saturate && inst->src[1].file == IMM) {
2035 switch (inst->conditional_mod) {
2036 case BRW_CONDITIONAL_LE:
2037 case BRW_CONDITIONAL_L:
2038 switch (inst->src[1].type) {
2039 case BRW_REGISTER_TYPE_F:
2040 if (inst->src[1].imm.f >= 1.0f) {
2041 inst->opcode = BRW_OPCODE_MOV;
2042 inst->src[1] = reg_undef;
2043 progress = true;
2044 }
2045 break;
2046 default:
2047 break;
2048 }
2049 break;
2050 case BRW_CONDITIONAL_GE:
2051 case BRW_CONDITIONAL_G:
2052 switch (inst->src[1].type) {
2053 case BRW_REGISTER_TYPE_F:
2054 if (inst->src[1].imm.f <= 0.0f) {
2055 inst->opcode = BRW_OPCODE_MOV;
2056 inst->src[1] = reg_undef;
2057 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2058 progress = true;
2059 }
2060 break;
2061 default:
2062 break;
2063 }
2064 default:
2065 break;
2066 }
2067 }
2068 break;
2069 default:
2070 break;
2071 }
2072 }
2073
2074 return progress;
2075 }
2076
2077 /**
2078 * Removes any instructions writing a VGRF where that VGRF is not used by any
2079 * later instruction.
2080 */
2081 bool
2082 fs_visitor::dead_code_eliminate()
2083 {
2084 bool progress = false;
2085 int pc = 0;
2086
2087 calculate_live_intervals();
2088
2089 foreach_list_safe(node, &this->instructions) {
2090 fs_inst *inst = (fs_inst *)node;
2091
2092 if (inst->dst.file == GRF && !inst->has_side_effects()) {
2093 bool dead = true;
2094
2095 for (int i = 0; i < inst->regs_written; i++) {
2096 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2097 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2098 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2099 dead = false;
2100 break;
2101 }
2102 }
2103
2104 if (dead) {
2105 /* Don't dead code eliminate instructions that write to the
2106 * accumulator as a side-effect. Instead just set the destination
2107 * to the null register to free it.
2108 */
2109 switch (inst->opcode) {
2110 case BRW_OPCODE_ADDC:
2111 case BRW_OPCODE_SUBB:
2112 case BRW_OPCODE_MACH:
2113 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2114 break;
2115 default:
2116 inst->remove();
2117 progress = true;
2118 break;
2119 }
2120 }
2121 }
2122
2123 pc++;
2124 }
2125
2126 if (progress)
2127 invalidate_live_intervals();
2128
2129 return progress;
2130 }
2131
2132 struct dead_code_hash_key
2133 {
2134 int vgrf;
2135 int reg_offset;
2136 };
2137
2138 static bool
2139 dead_code_hash_compare(const void *a, const void *b)
2140 {
2141 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2142 }
2143
2144 static void
2145 clear_dead_code_hash(struct hash_table *ht)
2146 {
2147 struct hash_entry *entry;
2148
2149 hash_table_foreach(ht, entry) {
2150 _mesa_hash_table_remove(ht, entry);
2151 }
2152 }
2153
2154 static void
2155 insert_dead_code_hash(struct hash_table *ht,
2156 int vgrf, int reg_offset, fs_inst *inst)
2157 {
2158 /* We don't bother freeing keys, because they'll be GCed with the ht. */
2159 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2160
2161 key->vgrf = vgrf;
2162 key->reg_offset = reg_offset;
2163
2164 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2165 }
2166
2167 static struct hash_entry *
2168 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2169 {
2170 struct dead_code_hash_key key;
2171
2172 key.vgrf = vgrf;
2173 key.reg_offset = reg_offset;
2174
2175 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2176 }
2177
2178 static void
2179 remove_dead_code_hash(struct hash_table *ht,
2180 int vgrf, int reg_offset)
2181 {
2182 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2183 if (!entry)
2184 return;
2185
2186 _mesa_hash_table_remove(ht, entry);
2187 }
2188
2189 /**
2190 * Walks basic blocks, removing any regs that are written but not read before
2191 * being redefined.
2192 *
2193 * The dead_code_eliminate() function implements a global dead code
2194 * elimination, but it only handles removing the last write to a register
2195 * if it's never read. This one can handle intermediate writes, but only
2196 * within a basic block.
2197 */
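/* Illustrative example (vgrf numbers are hypothetical): given, within a
 * single basic block,
 *
 *    mov vgrf3:F, vgrf1:F
 *    mov vgrf3:F, vgrf2:F
 *    mul vgrf4:F, vgrf3:F, vgrf3:F
 *
 * the first MOV is removed, because vgrf3 is completely overwritten by the
 * second MOV before anything reads it.
 */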
2198 bool
2199 fs_visitor::dead_code_eliminate_local()
2200 {
2201 struct hash_table *ht;
2202 bool progress = false;
2203
2204 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2205
2206 if (ht == NULL) {
2207 return false;
2208 }
2209
2210 foreach_list_safe(node, &this->instructions) {
2211 fs_inst *inst = (fs_inst *)node;
2212
2213 /* At a basic block boundary, empty the HT since we don't understand dataflow
2214 * here.
2215 */
2216 if (inst->is_control_flow()) {
2217 clear_dead_code_hash(ht);
2218 continue;
2219 }
2220
2221 /* Clear the HT of any instructions that got read. */
2222 for (int i = 0; i < 3; i++) {
2223 fs_reg src = inst->src[i];
2224 if (src.file != GRF)
2225 continue;
2226
2227 int read = 1;
2228 if (inst->is_send_from_grf())
2229 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2230
2231 for (int reg_offset = src.reg_offset;
2232 reg_offset < src.reg_offset + read;
2233 reg_offset++) {
2234 remove_dead_code_hash(ht, src.reg, reg_offset);
2235 }
2236 }
2237
2238 /* Add any update of a GRF to the HT, removing a previous write if it
2239 * wasn't read.
2240 */
2241 if (inst->dst.file == GRF) {
2242 if (inst->regs_written > 1) {
2243 /* We don't know how to trim channels from an instruction's
2244 * writes, so we can't incrementally remove unread channels from
2245 * it. Just remove whatever it overwrites from the table.
2246 */
2247 for (int i = 0; i < inst->regs_written; i++) {
2248 remove_dead_code_hash(ht,
2249 inst->dst.reg,
2250 inst->dst.reg_offset + i);
2251 }
2252 } else {
2253 struct hash_entry *entry =
2254 get_dead_code_hash_entry(ht, inst->dst.reg,
2255 inst->dst.reg_offset);
2256
2257 if (entry) {
2258 if (inst->is_partial_write()) {
2259 /* For a partial write, we can't remove any previous dead code
2260 * candidate, since we're just modifying their result.
2261 */
2262 } else {
2263 /* We're completely updating a channel, and there was a
2264 * previous write to the channel that wasn't read. Kill it!
2265 */
2266 fs_inst *inst = (fs_inst *)entry->data;
2267 inst->remove();
2268 progress = true;
2269 }
2270
2271 _mesa_hash_table_remove(ht, entry);
2272 }
2273
2274 if (!inst->has_side_effects())
2275 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2276 inst);
2277 }
2278 }
2279 }
2280
2281 _mesa_hash_table_destroy(ht, NULL);
2282
2283 if (progress)
2284 invalidate_live_intervals();
2285
2286 return progress;
2287 }
2288
2289 /**
2290 * Implements register coalescing: Checks if the two registers involved in a
2291 * raw move don't interfere, in which case they can both be stored in the same
2292 * place and the MOV removed.
2293 *
2294 * To do this, all uses of the source of the MOV in the shader are replaced
2295 * with the destination of the MOV. For example:
2296 *
2297 * add vgrf3:F, vgrf1:F, vgrf2:F
2298 * mov vgrf4:F, vgrf3:F
2299 * mul vgrf5:F, vgrf5:F, vgrf4:F
2300 *
2301 * becomes
2302 *
2303 * add vgrf4:F, vgrf1:F, vgrf2:F
2304 * mul vgrf5:F, vgrf5:F, vgrf4:F
2305 */
2306 bool
2307 fs_visitor::register_coalesce()
2308 {
2309 bool progress = false;
2310
2311 calculate_live_intervals();
2312
2313 int src_size = 0;
2314 int channels_remaining = 0;
2315 int reg_from = -1, reg_to = -1;
2316 int reg_to_offset[MAX_SAMPLER_MESSAGE_SIZE];
2317 fs_inst *mov[MAX_SAMPLER_MESSAGE_SIZE];
2318
2319 foreach_list(node, &this->instructions) {
2320 fs_inst *inst = (fs_inst *)node;
2321
2322 if (inst->opcode != BRW_OPCODE_MOV ||
2323 inst->is_partial_write() ||
2324 inst->saturate ||
2325 inst->src[0].file != GRF ||
2326 inst->src[0].negate ||
2327 inst->src[0].abs ||
2328 !inst->src[0].is_contiguous() ||
2329 inst->dst.file != GRF ||
2330 inst->dst.type != inst->src[0].type) {
2331 continue;
2332 }
2333
2334 if (virtual_grf_sizes[inst->src[0].reg] >
2335 virtual_grf_sizes[inst->dst.reg])
2336 continue;
2337
2338 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2339 int var_to = live_intervals->var_from_reg(&inst->dst);
2340
2341 if (live_intervals->vars_interfere(var_from, var_to) &&
2342 !inst->dst.equals(inst->src[0])) {
2343
2344 /* We know that the live ranges of A (var_from) and B (var_to)
2345 * interfere because of the ->vars_interfere() call above. If the end
2346 * of B's live range is after the end of A's range, then we know two
2347 * things:
2348 * - the start of B's live range must be in A's live range (since we
2349 * already know the two ranges interfere, this is the only remaining
2350 * possibility)
2351 * - the interference isn't of the form we're looking for (where B is
2352 * entirely inside A)
2353 */
2354 if (live_intervals->end[var_to] > live_intervals->end[var_from])
2355 continue;
2356
2357 bool overwritten = false;
2358 int scan_ip = -1;
2359
2360 foreach_list(n, &this->instructions) {
2361 fs_inst *scan_inst = (fs_inst *)n;
2362 scan_ip++;
2363
2364 if (scan_inst->is_control_flow()) {
2365 overwritten = true;
2366 break;
2367 }
2368
2369 if (scan_ip <= live_intervals->start[var_to])
2370 continue;
2371
2372 if (scan_ip > live_intervals->end[var_to])
2373 break;
2374
2375 if (scan_inst->dst.equals(inst->dst) ||
2376 scan_inst->dst.equals(inst->src[0])) {
2377 overwritten = true;
2378 break;
2379 }
2380 }
2381
2382 if (overwritten)
2383 continue;
2384 }
2385
2386 if (reg_from != inst->src[0].reg) {
2387 reg_from = inst->src[0].reg;
2388
2389 src_size = virtual_grf_sizes[inst->src[0].reg];
2390 assert(src_size <= MAX_SAMPLER_MESSAGE_SIZE);
2391
2392 channels_remaining = src_size;
2393 memset(mov, 0, sizeof(mov));
2394
2395 reg_to = inst->dst.reg;
2396 }
2397
2398 if (reg_to != inst->dst.reg)
2399 continue;
2400
2401 const int offset = inst->src[0].reg_offset;
2402 reg_to_offset[offset] = inst->dst.reg_offset;
2403 mov[offset] = inst;
2404 channels_remaining--;
2405
2406 if (channels_remaining)
2407 continue;
2408
2409 bool removed = false;
2410 for (int i = 0; i < src_size; i++) {
2411 if (mov[i]) {
2412 removed = true;
2413
2414 mov[i]->opcode = BRW_OPCODE_NOP;
2415 mov[i]->conditional_mod = BRW_CONDITIONAL_NONE;
2416 mov[i]->dst = reg_undef;
2417 mov[i]->src[0] = reg_undef;
2418 mov[i]->src[1] = reg_undef;
2419 mov[i]->src[2] = reg_undef;
2420 }
2421 }
2422
2423 foreach_list(node, &this->instructions) {
2424 fs_inst *scan_inst = (fs_inst *)node;
2425
2426 for (int i = 0; i < src_size; i++) {
2427 if (mov[i]) {
2428 if (scan_inst->dst.file == GRF &&
2429 scan_inst->dst.reg == reg_from &&
2430 scan_inst->dst.reg_offset == i) {
2431 scan_inst->dst.reg = reg_to;
2432 scan_inst->dst.reg_offset = reg_to_offset[i];
2433 }
2434 for (int j = 0; j < 3; j++) {
2435 if (scan_inst->src[j].file == GRF &&
2436 scan_inst->src[j].reg == reg_from &&
2437 scan_inst->src[j].reg_offset == i) {
2438 scan_inst->src[j].reg = reg_to;
2439 scan_inst->src[j].reg_offset = reg_to_offset[i];
2440 }
2441 }
2442 }
2443 }
2444 }
2445
2446 if (removed) {
2447 live_intervals->start[var_to] = MIN2(live_intervals->start[var_to],
2448 live_intervals->start[var_from]);
2449 live_intervals->end[var_to] = MAX2(live_intervals->end[var_to],
2450 live_intervals->end[var_from]);
2451 reg_from = -1;
2452 }
2453 }
2454
2455 foreach_list_safe(node, &this->instructions) {
2456 fs_inst *inst = (fs_inst *)node;
2457
2458 if (inst->opcode == BRW_OPCODE_NOP) {
2459 inst->remove();
2460 progress = true;
2461 }
2462 }
2463
2464 if (progress)
2465 invalidate_live_intervals();
2466
2467 return progress;
2468 }
2469
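/**
 * Tries to rewrite the single instruction that computed a GRF value so that
 * it writes directly into the MRF a later raw MOV copies it to, then removes
 * that MOV.  Illustrative example (register numbers are hypothetical):
 *
 *    add vgrf3:F, vgrf1:F, vgrf2:F
 *    mov m4:F, vgrf3:F
 *
 * becomes
 *
 *    add m4:F, vgrf1:F, vgrf2:F
 *
 * provided vgrf3 is not read later and nothing in between touches the MRF or
 * reads the GRF.
 */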
2470 bool
2471 fs_visitor::compute_to_mrf()
2472 {
2473 bool progress = false;
2474 int next_ip = 0;
2475
2476 calculate_live_intervals();
2477
2478 foreach_list_safe(node, &this->instructions) {
2479 fs_inst *inst = (fs_inst *)node;
2480
2481 int ip = next_ip;
2482 next_ip++;
2483
2484 if (inst->opcode != BRW_OPCODE_MOV ||
2485 inst->is_partial_write() ||
2486 inst->dst.file != MRF || inst->src[0].file != GRF ||
2487 inst->dst.type != inst->src[0].type ||
2488 inst->src[0].abs || inst->src[0].negate ||
2489 !inst->src[0].is_contiguous() ||
2490 inst->src[0].subreg_offset)
2491 continue;
2492
2493 /* Work out which hardware MRF registers are written by this
2494 * instruction.
2495 */
2496 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2497 int mrf_high;
2498 if (inst->dst.reg & BRW_MRF_COMPR4) {
2499 mrf_high = mrf_low + 4;
2500 } else if (dispatch_width == 16 &&
2501 (!inst->force_uncompressed && !inst->force_sechalf)) {
2502 mrf_high = mrf_low + 1;
2503 } else {
2504 mrf_high = mrf_low;
2505 }
2506
2507 /* Can't compute-to-MRF this GRF if someone else was going to
2508 * read it later.
2509 */
2510 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2511 continue;
2512
2513 /* Found a move of a GRF to a MRF. Let's see if we can go
2514 * rewrite the thing that made this GRF to write into the MRF.
2515 */
2516 fs_inst *scan_inst;
2517 for (scan_inst = (fs_inst *)inst->prev;
2518 scan_inst->prev != NULL;
2519 scan_inst = (fs_inst *)scan_inst->prev) {
2520 if (scan_inst->dst.file == GRF &&
2521 scan_inst->dst.reg == inst->src[0].reg) {
2522 /* Found the last thing to write our reg we want to turn
2523 * into a compute-to-MRF.
2524 */
2525
2526 /* If this one instruction didn't populate all the
2527 * channels, bail. We might be able to rewrite everything
2528 * that writes that reg, but it would require smarter
2529 * tracking to delay the rewriting until complete success.
2530 */
2531 if (scan_inst->is_partial_write())
2532 break;
2533
2534 /* Things returning more than one register would need us to
2535 * understand coalescing out more than one MOV at a time.
2536 */
2537 if (scan_inst->regs_written > 1)
2538 break;
2539
2540 /* SEND instructions can't have MRF as a destination. */
2541 if (scan_inst->mlen)
2542 break;
2543
2544 if (brw->gen == 6) {
2545 /* gen6 math instructions must have the destination be
2546 * GRF, so no compute-to-MRF for them.
2547 */
2548 if (scan_inst->is_math()) {
2549 break;
2550 }
2551 }
2552
2553 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2554 /* Found the creator of our MRF's source value. */
2555 scan_inst->dst.file = MRF;
2556 scan_inst->dst.reg = inst->dst.reg;
2557 scan_inst->saturate |= inst->saturate;
2558 inst->remove();
2559 progress = true;
2560 }
2561 break;
2562 }
2563
2564 /* We don't handle control flow here. Most computation of
2565 * values that end up in MRFs happens shortly before the MRF
2566 * write anyway.
2567 */
2568 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2569 break;
2570
2571 /* You can't read from an MRF, so if someone else reads our
2572 * MRF's source GRF that we wanted to rewrite, that stops us.
2573 */
2574 bool interfered = false;
2575 for (int i = 0; i < 3; i++) {
2576 if (scan_inst->src[i].file == GRF &&
2577 scan_inst->src[i].reg == inst->src[0].reg &&
2578 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2579 interfered = true;
2580 }
2581 }
2582 if (interfered)
2583 break;
2584
2585 if (scan_inst->dst.file == MRF) {
2586 /* If somebody else writes our MRF here, we can't
2587 * compute-to-MRF before that.
2588 */
2589 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2590 int scan_mrf_high;
2591
2592 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2593 scan_mrf_high = scan_mrf_low + 4;
2594 } else if (dispatch_width == 16 &&
2595 (!scan_inst->force_uncompressed &&
2596 !scan_inst->force_sechalf)) {
2597 scan_mrf_high = scan_mrf_low + 1;
2598 } else {
2599 scan_mrf_high = scan_mrf_low;
2600 }
2601
2602 if (mrf_low == scan_mrf_low ||
2603 mrf_low == scan_mrf_high ||
2604 mrf_high == scan_mrf_low ||
2605 mrf_high == scan_mrf_high) {
2606 break;
2607 }
2608 }
2609
2610 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2611 /* Found a SEND instruction, which means that there are
2612 * live values in MRFs from base_mrf to base_mrf +
2613 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2614 * above it.
2615 */
2616 if (mrf_low >= scan_inst->base_mrf &&
2617 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2618 break;
2619 }
2620 if (mrf_high >= scan_inst->base_mrf &&
2621 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2622 break;
2623 }
2624 }
2625 }
2626 }
2627
2628 if (progress)
2629 invalidate_live_intervals();
2630
2631 return progress;
2632 }
2633
2634 /**
2635 * Walks through basic blocks, looking for repeated MRF writes and
2636 * removing the later ones.
2637 */
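/* Illustrative example (register numbers are hypothetical): if a basic block
 * contains
 *
 *    mov m2:F, vgrf3:F
 *    ...
 *    mov m2:F, vgrf3:F
 *
 * and nothing in between writes m2 or vgrf3 (and no SEND's implied MRF writes
 * cover m2), the second MOV is redundant and is removed.
 */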
2638 bool
2639 fs_visitor::remove_duplicate_mrf_writes()
2640 {
2641 fs_inst *last_mrf_move[16];
2642 bool progress = false;
2643
2644 /* Need to update the MRF tracking for compressed instructions. */
2645 if (dispatch_width == 16)
2646 return false;
2647
2648 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2649
2650 foreach_list_safe(node, &this->instructions) {
2651 fs_inst *inst = (fs_inst *)node;
2652
2653 if (inst->is_control_flow()) {
2654 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2655 }
2656
2657 if (inst->opcode == BRW_OPCODE_MOV &&
2658 inst->dst.file == MRF) {
2659 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2660 if (prev_inst && inst->equals(prev_inst)) {
2661 inst->remove();
2662 progress = true;
2663 continue;
2664 }
2665 }
2666
2667 /* Clear out the last-write records for MRFs that were overwritten. */
2668 if (inst->dst.file == MRF) {
2669 last_mrf_move[inst->dst.reg] = NULL;
2670 }
2671
2672 if (inst->mlen > 0 && inst->base_mrf != -1) {
2673 /* Found a SEND instruction, which will include two or fewer
2674 * implied MRF writes. We could do better here.
2675 */
2676 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2677 last_mrf_move[inst->base_mrf + i] = NULL;
2678 }
2679 }
2680
2681 /* Clear out any MRF move records whose sources got overwritten. */
2682 if (inst->dst.file == GRF) {
2683 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2684 if (last_mrf_move[i] &&
2685 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2686 last_mrf_move[i] = NULL;
2687 }
2688 }
2689 }
2690
2691 if (inst->opcode == BRW_OPCODE_MOV &&
2692 inst->dst.file == MRF &&
2693 inst->src[0].file == GRF &&
2694 !inst->is_partial_write()) {
2695 last_mrf_move[inst->dst.reg] = inst;
2696 }
2697 }
2698
2699 if (progress)
2700 invalidate_live_intervals();
2701
2702 return progress;
2703 }
2704
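/**
 * Helper for the Gen4 SEND dependency workarounds below: clears the
 * needs-dependency flags in deps[] for any GRF in the range
 * [first_grf, first_grf + grf_len) that @inst reads as a source, since
 * sourcing the register is what satisfies the outstanding dependency.  For
 * compressed (SIMD16) instructions the second register of the pair is
 * cleared as well.
 */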
2705 static void
2706 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2707 int first_grf, int grf_len)
2708 {
2709 bool inst_simd16 = (dispatch_width > 8 &&
2710 !inst->force_uncompressed &&
2711 !inst->force_sechalf);
2712
2713 /* Clear the flag for registers that actually got read (as expected). */
2714 for (int i = 0; i < 3; i++) {
2715 int grf;
2716 if (inst->src[i].file == GRF) {
2717 grf = inst->src[i].reg;
2718 } else if (inst->src[i].file == HW_REG &&
2719 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2720 grf = inst->src[i].fixed_hw_reg.nr;
2721 } else {
2722 continue;
2723 }
2724
2725 if (grf >= first_grf &&
2726 grf < first_grf + grf_len) {
2727 deps[grf - first_grf] = false;
2728 if (inst_simd16)
2729 deps[grf - first_grf + 1] = false;
2730 }
2731 }
2732 }
2733
2734 /**
2735 * Implements this workaround for the original 965:
2736 *
2737 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2738 * check for post destination dependencies on this instruction, software
2739 * must ensure that there is no destination hazard for the case of ‘write
2740 * followed by a posted write’ shown in the following example.
2741 *
2742 * 1. mov r3 0
2743 * 2. send r3.xy <rest of send instruction>
2744 * 3. mov r2 r3
2745 *
2746 * Due to no post-destination dependency check on the ‘send’, the above
2747 * code sequence could have two instructions (1 and 2) in flight at the
2748 * same time that both consider ‘r3’ as the target of their final writes.
2749 */
2750 void
2751 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2752 {
2753 int reg_size = dispatch_width / 8;
2754 int write_len = inst->regs_written * reg_size;
2755 int first_write_grf = inst->dst.reg;
2756 bool needs_dep[BRW_MAX_MRF];
2757 assert(write_len < (int)sizeof(needs_dep) - 1);
2758
2759 memset(needs_dep, false, sizeof(needs_dep));
2760 memset(needs_dep, true, write_len);
2761
2762 clear_deps_for_inst_src(inst, dispatch_width,
2763 needs_dep, first_write_grf, write_len);
2764
2765 /* Walk backwards looking for writes to registers we're writing which
2766 * aren't read since being written. If we hit the start of the program,
2767 * we assume that there are no outstanding dependencies on entry to the
2768 * program.
2769 */
2770 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2771 scan_inst != NULL;
2772 scan_inst = (fs_inst *)scan_inst->prev) {
2773
2774 /* If we hit control flow, assume that there *are* outstanding
2775 * dependencies, and force their cleanup before our instruction.
2776 */
2777 if (scan_inst->is_control_flow()) {
2778 for (int i = 0; i < write_len; i++) {
2779 if (needs_dep[i]) {
2780 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2781 }
2782 }
2783 return;
2784 }
2785
2786 bool scan_inst_simd16 = (dispatch_width > 8 &&
2787 !scan_inst->force_uncompressed &&
2788 !scan_inst->force_sechalf);
2789
2790 /* We insert our reads as late as possible on the assumption that any
2791 * instruction but a MOV that might have left us an outstanding
2792 * dependency has more latency than a MOV.
2793 */
2794 if (scan_inst->dst.file == GRF) {
2795 for (int i = 0; i < scan_inst->regs_written; i++) {
2796 int reg = scan_inst->dst.reg + i * reg_size;
2797
2798 if (reg >= first_write_grf &&
2799 reg < first_write_grf + write_len &&
2800 needs_dep[reg - first_write_grf]) {
2801 inst->insert_before(DEP_RESOLVE_MOV(reg));
2802 needs_dep[reg - first_write_grf] = false;
2803 if (scan_inst_simd16)
2804 needs_dep[reg - first_write_grf + 1] = false;
2805 }
2806 }
2807 }
2808
2809 /* Clear the flag for registers that actually got read (as expected). */
2810 clear_deps_for_inst_src(scan_inst, dispatch_width,
2811 needs_dep, first_write_grf, write_len);
2812
2813 /* Continue the loop only if we haven't resolved all the dependencies */
2814 int i;
2815 for (i = 0; i < write_len; i++) {
2816 if (needs_dep[i])
2817 break;
2818 }
2819 if (i == write_len)
2820 return;
2821 }
2822 }
2823
2824 /**
2825 * Implements this workaround for the original 965:
2826 *
2827 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2828 * used as a destination register until after it has been sourced by an
2829 * instruction with a different destination register.
2830 */
2831 void
2832 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2833 {
2834 int write_len = inst->regs_written * dispatch_width / 8;
2835 int first_write_grf = inst->dst.reg;
2836 bool needs_dep[BRW_MAX_MRF];
2837 assert(write_len < (int)sizeof(needs_dep) - 1);
2838
2839 memset(needs_dep, false, sizeof(needs_dep));
2840 memset(needs_dep, true, write_len);
2841 /* Walk forwards looking for writes to registers we're writing which aren't
2842 * read before being written.
2843 */
2844 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2845 !scan_inst->is_tail_sentinel();
2846 scan_inst = (fs_inst *)scan_inst->next) {
2847 /* If we hit control flow, force resolve all remaining dependencies. */
2848 if (scan_inst->is_control_flow()) {
2849 for (int i = 0; i < write_len; i++) {
2850 if (needs_dep[i])
2851 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2852 }
2853 return;
2854 }
2855
2856 /* Clear the flag for registers that actually got read (as expected). */
2857 clear_deps_for_inst_src(scan_inst, dispatch_width,
2858 needs_dep, first_write_grf, write_len);
2859
2860 /* We insert our reads as late as possible since they're reading the
2861 * result of a SEND, which has massive latency.
2862 */
2863 if (scan_inst->dst.file == GRF &&
2864 scan_inst->dst.reg >= first_write_grf &&
2865 scan_inst->dst.reg < first_write_grf + write_len &&
2866 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2867 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2868 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2869 }
2870
2871 /* Continue the loop only if we haven't resolved all the dependencies */
2872 int i;
2873 for (i = 0; i < write_len; i++) {
2874 if (needs_dep[i])
2875 break;
2876 }
2877 if (i == write_len)
2878 return;
2879 }
2880
2881 /* If we hit the end of the program, resolve all remaining dependencies out
2882 * of paranoia.
2883 */
2884 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2885 assert(last_inst->eot);
2886 for (int i = 0; i < write_len; i++) {
2887 if (needs_dep[i])
2888 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2889 }
2890 }
2891
2892 void
2893 fs_visitor::insert_gen4_send_dependency_workarounds()
2894 {
2895 if (brw->gen != 4 || brw->is_g4x)
2896 return;
2897
2898 /* Note that we're done with register allocation, so GRF fs_regs always
2899 * have a .reg_offset of 0.
2900 */
2901
2902 foreach_list_safe(node, &this->instructions) {
2903 fs_inst *inst = (fs_inst *)node;
2904
2905 if (inst->mlen != 0 && inst->dst.file == GRF) {
2906 insert_gen4_pre_send_dependency_workarounds(inst);
2907 insert_gen4_post_send_dependency_workarounds(inst);
2908 }
2909 }
2910 }
2911
2912 /**
2913 * Turns the generic expression-style uniform pull constant load instruction
2914 * into a hardware-specific series of instructions for loading a pull
2915 * constant.
2916 *
2917 * The expression style allows the CSE pass before this to optimize out
2918 * repeated loads from the same offset, and gives the pre-register-allocation
2919 * scheduling full flexibility, while the conversion to native instructions
2920 * allows the post-register-allocation scheduler the best information
2921 * possible.
2922 *
2923 * Note that execution masking for setting up pull constant loads is special:
2924 * the channels that need to be written are unrelated to the current execution
2925 * mask, since a later instruction will use one of the result channels as a
2926 * source operand for all 8 or 16 of its channels.
2927 */
2928 void
2929 fs_visitor::lower_uniform_pull_constant_loads()
2930 {
2931 foreach_list(node, &this->instructions) {
2932 fs_inst *inst = (fs_inst *)node;
2933
2934 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2935 continue;
2936
2937 if (brw->gen >= 7) {
2938 /* The offset arg before was a vec4-aligned byte offset. We need to
2939 * turn it into a dword offset.
2940 */
2941 fs_reg const_offset_reg = inst->src[1];
2942 assert(const_offset_reg.file == IMM &&
2943 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2944 const_offset_reg.imm.u /= 4;
2945 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2946
2947 /* This is actually going to be a MOV, but since only the first dword
2948 * is accessed, we have a special opcode to do just that one. Note
2949 * that this needs to be an operation that will be considered a def
2950 * by live variable analysis, or register allocation will explode.
2951 */
2952 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2953 payload, const_offset_reg);
2954 setup->force_writemask_all = true;
2955
2956 setup->ir = inst->ir;
2957 setup->annotation = inst->annotation;
2958 inst->insert_before(setup);
2959
2960 /* Similarly, this will only populate the first 4 channels of the
2961 * result register (since we only use smear values from 0-3), but we
2962 * don't tell the optimizer.
2963 */
2964 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2965 inst->src[1] = payload;
2966
2967 invalidate_live_intervals();
2968 } else {
2969 /* Before register allocation, we didn't tell the scheduler about the
2970 * MRF we use. We know it's safe to use this MRF because nothing
2971 * else does except for register spill/unspill, which generates and
2972 * uses its MRF within a single IR instruction.
2973 */
2974 inst->base_mrf = 14;
2975 inst->mlen = 1;
2976 }
2977 }
2978 }
2979
2980 void
2981 fs_visitor::dump_instructions()
2982 {
2983 calculate_register_pressure();
2984
2985 int ip = 0, max_pressure = 0;
2986 foreach_list(node, &this->instructions) {
2987 backend_instruction *inst = (backend_instruction *)node;
2988 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2989 printf("{%3d} %4d: ", regs_live_at_ip[ip], ip);
2990 dump_instruction(inst);
2991 ++ip;
2992 }
2993 printf("Maximum %3d registers live at once.\n", max_pressure);
2994 }
2995
2996 void
2997 fs_visitor::dump_instruction(backend_instruction *be_inst)
2998 {
2999 fs_inst *inst = (fs_inst *)be_inst;
3000
3001 if (inst->predicate) {
3002 printf("(%cf0.%d) ",
3003 inst->predicate_inverse ? '-' : '+',
3004 inst->flag_subreg);
3005 }
3006
3007 printf("%s", brw_instruction_name(inst->opcode));
3008 if (inst->saturate)
3009 printf(".sat");
3010 if (inst->conditional_mod) {
3011 printf("%s", conditional_modifier[inst->conditional_mod]);
3012 if (!inst->predicate &&
3013 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3014 inst->opcode != BRW_OPCODE_IF &&
3015 inst->opcode != BRW_OPCODE_WHILE))) {
3016 printf(".f0.%d", inst->flag_subreg);
3017 }
3018 }
3019 printf(" ");
3020
3021
3022 switch (inst->dst.file) {
3023 case GRF:
3024 printf("vgrf%d", inst->dst.reg);
3025 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
3026 inst->dst.subreg_offset)
3027 printf("+%d.%d", inst->dst.reg_offset, inst->dst.subreg_offset);
3028 break;
3029 case MRF:
3030 printf("m%d", inst->dst.reg);
3031 break;
3032 case BAD_FILE:
3033 printf("(null)");
3034 break;
3035 case UNIFORM:
3036 printf("***u%d***", inst->dst.reg);
3037 break;
3038 case HW_REG:
3039 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3040 switch (inst->dst.fixed_hw_reg.nr) {
3041 case BRW_ARF_NULL:
3042 printf("null");
3043 break;
3044 case BRW_ARF_ADDRESS:
3045 printf("a0.%d", inst->dst.fixed_hw_reg.subnr);
3046 break;
3047 case BRW_ARF_ACCUMULATOR:
3048 printf("acc%d", inst->dst.fixed_hw_reg.subnr);
3049 break;
3050 case BRW_ARF_FLAG:
3051 printf("f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3052 inst->dst.fixed_hw_reg.subnr);
3053 break;
3054 default:
3055 printf("arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3056 inst->dst.fixed_hw_reg.subnr);
3057 break;
3058 }
3059 } else {
3060 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
3061 }
3062 if (inst->dst.fixed_hw_reg.subnr)
3063 printf("+%d", inst->dst.fixed_hw_reg.subnr);
3064 break;
3065 default:
3066 printf("???");
3067 break;
3068 }
3069 printf(":%s, ", reg_encoding[inst->dst.type]);
3070
3071 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
3072 if (inst->src[i].negate)
3073 printf("-");
3074 if (inst->src[i].abs)
3075 printf("|");
3076 switch (inst->src[i].file) {
3077 case GRF:
3078 printf("vgrf%d", inst->src[i].reg);
3079 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3080 inst->src[i].subreg_offset)
3081 printf("+%d.%d", inst->src[i].reg_offset,
3082 inst->src[i].subreg_offset);
3083 break;
3084 case MRF:
3085 printf("***m%d***", inst->src[i].reg);
3086 break;
3087 case UNIFORM:
3088 printf("u%d", inst->src[i].reg);
3089 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3090 inst->src[i].subreg_offset)
3091 printf("+%d.%d", inst->src[i].reg_offset,
3092 inst->src[i].subreg_offset);
3093 break;
3094 case BAD_FILE:
3095 printf("(null)");
3096 break;
3097 case IMM:
3098 switch (inst->src[i].type) {
3099 case BRW_REGISTER_TYPE_F:
3100 printf("%ff", inst->src[i].imm.f);
3101 break;
3102 case BRW_REGISTER_TYPE_D:
3103 printf("%dd", inst->src[i].imm.i);
3104 break;
3105 case BRW_REGISTER_TYPE_UD:
3106 printf("%uu", inst->src[i].imm.u);
3107 break;
3108 default:
3109 printf("???");
3110 break;
3111 }
3112 break;
3113 case HW_REG:
3114 if (inst->src[i].fixed_hw_reg.negate)
3115 printf("-");
3116 if (inst->src[i].fixed_hw_reg.abs)
3117 printf("|");
3118 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3119 switch (inst->src[i].fixed_hw_reg.nr) {
3120 case BRW_ARF_NULL:
3121 printf("null");
3122 break;
3123 case BRW_ARF_ADDRESS:
3124 printf("a0.%d", inst->src[i].fixed_hw_reg.subnr);
3125 break;
3126 case BRW_ARF_ACCUMULATOR:
3127 printf("acc%d", inst->src[i].fixed_hw_reg.subnr);
3128 break;
3129 case BRW_ARF_FLAG:
3130 printf("f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3131 inst->src[i].fixed_hw_reg.subnr);
3132 break;
3133 default:
3134 printf("arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3135 inst->src[i].fixed_hw_reg.subnr);
3136 break;
3137 }
3138 } else {
3139 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3140 }
3141 if (inst->src[i].fixed_hw_reg.subnr)
3142 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
3143 if (inst->src[i].fixed_hw_reg.abs)
3144 printf("|");
3145 break;
3146 default:
3147 printf("???");
3148 break;
3149 }
3150 if (inst->src[i].abs)
3151 printf("|");
3152
3153 if (inst->src[i].file != IMM) {
3154 printf(":%s", brw_reg_type_letters(inst->src[i].type));
3155 }
3156
3157 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
3158 printf(", ");
3159 }
3160
3161 printf(" ");
3162
3163 if (inst->force_uncompressed)
3164 printf("1sthalf ");
3165
3166 if (inst->force_sechalf)
3167 printf("2ndhalf ");
3168
3169 printf("\n");
3170 }
3171
3172 /**
3173 * Possibly returns an instruction that set up @param reg.
3174 *
3175 * Sometimes we want to take the result of some expression/variable
3176 * dereference tree and rewrite the instruction generating the result
3177 * of the tree. When processing the tree, we know that the
3178 * instructions generated are all writing temporaries that are dead
3179 * outside of this tree. So, if we have some instructions that write
3180 * a temporary, we're free to point that temp write somewhere else.
3181 *
3182 * Note that this doesn't guarantee that the returned instruction writes
3183 * only reg -- it might be the size=4 destination of a texture instruction.
3184 */
3185 fs_inst *
3186 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3187 fs_inst *end,
3188 fs_reg reg)
3189 {
3190 if (end == start ||
3191 end->is_partial_write() ||
3192 reg.reladdr ||
3193 !reg.equals(end->dst)) {
3194 return NULL;
3195 } else {
3196 return end;
3197 }
3198 }
3199
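/**
 * Lays out the Gen6+ fragment shader thread payload: records which payload
 * registers hold each enabled barycentric coordinate set, the interpolated
 * source depth and W, the MSAA position offsets and input coverage mask, and
 * accumulates the total in c->nr_payload_regs.
 */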
3200 void
3201 fs_visitor::setup_payload_gen6()
3202 {
3203 bool uses_depth =
3204 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3205 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3206
3207 assert(brw->gen >= 6);
3208
3209 /* R0-1: masks, pixel X/Y coordinates. */
3210 c->nr_payload_regs = 2;
3211 /* R2: only for 32-pixel dispatch. */
3212
3213 /* R3-26: barycentric interpolation coordinates. These appear in the
3214 * same order that they appear in the brw_wm_barycentric_interp_mode
3215 * enum. Each set of coordinates occupies 2 registers if dispatch width
3216 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3217 * appear if they were enabled using the "Barycentric Interpolation
3218 * Mode" bits in WM_STATE.
3219 */
3220 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3221 if (barycentric_interp_modes & (1 << i)) {
3222 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3223 c->nr_payload_regs += 2;
3224 if (dispatch_width == 16) {
3225 c->nr_payload_regs += 2;
3226 }
3227 }
3228 }
3229
3230 /* R27: interpolated depth if uses source depth */
3231 if (uses_depth) {
3232 c->source_depth_reg = c->nr_payload_regs;
3233 c->nr_payload_regs++;
3234 if (dispatch_width == 16) {
3235 /* R28: interpolated depth if not SIMD8. */
3236 c->nr_payload_regs++;
3237 }
3238 }
3239 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3240 if (uses_depth) {
3241 c->source_w_reg = c->nr_payload_regs;
3242 c->nr_payload_regs++;
3243 if (dispatch_width == 16) {
3244 /* R30: interpolated W if not SIMD8. */
3245 c->nr_payload_regs++;
3246 }
3247 }
3248
3249 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3250 /* R31: MSAA position offsets. */
3251 if (c->prog_data.uses_pos_offset) {
3252 c->sample_pos_reg = c->nr_payload_regs;
3253 c->nr_payload_regs++;
3254 }
3255
3256 /* R32: MSAA input coverage mask */
3257 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3258 assert(brw->gen >= 7);
3259 c->sample_mask_reg = c->nr_payload_regs;
3260 c->nr_payload_regs++;
3261 if (dispatch_width == 16) {
3262 /* R33: input coverage mask if not SIMD8. */
3263 c->nr_payload_regs++;
3264 }
3265 }
3266
3267 /* R34-: bary for 32-pixel. */
3268 /* R58-59: interp W for 32-pixel. */
3269
3270 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3271 c->source_depth_to_render_target = true;
3272 }
3273 }
3274
3275 void
3276 fs_visitor::assign_binding_table_offsets()
3277 {
3278 uint32_t next_binding_table_offset = 0;
3279
3280 /* If there are no color regions, we still perform an FB write to a null
3281 * renderbuffer, which we place at surface index 0.
3282 */
3283 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3284 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3285
3286 assign_common_binding_table_offsets(next_binding_table_offset);
3287 }
3288
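/**
 * Fills regs_live_at_ip[] with, for each instruction IP, the combined size
 * of all virtual GRFs whose live intervals span that IP.  Used by
 * dump_instructions() to report per-instruction register pressure.
 */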
3289 void
3290 fs_visitor::calculate_register_pressure()
3291 {
3292 calculate_live_intervals();
3293
3294 int num_instructions = 0;
3295 foreach_list(node, &this->instructions) {
3296 ++num_instructions;
3297 }
3298
3299 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3300
3301 for (int reg = 0; reg < virtual_grf_count; reg++) {
3302 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3303 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3304 }
3305 }
3306
3307 /**
3308 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3309 *
3310 * The needs_unlit_centroid_workaround ends up producing one of these per
3311 * channel of centroid input, so it's good to clean them up.
3312 *
3313 * An assumption here is that nothing ever modifies the dispatched pixels
3314 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3315 * dictates that anyway.
3316 */
3317 void
3318 fs_visitor::opt_drop_redundant_mov_to_flags()
3319 {
3320 bool flag_mov_found[2] = {false};
3321
3322 foreach_list_safe(node, &this->instructions) {
3323 fs_inst *inst = (fs_inst *)node;
3324
3325 if (inst->is_control_flow()) {
3326 memset(flag_mov_found, 0, sizeof(flag_mov_found));
3327 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3328 if (!flag_mov_found[inst->flag_subreg])
3329 flag_mov_found[inst->flag_subreg] = true;
3330 else
3331 inst->remove();
3332 } else if (inst->writes_flag()) {
3333 flag_mov_found[inst->flag_subreg] = false;
3334 }
3335 }
3336 }
3337
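/**
 * Top-level driver for a single compile at the current dispatch width: sets
 * up the payload, emits FS IR from the GLSL IR (or ARB fragment program),
 * iterates the optimization passes until they stop making progress, then
 * schedules and register allocates, falling back to spilling (or failing
 * outright in SIMD16) when allocation without spills isn't possible.
 * Returns false if compilation failed.
 */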
3338 bool
3339 fs_visitor::run()
3340 {
3341 sanity_param_count = fp->Base.Parameters->NumParameters;
3342 bool allocated_without_spills;
3343
3344 assign_binding_table_offsets();
3345
3346 if (brw->gen >= 6)
3347 setup_payload_gen6();
3348 else
3349 setup_payload_gen4();
3350
3351 if (0) {
3352 emit_dummy_fs();
3353 } else {
3354 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3355 emit_shader_time_begin();
3356
3357 calculate_urb_setup();
3358 if (fp->Base.InputsRead > 0) {
3359 if (brw->gen < 6)
3360 emit_interpolation_setup_gen4();
3361 else
3362 emit_interpolation_setup_gen6();
3363 }
3364
3365 /* We handle discards by keeping track of the still-live pixels in f0.1.
3366 * Initialize it with the dispatched pixels.
3367 */
3368 if (fp->UsesKill || c->key.alpha_test_func) {
3369 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3370 discard_init->flag_subreg = 1;
3371 }
3372
3373 /* Generate FS IR for main(). (the visitor only descends into
3374 * functions called "main").
3375 */
3376 if (shader) {
3377 foreach_list(node, &*shader->base.ir) {
3378 ir_instruction *ir = (ir_instruction *)node;
3379 base_ir = ir;
3380 this->result = reg_undef;
3381 ir->accept(this);
3382 }
3383 } else {
3384 emit_fragment_program_code();
3385 }
3386 base_ir = NULL;
3387 if (failed)
3388 return false;
3389
3390 emit(FS_OPCODE_PLACEHOLDER_HALT);
3391
3392 if (c->key.alpha_test_func)
3393 emit_alpha_test();
3394
3395 emit_fb_writes();
3396
3397 split_virtual_grfs();
3398
3399 move_uniform_array_access_to_pull_constants();
3400 remove_dead_constants();
3401 setup_pull_constants();
3402
3403 opt_drop_redundant_mov_to_flags();
3404
3405 bool progress;
3406 do {
3407 progress = false;
3408
3409 compact_virtual_grfs();
3410
3411 progress = remove_duplicate_mrf_writes() || progress;
3412
3413 progress = opt_algebraic() || progress;
3414 progress = opt_cse() || progress;
3415 progress = opt_copy_propagate() || progress;
3416 progress = opt_peephole_predicated_break() || progress;
3417 progress = dead_code_eliminate() || progress;
3418 progress = dead_code_eliminate_local() || progress;
3419 progress = opt_peephole_sel() || progress;
3420 progress = dead_control_flow_eliminate(this) || progress;
3421 progress = opt_saturate_propagation() || progress;
3422 progress = register_coalesce() || progress;
3423 progress = compute_to_mrf() || progress;
3424 } while (progress);
3425
3426 lower_uniform_pull_constant_loads();
3427
3428 assign_curb_setup();
3429 assign_urb_setup();
3430
3431 static enum instruction_scheduler_mode pre_modes[] = {
3432 SCHEDULE_PRE,
3433 SCHEDULE_PRE_NON_LIFO,
3434 SCHEDULE_PRE_LIFO,
3435 };
3436
3437 /* Try each scheduling heuristic to see if it can successfully register
3438 * allocate without spilling. They should be ordered by decreasing
3439 * performance but increasing likelihood of allocating.
3440 */
3441 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3442 schedule_instructions(pre_modes[i]);
3443
3444 if (0) {
3445 assign_regs_trivial();
3446 allocated_without_spills = true;
3447 } else {
3448 allocated_without_spills = assign_regs(false);
3449 }
3450 if (allocated_without_spills)
3451 break;
3452 }
3453
3454 if (!allocated_without_spills) {
3455 /* We assume that any spilling is worse than just dropping back to
3456 * SIMD8. There's probably actually some intermediate point where
3457 * SIMD16 with a couple of spills is still better.
3458 */
3459 if (dispatch_width == 16) {
3460 fail("Failure to register allocate. Reduce number of "
3461 "live scalar values to avoid this.");
3462 }
3463
3464 /* Since we're out of heuristics, just go spill registers until we
3465 * get an allocation.
3466 */
3467 while (!assign_regs(true)) {
3468 if (failed)
3469 break;
3470 }
3471 }
3472 }
3473 assert(force_uncompressed_stack == 0);
3474
3475 /* This must come after all optimization and register allocation, since
3476 * it inserts dead code that happens to have side effects, and it does
3477 * so based on the actual physical registers in use.
3478 */
3479 insert_gen4_send_dependency_workarounds();
3480
3481 if (failed)
3482 return false;
3483
3484 if (!allocated_without_spills)
3485 schedule_instructions(SCHEDULE_POST);
3486
3487 if (dispatch_width == 8)
3488 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3489 else
3490 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3491
3492 /* If any state parameters were appended, then ParameterValues could have
3493 * been realloced, in which case the driver uniform storage set up by
3494 * _mesa_associate_uniform_storage() would point to freed memory. Make
3495 * sure that didn't happen.
3496 */
3497 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3498
3499 return !failed;
3500 }
3501
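/**
 * Driver entry point for fragment shader compilation: runs a SIMD8
 * fs_visitor, optionally a SIMD16 one on Gen5+ when there are no pull
 * parameters (and SIMD16 isn't disabled for debugging), then hands the
 * resulting instruction lists to the generator to produce native code,
 * returning the assembly and its size through final_assembly_size.
 */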
3502 const unsigned *
3503 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3504 struct gl_fragment_program *fp,
3505 struct gl_shader_program *prog,
3506 unsigned *final_assembly_size)
3507 {
3508 bool start_busy = false;
3509 double start_time = 0;
3510
3511 if (unlikely(brw->perf_debug)) {
3512 start_busy = (brw->batch.last_bo &&
3513 drm_intel_bo_busy(brw->batch.last_bo));
3514 start_time = get_time();
3515 }
3516
3517 struct brw_shader *shader = NULL;
3518 if (prog)
3519 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3520
3521 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3522 if (prog) {
3523 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3524 _mesa_print_ir(shader->base.ir, NULL);
3525 printf("\n\n");
3526 } else {
3527 printf("ARB_fragment_program %d ir for native fragment shader\n",
3528 fp->Base.Id);
3529 _mesa_print_program(&fp->Base);
3530 }
3531 }
3532
3533 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3534 */
3535 fs_visitor v(brw, c, prog, fp, 8);
3536 if (!v.run()) {
3537 if (prog) {
3538 prog->LinkStatus = false;
3539 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3540 }
3541
3542 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3543 v.fail_msg);
3544
3545 return NULL;
3546 }
3547
3548 exec_list *simd16_instructions = NULL;
3549 fs_visitor v2(brw, c, prog, fp, 16);
3550 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3551 if (c->prog_data.base.nr_pull_params == 0) {
3552 /* Try a SIMD16 compile */
3553 v2.import_uniforms(&v);
3554 if (!v2.run()) {
3555 perf_debug("SIMD16 shader failed to compile, falling back to "
3556 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3557 } else {
3558 simd16_instructions = &v2.instructions;
3559 }
3560 } else {
3561 perf_debug("Skipping SIMD16 due to pull parameters.\n");
3562 }
3563 }
3564
3565 const unsigned *assembly = NULL;
3566 if (brw->gen >= 8) {
3567 gen8_fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3568 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3569 final_assembly_size);
3570 } else {
3571 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3572 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3573 final_assembly_size);
3574 }
3575
3576 if (unlikely(brw->perf_debug) && shader) {
3577 if (shader->compiled_once)
3578 brw_wm_debug_recompile(brw, prog, &c->key);
3579 shader->compiled_once = true;
3580
3581 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3582 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3583 (get_time() - start_time) * 1000);
3584 }
3585 }
3586
3587 return assembly;
3588 }
3589
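/**
 * Compiles the fragment shader at link time using a guessed program key
 * (assumed texture swizzles, depth test enabled on Gen4-5, and so on) so a
 * likely variant is already in the program cache, then restores the
 * previously bound WM program state.
 */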
3590 bool
3591 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3592 {
3593 struct brw_context *brw = brw_context(ctx);
3594 struct brw_wm_prog_key key;
3595
3596 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3597 return true;
3598
3599 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3600 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3601 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3602 bool program_uses_dfdy = fp->UsesDFdy;
3603
3604 memset(&key, 0, sizeof(key));
3605
3606 if (brw->gen < 6) {
3607 if (fp->UsesKill)
3608 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3609
3610 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3611 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3612
3613 /* Just assume depth testing. */
3614 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3615 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3616 }
3617
3618 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3619 BRW_FS_VARYING_INPUT_MASK) > 16)
3620 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3621
3622 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3623
3624 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3625 for (unsigned i = 0; i < sampler_count; i++) {
3626 if (fp->Base.ShadowSamplers & (1 << i)) {
3627 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3628 key.tex.swizzles[i] =
3629 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3630 } else {
3631 /* Color sampler: assume no swizzling. */
3632 key.tex.swizzles[i] = SWIZZLE_XYZW;
3633 }
3634 }
3635
3636 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3637 key.drawable_height = ctx->DrawBuffer->Height;
3638 }
3639
3640 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3641 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3642 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3643
3644 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3645 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3646 key.nr_color_regions > 1;
3647 }
3648
3649 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3650 * quality of the derivatives is likely to be determined by the driconf
3651 * option.
3652 */
3653 key.high_quality_derivatives = brw->disable_derivative_optimization;
3654
3655 key.program_string_id = bfp->id;
3656
3657 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3658 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3659
3660 bool success = do_wm_prog(brw, prog, bfp, &key);
3661
3662 brw->wm.base.prog_offset = old_prog_offset;
3663 brw->wm.prog_data = old_prog_data;
3664
3665 return success;
3666 }