i965/fs: Optimize saturating SEL.G(E) with imm val <= 0.0f.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/hash_table.h"
#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "main/uniforms.h"
#include "brw_fs_live_variables.h"
#include "glsl/glsl_types.h"

void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;

   /* This will be the case for almost all instructions. */
   this->regs_written = 1;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2(ADDC)
ALU2(SUBB)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(brw->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (brw->gen == 4) {
      dst.type = src0.type;
      if (dst.file == HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
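
/* A minimal usage sketch (operands hypothetical): emitting
 *
 *    emit(CMP(reg_null_d, x, fs_reg(0.0f), BRW_CONDITIONAL_G));
 *
 * throws away the per-channel low-bit results but leaves the packed
 * comparison mask in the flag register, ready to predicate a following
 * SEL or MOV.
 */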

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg varying_offset,
                                       uint32_t const_offset)
{
   exec_list instructions;
   fs_inst *inst;

   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable
    * offset and a portion done using reg_offset, which means that if you
    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
    * CSE can later notice that those loads are all the same and eliminate
    * the redundant ones.
    */
   fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
   instructions.push_tail(ADD(vec4_offset,
                              varying_offset, const_offset & ~3));

   int scale = 1;
   if (brw->gen == 4 && dispatch_width == 8) {
      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
       * u, v, r) as parameters, or we can just use the SIMD16 message
       * consisting of (header, u).  We choose the second, at the cost of a
       * longer return length.
       */
      scale = 2;
   }

   enum opcode op;
   if (brw->gen >= 7)
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   else
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
   fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
   inst->regs_written = 4 * scale;
   instructions.push_tail(inst);

   if (brw->gen < 7) {
      inst->base_mrf = 13;
      inst->header_present = true;
      if (brw->gen == 4)
         inst->mlen = 3;
      else
         inst->mlen = 1 + dispatch_width / 8;
   }

   vec4_result.reg_offset += (const_offset & 3) * scale;
   instructions.push_tail(MOV(dst, vec4_result));

   return instructions;
}
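
/* A worked example of the const_offset split above (hypothetical values):
 * with const_offset = 6 on gen7 (scale = 1), vec4_offset becomes
 * varying_offset + 4, the send fills vec4_result with four contiguous
 * components, and reg_offset += (6 & 3) picks component 2 for the final
 * MOV into dst.
 */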

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   inst->ir = NULL;
   inst->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   inst->force_uncompressed = true;

   return inst;
}

bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written);
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF) ||
           (is_tex() && src[0].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (brw->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   if (!inst->can_do_source_mods())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

fs_reg
fs_reg::retype(uint32_t type)
{
   fs_reg result = *this;
   result.type = type;
   return result;
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

bool
fs_reg::is_null() const
{
   return file == HW_REG &&
          fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
          fixed_hw_reg.nr == BRW_ARF_NULL;
}

bool
fs_reg::is_valid_3src() const
{
   return file == GRF || file == UNIFORM;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_ATOMIC_UINT:
      return 0;
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(!"not reached");
      break;
   }

   return 0;
}
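
/* Some illustrative sizes, in components: a float is 1, a vec4 is 4, a
 * mat4 is 16 (components() counts rows * columns), and float[10] is 10.
 * These are the sizes the automatic fs_reg constructor below passes to
 * virtual_grf_alloc().
 */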

fs_reg
fs_visitor::get_timestamp()
{
   assert(brw->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   int shader_time_index =
      brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);

   fs_reg payload;
   if (dispatch_width == 8)
      payload = fs_reg(this, glsl_type::uvec2_type);
   else
      payload = fs_reg(this, glsl_type::uint_type);

   emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
                fs_reg(), payload, offset, value));
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns true if the instruction has a flag that means it won't
 * update an entire destination register.
 *
 * For example, dead code elimination and live variable analysis want to know
 * when a write to a variable screens off any preceding values that were in
 * it.
 */
bool
fs_inst::is_partial_write()
{
   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
           this->force_uncompressed ||
           this->force_sechalf);
}
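
/* A concrete case (hypothetical): a MOV predicated on a flag writes only
 * the enabled channels, so the destination's previous contents stay live
 * in the disabled channels.  Predicated SEL is the exception: it writes
 * every channel, picking one of its sources per channel.
 */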

int
fs_inst::regs_read(fs_visitor *v, int arg)
{
   if (is_tex() && arg == 0 && src[0].file == GRF) {
      if (v->dispatch_width == 16)
         return (mlen + 1) / 2;
      else
         return mlen;
   }
   return 1;
}
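
/* The halving above reflects that mlen counts physical GRFs while
 * regs_read() is in units of virtual registers, which in SIMD16 each span
 * two physical GRFs; the +1 rounds up for an odd-length payload.
 */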

bool
fs_inst::reads_flag()
{
   return predicate;
}

bool
fs_inst::writes_flag()
{
   return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
          opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   if (inst->base_mrf == -1)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_MS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->mlen;
   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      return 2;
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
      return 0;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow the uniform setup from the 8-wide dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
   this->nr_params_remap = v->nr_params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
   (void)params_before;
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (brw->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(VARYING_SLOT_POS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (brw->gen >= 6) {
      if (is_centroid) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
      } else {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
      }
   } else {
      /* On Ironlake and below, there is only one interpolation mode.
       * Centroid interpolation doesn't mean anything on this hardware --
       * there is no multisampling.
       */
      barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (c->prog_data.urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               struct brw_reg interp = interp_reg(location, k);
               emit_linterp(attr, fs_reg(interp), interpolation_mode,
                            ir->centroid);
               if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                  /* Get the pixel/sample mask into f0 so that we know
                   * which pixels are lit.  Then, for each channel that is
                   * unlit, replace the centroid data with non-centroid
                   * data.
                   */
                  emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
                  fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                               interpolation_mode, false);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = true;
               }
               if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (brw->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (brw->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (brw->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (brw->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (brw->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (brw->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
      c->prog_data.urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (brw->gen >= 6) {
      if (_mesa_bitcount_64(fp->Base.InputsRead &
                            BRW_FS_VARYING_INPUT_MASK) <= 16) {
         /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
          * first 16 varying inputs, so we can put them wherever we want.
          * Just put them in order.
          *
          * This is useful because it means that (a) inputs not used by the
          * fragment shader won't take up valuable register space, and (b) we
          * won't have to recompile the fragment shader if it gets paired with
          * a different vertex (or geometry) shader.
          */
         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
            if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
                BITFIELD64_BIT(i)) {
               c->prog_data.urb_setup[i] = urb_next++;
            }
         }
      } else {
         /* We have enough input varyings that the SF/SBE pipeline stage can't
          * arbitrarily rearrange them to suit our whim; we have to put them
          * in an order that matches the output of the previous pipeline stage
          * (geometry or vertex shader).
          */
         struct brw_vue_map prev_stage_vue_map;
         brw_compute_vue_map(brw, &prev_stage_vue_map,
                             c->key.input_slots_valid);
         int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
         assert(prev_stage_vue_map.num_slots <= first_slot + 32);
         for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
              slot++) {
            int varying = prev_stage_vue_map.slot_to_varying[slot];
            /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
             * unused.
             */
            if (varying != BRW_VARYING_SLOT_COUNT &&
                (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
                 BITFIELD64_BIT(varying))) {
               c->prog_data.urb_setup[varying] = slot - first_slot;
            }
         }
         urb_next = prev_stage_vue_map.num_slots - first_slot;
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               c->prog_data.urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's an FS-only attribute, and we did interpolation for this
       * attribute in the SF thread.  So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   c->prog_data.num_varying_inputs = urb_next;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   this->first_non_payload_grf =
      urb_start + c->prog_data.num_varying_inputs * 2;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 1 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written > 1) {
         split_grf[inst->dst.reg] = false;
      }

      /* If we're sending from a GRF, don't split it, on the assumption that
       * the send is reading the whole thing.
       */
      if (inst->is_send_from_grf()) {
         for (int i = 0; i < 3; i++) {
            if (inst->src[i].file == GRF) {
               split_grf[inst->src[i].reg] = false;
            }
         }
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   invalidate_live_intervals();
}
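
/* For example (hypothetical temporary): a size-4 vgrf holding a vec4
 * becomes four independent size-1 vgrfs, so the allocator can place,
 * coalesce, or spill each component separately instead of keeping four
 * contiguous GRFs live together.
 */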

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         invalidate_live_intervals();
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
      this->nr_params_remap = c->prog_data.nr_params;

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            /* Section 5.11 of the OpenGL 4.3 spec says:
             *
             * "Out-of-bounds reads return undefined values, which include
             *  values from other variables of the active program or zero."
             */
            if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
               constant_nr = 0;
            }

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         /* as above, alias out-of-bounds reads to 0 */
         if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
            constant_nr = 0;
         }
         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index,
                                                     *inst->src[i].reladdr,
                                                     pull_constant_loc[uniform] +
                                                     inst->src[i].reg_offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}
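
/* To make the flow concrete (hypothetical GLSL): for "uniform float a[64];
 * ... a[i];", the whole array is copied into pull_param, and the reladdr
 * read of a[i] is rewritten as a VARYING_PULL_CONSTANT_LOAD into a fresh
 * temporary, which the instruction then sources instead.
 */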

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}
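
/* A worked example (hypothetical counts): with 160 float params, params
 * [128, 160) are demoted.  A use of one of them loads the 16-byte-aligned
 * block containing it ((pull_index * 4) & ~15) and relies on smear
 * (pull_index & 3) to select the float within that block.
 */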

bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_OR:
         if (inst->src[0].equals(inst->src[1])) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
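      /* Saturating SEL with an immediate: SEL.l(e) computes min(x, imm),
       * and sat(min(x, imm)) == sat(x) whenever imm >= 1.0; SEL.g(e)
       * computes max(x, imm), and sat(max(x, imm)) == sat(x) whenever
       * imm <= 0.0, since saturate already clamps anything negative to 0
       * (e.g. sat(max(x, -1.0)) == sat(x)).  Either way the SEL can be
       * turned into a saturating MOV of x.
       */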
      case BRW_OPCODE_SEL:
         if (inst->saturate && inst->src[1].file == IMM) {
            switch (inst->conditional_mod) {
            case BRW_CONDITIONAL_LE:
            case BRW_CONDITIONAL_L:
               switch (inst->src[1].type) {
               case BRW_REGISTER_TYPE_F:
                  if (inst->src[1].imm.f >= 1.0f) {
                     inst->opcode = BRW_OPCODE_MOV;
                     inst->src[1] = reg_undef;
                     progress = true;
                  }
                  break;
               default:
                  break;
               }
               break;
            case BRW_CONDITIONAL_GE:
            case BRW_CONDITIONAL_G:
               switch (inst->src[1].type) {
               case BRW_REGISTER_TYPE_F:
                  if (inst->src[1].imm.f <= 0.0f) {
                     inst->opcode = BRW_OPCODE_MOV;
                     inst->src[1] = reg_undef;
                     inst->conditional_mod = BRW_CONDITIONAL_NONE;
                     progress = true;
                  }
                  break;
               default:
                  break;
               }
               break;
            default:
               break;
            }
         }
         break;
      default:
         break;
      }
   }

   return progress;
}
1926
1927 /**
1928 * Removes any instructions writing a VGRF where that VGRF is not used by any
1929 * later instruction.
1930 */
1931 bool
1932 fs_visitor::dead_code_eliminate()
1933 {
1934 bool progress = false;
1935 int pc = 0;
1936
1937 calculate_live_intervals();
1938
1939 foreach_list_safe(node, &this->instructions) {
1940 fs_inst *inst = (fs_inst *)node;
1941
1942 if (inst->dst.file == GRF) {
1943 bool dead = true;
1944
1945 for (int i = 0; i < inst->regs_written; i++) {
1946 int var = live_intervals->var_from_vgrf[inst->dst.reg];
1947 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
1948 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
1949 dead = false;
1950 break;
1951 }
1952 }
1953
1954 if (dead) {
1955 /* Don't dead code eliminate instructions that write to the
1956 * accumulator as a side-effect. Instead just set the destination
1957 * to the null register to free it.
1958 */
1959 switch (inst->opcode) {
1960 case BRW_OPCODE_ADDC:
1961 case BRW_OPCODE_SUBB:
1962 case BRW_OPCODE_MACH:
1963 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
1964 break;
1965 default:
1966 inst->remove();
1967 progress = true;
1968 break;
1969 }
1970 }
1971 }
1972
1973 pc++;
1974 }
1975
1976 if (progress)
1977 invalidate_live_intervals();
1978
1979 return progress;
1980 }
1981
1982 struct dead_code_hash_key
1983 {
1984 int vgrf;
1985 int reg_offset;
1986 };
1987
1988 static bool
1989 dead_code_hash_compare(const void *a, const void *b)
1990 {
1991 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1992 }
1993
1994 static void
1995 clear_dead_code_hash(struct hash_table *ht)
1996 {
1997 struct hash_entry *entry;
1998
1999 hash_table_foreach(ht, entry) {
2000 _mesa_hash_table_remove(ht, entry);
2001 }
2002 }
2003
2004 static void
2005 insert_dead_code_hash(struct hash_table *ht,
2006 int vgrf, int reg_offset, fs_inst *inst)
2007 {
2008 /* We don't bother freeing keys, because they'll be GCed with the ht. */
2009 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2010
2011 key->vgrf = vgrf;
2012 key->reg_offset = reg_offset;
2013
2014 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2015 }
2016
2017 static struct hash_entry *
2018 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2019 {
2020 struct dead_code_hash_key key;
2021
2022 key.vgrf = vgrf;
2023 key.reg_offset = reg_offset;
2024
2025 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2026 }
2027
2028 static void
2029 remove_dead_code_hash(struct hash_table *ht,
2030 int vgrf, int reg_offset)
2031 {
2032 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2033 if (!entry)
2034 return;
2035
2036 _mesa_hash_table_remove(ht, entry);
2037 }
2038
2039 /**
2040 * Walks basic blocks, removing any regs that are written but not read before
2041 * being redefined.
2042 *
2043 * The dead_code_eliminate() function implements a global dead code
2044 * elimination, but it only handles removing the last write to a register
2045 * if it's never read. This one can handle intermediate writes, but only
2046 * within a basic block.
2047 */
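/* Illustrative IR, within one basic block:
 *
 *    mov vgrf1, a   <- overwritten before being read: eliminated here
 *    mov vgrf1, b
 *    add vgrf2, vgrf1, c
 */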
2048 bool
2049 fs_visitor::dead_code_eliminate_local()
2050 {
2051 struct hash_table *ht;
2052 bool progress = false;
2053
2054 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2055
2056 foreach_list_safe(node, &this->instructions) {
2057 fs_inst *inst = (fs_inst *)node;
2058
2059 /* At a basic block boundary, empty the HT, since we don't track
2060 * dataflow across blocks here.
2061 */
2062 if (inst->is_control_flow()) {
2063 clear_dead_code_hash(ht);
2064 continue;
2065 }
2066
2067 /* Clear the HT of any instructions that got read. */
2068 for (int i = 0; i < 3; i++) {
2069 fs_reg src = inst->src[i];
2070 if (src.file != GRF)
2071 continue;
2072
2073 int read = 1;
2074 if (inst->is_send_from_grf())
2075 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2076
2077 for (int reg_offset = src.reg_offset;
2078 reg_offset < src.reg_offset + read;
2079 reg_offset++) {
2080 remove_dead_code_hash(ht, src.reg, reg_offset);
2081 }
2082 }
2083
2084 /* Add any update of a GRF to the HT, removing a previous write if it
2085 * wasn't read.
2086 */
2087 if (inst->dst.file == GRF) {
2088 if (inst->regs_written > 1) {
2089 /* We don't know how to trim channels from an instruction's
2090 * writes, so we can't incrementally remove unread channels from
2091 * it. Just remove whatever it overwrites from the table.
2092 */
2093 for (int i = 0; i < inst->regs_written; i++) {
2094 remove_dead_code_hash(ht,
2095 inst->dst.reg,
2096 inst->dst.reg_offset + i);
2097 }
2098 } else {
2099 struct hash_entry *entry =
2100 get_dead_code_hash_entry(ht, inst->dst.reg,
2101 inst->dst.reg_offset);
2102
2103 if (inst->is_partial_write()) {
2104 /* For a partial write, we can't remove any previous dead code
2105 * candidate, since we're just modifying its result, but we can
2106 * be dead code eliminated ourselves.
2107 */
2108 if (entry) {
2109 entry->data = inst;
2110 } else {
2111 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2112 inst);
2113 }
2114 } else {
2115 if (entry) {
2116 /* We're completely updating a channel, and there was a
2117 * previous write to the channel that wasn't read. Kill it!
2118 */
2119 fs_inst *inst = (fs_inst *)entry->data;
2120 inst->remove();
2121 progress = true;
2122 _mesa_hash_table_remove(ht, entry);
2123 }
2124
2125 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2126 inst);
2127 }
2128 }
2129 }
2130 }
2131
2132 _mesa_hash_table_destroy(ht, NULL);
2133
2134 if (progress)
2135 invalidate_live_intervals();
2136
2137 return progress;
2138 }
2139
2140 /**
2141 * Implements a second type of register coalescing: This one checks if
2142 * the two regs involved in a raw move don't interfere, in which case
2143 * they can both be stored in the same place and the MOV removed.
2144 */
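/* Illustrative IR: if vgrf1 and vgrf2 don't interfere,
 *
 *    add vgrf1, a, b
 *    mov vgrf2, vgrf1
 *
 * becomes a single "add vgrf2, a, b", with every other reference to vgrf1
 * rewritten to vgrf2 and the MOV removed.
 */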
2145 bool
2146 fs_visitor::register_coalesce_2()
2147 {
2148 bool progress = false;
2149
2150 calculate_live_intervals();
2151
2152 foreach_list_safe(node, &this->instructions) {
2153 fs_inst *inst = (fs_inst *)node;
2154
2155 if (inst->opcode != BRW_OPCODE_MOV ||
2156 inst->is_partial_write() ||
2157 inst->saturate ||
2158 inst->src[0].file != GRF ||
2159 inst->src[0].negate ||
2160 inst->src[0].abs ||
2161 inst->src[0].smear != -1 ||
2162 inst->dst.file != GRF ||
2163 inst->dst.type != inst->src[0].type ||
2164 virtual_grf_sizes[inst->src[0].reg] != 1) {
2165 continue;
2166 }
2167
2168 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2169 int var_to = live_intervals->var_from_reg(&inst->dst);
2170
2171 if (live_intervals->vars_interfere(var_from, var_to))
2172 continue;
2173
2174 int reg_from = inst->src[0].reg;
2175 assert(inst->src[0].reg_offset == 0);
2176 int reg_to = inst->dst.reg;
2177 int reg_to_offset = inst->dst.reg_offset;
2178
2179 foreach_list(node, &this->instructions) {
2180 fs_inst *scan_inst = (fs_inst *)node;
2181
2182 if (scan_inst->dst.file == GRF &&
2183 scan_inst->dst.reg == reg_from) {
2184 scan_inst->dst.reg = reg_to;
2185 scan_inst->dst.reg_offset = reg_to_offset;
2186 }
2187 for (int i = 0; i < 3; i++) {
2188 if (scan_inst->src[i].file == GRF &&
2189 scan_inst->src[i].reg == reg_from) {
2190 scan_inst->src[i].reg = reg_to;
2191 scan_inst->src[i].reg_offset = reg_to_offset;
2192 }
2193 }
2194 }
2195
2196 inst->remove();
2197 progress = true;
2198 continue;
2199 }
2200
2201 if (progress)
2202 invalidate_live_intervals();
2203
2204 return progress;
2205 }
2206
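/**
 * Coalesces away a raw GRF-to-GRF (or UNIFORM-to-GRF) MOV by rewriting
 * later reads of its destination to read the source instead, as long as
 * nothing overwrites either register in between. Illustrative IR:
 *
 *    mov vgrf2, vgrf1
 *    add vgrf3, vgrf2, c   ->   add vgrf3, vgrf1, c   (MOV removed)
 */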
2207 bool
2208 fs_visitor::register_coalesce()
2209 {
2210 bool progress = false;
2211 int if_depth = 0;
2212 int loop_depth = 0;
2213
2214 foreach_list_safe(node, &this->instructions) {
2215 fs_inst *inst = (fs_inst *)node;
2216
2217 /* Make sure that we dominate the instructions we're going to
2218 * scan for interfering with our coalescing, or we won't have
2219 * scanned enough to see if anything interferes with our
2220 * coalescing. We don't dominate the following instructions if
2221 * we're in a loop or an if block.
2222 */
2223 switch (inst->opcode) {
2224 case BRW_OPCODE_DO:
2225 loop_depth++;
2226 break;
2227 case BRW_OPCODE_WHILE:
2228 loop_depth--;
2229 break;
2230 case BRW_OPCODE_IF:
2231 if_depth++;
2232 break;
2233 case BRW_OPCODE_ENDIF:
2234 if_depth--;
2235 break;
2236 default:
2237 break;
2238 }
2239 if (loop_depth || if_depth)
2240 continue;
2241
2242 if (inst->opcode != BRW_OPCODE_MOV ||
2243 inst->is_partial_write() ||
2244 inst->saturate ||
2245 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2246 inst->src[0].file != UNIFORM) ||
2247 inst->dst.type != inst->src[0].type)
2248 continue;
2249
2250 bool has_source_modifiers = (inst->src[0].abs ||
2251 inst->src[0].negate ||
2252 inst->src[0].smear != -1 ||
2253 inst->src[0].file == UNIFORM);
2254
2255 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2256 * them: check for no writes to either one until the exit of the
2257 * program.
2258 */
2259 bool interfered = false;
2260
2261 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2262 !scan_inst->is_tail_sentinel();
2263 scan_inst = (fs_inst *)scan_inst->next) {
2264 if (scan_inst->dst.file == GRF) {
2265 if (scan_inst->overwrites_reg(inst->dst) ||
2266 scan_inst->overwrites_reg(inst->src[0])) {
2267 interfered = true;
2268 break;
2269 }
2270 }
2271
2272 if (has_source_modifiers) {
2273 for (int i = 0; i < 3; i++) {
2274 if (scan_inst->src[i].file == GRF &&
2275 scan_inst->src[i].reg == inst->dst.reg &&
2276 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2277 inst->dst.type != scan_inst->src[i].type)
2278 {
2279 interfered = true;
2280 break;
2281 }
2282 }
2283 }
2284
2285
2286 /* The gen6 MATH instruction can't handle source modifiers or
2287 * unusual register regions, so avoid coalescing those for
2288 * now. We should do something more specific.
2289 */
2290 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2291 interfered = true;
2292 break;
2293 }
2294
2295 if (scan_inst->mlen > 0 && scan_inst->base_mrf == -1 &&
2296 scan_inst->src[0].file == GRF &&
2297 scan_inst->src[0].reg == inst->dst.reg) {
2298 interfered = true;
2299 break;
2300 }
2301
2302 /* The accumulator result appears to get used for the
2303 * conditional modifier generation. When negating a UD
2304 * value, there is a 33rd bit generated for the sign in the
2305 * accumulator value, so now you can't check, for example,
2306 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2307 */
2308 if (scan_inst->conditional_mod &&
2309 inst->src[0].negate &&
2310 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2311 interfered = true;
2312 break;
2313 }
2314 }
2315 if (interfered) {
2316 continue;
2317 }
2318
2319 /* Rewrite the later usage to point at the source of the move to
2320 * be removed.
2321 */
2322 for (fs_inst *scan_inst = inst;
2323 !scan_inst->is_tail_sentinel();
2324 scan_inst = (fs_inst *)scan_inst->next) {
2325 for (int i = 0; i < 3; i++) {
2326 if (scan_inst->src[i].file == GRF &&
2327 scan_inst->src[i].reg == inst->dst.reg &&
2328 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2329 fs_reg new_src = inst->src[0];
2330 if (scan_inst->src[i].abs) {
2331 new_src.negate = 0;
2332 new_src.abs = 1;
2333 }
2334 new_src.negate ^= scan_inst->src[i].negate;
2335 new_src.sechalf = scan_inst->src[i].sechalf;
2336 scan_inst->src[i] = new_src;
2337 }
2338 }
2339 }
2340
2341 inst->remove();
2342 progress = true;
2343 }
2344
2345 if (progress)
2346 invalidate_live_intervals();
2347
2348 return progress;
2349 }
2350
2351
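/**
 * Folds a MOV from a GRF into an MRF back into the instruction that
 * computed the GRF value, so the computation writes the MRF directly.
 * Illustrative IR:
 *
 *    add vgrf1, a, b        ->   add m3, a, b
 *    mov m3, vgrf1               (MOV removed)
 */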
2352 bool
2353 fs_visitor::compute_to_mrf()
2354 {
2355 bool progress = false;
2356 int next_ip = 0;
2357
2358 calculate_live_intervals();
2359
2360 foreach_list_safe(node, &this->instructions) {
2361 fs_inst *inst = (fs_inst *)node;
2362
2363 int ip = next_ip;
2364 next_ip++;
2365
2366 if (inst->opcode != BRW_OPCODE_MOV ||
2367 inst->is_partial_write() ||
2368 inst->dst.file != MRF || inst->src[0].file != GRF ||
2369 inst->dst.type != inst->src[0].type ||
2370 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2371 continue;
2372
2373 /* Work out which hardware MRF registers are written by this
2374 * instruction.
2375 */
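/* With COMPR4 addressing, a compressed write lands in mrf_low and
 * mrf_low + 4; an ordinary compressed (16-wide) write covers two
 * adjacent MRFs; otherwise only one MRF is written.
 */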
2376 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2377 int mrf_high;
2378 if (inst->dst.reg & BRW_MRF_COMPR4) {
2379 mrf_high = mrf_low + 4;
2380 } else if (dispatch_width == 16 &&
2381 (!inst->force_uncompressed && !inst->force_sechalf)) {
2382 mrf_high = mrf_low + 1;
2383 } else {
2384 mrf_high = mrf_low;
2385 }
2386
2387 /* Can't compute-to-MRF this GRF if someone else was going to
2388 * read it later.
2389 */
2390 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2391 continue;
2392
2393 /* Found a move of a GRF to a MRF. Let's see if we can go
2394 * rewrite the thing that made this GRF to write into the MRF.
2395 */
2396 fs_inst *scan_inst;
2397 for (scan_inst = (fs_inst *)inst->prev;
2398 scan_inst->prev != NULL;
2399 scan_inst = (fs_inst *)scan_inst->prev) {
2400 if (scan_inst->dst.file == GRF &&
2401 scan_inst->dst.reg == inst->src[0].reg) {
2402 /* Found the last thing to write our reg we want to turn
2403 * into a compute-to-MRF.
2404 */
2405
2406 /* If this one instruction didn't populate all the
2407 * channels, bail. We might be able to rewrite everything
2408 * that writes that reg, but it would require smarter
2409 * tracking to delay the rewriting until complete success.
2410 */
2411 if (scan_inst->is_partial_write())
2412 break;
2413
2414 /* Things returning more than one register would need us to
2415 * understand coalescing out more than one MOV at a time.
2416 */
2417 if (scan_inst->regs_written > 1)
2418 break;
2419
2420 /* SEND instructions can't have MRF as a destination. */
2421 if (scan_inst->mlen)
2422 break;
2423
2424 if (brw->gen == 6) {
2425 /* gen6 math instructions must have the destination be
2426 * GRF, so no compute-to-MRF for them.
2427 */
2428 if (scan_inst->is_math()) {
2429 break;
2430 }
2431 }
2432
2433 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2434 /* Found the creator of our MRF's source value. */
2435 scan_inst->dst.file = MRF;
2436 scan_inst->dst.reg = inst->dst.reg;
2437 scan_inst->saturate |= inst->saturate;
2438 inst->remove();
2439 progress = true;
2440 }
2441 break;
2442 }
2443
2444 /* We don't handle control flow here. Most computation of
2445 * values that end up in MRFs is shortly before the MRF
2446 * write anyway.
2447 */
2448 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2449 break;
2450
2451 /* You can't read from an MRF, so if someone else reads our
2452 * MRF's source GRF that we wanted to rewrite, that stops us.
2453 */
2454 bool interfered = false;
2455 for (int i = 0; i < 3; i++) {
2456 if (scan_inst->src[i].file == GRF &&
2457 scan_inst->src[i].reg == inst->src[0].reg &&
2458 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2459 interfered = true;
2460 }
2461 }
2462 if (interfered)
2463 break;
2464
2465 if (scan_inst->dst.file == MRF) {
2466 /* If somebody else writes our MRF here, we can't
2467 * compute-to-MRF before that.
2468 */
2469 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2470 int scan_mrf_high;
2471
2472 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2473 scan_mrf_high = scan_mrf_low + 4;
2474 } else if (dispatch_width == 16 &&
2475 (!scan_inst->force_uncompressed &&
2476 !scan_inst->force_sechalf)) {
2477 scan_mrf_high = scan_mrf_low + 1;
2478 } else {
2479 scan_mrf_high = scan_mrf_low;
2480 }
2481
2482 if (mrf_low == scan_mrf_low ||
2483 mrf_low == scan_mrf_high ||
2484 mrf_high == scan_mrf_low ||
2485 mrf_high == scan_mrf_high) {
2486 break;
2487 }
2488 }
2489
2490 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2491 /* Found a SEND instruction, which means that there are
2492 * live values in MRFs from base_mrf to base_mrf +
2493 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2494 * above it.
2495 */
2496 if (mrf_low >= scan_inst->base_mrf &&
2497 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2498 break;
2499 }
2500 if (mrf_high >= scan_inst->base_mrf &&
2501 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2502 break;
2503 }
2504 }
2505 }
2506 }
2507
2508 if (progress)
2509 invalidate_live_intervals();
2510
2511 return progress;
2512 }
2513
2514 /**
2515 * Walks through basic blocks, looking for repeated MRF writes and
2516 * removing the later ones.
2517 */
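/* Illustrative IR, within one basic block:
 *
 *    mov m2, vgrf1
 *    mov m2, vgrf1   <- removed
 */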
2518 bool
2519 fs_visitor::remove_duplicate_mrf_writes()
2520 {
2521 fs_inst *last_mrf_move[16];
2522 bool progress = false;
2523
2524 /* The MRF tracking below doesn't handle compressed instructions, so skip 16-wide dispatch. */
2525 if (dispatch_width == 16)
2526 return false;
2527
2528 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2529
2530 foreach_list_safe(node, &this->instructions) {
2531 fs_inst *inst = (fs_inst *)node;
2532
2533 if (inst->is_control_flow()) {
2534 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2535 }
2536
2537 if (inst->opcode == BRW_OPCODE_MOV &&
2538 inst->dst.file == MRF) {
2539 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2540 if (prev_inst && inst->equals(prev_inst)) {
2541 inst->remove();
2542 progress = true;
2543 continue;
2544 }
2545 }
2546
2547 /* Clear out the last-write records for MRFs that were overwritten. */
2548 if (inst->dst.file == MRF) {
2549 last_mrf_move[inst->dst.reg] = NULL;
2550 }
2551
2552 if (inst->mlen > 0 && inst->base_mrf != -1) {
2553 /* Found a SEND instruction, which will include two or fewer
2554 * implied MRF writes. We could do better here.
2555 */
2556 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2557 last_mrf_move[inst->base_mrf + i] = NULL;
2558 }
2559 }
2560
2561 /* Clear out any MRF move records whose sources got overwritten. */
2562 if (inst->dst.file == GRF) {
2563 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2564 if (last_mrf_move[i] &&
2565 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2566 last_mrf_move[i] = NULL;
2567 }
2568 }
2569 }
2570
2571 if (inst->opcode == BRW_OPCODE_MOV &&
2572 inst->dst.file == MRF &&
2573 inst->src[0].file == GRF &&
2574 !inst->is_partial_write()) {
2575 last_mrf_move[inst->dst.reg] = inst;
2576 }
2577 }
2578
2579 if (progress)
2580 invalidate_live_intervals();
2581
2582 return progress;
2583 }
2584
2585 static void
2586 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2587 int first_grf, int grf_len)
2588 {
2589 bool inst_16wide = (dispatch_width > 8 &&
2590 !inst->force_uncompressed &&
2591 !inst->force_sechalf);
2592
2593 /* Clear the flag for registers that actually got read (as expected). */
2594 for (int i = 0; i < 3; i++) {
2595 int grf;
2596 if (inst->src[i].file == GRF) {
2597 grf = inst->src[i].reg;
2598 } else if (inst->src[i].file == HW_REG &&
2599 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2600 grf = inst->src[i].fixed_hw_reg.nr;
2601 } else {
2602 continue;
2603 }
2604
2605 if (grf >= first_grf &&
2606 grf < first_grf + grf_len) {
2607 deps[grf - first_grf] = false;
2608 if (inst_16wide)
2609 deps[grf - first_grf + 1] = false;
2610 }
2611 }
2612 }
2613
2614 /**
2615 * Implements this workaround for the original 965:
2616 *
2617 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2618 * check for post destination dependencies on this instruction, software
2619 * must ensure that there is no destination hazard for the case of ‘write
2620 * followed by a posted write’ shown in the following example.
2621 *
2622 * 1. mov r3 0
2623 * 2. send r3.xy <rest of send instruction>
2624 * 3. mov r2 r3
2625 *
2626 * Due to no post-destination dependency check on the ‘send’, the above
2627 * code sequence could have two instructions (1 and 2) in flight at the
2628 * same time that both consider ‘r3’ as the target of their final writes."
2629 */
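/* We resolve the hazard by inserting dependency-resolving MOVs
 * (DEP_RESOLVE_MOV) on the affected registers before our instruction, so
 * any outstanding write must land first.
 */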
2630 void
2631 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2632 {
2633 int reg_size = dispatch_width / 8;
2634 int write_len = inst->regs_written * reg_size;
2635 int first_write_grf = inst->dst.reg;
2636 bool needs_dep[BRW_MAX_MRF];
2637 assert(write_len < (int)sizeof(needs_dep) - 1);
2638
2639 memset(needs_dep, false, sizeof(needs_dep));
2640 memset(needs_dep, true, write_len);
2641
2642 clear_deps_for_inst_src(inst, dispatch_width,
2643 needs_dep, first_write_grf, write_len);
2644
2645 /* Walk backwards looking for writes to registers we're writing which
2646 * aren't read since being written. If we hit the start of the program,
2647 * we assume that there are no outstanding dependencies on entry to the
2648 * program.
2649 */
2650 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2651 scan_inst != NULL;
2652 scan_inst = (fs_inst *)scan_inst->prev) {
2653
2654 /* If we hit control flow, assume that there *are* outstanding
2655 * dependencies, and force their cleanup before our instruction.
2656 */
2657 if (scan_inst->is_control_flow()) {
2658 for (int i = 0; i < write_len; i++) {
2659 if (needs_dep[i]) {
2660 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2661 }
2662 }
2663 return;
2664 }
2665
2666 bool scan_inst_16wide = (dispatch_width > 8 &&
2667 !scan_inst->force_uncompressed &&
2668 !scan_inst->force_sechalf);
2669
2670 /* We insert our reads as late as possible on the assumption that any
2671 * instruction but a MOV that might have left us an outstanding
2672 * dependency has more latency than a MOV.
2673 */
2674 if (scan_inst->dst.file == GRF) {
2675 for (int i = 0; i < scan_inst->regs_written; i++) {
2676 int reg = scan_inst->dst.reg + i * reg_size;
2677
2678 if (reg >= first_write_grf &&
2679 reg < first_write_grf + write_len &&
2680 needs_dep[reg - first_write_grf]) {
2681 inst->insert_before(DEP_RESOLVE_MOV(reg));
2682 needs_dep[reg - first_write_grf] = false;
2683 if (scan_inst_16wide)
2684 needs_dep[reg - first_write_grf + 1] = false;
2685 }
2686 }
2687 }
2688
2689 /* Clear the flag for registers that actually got read (as expected). */
2690 clear_deps_for_inst_src(scan_inst, dispatch_width,
2691 needs_dep, first_write_grf, write_len);
2692
2693 /* Continue the loop only if we haven't resolved all the dependencies */
2694 int i;
2695 for (i = 0; i < write_len; i++) {
2696 if (needs_dep[i])
2697 break;
2698 }
2699 if (i == write_len)
2700 return;
2701 }
2702 }
2703
2704 /**
2705 * Implements this workaround for the original 965:
2706 *
2707 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2708 * used as a destination register until after it has been sourced by an
2709 * instruction with a different destination register."
2710 */
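/* Resolved like the pre-send case: walk forward from the send and insert
 * dependency-resolving MOVs that source its destination registers before
 * anything overwrites them.
 */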
2711 void
2712 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2713 {
2714 int write_len = inst->regs_written * dispatch_width / 8;
2715 int first_write_grf = inst->dst.reg;
2716 bool needs_dep[BRW_MAX_MRF];
2717 assert(write_len < (int)sizeof(needs_dep) - 1);
2718
2719 memset(needs_dep, false, sizeof(needs_dep));
2720 memset(needs_dep, true, write_len);
2721 /* Walk forwards looking for writes to registers we're writing which aren't
2722 * read before being written.
2723 */
2724 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2725 !scan_inst->is_tail_sentinel();
2726 scan_inst = (fs_inst *)scan_inst->next) {
2727 /* If we hit control flow, force resolve all remaining dependencies. */
2728 if (scan_inst->is_control_flow()) {
2729 for (int i = 0; i < write_len; i++) {
2730 if (needs_dep[i])
2731 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2732 }
2733 return;
2734 }
2735
2736 /* Clear the flag for registers that actually got read (as expected). */
2737 clear_deps_for_inst_src(scan_inst, dispatch_width,
2738 needs_dep, first_write_grf, write_len);
2739
2740 /* We insert our reads as late as possible since they're reading the
2741 * result of a SEND, which has massive latency.
2742 */
2743 if (scan_inst->dst.file == GRF &&
2744 scan_inst->dst.reg >= first_write_grf &&
2745 scan_inst->dst.reg < first_write_grf + write_len &&
2746 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2747 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2748 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2749 }
2750
2751 /* Continue the loop only if we haven't resolved all the dependencies */
2752 int i;
2753 for (i = 0; i < write_len; i++) {
2754 if (needs_dep[i])
2755 break;
2756 }
2757 if (i == write_len)
2758 return;
2759 }
2760
2761 /* If we hit the end of the program, resolve all remaining dependencies out
2762 * of paranoia.
2763 */
2764 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2765 assert(last_inst->eot);
2766 for (int i = 0; i < write_len; i++) {
2767 if (needs_dep[i])
2768 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2769 }
2770 }
2771
2772 void
2773 fs_visitor::insert_gen4_send_dependency_workarounds()
2774 {
2775 if (brw->gen != 4 || brw->is_g4x)
2776 return;
2777
2778 /* Note that we're done with register allocation, so GRF fs_regs always
2779 * have a .reg_offset of 0.
2780 */
2781
2782 foreach_list_safe(node, &this->instructions) {
2783 fs_inst *inst = (fs_inst *)node;
2784
2785 if (inst->mlen != 0 && inst->dst.file == GRF) {
2786 insert_gen4_pre_send_dependency_workarounds(inst);
2787 insert_gen4_post_send_dependency_workarounds(inst);
2788 }
2789 }
2790 }
2791
2792 /**
2793 * Turns the generic expression-style uniform pull constant load instruction
2794 * into a hardware-specific series of instructions for loading a pull
2795 * constant.
2796 *
2797 * The expression style allows the CSE pass before this to optimize out
2798 * repeated loads from the same offset, and gives the pre-register-allocation
2799 * scheduling full flexibility, while the conversion to native instructions
2800 * allows the post-register-allocation scheduler the best information
2801 * possible.
2802 *
2803 * Note that execution masking for setting up pull constant loads is special:
2804 * the channels that need to be written are unrelated to the current execution
2805 * mask, since a later instruction will use one of the result channels as a
2806 * source operand for all 8 or 16 of its channels.
2807 */
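/* Roughly (illustrative IR), on gen7:
 *
 *    uniform_pull_const_load dst, surf_index, vec4_byte_offset
 *
 * becomes:
 *
 *    set_simd4x2_offset payload, dword_offset
 *    uniform_pull_const_load_gen7 dst, surf_index, payload
 *
 * while on earlier gens the load is simply pointed at a known-safe MRF.
 */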
2808 void
2809 fs_visitor::lower_uniform_pull_constant_loads()
2810 {
2811 foreach_list(node, &this->instructions) {
2812 fs_inst *inst = (fs_inst *)node;
2813
2814 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2815 continue;
2816
2817 if (brw->gen >= 7) {
2818 /* The offset arg before was a vec4-aligned byte offset. We need to
2819 * turn it into a dword offset.
2820 */
2821 fs_reg const_offset_reg = inst->src[1];
2822 assert(const_offset_reg.file == IMM &&
2823 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2824 const_offset_reg.imm.u /= 4;
2825 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2826
2827 /* This is actually going to be a MOV, but since only the first dword
2828 * is accessed, we have a special opcode to do just that one. Note
2829 * that this needs to be an operation that will be considered a def
2830 * by live variable analysis, or register allocation will explode.
2831 */
2832 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2833 payload, const_offset_reg);
2834 setup->force_writemask_all = true;
2835
2836 setup->ir = inst->ir;
2837 setup->annotation = inst->annotation;
2838 inst->insert_before(setup);
2839
2840 /* Similarly, this will only populate the first 4 channels of the
2841 * result register (since we only use smear values from 0-3), but we
2842 * don't tell the optimizer.
2843 */
2844 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2845 inst->src[1] = payload;
2846
2847 invalidate_live_intervals();
2848 } else {
2849 /* Before register allocation, we didn't tell the scheduler about the
2850 * MRF we use. We know it's safe to use this MRF because nothing
2851 * else does except for register spill/unspill, which generates and
2852 * uses its MRF within a single IR instruction.
2853 */
2854 inst->base_mrf = 14;
2855 inst->mlen = 1;
2856 }
2857 }
2858 }
2859
2860 void
2861 fs_visitor::dump_instruction(backend_instruction *be_inst)
2862 {
2863 fs_inst *inst = (fs_inst *)be_inst;
2864
2865 if (inst->predicate) {
2866 printf("(%cf0.%d) ",
2867 inst->predicate_inverse ? '-' : '+',
2868 inst->flag_subreg);
2869 }
2870
2871 printf("%s", brw_instruction_name(inst->opcode));
2872 if (inst->saturate)
2873 printf(".sat");
2874 if (inst->conditional_mod) {
2875 printf(".cmod");
2876 if (!inst->predicate &&
2877 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2878 inst->opcode != BRW_OPCODE_IF &&
2879 inst->opcode != BRW_OPCODE_WHILE))) {
2880 printf(".f0.%d", inst->flag_subreg);
2881 }
2882 }
2883 printf(" ");
2884
2885
2886 switch (inst->dst.file) {
2887 case GRF:
2888 printf("vgrf%d", inst->dst.reg);
2889 if (inst->dst.reg_offset)
2890 printf("+%d", inst->dst.reg_offset);
2891 break;
2892 case MRF:
2893 printf("m%d", inst->dst.reg);
2894 break;
2895 case BAD_FILE:
2896 printf("(null)");
2897 break;
2898 case UNIFORM:
2899 printf("***u%d***", inst->dst.reg);
2900 break;
2901 case HW_REG:
2902 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
2903 if (inst->dst.fixed_hw_reg.subnr)
2904 printf("+%d", inst->dst.fixed_hw_reg.subnr);
2905 break;
2906 default:
2907 printf("???");
2908 break;
2909 }
2910 printf(", ");
2911
2912 for (int i = 0; i < 3; i++) {
2913 if (inst->src[i].negate)
2914 printf("-");
2915 if (inst->src[i].abs)
2916 printf("|");
2917 switch (inst->src[i].file) {
2918 case GRF:
2919 printf("vgrf%d", inst->src[i].reg);
2920 if (inst->src[i].reg_offset)
2921 printf("+%d", inst->src[i].reg_offset);
2922 break;
2923 case MRF:
2924 printf("***m%d***", inst->src[i].reg);
2925 break;
2926 case UNIFORM:
2927 printf("u%d", inst->src[i].reg);
2928 if (inst->src[i].reg_offset)
2929 printf(".%d", inst->src[i].reg_offset);
2930 break;
2931 case BAD_FILE:
2932 printf("(null)");
2933 break;
2934 case IMM:
2935 switch (inst->src[i].type) {
2936 case BRW_REGISTER_TYPE_F:
2937 printf("%ff", inst->src[i].imm.f);
2938 break;
2939 case BRW_REGISTER_TYPE_D:
2940 printf("%dd", inst->src[i].imm.i);
2941 break;
2942 case BRW_REGISTER_TYPE_UD:
2943 printf("%uu", inst->src[i].imm.u);
2944 break;
2945 default:
2946 printf("???");
2947 break;
2948 }
2949 break;
2950 case HW_REG:
2951 if (inst->src[i].fixed_hw_reg.negate)
2952 printf("-");
2953 if (inst->src[i].fixed_hw_reg.abs)
2954 printf("|");
2955 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2956 if (inst->src[i].fixed_hw_reg.subnr)
2957 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
2958 if (inst->src[i].fixed_hw_reg.abs)
2959 printf("|");
2960 break;
2961 default:
2962 printf("???");
2963 break;
2964 }
2965 if (inst->src[i].abs)
2966 printf("|");
2967
2968 if (i < 2)
2969 printf(", ");
2970 }
2971
2972 printf(" ");
2973
2974 if (inst->force_uncompressed)
2975 printf("1sthalf ");
2976
2977 if (inst->force_sechalf)
2978 printf("2ndhalf ");
2979
2980 printf("\n");
2981 }
2982
2983 /**
2984 * Possibly returns an instruction that set up @param reg.
2985 *
2986 * Sometimes we want to take the result of some expression/variable
2987 * dereference tree and rewrite the instruction generating the result
2988 * of the tree. When processing the tree, we know that the
2989 * instructions generated are all writing temporaries that are dead
2990 * outside of this tree. So, if we have some instructions that write
2991 * a temporary, we're free to point that temp write somewhere else.
2992 *
2993 * Note that this doesn't guarantee that the returned instruction wrote
2994 * only reg -- it might be the size=4 destination of a texture instruction.
2995 */
2996 fs_inst *
2997 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2998 fs_inst *end,
2999 fs_reg reg)
3000 {
3001 if (end == start ||
3002 end->is_partial_write() ||
3003 reg.reladdr ||
3004 !reg.equals(end->dst)) {
3005 return NULL;
3006 } else {
3007 return end;
3008 }
3009 }
3010
3011 void
3012 fs_visitor::setup_payload_gen6()
3013 {
3014 bool uses_depth =
3015 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3016 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3017
3018 assert(brw->gen >= 6);
3019
3020 /* R0-1: masks, pixel X/Y coordinates. */
3021 c->nr_payload_regs = 2;
3022 /* R2: only for 32-pixel dispatch. */
3023
3024 /* R3-26: barycentric interpolation coordinates. These appear in the
3025 * same order that they appear in the brw_wm_barycentric_interp_mode
3026 * enum. Each set of coordinates occupies 2 registers if dispatch width
3027 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3028 * appear if they were enabled using the "Barycentric Interpolation
3029 * Mode" bits in WM_STATE.
3030 */
3031 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3032 if (barycentric_interp_modes & (1 << i)) {
3033 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3034 c->nr_payload_regs += 2;
3035 if (dispatch_width == 16) {
3036 c->nr_payload_regs += 2;
3037 }
3038 }
3039 }
3040
3041 /* R27: interpolated depth if uses source depth */
3042 if (uses_depth) {
3043 c->source_depth_reg = c->nr_payload_regs;
3044 c->nr_payload_regs++;
3045 if (dispatch_width == 16) {
3046 /* R28: interpolated depth if not 8-wide. */
3047 c->nr_payload_regs++;
3048 }
3049 }
3050 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3051 if (uses_depth) {
3052 c->source_w_reg = c->nr_payload_regs;
3053 c->nr_payload_regs++;
3054 if (dispatch_width == 16) {
3055 /* R30: interpolated W if not 8-wide. */
3056 c->nr_payload_regs++;
3057 }
3058 }
3059 /* R31: MSAA position offsets. */
3060 /* R32-: bary for 32-pixel. */
3061 /* R58-59: interp W for 32-pixel. */
3062
3063 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3064 c->source_depth_to_render_target = true;
3065 }
3066 }
3067
3068 void
3069 fs_visitor::assign_binding_table_offsets()
3070 {
3071 uint32_t next_binding_table_offset = 0;
3072
3073 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3074 next_binding_table_offset += c->key.nr_color_regions;
3075
3076 assign_common_binding_table_offsets(next_binding_table_offset);
3077 }
3078
3079 bool
3080 fs_visitor::run()
3081 {
3082 sanity_param_count = fp->Base.Parameters->NumParameters;
3083 uint32_t orig_nr_params = c->prog_data.nr_params;
3084
3085 assign_binding_table_offsets();
3086
3087 if (brw->gen >= 6)
3088 setup_payload_gen6();
3089 else
3090 setup_payload_gen4();
3091
3092 if (0) {
3093 emit_dummy_fs();
3094 } else {
3095 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3096 emit_shader_time_begin();
3097
3098 calculate_urb_setup();
3099 if (fp->Base.InputsRead > 0) {
3100 if (brw->gen < 6)
3101 emit_interpolation_setup_gen4();
3102 else
3103 emit_interpolation_setup_gen6();
3104 }
3105
3106 /* We handle discards by keeping track of the still-live pixels in f0.1.
3107 * Initialize it with the dispatched pixels.
3108 */
3109 if (fp->UsesKill) {
3110 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3111 discard_init->flag_subreg = 1;
3112 }
3113
3114 /* Generate FS IR for main(). (The visitor only descends into
3115 * functions called "main".)
3116 */
3117 if (shader) {
3118 foreach_list(node, &*shader->ir) {
3119 ir_instruction *ir = (ir_instruction *)node;
3120 base_ir = ir;
3121 this->result = reg_undef;
3122 ir->accept(this);
3123 }
3124 } else {
3125 emit_fragment_program_code();
3126 }
3127 base_ir = NULL;
3128 if (failed)
3129 return false;
3130
3131 emit(FS_OPCODE_PLACEHOLDER_HALT);
3132
3133 emit_fb_writes();
3134
3135 split_virtual_grfs();
3136
3137 move_uniform_array_access_to_pull_constants();
3138 remove_dead_constants();
3139 setup_pull_constants();
3140
3141 bool progress;
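/* Iterate the optimization passes to a fixed point: each pass can expose
 * new opportunities for the others, so loop until none reports progress.
 */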
3142 do {
3143 progress = false;
3144
3145 compact_virtual_grfs();
3146
3147 progress = remove_duplicate_mrf_writes() || progress;
3148
3149 progress = opt_algebraic() || progress;
3150 progress = opt_cse() || progress;
3151 progress = opt_copy_propagate() || progress;
3152 progress = dead_code_eliminate() || progress;
3153 progress = dead_code_eliminate_local() || progress;
3154 progress = register_coalesce() || progress;
3155 progress = register_coalesce_2() || progress;
3156 progress = compute_to_mrf() || progress;
3157 } while (progress);
3158
3159 schedule_instructions(false);
3160
3161 lower_uniform_pull_constant_loads();
3162
3163 assign_curb_setup();
3164 assign_urb_setup();
3165
3166 if (0)
3167 assign_regs_trivial();
3168 else {
3169 while (!assign_regs()) {
3170 if (failed)
3171 break;
3172 }
3173 }
3174 }
3175 assert(force_uncompressed_stack == 0);
3176 assert(force_sechalf_stack == 0);
3177
3178 /* This must come after all optimization and register allocation, since
3179 * it inserts dead code that happens to have side effects, and it does
3180 * so based on the actual physical registers in use.
3181 */
3182 insert_gen4_send_dependency_workarounds();
3183
3184 if (failed)
3185 return false;
3186
3187 schedule_instructions(true);
3188
3189 if (dispatch_width == 8) {
3190 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3191 } else {
3192 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3193
3194 /* Make sure we didn't try to sneak in an extra uniform */
3195 assert(orig_nr_params == c->prog_data.nr_params);
3196 (void) orig_nr_params;
3197 }
3198
3199 /* If any state parameters were appended, then ParameterValues could have
3200 * been realloced, in which case the driver uniform storage set up by
3201 * _mesa_associate_uniform_storage() would point to freed memory. Make
3202 * sure that didn't happen.
3203 */
3204 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3205
3206 return !failed;
3207 }
3208
3209 const unsigned *
3210 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3211 struct gl_fragment_program *fp,
3212 struct gl_shader_program *prog,
3213 unsigned *final_assembly_size)
3214 {
3215 bool start_busy = false;
3216 float start_time = 0;
3217
3218 if (unlikely(brw->perf_debug)) {
3219 start_busy = (brw->batch.last_bo &&
3220 drm_intel_bo_busy(brw->batch.last_bo));
3221 start_time = get_time();
3222 }
3223
3224 struct brw_shader *shader = NULL;
3225 if (prog)
3226 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3227
3228 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3229 if (prog) {
3230 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3231 _mesa_print_ir(shader->ir, NULL);
3232 printf("\n\n");
3233 } else {
3234 printf("ARB_fragment_program %d ir for native fragment shader\n",
3235 fp->Base.Id);
3236 _mesa_print_program(&fp->Base);
3237 }
3238 }
3239
3240 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3241 */
3242 fs_visitor v(brw, c, prog, fp, 8);
3243 if (!v.run()) {
3244 if (prog) {
3245 prog->LinkStatus = false;
3246 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3247 }
3248
3249 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3250 v.fail_msg);
3251
3252 return NULL;
3253 }
3254
3255 exec_list *simd16_instructions = NULL;
3256 fs_visitor v2(brw, c, prog, fp, 16);
3257 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3258 if (c->prog_data.nr_pull_params == 0) {
3259 /* Try a 16-wide compile */
3260 v2.import_uniforms(&v);
3261 if (!v2.run()) {
3262 perf_debug("16-wide shader failed to compile, falling back to "
3263 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3264 } else {
3265 simd16_instructions = &v2.instructions;
3266 }
3267 } else {
3268 perf_debug("Skipping 16-wide due to pull parameters.\n");
3269 }
3270 }
3271
3272 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3273 const unsigned *generated = g.generate_assembly(&v.instructions,
3274 simd16_instructions,
3275 final_assembly_size);
3276
3277 if (unlikely(brw->perf_debug) && shader) {
3278 if (shader->compiled_once)
3279 brw_wm_debug_recompile(brw, prog, &c->key);
3280 shader->compiled_once = true;
3281
3282 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3283 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3284 (get_time() - start_time) * 1000);
3285 }
3286 }
3287
3288 return generated;
3289 }
3290
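/* Precompiles the fragment shader with a guessed (default) program key so
 * a likely variant is built at link time; the per-state compile at draw
 * time still happens if the real key differs. The previous program state
 * is saved and restored around the compile.
 */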
3291 bool
3292 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3293 {
3294 struct brw_context *brw = brw_context(ctx);
3295 struct brw_wm_prog_key key;
3296
3297 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3298 return true;
3299
3300 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3301 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3302 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3303 bool program_uses_dfdy = fp->UsesDFdy;
3304
3305 memset(&key, 0, sizeof(key));
3306
3307 if (brw->gen < 6) {
3308 if (fp->UsesKill)
3309 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3310
3311 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3312 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3313
3314 /* Just assume depth testing. */
3315 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3316 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3317 }
3318
3319 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3320 BRW_FS_VARYING_INPUT_MASK) > 16)
3321 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3322
3323 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3324
3325 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3326 for (unsigned i = 0; i < sampler_count; i++) {
3327 if (fp->Base.ShadowSamplers & (1 << i)) {
3328 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3329 key.tex.swizzles[i] =
3330 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3331 } else {
3332 /* Color sampler: assume no swizzling. */
3333 key.tex.swizzles[i] = SWIZZLE_XYZW;
3334 }
3335 }
3336
3337 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3338 key.drawable_height = ctx->DrawBuffer->Height;
3339 }
3340
3341 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3342 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3343 }
3344
3345 key.nr_color_regions = 1;
3346
3347 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3348 * quality of the derivatives is likely to be determined by the driconf
3349 * option.
3350 */
3351 key.high_quality_derivatives = brw->disable_derivative_optimization;
3352
3353 key.program_string_id = bfp->id;
3354
3355 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3356 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3357
3358 bool success = do_wm_prog(brw, prog, bfp, &key);
3359
3360 brw->wm.base.prog_offset = old_prog_offset;
3361 brw->wm.prog_data = old_prog_data;
3362
3363 return success;
3364 }