i965/fs: Optimize OR with identical sources into a MOV.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "main/uniforms.h"
50 #include "brw_fs_live_variables.h"
51 #include "glsl/glsl_types.h"
52
53 void
54 fs_inst::init()
55 {
56 memset(this, 0, sizeof(*this));
57 this->opcode = BRW_OPCODE_NOP;
58 this->conditional_mod = BRW_CONDITIONAL_NONE;
59
60 this->dst = reg_undef;
61 this->src[0] = reg_undef;
62 this->src[1] = reg_undef;
63 this->src[2] = reg_undef;
64
65 /* This will be the case for almost all instructions. */
66 this->regs_written = 1;
67 }
68
69 fs_inst::fs_inst()
70 {
71 init();
72 }
73
74 fs_inst::fs_inst(enum opcode opcode)
75 {
76 init();
77 this->opcode = opcode;
78 }
79
80 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
81 {
82 init();
83 this->opcode = opcode;
84 this->dst = dst;
85
86 if (dst.file == GRF)
87 assert(dst.reg_offset >= 0);
88 }
89
90 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
91 {
92 init();
93 this->opcode = opcode;
94 this->dst = dst;
95 this->src[0] = src0;
96
97 if (dst.file == GRF)
98 assert(dst.reg_offset >= 0);
99 if (src[0].file == GRF)
100 assert(src[0].reg_offset >= 0);
101 }
102
103 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
104 {
105 init();
106 this->opcode = opcode;
107 this->dst = dst;
108 this->src[0] = src0;
109 this->src[1] = src1;
110
111 if (dst.file == GRF)
112 assert(dst.reg_offset >= 0);
113 if (src[0].file == GRF)
114 assert(src[0].reg_offset >= 0);
115 if (src[1].file == GRF)
116 assert(src[1].reg_offset >= 0);
117 }
118
119 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
120 fs_reg src0, fs_reg src1, fs_reg src2)
121 {
122 init();
123 this->opcode = opcode;
124 this->dst = dst;
125 this->src[0] = src0;
126 this->src[1] = src1;
127 this->src[2] = src2;
128
129 if (dst.file == GRF)
130 assert(dst.reg_offset >= 0);
131 if (src[0].file == GRF)
132 assert(src[0].reg_offset >= 0);
133 if (src[1].file == GRF)
134 assert(src[1].reg_offset >= 0);
135 if (src[2].file == GRF)
136 assert(src[2].reg_offset >= 0);
137 }
138
139 #define ALU1(op) \
140 fs_inst * \
141 fs_visitor::op(fs_reg dst, fs_reg src0) \
142 { \
143 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
144 }
145
146 #define ALU2(op) \
147 fs_inst * \
148 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
149 { \
150 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
151 }
152
153 #define ALU3(op) \
154 fs_inst * \
155 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
156 { \
157 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
158 }
159
160 ALU1(NOT)
161 ALU1(MOV)
162 ALU1(FRC)
163 ALU1(RNDD)
164 ALU1(RNDE)
165 ALU1(RNDZ)
166 ALU2(ADD)
167 ALU2(MUL)
168 ALU2(MACH)
169 ALU2(AND)
170 ALU2(OR)
171 ALU2(XOR)
172 ALU2(SHL)
173 ALU2(SHR)
174 ALU2(ASR)
175 ALU3(LRP)
176 ALU1(BFREV)
177 ALU3(BFE)
178 ALU2(BFI1)
179 ALU3(BFI2)
180 ALU1(FBH)
181 ALU1(FBL)
182 ALU1(CBIT)
183 ALU3(MAD)
184 ALU2(ADDC)
185 ALU2(SUBB)
186
187 /** Gen4 predicated IF. */
188 fs_inst *
189 fs_visitor::IF(uint32_t predicate)
190 {
191 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
192 inst->predicate = predicate;
193 return inst;
194 }
195
196 /** Gen6+ IF with embedded comparison. */
197 fs_inst *
198 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
199 {
200 assert(brw->gen >= 6);
201 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
202 reg_null_d, src0, src1);
203 inst->conditional_mod = condition;
204 return inst;
205 }
206
207 /**
208 * CMP: Sets the low bit of the destination channels with the result
209 * of the comparison, while the upper bits are undefined, and updates
210 * the flag register with the packed 16 bits of the result.
211 */
212 fs_inst *
213 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
214 {
215 fs_inst *inst;
216
217 /* Take the instruction:
218 *
219 * CMP null<d> src0<f> src1<f>
220 *
221 * Original gen4 does type conversion to the destination type before
222 * comparison, producing garbage results for floating point comparisons.
223 * gen5 does the comparison on the execution type (resolved source types),
224 * so dst type doesn't matter. gen6 does comparison and then uses the
225 * result as if it was the dst type with no conversion, which happens to
226 * mostly work out for float-interpreted-as-int since our comparisons are
227 * for >0, =0, <0.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
244 exec_list
245 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
246 fs_reg varying_offset,
247 uint32_t const_offset)
248 {
249 exec_list instructions;
250 fs_inst *inst;
251
252 /* We have our constant surface use a pitch of 4 bytes, so our index can
253 * be any component of a vector, and then we load 4 contiguous
254 * components starting from that.
255 *
256 * We break down the const_offset to a portion added to the variable
257 * offset and a portion done using reg_offset, which means that if you
258 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
259 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
260 * CSE can later notice that those loads are all the same and eliminate
261 * the redundant ones.
262 */
263 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
264 instructions.push_tail(ADD(vec4_offset,
265 varying_offset, const_offset & ~3));
266
267 int scale = 1;
268 if (brw->gen == 4 && dispatch_width == 8) {
269 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
270 * u, v, r) as parameters, or we can just use the SIMD16 message
271 * consisting of (header, u). We choose the second, at the cost of a
272 * longer return length.
273 */
274 scale = 2;
275 }
276
277 enum opcode op;
278 if (brw->gen >= 7)
279 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
280 else
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
282 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
283 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
284 inst->regs_written = 4 * scale;
285 instructions.push_tail(inst);
286
287 if (brw->gen < 7) {
288 inst->base_mrf = 13;
289 inst->header_present = true;
290 if (brw->gen == 4)
291 inst->mlen = 3;
292 else
293 inst->mlen = 1 + dispatch_width / 8;
294 }
295
296 vec4_result.reg_offset += (const_offset & 3) * scale;
297 instructions.push_tail(MOV(dst, vec4_result));
298
299 return instructions;
300 }
301
302 /**
303 * A helper for MOV generation for fixing up broken hardware SEND dependency
304 * handling.
305 */
306 fs_inst *
307 fs_visitor::DEP_RESOLVE_MOV(int grf)
308 {
309 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
310
311 inst->ir = NULL;
312 inst->annotation = "send dependency resolve";
313
314 /* The caller always wants uncompressed to emit the minimal extra
315 * dependencies, and to avoid having to deal with aligning its regs to 2.
316 */
317 inst->force_uncompressed = true;
318
319 return inst;
320 }
321
322 bool
323 fs_inst::equals(fs_inst *inst)
324 {
325 return (opcode == inst->opcode &&
326 dst.equals(inst->dst) &&
327 src[0].equals(inst->src[0]) &&
328 src[1].equals(inst->src[1]) &&
329 src[2].equals(inst->src[2]) &&
330 saturate == inst->saturate &&
331 predicate == inst->predicate &&
332 conditional_mod == inst->conditional_mod &&
333 mlen == inst->mlen &&
334 base_mrf == inst->base_mrf &&
335 sampler == inst->sampler &&
336 target == inst->target &&
337 eot == inst->eot &&
338 header_present == inst->header_present &&
339 shadow_compare == inst->shadow_compare &&
340 offset == inst->offset);
341 }
342
343 bool
344 fs_inst::overwrites_reg(const fs_reg &reg)
345 {
346 return (reg.file == dst.file &&
347 reg.reg == dst.reg &&
348 reg.reg_offset >= dst.reg_offset &&
349 reg.reg_offset < dst.reg_offset + regs_written);
350 }
351
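/**
 * Returns true if this instruction is a SEND message whose payload is
 * sourced directly from the GRF rather than from MRFs.
 */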
352 bool
353 fs_inst::is_send_from_grf()
354 {
355 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
356 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
357 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
358 src[1].file == GRF) ||
359 (is_tex() && src[0].file == GRF));
360 }
361
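/**
 * Returns true if source modifiers (negate/abs) may be applied to the
 * sources of the given instruction on this hardware generation.
 */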
362 bool
363 fs_visitor::can_do_source_mods(fs_inst *inst)
364 {
365 if (brw->gen == 6 && inst->is_math())
366 return false;
367
368 if (inst->is_send_from_grf())
369 return false;
370
371 if (!inst->can_do_source_mods())
372 return false;
373
374 return true;
375 }
376
377 void
378 fs_reg::init()
379 {
380 memset(this, 0, sizeof(*this));
381 this->smear = -1;
382 }
383
384 /** Generic unset register constructor. */
385 fs_reg::fs_reg()
386 {
387 init();
388 this->file = BAD_FILE;
389 }
390
391 /** Immediate value constructor. */
392 fs_reg::fs_reg(float f)
393 {
394 init();
395 this->file = IMM;
396 this->type = BRW_REGISTER_TYPE_F;
397 this->imm.f = f;
398 }
399
400 /** Immediate value constructor. */
401 fs_reg::fs_reg(int32_t i)
402 {
403 init();
404 this->file = IMM;
405 this->type = BRW_REGISTER_TYPE_D;
406 this->imm.i = i;
407 }
408
409 /** Immediate value constructor. */
410 fs_reg::fs_reg(uint32_t u)
411 {
412 init();
413 this->file = IMM;
414 this->type = BRW_REGISTER_TYPE_UD;
415 this->imm.u = u;
416 }
417
418 /** Fixed brw_reg Immediate value constructor. */
419 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
420 {
421 init();
422 this->file = HW_REG;
423 this->fixed_hw_reg = fixed_hw_reg;
424 this->type = fixed_hw_reg.type;
425 }
426
427 bool
428 fs_reg::equals(const fs_reg &r) const
429 {
430 return (file == r.file &&
431 reg == r.reg &&
432 reg_offset == r.reg_offset &&
433 type == r.type &&
434 negate == r.negate &&
435 abs == r.abs &&
436 !reladdr && !r.reladdr &&
437 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
438 sizeof(fixed_hw_reg)) == 0 &&
439 smear == r.smear &&
440 imm.u == r.imm.u);
441 }
442
443 fs_reg
444 fs_reg::retype(uint32_t type)
445 {
446 fs_reg result = *this;
447 result.type = type;
448 return result;
449 }
450
451 bool
452 fs_reg::is_zero() const
453 {
454 if (file != IMM)
455 return false;
456
457 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
458 }
459
460 bool
461 fs_reg::is_one() const
462 {
463 if (file != IMM)
464 return false;
465
466 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
467 }
468
469 bool
470 fs_reg::is_null() const
471 {
472 return file == HW_REG &&
473 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
474 fixed_hw_reg.nr == BRW_ARF_NULL;
475 }
476
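/**
 * Returns true if this register is allowed as an operand of a three-source
 * instruction (only the GRF and UNIFORM files qualify).
 */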
477 bool
478 fs_reg::is_valid_3src() const
479 {
480 return file == GRF || file == UNIFORM;
481 }
482
483 int
484 fs_visitor::type_size(const struct glsl_type *type)
485 {
486 unsigned int size, i;
487
488 switch (type->base_type) {
489 case GLSL_TYPE_UINT:
490 case GLSL_TYPE_INT:
491 case GLSL_TYPE_FLOAT:
492 case GLSL_TYPE_BOOL:
493 return type->components();
494 case GLSL_TYPE_ARRAY:
495 return type_size(type->fields.array) * type->length;
496 case GLSL_TYPE_STRUCT:
497 size = 0;
498 for (i = 0; i < type->length; i++) {
499 size += type_size(type->fields.structure[i].type);
500 }
501 return size;
502 case GLSL_TYPE_SAMPLER:
503 /* Samplers take up no register space, since they're baked in at
504 * link time.
505 */
506 return 0;
507 case GLSL_TYPE_ATOMIC_UINT:
508 return 0;
509 case GLSL_TYPE_VOID:
510 case GLSL_TYPE_ERROR:
511 case GLSL_TYPE_INTERFACE:
512 assert(!"not reached");
513 break;
514 }
515
516 return 0;
517 }
518
519 fs_reg
520 fs_visitor::get_timestamp()
521 {
522 assert(brw->gen >= 7);
523
524 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
525 BRW_ARF_TIMESTAMP,
526 0),
527 BRW_REGISTER_TYPE_UD));
528
529 fs_reg dst = fs_reg(this, glsl_type::uint_type);
530
531 fs_inst *mov = emit(MOV(dst, ts));
532 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
533 * even if it's not enabled in the dispatch.
534 */
535 mov->force_writemask_all = true;
536 mov->force_uncompressed = true;
537
538 /* The caller wants the low 32 bits of the timestamp. Since it's running
539 * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
540 * which is plenty of time for our purposes. It is identical across the
541 * EUs, but since it's tracking GPU core speed it will increment at a
542 * varying rate as render P-states change.
543 *
544 * The caller could also check if render P-states have changed (or anything
545 * else that might disrupt timing) by setting smear to 2 and checking if
546 * that field is != 0.
547 */
548 dst.smear = 0;
549
550 return dst;
551 }
552
553 void
554 fs_visitor::emit_shader_time_begin()
555 {
556 current_annotation = "shader time start";
557 shader_start_time = get_timestamp();
558 }
559
560 void
561 fs_visitor::emit_shader_time_end()
562 {
563 current_annotation = "shader time end";
564
565 enum shader_time_shader_type type, written_type, reset_type;
566 if (dispatch_width == 8) {
567 type = ST_FS8;
568 written_type = ST_FS8_WRITTEN;
569 reset_type = ST_FS8_RESET;
570 } else {
571 assert(dispatch_width == 16);
572 type = ST_FS16;
573 written_type = ST_FS16_WRITTEN;
574 reset_type = ST_FS16_RESET;
575 }
576
577 fs_reg shader_end_time = get_timestamp();
578
579 /* Check that there weren't any timestamp reset events (assuming these
580 * were the only two timestamp reads that happened).
581 */
582 fs_reg reset = shader_end_time;
583 reset.smear = 2;
584 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
585 test->conditional_mod = BRW_CONDITIONAL_Z;
586 emit(IF(BRW_PREDICATE_NORMAL));
587
588 push_force_uncompressed();
589 fs_reg start = shader_start_time;
590 start.negate = true;
591 fs_reg diff = fs_reg(this, glsl_type::uint_type);
592 emit(ADD(diff, start, shader_end_time));
593
594 /* If there were no instructions between the two timestamp gets, the diff
595 * is 2 cycles. Remove that overhead, so I can forget about that when
596 * trying to determine the time taken for single instructions.
597 */
598 emit(ADD(diff, diff, fs_reg(-2u)));
599
600 emit_shader_time_write(type, diff);
601 emit_shader_time_write(written_type, fs_reg(1u));
602 emit(BRW_OPCODE_ELSE);
603 emit_shader_time_write(reset_type, fs_reg(1u));
604 emit(BRW_OPCODE_ENDIF);
605
606 pop_force_uncompressed();
607 }
608
609 void
610 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
611 fs_reg value)
612 {
613 int shader_time_index =
614 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
615 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
616
617 fs_reg payload;
618 if (dispatch_width == 8)
619 payload = fs_reg(this, glsl_type::uvec2_type);
620 else
621 payload = fs_reg(this, glsl_type::uint_type);
622
623 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
624 fs_reg(), payload, offset, value));
625 }
626
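/**
 * Marks the compile as failed and records a printf-formatted message
 * describing why.  Only the first failure message is kept.
 */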
627 void
628 fs_visitor::fail(const char *format, ...)
629 {
630 va_list va;
631 char *msg;
632
633 if (failed)
634 return;
635
636 failed = true;
637
638 va_start(va, format);
639 msg = ralloc_vasprintf(mem_ctx, format, va);
640 va_end(va);
641 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
642
643 this->fail_msg = msg;
644
645 if (INTEL_DEBUG & DEBUG_WM) {
646 fprintf(stderr, "%s", msg);
647 }
648 }
649
650 fs_inst *
651 fs_visitor::emit(enum opcode opcode)
652 {
653 return emit(fs_inst(opcode));
654 }
655
656 fs_inst *
657 fs_visitor::emit(enum opcode opcode, fs_reg dst)
658 {
659 return emit(fs_inst(opcode, dst));
660 }
661
662 fs_inst *
663 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
664 {
665 return emit(fs_inst(opcode, dst, src0));
666 }
667
668 fs_inst *
669 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
670 {
671 return emit(fs_inst(opcode, dst, src0, src1));
672 }
673
674 fs_inst *
675 fs_visitor::emit(enum opcode opcode, fs_reg dst,
676 fs_reg src0, fs_reg src1, fs_reg src2)
677 {
678 return emit(fs_inst(opcode, dst, src0, src1, src2));
679 }
680
681 void
682 fs_visitor::push_force_uncompressed()
683 {
684 force_uncompressed_stack++;
685 }
686
687 void
688 fs_visitor::pop_force_uncompressed()
689 {
690 force_uncompressed_stack--;
691 assert(force_uncompressed_stack >= 0);
692 }
693
694 void
695 fs_visitor::push_force_sechalf()
696 {
697 force_sechalf_stack++;
698 }
699
700 void
701 fs_visitor::pop_force_sechalf()
702 {
703 force_sechalf_stack--;
704 assert(force_sechalf_stack >= 0);
705 }
706
707 /**
708 * Returns true if the instruction has a flag that means it won't
709 * update an entire destination register.
710 *
711 * For example, dead code elimination and live variable analysis want to know
712 * when a write to a variable screens off any preceding values that were in
713 * it.
714 */
715 bool
716 fs_inst::is_partial_write()
717 {
718 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
719 this->force_uncompressed ||
720 this->force_sechalf);
721 }
722
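/**
 * Returns how many registers the instruction reads through the given source.
 *
 * Texture messages sent from the GRF read their whole payload through
 * src[0]; every other source reads a single register.
 */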
723 int
724 fs_inst::regs_read(fs_visitor *v, int arg)
725 {
726 if (is_tex() && arg == 0 && src[0].file == GRF) {
727 if (v->dispatch_width == 16)
728 return (mlen + 1) / 2;
729 else
730 return mlen;
731 }
732 return 1;
733 }
734
735 bool
736 fs_inst::reads_flag()
737 {
738 return predicate;
739 }
740
741 bool
742 fs_inst::writes_flag()
743 {
744 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
745 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
746 }
747
748 /**
749 * Returns how many MRFs an FS opcode will write over.
750 *
751 * Note that this is not the 0 or 1 implied writes in an actual gen
752 * instruction -- the FS opcodes often generate MOVs in addition.
753 */
754 int
755 fs_visitor::implied_mrf_writes(fs_inst *inst)
756 {
757 if (inst->mlen == 0)
758 return 0;
759
760 if (inst->base_mrf == -1)
761 return 0;
762
763 switch (inst->opcode) {
764 case SHADER_OPCODE_RCP:
765 case SHADER_OPCODE_RSQ:
766 case SHADER_OPCODE_SQRT:
767 case SHADER_OPCODE_EXP2:
768 case SHADER_OPCODE_LOG2:
769 case SHADER_OPCODE_SIN:
770 case SHADER_OPCODE_COS:
771 return 1 * dispatch_width / 8;
772 case SHADER_OPCODE_POW:
773 case SHADER_OPCODE_INT_QUOTIENT:
774 case SHADER_OPCODE_INT_REMAINDER:
775 return 2 * dispatch_width / 8;
776 case SHADER_OPCODE_TEX:
777 case FS_OPCODE_TXB:
778 case SHADER_OPCODE_TXD:
779 case SHADER_OPCODE_TXF:
780 case SHADER_OPCODE_TXF_MS:
781 case SHADER_OPCODE_TG4:
782 case SHADER_OPCODE_TG4_OFFSET:
783 case SHADER_OPCODE_TXL:
784 case SHADER_OPCODE_TXS:
785 case SHADER_OPCODE_LOD:
786 return 1;
787 case FS_OPCODE_FB_WRITE:
788 return 2;
789 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
790 case SHADER_OPCODE_GEN4_SCRATCH_READ:
791 return 1;
792 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
793 return inst->mlen;
794 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
795 return 2;
796 case SHADER_OPCODE_UNTYPED_ATOMIC:
797 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
798 return 0;
799 default:
800 assert(!"not reached");
801 return inst->mlen;
802 }
803 }
804
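/**
 * Allocates a new virtual GRF of the given size and returns its index,
 * growing the size-tracking array as needed.
 */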
805 int
806 fs_visitor::virtual_grf_alloc(int size)
807 {
808 if (virtual_grf_array_size <= virtual_grf_count) {
809 if (virtual_grf_array_size == 0)
810 virtual_grf_array_size = 16;
811 else
812 virtual_grf_array_size *= 2;
813 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
814 virtual_grf_array_size);
815 }
816 virtual_grf_sizes[virtual_grf_count] = size;
817 return virtual_grf_count++;
818 }
819
820 /** Fixed HW reg constructor. */
821 fs_reg::fs_reg(enum register_file file, int reg)
822 {
823 init();
824 this->file = file;
825 this->reg = reg;
826 this->type = BRW_REGISTER_TYPE_F;
827 }
828
829 /** Fixed HW reg constructor. */
830 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
831 {
832 init();
833 this->file = file;
834 this->reg = reg;
835 this->type = type;
836 }
837
838 /** Automatic reg constructor. */
839 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
840 {
841 init();
842
843 this->file = GRF;
844 this->reg = v->virtual_grf_alloc(v->type_size(type));
845 this->reg_offset = 0;
846 this->type = brw_type_for_base_type(type);
847 }
848
849 fs_reg *
850 fs_visitor::variable_storage(ir_variable *var)
851 {
852 return (fs_reg *)hash_table_find(this->variable_ht, var);
853 }
854
855 void
856 import_uniforms_callback(const void *key,
857 void *data,
858 void *closure)
859 {
860 struct hash_table *dst_ht = (struct hash_table *)closure;
861 const fs_reg *reg = (const fs_reg *)data;
862
863 if (reg->file != UNIFORM)
864 return;
865
866 hash_table_insert(dst_ht, data, key);
867 }
868
869 /* For 16-wide, we need to follow the uniform setup of the 8-wide dispatch.
870 * This brings in those uniform definitions.
871 */
872 void
873 fs_visitor::import_uniforms(fs_visitor *v)
874 {
875 hash_table_call_foreach(v->variable_ht,
876 import_uniforms_callback,
877 variable_ht);
878 this->params_remap = v->params_remap;
879 this->nr_params_remap = v->nr_params_remap;
880 }
881
882 /* Our support for uniforms is piggy-backed on the struct
883 * gl_fragment_program, because that's where the values actually
884 * get stored, rather than in some global gl_shader_program uniform
885 * store.
886 */
887 void
888 fs_visitor::setup_uniform_values(ir_variable *ir)
889 {
890 int namelen = strlen(ir->name);
891
892 /* The data for our (non-builtin) uniforms is stored in a series of
893 * gl_uniform_driver_storage structs for each subcomponent that
894 * glGetUniformLocation() could name. We know it's been set up in the same
895 * order we'd walk the type, so walk the list of storage and find anything
896 * with our name, or the prefix of a component that starts with our name.
897 */
898 unsigned params_before = c->prog_data.nr_params;
899 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
900 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
901
902 if (strncmp(ir->name, storage->name, namelen) != 0 ||
903 (storage->name[namelen] != 0 &&
904 storage->name[namelen] != '.' &&
905 storage->name[namelen] != '[')) {
906 continue;
907 }
908
909 unsigned slots = storage->type->component_slots();
910 if (storage->array_elements)
911 slots *= storage->array_elements;
912
913 for (unsigned i = 0; i < slots; i++) {
914 c->prog_data.param[c->prog_data.nr_params++] =
915 &storage->storage[i].f;
916 }
917 }
918
919 /* Make sure we actually initialized the right amount of stuff here. */
920 assert(params_before + ir->type->component_slots() ==
921 c->prog_data.nr_params);
922 (void)params_before;
923 }
924
925
926 /* Our support for builtin uniforms is even scarier than non-builtin.
927 * It sits on top of the PROG_STATE_VAR parameters that are
928 * automatically updated from GL context state.
929 */
930 void
931 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
932 {
933 const ir_state_slot *const slots = ir->state_slots;
934 assert(ir->state_slots != NULL);
935
936 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
937 /* This state reference has already been setup by ir_to_mesa, but we'll
938 * get the same index back here.
939 */
940 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
941 (gl_state_index *)slots[i].tokens);
942
943 /* Add each of the unique swizzles of the element as a parameter.
944 * This'll end up matching the expected layout of the
945 * array/matrix/structure we're trying to fill in.
946 */
947 int last_swiz = -1;
948 for (unsigned int j = 0; j < 4; j++) {
949 int swiz = GET_SWZ(slots[i].swizzle, j);
950 if (swiz == last_swiz)
951 break;
952 last_swiz = swiz;
953
954 c->prog_data.param[c->prog_data.nr_params++] =
955 &fp->Base.Parameters->ParameterValues[index][swiz].f;
956 }
957 }
958 }
959
960 fs_reg *
961 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
962 {
963 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
964 fs_reg wpos = *reg;
965 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
966
967 /* gl_FragCoord.x */
968 if (ir->pixel_center_integer) {
969 emit(MOV(wpos, this->pixel_x));
970 } else {
971 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
972 }
973 wpos.reg_offset++;
974
975 /* gl_FragCoord.y */
976 if (!flip && ir->pixel_center_integer) {
977 emit(MOV(wpos, this->pixel_y));
978 } else {
979 fs_reg pixel_y = this->pixel_y;
980 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
981
982 if (flip) {
983 pixel_y.negate = true;
984 offset += c->key.drawable_height - 1.0;
985 }
986
987 emit(ADD(wpos, pixel_y, fs_reg(offset)));
988 }
989 wpos.reg_offset++;
990
991 /* gl_FragCoord.z */
992 if (brw->gen >= 6) {
993 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
994 } else {
995 emit(FS_OPCODE_LINTERP, wpos,
996 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
997 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
998 interp_reg(VARYING_SLOT_POS, 2));
999 }
1000 wpos.reg_offset++;
1001
1002 /* gl_FragCoord.w: Already set up in emit_interpolation */
1003 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1004
1005 return reg;
1006 }
1007
1008 fs_inst *
1009 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1010 glsl_interp_qualifier interpolation_mode,
1011 bool is_centroid)
1012 {
1013 brw_wm_barycentric_interp_mode barycoord_mode;
1014 if (brw->gen >= 6) {
1015 if (is_centroid) {
1016 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1017 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1018 else
1019 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1020 } else {
1021 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1022 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1023 else
1024 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1025 }
1026 } else {
1027 /* On Ironlake and below, there is only one interpolation mode.
1028 * Centroid interpolation doesn't mean anything on this hardware --
1029 * there is no multisampling.
1030 */
1031 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1032 }
1033 return emit(FS_OPCODE_LINTERP, attr,
1034 this->delta_x[barycoord_mode],
1035 this->delta_y[barycoord_mode], interp);
1036 }
1037
1038 fs_reg *
1039 fs_visitor::emit_general_interpolation(ir_variable *ir)
1040 {
1041 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1042 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1043 fs_reg attr = *reg;
1044
1045 unsigned int array_elements;
1046 const glsl_type *type;
1047
1048 if (ir->type->is_array()) {
1049 array_elements = ir->type->length;
1050 if (array_elements == 0) {
1051 fail("dereferenced array '%s' has length 0\n", ir->name);
1052 }
1053 type = ir->type->fields.array;
1054 } else {
1055 array_elements = 1;
1056 type = ir->type;
1057 }
1058
1059 glsl_interp_qualifier interpolation_mode =
1060 ir->determine_interpolation_mode(c->key.flat_shade);
1061
1062 int location = ir->location;
1063 for (unsigned int i = 0; i < array_elements; i++) {
1064 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1065 if (c->prog_data.urb_setup[location] == -1) {
1066 /* If there's no incoming setup data for this slot, don't
1067 * emit interpolation for it.
1068 */
1069 attr.reg_offset += type->vector_elements;
1070 location++;
1071 continue;
1072 }
1073
1074 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1075 /* Constant interpolation (flat shading) case. The SF has
1076 * handed us defined values in only the constant offset
1077 * field of the setup reg.
1078 */
1079 for (unsigned int k = 0; k < type->vector_elements; k++) {
1080 struct brw_reg interp = interp_reg(location, k);
1081 interp = suboffset(interp, 3);
1082 interp.type = reg->type;
1083 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1084 attr.reg_offset++;
1085 }
1086 } else {
1087 /* Smooth/noperspective interpolation case. */
1088 for (unsigned int k = 0; k < type->vector_elements; k++) {
1089 /* FINISHME: At some point we probably want to push
1090 * this farther by giving similar treatment to the
1091 * other potentially constant components of the
1092 * attribute, as well as making brw_vs_constval.c
1093 * handle varyings other than gl_TexCoord.
1094 */
1095 struct brw_reg interp = interp_reg(location, k);
1096 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1097 ir->centroid);
1098 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1099 /* Get the pixel/sample mask into f0 so that we know
1100 * which pixels are lit. Then, for each channel that is
1101 * unlit, replace the centroid data with non-centroid
1102 * data.
1103 */
1104 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1105 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1106 interpolation_mode, false);
1107 inst->predicate = BRW_PREDICATE_NORMAL;
1108 inst->predicate_inverse = true;
1109 }
1110 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1111 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1112 }
1113 attr.reg_offset++;
1114 }
1115
1116 }
1117 location++;
1118 }
1119 }
1120
1121 return reg;
1122 }
1123
1124 fs_reg *
1125 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1126 {
1127 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1128
1129 /* The frontfacing comes in as a bit in the thread payload. */
1130 if (brw->gen >= 6) {
1131 emit(BRW_OPCODE_ASR, *reg,
1132 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1133 fs_reg(15));
1134 emit(BRW_OPCODE_NOT, *reg, *reg);
1135 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1136 } else {
1137 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1138 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1139 * us front face
1140 */
1141 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1142 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1143 }
1144
1145 return reg;
1146 }
1147
1148 fs_reg
1149 fs_visitor::fix_math_operand(fs_reg src)
1150 {
1151 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1152 * might be able to do better by doing execsize = 1 math and then
1153 * expanding that result out, but we would need to be careful with
1154 * masking.
1155 *
1156 * The hardware ignores source modifiers (negate and abs) on math
1157 * instructions, so we also move to a temp to set those up.
1158 */
1159 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1160 !src.abs && !src.negate)
1161 return src;
1162
1163 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1164 * operands to math
1165 */
1166 if (brw->gen >= 7 && src.file != IMM)
1167 return src;
1168
1169 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1170 expanded.type = src.type;
1171 emit(BRW_OPCODE_MOV, expanded, src);
1172 return expanded;
1173 }
1174
1175 fs_inst *
1176 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1177 {
1178 switch (opcode) {
1179 case SHADER_OPCODE_RCP:
1180 case SHADER_OPCODE_RSQ:
1181 case SHADER_OPCODE_SQRT:
1182 case SHADER_OPCODE_EXP2:
1183 case SHADER_OPCODE_LOG2:
1184 case SHADER_OPCODE_SIN:
1185 case SHADER_OPCODE_COS:
1186 break;
1187 default:
1188 assert(!"not reached: bad math opcode");
1189 return NULL;
1190 }
1191
1192 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1193 * might be able to do better by doing execsize = 1 math and then
1194 * expanding that result out, but we would need to be careful with
1195 * masking.
1196 *
1197 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1198 * instructions, so we also move to a temp to set those up.
1199 */
1200 if (brw->gen >= 6)
1201 src = fix_math_operand(src);
1202
1203 fs_inst *inst = emit(opcode, dst, src);
1204
1205 if (brw->gen < 6) {
1206 inst->base_mrf = 2;
1207 inst->mlen = dispatch_width / 8;
1208 }
1209
1210 return inst;
1211 }
1212
1213 fs_inst *
1214 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1215 {
1216 int base_mrf = 2;
1217 fs_inst *inst;
1218
1219 switch (opcode) {
1220 case SHADER_OPCODE_INT_QUOTIENT:
1221 case SHADER_OPCODE_INT_REMAINDER:
1222 if (brw->gen >= 7 && dispatch_width == 16)
1223 fail("16-wide INTDIV unsupported\n");
1224 break;
1225 case SHADER_OPCODE_POW:
1226 break;
1227 default:
1228 assert(!"not reached: unsupported binary math opcode.");
1229 return NULL;
1230 }
1231
1232 if (brw->gen >= 6) {
1233 src0 = fix_math_operand(src0);
1234 src1 = fix_math_operand(src1);
1235
1236 inst = emit(opcode, dst, src0, src1);
1237 } else {
1238 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1239 * "Message Payload":
1240 *
1241 * "Operand0[7]. For the INT DIV functions, this operand is the
1242 * denominator."
1243 * ...
1244 * "Operand1[7]. For the INT DIV functions, this operand is the
1245 * numerator."
1246 */
1247 bool is_int_div = opcode != SHADER_OPCODE_POW;
1248 fs_reg &op0 = is_int_div ? src1 : src0;
1249 fs_reg &op1 = is_int_div ? src0 : src1;
1250
1251 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1252 inst = emit(opcode, dst, op0, reg_null_f);
1253
1254 inst->base_mrf = base_mrf;
1255 inst->mlen = 2 * dispatch_width / 8;
1256 }
1257 return inst;
1258 }
1259
1260 void
1261 fs_visitor::assign_curb_setup()
1262 {
1263 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1264 if (dispatch_width == 8) {
1265 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1266 } else {
1267 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1268 }
1269
1270 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1271 foreach_list(node, &this->instructions) {
1272 fs_inst *inst = (fs_inst *)node;
1273
1274 for (unsigned int i = 0; i < 3; i++) {
1275 if (inst->src[i].file == UNIFORM) {
1276 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1277 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1278 constant_nr / 8,
1279 constant_nr % 8);
1280
1281 inst->src[i].file = HW_REG;
1282 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1283 }
1284 }
1285 }
1286 }
1287
1288 void
1289 fs_visitor::calculate_urb_setup()
1290 {
1291 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1292 c->prog_data.urb_setup[i] = -1;
1293 }
1294
1295 int urb_next = 0;
1296 /* Figure out where each of the incoming setup attributes lands. */
1297 if (brw->gen >= 6) {
1298 if (_mesa_bitcount_64(fp->Base.InputsRead &
1299 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1300 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1301 * first 16 varying inputs, so we can put them wherever we want.
1302 * Just put them in order.
1303 *
1304 * This is useful because it means that (a) inputs not used by the
1305 * fragment shader won't take up valuable register space, and (b) we
1306 * won't have to recompile the fragment shader if it gets paired with
1307 * a different vertex (or geometry) shader.
1308 */
1309 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1310 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1311 BITFIELD64_BIT(i)) {
1312 c->prog_data.urb_setup[i] = urb_next++;
1313 }
1314 }
1315 } else {
1316 /* We have enough input varyings that the SF/SBE pipeline stage can't
1317 * arbitrarily rearrange them to suit our whim; we have to put them
1318 * in an order that matches the output of the previous pipeline stage
1319 * (geometry or vertex shader).
1320 */
1321 struct brw_vue_map prev_stage_vue_map;
1322 brw_compute_vue_map(brw, &prev_stage_vue_map,
1323 c->key.input_slots_valid);
1324 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1325 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1326 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1327 slot++) {
1328 int varying = prev_stage_vue_map.slot_to_varying[slot];
1329 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1330 * unused.
1331 */
1332 if (varying != BRW_VARYING_SLOT_COUNT &&
1333 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1334 BITFIELD64_BIT(varying))) {
1335 c->prog_data.urb_setup[varying] = slot - first_slot;
1336 }
1337 }
1338 urb_next = prev_stage_vue_map.num_slots - first_slot;
1339 }
1340 } else {
1341 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1342 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1343 /* Point size is packed into the header, not as a general attribute */
1344 if (i == VARYING_SLOT_PSIZ)
1345 continue;
1346
1347 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1348 /* The back color slot is skipped when the front color is
1349 * also written to. In addition, some slots can be
1350 * written in the vertex shader and not read in the
1351 * fragment shader. So the register number must always be
1352 * incremented, mapped or not.
1353 */
1354 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1355 c->prog_data.urb_setup[i] = urb_next;
1356 urb_next++;
1357 }
1358 }
1359
1360 /*
1361 * It's an FS-only attribute, and we did interpolation for this attribute
1362 * in the SF thread. So, count it here, too.
1363 *
1364 * See compile_sf_prog() for more info.
1365 */
1366 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1367 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1368 }
1369
1370 c->prog_data.num_varying_inputs = urb_next;
1371 }
1372
1373 void
1374 fs_visitor::assign_urb_setup()
1375 {
1376 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1377
1378 /* Offset all the urb_setup[] index by the actual position of the
1379 * setup regs, now that the location of the constants has been chosen.
1380 */
1381 foreach_list(node, &this->instructions) {
1382 fs_inst *inst = (fs_inst *)node;
1383
1384 if (inst->opcode == FS_OPCODE_LINTERP) {
1385 assert(inst->src[2].file == HW_REG);
1386 inst->src[2].fixed_hw_reg.nr += urb_start;
1387 }
1388
1389 if (inst->opcode == FS_OPCODE_CINTERP) {
1390 assert(inst->src[0].file == HW_REG);
1391 inst->src[0].fixed_hw_reg.nr += urb_start;
1392 }
1393 }
1394
1395 /* Each attribute is 4 setup channels, each of which is half a reg. */
1396 this->first_non_payload_grf =
1397 urb_start + c->prog_data.num_varying_inputs * 2;
1398 }
1399
1400 /**
1401 * Split large virtual GRFs into separate components if we can.
1402 *
1403 * This is mostly duplicated with what brw_fs_vector_splitting does,
1404 * but that's really conservative because it's afraid of doing
1405 * splitting that doesn't result in real progress after the rest of
1406 * the optimization phases, which would cause infinite looping in
1407 * optimization. We can do it once here, safely. This also has the
1408 * opportunity to split interpolated values, or maybe even uniforms,
1409 * which we don't have at the IR level.
1410 *
1411 * We want to split, because virtual GRFs are what we register
1412 * allocate and spill (due to contiguousness requirements for some
1413 * instructions), and they're what we naturally generate in the
1414 * codegen process, but most virtual GRFs don't actually need to be
1415 * contiguous sets of GRFs. If we split, we'll end up with reduced
1416 * live intervals and better dead code elimination and coalescing.
1417 */
1418 void
1419 fs_visitor::split_virtual_grfs()
1420 {
1421 int num_vars = this->virtual_grf_count;
1422 bool split_grf[num_vars];
1423 int new_virtual_grf[num_vars];
1424
1425 /* Try to split anything larger than one register. */
1426 for (int i = 0; i < num_vars; i++) {
1427 if (this->virtual_grf_sizes[i] != 1)
1428 split_grf[i] = true;
1429 else
1430 split_grf[i] = false;
1431 }
1432
1433 if (brw->has_pln &&
1434 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1435 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1436 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1437 * Gen6, that was the only supported interpolation mode, and since Gen6,
1438 * delta_x and delta_y are in fixed hardware registers.
1439 */
1440 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1441 false;
1442 }
1443
1444 foreach_list(node, &this->instructions) {
1445 fs_inst *inst = (fs_inst *)node;
1446
1447 /* If there's a SEND message that requires contiguous destination
1448 * registers, no splitting is allowed.
1449 */
1450 if (inst->regs_written > 1) {
1451 split_grf[inst->dst.reg] = false;
1452 }
1453
1454 /* If we're sending from a GRF, don't split it, on the assumption that
1455 * the send is reading the whole thing.
1456 */
1457 if (inst->is_send_from_grf()) {
1458 for (int i = 0; i < 3; i++) {
1459 if (inst->src[i].file == GRF) {
1460 split_grf[inst->src[i].reg] = false;
1461 }
1462 }
1463 }
1464 }
1465
1466 /* Allocate new space for split regs. Note that the virtual
1467 * numbers will be contiguous.
1468 */
1469 for (int i = 0; i < num_vars; i++) {
1470 if (split_grf[i]) {
1471 new_virtual_grf[i] = virtual_grf_alloc(1);
1472 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1473 int reg = virtual_grf_alloc(1);
1474 assert(reg == new_virtual_grf[i] + j - 1);
1475 (void) reg;
1476 }
1477 this->virtual_grf_sizes[i] = 1;
1478 }
1479 }
1480
1481 foreach_list(node, &this->instructions) {
1482 fs_inst *inst = (fs_inst *)node;
1483
1484 if (inst->dst.file == GRF &&
1485 split_grf[inst->dst.reg] &&
1486 inst->dst.reg_offset != 0) {
1487 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1488 inst->dst.reg_offset - 1);
1489 inst->dst.reg_offset = 0;
1490 }
1491 for (int i = 0; i < 3; i++) {
1492 if (inst->src[i].file == GRF &&
1493 split_grf[inst->src[i].reg] &&
1494 inst->src[i].reg_offset != 0) {
1495 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1496 inst->src[i].reg_offset - 1);
1497 inst->src[i].reg_offset = 0;
1498 }
1499 }
1500 }
1501 invalidate_live_intervals();
1502 }
1503
1504 /**
1505 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1506 *
1507 * During code generation, we create tons of temporary variables, many of
1508 * which get immediately killed and are never used again. Yet, in later
1509 * optimization and analysis passes, such as compute_live_intervals, we need
1510 * to loop over all the virtual GRFs. Compacting them can save a lot of
1511 * overhead.
1512 */
1513 void
1514 fs_visitor::compact_virtual_grfs()
1515 {
1516 /* Mark which virtual GRFs are used, and count how many. */
1517 int remap_table[this->virtual_grf_count];
1518 memset(remap_table, -1, sizeof(remap_table));
1519
1520 foreach_list(node, &this->instructions) {
1521 const fs_inst *inst = (const fs_inst *) node;
1522
1523 if (inst->dst.file == GRF)
1524 remap_table[inst->dst.reg] = 0;
1525
1526 for (int i = 0; i < 3; i++) {
1527 if (inst->src[i].file == GRF)
1528 remap_table[inst->src[i].reg] = 0;
1529 }
1530 }
1531
1532 /* In addition to registers used in instructions, fs_visitor keeps
1533 * direct references to certain special values which must be patched:
1534 */
1535 fs_reg *special[] = {
1536 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1537 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1538 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1539 &delta_x[0], &delta_x[1], &delta_x[2],
1540 &delta_x[3], &delta_x[4], &delta_x[5],
1541 &delta_y[0], &delta_y[1], &delta_y[2],
1542 &delta_y[3], &delta_y[4], &delta_y[5],
1543 };
1544 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1545 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1546
1547 /* Treat all special values as used, to be conservative */
1548 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1549 if (special[i]->file == GRF)
1550 remap_table[special[i]->reg] = 0;
1551 }
1552
1553 /* Compact the GRF arrays. */
1554 int new_index = 0;
1555 for (int i = 0; i < this->virtual_grf_count; i++) {
1556 if (remap_table[i] != -1) {
1557 remap_table[i] = new_index;
1558 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1559 invalidate_live_intervals();
1560 ++new_index;
1561 }
1562 }
1563
1564 this->virtual_grf_count = new_index;
1565
1566 /* Patch all the instructions to use the newly renumbered registers */
1567 foreach_list(node, &this->instructions) {
1568 fs_inst *inst = (fs_inst *) node;
1569
1570 if (inst->dst.file == GRF)
1571 inst->dst.reg = remap_table[inst->dst.reg];
1572
1573 for (int i = 0; i < 3; i++) {
1574 if (inst->src[i].file == GRF)
1575 inst->src[i].reg = remap_table[inst->src[i].reg];
1576 }
1577 }
1578
1579 /* Patch all the references to special values */
1580 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1581 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1582 special[i]->reg = remap_table[special[i]->reg];
1583 }
1584 }
1585
1586 bool
1587 fs_visitor::remove_dead_constants()
1588 {
1589 if (dispatch_width == 8) {
1590 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1591 this->nr_params_remap = c->prog_data.nr_params;
1592
1593 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1594 this->params_remap[i] = -1;
1595
1596 /* Find which params are still in use. */
1597 foreach_list(node, &this->instructions) {
1598 fs_inst *inst = (fs_inst *)node;
1599
1600 for (int i = 0; i < 3; i++) {
1601 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1602
1603 if (inst->src[i].file != UNIFORM)
1604 continue;
1605
1606 /* Section 5.11 of the OpenGL 4.3 spec says:
1607 *
1608 * "Out-of-bounds reads return undefined values, which include
1609 * values from other variables of the active program or zero."
1610 */
1611 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1612 constant_nr = 0;
1613 }
1614
1615 /* For now, set this to non-negative. We'll give it the
1616 * actual new number in a moment, in order to keep the
1617 * register numbers nicely ordered.
1618 */
1619 this->params_remap[constant_nr] = 0;
1620 }
1621 }
1622
1623 /* Figure out what the new numbers for the params will be. At some
1624 * point when we're doing uniform array access, we're going to want
1625 * to keep the distinction between .reg and .reg_offset, but for
1626 * now we don't care.
1627 */
1628 unsigned int new_nr_params = 0;
1629 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1630 if (this->params_remap[i] != -1) {
1631 this->params_remap[i] = new_nr_params++;
1632 }
1633 }
1634
1635 /* Update the list of params to be uploaded to match our new numbering. */
1636 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1637 int remapped = this->params_remap[i];
1638
1639 if (remapped == -1)
1640 continue;
1641
1642 c->prog_data.param[remapped] = c->prog_data.param[i];
1643 }
1644
1645 c->prog_data.nr_params = new_nr_params;
1646 } else {
1647 /* This should have been generated in the 8-wide pass already. */
1648 assert(this->params_remap);
1649 }
1650
1651 /* Now do the renumbering of the shader to remove unused params. */
1652 foreach_list(node, &this->instructions) {
1653 fs_inst *inst = (fs_inst *)node;
1654
1655 for (int i = 0; i < 3; i++) {
1656 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1657
1658 if (inst->src[i].file != UNIFORM)
1659 continue;
1660
1661 /* As above, alias out-of-bounds accesses to constant 0. */
1662 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1663 constant_nr = 0;
1664 }
1665 assert(this->params_remap[constant_nr] != -1);
1666 inst->src[i].reg = this->params_remap[constant_nr];
1667 inst->src[i].reg_offset = 0;
1668 }
1669 }
1670
1671 return true;
1672 }
1673
1674 /*
1675 * Implements array access of uniforms by inserting a
1676 * PULL_CONSTANT_LOAD instruction.
1677 *
1678 * Unlike temporary GRF array access (where we don't support it due to
1679 * the difficulty of doing relative addressing on instruction
1680 * destinations), we could potentially do array access of uniforms
1681 * that were loaded in GRF space as push constants. In real-world
1682 * usage we've seen, though, the arrays being used are always larger
1683 * than we could load as push constants, so just always move all
1684 * uniform array access out to a pull constant buffer.
1685 */
1686 void
1687 fs_visitor::move_uniform_array_access_to_pull_constants()
1688 {
1689 int pull_constant_loc[c->prog_data.nr_params];
1690
1691 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1692 pull_constant_loc[i] = -1;
1693 }
1694
1695 /* Walk through and find array access of uniforms. Put a copy of that
1696 * uniform in the pull constant buffer.
1697 *
1698 * Note that we don't move constant-indexed accesses to arrays. No
1699 * testing has been done of the performance impact of this choice.
1700 */
1701 foreach_list_safe(node, &this->instructions) {
1702 fs_inst *inst = (fs_inst *)node;
1703
1704 for (int i = 0 ; i < 3; i++) {
1705 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1706 continue;
1707
1708 int uniform = inst->src[i].reg;
1709
1710 /* If this array isn't already present in the pull constant buffer,
1711 * add it.
1712 */
1713 if (pull_constant_loc[uniform] == -1) {
1714 const float **values = &c->prog_data.param[uniform];
1715
1716 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1717
1718 assert(param_size[uniform]);
1719
1720 for (int j = 0; j < param_size[uniform]; j++) {
1721 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1722 values[j];
1723 }
1724 }
1725
1726 /* Set up the annotation tracking for new generated instructions. */
1727 base_ir = inst->ir;
1728 current_annotation = inst->annotation;
1729
1730 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1731 fs_reg temp = fs_reg(this, glsl_type::float_type);
1732 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1733 surf_index,
1734 *inst->src[i].reladdr,
1735 pull_constant_loc[uniform] +
1736 inst->src[i].reg_offset);
1737 inst->insert_before(&list);
1738
1739 inst->src[i].file = temp.file;
1740 inst->src[i].reg = temp.reg;
1741 inst->src[i].reg_offset = temp.reg_offset;
1742 inst->src[i].reladdr = NULL;
1743 }
1744 }
1745 }
1746
1747 /**
1748 * Choose accesses from the UNIFORM file to demote to using the pull
1749 * constant buffer.
1750 *
1751 * We allow a fragment shader to have more than the specified minimum
1752 * maximum number of fragment shader uniform components (64). If
1753 * there are too many of these, they'd fill up all of register space.
1754 * So, this will push some of them out to the pull constant buffer and
1755 * update the program to load them.
1756 */
1757 void
1758 fs_visitor::setup_pull_constants()
1759 {
1760 /* Only allow 16 registers (128 uniform components) as push constants. */
1761 unsigned int max_uniform_components = 16 * 8;
1762 if (c->prog_data.nr_params <= max_uniform_components)
1763 return;
1764
1765 if (dispatch_width == 16) {
1766 fail("Pull constants not supported in 16-wide\n");
1767 return;
1768 }
1769
1770 /* Just demote the end of the list. We could probably do better
1771 * here, demoting things that are rarely used in the program first.
1772 */
1773 unsigned int pull_uniform_base = max_uniform_components;
1774
1775 int pull_constant_loc[c->prog_data.nr_params];
1776 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1777 if (i < pull_uniform_base) {
1778 pull_constant_loc[i] = -1;
1779 } else {
1780 pull_constant_loc[i] = -1;
1781 /* If our constant is already being uploaded for reladdr purposes,
1782 * reuse it.
1783 */
1784 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1785 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1786 pull_constant_loc[i] = j;
1787 break;
1788 }
1789 }
1790 if (pull_constant_loc[i] == -1) {
1791 int pull_index = c->prog_data.nr_pull_params++;
1792 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1793 pull_constant_loc[i] = pull_index;
1794 }
1795 }
1796 }
1797 c->prog_data.nr_params = pull_uniform_base;
1798
1799 foreach_list(node, &this->instructions) {
1800 fs_inst *inst = (fs_inst *)node;
1801
1802 for (int i = 0; i < 3; i++) {
1803 if (inst->src[i].file != UNIFORM)
1804 continue;
1805
1806 int pull_index = pull_constant_loc[inst->src[i].reg +
1807 inst->src[i].reg_offset];
1808 if (pull_index == -1)
1809 continue;
1810
1811 assert(!inst->src[i].reladdr);
1812
1813 fs_reg dst = fs_reg(this, glsl_type::float_type);
1814 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1815 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1816 fs_inst *pull =
1817 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1818 dst, index, offset);
1819 pull->ir = inst->ir;
1820 pull->annotation = inst->annotation;
1821
1822 inst->insert_before(pull);
1823
1824 inst->src[i].file = GRF;
1825 inst->src[i].reg = dst.reg;
1826 inst->src[i].reg_offset = 0;
1827 inst->src[i].smear = pull_index & 3;
1828 }
1829 }
1830 }
1831
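/**
 * Performs trivial algebraic simplifications: multiplies by 1.0 or 0.0,
 * adds of 0.0, and ORs of identical sources become MOVs.
 */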
1832 bool
1833 fs_visitor::opt_algebraic()
1834 {
1835 bool progress = false;
1836
1837 foreach_list(node, &this->instructions) {
1838 fs_inst *inst = (fs_inst *)node;
1839
1840 switch (inst->opcode) {
1841 case BRW_OPCODE_MUL:
1842 if (inst->src[1].file != IMM)
1843 continue;
1844
1845 /* a * 1.0 = a */
1846 if (inst->src[1].is_one()) {
1847 inst->opcode = BRW_OPCODE_MOV;
1848 inst->src[1] = reg_undef;
1849 progress = true;
1850 break;
1851 }
1852
1853 /* a * 0.0 = 0.0 */
1854 if (inst->src[1].is_zero()) {
1855 inst->opcode = BRW_OPCODE_MOV;
1856 inst->src[0] = inst->src[1];
1857 inst->src[1] = reg_undef;
1858 progress = true;
1859 break;
1860 }
1861
1862 break;
1863 case BRW_OPCODE_ADD:
1864 if (inst->src[1].file != IMM)
1865 continue;
1866
1867 /* a + 0.0 = a */
1868 if (inst->src[1].is_zero()) {
1869 inst->opcode = BRW_OPCODE_MOV;
1870 inst->src[1] = reg_undef;
1871 progress = true;
1872 break;
1873 }
1874 break;
1875 case BRW_OPCODE_OR:
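/* a | a = a */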
1876 if (inst->src[0].equals(inst->src[1])) {
1877 inst->opcode = BRW_OPCODE_MOV;
1878 inst->src[1] = reg_undef;
1879 progress = true;
1880 break;
1881 }
1882 break;
1883 default:
1884 break;
1885 }
1886 }
1887
1888 return progress;
1889 }
1890
1891 /**
1892 * Removes any instructions writing a VGRF where that VGRF is not used by any
1893 * later instruction.
1894 */
1895 bool
1896 fs_visitor::dead_code_eliminate()
1897 {
1898 bool progress = false;
1899 int pc = 0;
1900
1901 calculate_live_intervals();
1902
1903 foreach_list_safe(node, &this->instructions) {
1904 fs_inst *inst = (fs_inst *)node;
1905
1906 if (inst->dst.file == GRF) {
1907 bool dead = true;
1908
1909 for (int i = 0; i < inst->regs_written; i++) {
1910 int var = live_intervals->var_from_vgrf[inst->dst.reg];
1911 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
1912 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
1913 dead = false;
1914 break;
1915 }
1916 }
1917
1918 if (dead) {
1919 /* Don't dead code eliminate instructions that write to the
1920 * accumulator as a side-effect. Instead just set the destination
1921 * to the null register to free it.
1922 */
1923 switch (inst->opcode) {
1924 case BRW_OPCODE_ADDC:
1925 case BRW_OPCODE_SUBB:
1926 case BRW_OPCODE_MACH:
1927 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
1928 break;
1929 default:
1930 inst->remove();
1931 progress = true;
1932 break;
1933 }
1934 }
1935 }
1936
1937 pc++;
1938 }
1939
1940 if (progress)
1941 invalidate_live_intervals();
1942
1943 return progress;
1944 }
1945
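/* Helpers for dead_code_eliminate_local(): a hash table keyed on
 * (vgrf, reg_offset) that tracks writes which have not been read yet.
 */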
1946 struct dead_code_hash_key
1947 {
1948 int vgrf;
1949 int reg_offset;
1950 };
1951
1952 static bool
1953 dead_code_hash_compare(const void *a, const void *b)
1954 {
1955 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1956 }
1957
1958 static void
1959 clear_dead_code_hash(struct hash_table *ht)
1960 {
1961 struct hash_entry *entry;
1962
1963 hash_table_foreach(ht, entry) {
1964 _mesa_hash_table_remove(ht, entry);
1965 }
1966 }
1967
1968 static void
1969 insert_dead_code_hash(struct hash_table *ht,
1970 int vgrf, int reg_offset, fs_inst *inst)
1971 {
1972    /* We don't bother freeing keys; they're ralloced off the ht and go away with it. */
1973 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1974
1975 key->vgrf = vgrf;
1976 key->reg_offset = reg_offset;
1977
1978 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1979 }
1980
1981 static struct hash_entry *
1982 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1983 {
1984 struct dead_code_hash_key key;
1985
1986 key.vgrf = vgrf;
1987 key.reg_offset = reg_offset;
1988
1989 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1990 }
1991
1992 static void
1993 remove_dead_code_hash(struct hash_table *ht,
1994 int vgrf, int reg_offset)
1995 {
1996 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1997 if (!entry)
1998 return;
1999
2000 _mesa_hash_table_remove(ht, entry);
2001 }
2002
2003 /**
2004  * Walks basic blocks, removing writes to registers that are not read
2005  * before being redefined.
2006 *
2007 * The dead_code_eliminate() function implements a global dead code
2008  * elimination, but it only handles removing the last write to a register
2009 * if it's never read. This one can handle intermediate writes, but only
2010 * within a basic block.
2011 */
2012 bool
2013 fs_visitor::dead_code_eliminate_local()
2014 {
2015 struct hash_table *ht;
2016 bool progress = false;
2017
2018 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2019
2020 foreach_list_safe(node, &this->instructions) {
2021 fs_inst *inst = (fs_inst *)node;
2022
2023       /* At a basic block boundary, empty the HT, since we don't track
2024        * dataflow across blocks here.
2025        */
2026 if (inst->is_control_flow()) {
2027 clear_dead_code_hash(ht);
2028 continue;
2029 }
2030
2031 /* Clear the HT of any instructions that got read. */
2032 for (int i = 0; i < 3; i++) {
2033 fs_reg src = inst->src[i];
2034 if (src.file != GRF)
2035 continue;
2036
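         /* A send-from-GRF message reads its payload through the end of the
          * VGRF, not just the one register at src.reg_offset.
          */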
2037 int read = 1;
2038 if (inst->is_send_from_grf())
2039 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2040
2041 for (int reg_offset = src.reg_offset;
2042 reg_offset < src.reg_offset + read;
2043 reg_offset++) {
2044 remove_dead_code_hash(ht, src.reg, reg_offset);
2045 }
2046 }
2047
2048 /* Add any update of a GRF to the HT, removing a previous write if it
2049 * wasn't read.
2050 */
2051 if (inst->dst.file == GRF) {
2052 if (inst->regs_written > 1) {
2053 /* We don't know how to trim channels from an instruction's
2054 * writes, so we can't incrementally remove unread channels from
2055              * it.  Just remove whatever it overwrites from the table.
2056 */
2057 for (int i = 0; i < inst->regs_written; i++) {
2058 remove_dead_code_hash(ht,
2059 inst->dst.reg,
2060 inst->dst.reg_offset + i);
2061 }
2062 } else {
2063 struct hash_entry *entry =
2064 get_dead_code_hash_entry(ht, inst->dst.reg,
2065 inst->dst.reg_offset);
2066
2067 if (inst->is_partial_write()) {
2068 /* For a partial write, we can't remove any previous dead code
2069                * candidate, since we're just modifying its result, but we can
2070                * be dead code eliminated ourselves.
2071 */
2072 if (entry) {
2073 entry->data = inst;
2074 } else {
2075 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2076 inst);
2077 }
2078 } else {
2079 if (entry) {
2080 /* We're completely updating a channel, and there was a
2081 * previous write to the channel that wasn't read. Kill it!
2082 */
2083 fs_inst *inst = (fs_inst *)entry->data;
2084 inst->remove();
2085 progress = true;
2086 _mesa_hash_table_remove(ht, entry);
2087 }
2088
2089 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2090 inst);
2091 }
2092 }
2093 }
2094 }
2095
2096 _mesa_hash_table_destroy(ht, NULL);
2097
2098 if (progress)
2099 invalidate_live_intervals();
2100
2101 return progress;
2102 }
2103
2104 /**
2105 * Implements a second type of register coalescing: This one checks if
2106 * the two regs involved in a raw move don't interfere, in which case
2107  * they can both be stored in the same place and the MOV removed.
2108 */
2109 bool
2110 fs_visitor::register_coalesce_2()
2111 {
2112 bool progress = false;
2113
2114 calculate_live_intervals();
2115
2116 foreach_list_safe(node, &this->instructions) {
2117 fs_inst *inst = (fs_inst *)node;
2118
2119 if (inst->opcode != BRW_OPCODE_MOV ||
2120 inst->is_partial_write() ||
2121 inst->saturate ||
2122 inst->src[0].file != GRF ||
2123 inst->src[0].negate ||
2124 inst->src[0].abs ||
2125 inst->src[0].smear != -1 ||
2126 inst->dst.file != GRF ||
2127 inst->dst.type != inst->src[0].type ||
2128 virtual_grf_sizes[inst->src[0].reg] != 1) {
2129 continue;
2130 }
2131
2132 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2133 int var_to = live_intervals->var_from_reg(&inst->dst);
2134
2135 if (live_intervals->vars_interfere(var_from, var_to))
2136 continue;
2137
2138 int reg_from = inst->src[0].reg;
2139 assert(inst->src[0].reg_offset == 0);
2140 int reg_to = inst->dst.reg;
2141 int reg_to_offset = inst->dst.reg_offset;
2142
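      /* The registers don't interfere, so rewrite every def and use of
       * reg_from to point at reg_to; the MOV itself is removed below.
       */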
2143 foreach_list(node, &this->instructions) {
2144 fs_inst *scan_inst = (fs_inst *)node;
2145
2146 if (scan_inst->dst.file == GRF &&
2147 scan_inst->dst.reg == reg_from) {
2148 scan_inst->dst.reg = reg_to;
2149 scan_inst->dst.reg_offset = reg_to_offset;
2150 }
2151 for (int i = 0; i < 3; i++) {
2152 if (scan_inst->src[i].file == GRF &&
2153 scan_inst->src[i].reg == reg_from) {
2154 scan_inst->src[i].reg = reg_to;
2155 scan_inst->src[i].reg_offset = reg_to_offset;
2156 }
2157 }
2158 }
2159
2160 inst->remove();
2161 progress = true;
2162 continue;
2163 }
2164
2165 if (progress)
2166 invalidate_live_intervals();
2167
2168 return progress;
2169 }
2170
2171 bool
2172 fs_visitor::register_coalesce()
2173 {
2174 bool progress = false;
2175 int if_depth = 0;
2176 int loop_depth = 0;
2177
2178 foreach_list_safe(node, &this->instructions) {
2179 fs_inst *inst = (fs_inst *)node;
2180
2181       /* Make sure that we dominate the instructions we're going to
2182        * scan for interference with our coalescing, or we won't have
2183        * scanned far enough to see whether anything interferes.  We
2184        * don't dominate the following instructions if we're inside a
2185        * loop or an if block.
2186        */
2187 switch (inst->opcode) {
2188 case BRW_OPCODE_DO:
2189 loop_depth++;
2190 break;
2191 case BRW_OPCODE_WHILE:
2192 loop_depth--;
2193 break;
2194 case BRW_OPCODE_IF:
2195 if_depth++;
2196 break;
2197 case BRW_OPCODE_ENDIF:
2198 if_depth--;
2199 break;
2200 default:
2201 break;
2202 }
2203 if (loop_depth || if_depth)
2204 continue;
2205
2206 if (inst->opcode != BRW_OPCODE_MOV ||
2207 inst->is_partial_write() ||
2208 inst->saturate ||
2209 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2210                                   inst->src[0].file != UNIFORM) ||
2211 inst->dst.type != inst->src[0].type)
2212 continue;
2213
2214 bool has_source_modifiers = (inst->src[0].abs ||
2215 inst->src[0].negate ||
2216 inst->src[0].smear != -1 ||
2217 inst->src[0].file == UNIFORM);
2218
2219 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2220 * them: check for no writes to either one until the exit of the
2221 * program.
2222 */
2223 bool interfered = false;
2224
2225 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2226 !scan_inst->is_tail_sentinel();
2227 scan_inst = (fs_inst *)scan_inst->next) {
2228 if (scan_inst->dst.file == GRF) {
2229 if (scan_inst->overwrites_reg(inst->dst) ||
2230 scan_inst->overwrites_reg(inst->src[0])) {
2231 interfered = true;
2232 break;
2233 }
2234 }
2235
2236 if (has_source_modifiers) {
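            /* If a later instruction reinterprets our destination with a
             * different type, propagating a source with modifiers (or unusual
             * regioning) into it could change its meaning, so treat that as
             * interference.
             */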
2237 for (int i = 0; i < 3; i++) {
2238 if (scan_inst->src[i].file == GRF &&
2239 scan_inst->src[i].reg == inst->dst.reg &&
2240 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2241 inst->dst.type != scan_inst->src[i].type)
2242 {
2243 interfered = true;
2244 break;
2245 }
2246 }
2247 }
2248
2249
2250 /* The gen6 MATH instruction can't handle source modifiers or
2251 * unusual register regions, so avoid coalescing those for
2252 * now. We should do something more specific.
2253 */
2254 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2255 interfered = true;
2256 break;
2257 }
2258
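         /* A send from GRF (base_mrf == -1) takes its whole payload starting
          * at src[0]; we don't track how many registers it reads here, so be
          * conservative if its payload starts at our destination.
          */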
2259 if (scan_inst->mlen > 0 && scan_inst->base_mrf == -1 &&
2260 scan_inst->src[0].file == GRF &&
2261 scan_inst->src[0].reg == inst->dst.reg) {
2262 interfered = true;
2263 break;
2264 }
2265
2266 /* The accumulator result appears to get used for the
2267 * conditional modifier generation. When negating a UD
2268 * value, there is a 33rd bit generated for the sign in the
2269          * accumulator value, so you can no longer check, for example,
2270          * equality with a 32-bit value.  See piglit fs-op-neg-uint.
2271 */
2272 if (scan_inst->conditional_mod &&
2273 inst->src[0].negate &&
2274 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2275 interfered = true;
2276 break;
2277 }
2278 }
2279 if (interfered) {
2280 continue;
2281 }
2282
2283 /* Rewrite the later usage to point at the source of the move to
2284 * be removed.
2285 */
2286 for (fs_inst *scan_inst = inst;
2287 !scan_inst->is_tail_sentinel();
2288 scan_inst = (fs_inst *)scan_inst->next) {
2289 for (int i = 0; i < 3; i++) {
2290 if (scan_inst->src[i].file == GRF &&
2291 scan_inst->src[i].reg == inst->dst.reg &&
2292 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2293 fs_reg new_src = inst->src[0];
2294 if (scan_inst->src[i].abs) {
2295 new_src.negate = 0;
2296 new_src.abs = 1;
2297 }
2298 new_src.negate ^= scan_inst->src[i].negate;
2299 new_src.sechalf = scan_inst->src[i].sechalf;
2300 scan_inst->src[i] = new_src;
2301 }
2302 }
2303 }
2304
2305 inst->remove();
2306 progress = true;
2307 }
2308
2309 if (progress)
2310 invalidate_live_intervals();
2311
2312 return progress;
2313 }
2314
2315
2316 bool
2317 fs_visitor::compute_to_mrf()
2318 {
2319 bool progress = false;
2320 int next_ip = 0;
2321
2322 calculate_live_intervals();
2323
2324 foreach_list_safe(node, &this->instructions) {
2325 fs_inst *inst = (fs_inst *)node;
2326
2327 int ip = next_ip;
2328 next_ip++;
2329
2330 if (inst->opcode != BRW_OPCODE_MOV ||
2331 inst->is_partial_write() ||
2332 inst->dst.file != MRF || inst->src[0].file != GRF ||
2333 inst->dst.type != inst->src[0].type ||
2334 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2335 continue;
2336
2337 /* Work out which hardware MRF registers are written by this
2338 * instruction.
2339 */
2340 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2341 int mrf_high;
2342 if (inst->dst.reg & BRW_MRF_COMPR4) {
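         /* With COMPR4 addressing, a compressed write lands in m and m+4
          * rather than m and m+1.
          */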
2343 mrf_high = mrf_low + 4;
2344 } else if (dispatch_width == 16 &&
2345 (!inst->force_uncompressed && !inst->force_sechalf)) {
2346 mrf_high = mrf_low + 1;
2347 } else {
2348 mrf_high = mrf_low;
2349 }
2350
2351 /* Can't compute-to-MRF this GRF if someone else was going to
2352 * read it later.
2353 */
2354 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2355 continue;
2356
2357 /* Found a move of a GRF to a MRF. Let's see if we can go
2358 * rewrite the thing that made this GRF to write into the MRF.
2359 */
2360 fs_inst *scan_inst;
2361 for (scan_inst = (fs_inst *)inst->prev;
2362 scan_inst->prev != NULL;
2363 scan_inst = (fs_inst *)scan_inst->prev) {
2364 if (scan_inst->dst.file == GRF &&
2365 scan_inst->dst.reg == inst->src[0].reg) {
2366             /* Found the last instruction to write the reg we want to
2367              * turn into a compute-to-MRF.
2368              */
2369
2370 /* If this one instruction didn't populate all the
2371 * channels, bail. We might be able to rewrite everything
2372 * that writes that reg, but it would require smarter
2373 * tracking to delay the rewriting until complete success.
2374 */
2375 if (scan_inst->is_partial_write())
2376 break;
2377
2378 /* Things returning more than one register would need us to
2379 * understand coalescing out more than one MOV at a time.
2380 */
2381 if (scan_inst->regs_written > 1)
2382 break;
2383
2384 /* SEND instructions can't have MRF as a destination. */
2385 if (scan_inst->mlen)
2386 break;
2387
2388 if (brw->gen == 6) {
2389 /* gen6 math instructions must have the destination be
2390 * GRF, so no compute-to-MRF for them.
2391 */
2392 if (scan_inst->is_math()) {
2393 break;
2394 }
2395 }
2396
2397 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2398 /* Found the creator of our MRF's source value. */
2399 scan_inst->dst.file = MRF;
2400 scan_inst->dst.reg = inst->dst.reg;
2401 scan_inst->saturate |= inst->saturate;
2402 inst->remove();
2403 progress = true;
2404 }
2405 break;
2406 }
2407
2408 /* We don't handle control flow here. Most computation of
2409           * values that end up in MRFs happens shortly before the MRF
2410 * write anyway.
2411 */
2412 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2413 break;
2414
2415 /* You can't read from an MRF, so if someone else reads our
2416 * MRF's source GRF that we wanted to rewrite, that stops us.
2417 */
2418 bool interfered = false;
2419 for (int i = 0; i < 3; i++) {
2420 if (scan_inst->src[i].file == GRF &&
2421 scan_inst->src[i].reg == inst->src[0].reg &&
2422 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2423 interfered = true;
2424 }
2425 }
2426 if (interfered)
2427 break;
2428
2429 if (scan_inst->dst.file == MRF) {
2430 /* If somebody else writes our MRF here, we can't
2431 * compute-to-MRF before that.
2432 */
2433 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2434 int scan_mrf_high;
2435
2436 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2437 scan_mrf_high = scan_mrf_low + 4;
2438 } else if (dispatch_width == 16 &&
2439 (!scan_inst->force_uncompressed &&
2440 !scan_inst->force_sechalf)) {
2441 scan_mrf_high = scan_mrf_low + 1;
2442 } else {
2443 scan_mrf_high = scan_mrf_low;
2444 }
2445
2446 if (mrf_low == scan_mrf_low ||
2447 mrf_low == scan_mrf_high ||
2448 mrf_high == scan_mrf_low ||
2449 mrf_high == scan_mrf_high) {
2450 break;
2451 }
2452 }
2453
2454 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2455 /* Found a SEND instruction, which means that there are
2456 * live values in MRFs from base_mrf to base_mrf +
2457 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2458 * above it.
2459 */
2460 if (mrf_low >= scan_inst->base_mrf &&
2461 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2462 break;
2463 }
2464 if (mrf_high >= scan_inst->base_mrf &&
2465 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2466 break;
2467 }
2468 }
2469 }
2470 }
2471
2472 if (progress)
2473 invalidate_live_intervals();
2474
2475 return progress;
2476 }
2477
2478 /**
2479 * Walks through basic blocks, looking for repeated MRF writes and
2480 * removing the later ones.
2481 */
2482 bool
2483 fs_visitor::remove_duplicate_mrf_writes()
2484 {
2485 fs_inst *last_mrf_move[16];
2486 bool progress = false;
2487
2488    /* The MRF tracking below doesn't account for compressed instructions, so skip SIMD16 for now. */
2489 if (dispatch_width == 16)
2490 return false;
2491
2492 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2493
2494 foreach_list_safe(node, &this->instructions) {
2495 fs_inst *inst = (fs_inst *)node;
2496
2497 if (inst->is_control_flow()) {
2498 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2499 }
2500
2501 if (inst->opcode == BRW_OPCODE_MOV &&
2502 inst->dst.file == MRF) {
2503 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2504 if (prev_inst && inst->equals(prev_inst)) {
2505 inst->remove();
2506 progress = true;
2507 continue;
2508 }
2509 }
2510
2511 /* Clear out the last-write records for MRFs that were overwritten. */
2512 if (inst->dst.file == MRF) {
2513 last_mrf_move[inst->dst.reg] = NULL;
2514 }
2515
2516 if (inst->mlen > 0 && inst->base_mrf != -1) {
2517 /* Found a SEND instruction, which will include two or fewer
2518 * implied MRF writes. We could do better here.
2519 */
2520 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2521 last_mrf_move[inst->base_mrf + i] = NULL;
2522 }
2523 }
2524
2525 /* Clear out any MRF move records whose sources got overwritten. */
2526 if (inst->dst.file == GRF) {
2527 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2528 if (last_mrf_move[i] &&
2529 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2530 last_mrf_move[i] = NULL;
2531 }
2532 }
2533 }
2534
2535 if (inst->opcode == BRW_OPCODE_MOV &&
2536 inst->dst.file == MRF &&
2537 inst->src[0].file == GRF &&
2538 !inst->is_partial_write()) {
2539 last_mrf_move[inst->dst.reg] = inst;
2540 }
2541 }
2542
2543 if (progress)
2544 invalidate_live_intervals();
2545
2546 return progress;
2547 }
2548
2549 static void
2550 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2551 int first_grf, int grf_len)
2552 {
2553 bool inst_16wide = (dispatch_width > 8 &&
2554 !inst->force_uncompressed &&
2555 !inst->force_sechalf);
2556
2557 /* Clear the flag for registers that actually got read (as expected). */
2558 for (int i = 0; i < 3; i++) {
2559 int grf;
2560 if (inst->src[i].file == GRF) {
2561 grf = inst->src[i].reg;
2562 } else if (inst->src[i].file == HW_REG &&
2563 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2564 grf = inst->src[i].fixed_hw_reg.nr;
2565 } else {
2566 continue;
2567 }
2568
2569 if (grf >= first_grf &&
2570 grf < first_grf + grf_len) {
2571 deps[grf - first_grf] = false;
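         /* A compressed (16-wide) access covers two registers. */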
2572 if (inst_16wide)
2573 deps[grf - first_grf + 1] = false;
2574 }
2575 }
2576 }
2577
2578 /**
2579 * Implements this workaround for the original 965:
2580 *
2581 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2582 * check for post destination dependencies on this instruction, software
2583 * must ensure that there is no destination hazard for the case of ‘write
2584 * followed by a posted write’ shown in the following example.
2585 *
2586 * 1. mov r3 0
2587 * 2. send r3.xy <rest of send instruction>
2588 * 3. mov r2 r3
2589 *
2590 * Due to no post-destination dependency check on the ‘send’, the above
2591 * code sequence could have two instructions (1 and 2) in flight at the
2592 * same time that both consider ‘r3’ as the target of their final writes.
2593 */
2594 void
2595 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2596 {
2597 int reg_size = dispatch_width / 8;
2598 int write_len = inst->regs_written * reg_size;
2599 int first_write_grf = inst->dst.reg;
2600 bool needs_dep[BRW_MAX_MRF];
2601 assert(write_len < (int)sizeof(needs_dep) - 1);
2602
2603 memset(needs_dep, false, sizeof(needs_dep));
2604 memset(needs_dep, true, write_len);
2605
2606 clear_deps_for_inst_src(inst, dispatch_width,
2607 needs_dep, first_write_grf, write_len);
2608
2609 /* Walk backwards looking for writes to registers we're writing which
2610 * aren't read since being written. If we hit the start of the program,
2611 * we assume that there are no outstanding dependencies on entry to the
2612 * program.
2613 */
2614 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2615 scan_inst != NULL;
2616 scan_inst = (fs_inst *)scan_inst->prev) {
2617
2618 /* If we hit control flow, assume that there *are* outstanding
2619 * dependencies, and force their cleanup before our instruction.
2620 */
2621 if (scan_inst->is_control_flow()) {
2622 for (int i = 0; i < write_len; i++) {
2623 if (needs_dep[i]) {
2624 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2625 }
2626 }
2627 return;
2628 }
2629
2630 bool scan_inst_16wide = (dispatch_width > 8 &&
2631 !scan_inst->force_uncompressed &&
2632 !scan_inst->force_sechalf);
2633
2634 /* We insert our reads as late as possible on the assumption that any
2635     * instruction other than a MOV that might have left us an outstanding
2636 * dependency has more latency than a MOV.
2637 */
2638 if (scan_inst->dst.file == GRF) {
2639 for (int i = 0; i < scan_inst->regs_written; i++) {
2640 int reg = scan_inst->dst.reg + i * reg_size;
2641
2642 if (reg >= first_write_grf &&
2643 reg < first_write_grf + write_len &&
2644 needs_dep[reg - first_write_grf]) {
2645 inst->insert_before(DEP_RESOLVE_MOV(reg));
2646 needs_dep[reg - first_write_grf] = false;
2647 if (scan_inst_16wide)
2648 needs_dep[reg - first_write_grf + 1] = false;
2649 }
2650 }
2651 }
2652
2653 /* Clear the flag for registers that actually got read (as expected). */
2654 clear_deps_for_inst_src(scan_inst, dispatch_width,
2655 needs_dep, first_write_grf, write_len);
2656
2657 /* Continue the loop only if we haven't resolved all the dependencies */
2658 int i;
2659 for (i = 0; i < write_len; i++) {
2660 if (needs_dep[i])
2661 break;
2662 }
2663 if (i == write_len)
2664 return;
2665 }
2666 }
2667
2668 /**
2669 * Implements this workaround for the original 965:
2670 *
2671 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2672 * used as a destination register until after it has been sourced by an
2673 * instruction with a different destination register.
2674 */
2675 void
2676 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2677 {
2678 int write_len = inst->regs_written * dispatch_width / 8;
2679 int first_write_grf = inst->dst.reg;
2680 bool needs_dep[BRW_MAX_MRF];
2681 assert(write_len < (int)sizeof(needs_dep) - 1);
2682
2683 memset(needs_dep, false, sizeof(needs_dep));
2684 memset(needs_dep, true, write_len);
2685 /* Walk forwards looking for writes to registers we're writing which aren't
2686 * read before being written.
2687 */
2688 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2689 !scan_inst->is_tail_sentinel();
2690 scan_inst = (fs_inst *)scan_inst->next) {
2691 /* If we hit control flow, force resolve all remaining dependencies. */
2692 if (scan_inst->is_control_flow()) {
2693 for (int i = 0; i < write_len; i++) {
2694 if (needs_dep[i])
2695 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2696 }
2697 return;
2698 }
2699
2700 /* Clear the flag for registers that actually got read (as expected). */
2701 clear_deps_for_inst_src(scan_inst, dispatch_width,
2702 needs_dep, first_write_grf, write_len);
2703
2704 /* We insert our reads as late as possible since they're reading the
2705 * result of a SEND, which has massive latency.
2706 */
2707 if (scan_inst->dst.file == GRF &&
2708 scan_inst->dst.reg >= first_write_grf &&
2709 scan_inst->dst.reg < first_write_grf + write_len &&
2710 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2711 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2712 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2713 }
2714
2715 /* Continue the loop only if we haven't resolved all the dependencies */
2716 int i;
2717 for (i = 0; i < write_len; i++) {
2718 if (needs_dep[i])
2719 break;
2720 }
2721 if (i == write_len)
2722 return;
2723 }
2724
2725 /* If we hit the end of the program, resolve all remaining dependencies out
2726 * of paranoia.
2727 */
2728 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2729 assert(last_inst->eot);
2730 for (int i = 0; i < write_len; i++) {
2731 if (needs_dep[i])
2732 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2733 }
2734 }
2735
2736 void
2737 fs_visitor::insert_gen4_send_dependency_workarounds()
2738 {
2739 if (brw->gen != 4 || brw->is_g4x)
2740 return;
2741
2742 /* Note that we're done with register allocation, so GRF fs_regs always
2743 * have a .reg_offset of 0.
2744 */
2745
2746 foreach_list_safe(node, &this->instructions) {
2747 fs_inst *inst = (fs_inst *)node;
2748
2749 if (inst->mlen != 0 && inst->dst.file == GRF) {
2750 insert_gen4_pre_send_dependency_workarounds(inst);
2751 insert_gen4_post_send_dependency_workarounds(inst);
2752 }
2753 }
2754 }
2755
2756 /**
2757 * Turns the generic expression-style uniform pull constant load instruction
2758 * into a hardware-specific series of instructions for loading a pull
2759 * constant.
2760 *
2761 * The expression style allows the CSE pass before this to optimize out
2762 * repeated loads from the same offset, and gives the pre-register-allocation
2763 * scheduling full flexibility, while the conversion to native instructions
2764 * allows the post-register-allocation scheduler the best information
2765 * possible.
2766 *
2767 * Note that execution masking for setting up pull constant loads is special:
2768 * the channels that need to be written are unrelated to the current execution
2769 * mask, since a later instruction will use one of the result channels as a
2770 * source operand for all 8 or 16 of its channels.
2771 */
2772 void
2773 fs_visitor::lower_uniform_pull_constant_loads()
2774 {
2775 foreach_list(node, &this->instructions) {
2776 fs_inst *inst = (fs_inst *)node;
2777
2778 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2779 continue;
2780
2781 if (brw->gen >= 7) {
2782 /* The offset arg before was a vec4-aligned byte offset. We need to
2783 * turn it into a dword offset.
2784 */
2785 fs_reg const_offset_reg = inst->src[1];
2786 assert(const_offset_reg.file == IMM &&
2787 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2788 const_offset_reg.imm.u /= 4;
2789 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2790
2791 /* This is actually going to be a MOV, but since only the first dword
2792 * is accessed, we have a special opcode to do just that one. Note
2793 * that this needs to be an operation that will be considered a def
2794 * by live variable analysis, or register allocation will explode.
2795 */
2796 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2797 payload, const_offset_reg);
2798 setup->force_writemask_all = true;
2799
2800 setup->ir = inst->ir;
2801 setup->annotation = inst->annotation;
2802 inst->insert_before(setup);
2803
2804 /* Similarly, this will only populate the first 4 channels of the
2805 * result register (since we only use smear values from 0-3), but we
2806 * don't tell the optimizer.
2807 */
2808 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2809 inst->src[1] = payload;
2810
2811 invalidate_live_intervals();
2812 } else {
2813 /* Before register allocation, we didn't tell the scheduler about the
2814 * MRF we use. We know it's safe to use this MRF because nothing
2815 * else does except for register spill/unspill, which generates and
2816 * uses its MRF within a single IR instruction.
2817 */
2818 inst->base_mrf = 14;
2819 inst->mlen = 1;
2820 }
2821 }
2822 }
2823
2824 void
2825 fs_visitor::dump_instruction(backend_instruction *be_inst)
2826 {
2827 fs_inst *inst = (fs_inst *)be_inst;
2828
2829 if (inst->predicate) {
2830 printf("(%cf0.%d) ",
2831 inst->predicate_inverse ? '-' : '+',
2832 inst->flag_subreg);
2833 }
2834
2835 printf("%s", brw_instruction_name(inst->opcode));
2836 if (inst->saturate)
2837 printf(".sat");
2838 if (inst->conditional_mod) {
2839 printf(".cmod");
2840 if (!inst->predicate &&
2841 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2842 inst->opcode != BRW_OPCODE_IF &&
2843 inst->opcode != BRW_OPCODE_WHILE))) {
2844 printf(".f0.%d", inst->flag_subreg);
2845 }
2846 }
2847 printf(" ");
2848
2849
2850 switch (inst->dst.file) {
2851 case GRF:
2852 printf("vgrf%d", inst->dst.reg);
2853 if (inst->dst.reg_offset)
2854 printf("+%d", inst->dst.reg_offset);
2855 break;
2856 case MRF:
2857 printf("m%d", inst->dst.reg);
2858 break;
2859 case BAD_FILE:
2860 printf("(null)");
2861 break;
2862 case UNIFORM:
2863 printf("***u%d***", inst->dst.reg);
2864 break;
2865 case HW_REG:
2866 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
2867 if (inst->dst.fixed_hw_reg.subnr)
2868 printf("+%d", inst->dst.fixed_hw_reg.subnr);
2869 break;
2870 default:
2871 printf("???");
2872 break;
2873 }
2874 printf(", ");
2875
2876 for (int i = 0; i < 3; i++) {
2877 if (inst->src[i].negate)
2878 printf("-");
2879 if (inst->src[i].abs)
2880 printf("|");
2881 switch (inst->src[i].file) {
2882 case GRF:
2883 printf("vgrf%d", inst->src[i].reg);
2884 if (inst->src[i].reg_offset)
2885 printf("+%d", inst->src[i].reg_offset);
2886 break;
2887 case MRF:
2888 printf("***m%d***", inst->src[i].reg);
2889 break;
2890 case UNIFORM:
2891 printf("u%d", inst->src[i].reg);
2892 if (inst->src[i].reg_offset)
2893 printf(".%d", inst->src[i].reg_offset);
2894 break;
2895 case BAD_FILE:
2896 printf("(null)");
2897 break;
2898 case IMM:
2899 switch (inst->src[i].type) {
2900 case BRW_REGISTER_TYPE_F:
2901 printf("%ff", inst->src[i].imm.f);
2902 break;
2903 case BRW_REGISTER_TYPE_D:
2904 printf("%dd", inst->src[i].imm.i);
2905 break;
2906 case BRW_REGISTER_TYPE_UD:
2907 printf("%uu", inst->src[i].imm.u);
2908 break;
2909 default:
2910 printf("???");
2911 break;
2912 }
2913 break;
2914 case HW_REG:
2915 if (inst->src[i].fixed_hw_reg.negate)
2916 printf("-");
2917 if (inst->src[i].fixed_hw_reg.abs)
2918 printf("|");
2919 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2920 if (inst->src[i].fixed_hw_reg.subnr)
2921 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
2922 if (inst->src[i].fixed_hw_reg.abs)
2923 printf("|");
2924 break;
2925 default:
2926 printf("???");
2927 break;
2928 }
2929 if (inst->src[i].abs)
2930 printf("|");
2931
2932       if (i < 2)
2933 printf(", ");
2934 }
2935
2936 printf(" ");
2937
2938 if (inst->force_uncompressed)
2939 printf("1sthalf ");
2940
2941 if (inst->force_sechalf)
2942 printf("2ndhalf ");
2943
2944 printf("\n");
2945 }
2946
2947 /**
2948 * Possibly returns an instruction that set up @param reg.
2949 *
2950 * Sometimes we want to take the result of some expression/variable
2951 * dereference tree and rewrite the instruction generating the result
2952 * of the tree. When processing the tree, we know that the
2953 * instructions generated are all writing temporaries that are dead
2954 * outside of this tree. So, if we have some instructions that write
2955 * a temporary, we're free to point that temp write somewhere else.
2956 *
2957  * Note that this doesn't guarantee that the returned instruction wrote
2958  * only reg -- it might be the size=4 destination of a texture instruction.
2959 */
2960 fs_inst *
2961 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2962 fs_inst *end,
2963 fs_reg reg)
2964 {
2965 if (end == start ||
2966 end->is_partial_write() ||
2967 reg.reladdr ||
2968 !reg.equals(end->dst)) {
2969 return NULL;
2970 } else {
2971 return end;
2972 }
2973 }
2974
2975 void
2976 fs_visitor::setup_payload_gen6()
2977 {
2978 bool uses_depth =
2979 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2980 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2981
2982 assert(brw->gen >= 6);
2983
2984 /* R0-1: masks, pixel X/Y coordinates. */
2985 c->nr_payload_regs = 2;
2986    /* R2: only for 32-pixel dispatch. */
2987
2988 /* R3-26: barycentric interpolation coordinates. These appear in the
2989 * same order that they appear in the brw_wm_barycentric_interp_mode
2990 * enum. Each set of coordinates occupies 2 registers if dispatch width
2991 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2992 * appear if they were enabled using the "Barycentric Interpolation
2993 * Mode" bits in WM_STATE.
2994 */
2995 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2996 if (barycentric_interp_modes & (1 << i)) {
2997 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2998 c->nr_payload_regs += 2;
2999 if (dispatch_width == 16) {
3000 c->nr_payload_regs += 2;
3001 }
3002 }
3003 }
3004
3005 /* R27: interpolated depth if uses source depth */
3006 if (uses_depth) {
3007 c->source_depth_reg = c->nr_payload_regs;
3008 c->nr_payload_regs++;
3009 if (dispatch_width == 16) {
3010 /* R28: interpolated depth if not 8-wide. */
3011 c->nr_payload_regs++;
3012 }
3013 }
3014 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3015 if (uses_depth) {
3016 c->source_w_reg = c->nr_payload_regs;
3017 c->nr_payload_regs++;
3018 if (dispatch_width == 16) {
3019 /* R30: interpolated W if not 8-wide. */
3020 c->nr_payload_regs++;
3021 }
3022 }
3023 /* R31: MSAA position offsets. */
3024 /* R32-: bary for 32-pixel. */
3025 /* R58-59: interp W for 32-pixel. */
3026
3027 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3028 c->source_depth_to_render_target = true;
3029 }
3030 }
3031
3032 void
3033 fs_visitor::assign_binding_table_offsets()
3034 {
3035 uint32_t next_binding_table_offset = 0;
3036
3037 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3038 next_binding_table_offset += c->key.nr_color_regions;
3039
3040 assign_common_binding_table_offsets(next_binding_table_offset);
3041 }
3042
3043 bool
3044 fs_visitor::run()
3045 {
3046 sanity_param_count = fp->Base.Parameters->NumParameters;
3047 uint32_t orig_nr_params = c->prog_data.nr_params;
3048
3049 assign_binding_table_offsets();
3050
3051 if (brw->gen >= 6)
3052 setup_payload_gen6();
3053 else
3054 setup_payload_gen4();
3055
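   /* Debugging aid: flipping the 0 to 1 compiles a trivial dummy shader
    * instead of the real program.
    */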
3056 if (0) {
3057 emit_dummy_fs();
3058 } else {
3059 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3060 emit_shader_time_begin();
3061
3062 calculate_urb_setup();
3063 if (fp->Base.InputsRead > 0) {
3064 if (brw->gen < 6)
3065 emit_interpolation_setup_gen4();
3066 else
3067 emit_interpolation_setup_gen6();
3068 }
3069
3070 /* We handle discards by keeping track of the still-live pixels in f0.1.
3071 * Initialize it with the dispatched pixels.
3072 */
3073 if (fp->UsesKill) {
3074 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3075 discard_init->flag_subreg = 1;
3076 }
3077
3078       /* Generate FS IR for main().  (The visitor only descends into
3079        * functions called "main".)
3080 */
3081 if (shader) {
3082 foreach_list(node, &*shader->ir) {
3083 ir_instruction *ir = (ir_instruction *)node;
3084 base_ir = ir;
3085 this->result = reg_undef;
3086 ir->accept(this);
3087 }
3088 } else {
3089 emit_fragment_program_code();
3090 }
3091 base_ir = NULL;
3092 if (failed)
3093 return false;
3094
3095 emit(FS_OPCODE_PLACEHOLDER_HALT);
3096
3097 emit_fb_writes();
3098
3099 split_virtual_grfs();
3100
3101 move_uniform_array_access_to_pull_constants();
3102 remove_dead_constants();
3103 setup_pull_constants();
3104
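      /* Run the optimization passes to a fixed point; each pass can expose
       * more work for the others.
       */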
3105 bool progress;
3106 do {
3107 progress = false;
3108
3109 compact_virtual_grfs();
3110
3111 progress = remove_duplicate_mrf_writes() || progress;
3112
3113 progress = opt_algebraic() || progress;
3114 progress = opt_cse() || progress;
3115 progress = opt_copy_propagate() || progress;
3116 progress = dead_code_eliminate() || progress;
3117 progress = dead_code_eliminate_local() || progress;
3118 progress = register_coalesce() || progress;
3119 progress = register_coalesce_2() || progress;
3120 progress = compute_to_mrf() || progress;
3121 } while (progress);
3122
3123 schedule_instructions(false);
3124
3125 lower_uniform_pull_constant_loads();
3126
3127 assign_curb_setup();
3128 assign_urb_setup();
3129
3130 if (0)
3131 assign_regs_trivial();
3132 else {
3133 while (!assign_regs()) {
3134 if (failed)
3135 break;
3136 }
3137 }
3138 }
3139 assert(force_uncompressed_stack == 0);
3140 assert(force_sechalf_stack == 0);
3141
3142 /* This must come after all optimization and register allocation, since
3143 * it inserts dead code that happens to have side effects, and it does
3144 * so based on the actual physical registers in use.
3145 */
3146 insert_gen4_send_dependency_workarounds();
3147
3148 if (failed)
3149 return false;
3150
3151 schedule_instructions(true);
3152
3153 if (dispatch_width == 8) {
3154 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3155 } else {
3156 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3157
3158 /* Make sure we didn't try to sneak in an extra uniform */
3159 assert(orig_nr_params == c->prog_data.nr_params);
3160 (void) orig_nr_params;
3161 }
3162
3163 /* If any state parameters were appended, then ParameterValues could have
3164 * been realloced, in which case the driver uniform storage set up by
3165 * _mesa_associate_uniform_storage() would point to freed memory. Make
3166 * sure that didn't happen.
3167 */
3168 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3169
3170 return !failed;
3171 }
3172
3173 const unsigned *
3174 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3175 struct gl_fragment_program *fp,
3176 struct gl_shader_program *prog,
3177 unsigned *final_assembly_size)
3178 {
3179 bool start_busy = false;
3180 float start_time = 0;
3181
3182 if (unlikely(brw->perf_debug)) {
3183 start_busy = (brw->batch.last_bo &&
3184 drm_intel_bo_busy(brw->batch.last_bo));
3185 start_time = get_time();
3186 }
3187
3188 struct brw_shader *shader = NULL;
3189 if (prog)
3190 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3191
3192 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3193 if (prog) {
3194 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3195 _mesa_print_ir(shader->ir, NULL);
3196 printf("\n\n");
3197 } else {
3198 printf("ARB_fragment_program %d ir for native fragment shader\n",
3199 fp->Base.Id);
3200 _mesa_print_program(&fp->Base);
3201 }
3202 }
3203
3204 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3205 */
3206 fs_visitor v(brw, c, prog, fp, 8);
3207 if (!v.run()) {
3208 if (prog) {
3209 prog->LinkStatus = false;
3210 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3211 }
3212
3213 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3214 v.fail_msg);
3215
3216 return NULL;
3217 }
3218
3219 exec_list *simd16_instructions = NULL;
3220 fs_visitor v2(brw, c, prog, fp, 16);
3221 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3222 if (c->prog_data.nr_pull_params == 0) {
3223 /* Try a 16-wide compile */
3224 v2.import_uniforms(&v);
3225 if (!v2.run()) {
3226 perf_debug("16-wide shader failed to compile, falling back to "
3227 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3228 } else {
3229 simd16_instructions = &v2.instructions;
3230 }
3231 } else {
3232 perf_debug("Skipping 16-wide due to pull parameters.\n");
3233 }
3234 }
3235
3236 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3237 const unsigned *generated = g.generate_assembly(&v.instructions,
3238 simd16_instructions,
3239 final_assembly_size);
3240
3241 if (unlikely(brw->perf_debug) && shader) {
3242 if (shader->compiled_once)
3243 brw_wm_debug_recompile(brw, prog, &c->key);
3244 shader->compiled_once = true;
3245
3246 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3247 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3248 (get_time() - start_time) * 1000);
3249 }
3250 }
3251
3252 return generated;
3253 }
3254
3255 bool
3256 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3257 {
3258 struct brw_context *brw = brw_context(ctx);
3259 struct brw_wm_prog_key key;
3260
3261 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3262 return true;
3263
3264 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3265 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3266 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3267 bool program_uses_dfdy = fp->UsesDFdy;
3268
3269 memset(&key, 0, sizeof(key));
3270
3271 if (brw->gen < 6) {
3272 if (fp->UsesKill)
3273 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3274
3275 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3276 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3277
3278 /* Just assume depth testing. */
3279 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3280 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3281 }
3282
3283 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3284 BRW_FS_VARYING_INPUT_MASK) > 16)
3285 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3286
3287 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3288
3289 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3290 for (unsigned i = 0; i < sampler_count; i++) {
3291 if (fp->Base.ShadowSamplers & (1 << i)) {
3292 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3293 key.tex.swizzles[i] =
3294 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3295 } else {
3296 /* Color sampler: assume no swizzling. */
3297 key.tex.swizzles[i] = SWIZZLE_XYZW;
3298 }
3299 }
3300
3301 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3302 key.drawable_height = ctx->DrawBuffer->Height;
3303 }
3304
3305 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3306 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3307 }
3308
3309 key.nr_color_regions = 1;
3310
3311 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3312 * quality of the derivatives is likely to be determined by the driconf
3313 * option.
3314 */
3315 key.high_quality_derivatives = brw->disable_derivative_optimization;
3316
3317 key.program_string_id = bfp->id;
3318
3319 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3320 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3321
3322 bool success = do_wm_prog(brw, prog, bfp, &key);
3323
3324 brw->wm.base.prog_offset = old_prog_offset;
3325 brw->wm.prog_data = old_prog_data;
3326
3327 return success;
3328 }