src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->opcode = BRW_OPCODE_NOP;
59 this->conditional_mod = BRW_CONDITIONAL_NONE;
60
61 this->dst = reg_undef;
62 this->src[0] = reg_undef;
63 this->src[1] = reg_undef;
64 this->src[2] = reg_undef;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68 }
69
70 fs_inst::fs_inst()
71 {
72 init();
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187 ALU2(SEL)
188
189 /** Gen4 predicated IF. */
190 fs_inst *
191 fs_visitor::IF(uint32_t predicate)
192 {
193 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195 return inst;
196 }
197
198 /** Gen6 IF with embedded comparison. */
199 fs_inst *
200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
201 {
202 assert(brw->gen == 6);
203 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
204 reg_null_d, src0, src1);
205 inst->conditional_mod = condition;
206 return inst;
207 }
208
209 /**
210 * CMP: Sets the low bit of the destination channels with the result
211 * of the comparison, while the upper bits are undefined, and updates
212 * the flag register with the packed 16 bits of the result.
213 */
214 fs_inst *
215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
216 {
217 fs_inst *inst;
218
219 /* Take the instruction:
220 *
221 * CMP null<d> src0<f> src1<f>
222 *
223 * Original gen4 does type conversion to the destination type before
224 * comparison, producing garbage results for floating point comparisons.
225 * gen5 does the comparison on the execution type (resolved source types),
226 * so dst type doesn't matter. gen6 does comparison and then uses the
227 * result as if it was the dst type with no conversion, which happens to
228 * mostly work out for float-interpreted-as-int since our comparisons are
229 * for >0, =0, <0.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
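
/* Illustrative use of the helper above (a sketch, not code from this file):
 * a floating-point comparison whose result only feeds predication could be
 * emitted as
 *
 *    emit(CMP(reg_null_d, src_a, src_b, BRW_CONDITIONAL_GE));
 *
 * where src_a and src_b are hypothetical fs_regs.  On gen4 the helper
 * retypes the destination to src0's type, so the comparison is not routed
 * through an integer conversion, per the comment above.
 */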
245
246 exec_list
247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
248 fs_reg varying_offset,
249 uint32_t const_offset)
250 {
251 exec_list instructions;
252 fs_inst *inst;
253
254 /* We have our constant surface use a pitch of 4 bytes, so our index can
255 * be any component of a vector, and then we load 4 contiguous
256 * components starting from that.
257 *
258 * We break down the const_offset to a portion added to the variable
259 * offset and a portion done using reg_offset, which means that if you
260 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
261 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
262 * CSE can later notice that those loads are all the same and eliminate
263 * the redundant ones.
264 */
265 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
266 instructions.push_tail(ADD(vec4_offset,
267 varying_offset, const_offset & ~3));
268
269 int scale = 1;
270 if (brw->gen == 4 && dispatch_width == 8) {
271 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
272 * u, v, r) as parameters, or we can just use the SIMD16 message
273 * consisting of (header, u). We choose the second, at the cost of a
274 * longer return length.
275 */
276 scale = 2;
277 }
278
279 enum opcode op;
280 if (brw->gen >= 7)
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
282 else
283 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
284 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
285 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
286 inst->regs_written = 4 * scale;
287 instructions.push_tail(inst);
288
289 if (brw->gen < 7) {
290 inst->base_mrf = 13;
291 inst->header_present = true;
292 if (brw->gen == 4)
293 inst->mlen = 3;
294 else
295 inst->mlen = 1 + dispatch_width / 8;
296 }
297
298 vec4_result.reg_offset += (const_offset & 3) * scale;
299 instructions.push_tail(MOV(dst, vec4_result));
300
301 return instructions;
302 }
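
/* Worked example of the const_offset split above (illustrative numbers):
 * with const_offset == 6, the vec4-aligned part (6 & ~3 == 4) is added to
 * varying_offset before the load, and the remaining component (6 & 3 == 2)
 * is selected afterwards by bumping vec4_result.reg_offset by 2 * scale.
 */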
303
304 /**
305  * A helper that generates a MOV to work around broken hardware SEND dependency
306  * handling.
307 */
308 fs_inst *
309 fs_visitor::DEP_RESOLVE_MOV(int grf)
310 {
311 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
312
313 inst->ir = NULL;
314 inst->annotation = "send dependency resolve";
315
316 /* The caller always wants uncompressed to emit the minimal extra
317 * dependencies, and to avoid having to deal with aligning its regs to 2.
318 */
319 inst->force_uncompressed = true;
320
321 return inst;
322 }
323
324 bool
325 fs_inst::equals(fs_inst *inst)
326 {
327 return (opcode == inst->opcode &&
328 dst.equals(inst->dst) &&
329 src[0].equals(inst->src[0]) &&
330 src[1].equals(inst->src[1]) &&
331 src[2].equals(inst->src[2]) &&
332 saturate == inst->saturate &&
333 predicate == inst->predicate &&
334 conditional_mod == inst->conditional_mod &&
335 mlen == inst->mlen &&
336 base_mrf == inst->base_mrf &&
337 sampler == inst->sampler &&
338 target == inst->target &&
339 eot == inst->eot &&
340 header_present == inst->header_present &&
341 shadow_compare == inst->shadow_compare &&
342 offset == inst->offset);
343 }
344
345 bool
346 fs_inst::overwrites_reg(const fs_reg &reg)
347 {
348 return (reg.file == dst.file &&
349 reg.reg == dst.reg &&
350 reg.reg_offset >= dst.reg_offset &&
351 reg.reg_offset < dst.reg_offset + regs_written);
352 }
353
354 bool
355 fs_inst::is_send_from_grf()
356 {
357 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
358 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
359 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
360 src[1].file == GRF) ||
361 (is_tex() && src[0].file == GRF));
362 }
363
364 bool
365 fs_visitor::can_do_source_mods(fs_inst *inst)
366 {
367 if (brw->gen == 6 && inst->is_math())
368 return false;
369
370 if (inst->is_send_from_grf())
371 return false;
372
373 if (!inst->can_do_source_mods())
374 return false;
375
376 return true;
377 }
378
379 void
380 fs_reg::init()
381 {
382 memset(this, 0, sizeof(*this));
383 stride = 1;
384 }
385
386 /** Generic unset register constructor. */
387 fs_reg::fs_reg()
388 {
389 init();
390 this->file = BAD_FILE;
391 }
392
393 /** Immediate value constructor. */
394 fs_reg::fs_reg(float f)
395 {
396 init();
397 this->file = IMM;
398 this->type = BRW_REGISTER_TYPE_F;
399 this->imm.f = f;
400 }
401
402 /** Immediate value constructor. */
403 fs_reg::fs_reg(int32_t i)
404 {
405 init();
406 this->file = IMM;
407 this->type = BRW_REGISTER_TYPE_D;
408 this->imm.i = i;
409 }
410
411 /** Immediate value constructor. */
412 fs_reg::fs_reg(uint32_t u)
413 {
414 init();
415 this->file = IMM;
416 this->type = BRW_REGISTER_TYPE_UD;
417 this->imm.u = u;
418 }
419
420 /** Fixed brw_reg. */
421 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
422 {
423 init();
424 this->file = HW_REG;
425 this->fixed_hw_reg = fixed_hw_reg;
426 this->type = fixed_hw_reg.type;
427 }
428
429 bool
430 fs_reg::equals(const fs_reg &r) const
431 {
432 return (file == r.file &&
433 reg == r.reg &&
434 reg_offset == r.reg_offset &&
435 subreg_offset == r.subreg_offset &&
436 type == r.type &&
437 negate == r.negate &&
438 abs == r.abs &&
439 !reladdr && !r.reladdr &&
440 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
441 sizeof(fixed_hw_reg)) == 0 &&
442 stride == r.stride &&
443 imm.u == r.imm.u);
444 }
445
446 fs_reg
447 fs_reg::retype(uint32_t type)
448 {
449 fs_reg result = *this;
450 result.type = type;
451 return result;
452 }
453
454 fs_reg &
455 fs_reg::apply_stride(unsigned stride)
456 {
457 assert((this->stride * stride) <= 4 &&
458 (is_power_of_two(stride) || stride == 0) &&
459 file != HW_REG && file != IMM);
460 this->stride *= stride;
461 return *this;
462 }
463
464 fs_reg &
465 fs_reg::set_smear(unsigned subreg)
466 {
467 assert(file != HW_REG && file != IMM);
468 subreg_offset = subreg * type_sz(type);
469 stride = 0;
470 return *this;
471 }
472
473 bool
474 fs_reg::is_contiguous() const
475 {
476 return stride == 1;
477 }
478
479 bool
480 fs_reg::is_zero() const
481 {
482 if (file != IMM)
483 return false;
484
485 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
486 }
487
488 bool
489 fs_reg::is_one() const
490 {
491 if (file != IMM)
492 return false;
493
494 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
495 }
496
497 bool
498 fs_reg::is_null() const
499 {
500 return file == HW_REG &&
501 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
502 fixed_hw_reg.nr == BRW_ARF_NULL;
503 }
504
505 bool
506 fs_reg::is_valid_3src() const
507 {
508 return file == GRF || file == UNIFORM;
509 }
510
511 int
512 fs_visitor::type_size(const struct glsl_type *type)
513 {
514 unsigned int size, i;
515
516 switch (type->base_type) {
517 case GLSL_TYPE_UINT:
518 case GLSL_TYPE_INT:
519 case GLSL_TYPE_FLOAT:
520 case GLSL_TYPE_BOOL:
521 return type->components();
522 case GLSL_TYPE_ARRAY:
523 return type_size(type->fields.array) * type->length;
524 case GLSL_TYPE_STRUCT:
525 size = 0;
526 for (i = 0; i < type->length; i++) {
527 size += type_size(type->fields.structure[i].type);
528 }
529 return size;
530 case GLSL_TYPE_SAMPLER:
531 /* Samplers take up no register space, since they're baked in at
532 * link time.
533 */
534 return 0;
535 case GLSL_TYPE_ATOMIC_UINT:
536 return 0;
537 case GLSL_TYPE_IMAGE:
538 case GLSL_TYPE_VOID:
539 case GLSL_TYPE_ERROR:
540 case GLSL_TYPE_INTERFACE:
541 assert(!"not reached");
542 break;
543 }
544
545 return 0;
546 }
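
/* Example of the slot counting above (derived from the switch, not from any
 * particular shader): a "uniform vec4 a[20]" occupies 4 * 20 == 80
 * components, a struct { vec3 v; float f; } occupies 3 + 1 == 4, and a
 * sampler2D contributes 0 since samplers are baked in at link time.
 */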
547
548 fs_reg
549 fs_visitor::get_timestamp()
550 {
551 assert(brw->gen >= 7);
552
553 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
554 BRW_ARF_TIMESTAMP,
555 0),
556 BRW_REGISTER_TYPE_UD));
557
558 fs_reg dst = fs_reg(this, glsl_type::uint_type);
559
560 fs_inst *mov = emit(MOV(dst, ts));
561 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
562 * even if it's not enabled in the dispatch.
563 */
564 mov->force_writemask_all = true;
565 mov->force_uncompressed = true;
566
567 /* The caller wants the low 32 bits of the timestamp. Since it's running
568    * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
569 * which is plenty of time for our purposes. It is identical across the
570 * EUs, but since it's tracking GPU core speed it will increment at a
571 * varying rate as render P-states change.
572 *
573 * The caller could also check if render P-states have changed (or anything
574 * else that might disrupt timing) by setting smear to 2 and checking if
575 * that field is != 0.
576 */
577 dst.set_smear(0);
578
579 return dst;
580 }
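
/* Rollover arithmetic behind the comment above: a 32-bit counter ticking at
 * roughly 1.2 GHz wraps after about 2^32 / 1.2e9 seconds, i.e. ~3.6 seconds,
 * which is where the "every ~3 seconds" estimate comes from.
 */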
581
582 void
583 fs_visitor::emit_shader_time_begin()
584 {
585 current_annotation = "shader time start";
586 shader_start_time = get_timestamp();
587 }
588
589 void
590 fs_visitor::emit_shader_time_end()
591 {
592 current_annotation = "shader time end";
593
594 enum shader_time_shader_type type, written_type, reset_type;
595 if (dispatch_width == 8) {
596 type = ST_FS8;
597 written_type = ST_FS8_WRITTEN;
598 reset_type = ST_FS8_RESET;
599 } else {
600 assert(dispatch_width == 16);
601 type = ST_FS16;
602 written_type = ST_FS16_WRITTEN;
603 reset_type = ST_FS16_RESET;
604 }
605
606 fs_reg shader_end_time = get_timestamp();
607
608 /* Check that there weren't any timestamp reset events (assuming these
609 * were the only two timestamp reads that happened).
610 */
611 fs_reg reset = shader_end_time;
612 reset.set_smear(2);
613 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
614 test->conditional_mod = BRW_CONDITIONAL_Z;
615 emit(IF(BRW_PREDICATE_NORMAL));
616
617 push_force_uncompressed();
618 fs_reg start = shader_start_time;
619 start.negate = true;
620 fs_reg diff = fs_reg(this, glsl_type::uint_type);
621 emit(ADD(diff, start, shader_end_time));
622
623 /* If there were no instructions between the two timestamp gets, the diff
624 * is 2 cycles. Remove that overhead, so I can forget about that when
625 * trying to determine the time taken for single instructions.
626 */
627 emit(ADD(diff, diff, fs_reg(-2u)));
628
629 emit_shader_time_write(type, diff);
630 emit_shader_time_write(written_type, fs_reg(1u));
631 emit(BRW_OPCODE_ELSE);
632 emit_shader_time_write(reset_type, fs_reg(1u));
633 emit(BRW_OPCODE_ENDIF);
634
635 pop_force_uncompressed();
636 }
637
638 void
639 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
640 fs_reg value)
641 {
642 int shader_time_index =
643 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
644 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
645
646 fs_reg payload;
647 if (dispatch_width == 8)
648 payload = fs_reg(this, glsl_type::uvec2_type);
649 else
650 payload = fs_reg(this, glsl_type::uint_type);
651
652 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
653 fs_reg(), payload, offset, value));
654 }
655
656 void
657 fs_visitor::fail(const char *format, ...)
658 {
659 va_list va;
660 char *msg;
661
662 if (failed)
663 return;
664
665 failed = true;
666
667 va_start(va, format);
668 msg = ralloc_vasprintf(mem_ctx, format, va);
669 va_end(va);
670 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
671
672 this->fail_msg = msg;
673
674 if (INTEL_DEBUG & DEBUG_WM) {
675 fprintf(stderr, "%s", msg);
676 }
677 }
678
679 fs_inst *
680 fs_visitor::emit(enum opcode opcode)
681 {
682 return emit(fs_inst(opcode));
683 }
684
685 fs_inst *
686 fs_visitor::emit(enum opcode opcode, fs_reg dst)
687 {
688 return emit(fs_inst(opcode, dst));
689 }
690
691 fs_inst *
692 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
693 {
694 return emit(fs_inst(opcode, dst, src0));
695 }
696
697 fs_inst *
698 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
699 {
700 return emit(fs_inst(opcode, dst, src0, src1));
701 }
702
703 fs_inst *
704 fs_visitor::emit(enum opcode opcode, fs_reg dst,
705 fs_reg src0, fs_reg src1, fs_reg src2)
706 {
707 return emit(fs_inst(opcode, dst, src0, src1, src2));
708 }
709
710 void
711 fs_visitor::push_force_uncompressed()
712 {
713 force_uncompressed_stack++;
714 }
715
716 void
717 fs_visitor::pop_force_uncompressed()
718 {
719 force_uncompressed_stack--;
720 assert(force_uncompressed_stack >= 0);
721 }
722
723 /**
724 * Returns true if the instruction has a flag that means it won't
725 * update an entire destination register.
726 *
727 * For example, dead code elimination and live variable analysis want to know
728 * when a write to a variable screens off any preceding values that were in
729 * it.
730 */
731 bool
732 fs_inst::is_partial_write()
733 {
734 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
735 this->force_uncompressed ||
736 this->force_sechalf || !this->dst.is_contiguous());
737 }
738
739 int
740 fs_inst::regs_read(fs_visitor *v, int arg)
741 {
742 if (is_tex() && arg == 0 && src[0].file == GRF) {
743 if (v->dispatch_width == 16)
744 return (mlen + 1) / 2;
745 else
746 return mlen;
747 }
748 return 1;
749 }
750
751 bool
752 fs_inst::reads_flag()
753 {
754 return predicate;
755 }
756
757 bool
758 fs_inst::writes_flag()
759 {
760 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
761 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
762 }
763
764 /**
765 * Returns how many MRFs an FS opcode will write over.
766 *
767 * Note that this is not the 0 or 1 implied writes in an actual gen
768 * instruction -- the FS opcodes often generate MOVs in addition.
769 */
770 int
771 fs_visitor::implied_mrf_writes(fs_inst *inst)
772 {
773 if (inst->mlen == 0)
774 return 0;
775
776 if (inst->base_mrf == -1)
777 return 0;
778
779 switch (inst->opcode) {
780 case SHADER_OPCODE_RCP:
781 case SHADER_OPCODE_RSQ:
782 case SHADER_OPCODE_SQRT:
783 case SHADER_OPCODE_EXP2:
784 case SHADER_OPCODE_LOG2:
785 case SHADER_OPCODE_SIN:
786 case SHADER_OPCODE_COS:
787 return 1 * dispatch_width / 8;
788 case SHADER_OPCODE_POW:
789 case SHADER_OPCODE_INT_QUOTIENT:
790 case SHADER_OPCODE_INT_REMAINDER:
791 return 2 * dispatch_width / 8;
792 case SHADER_OPCODE_TEX:
793 case FS_OPCODE_TXB:
794 case SHADER_OPCODE_TXD:
795 case SHADER_OPCODE_TXF:
796 case SHADER_OPCODE_TXF_CMS:
797 case SHADER_OPCODE_TXF_MCS:
798 case SHADER_OPCODE_TG4:
799 case SHADER_OPCODE_TG4_OFFSET:
800 case SHADER_OPCODE_TXL:
801 case SHADER_OPCODE_TXS:
802 case SHADER_OPCODE_LOD:
803 return 1;
804 case FS_OPCODE_FB_WRITE:
805 return 2;
806 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
807 case SHADER_OPCODE_GEN4_SCRATCH_READ:
808 return 1;
809 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
810 return inst->mlen;
811 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
812 return 2;
813 case SHADER_OPCODE_UNTYPED_ATOMIC:
814 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
815 return 0;
816 default:
817 assert(!"not reached");
818 return inst->mlen;
819 }
820 }
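
/* Examples of the accounting above: a SIMD8 SIN uses 1 MRF and the SIMD16
 * version uses 2 (1 * dispatch_width / 8), while POW and the INT DIV
 * opcodes need twice that, since they pass two operands through the MRF
 * payload.
 */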
821
822 int
823 fs_visitor::virtual_grf_alloc(int size)
824 {
825 if (virtual_grf_array_size <= virtual_grf_count) {
826 if (virtual_grf_array_size == 0)
827 virtual_grf_array_size = 16;
828 else
829 virtual_grf_array_size *= 2;
830 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
831 virtual_grf_array_size);
832 }
833 virtual_grf_sizes[virtual_grf_count] = size;
834 return virtual_grf_count++;
835 }
836
837 /** Fixed HW reg constructor. */
838 fs_reg::fs_reg(enum register_file file, int reg)
839 {
840 init();
841 this->file = file;
842 this->reg = reg;
843 this->type = BRW_REGISTER_TYPE_F;
844 }
845
846 /** Fixed HW reg constructor. */
847 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
848 {
849 init();
850 this->file = file;
851 this->reg = reg;
852 this->type = type;
853 }
854
855 /** Automatic reg constructor. */
856 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
857 {
858 init();
859
860 this->file = GRF;
861 this->reg = v->virtual_grf_alloc(v->type_size(type));
862 this->reg_offset = 0;
863 this->type = brw_type_for_base_type(type);
864 }
865
866 fs_reg *
867 fs_visitor::variable_storage(ir_variable *var)
868 {
869 return (fs_reg *)hash_table_find(this->variable_ht, var);
870 }
871
872 void
873 import_uniforms_callback(const void *key,
874 void *data,
875 void *closure)
876 {
877 struct hash_table *dst_ht = (struct hash_table *)closure;
878 const fs_reg *reg = (const fs_reg *)data;
879
880 if (reg->file != UNIFORM)
881 return;
882
883 hash_table_insert(dst_ht, data, key);
884 }
885
886 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
887  * This brings in those uniform definitions.
888 */
889 void
890 fs_visitor::import_uniforms(fs_visitor *v)
891 {
892 hash_table_call_foreach(v->variable_ht,
893 import_uniforms_callback,
894 variable_ht);
895 this->params_remap = v->params_remap;
896 this->nr_params_remap = v->nr_params_remap;
897 }
898
899 /* Our support for uniforms is piggy-backed on the struct
900 * gl_fragment_program, because that's where the values actually
901 * get stored, rather than in some global gl_shader_program uniform
902 * store.
903 */
904 void
905 fs_visitor::setup_uniform_values(ir_variable *ir)
906 {
907 int namelen = strlen(ir->name);
908
909 /* The data for our (non-builtin) uniforms is stored in a series of
910 * gl_uniform_driver_storage structs for each subcomponent that
911 * glGetUniformLocation() could name. We know it's been set up in the same
912 * order we'd walk the type, so walk the list of storage and find anything
913 * with our name, or the prefix of a component that starts with our name.
914 */
915 unsigned params_before = c->prog_data.nr_params;
916 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
917 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
918
919 if (strncmp(ir->name, storage->name, namelen) != 0 ||
920 (storage->name[namelen] != 0 &&
921 storage->name[namelen] != '.' &&
922 storage->name[namelen] != '[')) {
923 continue;
924 }
925
926 unsigned slots = storage->type->component_slots();
927 if (storage->array_elements)
928 slots *= storage->array_elements;
929
930 for (unsigned i = 0; i < slots; i++) {
931 c->prog_data.param[c->prog_data.nr_params++] =
932 &storage->storage[i].f;
933 }
934 }
935
936 /* Make sure we actually initialized the right amount of stuff here. */
937 assert(params_before + ir->type->component_slots() ==
938 c->prog_data.nr_params);
939 (void)params_before;
940 }
941
942
943 /* Our support for builtin uniforms is even scarier than non-builtin.
944 * It sits on top of the PROG_STATE_VAR parameters that are
945 * automatically updated from GL context state.
946 */
947 void
948 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
949 {
950 const ir_state_slot *const slots = ir->state_slots;
951 assert(ir->state_slots != NULL);
952
953 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
954 /* This state reference has already been setup by ir_to_mesa, but we'll
955 * get the same index back here.
956 */
957 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
958 (gl_state_index *)slots[i].tokens);
959
960 /* Add each of the unique swizzles of the element as a parameter.
961 * This'll end up matching the expected layout of the
962 * array/matrix/structure we're trying to fill in.
963 */
964 int last_swiz = -1;
965 for (unsigned int j = 0; j < 4; j++) {
966 int swiz = GET_SWZ(slots[i].swizzle, j);
967 if (swiz == last_swiz)
968 break;
969 last_swiz = swiz;
970
971 c->prog_data.param[c->prog_data.nr_params++] =
972 &fp->Base.Parameters->ParameterValues[index][swiz].f;
973 }
974 }
975 }
976
977 fs_reg *
978 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
979 {
980 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
981 fs_reg wpos = *reg;
982 bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
983
984 /* gl_FragCoord.x */
985 if (ir->data.pixel_center_integer) {
986 emit(MOV(wpos, this->pixel_x));
987 } else {
988 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
989 }
990 wpos.reg_offset++;
991
992 /* gl_FragCoord.y */
993 if (!flip && ir->data.pixel_center_integer) {
994 emit(MOV(wpos, this->pixel_y));
995 } else {
996 fs_reg pixel_y = this->pixel_y;
997 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
998
999 if (flip) {
1000 pixel_y.negate = true;
1001 offset += c->key.drawable_height - 1.0;
1002 }
1003
1004 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1005 }
1006 wpos.reg_offset++;
1007
1008 /* gl_FragCoord.z */
1009 if (brw->gen >= 6) {
1010 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
1011 } else {
1012 emit(FS_OPCODE_LINTERP, wpos,
1013 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1014 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1015 interp_reg(VARYING_SLOT_POS, 2));
1016 }
1017 wpos.reg_offset++;
1018
1019 /* gl_FragCoord.w: Already set up in emit_interpolation */
1020 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1021
1022 return reg;
1023 }
1024
1025 fs_inst *
1026 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1027 glsl_interp_qualifier interpolation_mode,
1028 bool is_centroid, bool is_sample)
1029 {
1030 brw_wm_barycentric_interp_mode barycoord_mode;
1031 if (brw->gen >= 6) {
1032 if (is_centroid) {
1033 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1034 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1035 else
1036 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1037 } else if (is_sample) {
1038 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1039 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1040 else
1041 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1042 } else {
1043 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1044 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1045 else
1046 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1047 }
1048 } else {
1049 /* On Ironlake and below, there is only one interpolation mode.
1050 * Centroid interpolation doesn't mean anything on this hardware --
1051 * there is no multisampling.
1052 */
1053 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1054 }
1055 return emit(FS_OPCODE_LINTERP, attr,
1056 this->delta_x[barycoord_mode],
1057 this->delta_y[barycoord_mode], interp);
1058 }
1059
1060 fs_reg *
1061 fs_visitor::emit_general_interpolation(ir_variable *ir)
1062 {
1063 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1064 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1065 fs_reg attr = *reg;
1066
1067 unsigned int array_elements;
1068 const glsl_type *type;
1069
1070 if (ir->type->is_array()) {
1071 array_elements = ir->type->length;
1072 if (array_elements == 0) {
1073 fail("dereferenced array '%s' has length 0\n", ir->name);
1074 }
1075 type = ir->type->fields.array;
1076 } else {
1077 array_elements = 1;
1078 type = ir->type;
1079 }
1080
1081 glsl_interp_qualifier interpolation_mode =
1082 ir->determine_interpolation_mode(c->key.flat_shade);
1083
1084 int location = ir->data.location;
1085 for (unsigned int i = 0; i < array_elements; i++) {
1086 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1087 if (c->prog_data.urb_setup[location] == -1) {
1088 /* If there's no incoming setup data for this slot, don't
1089 * emit interpolation for it.
1090 */
1091 attr.reg_offset += type->vector_elements;
1092 location++;
1093 continue;
1094 }
1095
1096 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1097 /* Constant interpolation (flat shading) case. The SF has
1098 * handed us defined values in only the constant offset
1099 * field of the setup reg.
1100 */
1101 for (unsigned int k = 0; k < type->vector_elements; k++) {
1102 struct brw_reg interp = interp_reg(location, k);
1103 interp = suboffset(interp, 3);
1104 interp.type = reg->type;
1105 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1106 attr.reg_offset++;
1107 }
1108 } else {
1109 /* Smooth/noperspective interpolation case. */
1110 for (unsigned int k = 0; k < type->vector_elements; k++) {
1111 struct brw_reg interp = interp_reg(location, k);
1112 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1113 ir->data.centroid && !c->key.persample_shading,
1114 ir->data.sample || c->key.persample_shading);
1115 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1116 /* Get the pixel/sample mask into f0 so that we know
1117 * which pixels are lit. Then, for each channel that is
1118 * unlit, replace the centroid data with non-centroid
1119 * data.
1120 */
1121 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1122 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1123 interpolation_mode,
1124 false, false);
1125 inst->predicate = BRW_PREDICATE_NORMAL;
1126 inst->predicate_inverse = true;
1127 }
1128 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1129 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1130 }
1131 attr.reg_offset++;
1132 }
1133
1134 }
1135 location++;
1136 }
1137 }
1138
1139 return reg;
1140 }
1141
1142 fs_reg *
1143 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1144 {
1145 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1146
1147 /* The frontfacing comes in as a bit in the thread payload. */
1148 if (brw->gen >= 6) {
1149 emit(BRW_OPCODE_ASR, *reg,
1150 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1151 fs_reg(15));
1152 emit(BRW_OPCODE_NOT, *reg, *reg);
1153 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1154 } else {
1155 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1156 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1157     * us front face.
1158 */
1159 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1160 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1161 }
1162
1163 return reg;
1164 }
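
/* What the two paths above compute, spelled out (illustrative): the gen6+
 * ASR/NOT/AND chain evaluates ((~(R0.0 >> 15)) & 1), so gl_FrontFacing is
 * 1 exactly when payload bit 15 is clear.  On the gen4/5 path, an R1.6
 * value with bit 31 set ("primitive is back face") fails the unsigned
 * "< (1 << 31)" test, so the CMP/AND pair likewise yields 0 for back faces.
 */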
1165
1166 void
1167 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1168 {
1169 assert(dst.type == BRW_REGISTER_TYPE_F);
1170
1171 if (c->key.compute_pos_offset) {
1172 /* Convert int_sample_pos to floating point */
1173 emit(MOV(dst, int_sample_pos));
1174 /* Scale to the range [0, 1] */
1175 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1176 }
1177 else {
1178 /* From ARB_sample_shading specification:
1179 * "When rendering to a non-multisample buffer, or if multisample
1180 * rasterization is disabled, gl_SamplePosition will always be
1181       *  (0.5, 0.5)."
1182 */
1183 emit(MOV(dst, fs_reg(0.5f)));
1184 }
1185 }
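
/* A worked example of the scaling above (assuming the payload bytes are in
 * 1/16-pixel units, which is what the 1/16.0f factor implies): an X offset
 * byte of 8 becomes 8 * (1/16.0) == 0.5, i.e. a sample centered in the
 * pixel.
 */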
1186
1187 fs_reg *
1188 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1189 {
1190 assert(brw->gen >= 6);
1191 assert(ir->type == glsl_type::vec2_type);
1192
1193 this->current_annotation = "compute sample position";
1194 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1195 fs_reg pos = *reg;
1196 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1197 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1198
1199 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1200 * mode will be enabled.
1201 *
1202 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1203 * R31.1:0 Position Offset X/Y for Slot[3:0]
1204 * R31.3:2 Position Offset X/Y for Slot[7:4]
1205 * .....
1206 *
1207 * The X, Y sample positions come in as bytes in thread payload. So, read
1208 * the positions using vstride=16, width=8, hstride=2.
1209 */
1210 struct brw_reg sample_pos_reg =
1211 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1212 BRW_REGISTER_TYPE_B), 16, 8, 2);
1213
1214 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1215 if (dispatch_width == 16) {
1216 fs_inst *inst = emit(MOV(half(int_sample_x, 1),
1217 fs_reg(suboffset(sample_pos_reg, 16))));
1218 inst->force_sechalf = true;
1219 }
1220 /* Compute gl_SamplePosition.x */
1221 compute_sample_position(pos, int_sample_x);
1222 pos.reg_offset++;
1223 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1224 if (dispatch_width == 16) {
1225 fs_inst *inst = emit(MOV(half(int_sample_y, 1),
1226 fs_reg(suboffset(sample_pos_reg, 17))));
1227 inst->force_sechalf = true;
1228 }
1229 /* Compute gl_SamplePosition.y */
1230 compute_sample_position(pos, int_sample_y);
1231 return reg;
1232 }
1233
1234 fs_reg *
1235 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1236 {
1237 assert(brw->gen >= 6);
1238
1239 this->current_annotation = "compute sample id";
1240 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1241
1242 if (c->key.compute_sample_id) {
1243 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1244 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1245 t2.type = BRW_REGISTER_TYPE_UW;
1246
1247 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1248 * 8x multisampling, subspan 0 will represent sample N (where N
1249 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1250 * 7. We can find the value of N by looking at R0.0 bits 7:6
1251 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1252 * (since samples are always delivered in pairs). That is, we
1253 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1254 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1255 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1256 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1257 * populating a temporary variable with the sequence (0, 1, 2, 3),
1258 * and then reading from it using vstride=1, width=4, hstride=0.
1259 * These computations hold good for 4x multisampling as well.
1260 */
1261 emit(BRW_OPCODE_AND, t1,
1262 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1263 fs_reg(brw_imm_d(0xc0)));
1264 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1265 /* This works for both SIMD8 and SIMD16 */
1266 emit(MOV(t2, brw_imm_v(0x3210)));
1267 /* This special instruction takes care of setting vstride=1,
1268 * width=4, hstride=0 of t2 during an ADD instruction.
1269 */
1270 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1271 } else {
1272 /* As per GL_ARB_sample_shading specification:
1273 * "When rendering to a non-multisample buffer, or if multisample
1274 * rasterization is disabled, gl_SampleID will always be zero."
1275 */
1276 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1277 }
1278
1279 return reg;
1280 }
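
/* Worked example of the SSPI math in the comment above (illustrative
 * values): if R0.0 bits 7:6 read 2, then (R0.0 & 0xc0) >> 5 == 4, and
 * adding the (0, 0, 0, 0, 1, 1, 1, 1) sequence held in t2 gives SIMD8
 * sample IDs of 4, 4, 4, 4, 5, 5, 5, 5 for the two subspans.
 */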
1281
1282 fs_reg *
1283 fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
1284 {
1285 assert(brw->gen >= 7);
1286 this->current_annotation = "compute gl_SampleMaskIn";
1287 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1288 emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
1289 return reg;
1290 }
1291
1292 fs_reg
1293 fs_visitor::fix_math_operand(fs_reg src)
1294 {
1295 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1296 * might be able to do better by doing execsize = 1 math and then
1297 * expanding that result out, but we would need to be careful with
1298 * masking.
1299 *
1300 * The hardware ignores source modifiers (negate and abs) on math
1301 * instructions, so we also move to a temp to set those up.
1302 */
1303 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1304 !src.abs && !src.negate)
1305 return src;
1306
1307 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1308 * operands to math
1309 */
1310 if (brw->gen >= 7 && src.file != IMM)
1311 return src;
1312
1313 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1314 expanded.type = src.type;
1315 emit(BRW_OPCODE_MOV, expanded, src);
1316 return expanded;
1317 }
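
/* Example of when the expansion above kicks in (hypothetical operand): on
 * gen7, emit_math(SHADER_OPCODE_SQRT, dst, fs_reg(4.0f)) cannot feed the
 * immediate straight into the math instruction, so a MOV into a float
 * temporary is emitted first and the temporary is used as the source.
 */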
1318
1319 fs_inst *
1320 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1321 {
1322 switch (opcode) {
1323 case SHADER_OPCODE_RCP:
1324 case SHADER_OPCODE_RSQ:
1325 case SHADER_OPCODE_SQRT:
1326 case SHADER_OPCODE_EXP2:
1327 case SHADER_OPCODE_LOG2:
1328 case SHADER_OPCODE_SIN:
1329 case SHADER_OPCODE_COS:
1330 break;
1331 default:
1332 assert(!"not reached: bad math opcode");
1333 return NULL;
1334 }
1335
1336 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1337 * might be able to do better by doing execsize = 1 math and then
1338 * expanding that result out, but we would need to be careful with
1339 * masking.
1340 *
1341 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1342 * instructions, so we also move to a temp to set those up.
1343 */
1344 if (brw->gen >= 6)
1345 src = fix_math_operand(src);
1346
1347 fs_inst *inst = emit(opcode, dst, src);
1348
1349 if (brw->gen < 6) {
1350 inst->base_mrf = 2;
1351 inst->mlen = dispatch_width / 8;
1352 }
1353
1354 return inst;
1355 }
1356
1357 fs_inst *
1358 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1359 {
1360 int base_mrf = 2;
1361 fs_inst *inst;
1362
1363 switch (opcode) {
1364 case SHADER_OPCODE_INT_QUOTIENT:
1365 case SHADER_OPCODE_INT_REMAINDER:
1366 if (brw->gen >= 7 && dispatch_width == 16)
1367 fail("SIMD16 INTDIV unsupported\n");
1368 break;
1369 case SHADER_OPCODE_POW:
1370 break;
1371 default:
1372 assert(!"not reached: unsupported binary math opcode.");
1373 return NULL;
1374 }
1375
1376 if (brw->gen >= 6) {
1377 src0 = fix_math_operand(src0);
1378 src1 = fix_math_operand(src1);
1379
1380 inst = emit(opcode, dst, src0, src1);
1381 } else {
1382 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1383 * "Message Payload":
1384 *
1385 * "Operand0[7]. For the INT DIV functions, this operand is the
1386 * denominator."
1387 * ...
1388 * "Operand1[7]. For the INT DIV functions, this operand is the
1389 * numerator."
1390 */
1391 bool is_int_div = opcode != SHADER_OPCODE_POW;
1392 fs_reg &op0 = is_int_div ? src1 : src0;
1393 fs_reg &op1 = is_int_div ? src0 : src1;
1394
1395 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1396 inst = emit(opcode, dst, op0, reg_null_f);
1397
1398 inst->base_mrf = base_mrf;
1399 inst->mlen = 2 * dispatch_width / 8;
1400 }
1401 return inst;
1402 }
1403
1404 void
1405 fs_visitor::assign_curb_setup()
1406 {
1407 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1408 if (dispatch_width == 8) {
1409 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1410 } else {
1411 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1412 }
1413
1414 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1415 foreach_list(node, &this->instructions) {
1416 fs_inst *inst = (fs_inst *)node;
1417
1418 for (unsigned int i = 0; i < 3; i++) {
1419 if (inst->src[i].file == UNIFORM) {
1420 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1421 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1422 constant_nr / 8,
1423 constant_nr % 8);
1424
1425 inst->src[i].file = HW_REG;
1426 inst->src[i].fixed_hw_reg = byte_offset(
1427 retype(brw_reg, inst->src[i].type),
1428 inst->src[i].subreg_offset);
1429 }
1430 }
1431 }
1432 }
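
/* Worked example of the UNIFORM -> CURB mapping above (illustrative
 * numbers): with nr_payload_regs == 2, uniform element 11 gives
 * constant_nr == 11 and maps to g3.3 (register 2 + 11 / 8, component
 * 11 % 8) before the retype and subreg_offset fixups are applied.
 */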
1433
1434 void
1435 fs_visitor::calculate_urb_setup()
1436 {
1437 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1438 c->prog_data.urb_setup[i] = -1;
1439 }
1440
1441 int urb_next = 0;
1442 /* Figure out where each of the incoming setup attributes lands. */
1443 if (brw->gen >= 6) {
1444 if (_mesa_bitcount_64(fp->Base.InputsRead &
1445 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1446 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1447 * first 16 varying inputs, so we can put them wherever we want.
1448 * Just put them in order.
1449 *
1450 * This is useful because it means that (a) inputs not used by the
1451 * fragment shader won't take up valuable register space, and (b) we
1452 * won't have to recompile the fragment shader if it gets paired with
1453 * a different vertex (or geometry) shader.
1454 */
1455 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1456 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1457 BITFIELD64_BIT(i)) {
1458 c->prog_data.urb_setup[i] = urb_next++;
1459 }
1460 }
1461 } else {
1462 /* We have enough input varyings that the SF/SBE pipeline stage can't
1463 * arbitrarily rearrange them to suit our whim; we have to put them
1464 * in an order that matches the output of the previous pipeline stage
1465 * (geometry or vertex shader).
1466 */
1467 struct brw_vue_map prev_stage_vue_map;
1468 brw_compute_vue_map(brw, &prev_stage_vue_map,
1469 c->key.input_slots_valid);
1470 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1471 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1472 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1473 slot++) {
1474 int varying = prev_stage_vue_map.slot_to_varying[slot];
1475 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1476 * unused.
1477 */
1478 if (varying != BRW_VARYING_SLOT_COUNT &&
1479 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1480 BITFIELD64_BIT(varying))) {
1481 c->prog_data.urb_setup[varying] = slot - first_slot;
1482 }
1483 }
1484 urb_next = prev_stage_vue_map.num_slots - first_slot;
1485 }
1486 } else {
1487 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1488 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1489 /* Point size is packed into the header, not as a general attribute */
1490 if (i == VARYING_SLOT_PSIZ)
1491 continue;
1492
1493 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1494 /* The back color slot is skipped when the front color is
1495 * also written to. In addition, some slots can be
1496 * written in the vertex shader and not read in the
1497 * fragment shader. So the register number must always be
1498 * incremented, mapped or not.
1499 */
1500 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1501 c->prog_data.urb_setup[i] = urb_next;
1502 urb_next++;
1503 }
1504 }
1505
1506 /*
1507     * It's an FS-only attribute, and we did interpolation for this attribute
1508     * in the SF thread. So, count it here, too.
1509 *
1510 * See compile_sf_prog() for more info.
1511 */
1512 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1513 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1514 }
1515
1516 c->prog_data.num_varying_inputs = urb_next;
1517 }
1518
1519 void
1520 fs_visitor::assign_urb_setup()
1521 {
1522 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1523
1524 /* Offset all the urb_setup[] index by the actual position of the
1525 * setup regs, now that the location of the constants has been chosen.
1526 */
1527 foreach_list(node, &this->instructions) {
1528 fs_inst *inst = (fs_inst *)node;
1529
1530 if (inst->opcode == FS_OPCODE_LINTERP) {
1531 assert(inst->src[2].file == HW_REG);
1532 inst->src[2].fixed_hw_reg.nr += urb_start;
1533 }
1534
1535 if (inst->opcode == FS_OPCODE_CINTERP) {
1536 assert(inst->src[0].file == HW_REG);
1537 inst->src[0].fixed_hw_reg.nr += urb_start;
1538 }
1539 }
1540
1541 /* Each attribute is 4 setup channels, each of which is half a reg. */
1542 this->first_non_payload_grf =
1543 urb_start + c->prog_data.num_varying_inputs * 2;
1544 }
1545
1546 /**
1547 * Split large virtual GRFs into separate components if we can.
1548 *
1549 * This is mostly duplicated with what brw_fs_vector_splitting does,
1550 * but that's really conservative because it's afraid of doing
1551 * splitting that doesn't result in real progress after the rest of
1552 * the optimization phases, which would cause infinite looping in
1553 * optimization. We can do it once here, safely. This also has the
1554 * opportunity to split interpolated values, or maybe even uniforms,
1555 * which we don't have at the IR level.
1556 *
1557 * We want to split, because virtual GRFs are what we register
1558 * allocate and spill (due to contiguousness requirements for some
1559 * instructions), and they're what we naturally generate in the
1560 * codegen process, but most virtual GRFs don't actually need to be
1561 * contiguous sets of GRFs. If we split, we'll end up with reduced
1562 * live intervals and better dead code elimination and coalescing.
1563 */
1564 void
1565 fs_visitor::split_virtual_grfs()
1566 {
1567 int num_vars = this->virtual_grf_count;
1568 bool split_grf[num_vars];
1569 int new_virtual_grf[num_vars];
1570
1571 /* Try to split anything > 0 sized. */
1572 for (int i = 0; i < num_vars; i++) {
1573 if (this->virtual_grf_sizes[i] != 1)
1574 split_grf[i] = true;
1575 else
1576 split_grf[i] = false;
1577 }
1578
1579 if (brw->has_pln &&
1580 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1581 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1582 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1583 * Gen6, that was the only supported interpolation mode, and since Gen6,
1584 * delta_x and delta_y are in fixed hardware registers.
1585 */
1586 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1587 false;
1588 }
1589
1590 foreach_list(node, &this->instructions) {
1591 fs_inst *inst = (fs_inst *)node;
1592
1593 /* If there's a SEND message that requires contiguous destination
1594 * registers, no splitting is allowed.
1595 */
1596 if (inst->regs_written > 1) {
1597 split_grf[inst->dst.reg] = false;
1598 }
1599
1600 /* If we're sending from a GRF, don't split it, on the assumption that
1601 * the send is reading the whole thing.
1602 */
1603 if (inst->is_send_from_grf()) {
1604 for (int i = 0; i < 3; i++) {
1605 if (inst->src[i].file == GRF) {
1606 split_grf[inst->src[i].reg] = false;
1607 }
1608 }
1609 }
1610 }
1611
1612 /* Allocate new space for split regs. Note that the virtual
1613 * numbers will be contiguous.
1614 */
1615 for (int i = 0; i < num_vars; i++) {
1616 if (split_grf[i]) {
1617 new_virtual_grf[i] = virtual_grf_alloc(1);
1618 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1619 int reg = virtual_grf_alloc(1);
1620 assert(reg == new_virtual_grf[i] + j - 1);
1621 (void) reg;
1622 }
1623 this->virtual_grf_sizes[i] = 1;
1624 }
1625 }
1626
1627 foreach_list(node, &this->instructions) {
1628 fs_inst *inst = (fs_inst *)node;
1629
1630 if (inst->dst.file == GRF &&
1631 split_grf[inst->dst.reg] &&
1632 inst->dst.reg_offset != 0) {
1633 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1634 inst->dst.reg_offset - 1);
1635 inst->dst.reg_offset = 0;
1636 }
1637 for (int i = 0; i < 3; i++) {
1638 if (inst->src[i].file == GRF &&
1639 split_grf[inst->src[i].reg] &&
1640 inst->src[i].reg_offset != 0) {
1641 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1642 inst->src[i].reg_offset - 1);
1643 inst->src[i].reg_offset = 0;
1644 }
1645 }
1646 }
1647 invalidate_live_intervals();
1648 }
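
/* Concrete effect of the splitting above (illustrative register numbers):
 * a size-4 virtual GRF, say vgrf5, keeps vgrf5 for reg_offset 0 and gets
 * three newly allocated size-1 registers for offsets 1..3; an instruction
 * that used vgrf5 with reg_offset 2 is rewritten to use
 * new_virtual_grf[5] + 1 with reg_offset 0.
 */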
1649
1650 /**
1651 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1652 *
1653 * During code generation, we create tons of temporary variables, many of
1654 * which get immediately killed and are never used again. Yet, in later
1655 * optimization and analysis passes, such as compute_live_intervals, we need
1656 * to loop over all the virtual GRFs. Compacting them can save a lot of
1657 * overhead.
1658 */
1659 void
1660 fs_visitor::compact_virtual_grfs()
1661 {
1662 /* Mark which virtual GRFs are used, and count how many. */
1663 int remap_table[this->virtual_grf_count];
1664 memset(remap_table, -1, sizeof(remap_table));
1665
1666 foreach_list(node, &this->instructions) {
1667 const fs_inst *inst = (const fs_inst *) node;
1668
1669 if (inst->dst.file == GRF)
1670 remap_table[inst->dst.reg] = 0;
1671
1672 for (int i = 0; i < 3; i++) {
1673 if (inst->src[i].file == GRF)
1674 remap_table[inst->src[i].reg] = 0;
1675 }
1676 }
1677
1678 /* In addition to registers used in instructions, fs_visitor keeps
1679 * direct references to certain special values which must be patched:
1680 */
1681 fs_reg *special[] = {
1682 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1683 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1684 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1685 &delta_x[0], &delta_x[1], &delta_x[2],
1686 &delta_x[3], &delta_x[4], &delta_x[5],
1687 &delta_y[0], &delta_y[1], &delta_y[2],
1688 &delta_y[3], &delta_y[4], &delta_y[5],
1689 };
1690 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1691 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1692
1693 /* Treat all special values as used, to be conservative */
1694 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1695 if (special[i]->file == GRF)
1696 remap_table[special[i]->reg] = 0;
1697 }
1698
1699 /* Compact the GRF arrays. */
1700 int new_index = 0;
1701 for (int i = 0; i < this->virtual_grf_count; i++) {
1702 if (remap_table[i] != -1) {
1703 remap_table[i] = new_index;
1704 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1705 invalidate_live_intervals();
1706 ++new_index;
1707 }
1708 }
1709
1710 this->virtual_grf_count = new_index;
1711
1712 /* Patch all the instructions to use the newly renumbered registers */
1713 foreach_list(node, &this->instructions) {
1714 fs_inst *inst = (fs_inst *) node;
1715
1716 if (inst->dst.file == GRF)
1717 inst->dst.reg = remap_table[inst->dst.reg];
1718
1719 for (int i = 0; i < 3; i++) {
1720 if (inst->src[i].file == GRF)
1721 inst->src[i].reg = remap_table[inst->src[i].reg];
1722 }
1723 }
1724
1725 /* Patch all the references to special values */
1726 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1727 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1728 special[i]->reg = remap_table[special[i]->reg];
1729 }
1730 }
1731
1732 bool
1733 fs_visitor::remove_dead_constants()
1734 {
1735 if (dispatch_width == 8) {
1736 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1737 this->nr_params_remap = c->prog_data.nr_params;
1738
1739 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1740 this->params_remap[i] = -1;
1741
1742 /* Find which params are still in use. */
1743 foreach_list(node, &this->instructions) {
1744 fs_inst *inst = (fs_inst *)node;
1745
1746 for (int i = 0; i < 3; i++) {
1747 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1748
1749 if (inst->src[i].file != UNIFORM)
1750 continue;
1751
1752 /* Section 5.11 of the OpenGL 4.3 spec says:
1753 *
1754 * "Out-of-bounds reads return undefined values, which include
1755 * values from other variables of the active program or zero."
1756 */
1757 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1758 constant_nr = 0;
1759 }
1760
1761 /* For now, set this to non-negative. We'll give it the
1762 * actual new number in a moment, in order to keep the
1763 * register numbers nicely ordered.
1764 */
1765 this->params_remap[constant_nr] = 0;
1766 }
1767 }
1768
1769 /* Figure out what the new numbers for the params will be. At some
1770 * point when we're doing uniform array access, we're going to want
1771 * to keep the distinction between .reg and .reg_offset, but for
1772 * now we don't care.
1773 */
1774 unsigned int new_nr_params = 0;
1775 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1776 if (this->params_remap[i] != -1) {
1777 this->params_remap[i] = new_nr_params++;
1778 }
1779 }
1780
1781 /* Update the list of params to be uploaded to match our new numbering. */
1782 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1783 int remapped = this->params_remap[i];
1784
1785 if (remapped == -1)
1786 continue;
1787
1788 c->prog_data.param[remapped] = c->prog_data.param[i];
1789 }
1790
1791 c->prog_data.nr_params = new_nr_params;
1792 } else {
1793 /* This should have been generated in the SIMD8 pass already. */
1794 assert(this->params_remap);
1795 }
1796
1797 /* Now do the renumbering of the shader to remove unused params. */
1798 foreach_list(node, &this->instructions) {
1799 fs_inst *inst = (fs_inst *)node;
1800
1801 for (int i = 0; i < 3; i++) {
1802 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1803
1804 if (inst->src[i].file != UNIFORM)
1805 continue;
1806
1807          /* As above, alias out-of-bounds reads to constant 0. */
1808 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1809 constant_nr = 0;
1810 }
1811 assert(this->params_remap[constant_nr] != -1);
1812 inst->src[i].reg = this->params_remap[constant_nr];
1813 inst->src[i].reg_offset = 0;
1814 }
1815 }
1816
1817 return true;
1818 }
1819
1820 /*
1821 * Implements array access of uniforms by inserting a
1822 * PULL_CONSTANT_LOAD instruction.
1823 *
1824 * Unlike temporary GRF array access (where we don't support it due to
1825 * the difficulty of doing relative addressing on instruction
1826 * destinations), we could potentially do array access of uniforms
1827 * that were loaded in GRF space as push constants. In real-world
1828 * usage we've seen, though, the arrays being used are always larger
1829 * than we could load as push constants, so just always move all
1830 * uniform array access out to a pull constant buffer.
1831 */
1832 void
1833 fs_visitor::move_uniform_array_access_to_pull_constants()
1834 {
1835 int pull_constant_loc[c->prog_data.nr_params];
1836
1837 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1838 pull_constant_loc[i] = -1;
1839 }
1840
1841 /* Walk through and find array access of uniforms. Put a copy of that
1842 * uniform in the pull constant buffer.
1843 *
1844 * Note that we don't move constant-indexed accesses to arrays. No
1845 * testing has been done of the performance impact of this choice.
1846 */
1847 foreach_list_safe(node, &this->instructions) {
1848 fs_inst *inst = (fs_inst *)node;
1849
1850 for (int i = 0 ; i < 3; i++) {
1851 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1852 continue;
1853
1854 int uniform = inst->src[i].reg;
1855
1856 /* If this array isn't already present in the pull constant buffer,
1857 * add it.
1858 */
1859 if (pull_constant_loc[uniform] == -1) {
1860 const float **values = &c->prog_data.param[uniform];
1861
1862 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1863
1864 assert(param_size[uniform]);
1865
1866 for (int j = 0; j < param_size[uniform]; j++) {
1867 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1868 values[j];
1869 }
1870 }
1871
1872          /* Set up the annotation tracking for newly generated instructions. */
1873 base_ir = inst->ir;
1874 current_annotation = inst->annotation;
1875
1876 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1877 fs_reg temp = fs_reg(this, glsl_type::float_type);
1878 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1879 surf_index,
1880 *inst->src[i].reladdr,
1881 pull_constant_loc[uniform] +
1882 inst->src[i].reg_offset);
1883 inst->insert_before(&list);
1884
1885 inst->src[i].file = temp.file;
1886 inst->src[i].reg = temp.reg;
1887 inst->src[i].reg_offset = temp.reg_offset;
1888 inst->src[i].reladdr = NULL;
1889 }
1890 }
1891 }
1892
1893 /**
1894 * Choose accesses from the UNIFORM file to demote to using the pull
1895 * constant buffer.
1896 *
1897 * We allow a fragment shader to use more than the specified minimum
1898 * maximum number of fragment shader uniform components (64).  If too
1899 * many are in use, they would fill up all of the register space.
1900 * So, this will push some of them out to the pull constant buffer and
1901 * update the program to load them.
1902 */
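/* Sketch of the effect (hypothetical numbering): with a 128-component push
 * limit, params 0..127 stay in push constant registers.  A use of, say,
 * param 130 that ends up as pull param 2 is rewritten to a
 * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD of the 16-byte-aligned block
 * containing it, and the reading instruction's source is smeared to dword
 * 2 (pull_index & 3) of the loaded register.
 */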
1903 void
1904 fs_visitor::setup_pull_constants()
1905 {
1906 /* Only allow 16 registers (128 uniform components) as push constants. */
1907 unsigned int max_uniform_components = 16 * 8;
1908 if (c->prog_data.nr_params <= max_uniform_components)
1909 return;
1910
1911 if (dispatch_width == 16) {
1912 fail("Pull constants not supported in SIMD16\n");
1913 return;
1914 }
1915
1916 /* Just demote the end of the list. We could probably do better
1917 * here, demoting things that are rarely used in the program first.
1918 */
1919 unsigned int pull_uniform_base = max_uniform_components;
1920
1921 int pull_constant_loc[c->prog_data.nr_params];
1922 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1923 if (i < pull_uniform_base) {
1924 pull_constant_loc[i] = -1;
1925 } else {
1926 pull_constant_loc[i] = -1;
1927 /* If our constant is already being uploaded for reladdr purposes,
1928 * reuse it.
1929 */
1930 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1931 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1932 pull_constant_loc[i] = j;
1933 break;
1934 }
1935 }
1936 if (pull_constant_loc[i] == -1) {
1937 int pull_index = c->prog_data.nr_pull_params++;
1938 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1939 pull_constant_loc[i] = pull_index;
1940 }
1941 }
1942 }
1943 c->prog_data.nr_params = pull_uniform_base;
1944
1945 foreach_list(node, &this->instructions) {
1946 fs_inst *inst = (fs_inst *)node;
1947
1948 for (int i = 0; i < 3; i++) {
1949 if (inst->src[i].file != UNIFORM)
1950 continue;
1951
1952 int pull_index = pull_constant_loc[inst->src[i].reg +
1953 inst->src[i].reg_offset];
1954 if (pull_index == -1)
1955 continue;
1956
1957 assert(!inst->src[i].reladdr);
1958
1959 fs_reg dst = fs_reg(this, glsl_type::float_type);
1960 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1961 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1962 fs_inst *pull =
1963 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1964 dst, index, offset);
1965 pull->ir = inst->ir;
1966 pull->annotation = inst->annotation;
1967
1968 inst->insert_before(pull);
1969
1970 inst->src[i].file = GRF;
1971 inst->src[i].reg = dst.reg;
1972 inst->src[i].reg_offset = 0;
1973 inst->src[i].set_smear(pull_index & 3);
1974 }
1975 }
1976 }
1977
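/**
 * Performs simple algebraic simplifications on the IR, e.g. (illustrative
 * cases matching the switch below):
 *
 *    mul vgrf4:F, vgrf3:F, 1.0f   ->  mov vgrf4:F, vgrf3:F
 *    add vgrf4:F, vgrf3:F, 0.0f   ->  mov vgrf4:F, vgrf3:F
 *
 * The resulting MOVs are then cleaned up by later passes such as copy
 * propagation and register coalescing.
 */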
1978 bool
1979 fs_visitor::opt_algebraic()
1980 {
1981 bool progress = false;
1982
1983 foreach_list(node, &this->instructions) {
1984 fs_inst *inst = (fs_inst *)node;
1985
1986 switch (inst->opcode) {
1987 case BRW_OPCODE_MUL:
1988 if (inst->src[1].file != IMM)
1989 continue;
1990
1991 /* a * 1.0 = a */
1992 if (inst->src[1].is_one()) {
1993 inst->opcode = BRW_OPCODE_MOV;
1994 inst->src[1] = reg_undef;
1995 progress = true;
1996 break;
1997 }
1998
1999 /* a * 0.0 = 0.0 */
2000 if (inst->src[1].is_zero()) {
2001 inst->opcode = BRW_OPCODE_MOV;
2002 inst->src[0] = inst->src[1];
2003 inst->src[1] = reg_undef;
2004 progress = true;
2005 break;
2006 }
2007
2008 break;
2009 case BRW_OPCODE_ADD:
2010 if (inst->src[1].file != IMM)
2011 continue;
2012
2013 /* a + 0.0 = a */
2014 if (inst->src[1].is_zero()) {
2015 inst->opcode = BRW_OPCODE_MOV;
2016 inst->src[1] = reg_undef;
2017 progress = true;
2018 break;
2019 }
2020 break;
2021 case BRW_OPCODE_OR:
2022 if (inst->src[0].equals(inst->src[1])) {
2023 inst->opcode = BRW_OPCODE_MOV;
2024 inst->src[1] = reg_undef;
2025 progress = true;
2026 break;
2027 }
2028 break;
2029 case BRW_OPCODE_LRP:
2030 if (inst->src[1].equals(inst->src[2])) {
2031 inst->opcode = BRW_OPCODE_MOV;
2032 inst->src[0] = inst->src[1];
2033 inst->src[1] = reg_undef;
2034 inst->src[2] = reg_undef;
2035 progress = true;
2036 break;
2037 }
2038 break;
2039 case BRW_OPCODE_SEL:
2040 if (inst->saturate && inst->src[1].file == IMM) {
2041 switch (inst->conditional_mod) {
2042 case BRW_CONDITIONAL_LE:
2043 case BRW_CONDITIONAL_L:
2044 switch (inst->src[1].type) {
2045 case BRW_REGISTER_TYPE_F:
2046 if (inst->src[1].imm.f >= 1.0f) {
2047 inst->opcode = BRW_OPCODE_MOV;
2048 inst->src[1] = reg_undef;
2049 progress = true;
2050 }
2051 break;
2052 default:
2053 break;
2054 }
2055 break;
2056 case BRW_CONDITIONAL_GE:
2057 case BRW_CONDITIONAL_G:
2058 switch (inst->src[1].type) {
2059 case BRW_REGISTER_TYPE_F:
2060 if (inst->src[1].imm.f <= 0.0f) {
2061 inst->opcode = BRW_OPCODE_MOV;
2062 inst->src[1] = reg_undef;
2063 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2064 progress = true;
2065 }
2066 break;
2067 default:
2068 break;
2069 }
2070 default:
2071 break;
2072 }
2073 }
2074 break;
2075 default:
2076 break;
2077 }
2078 }
2079
2080 return progress;
2081 }
2082
2083 /**
2084 * Removes any instructions writing a VGRF where that VGRF is not used by any
2085 * later instruction.
2086 */
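/* For example (hypothetical), a "mov vgrf5, vgrf2" whose destination is
 * never read by any later instruction is simply deleted, while a MACH,
 * ADDC, or SUBB in the same situation only has its destination replaced
 * with the null register, since its implicit accumulator write must be
 * preserved.
 */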
2087 bool
2088 fs_visitor::dead_code_eliminate()
2089 {
2090 bool progress = false;
2091 int pc = 0;
2092
2093 calculate_live_intervals();
2094
2095 foreach_list_safe(node, &this->instructions) {
2096 fs_inst *inst = (fs_inst *)node;
2097
2098 if (inst->dst.file == GRF && !inst->has_side_effects()) {
2099 bool dead = true;
2100
2101 for (int i = 0; i < inst->regs_written; i++) {
2102 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2103 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2104 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2105 dead = false;
2106 break;
2107 }
2108 }
2109
2110 if (dead) {
2111 /* Don't dead code eliminate instructions that write to the
2112 * accumulator as a side-effect. Instead just set the destination
2113 * to the null register to free it.
2114 */
2115 switch (inst->opcode) {
2116 case BRW_OPCODE_ADDC:
2117 case BRW_OPCODE_SUBB:
2118 case BRW_OPCODE_MACH:
2119 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2120 break;
2121 default:
2122 inst->remove();
2123 progress = true;
2124 break;
2125 }
2126 }
2127 }
2128
2129 pc++;
2130 }
2131
2132 if (progress)
2133 invalidate_live_intervals();
2134
2135 return progress;
2136 }
2137
2138 struct dead_code_hash_key
2139 {
2140 int vgrf;
2141 int reg_offset;
2142 };
2143
2144 static bool
2145 dead_code_hash_compare(const void *a, const void *b)
2146 {
2147 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2148 }
2149
2150 static void
2151 clear_dead_code_hash(struct hash_table *ht)
2152 {
2153 struct hash_entry *entry;
2154
2155 hash_table_foreach(ht, entry) {
2156 _mesa_hash_table_remove(ht, entry);
2157 }
2158 }
2159
2160 static void
2161 insert_dead_code_hash(struct hash_table *ht,
2162 int vgrf, int reg_offset, fs_inst *inst)
2163 {
2164 /* We don't bother freeing keys, because they'll be GCed with the ht. */
2165 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2166
2167 key->vgrf = vgrf;
2168 key->reg_offset = reg_offset;
2169
2170 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2171 }
2172
2173 static struct hash_entry *
2174 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2175 {
2176 struct dead_code_hash_key key;
2177
2178 key.vgrf = vgrf;
2179 key.reg_offset = reg_offset;
2180
2181 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2182 }
2183
2184 static void
2185 remove_dead_code_hash(struct hash_table *ht,
2186 int vgrf, int reg_offset)
2187 {
2188 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2189 if (!entry)
2190 return;
2191
2192 _mesa_hash_table_remove(ht, entry);
2193 }
2194
2195 /**
2196 * Walks basic blocks, removing any regs that are written but not read before
2197 * being redefined.
2198 *
2199 * The dead_code_eliminate() function implements a global dead code
2200 * elimination, but it only handles removing the last write to a register
2201 * if it's never read. This one can handle intermediate writes, but only
2202 * within a basic block.
2203 */
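/* For example (hypothetical block-local IR):
 *
 *    mov vgrf3, vgrf1     <- removed: vgrf3 is fully overwritten below
 *                            without ever being read
 *    mov vgrf3, vgrf2
 *    add vgrf4, vgrf3, vgrf3
 */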
2204 bool
2205 fs_visitor::dead_code_eliminate_local()
2206 {
2207 struct hash_table *ht;
2208 bool progress = false;
2209
2210 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2211
2212 if (ht == NULL) {
2213 return false;
2214 }
2215
2216 foreach_list_safe(node, &this->instructions) {
2217 fs_inst *inst = (fs_inst *)node;
2218
2219 /* At a basic block boundary, empty the HT since we don't understand
2220 * dataflow across blocks.
2221 */
2222 if (inst->is_control_flow()) {
2223 clear_dead_code_hash(ht);
2224 continue;
2225 }
2226
2227 /* Clear the HT of any instructions that got read. */
2228 for (int i = 0; i < 3; i++) {
2229 fs_reg src = inst->src[i];
2230 if (src.file != GRF)
2231 continue;
2232
2233 int read = 1;
2234 if (inst->is_send_from_grf())
2235 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2236
2237 for (int reg_offset = src.reg_offset;
2238 reg_offset < src.reg_offset + read;
2239 reg_offset++) {
2240 remove_dead_code_hash(ht, src.reg, reg_offset);
2241 }
2242 }
2243
2244 /* Add any update of a GRF to the HT, removing a previous write if it
2245 * wasn't read.
2246 */
2247 if (inst->dst.file == GRF) {
2248 if (inst->regs_written > 1) {
2249 /* We don't know how to trim channels from an instruction's
2250 * writes, so we can't incrementally remove unread channels from
2251 * it.  Just remove whatever it overwrites from the table.
2252 */
2253 for (int i = 0; i < inst->regs_written; i++) {
2254 remove_dead_code_hash(ht,
2255 inst->dst.reg,
2256 inst->dst.reg_offset + i);
2257 }
2258 } else {
2259 struct hash_entry *entry =
2260 get_dead_code_hash_entry(ht, inst->dst.reg,
2261 inst->dst.reg_offset);
2262
2263 if (entry) {
2264 if (inst->is_partial_write()) {
2265 /* For a partial write, we can't remove any previous dead code
2266 * candidate, since we're just modifying their result.
2267 */
2268 } else {
2269 /* We're completely updating a channel, and there was a
2270 * previous write to the channel that wasn't read. Kill it!
2271 */
2272 fs_inst *inst = (fs_inst *)entry->data;
2273 inst->remove();
2274 progress = true;
2275 }
2276
2277 _mesa_hash_table_remove(ht, entry);
2278 }
2279
2280 if (!inst->has_side_effects())
2281 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2282 inst);
2283 }
2284 }
2285 }
2286
2287 _mesa_hash_table_destroy(ht, NULL);
2288
2289 if (progress)
2290 invalidate_live_intervals();
2291
2292 return progress;
2293 }
2294
2295 /**
2296 * Implements register coalescing: Checks if the two registers involved in a
2297 * raw move don't interfere, in which case they can both be stored in the same
2298 * place and the MOV removed.
2299 *
2300 * To do this, all uses of the source of the MOV in the shader are replaced
2301 * with the destination of the MOV. For example:
2302 *
2303 * add vgrf3:F, vgrf1:F, vgrf2:F
2304 * mov vgrf4:F, vgrf3:F
2305 * mul vgrf5:F, vgrf5:F, vgrf4:F
2306 *
2307 * becomes
2308 *
2309 * add vgrf4:F, vgrf1:F, vgrf2:F
2310 * mul vgrf5:F, vgrf5:F, vgrf4:F
2311 */
2312 bool
2313 fs_visitor::register_coalesce()
2314 {
2315 bool progress = false;
2316
2317 calculate_live_intervals();
2318
2319 int src_size = 0;
2320 int channels_remaining = 0;
2321 int reg_from = -1, reg_to = -1;
2322 int reg_to_offset[MAX_SAMPLER_MESSAGE_SIZE];
2323 fs_inst *mov[MAX_SAMPLER_MESSAGE_SIZE];
2324
2325 foreach_list(node, &this->instructions) {
2326 fs_inst *inst = (fs_inst *)node;
2327
2328 if (inst->opcode != BRW_OPCODE_MOV ||
2329 inst->is_partial_write() ||
2330 inst->saturate ||
2331 inst->src[0].file != GRF ||
2332 inst->src[0].negate ||
2333 inst->src[0].abs ||
2334 !inst->src[0].is_contiguous() ||
2335 inst->dst.file != GRF ||
2336 inst->dst.type != inst->src[0].type) {
2337 continue;
2338 }
2339
2340 if (virtual_grf_sizes[inst->src[0].reg] >
2341 virtual_grf_sizes[inst->dst.reg])
2342 continue;
2343
2344 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2345 int var_to = live_intervals->var_from_reg(&inst->dst);
2346
2347 if (live_intervals->vars_interfere(var_from, var_to) &&
2348 !inst->dst.equals(inst->src[0])) {
2349
2350 /* We know that the live ranges of A (var_from) and B (var_to)
2351 * interfere because of the ->vars_interfere() call above. If the end
2352 * of B's live range is after the end of A's range, then we know two
2353 * things:
2354 * - the start of B's live range must be in A's live range (since we
2355 * already know the two ranges interfere, this is the only remaining
2356 * possibility)
2357 * - the interference isn't of the form we're looking for (where B is
2358 * entirely inside A)
2359 */
2360 if (live_intervals->end[var_to] > live_intervals->end[var_from])
2361 continue;
2362
2363 bool overwritten = false;
2364 int scan_ip = -1;
2365
2366 foreach_list(n, &this->instructions) {
2367 fs_inst *scan_inst = (fs_inst *)n;
2368 scan_ip++;
2369
2370 if (scan_inst->is_control_flow()) {
2371 overwritten = true;
2372 break;
2373 }
2374
2375 if (scan_ip <= live_intervals->start[var_to])
2376 continue;
2377
2378 if (scan_ip > live_intervals->end[var_to])
2379 break;
2380
2381 if (scan_inst->dst.equals(inst->dst) ||
2382 scan_inst->dst.equals(inst->src[0])) {
2383 overwritten = true;
2384 break;
2385 }
2386 }
2387
2388 if (overwritten)
2389 continue;
2390 }
2391
2392 if (reg_from != inst->src[0].reg) {
2393 reg_from = inst->src[0].reg;
2394
2395 src_size = virtual_grf_sizes[inst->src[0].reg];
2396 assert(src_size <= MAX_SAMPLER_MESSAGE_SIZE);
2397
2398 channels_remaining = src_size;
2399 memset(mov, 0, sizeof(mov));
2400
2401 reg_to = inst->dst.reg;
2402 }
2403
2404 if (reg_to != inst->dst.reg)
2405 continue;
2406
2407 const int offset = inst->src[0].reg_offset;
2408 reg_to_offset[offset] = inst->dst.reg_offset;
2409 mov[offset] = inst;
2410 channels_remaining--;
2411
2412 if (channels_remaining)
2413 continue;
2414
2415 bool removed = false;
2416 for (int i = 0; i < src_size; i++) {
2417 if (mov[i]) {
2418 removed = true;
2419
2420 mov[i]->opcode = BRW_OPCODE_NOP;
2421 mov[i]->conditional_mod = BRW_CONDITIONAL_NONE;
2422 mov[i]->dst = reg_undef;
2423 mov[i]->src[0] = reg_undef;
2424 mov[i]->src[1] = reg_undef;
2425 mov[i]->src[2] = reg_undef;
2426 }
2427 }
2428
2429 foreach_list(node, &this->instructions) {
2430 fs_inst *scan_inst = (fs_inst *)node;
2431
2432 for (int i = 0; i < src_size; i++) {
2433 if (mov[i]) {
2434 if (scan_inst->dst.file == GRF &&
2435 scan_inst->dst.reg == reg_from &&
2436 scan_inst->dst.reg_offset == i) {
2437 scan_inst->dst.reg = reg_to;
2438 scan_inst->dst.reg_offset = reg_to_offset[i];
2439 }
2440 for (int j = 0; j < 3; j++) {
2441 if (scan_inst->src[j].file == GRF &&
2442 scan_inst->src[j].reg == reg_from &&
2443 scan_inst->src[j].reg_offset == i) {
2444 scan_inst->src[j].reg = reg_to;
2445 scan_inst->src[j].reg_offset = reg_to_offset[i];
2446 }
2447 }
2448 }
2449 }
2450 }
2451
2452 if (removed) {
2453 live_intervals->start[var_to] = MIN2(live_intervals->start[var_to],
2454 live_intervals->start[var_from]);
2455 live_intervals->end[var_to] = MAX2(live_intervals->end[var_to],
2456 live_intervals->end[var_from]);
2457 reg_from = -1;
2458 }
2459 }
2460
2461 foreach_list_safe(node, &this->instructions) {
2462 fs_inst *inst = (fs_inst *)node;
2463
2464 if (inst->opcode == BRW_OPCODE_NOP) {
2465 inst->remove();
2466 progress = true;
2467 }
2468 }
2469
2470 if (progress)
2471 invalidate_live_intervals();
2472
2473 return progress;
2474 }
2475
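/**
 * Folds a MOV from a GRF into an MRF back into the instruction that
 * produced the GRF, when that is safe, e.g. (illustrative):
 *
 *    add vgrf4, vgrf1, vgrf2
 *    mov m3, vgrf4
 *
 * becomes
 *
 *    add m3, vgrf1, vgrf2
 */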
2476 bool
2477 fs_visitor::compute_to_mrf()
2478 {
2479 bool progress = false;
2480 int next_ip = 0;
2481
2482 calculate_live_intervals();
2483
2484 foreach_list_safe(node, &this->instructions) {
2485 fs_inst *inst = (fs_inst *)node;
2486
2487 int ip = next_ip;
2488 next_ip++;
2489
2490 if (inst->opcode != BRW_OPCODE_MOV ||
2491 inst->is_partial_write() ||
2492 inst->dst.file != MRF || inst->src[0].file != GRF ||
2493 inst->dst.type != inst->src[0].type ||
2494 inst->src[0].abs || inst->src[0].negate ||
2495 !inst->src[0].is_contiguous() ||
2496 inst->src[0].subreg_offset)
2497 continue;
2498
2499 /* Work out which hardware MRF registers are written by this
2500 * instruction.
2501 */
2502 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2503 int mrf_high;
2504 if (inst->dst.reg & BRW_MRF_COMPR4) {
2505 mrf_high = mrf_low + 4;
2506 } else if (dispatch_width == 16 &&
2507 (!inst->force_uncompressed && !inst->force_sechalf)) {
2508 mrf_high = mrf_low + 1;
2509 } else {
2510 mrf_high = mrf_low;
2511 }
2512
2513 /* Can't compute-to-MRF this GRF if someone else was going to
2514 * read it later.
2515 */
2516 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2517 continue;
2518
2519 /* Found a move of a GRF to a MRF. Let's see if we can go
2520 * rewrite the thing that made this GRF to write into the MRF.
2521 */
2522 fs_inst *scan_inst;
2523 for (scan_inst = (fs_inst *)inst->prev;
2524 scan_inst->prev != NULL;
2525 scan_inst = (fs_inst *)scan_inst->prev) {
2526 if (scan_inst->dst.file == GRF &&
2527 scan_inst->dst.reg == inst->src[0].reg) {
2528 /* Found the last thing to write our reg we want to turn
2529 * into a compute-to-MRF.
2530 */
2531
2532 /* If this one instruction didn't populate all the
2533 * channels, bail. We might be able to rewrite everything
2534 * that writes that reg, but it would require smarter
2535 * tracking to delay the rewriting until complete success.
2536 */
2537 if (scan_inst->is_partial_write())
2538 break;
2539
2540 /* Things returning more than one register would need us to
2541 * understand coalescing out more than one MOV at a time.
2542 */
2543 if (scan_inst->regs_written > 1)
2544 break;
2545
2546 /* SEND instructions can't have MRF as a destination. */
2547 if (scan_inst->mlen)
2548 break;
2549
2550 if (brw->gen == 6) {
2551 /* gen6 math instructions must have the destination be
2552 * GRF, so no compute-to-MRF for them.
2553 */
2554 if (scan_inst->is_math()) {
2555 break;
2556 }
2557 }
2558
2559 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2560 /* Found the creator of our MRF's source value. */
2561 scan_inst->dst.file = MRF;
2562 scan_inst->dst.reg = inst->dst.reg;
2563 scan_inst->saturate |= inst->saturate;
2564 inst->remove();
2565 progress = true;
2566 }
2567 break;
2568 }
2569
2570 /* We don't handle control flow here. Most computation of
2571 * values that end up in MRFs happens shortly before the MRF
2572 * write anyway.
2573 */
2574 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2575 break;
2576
2577 /* You can't read from an MRF, so if someone else reads our
2578 * MRF's source GRF that we wanted to rewrite, that stops us.
2579 */
2580 bool interfered = false;
2581 for (int i = 0; i < 3; i++) {
2582 if (scan_inst->src[i].file == GRF &&
2583 scan_inst->src[i].reg == inst->src[0].reg &&
2584 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2585 interfered = true;
2586 }
2587 }
2588 if (interfered)
2589 break;
2590
2591 if (scan_inst->dst.file == MRF) {
2592 /* If somebody else writes our MRF here, we can't
2593 * compute-to-MRF before that.
2594 */
2595 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2596 int scan_mrf_high;
2597
2598 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2599 scan_mrf_high = scan_mrf_low + 4;
2600 } else if (dispatch_width == 16 &&
2601 (!scan_inst->force_uncompressed &&
2602 !scan_inst->force_sechalf)) {
2603 scan_mrf_high = scan_mrf_low + 1;
2604 } else {
2605 scan_mrf_high = scan_mrf_low;
2606 }
2607
2608 if (mrf_low == scan_mrf_low ||
2609 mrf_low == scan_mrf_high ||
2610 mrf_high == scan_mrf_low ||
2611 mrf_high == scan_mrf_high) {
2612 break;
2613 }
2614 }
2615
2616 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2617 /* Found a SEND instruction, which means that there are
2618 * live values in MRFs from base_mrf to base_mrf +
2619 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2620 * above it.
2621 */
2622 if (mrf_low >= scan_inst->base_mrf &&
2623 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2624 break;
2625 }
2626 if (mrf_high >= scan_inst->base_mrf &&
2627 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2628 break;
2629 }
2630 }
2631 }
2632 }
2633
2634 if (progress)
2635 invalidate_live_intervals();
2636
2637 return progress;
2638 }
2639
2640 /**
2641 * Walks through basic blocks, looking for repeated MRF writes and
2642 * removing the later ones.
2643 */
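/* For example (hypothetical), two identical "mov m2, vgrf5" instructions in
 * the same basic block, with no intervening write to m2 or vgrf5, leave the
 * MRF contents unchanged, so the second MOV is removed.
 */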
2644 bool
2645 fs_visitor::remove_duplicate_mrf_writes()
2646 {
2647 fs_inst *last_mrf_move[16];
2648 bool progress = false;
2649
2650 /* Need to update the MRF tracking for compressed instructions. */
2651 if (dispatch_width == 16)
2652 return false;
2653
2654 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2655
2656 foreach_list_safe(node, &this->instructions) {
2657 fs_inst *inst = (fs_inst *)node;
2658
2659 if (inst->is_control_flow()) {
2660 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2661 }
2662
2663 if (inst->opcode == BRW_OPCODE_MOV &&
2664 inst->dst.file == MRF) {
2665 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2666 if (prev_inst && inst->equals(prev_inst)) {
2667 inst->remove();
2668 progress = true;
2669 continue;
2670 }
2671 }
2672
2673 /* Clear out the last-write records for MRFs that were overwritten. */
2674 if (inst->dst.file == MRF) {
2675 last_mrf_move[inst->dst.reg] = NULL;
2676 }
2677
2678 if (inst->mlen > 0 && inst->base_mrf != -1) {
2679 /* Found a SEND instruction, which will include two or fewer
2680 * implied MRF writes. We could do better here.
2681 */
2682 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2683 last_mrf_move[inst->base_mrf + i] = NULL;
2684 }
2685 }
2686
2687 /* Clear out any MRF move records whose sources got overwritten. */
2688 if (inst->dst.file == GRF) {
2689 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2690 if (last_mrf_move[i] &&
2691 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2692 last_mrf_move[i] = NULL;
2693 }
2694 }
2695 }
2696
2697 if (inst->opcode == BRW_OPCODE_MOV &&
2698 inst->dst.file == MRF &&
2699 inst->src[0].file == GRF &&
2700 !inst->is_partial_write()) {
2701 last_mrf_move[inst->dst.reg] = inst;
2702 }
2703 }
2704
2705 if (progress)
2706 invalidate_live_intervals();
2707
2708 return progress;
2709 }
2710
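/* Helper for the gen4 SEND dependency workarounds below: clears the
 * needs-dependency flags for any GRFs in [first_grf, first_grf + grf_len)
 * that are read as a source by the given instruction.  SIMD16 instructions
 * read register pairs, so the following register's flag is cleared as well.
 */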
2711 static void
2712 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2713 int first_grf, int grf_len)
2714 {
2715 bool inst_simd16 = (dispatch_width > 8 &&
2716 !inst->force_uncompressed &&
2717 !inst->force_sechalf);
2718
2719 /* Clear the flag for registers that actually got read (as expected). */
2720 for (int i = 0; i < 3; i++) {
2721 int grf;
2722 if (inst->src[i].file == GRF) {
2723 grf = inst->src[i].reg;
2724 } else if (inst->src[i].file == HW_REG &&
2725 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2726 grf = inst->src[i].fixed_hw_reg.nr;
2727 } else {
2728 continue;
2729 }
2730
2731 if (grf >= first_grf &&
2732 grf < first_grf + grf_len) {
2733 deps[grf - first_grf] = false;
2734 if (inst_simd16)
2735 deps[grf - first_grf + 1] = false;
2736 }
2737 }
2738 }
2739
2740 /**
2741 * Implements this workaround for the original 965:
2742 *
2743 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2744 * check for post destination dependencies on this instruction, software
2745 * must ensure that there is no destination hazard for the case of ‘write
2746 * followed by a posted write’ shown in the following example.
2747 *
2748 * 1. mov r3 0
2749 * 2. send r3.xy <rest of send instruction>
2750 * 3. mov r2 r3
2751 *
2752 * Due to no post-destination dependency check on the ‘send’, the above
2753 * code sequence could have two instructions (1 and 2) in flight at the
2754 * same time that both consider ‘r3’ as the target of their final writes."
2755 */
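/* The pass below scans backwards from the SEND and, for each register the
 * SEND is going to write that still has an unresolved prior write, inserts a
 * dependency-resolving MOV (DEP_RESOLVE_MOV) on that register immediately
 * before the SEND.  Roughly (hypothetical), for the quoted sequence a
 * resolve MOV touching r3 would be emitted between instructions 1 and 2.
 */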
2756 void
2757 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2758 {
2759 int reg_size = dispatch_width / 8;
2760 int write_len = inst->regs_written * reg_size;
2761 int first_write_grf = inst->dst.reg;
2762 bool needs_dep[BRW_MAX_MRF];
2763 assert(write_len < (int)sizeof(needs_dep) - 1);
2764
2765 memset(needs_dep, false, sizeof(needs_dep));
2766 memset(needs_dep, true, write_len);
2767
2768 clear_deps_for_inst_src(inst, dispatch_width,
2769 needs_dep, first_write_grf, write_len);
2770
2771 /* Walk backwards looking for writes to registers we're writing which
2772 * aren't read since being written. If we hit the start of the program,
2773 * we assume that there are no outstanding dependencies on entry to the
2774 * program.
2775 */
2776 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2777 scan_inst != NULL;
2778 scan_inst = (fs_inst *)scan_inst->prev) {
2779
2780 /* If we hit control flow, assume that there *are* outstanding
2781 * dependencies, and force their cleanup before our instruction.
2782 */
2783 if (scan_inst->is_control_flow()) {
2784 for (int i = 0; i < write_len; i++) {
2785 if (needs_dep[i]) {
2786 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2787 }
2788 }
2789 return;
2790 }
2791
2792 bool scan_inst_simd16 = (dispatch_width > 8 &&
2793 !scan_inst->force_uncompressed &&
2794 !scan_inst->force_sechalf);
2795
2796 /* We insert our reads as late as possible on the assumption that any
2797 * instruction but a MOV that might have left us an outstanding
2798 * dependency has more latency than a MOV.
2799 */
2800 if (scan_inst->dst.file == GRF) {
2801 for (int i = 0; i < scan_inst->regs_written; i++) {
2802 int reg = scan_inst->dst.reg + i * reg_size;
2803
2804 if (reg >= first_write_grf &&
2805 reg < first_write_grf + write_len &&
2806 needs_dep[reg - first_write_grf]) {
2807 inst->insert_before(DEP_RESOLVE_MOV(reg));
2808 needs_dep[reg - first_write_grf] = false;
2809 if (scan_inst_simd16)
2810 needs_dep[reg - first_write_grf + 1] = false;
2811 }
2812 }
2813 }
2814
2815 /* Clear the flag for registers that actually got read (as expected). */
2816 clear_deps_for_inst_src(scan_inst, dispatch_width,
2817 needs_dep, first_write_grf, write_len);
2818
2819 /* Continue the loop only if we haven't resolved all the dependencies */
2820 int i;
2821 for (i = 0; i < write_len; i++) {
2822 if (needs_dep[i])
2823 break;
2824 }
2825 if (i == write_len)
2826 return;
2827 }
2828 }
2829
2830 /**
2831 * Implements this workaround for the original 965:
2832 *
2833 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2834 * used as a destination register until after it has been sourced by an
2835 * instruction with a different destination register."
2836 */
2837 void
2838 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2839 {
2840 int write_len = inst->regs_written * dispatch_width / 8;
2841 int first_write_grf = inst->dst.reg;
2842 bool needs_dep[BRW_MAX_MRF];
2843 assert(write_len < (int)sizeof(needs_dep) - 1);
2844
2845 memset(needs_dep, false, sizeof(needs_dep));
2846 memset(needs_dep, true, write_len);
2847 /* Walk forwards looking for writes to registers we're writing which aren't
2848 * read before being written.
2849 */
2850 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2851 !scan_inst->is_tail_sentinel();
2852 scan_inst = (fs_inst *)scan_inst->next) {
2853 /* If we hit control flow, force resolve all remaining dependencies. */
2854 if (scan_inst->is_control_flow()) {
2855 for (int i = 0; i < write_len; i++) {
2856 if (needs_dep[i])
2857 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2858 }
2859 return;
2860 }
2861
2862 /* Clear the flag for registers that actually got read (as expected). */
2863 clear_deps_for_inst_src(scan_inst, dispatch_width,
2864 needs_dep, first_write_grf, write_len);
2865
2866 /* We insert our reads as late as possible since they're reading the
2867 * result of a SEND, which has massive latency.
2868 */
2869 if (scan_inst->dst.file == GRF &&
2870 scan_inst->dst.reg >= first_write_grf &&
2871 scan_inst->dst.reg < first_write_grf + write_len &&
2872 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2873 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2874 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2875 }
2876
2877 /* Continue the loop only if we haven't resolved all the dependencies */
2878 int i;
2879 for (i = 0; i < write_len; i++) {
2880 if (needs_dep[i])
2881 break;
2882 }
2883 if (i == write_len)
2884 return;
2885 }
2886
2887 /* If we hit the end of the program, resolve all remaining dependencies out
2888 * of paranoia.
2889 */
2890 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2891 assert(last_inst->eot);
2892 for (int i = 0; i < write_len; i++) {
2893 if (needs_dep[i])
2894 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2895 }
2896 }
2897
2898 void
2899 fs_visitor::insert_gen4_send_dependency_workarounds()
2900 {
2901 if (brw->gen != 4 || brw->is_g4x)
2902 return;
2903
2904 /* Note that we're done with register allocation, so GRF fs_regs always
2905 * have a .reg_offset of 0.
2906 */
2907
2908 foreach_list_safe(node, &this->instructions) {
2909 fs_inst *inst = (fs_inst *)node;
2910
2911 if (inst->mlen != 0 && inst->dst.file == GRF) {
2912 insert_gen4_pre_send_dependency_workarounds(inst);
2913 insert_gen4_post_send_dependency_workarounds(inst);
2914 }
2915 }
2916 }
2917
2918 /**
2919 * Turns the generic expression-style uniform pull constant load instruction
2920 * into a hardware-specific series of instructions for loading a pull
2921 * constant.
2922 *
2923 * The expression style allows the CSE pass before this to optimize out
2924 * repeated loads from the same offset, and gives the pre-register-allocation
2925 * scheduling full flexibility, while the conversion to native instructions
2926 * gives the post-register-allocation scheduler the best information
2927 * possible.
2928 *
2929 * Note that execution masking for setting up pull constant loads is special:
2930 * the channels that need to be written are unrelated to the current execution
2931 * mask, since a later instruction will use one of the result channels as a
2932 * source operand for all 8 or 16 of its channels.
2933 */
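/* Sketch of the gen7+ lowering (illustrative IR, opcode names abbreviated):
 *
 *    uniform_pull_const_load vgrf8, surf_index, byte_offset
 *
 * becomes
 *
 *    set_simd4x2_offset payload, dword_offset
 *    uniform_pull_const_load_gen7 vgrf8, surf_index, payload
 *
 * On earlier generations the instruction is left as-is and is simply
 * assigned MRF 14 for its message payload.
 */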
2934 void
2935 fs_visitor::lower_uniform_pull_constant_loads()
2936 {
2937 foreach_list(node, &this->instructions) {
2938 fs_inst *inst = (fs_inst *)node;
2939
2940 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2941 continue;
2942
2943 if (brw->gen >= 7) {
2944 /* The offset arg before was a vec4-aligned byte offset. We need to
2945 * turn it into a dword offset.
2946 */
2947 fs_reg const_offset_reg = inst->src[1];
2948 assert(const_offset_reg.file == IMM &&
2949 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2950 const_offset_reg.imm.u /= 4;
2951 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2952
2953 /* This is actually going to be a MOV, but since only the first dword
2954 * is accessed, we have a special opcode to do just that one. Note
2955 * that this needs to be an operation that will be considered a def
2956 * by live variable analysis, or register allocation will explode.
2957 */
2958 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2959 payload, const_offset_reg);
2960 setup->force_writemask_all = true;
2961
2962 setup->ir = inst->ir;
2963 setup->annotation = inst->annotation;
2964 inst->insert_before(setup);
2965
2966 /* Similarly, this will only populate the first 4 channels of the
2967 * result register (since we only use smear values from 0-3), but we
2968 * don't tell the optimizer.
2969 */
2970 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2971 inst->src[1] = payload;
2972
2973 invalidate_live_intervals();
2974 } else {
2975 /* Before register allocation, we didn't tell the scheduler about the
2976 * MRF we use. We know it's safe to use this MRF because nothing
2977 * else does except for register spill/unspill, which generates and
2978 * uses its MRF within a single IR instruction.
2979 */
2980 inst->base_mrf = 14;
2981 inst->mlen = 1;
2982 }
2983 }
2984 }
2985
2986 void
2987 fs_visitor::dump_instructions()
2988 {
2989 calculate_register_pressure();
2990
2991 int ip = 0, max_pressure = 0;
2992 foreach_list(node, &this->instructions) {
2993 backend_instruction *inst = (backend_instruction *)node;
2994 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2995 printf("{%3d} %4d: ", regs_live_at_ip[ip], ip);
2996 dump_instruction(inst);
2997 ++ip;
2998 }
2999 printf("Maximum %3d registers live at once.\n", max_pressure);
3000 }
3001
3002 void
3003 fs_visitor::dump_instruction(backend_instruction *be_inst)
3004 {
3005 fs_inst *inst = (fs_inst *)be_inst;
3006
3007 if (inst->predicate) {
3008 printf("(%cf0.%d) ",
3009 inst->predicate_inverse ? '-' : '+',
3010 inst->flag_subreg);
3011 }
3012
3013 printf("%s", brw_instruction_name(inst->opcode));
3014 if (inst->saturate)
3015 printf(".sat");
3016 if (inst->conditional_mod) {
3017 printf("%s", conditional_modifier[inst->conditional_mod]);
3018 if (!inst->predicate &&
3019 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3020 inst->opcode != BRW_OPCODE_IF &&
3021 inst->opcode != BRW_OPCODE_WHILE))) {
3022 printf(".f0.%d", inst->flag_subreg);
3023 }
3024 }
3025 printf(" ");
3026
3027
3028 switch (inst->dst.file) {
3029 case GRF:
3030 printf("vgrf%d", inst->dst.reg);
3031 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
3032 inst->dst.subreg_offset)
3033 printf("+%d.%d", inst->dst.reg_offset, inst->dst.subreg_offset);
3034 break;
3035 case MRF:
3036 printf("m%d", inst->dst.reg);
3037 break;
3038 case BAD_FILE:
3039 printf("(null)");
3040 break;
3041 case UNIFORM:
3042 printf("***u%d***", inst->dst.reg);
3043 break;
3044 case HW_REG:
3045 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3046 switch (inst->dst.fixed_hw_reg.nr) {
3047 case BRW_ARF_NULL:
3048 printf("null");
3049 break;
3050 case BRW_ARF_ADDRESS:
3051 printf("a0.%d", inst->dst.fixed_hw_reg.subnr);
3052 break;
3053 case BRW_ARF_ACCUMULATOR:
3054 printf("acc%d", inst->dst.fixed_hw_reg.subnr);
3055 break;
3056 case BRW_ARF_FLAG:
3057 printf("f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3058 inst->dst.fixed_hw_reg.subnr);
3059 break;
3060 default:
3061 printf("arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3062 inst->dst.fixed_hw_reg.subnr);
3063 break;
3064 }
3065 } else {
3066 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
3067 }
3068 if (inst->dst.fixed_hw_reg.subnr)
3069 printf("+%d", inst->dst.fixed_hw_reg.subnr);
3070 break;
3071 default:
3072 printf("???");
3073 break;
3074 }
3075 printf(":%s, ", reg_encoding[inst->dst.type]);
3076
3077 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
3078 if (inst->src[i].negate)
3079 printf("-");
3080 if (inst->src[i].abs)
3081 printf("|");
3082 switch (inst->src[i].file) {
3083 case GRF:
3084 printf("vgrf%d", inst->src[i].reg);
3085 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3086 inst->src[i].subreg_offset)
3087 printf("+%d.%d", inst->src[i].reg_offset,
3088 inst->src[i].subreg_offset);
3089 break;
3090 case MRF:
3091 printf("***m%d***", inst->src[i].reg);
3092 break;
3093 case UNIFORM:
3094 printf("u%d", inst->src[i].reg);
3095 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3096 inst->src[i].subreg_offset)
3097 printf("+%d.%d", inst->src[i].reg_offset,
3098 inst->src[i].subreg_offset);
3099 break;
3100 case BAD_FILE:
3101 printf("(null)");
3102 break;
3103 case IMM:
3104 switch (inst->src[i].type) {
3105 case BRW_REGISTER_TYPE_F:
3106 printf("%ff", inst->src[i].imm.f);
3107 break;
3108 case BRW_REGISTER_TYPE_D:
3109 printf("%dd", inst->src[i].imm.i);
3110 break;
3111 case BRW_REGISTER_TYPE_UD:
3112 printf("%uu", inst->src[i].imm.u);
3113 break;
3114 default:
3115 printf("???");
3116 break;
3117 }
3118 break;
3119 case HW_REG:
3120 if (inst->src[i].fixed_hw_reg.negate)
3121 printf("-");
3122 if (inst->src[i].fixed_hw_reg.abs)
3123 printf("|");
3124 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3125 switch (inst->src[i].fixed_hw_reg.nr) {
3126 case BRW_ARF_NULL:
3127 printf("null");
3128 break;
3129 case BRW_ARF_ADDRESS:
3130 printf("a0.%d", inst->src[i].fixed_hw_reg.subnr);
3131 break;
3132 case BRW_ARF_ACCUMULATOR:
3133 printf("acc%d", inst->src[i].fixed_hw_reg.subnr);
3134 break;
3135 case BRW_ARF_FLAG:
3136 printf("f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3137 inst->src[i].fixed_hw_reg.subnr);
3138 break;
3139 default:
3140 printf("arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3141 inst->src[i].fixed_hw_reg.subnr);
3142 break;
3143 }
3144 } else {
3145 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3146 }
3147 if (inst->src[i].fixed_hw_reg.subnr)
3148 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
3149 if (inst->src[i].fixed_hw_reg.abs)
3150 printf("|");
3151 break;
3152 default:
3153 printf("???");
3154 break;
3155 }
3156 if (inst->src[i].abs)
3157 printf("|");
3158
3159 if (inst->src[i].file != IMM) {
3160 printf(":%s", brw_reg_type_letters(inst->src[i].type));
3161 }
3162
3163 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
3164 printf(", ");
3165 }
3166
3167 printf(" ");
3168
3169 if (inst->force_uncompressed)
3170 printf("1sthalf ");
3171
3172 if (inst->force_sechalf)
3173 printf("2ndhalf ");
3174
3175 printf("\n");
3176 }
3177
3178 /**
3179 * Possibly returns an instruction that set up @param reg.
3180 *
3181 * Sometimes we want to take the result of some expression/variable
3182 * dereference tree and rewrite the instruction generating the result
3183 * of the tree. When processing the tree, we know that the
3184 * instructions generated are all writing temporaries that are dead
3185 * outside of this tree. So, if we have some instructions that write
3186 * a temporary, we're free to point that temp write somewhere else.
3187 *
3188 * Note that this doesn't guarantee that the returned instruction wrote
3189 * only reg -- reg might be the size=4 destination of a texture instruction.
3190 */
3191 fs_inst *
3192 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3193 fs_inst *end,
3194 fs_reg reg)
3195 {
3196 if (end == start ||
3197 end->is_partial_write() ||
3198 reg.reladdr ||
3199 !reg.equals(end->dst)) {
3200 return NULL;
3201 } else {
3202 return end;
3203 }
3204 }
3205
3206 void
3207 fs_visitor::setup_payload_gen6()
3208 {
3209 bool uses_depth =
3210 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3211 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3212
3213 assert(brw->gen >= 6);
3214
3215 /* R0-1: masks, pixel X/Y coordinates. */
3216 c->nr_payload_regs = 2;
3217 /* R2: only for 32-pixel dispatch. */
3218
3219 /* R3-26: barycentric interpolation coordinates. These appear in the
3220 * same order that they appear in the brw_wm_barycentric_interp_mode
3221 * enum. Each set of coordinates occupies 2 registers if dispatch width
3222 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3223 * appear if they were enabled using the "Barycentric Interpolation
3224 * Mode" bits in WM_STATE.
3225 */
3226 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3227 if (barycentric_interp_modes & (1 << i)) {
3228 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3229 c->nr_payload_regs += 2;
3230 if (dispatch_width == 16) {
3231 c->nr_payload_regs += 2;
3232 }
3233 }
3234 }
3235
3236 /* R27: interpolated depth if uses source depth */
3237 if (uses_depth) {
3238 c->source_depth_reg = c->nr_payload_regs;
3239 c->nr_payload_regs++;
3240 if (dispatch_width == 16) {
3241 /* R28: interpolated depth if not SIMD8. */
3242 c->nr_payload_regs++;
3243 }
3244 }
3245 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3246 if (uses_depth) {
3247 c->source_w_reg = c->nr_payload_regs;
3248 c->nr_payload_regs++;
3249 if (dispatch_width == 16) {
3250 /* R30: interpolated W if not SIMD8. */
3251 c->nr_payload_regs++;
3252 }
3253 }
3254
3255 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3256 /* R31: MSAA position offsets. */
3257 if (c->prog_data.uses_pos_offset) {
3258 c->sample_pos_reg = c->nr_payload_regs;
3259 c->nr_payload_regs++;
3260 }
3261
3262 /* R32: MSAA input coverage mask */
3263 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3264 assert(brw->gen >= 7);
3265 c->sample_mask_reg = c->nr_payload_regs;
3266 c->nr_payload_regs++;
3267 if (dispatch_width == 16) {
3268 /* R33: input coverage mask if not SIMD8. */
3269 c->nr_payload_regs++;
3270 }
3271 }
3272
3273 /* R34-: bary for 32-pixel. */
3274 /* R58-59: interp W for 32-pixel. */
3275
3276 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3277 c->source_depth_to_render_target = true;
3278 }
3279 }
3280
3281 void
3282 fs_visitor::assign_binding_table_offsets()
3283 {
3284 uint32_t next_binding_table_offset = 0;
3285
3286 /* If there are no color regions, we still perform an FB write to a null
3287 * renderbuffer, which we place at surface index 0.
3288 */
3289 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3290 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3291
3292 assign_common_binding_table_offsets(next_binding_table_offset);
3293 }
3294
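/* Estimates register pressure by summing, at each instruction IP, the sizes
 * of all virtual GRFs whose live interval covers that IP; the result is
 * stored in regs_live_at_ip for use by dump_instructions().
 */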
3295 void
3296 fs_visitor::calculate_register_pressure()
3297 {
3298 calculate_live_intervals();
3299
3300 int num_instructions = 0;
3301 foreach_list(node, &this->instructions) {
3302 ++num_instructions;
3303 }
3304
3305 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3306
3307 for (int reg = 0; reg < virtual_grf_count; reg++) {
3308 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3309 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3310 }
3311 }
3312
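/* Rough pipeline of a compile (see the calls below): payload and binding
 * table setup, GLSL IR -> FS IR emission, the optimization loop, then
 * instruction scheduling and register allocation, falling back from SIMD16
 * or spilling when allocation fails.
 */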
3313 bool
3314 fs_visitor::run()
3315 {
3316 sanity_param_count = fp->Base.Parameters->NumParameters;
3317 uint32_t orig_nr_params = c->prog_data.nr_params;
3318 bool allocated_without_spills;
3319
3320 assign_binding_table_offsets();
3321
3322 if (brw->gen >= 6)
3323 setup_payload_gen6();
3324 else
3325 setup_payload_gen4();
3326
3327 if (0) {
3328 emit_dummy_fs();
3329 } else {
3330 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3331 emit_shader_time_begin();
3332
3333 calculate_urb_setup();
3334 if (fp->Base.InputsRead > 0) {
3335 if (brw->gen < 6)
3336 emit_interpolation_setup_gen4();
3337 else
3338 emit_interpolation_setup_gen6();
3339 }
3340
3341 /* We handle discards by keeping track of the still-live pixels in f0.1.
3342 * Initialize it with the dispatched pixels.
3343 */
3344 if (fp->UsesKill || c->key.alpha_test_func) {
3345 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3346 discard_init->flag_subreg = 1;
3347 }
3348
3349 /* Generate FS IR for main(). (the visitor only descends into
3350 * functions called "main").
3351 */
3352 if (shader) {
3353 foreach_list(node, &*shader->base.ir) {
3354 ir_instruction *ir = (ir_instruction *)node;
3355 base_ir = ir;
3356 this->result = reg_undef;
3357 ir->accept(this);
3358 }
3359 } else {
3360 emit_fragment_program_code();
3361 }
3362 base_ir = NULL;
3363 if (failed)
3364 return false;
3365
3366 emit(FS_OPCODE_PLACEHOLDER_HALT);
3367
3368 if (c->key.alpha_test_func)
3369 emit_alpha_test();
3370
3371 emit_fb_writes();
3372
3373 split_virtual_grfs();
3374
3375 move_uniform_array_access_to_pull_constants();
3376 remove_dead_constants();
3377 setup_pull_constants();
3378
3379 bool progress;
3380 do {
3381 progress = false;
3382
3383 compact_virtual_grfs();
3384
3385 progress = remove_duplicate_mrf_writes() || progress;
3386
3387 progress = opt_algebraic() || progress;
3388 progress = opt_cse() || progress;
3389 progress = opt_copy_propagate() || progress;
3390 progress = opt_peephole_predicated_break() || progress;
3391 progress = dead_code_eliminate() || progress;
3392 progress = dead_code_eliminate_local() || progress;
3393 progress = opt_peephole_sel() || progress;
3394 progress = dead_control_flow_eliminate(this) || progress;
3395 progress = opt_saturate_propagation() || progress;
3396 progress = register_coalesce() || progress;
3397 progress = compute_to_mrf() || progress;
3398 } while (progress);
3399
3400 lower_uniform_pull_constant_loads();
3401
3402 assign_curb_setup();
3403 assign_urb_setup();
3404
3405 static enum instruction_scheduler_mode pre_modes[] = {
3406 SCHEDULE_PRE,
3407 SCHEDULE_PRE_NON_LIFO,
3408 SCHEDULE_PRE_LIFO,
3409 };
3410
3411 /* Try each scheduling heuristic to see if it can successfully register
3412 * allocate without spilling. They should be ordered by decreasing
3413 * performance but increasing likelihood of allocating.
3414 */
3415 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3416 schedule_instructions(pre_modes[i]);
3417
3418 if (0) {
3419 assign_regs_trivial();
3420 allocated_without_spills = true;
3421 } else {
3422 allocated_without_spills = assign_regs(false);
3423 }
3424 if (allocated_without_spills)
3425 break;
3426 }
3427
3428 if (!allocated_without_spills) {
3429 /* We assume that any spilling is worse than just dropping back to
3430 * SIMD8. There's probably actually some intermediate point where
3431 * SIMD16 with a couple of spills is still better.
3432 */
3433 if (dispatch_width == 16) {
3434 fail("Failure to register allocate. Reduce number of "
3435 "live scalar values to avoid this.");
3436 }
3437
3438 /* Since we're out of heuristics, just go spill registers until we
3439 * get an allocation.
3440 */
3441 while (!assign_regs(true)) {
3442 if (failed)
3443 break;
3444 }
3445 }
3446 }
3447 assert(force_uncompressed_stack == 0);
3448
3449 /* This must come after all optimization and register allocation, since
3450 * it inserts dead code that happens to have side effects, and it does
3451 * so based on the actual physical registers in use.
3452 */
3453 insert_gen4_send_dependency_workarounds();
3454
3455 if (failed)
3456 return false;
3457
3458 if (!allocated_without_spills)
3459 schedule_instructions(SCHEDULE_POST);
3460
3461 if (dispatch_width == 8) {
3462 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3463 } else {
3464 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3465
3466 /* Make sure we didn't try to sneak in an extra uniform */
3467 assert(orig_nr_params == c->prog_data.nr_params);
3468 (void) orig_nr_params;
3469 }
3470
3471 /* If any state parameters were appended, then ParameterValues could have
3472 * been realloced, in which case the driver uniform storage set up by
3473 * _mesa_associate_uniform_storage() would point to freed memory. Make
3474 * sure that didn't happen.
3475 */
3476 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3477
3478 return !failed;
3479 }
3480
3481 const unsigned *
3482 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3483 struct gl_fragment_program *fp,
3484 struct gl_shader_program *prog,
3485 unsigned *final_assembly_size)
3486 {
3487 bool start_busy = false;
3488 float start_time = 0;
3489
3490 if (unlikely(brw->perf_debug)) {
3491 start_busy = (brw->batch.last_bo &&
3492 drm_intel_bo_busy(brw->batch.last_bo));
3493 start_time = get_time();
3494 }
3495
3496 struct brw_shader *shader = NULL;
3497 if (prog)
3498 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3499
3500 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3501 if (prog) {
3502 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3503 _mesa_print_ir(shader->base.ir, NULL);
3504 printf("\n\n");
3505 } else {
3506 printf("ARB_fragment_program %d ir for native fragment shader\n",
3507 fp->Base.Id);
3508 _mesa_print_program(&fp->Base);
3509 }
3510 }
3511
3512 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3513 */
3514 fs_visitor v(brw, c, prog, fp, 8);
3515 if (!v.run()) {
3516 if (prog) {
3517 prog->LinkStatus = false;
3518 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3519 }
3520
3521 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3522 v.fail_msg);
3523
3524 return NULL;
3525 }
3526
3527 exec_list *simd16_instructions = NULL;
3528 fs_visitor v2(brw, c, prog, fp, 16);
3529 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3530 if (c->prog_data.nr_pull_params == 0) {
3531 /* Try a SIMD16 compile */
3532 v2.import_uniforms(&v);
3533 if (!v2.run()) {
3534 perf_debug("SIMD16 shader failed to compile, falling back to "
3535 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3536 } else {
3537 simd16_instructions = &v2.instructions;
3538 }
3539 } else {
3540 perf_debug("Skipping SIMD16 due to pull parameters.\n");
3541 }
3542 }
3543
3544 const unsigned *assembly = NULL;
3545 if (brw->gen >= 8) {
3546 gen8_fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3547 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3548 final_assembly_size);
3549 } else {
3550 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3551 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3552 final_assembly_size);
3553 }
3554
3555 if (unlikely(brw->perf_debug) && shader) {
3556 if (shader->compiled_once)
3557 brw_wm_debug_recompile(brw, prog, &c->key);
3558 shader->compiled_once = true;
3559
3560 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3561 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3562 (get_time() - start_time) * 1000);
3563 }
3564 }
3565
3566 return assembly;
3567 }
3568
3569 bool
3570 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3571 {
3572 struct brw_context *brw = brw_context(ctx);
3573 struct brw_wm_prog_key key;
3574
3575 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3576 return true;
3577
3578 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3579 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3580 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3581 bool program_uses_dfdy = fp->UsesDFdy;
3582
3583 memset(&key, 0, sizeof(key));
3584
3585 if (brw->gen < 6) {
3586 if (fp->UsesKill)
3587 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3588
3589 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3590 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3591
3592 /* Just assume depth testing. */
3593 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3594 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3595 }
3596
3597 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3598 BRW_FS_VARYING_INPUT_MASK) > 16)
3599 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3600
3601 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3602
3603 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3604 for (unsigned i = 0; i < sampler_count; i++) {
3605 if (fp->Base.ShadowSamplers & (1 << i)) {
3606 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3607 key.tex.swizzles[i] =
3608 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3609 } else {
3610 /* Color sampler: assume no swizzling. */
3611 key.tex.swizzles[i] = SWIZZLE_XYZW;
3612 }
3613 }
3614
3615 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3616 key.drawable_height = ctx->DrawBuffer->Height;
3617 }
3618
3619 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3620 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3621 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3622
3623 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3624 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3625 key.nr_color_regions > 1;
3626 }
3627
3628 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3629 * quality of the derivatives is likely to be determined by the driconf
3630 * option.
3631 */
3632 key.high_quality_derivatives = brw->disable_derivative_optimization;
3633
3634 key.program_string_id = bfp->id;
3635
3636 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3637 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3638
3639 bool success = do_wm_prog(brw, prog, bfp, &key);
3640
3641 brw->wm.base.prog_offset = old_prog_offset;
3642 brw->wm.prog_data = old_prog_data;
3643
3644 return success;
3645 }