i965: Move up duplicated fields from stage-specific prog_data to brw_stage_prog_data.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->opcode = BRW_OPCODE_NOP;
59 this->conditional_mod = BRW_CONDITIONAL_NONE;
60
61 this->dst = reg_undef;
62 this->src[0] = reg_undef;
63 this->src[1] = reg_undef;
64 this->src[2] = reg_undef;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68 }
69
70 fs_inst::fs_inst()
71 {
72 init();
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
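/* For reference, ALU2(ADD) below expands to:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */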
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187 ALU2(SEL)
188
189 /** Gen4 predicated IF. */
190 fs_inst *
191 fs_visitor::IF(uint32_t predicate)
192 {
193 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195 return inst;
196 }
197
198 /** Gen6 IF with embedded comparison. */
199 fs_inst *
200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
201 {
202 assert(brw->gen == 6);
203 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
204 reg_null_d, src0, src1);
205 inst->conditional_mod = condition;
206 return inst;
207 }
208
209 /**
210 * CMP: Sets the low bit of the destination channels with the result
211 * of the comparison, while the upper bits are undefined, and updates
212 * the flag register with the packed 16 bits of the result.
213 */
214 fs_inst *
215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
216 {
217 fs_inst *inst;
218
219 /* Take the instruction:
220 *
221 * CMP null<d> src0<f> src1<f>
222 *
223 * Original gen4 does type conversion to the destination type before
224 * comparison, producing garbage results for floating point comparisons.
225 * gen5 does the comparison on the execution type (resolved source types),
226 * so dst type doesn't matter. gen6 does comparison and then uses the
227 * result as if it was the dst type with no conversion, which happens to
228 * mostly work out for float-interpreted-as-int since our comparisons are
229 * for >0, =0, <0.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 exec_list
247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
248 fs_reg varying_offset,
249 uint32_t const_offset)
250 {
251 exec_list instructions;
252 fs_inst *inst;
253
254 /* We have our constant surface use a pitch of 4 bytes, so our index can
255 * be any component of a vector, and then we load 4 contiguous
256 * components starting from that.
257 *
258 * We break down the const_offset to a portion added to the variable
259 * offset and a portion done using reg_offset, which means that if you
260 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
261 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
262 * CSE can later notice that those loads are all the same and eliminate
263 * the redundant ones.
264 */
265 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
266 instructions.push_tail(ADD(vec4_offset,
267 varying_offset, const_offset & ~3));
268
269 int scale = 1;
270 if (brw->gen == 4 && dispatch_width == 8) {
271 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
272 * u, v, r) as parameters, or we can just use the SIMD16 message
273 * consisting of (header, u). We choose the second, at the cost of a
274 * longer return length.
275 */
276 scale = 2;
277 }
278
279 enum opcode op;
280 if (brw->gen >= 7)
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
282 else
283 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
284 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
285 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
286 inst->regs_written = 4 * scale;
287 instructions.push_tail(inst);
288
289 if (brw->gen < 7) {
290 inst->base_mrf = 13;
291 inst->header_present = true;
292 if (brw->gen == 4)
293 inst->mlen = 3;
294 else
295 inst->mlen = 1 + dispatch_width / 8;
296 }
297
298 vec4_result.reg_offset += (const_offset & 3) * scale;
299 instructions.push_tail(MOV(dst, vec4_result));
300
301 return instructions;
302 }
303
304 /**
305 * A helper for MOV generation for fixing up broken hardware SEND dependency
306 * handling.
307 */
308 fs_inst *
309 fs_visitor::DEP_RESOLVE_MOV(int grf)
310 {
311 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
312
313 inst->ir = NULL;
314 inst->annotation = "send dependency resolve";
315
316 /* The caller always wants uncompressed to emit the minimal extra
317 * dependencies, and to avoid having to deal with aligning its regs to 2.
318 */
319 inst->force_uncompressed = true;
320
321 return inst;
322 }
323
324 bool
325 fs_inst::equals(fs_inst *inst)
326 {
327 return (opcode == inst->opcode &&
328 dst.equals(inst->dst) &&
329 src[0].equals(inst->src[0]) &&
330 src[1].equals(inst->src[1]) &&
331 src[2].equals(inst->src[2]) &&
332 saturate == inst->saturate &&
333 predicate == inst->predicate &&
334 conditional_mod == inst->conditional_mod &&
335 mlen == inst->mlen &&
336 base_mrf == inst->base_mrf &&
337 sampler == inst->sampler &&
338 target == inst->target &&
339 eot == inst->eot &&
340 header_present == inst->header_present &&
341 shadow_compare == inst->shadow_compare &&
342 offset == inst->offset);
343 }
344
345 bool
346 fs_inst::overwrites_reg(const fs_reg &reg)
347 {
348 return (reg.file == dst.file &&
349 reg.reg == dst.reg &&
350 reg.reg_offset >= dst.reg_offset &&
351 reg.reg_offset < dst.reg_offset + regs_written);
352 }
353
354 bool
355 fs_inst::is_send_from_grf()
356 {
357 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
358 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
359 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
360 src[1].file == GRF) ||
361 (is_tex() && src[0].file == GRF));
362 }
363
364 bool
365 fs_visitor::can_do_source_mods(fs_inst *inst)
366 {
367 if (brw->gen == 6 && inst->is_math())
368 return false;
369
370 if (inst->is_send_from_grf())
371 return false;
372
373 if (!inst->can_do_source_mods())
374 return false;
375
376 return true;
377 }
378
379 void
380 fs_reg::init()
381 {
382 memset(this, 0, sizeof(*this));
383 stride = 1;
384 }
385
386 /** Generic unset register constructor. */
387 fs_reg::fs_reg()
388 {
389 init();
390 this->file = BAD_FILE;
391 }
392
393 /** Immediate value constructor. */
394 fs_reg::fs_reg(float f)
395 {
396 init();
397 this->file = IMM;
398 this->type = BRW_REGISTER_TYPE_F;
399 this->imm.f = f;
400 }
401
402 /** Immediate value constructor. */
403 fs_reg::fs_reg(int32_t i)
404 {
405 init();
406 this->file = IMM;
407 this->type = BRW_REGISTER_TYPE_D;
408 this->imm.i = i;
409 }
410
411 /** Immediate value constructor. */
412 fs_reg::fs_reg(uint32_t u)
413 {
414 init();
415 this->file = IMM;
416 this->type = BRW_REGISTER_TYPE_UD;
417 this->imm.u = u;
418 }
419
420 /** Fixed brw_reg. */
421 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
422 {
423 init();
424 this->file = HW_REG;
425 this->fixed_hw_reg = fixed_hw_reg;
426 this->type = fixed_hw_reg.type;
427 }
428
429 bool
430 fs_reg::equals(const fs_reg &r) const
431 {
432 return (file == r.file &&
433 reg == r.reg &&
434 reg_offset == r.reg_offset &&
435 subreg_offset == r.subreg_offset &&
436 type == r.type &&
437 negate == r.negate &&
438 abs == r.abs &&
439 !reladdr && !r.reladdr &&
440 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
441 sizeof(fixed_hw_reg)) == 0 &&
442 stride == r.stride &&
443 imm.u == r.imm.u);
444 }
445
446 fs_reg
447 fs_reg::retype(uint32_t type)
448 {
449 fs_reg result = *this;
450 result.type = type;
451 return result;
452 }
453
454 fs_reg &
455 fs_reg::apply_stride(unsigned stride)
456 {
457 assert((this->stride * stride) <= 4 &&
458 (is_power_of_two(stride) || stride == 0) &&
459 file != HW_REG && file != IMM);
460 this->stride *= stride;
461 return *this;
462 }
463
464 fs_reg &
465 fs_reg::set_smear(unsigned subreg)
466 {
467 assert(file != HW_REG && file != IMM);
468 subreg_offset = subreg * type_sz(type);
469 stride = 0;
470 return *this;
471 }
472
473 bool
474 fs_reg::is_contiguous() const
475 {
476 return stride == 1;
477 }
478
479 bool
480 fs_reg::is_zero() const
481 {
482 if (file != IMM)
483 return false;
484
485 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
486 }
487
488 bool
489 fs_reg::is_one() const
490 {
491 if (file != IMM)
492 return false;
493
494 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
495 }
496
497 bool
498 fs_reg::is_null() const
499 {
500 return file == HW_REG &&
501 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
502 fixed_hw_reg.nr == BRW_ARF_NULL;
503 }
504
505 bool
506 fs_reg::is_valid_3src() const
507 {
508 return file == GRF || file == UNIFORM;
509 }
510
511 int
512 fs_visitor::type_size(const struct glsl_type *type)
513 {
514 unsigned int size, i;
515
516 switch (type->base_type) {
517 case GLSL_TYPE_UINT:
518 case GLSL_TYPE_INT:
519 case GLSL_TYPE_FLOAT:
520 case GLSL_TYPE_BOOL:
521 return type->components();
522 case GLSL_TYPE_ARRAY:
523 return type_size(type->fields.array) * type->length;
524 case GLSL_TYPE_STRUCT:
525 size = 0;
526 for (i = 0; i < type->length; i++) {
527 size += type_size(type->fields.structure[i].type);
528 }
529 return size;
530 case GLSL_TYPE_SAMPLER:
531 /* Samplers take up no register space, since they're baked in at
532 * link time.
533 */
534 return 0;
535 case GLSL_TYPE_ATOMIC_UINT:
536 return 0;
537 case GLSL_TYPE_IMAGE:
538 case GLSL_TYPE_VOID:
539 case GLSL_TYPE_ERROR:
540 case GLSL_TYPE_INTERFACE:
541 assert(!"not reached");
542 break;
543 }
544
545 return 0;
546 }
547
548 fs_reg
549 fs_visitor::get_timestamp()
550 {
551 assert(brw->gen >= 7);
552
553 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
554 BRW_ARF_TIMESTAMP,
555 0),
556 BRW_REGISTER_TYPE_UD));
557
558 fs_reg dst = fs_reg(this, glsl_type::uint_type);
559
560 fs_inst *mov = emit(MOV(dst, ts));
561    /* We want to read the 3 fields we care about (mostly field 0, but also field 2)
562 * even if it's not enabled in the dispatch.
563 */
564 mov->force_writemask_all = true;
565 mov->force_uncompressed = true;
566
567 /* The caller wants the low 32 bits of the timestamp. Since it's running
568     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
569 * which is plenty of time for our purposes. It is identical across the
570 * EUs, but since it's tracking GPU core speed it will increment at a
571 * varying rate as render P-states change.
572 *
573 * The caller could also check if render P-states have changed (or anything
574 * else that might disrupt timing) by setting smear to 2 and checking if
575 * that field is != 0.
576 */
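   /* Back-of-the-envelope check on the "~3 seconds" figure above: a 32-bit
    * counter incrementing at roughly 1.2 GHz wraps after about
    * 2^32 / 1.2e9 = ~3.6 seconds.
    */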
577 dst.set_smear(0);
578
579 return dst;
580 }
581
582 void
583 fs_visitor::emit_shader_time_begin()
584 {
585 current_annotation = "shader time start";
586 shader_start_time = get_timestamp();
587 }
588
589 void
590 fs_visitor::emit_shader_time_end()
591 {
592 current_annotation = "shader time end";
593
594 enum shader_time_shader_type type, written_type, reset_type;
595 if (dispatch_width == 8) {
596 type = ST_FS8;
597 written_type = ST_FS8_WRITTEN;
598 reset_type = ST_FS8_RESET;
599 } else {
600 assert(dispatch_width == 16);
601 type = ST_FS16;
602 written_type = ST_FS16_WRITTEN;
603 reset_type = ST_FS16_RESET;
604 }
605
606 fs_reg shader_end_time = get_timestamp();
607
608 /* Check that there weren't any timestamp reset events (assuming these
609 * were the only two timestamp reads that happened).
610 */
611 fs_reg reset = shader_end_time;
612 reset.set_smear(2);
613 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
614 test->conditional_mod = BRW_CONDITIONAL_Z;
615 emit(IF(BRW_PREDICATE_NORMAL));
616
617 push_force_uncompressed();
618 fs_reg start = shader_start_time;
619 start.negate = true;
620 fs_reg diff = fs_reg(this, glsl_type::uint_type);
621 emit(ADD(diff, start, shader_end_time));
622
623 /* If there were no instructions between the two timestamp gets, the diff
624 * is 2 cycles. Remove that overhead, so I can forget about that when
625 * trying to determine the time taken for single instructions.
626 */
627 emit(ADD(diff, diff, fs_reg(-2u)));
628
629 emit_shader_time_write(type, diff);
630 emit_shader_time_write(written_type, fs_reg(1u));
631 emit(BRW_OPCODE_ELSE);
632 emit_shader_time_write(reset_type, fs_reg(1u));
633 emit(BRW_OPCODE_ENDIF);
634
635 pop_force_uncompressed();
636 }
637
638 void
639 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
640 fs_reg value)
641 {
642 int shader_time_index =
643 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
644 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
645
646 fs_reg payload;
647 if (dispatch_width == 8)
648 payload = fs_reg(this, glsl_type::uvec2_type);
649 else
650 payload = fs_reg(this, glsl_type::uint_type);
651
652 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
653 fs_reg(), payload, offset, value));
654 }
655
656 void
657 fs_visitor::fail(const char *format, ...)
658 {
659 va_list va;
660 char *msg;
661
662 if (failed)
663 return;
664
665 failed = true;
666
667 va_start(va, format);
668 msg = ralloc_vasprintf(mem_ctx, format, va);
669 va_end(va);
670 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
671
672 this->fail_msg = msg;
673
674 if (INTEL_DEBUG & DEBUG_WM) {
675 fprintf(stderr, "%s", msg);
676 }
677 }
678
679 fs_inst *
680 fs_visitor::emit(enum opcode opcode)
681 {
682 return emit(fs_inst(opcode));
683 }
684
685 fs_inst *
686 fs_visitor::emit(enum opcode opcode, fs_reg dst)
687 {
688 return emit(fs_inst(opcode, dst));
689 }
690
691 fs_inst *
692 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
693 {
694 return emit(fs_inst(opcode, dst, src0));
695 }
696
697 fs_inst *
698 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
699 {
700 return emit(fs_inst(opcode, dst, src0, src1));
701 }
702
703 fs_inst *
704 fs_visitor::emit(enum opcode opcode, fs_reg dst,
705 fs_reg src0, fs_reg src1, fs_reg src2)
706 {
707 return emit(fs_inst(opcode, dst, src0, src1, src2));
708 }
709
710 void
711 fs_visitor::push_force_uncompressed()
712 {
713 force_uncompressed_stack++;
714 }
715
716 void
717 fs_visitor::pop_force_uncompressed()
718 {
719 force_uncompressed_stack--;
720 assert(force_uncompressed_stack >= 0);
721 }
722
723 /**
724 * Returns true if the instruction has a flag that means it won't
725 * update an entire destination register.
726 *
727 * For example, dead code elimination and live variable analysis want to know
728 * when a write to a variable screens off any preceding values that were in
729 * it.
730 */
731 bool
732 fs_inst::is_partial_write()
733 {
734 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
735 this->force_uncompressed ||
736 this->force_sechalf || !this->dst.is_contiguous());
737 }
738
739 int
740 fs_inst::regs_read(fs_visitor *v, int arg)
741 {
742 if (is_tex() && arg == 0 && src[0].file == GRF) {
743 if (v->dispatch_width == 16)
744 return (mlen + 1) / 2;
745 else
746 return mlen;
747 }
748 return 1;
749 }
750
751 bool
752 fs_inst::reads_flag()
753 {
754 return predicate;
755 }
756
757 bool
758 fs_inst::writes_flag()
759 {
760 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
761 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
762 }
763
764 /**
765 * Returns how many MRFs an FS opcode will write over.
766 *
767 * Note that this is not the 0 or 1 implied writes in an actual gen
768 * instruction -- the FS opcodes often generate MOVs in addition.
769 */
770 int
771 fs_visitor::implied_mrf_writes(fs_inst *inst)
772 {
773 if (inst->mlen == 0)
774 return 0;
775
776 if (inst->base_mrf == -1)
777 return 0;
778
779 switch (inst->opcode) {
780 case SHADER_OPCODE_RCP:
781 case SHADER_OPCODE_RSQ:
782 case SHADER_OPCODE_SQRT:
783 case SHADER_OPCODE_EXP2:
784 case SHADER_OPCODE_LOG2:
785 case SHADER_OPCODE_SIN:
786 case SHADER_OPCODE_COS:
787 return 1 * dispatch_width / 8;
788 case SHADER_OPCODE_POW:
789 case SHADER_OPCODE_INT_QUOTIENT:
790 case SHADER_OPCODE_INT_REMAINDER:
791 return 2 * dispatch_width / 8;
792 case SHADER_OPCODE_TEX:
793 case FS_OPCODE_TXB:
794 case SHADER_OPCODE_TXD:
795 case SHADER_OPCODE_TXF:
796 case SHADER_OPCODE_TXF_CMS:
797 case SHADER_OPCODE_TXF_MCS:
798 case SHADER_OPCODE_TG4:
799 case SHADER_OPCODE_TG4_OFFSET:
800 case SHADER_OPCODE_TXL:
801 case SHADER_OPCODE_TXS:
802 case SHADER_OPCODE_LOD:
803 return 1;
804 case FS_OPCODE_FB_WRITE:
805 return 2;
806 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
807 case SHADER_OPCODE_GEN4_SCRATCH_READ:
808 return 1;
809 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
810 return inst->mlen;
811 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
812 return 2;
813 case SHADER_OPCODE_UNTYPED_ATOMIC:
814 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
815 return 0;
816 default:
817 assert(!"not reached");
818 return inst->mlen;
819 }
820 }
821
822 int
823 fs_visitor::virtual_grf_alloc(int size)
824 {
825 if (virtual_grf_array_size <= virtual_grf_count) {
826 if (virtual_grf_array_size == 0)
827 virtual_grf_array_size = 16;
828 else
829 virtual_grf_array_size *= 2;
830 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
831 virtual_grf_array_size);
832 }
833 virtual_grf_sizes[virtual_grf_count] = size;
834 return virtual_grf_count++;
835 }
836
837 /** Register file and register number constructor (float-typed). */
838 fs_reg::fs_reg(enum register_file file, int reg)
839 {
840 init();
841 this->file = file;
842 this->reg = reg;
843 this->type = BRW_REGISTER_TYPE_F;
844 }
845
846 /** Register file and register number constructor with an explicit type. */
847 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
848 {
849 init();
850 this->file = file;
851 this->reg = reg;
852 this->type = type;
853 }
854
855 /** Automatic reg constructor. */
856 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
857 {
858 init();
859
860 this->file = GRF;
861 this->reg = v->virtual_grf_alloc(v->type_size(type));
862 this->reg_offset = 0;
863 this->type = brw_type_for_base_type(type);
864 }
865
866 fs_reg *
867 fs_visitor::variable_storage(ir_variable *var)
868 {
869 return (fs_reg *)hash_table_find(this->variable_ht, var);
870 }
871
872 void
873 import_uniforms_callback(const void *key,
874 void *data,
875 void *closure)
876 {
877 struct hash_table *dst_ht = (struct hash_table *)closure;
878 const fs_reg *reg = (const fs_reg *)data;
879
880 if (reg->file != UNIFORM)
881 return;
882
883 hash_table_insert(dst_ht, data, key);
884 }
885
886 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
887  * This brings in those uniform definitions.
888 */
889 void
890 fs_visitor::import_uniforms(fs_visitor *v)
891 {
892 hash_table_call_foreach(v->variable_ht,
893 import_uniforms_callback,
894 variable_ht);
895 this->params_remap = v->params_remap;
896 this->nr_params_remap = v->nr_params_remap;
897 }
898
899 /* Our support for uniforms is piggy-backed on the struct
900 * gl_fragment_program, because that's where the values actually
901 * get stored, rather than in some global gl_shader_program uniform
902 * store.
903 */
904 void
905 fs_visitor::setup_uniform_values(ir_variable *ir)
906 {
907 int namelen = strlen(ir->name);
908
909 /* The data for our (non-builtin) uniforms is stored in a series of
910 * gl_uniform_driver_storage structs for each subcomponent that
911 * glGetUniformLocation() could name. We know it's been set up in the same
912 * order we'd walk the type, so walk the list of storage and find anything
913 * with our name, or the prefix of a component that starts with our name.
914 */
915 unsigned params_before = stage_prog_data->nr_params;
916 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
917 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
918
919 if (strncmp(ir->name, storage->name, namelen) != 0 ||
920 (storage->name[namelen] != 0 &&
921 storage->name[namelen] != '.' &&
922 storage->name[namelen] != '[')) {
923 continue;
924 }
925
926 unsigned slots = storage->type->component_slots();
927 if (storage->array_elements)
928 slots *= storage->array_elements;
929
930 for (unsigned i = 0; i < slots; i++) {
931 stage_prog_data->param[stage_prog_data->nr_params++] =
932 &storage->storage[i].f;
933 }
934 }
935
936 /* Make sure we actually initialized the right amount of stuff here. */
937 assert(params_before + ir->type->component_slots() ==
938 stage_prog_data->nr_params);
939 (void)params_before;
940 }
941
942
943 /* Our support for builtin uniforms is even scarier than non-builtin.
944 * It sits on top of the PROG_STATE_VAR parameters that are
945 * automatically updated from GL context state.
946 */
947 void
948 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
949 {
950 const ir_state_slot *const slots = ir->state_slots;
951 assert(ir->state_slots != NULL);
952
953 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
954       /* This state reference has already been set up by ir_to_mesa, but we'll
955 * get the same index back here.
956 */
957 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
958 (gl_state_index *)slots[i].tokens);
959
960 /* Add each of the unique swizzles of the element as a parameter.
961 * This'll end up matching the expected layout of the
962 * array/matrix/structure we're trying to fill in.
963 */
964 int last_swiz = -1;
965 for (unsigned int j = 0; j < 4; j++) {
966 int swiz = GET_SWZ(slots[i].swizzle, j);
967 if (swiz == last_swiz)
968 break;
969 last_swiz = swiz;
970
971 stage_prog_data->param[stage_prog_data->nr_params++] =
972 &fp->Base.Parameters->ParameterValues[index][swiz].f;
973 }
974 }
975 }
976
977 fs_reg *
978 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
979 {
980 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
981 fs_reg wpos = *reg;
982 bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
983
984 /* gl_FragCoord.x */
985 if (ir->data.pixel_center_integer) {
986 emit(MOV(wpos, this->pixel_x));
987 } else {
988 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
989 }
990 wpos.reg_offset++;
991
992 /* gl_FragCoord.y */
993 if (!flip && ir->data.pixel_center_integer) {
994 emit(MOV(wpos, this->pixel_y));
995 } else {
996 fs_reg pixel_y = this->pixel_y;
997 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
998
999 if (flip) {
1000 pixel_y.negate = true;
1001 offset += c->key.drawable_height - 1.0;
1002 }
1003
1004 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1005 }
1006 wpos.reg_offset++;
1007
1008 /* gl_FragCoord.z */
1009 if (brw->gen >= 6) {
1010 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
1011 } else {
1012 emit(FS_OPCODE_LINTERP, wpos,
1013 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1014 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1015 interp_reg(VARYING_SLOT_POS, 2));
1016 }
1017 wpos.reg_offset++;
1018
1019 /* gl_FragCoord.w: Already set up in emit_interpolation */
1020 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1021
1022 return reg;
1023 }
1024
1025 fs_inst *
1026 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1027 glsl_interp_qualifier interpolation_mode,
1028 bool is_centroid, bool is_sample)
1029 {
1030 brw_wm_barycentric_interp_mode barycoord_mode;
1031 if (brw->gen >= 6) {
1032 if (is_centroid) {
1033 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1034 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1035 else
1036 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1037 } else if (is_sample) {
1038 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1039 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1040 else
1041 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1042 } else {
1043 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1044 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1045 else
1046 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1047 }
1048 } else {
1049 /* On Ironlake and below, there is only one interpolation mode.
1050 * Centroid interpolation doesn't mean anything on this hardware --
1051 * there is no multisampling.
1052 */
1053 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1054 }
1055 return emit(FS_OPCODE_LINTERP, attr,
1056 this->delta_x[barycoord_mode],
1057 this->delta_y[barycoord_mode], interp);
1058 }
1059
1060 fs_reg *
1061 fs_visitor::emit_general_interpolation(ir_variable *ir)
1062 {
1063 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1064 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1065 fs_reg attr = *reg;
1066
1067 unsigned int array_elements;
1068 const glsl_type *type;
1069
1070 if (ir->type->is_array()) {
1071 array_elements = ir->type->length;
1072 if (array_elements == 0) {
1073 fail("dereferenced array '%s' has length 0\n", ir->name);
1074 }
1075 type = ir->type->fields.array;
1076 } else {
1077 array_elements = 1;
1078 type = ir->type;
1079 }
1080
1081 glsl_interp_qualifier interpolation_mode =
1082 ir->determine_interpolation_mode(c->key.flat_shade);
1083
1084 int location = ir->data.location;
1085 for (unsigned int i = 0; i < array_elements; i++) {
1086 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1087 if (c->prog_data.urb_setup[location] == -1) {
1088 /* If there's no incoming setup data for this slot, don't
1089 * emit interpolation for it.
1090 */
1091 attr.reg_offset += type->vector_elements;
1092 location++;
1093 continue;
1094 }
1095
1096 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1097 /* Constant interpolation (flat shading) case. The SF has
1098 * handed us defined values in only the constant offset
1099 * field of the setup reg.
1100 */
1101 for (unsigned int k = 0; k < type->vector_elements; k++) {
1102 struct brw_reg interp = interp_reg(location, k);
1103 interp = suboffset(interp, 3);
1104 interp.type = reg->type;
1105 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1106 attr.reg_offset++;
1107 }
1108 } else {
1109 /* Smooth/noperspective interpolation case. */
1110 for (unsigned int k = 0; k < type->vector_elements; k++) {
1111 struct brw_reg interp = interp_reg(location, k);
1112 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1113 ir->data.centroid && !c->key.persample_shading,
1114 ir->data.sample || c->key.persample_shading);
1115 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1116 /* Get the pixel/sample mask into f0 so that we know
1117 * which pixels are lit. Then, for each channel that is
1118 * unlit, replace the centroid data with non-centroid
1119 * data.
1120 */
1121 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1122 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1123 interpolation_mode,
1124 false, false);
1125 inst->predicate = BRW_PREDICATE_NORMAL;
1126 inst->predicate_inverse = true;
1127 }
1128 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1129 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1130 }
1131 attr.reg_offset++;
1132 }
1133
1134 }
1135 location++;
1136 }
1137 }
1138
1139 return reg;
1140 }
1141
1142 fs_reg *
1143 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1144 {
1145 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1146
1147 /* The frontfacing comes in as a bit in the thread payload. */
1148 if (brw->gen >= 6) {
1149 emit(BRW_OPCODE_ASR, *reg,
1150 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1151 fs_reg(15));
1152 emit(BRW_OPCODE_NOT, *reg, *reg);
1153 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1154 } else {
1155 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1156 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1157 * us front face
1158 */
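      /* Illustrative values: for a back-facing primitive bit 31 is set, so
       * the UD value is >= 1u << 31, the unsigned "<" comparison fails, and
       * CMP leaves the low bit 0; for a front-facing primitive it succeeds
       * and the low bit is 1. The AND below masks off the undefined upper
       * bits either way.
       */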
1159 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1160 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1161 }
1162
1163 return reg;
1164 }
1165
1166 void
1167 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1168 {
1169 assert(dst.type == BRW_REGISTER_TYPE_F);
1170
1171 if (c->key.compute_pos_offset) {
1172 /* Convert int_sample_pos to floating point */
1173 emit(MOV(dst, int_sample_pos));
1174 /* Scale to the range [0, 1] */
1175 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1176 }
1177 else {
1178 /* From ARB_sample_shading specification:
1179 * "When rendering to a non-multisample buffer, or if multisample
1180 * rasterization is disabled, gl_SamplePosition will always be
1181        * (0.5, 0.5)."
1182 */
1183 emit(MOV(dst, fs_reg(0.5f)));
1184 }
1185 }
1186
1187 fs_reg *
1188 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1189 {
1190 assert(brw->gen >= 6);
1191 assert(ir->type == glsl_type::vec2_type);
1192
1193 this->current_annotation = "compute sample position";
1194 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1195 fs_reg pos = *reg;
1196 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1197 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1198
1199 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1200 * mode will be enabled.
1201 *
1202 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1203 * R31.1:0 Position Offset X/Y for Slot[3:0]
1204 * R31.3:2 Position Offset X/Y for Slot[7:4]
1205 * .....
1206 *
1207 * The X, Y sample positions come in as bytes in thread payload. So, read
1208 * the positions using vstride=16, width=8, hstride=2.
1209 */
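   /* Sketch of the layout this implies (an assumption drawn from the note
    * above, not re-checked against the PRM here): the bytes are interleaved
    * x/y pairs, one pair per slot, so the hstride=2 region below picks up
    * every other byte and yields x0..x7, while suboffset(..., 1) in the MOV
    * further down yields y0..y7.
    */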
1210 struct brw_reg sample_pos_reg =
1211 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1212 BRW_REGISTER_TYPE_B), 16, 8, 2);
1213
1214 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1215 if (dispatch_width == 16) {
1216 fs_inst *inst = emit(MOV(half(int_sample_x, 1),
1217 fs_reg(suboffset(sample_pos_reg, 16))));
1218 inst->force_sechalf = true;
1219 }
1220 /* Compute gl_SamplePosition.x */
1221 compute_sample_position(pos, int_sample_x);
1222 pos.reg_offset++;
1223 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1224 if (dispatch_width == 16) {
1225 fs_inst *inst = emit(MOV(half(int_sample_y, 1),
1226 fs_reg(suboffset(sample_pos_reg, 17))));
1227 inst->force_sechalf = true;
1228 }
1229 /* Compute gl_SamplePosition.y */
1230 compute_sample_position(pos, int_sample_y);
1231 return reg;
1232 }
1233
1234 fs_reg *
1235 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1236 {
1237 assert(brw->gen >= 6);
1238
1239 this->current_annotation = "compute sample id";
1240 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1241
1242 if (c->key.compute_sample_id) {
1243 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1244 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1245 t2.type = BRW_REGISTER_TYPE_UW;
1246
1247 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1248 * 8x multisampling, subspan 0 will represent sample N (where N
1249 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1250 * 7. We can find the value of N by looking at R0.0 bits 7:6
1251 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1252 * (since samples are always delivered in pairs). That is, we
1253 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1254 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1255 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1256 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1257 * populating a temporary variable with the sequence (0, 1, 2, 3),
1258 * and then reading from it using vstride=1, width=4, hstride=0.
1259 * These computations hold good for 4x multisampling as well.
1260 */
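      /* Worked example with illustrative numbers: if R0.0 bits 7:6 hold
       * SSPI = 2, then (R0.0 & 0xc0) >> 5 = 4, and adding the
       * (0, 0, 0, 0, 1, 1, 1, 1) sequence gives sample IDs
       * 4, 4, 4, 4, 5, 5, 5, 5 for the SIMD8 channels.
       */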
1261 emit(BRW_OPCODE_AND, t1,
1262 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1263 fs_reg(brw_imm_d(0xc0)));
1264 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1265 /* This works for both SIMD8 and SIMD16 */
1266 emit(MOV(t2, brw_imm_v(0x3210)));
1267 /* This special instruction takes care of setting vstride=1,
1268 * width=4, hstride=0 of t2 during an ADD instruction.
1269 */
1270 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1271 } else {
1272 /* As per GL_ARB_sample_shading specification:
1273 * "When rendering to a non-multisample buffer, or if multisample
1274 * rasterization is disabled, gl_SampleID will always be zero."
1275 */
1276 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1277 }
1278
1279 return reg;
1280 }
1281
1282 fs_reg *
1283 fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
1284 {
1285 assert(brw->gen >= 7);
1286 this->current_annotation = "compute gl_SampleMaskIn";
1287 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1288 emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
1289 return reg;
1290 }
1291
1292 fs_reg
1293 fs_visitor::fix_math_operand(fs_reg src)
1294 {
1295 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1296 * might be able to do better by doing execsize = 1 math and then
1297 * expanding that result out, but we would need to be careful with
1298 * masking.
1299 *
1300 * The hardware ignores source modifiers (negate and abs) on math
1301 * instructions, so we also move to a temp to set those up.
1302 */
1303 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1304 !src.abs && !src.negate)
1305 return src;
1306
1307 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1308 * operands to math
1309 */
1310 if (brw->gen >= 7 && src.file != IMM)
1311 return src;
1312
1313 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1314 expanded.type = src.type;
1315 emit(BRW_OPCODE_MOV, expanded, src);
1316 return expanded;
1317 }
1318
1319 fs_inst *
1320 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1321 {
1322 switch (opcode) {
1323 case SHADER_OPCODE_RCP:
1324 case SHADER_OPCODE_RSQ:
1325 case SHADER_OPCODE_SQRT:
1326 case SHADER_OPCODE_EXP2:
1327 case SHADER_OPCODE_LOG2:
1328 case SHADER_OPCODE_SIN:
1329 case SHADER_OPCODE_COS:
1330 break;
1331 default:
1332 assert(!"not reached: bad math opcode");
1333 return NULL;
1334 }
1335
1336 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1337 * might be able to do better by doing execsize = 1 math and then
1338 * expanding that result out, but we would need to be careful with
1339 * masking.
1340 *
1341 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1342 * instructions, so we also move to a temp to set those up.
1343 */
1344 if (brw->gen >= 6)
1345 src = fix_math_operand(src);
1346
1347 fs_inst *inst = emit(opcode, dst, src);
1348
1349 if (brw->gen < 6) {
1350 inst->base_mrf = 2;
1351 inst->mlen = dispatch_width / 8;
1352 }
1353
1354 return inst;
1355 }
1356
1357 fs_inst *
1358 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1359 {
1360 int base_mrf = 2;
1361 fs_inst *inst;
1362
1363 switch (opcode) {
1364 case SHADER_OPCODE_INT_QUOTIENT:
1365 case SHADER_OPCODE_INT_REMAINDER:
1366 if (brw->gen >= 7 && dispatch_width == 16)
1367 fail("SIMD16 INTDIV unsupported\n");
1368 break;
1369 case SHADER_OPCODE_POW:
1370 break;
1371 default:
1372 assert(!"not reached: unsupported binary math opcode.");
1373 return NULL;
1374 }
1375
1376 if (brw->gen >= 6) {
1377 src0 = fix_math_operand(src0);
1378 src1 = fix_math_operand(src1);
1379
1380 inst = emit(opcode, dst, src0, src1);
1381 } else {
1382 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1383 * "Message Payload":
1384 *
1385 * "Operand0[7]. For the INT DIV functions, this operand is the
1386 * denominator."
1387 * ...
1388 * "Operand1[7]. For the INT DIV functions, this operand is the
1389 * numerator."
1390 */
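      /* In other words, the swap below routes src1 (the denominator in our
       * IR) into Operand0 and src0 (the numerator) into Operand1 for the
       * INT DIV opcodes, while POW keeps the natural src0/src1 order.
       */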
1391 bool is_int_div = opcode != SHADER_OPCODE_POW;
1392 fs_reg &op0 = is_int_div ? src1 : src0;
1393 fs_reg &op1 = is_int_div ? src0 : src1;
1394
1395 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1396 inst = emit(opcode, dst, op0, reg_null_f);
1397
1398 inst->base_mrf = base_mrf;
1399 inst->mlen = 2 * dispatch_width / 8;
1400 }
1401 return inst;
1402 }
1403
1404 void
1405 fs_visitor::assign_curb_setup()
1406 {
1407 c->prog_data.curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1408 if (dispatch_width == 8) {
1409 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1410 } else {
1411 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1412 }
1413
1414 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1415 foreach_list(node, &this->instructions) {
1416 fs_inst *inst = (fs_inst *)node;
1417
1418 for (unsigned int i = 0; i < 3; i++) {
1419 if (inst->src[i].file == UNIFORM) {
1420 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1421 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1422 constant_nr / 8,
1423 constant_nr % 8);
1424
1425 inst->src[i].file = HW_REG;
1426 inst->src[i].fixed_hw_reg = byte_offset(
1427 retype(brw_reg, inst->src[i].type),
1428 inst->src[i].subreg_offset);
1429 }
1430 }
1431 }
1432 }
1433
1434 void
1435 fs_visitor::calculate_urb_setup()
1436 {
1437 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1438 c->prog_data.urb_setup[i] = -1;
1439 }
1440
1441 int urb_next = 0;
1442 /* Figure out where each of the incoming setup attributes lands. */
1443 if (brw->gen >= 6) {
1444 if (_mesa_bitcount_64(fp->Base.InputsRead &
1445 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1446 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1447 * first 16 varying inputs, so we can put them wherever we want.
1448 * Just put them in order.
1449 *
1450 * This is useful because it means that (a) inputs not used by the
1451 * fragment shader won't take up valuable register space, and (b) we
1452 * won't have to recompile the fragment shader if it gets paired with
1453 * a different vertex (or geometry) shader.
1454 */
1455 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1456 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1457 BITFIELD64_BIT(i)) {
1458 c->prog_data.urb_setup[i] = urb_next++;
1459 }
1460 }
1461 } else {
1462 /* We have enough input varyings that the SF/SBE pipeline stage can't
1463 * arbitrarily rearrange them to suit our whim; we have to put them
1464 * in an order that matches the output of the previous pipeline stage
1465 * (geometry or vertex shader).
1466 */
1467 struct brw_vue_map prev_stage_vue_map;
1468 brw_compute_vue_map(brw, &prev_stage_vue_map,
1469 c->key.input_slots_valid);
1470 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1471 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1472 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1473 slot++) {
1474 int varying = prev_stage_vue_map.slot_to_varying[slot];
1475 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1476 * unused.
1477 */
1478 if (varying != BRW_VARYING_SLOT_COUNT &&
1479 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1480 BITFIELD64_BIT(varying))) {
1481 c->prog_data.urb_setup[varying] = slot - first_slot;
1482 }
1483 }
1484 urb_next = prev_stage_vue_map.num_slots - first_slot;
1485 }
1486 } else {
1487 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1488 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1489 /* Point size is packed into the header, not as a general attribute */
1490 if (i == VARYING_SLOT_PSIZ)
1491 continue;
1492
1493 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1494 /* The back color slot is skipped when the front color is
1495 * also written to. In addition, some slots can be
1496 * written in the vertex shader and not read in the
1497 * fragment shader. So the register number must always be
1498 * incremented, mapped or not.
1499 */
1500 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1501 c->prog_data.urb_setup[i] = urb_next;
1502 urb_next++;
1503 }
1504 }
1505
1506 /*
1507     * It's an FS-only attribute, and we did interpolation for this attribute
1508     * in the SF thread. So, count it here, too.
1509 *
1510 * See compile_sf_prog() for more info.
1511 */
1512 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1513 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1514 }
1515
1516 c->prog_data.num_varying_inputs = urb_next;
1517 }
1518
1519 void
1520 fs_visitor::assign_urb_setup()
1521 {
1522 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1523
1524    /* Offset all the urb_setup[] indices by the actual position of the
1525 * setup regs, now that the location of the constants has been chosen.
1526 */
1527 foreach_list(node, &this->instructions) {
1528 fs_inst *inst = (fs_inst *)node;
1529
1530 if (inst->opcode == FS_OPCODE_LINTERP) {
1531 assert(inst->src[2].file == HW_REG);
1532 inst->src[2].fixed_hw_reg.nr += urb_start;
1533 }
1534
1535 if (inst->opcode == FS_OPCODE_CINTERP) {
1536 assert(inst->src[0].file == HW_REG);
1537 inst->src[0].fixed_hw_reg.nr += urb_start;
1538 }
1539 }
1540
1541 /* Each attribute is 4 setup channels, each of which is half a reg. */
1542 this->first_non_payload_grf =
1543 urb_start + c->prog_data.num_varying_inputs * 2;
1544 }
1545
1546 /**
1547 * Split large virtual GRFs into separate components if we can.
1548 *
1549 * This is mostly duplicated with what brw_fs_vector_splitting does,
1550 * but that's really conservative because it's afraid of doing
1551 * splitting that doesn't result in real progress after the rest of
1552 * the optimization phases, which would cause infinite looping in
1553 * optimization. We can do it once here, safely. This also has the
1554 * opportunity to split interpolated values, or maybe even uniforms,
1555 * which we don't have at the IR level.
1556 *
1557 * We want to split, because virtual GRFs are what we register
1558 * allocate and spill (due to contiguousness requirements for some
1559 * instructions), and they're what we naturally generate in the
1560 * codegen process, but most virtual GRFs don't actually need to be
1561 * contiguous sets of GRFs. If we split, we'll end up with reduced
1562 * live intervals and better dead code elimination and coalescing.
1563 */
1564 void
1565 fs_visitor::split_virtual_grfs()
1566 {
1567 int num_vars = this->virtual_grf_count;
1568 bool split_grf[num_vars];
1569 int new_virtual_grf[num_vars];
1570
1571    /* Try to split anything larger than one register. */
1572 for (int i = 0; i < num_vars; i++) {
1573 if (this->virtual_grf_sizes[i] != 1)
1574 split_grf[i] = true;
1575 else
1576 split_grf[i] = false;
1577 }
1578
1579 if (brw->has_pln &&
1580 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1581 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1582 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1583 * Gen6, that was the only supported interpolation mode, and since Gen6,
1584 * delta_x and delta_y are in fixed hardware registers.
1585 */
1586 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1587 false;
1588 }
1589
1590 foreach_list(node, &this->instructions) {
1591 fs_inst *inst = (fs_inst *)node;
1592
1593 /* If there's a SEND message that requires contiguous destination
1594 * registers, no splitting is allowed.
1595 */
1596 if (inst->regs_written > 1) {
1597 split_grf[inst->dst.reg] = false;
1598 }
1599
1600 /* If we're sending from a GRF, don't split it, on the assumption that
1601 * the send is reading the whole thing.
1602 */
1603 if (inst->is_send_from_grf()) {
1604 for (int i = 0; i < 3; i++) {
1605 if (inst->src[i].file == GRF) {
1606 split_grf[inst->src[i].reg] = false;
1607 }
1608 }
1609 }
1610 }
1611
1612 /* Allocate new space for split regs. Note that the virtual
1613 * numbers will be contiguous.
1614 */
1615 for (int i = 0; i < num_vars; i++) {
1616 if (split_grf[i]) {
1617 new_virtual_grf[i] = virtual_grf_alloc(1);
1618 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1619 int reg = virtual_grf_alloc(1);
1620 assert(reg == new_virtual_grf[i] + j - 1);
1621 (void) reg;
1622 }
1623 this->virtual_grf_sizes[i] = 1;
1624 }
1625 }
1626
1627 foreach_list(node, &this->instructions) {
1628 fs_inst *inst = (fs_inst *)node;
1629
1630 if (inst->dst.file == GRF &&
1631 split_grf[inst->dst.reg] &&
1632 inst->dst.reg_offset != 0) {
1633 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1634 inst->dst.reg_offset - 1);
1635 inst->dst.reg_offset = 0;
1636 }
1637 for (int i = 0; i < 3; i++) {
1638 if (inst->src[i].file == GRF &&
1639 split_grf[inst->src[i].reg] &&
1640 inst->src[i].reg_offset != 0) {
1641 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1642 inst->src[i].reg_offset - 1);
1643 inst->src[i].reg_offset = 0;
1644 }
1645 }
1646 }
1647 invalidate_live_intervals();
1648 }
1649
1650 /**
1651 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1652 *
1653 * During code generation, we create tons of temporary variables, many of
1654 * which get immediately killed and are never used again. Yet, in later
1655 * optimization and analysis passes, such as compute_live_intervals, we need
1656 * to loop over all the virtual GRFs. Compacting them can save a lot of
1657 * overhead.
1658 */
1659 void
1660 fs_visitor::compact_virtual_grfs()
1661 {
1662 /* Mark which virtual GRFs are used, and count how many. */
1663 int remap_table[this->virtual_grf_count];
1664 memset(remap_table, -1, sizeof(remap_table));
1665
1666 foreach_list(node, &this->instructions) {
1667 const fs_inst *inst = (const fs_inst *) node;
1668
1669 if (inst->dst.file == GRF)
1670 remap_table[inst->dst.reg] = 0;
1671
1672 for (int i = 0; i < 3; i++) {
1673 if (inst->src[i].file == GRF)
1674 remap_table[inst->src[i].reg] = 0;
1675 }
1676 }
1677
1678 /* In addition to registers used in instructions, fs_visitor keeps
1679 * direct references to certain special values which must be patched:
1680 */
1681 fs_reg *special[] = {
1682 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1683 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1684 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1685 &delta_x[0], &delta_x[1], &delta_x[2],
1686 &delta_x[3], &delta_x[4], &delta_x[5],
1687 &delta_y[0], &delta_y[1], &delta_y[2],
1688 &delta_y[3], &delta_y[4], &delta_y[5],
1689 };
1690 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1691 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1692
1693 /* Treat all special values as used, to be conservative */
1694 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1695 if (special[i]->file == GRF)
1696 remap_table[special[i]->reg] = 0;
1697 }
1698
1699 /* Compact the GRF arrays. */
1700 int new_index = 0;
1701 for (int i = 0; i < this->virtual_grf_count; i++) {
1702 if (remap_table[i] != -1) {
1703 remap_table[i] = new_index;
1704 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1705 invalidate_live_intervals();
1706 ++new_index;
1707 }
1708 }
1709
1710 this->virtual_grf_count = new_index;
1711
1712 /* Patch all the instructions to use the newly renumbered registers */
1713 foreach_list(node, &this->instructions) {
1714 fs_inst *inst = (fs_inst *) node;
1715
1716 if (inst->dst.file == GRF)
1717 inst->dst.reg = remap_table[inst->dst.reg];
1718
1719 for (int i = 0; i < 3; i++) {
1720 if (inst->src[i].file == GRF)
1721 inst->src[i].reg = remap_table[inst->src[i].reg];
1722 }
1723 }
1724
1725 /* Patch all the references to special values */
1726 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1727 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1728 special[i]->reg = remap_table[special[i]->reg];
1729 }
1730 }
1731
1732 bool
1733 fs_visitor::remove_dead_constants()
1734 {
1735 if (dispatch_width == 8) {
1736 this->params_remap = ralloc_array(mem_ctx, int, stage_prog_data->nr_params);
1737 this->nr_params_remap = stage_prog_data->nr_params;
1738
1739 for (unsigned int i = 0; i < stage_prog_data->nr_params; i++)
1740 this->params_remap[i] = -1;
1741
1742 /* Find which params are still in use. */
1743 foreach_list(node, &this->instructions) {
1744 fs_inst *inst = (fs_inst *)node;
1745
1746 for (int i = 0; i < 3; i++) {
1747 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1748
1749 if (inst->src[i].file != UNIFORM)
1750 continue;
1751
1752 /* Section 5.11 of the OpenGL 4.3 spec says:
1753 *
1754 * "Out-of-bounds reads return undefined values, which include
1755 * values from other variables of the active program or zero."
1756 */
1757 if (constant_nr < 0 ||
1758 constant_nr >= (int)stage_prog_data->nr_params) {
1759 constant_nr = 0;
1760 }
1761
1762 /* For now, set this to non-negative. We'll give it the
1763 * actual new number in a moment, in order to keep the
1764 * register numbers nicely ordered.
1765 */
1766 this->params_remap[constant_nr] = 0;
1767 }
1768 }
1769
1770 /* Figure out what the new numbers for the params will be. At some
1771 * point when we're doing uniform array access, we're going to want
1772 * to keep the distinction between .reg and .reg_offset, but for
1773 * now we don't care.
1774 */
1775 unsigned int new_nr_params = 0;
1776 for (unsigned int i = 0; i < stage_prog_data->nr_params; i++) {
1777 if (this->params_remap[i] != -1) {
1778 this->params_remap[i] = new_nr_params++;
1779 }
1780 }
1781
1782 /* Update the list of params to be uploaded to match our new numbering. */
1783 for (unsigned int i = 0; i < stage_prog_data->nr_params; i++) {
1784 int remapped = this->params_remap[i];
1785
1786 if (remapped == -1)
1787 continue;
1788
1789 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1790 }
1791
1792 stage_prog_data->nr_params = new_nr_params;
1793 } else {
1794 /* This should have been generated in the SIMD8 pass already. */
1795 assert(this->params_remap);
1796 }
1797
1798 /* Now do the renumbering of the shader to remove unused params. */
1799 foreach_list(node, &this->instructions) {
1800 fs_inst *inst = (fs_inst *)node;
1801
1802 for (int i = 0; i < 3; i++) {
1803 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1804
1805 if (inst->src[i].file != UNIFORM)
1806 continue;
1807
1808          /* As above, alias out-of-bounds accesses to constant 0. */
1809 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1810 constant_nr = 0;
1811 }
1812 assert(this->params_remap[constant_nr] != -1);
1813 inst->src[i].reg = this->params_remap[constant_nr];
1814 inst->src[i].reg_offset = 0;
1815 }
1816 }
1817
1818 return true;
1819 }
1820
1821 /*
1822 * Implements array access of uniforms by inserting a
1823 * PULL_CONSTANT_LOAD instruction.
1824 *
1825 * Unlike temporary GRF array access (where we don't support it due to
1826 * the difficulty of doing relative addressing on instruction
1827 * destinations), we could potentially do array access of uniforms
1828 * that were loaded in GRF space as push constants. In real-world
1829 * usage we've seen, though, the arrays being used are always larger
1830 * than we could load as push constants, so just always move all
1831 * uniform array access out to a pull constant buffer.
1832 */
1833 void
1834 fs_visitor::move_uniform_array_access_to_pull_constants()
1835 {
1836 int pull_constant_loc[stage_prog_data->nr_params];
1837
1838 for (unsigned int i = 0; i < stage_prog_data->nr_params; i++) {
1839 pull_constant_loc[i] = -1;
1840 }
1841
1842 /* Walk through and find array access of uniforms. Put a copy of that
1843 * uniform in the pull constant buffer.
1844 *
1845 * Note that we don't move constant-indexed accesses to arrays. No
1846 * testing has been done of the performance impact of this choice.
1847 */
1848 foreach_list_safe(node, &this->instructions) {
1849 fs_inst *inst = (fs_inst *)node;
1850
1851 for (int i = 0 ; i < 3; i++) {
1852 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1853 continue;
1854
1855 int uniform = inst->src[i].reg;
1856
1857 /* If this array isn't already present in the pull constant buffer,
1858 * add it.
1859 */
1860 if (pull_constant_loc[uniform] == -1) {
1861 const float **values = &stage_prog_data->param[uniform];
1862
1863 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params;
1864
1865 assert(param_size[uniform]);
1866
1867 for (int j = 0; j < param_size[uniform]; j++) {
1868 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1869 values[j];
1870 }
1871 }
1872
1873          /* Set up the annotation tracking for newly generated instructions. */
1874 base_ir = inst->ir;
1875 current_annotation = inst->annotation;
1876
1877 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1878 fs_reg temp = fs_reg(this, glsl_type::float_type);
1879 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1880 surf_index,
1881 *inst->src[i].reladdr,
1882 pull_constant_loc[uniform] +
1883 inst->src[i].reg_offset);
1884 inst->insert_before(&list);
1885
1886 inst->src[i].file = temp.file;
1887 inst->src[i].reg = temp.reg;
1888 inst->src[i].reg_offset = temp.reg_offset;
1889 inst->src[i].reladdr = NULL;
1890 }
1891 }
1892 }
1893
1894 /**
1895 * Choose accesses from the UNIFORM file to demote to using the pull
1896 * constant buffer.
1897 *
1898  * We allow a fragment shader to have more than the GL-specified minimum
1899  * value of the maximum number of fragment shader uniform components (64).
1900  * If there are too many of these, they would fill up all of the register
1901  * space.  So, this pass pushes some of them out to the pull constant
1902  * buffer and updates the program to load them from there.
1903 */
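/* Rough example of the rewrite below (indices illustrative): with a
 * 128-component push limit, a read of uniform u130 becomes
 *
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD vgrf9:F, <surface>, <vec4-aligned offset>
 *    ... u130 replaced by vgrf9 smeared to component (pull_index & 3) ...
 *
 * and u130's value is moved from param[] to pull_param[].
 */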
1904 void
1905 fs_visitor::setup_pull_constants()
1906 {
1907 /* Only allow 16 registers (128 uniform components) as push constants. */
1908 unsigned int max_uniform_components = 16 * 8;
1909 if (stage_prog_data->nr_params <= max_uniform_components)
1910 return;
1911
1912 if (dispatch_width == 16) {
1913 fail("Pull constants not supported in SIMD16\n");
1914 return;
1915 }
1916
1917 /* Just demote the end of the list. We could probably do better
1918 * here, demoting things that are rarely used in the program first.
1919 */
1920 unsigned int pull_uniform_base = max_uniform_components;
1921
1922 int pull_constant_loc[stage_prog_data->nr_params];
1923 for (unsigned int i = 0; i < stage_prog_data->nr_params; i++) {
1924 if (i < pull_uniform_base) {
1925 pull_constant_loc[i] = -1;
1926 } else {
1927 pull_constant_loc[i] = -1;
1928 /* If our constant is already being uploaded for reladdr purposes,
1929 * reuse it.
1930 */
1931 for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j++) {
1932 if (stage_prog_data->pull_param[j] == stage_prog_data->param[i]) {
1933 pull_constant_loc[i] = j;
1934 break;
1935 }
1936 }
1937 if (pull_constant_loc[i] == -1) {
1938 int pull_index = stage_prog_data->nr_pull_params++;
1939 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1940 pull_constant_loc[i] = pull_index;
1941 }
1942 }
1943 }
1944 stage_prog_data->nr_params = pull_uniform_base;
1945
1946 foreach_list(node, &this->instructions) {
1947 fs_inst *inst = (fs_inst *)node;
1948
1949 for (int i = 0; i < 3; i++) {
1950 if (inst->src[i].file != UNIFORM)
1951 continue;
1952
1953 int pull_index = pull_constant_loc[inst->src[i].reg +
1954 inst->src[i].reg_offset];
1955 if (pull_index == -1)
1956 continue;
1957
1958 assert(!inst->src[i].reladdr);
1959
1960 fs_reg dst = fs_reg(this, glsl_type::float_type);
1961 fs_reg index(stage_prog_data->binding_table.pull_constants_start);
1962 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1963 fs_inst *pull =
1964 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1965 dst, index, offset);
1966 pull->ir = inst->ir;
1967 pull->annotation = inst->annotation;
1968
1969 inst->insert_before(pull);
1970
1971 inst->src[i].file = GRF;
1972 inst->src[i].reg = dst.reg;
1973 inst->src[i].reg_offset = 0;
1974 inst->src[i].set_smear(pull_index & 3);
1975 }
1976 }
1977 }
1978
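/**
 * Applies simple algebraic simplifications to the IR: a * 1.0 -> a,
 * a * 0.0 -> 0.0, a + 0.0 -> a, a | a -> a, an LRP whose two interpolation
 * endpoints are equal -> a MOV of that value, and a couple of saturated
 * SEL-with-immediate cases that reduce to a plain MOV.
 */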
1979 bool
1980 fs_visitor::opt_algebraic()
1981 {
1982 bool progress = false;
1983
1984 foreach_list(node, &this->instructions) {
1985 fs_inst *inst = (fs_inst *)node;
1986
1987 switch (inst->opcode) {
1988 case BRW_OPCODE_MUL:
1989 if (inst->src[1].file != IMM)
1990 continue;
1991
1992 /* a * 1.0 = a */
1993 if (inst->src[1].is_one()) {
1994 inst->opcode = BRW_OPCODE_MOV;
1995 inst->src[1] = reg_undef;
1996 progress = true;
1997 break;
1998 }
1999
2000 /* a * 0.0 = 0.0 */
2001 if (inst->src[1].is_zero()) {
2002 inst->opcode = BRW_OPCODE_MOV;
2003 inst->src[0] = inst->src[1];
2004 inst->src[1] = reg_undef;
2005 progress = true;
2006 break;
2007 }
2008
2009 break;
2010 case BRW_OPCODE_ADD:
2011 if (inst->src[1].file != IMM)
2012 continue;
2013
2014 /* a + 0.0 = a */
2015 if (inst->src[1].is_zero()) {
2016 inst->opcode = BRW_OPCODE_MOV;
2017 inst->src[1] = reg_undef;
2018 progress = true;
2019 break;
2020 }
2021 break;
2022 case BRW_OPCODE_OR:
2023 if (inst->src[0].equals(inst->src[1])) {
2024 inst->opcode = BRW_OPCODE_MOV;
2025 inst->src[1] = reg_undef;
2026 progress = true;
2027 break;
2028 }
2029 break;
2030 case BRW_OPCODE_LRP:
2031 if (inst->src[1].equals(inst->src[2])) {
2032 inst->opcode = BRW_OPCODE_MOV;
2033 inst->src[0] = inst->src[1];
2034 inst->src[1] = reg_undef;
2035 inst->src[2] = reg_undef;
2036 progress = true;
2037 break;
2038 }
2039 break;
2040 case BRW_OPCODE_SEL:
2041 if (inst->saturate && inst->src[1].file == IMM) {
2042 switch (inst->conditional_mod) {
2043 case BRW_CONDITIONAL_LE:
2044 case BRW_CONDITIONAL_L:
2045 switch (inst->src[1].type) {
2046 case BRW_REGISTER_TYPE_F:
2047 if (inst->src[1].imm.f >= 1.0f) {
2048 inst->opcode = BRW_OPCODE_MOV;
2049 inst->src[1] = reg_undef;
2050 progress = true;
2051 }
2052 break;
2053 default:
2054 break;
2055 }
2056 break;
2057 case BRW_CONDITIONAL_GE:
2058 case BRW_CONDITIONAL_G:
2059 switch (inst->src[1].type) {
2060 case BRW_REGISTER_TYPE_F:
2061 if (inst->src[1].imm.f <= 0.0f) {
2062 inst->opcode = BRW_OPCODE_MOV;
2063 inst->src[1] = reg_undef;
2064 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2065 progress = true;
2066 }
2067 break;
2068 default:
2069 break;
2070 }
2071 default:
2072 break;
2073 }
2074 }
2075 break;
2076 default:
2077 break;
2078 }
2079 }
2080
2081 return progress;
2082 }
2083
2084 /**
2085 * Removes any instructions writing a VGRF where that VGRF is not used by any
2086 * later instruction.
2087 */
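/* For example (illustrative): if vgrf7 is never read at any later ip,
 *
 *    add vgrf7:F, vgrf1:F, vgrf2:F
 *
 * is removed outright.  Opcodes that also update the accumulator as a side
 * effect (ADDC, SUBB, MACH) instead have their destination retargeted to the
 * null register.
 */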
2088 bool
2089 fs_visitor::dead_code_eliminate()
2090 {
2091 bool progress = false;
2092 int pc = 0;
2093
2094 calculate_live_intervals();
2095
2096 foreach_list_safe(node, &this->instructions) {
2097 fs_inst *inst = (fs_inst *)node;
2098
2099 if (inst->dst.file == GRF && !inst->has_side_effects()) {
2100 bool dead = true;
2101
2102 for (int i = 0; i < inst->regs_written; i++) {
2103 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2104 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2105 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2106 dead = false;
2107 break;
2108 }
2109 }
2110
2111 if (dead) {
2112 /* Don't dead code eliminate instructions that write to the
2113 * accumulator as a side-effect. Instead just set the destination
2114 * to the null register to free it.
2115 */
2116 switch (inst->opcode) {
2117 case BRW_OPCODE_ADDC:
2118 case BRW_OPCODE_SUBB:
2119 case BRW_OPCODE_MACH:
2120 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2121 break;
2122 default:
2123 inst->remove();
2124 progress = true;
2125 break;
2126 }
2127 }
2128 }
2129
2130 pc++;
2131 }
2132
2133 if (progress)
2134 invalidate_live_intervals();
2135
2136 return progress;
2137 }
2138
2139 struct dead_code_hash_key
2140 {
2141 int vgrf;
2142 int reg_offset;
2143 };
2144
2145 static bool
2146 dead_code_hash_compare(const void *a, const void *b)
2147 {
2148 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2149 }
2150
2151 static void
2152 clear_dead_code_hash(struct hash_table *ht)
2153 {
2154 struct hash_entry *entry;
2155
2156 hash_table_foreach(ht, entry) {
2157 _mesa_hash_table_remove(ht, entry);
2158 }
2159 }
2160
2161 static void
2162 insert_dead_code_hash(struct hash_table *ht,
2163 int vgrf, int reg_offset, fs_inst *inst)
2164 {
2165 /* We don't bother freeing keys, because they'll be GCed with the ht. */
2166 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2167
2168 key->vgrf = vgrf;
2169 key->reg_offset = reg_offset;
2170
2171 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2172 }
2173
2174 static struct hash_entry *
2175 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2176 {
2177 struct dead_code_hash_key key;
2178
2179 key.vgrf = vgrf;
2180 key.reg_offset = reg_offset;
2181
2182 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2183 }
2184
2185 static void
2186 remove_dead_code_hash(struct hash_table *ht,
2187 int vgrf, int reg_offset)
2188 {
2189 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2190 if (!entry)
2191 return;
2192
2193 _mesa_hash_table_remove(ht, entry);
2194 }
2195
2196 /**
2197 * Walks basic blocks, removing any regs that are written but not read before
2198 * being redefined.
2199 *
2200 * The dead_code_eliminate() function implements a global dead code
2201  * elimination, but it only handles removing the last write to a register
2202 * if it's never read. This one can handle intermediate writes, but only
2203 * within a basic block.
2204 */
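/* Illustrative example, all within one basic block:
 *
 *    mov vgrf2:F, vgrf1:F        <- redefined below before being read
 *    add vgrf2:F, vgrf3:F, vgrf4:F
 *    mul vgrf5:F, vgrf2:F, vgrf2:F
 *
 * The first MOV is removed, since its entire result is overwritten by the ADD
 * without an intervening read.
 */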
2205 bool
2206 fs_visitor::dead_code_eliminate_local()
2207 {
2208 struct hash_table *ht;
2209 bool progress = false;
2210
2211 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2212
2213 if (ht == NULL) {
2214 return false;
2215 }
2216
2217 foreach_list_safe(node, &this->instructions) {
2218 fs_inst *inst = (fs_inst *)node;
2219
2220       /* At a basic block boundary, empty the HT since we don't understand
2221        * dataflow across it.
2222 */
2223 if (inst->is_control_flow()) {
2224 clear_dead_code_hash(ht);
2225 continue;
2226 }
2227
2228 /* Clear the HT of any instructions that got read. */
2229 for (int i = 0; i < 3; i++) {
2230 fs_reg src = inst->src[i];
2231 if (src.file != GRF)
2232 continue;
2233
2234 int read = 1;
2235 if (inst->is_send_from_grf())
2236 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2237
2238 for (int reg_offset = src.reg_offset;
2239 reg_offset < src.reg_offset + read;
2240 reg_offset++) {
2241 remove_dead_code_hash(ht, src.reg, reg_offset);
2242 }
2243 }
2244
2245 /* Add any update of a GRF to the HT, removing a previous write if it
2246 * wasn't read.
2247 */
2248 if (inst->dst.file == GRF) {
2249 if (inst->regs_written > 1) {
2250 /* We don't know how to trim channels from an instruction's
2251 * writes, so we can't incrementally remove unread channels from
2252             * it.  Just remove whatever it overwrites from the table.
2253 */
2254 for (int i = 0; i < inst->regs_written; i++) {
2255 remove_dead_code_hash(ht,
2256 inst->dst.reg,
2257 inst->dst.reg_offset + i);
2258 }
2259 } else {
2260 struct hash_entry *entry =
2261 get_dead_code_hash_entry(ht, inst->dst.reg,
2262 inst->dst.reg_offset);
2263
2264 if (entry) {
2265 if (inst->is_partial_write()) {
2266 /* For a partial write, we can't remove any previous dead code
2267 * candidate, since we're just modifying their result.
2268 */
2269 } else {
2270 /* We're completely updating a channel, and there was a
2271 * previous write to the channel that wasn't read. Kill it!
2272 */
2273 fs_inst *inst = (fs_inst *)entry->data;
2274 inst->remove();
2275 progress = true;
2276 }
2277
2278 _mesa_hash_table_remove(ht, entry);
2279 }
2280
2281 if (!inst->has_side_effects())
2282 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2283 inst);
2284 }
2285 }
2286 }
2287
2288 _mesa_hash_table_destroy(ht, NULL);
2289
2290 if (progress)
2291 invalidate_live_intervals();
2292
2293 return progress;
2294 }
2295
2296 /**
2297 * Implements register coalescing: Checks if the two registers involved in a
2298 * raw move don't interfere, in which case they can both be stored in the same
2299 * place and the MOV removed.
2300 *
2301 * To do this, all uses of the source of the MOV in the shader are replaced
2302 * with the destination of the MOV. For example:
2303 *
2304 * add vgrf3:F, vgrf1:F, vgrf2:F
2305 * mov vgrf4:F, vgrf3:F
2306 * mul vgrf5:F, vgrf5:F, vgrf4:F
2307 *
2308 * becomes
2309 *
2310 * add vgrf4:F, vgrf1:F, vgrf2:F
2311 * mul vgrf5:F, vgrf5:F, vgrf4:F
2312 */
2313 bool
2314 fs_visitor::register_coalesce()
2315 {
2316 bool progress = false;
2317
2318 calculate_live_intervals();
2319
2320 int src_size = 0;
2321 int channels_remaining = 0;
2322 int reg_from = -1, reg_to = -1;
2323 int reg_to_offset[MAX_SAMPLER_MESSAGE_SIZE];
2324 fs_inst *mov[MAX_SAMPLER_MESSAGE_SIZE];
2325
2326 foreach_list(node, &this->instructions) {
2327 fs_inst *inst = (fs_inst *)node;
2328
2329 if (inst->opcode != BRW_OPCODE_MOV ||
2330 inst->is_partial_write() ||
2331 inst->saturate ||
2332 inst->src[0].file != GRF ||
2333 inst->src[0].negate ||
2334 inst->src[0].abs ||
2335 !inst->src[0].is_contiguous() ||
2336 inst->dst.file != GRF ||
2337 inst->dst.type != inst->src[0].type) {
2338 continue;
2339 }
2340
2341 if (virtual_grf_sizes[inst->src[0].reg] >
2342 virtual_grf_sizes[inst->dst.reg])
2343 continue;
2344
2345 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2346 int var_to = live_intervals->var_from_reg(&inst->dst);
2347
2348 if (live_intervals->vars_interfere(var_from, var_to) &&
2349 !inst->dst.equals(inst->src[0])) {
2350
2351 /* We know that the live ranges of A (var_from) and B (var_to)
2352 * interfere because of the ->vars_interfere() call above. If the end
2353 * of B's live range is after the end of A's range, then we know two
2354 * things:
2355 * - the start of B's live range must be in A's live range (since we
2356 * already know the two ranges interfere, this is the only remaining
2357 * possibility)
2358 * - the interference isn't of the form we're looking for (where B is
2359 * entirely inside A)
2360 */
2361 if (live_intervals->end[var_to] > live_intervals->end[var_from])
2362 continue;
2363
2364 bool overwritten = false;
2365 int scan_ip = -1;
2366
2367 foreach_list(n, &this->instructions) {
2368 fs_inst *scan_inst = (fs_inst *)n;
2369 scan_ip++;
2370
2371 if (scan_inst->is_control_flow()) {
2372 overwritten = true;
2373 break;
2374 }
2375
2376 if (scan_ip <= live_intervals->start[var_to])
2377 continue;
2378
2379 if (scan_ip > live_intervals->end[var_to])
2380 break;
2381
2382 if (scan_inst->dst.equals(inst->dst) ||
2383 scan_inst->dst.equals(inst->src[0])) {
2384 overwritten = true;
2385 break;
2386 }
2387 }
2388
2389 if (overwritten)
2390 continue;
2391 }
2392
2393 if (reg_from != inst->src[0].reg) {
2394 reg_from = inst->src[0].reg;
2395
2396 src_size = virtual_grf_sizes[inst->src[0].reg];
2397 assert(src_size <= MAX_SAMPLER_MESSAGE_SIZE);
2398
2399 channels_remaining = src_size;
2400 memset(mov, 0, sizeof(mov));
2401
2402 reg_to = inst->dst.reg;
2403 }
2404
2405 if (reg_to != inst->dst.reg)
2406 continue;
2407
2408 const int offset = inst->src[0].reg_offset;
2409 reg_to_offset[offset] = inst->dst.reg_offset;
2410 mov[offset] = inst;
2411 channels_remaining--;
2412
2413 if (channels_remaining)
2414 continue;
2415
2416 bool removed = false;
2417 for (int i = 0; i < src_size; i++) {
2418 if (mov[i]) {
2419 removed = true;
2420
2421 mov[i]->opcode = BRW_OPCODE_NOP;
2422 mov[i]->conditional_mod = BRW_CONDITIONAL_NONE;
2423 mov[i]->dst = reg_undef;
2424 mov[i]->src[0] = reg_undef;
2425 mov[i]->src[1] = reg_undef;
2426 mov[i]->src[2] = reg_undef;
2427 }
2428 }
2429
2430 foreach_list(node, &this->instructions) {
2431 fs_inst *scan_inst = (fs_inst *)node;
2432
2433 for (int i = 0; i < src_size; i++) {
2434 if (mov[i]) {
2435 if (scan_inst->dst.file == GRF &&
2436 scan_inst->dst.reg == reg_from &&
2437 scan_inst->dst.reg_offset == i) {
2438 scan_inst->dst.reg = reg_to;
2439 scan_inst->dst.reg_offset = reg_to_offset[i];
2440 }
2441 for (int j = 0; j < 3; j++) {
2442 if (scan_inst->src[j].file == GRF &&
2443 scan_inst->src[j].reg == reg_from &&
2444 scan_inst->src[j].reg_offset == i) {
2445 scan_inst->src[j].reg = reg_to;
2446 scan_inst->src[j].reg_offset = reg_to_offset[i];
2447 }
2448 }
2449 }
2450 }
2451 }
2452
2453 if (removed) {
2454 live_intervals->start[var_to] = MIN2(live_intervals->start[var_to],
2455 live_intervals->start[var_from]);
2456 live_intervals->end[var_to] = MAX2(live_intervals->end[var_to],
2457 live_intervals->end[var_from]);
2458 reg_from = -1;
2459 }
2460 }
2461
2462 foreach_list_safe(node, &this->instructions) {
2463 fs_inst *inst = (fs_inst *)node;
2464
2465 if (inst->opcode == BRW_OPCODE_NOP) {
2466 inst->remove();
2467 progress = true;
2468 }
2469 }
2470
2471 if (progress)
2472 invalidate_live_intervals();
2473
2474 return progress;
2475 }
2476
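/**
 * Looks for MOVs of a GRF into an MRF and, where it is safe, rewrites the
 * instruction that produced the GRF value to write straight into the MRF so
 * the copy can be removed.  E.g. (illustrative):
 *
 *    add vgrf3:F, vgrf1:F, vgrf2:F
 *    mov m4:F, vgrf3:F
 *
 * becomes
 *
 *    add m4:F, vgrf1:F, vgrf2:F
 */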
2477 bool
2478 fs_visitor::compute_to_mrf()
2479 {
2480 bool progress = false;
2481 int next_ip = 0;
2482
2483 calculate_live_intervals();
2484
2485 foreach_list_safe(node, &this->instructions) {
2486 fs_inst *inst = (fs_inst *)node;
2487
2488 int ip = next_ip;
2489 next_ip++;
2490
2491 if (inst->opcode != BRW_OPCODE_MOV ||
2492 inst->is_partial_write() ||
2493 inst->dst.file != MRF || inst->src[0].file != GRF ||
2494 inst->dst.type != inst->src[0].type ||
2495 inst->src[0].abs || inst->src[0].negate ||
2496 !inst->src[0].is_contiguous() ||
2497 inst->src[0].subreg_offset)
2498 continue;
2499
2500 /* Work out which hardware MRF registers are written by this
2501 * instruction.
2502 */
2503 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2504 int mrf_high;
2505 if (inst->dst.reg & BRW_MRF_COMPR4) {
2506 mrf_high = mrf_low + 4;
2507 } else if (dispatch_width == 16 &&
2508 (!inst->force_uncompressed && !inst->force_sechalf)) {
2509 mrf_high = mrf_low + 1;
2510 } else {
2511 mrf_high = mrf_low;
2512 }
2513
2514 /* Can't compute-to-MRF this GRF if someone else was going to
2515 * read it later.
2516 */
2517 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2518 continue;
2519
2520 /* Found a move of a GRF to a MRF. Let's see if we can go
2521 * rewrite the thing that made this GRF to write into the MRF.
2522 */
2523 fs_inst *scan_inst;
2524 for (scan_inst = (fs_inst *)inst->prev;
2525 scan_inst->prev != NULL;
2526 scan_inst = (fs_inst *)scan_inst->prev) {
2527 if (scan_inst->dst.file == GRF &&
2528 scan_inst->dst.reg == inst->src[0].reg) {
2529             /* Found the last instruction to write the reg that we want
2530              * to turn into a compute-to-MRF.
2531 */
2532
2533 /* If this one instruction didn't populate all the
2534 * channels, bail. We might be able to rewrite everything
2535 * that writes that reg, but it would require smarter
2536 * tracking to delay the rewriting until complete success.
2537 */
2538 if (scan_inst->is_partial_write())
2539 break;
2540
2541 /* Things returning more than one register would need us to
2542 * understand coalescing out more than one MOV at a time.
2543 */
2544 if (scan_inst->regs_written > 1)
2545 break;
2546
2547 /* SEND instructions can't have MRF as a destination. */
2548 if (scan_inst->mlen)
2549 break;
2550
2551 if (brw->gen == 6) {
2552 /* gen6 math instructions must have the destination be
2553 * GRF, so no compute-to-MRF for them.
2554 */
2555 if (scan_inst->is_math()) {
2556 break;
2557 }
2558 }
2559
2560 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2561 /* Found the creator of our MRF's source value. */
2562 scan_inst->dst.file = MRF;
2563 scan_inst->dst.reg = inst->dst.reg;
2564 scan_inst->saturate |= inst->saturate;
2565 inst->remove();
2566 progress = true;
2567 }
2568 break;
2569 }
2570
2571 /* We don't handle control flow here. Most computation of
2572 * values that end up in MRFs are shortly before the MRF
2573 * write anyway.
2574 */
2575 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2576 break;
2577
2578 /* You can't read from an MRF, so if someone else reads our
2579 * MRF's source GRF that we wanted to rewrite, that stops us.
2580 */
2581 bool interfered = false;
2582 for (int i = 0; i < 3; i++) {
2583 if (scan_inst->src[i].file == GRF &&
2584 scan_inst->src[i].reg == inst->src[0].reg &&
2585 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2586 interfered = true;
2587 }
2588 }
2589 if (interfered)
2590 break;
2591
2592 if (scan_inst->dst.file == MRF) {
2593 /* If somebody else writes our MRF here, we can't
2594 * compute-to-MRF before that.
2595 */
2596 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2597 int scan_mrf_high;
2598
2599 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2600 scan_mrf_high = scan_mrf_low + 4;
2601 } else if (dispatch_width == 16 &&
2602 (!scan_inst->force_uncompressed &&
2603 !scan_inst->force_sechalf)) {
2604 scan_mrf_high = scan_mrf_low + 1;
2605 } else {
2606 scan_mrf_high = scan_mrf_low;
2607 }
2608
2609 if (mrf_low == scan_mrf_low ||
2610 mrf_low == scan_mrf_high ||
2611 mrf_high == scan_mrf_low ||
2612 mrf_high == scan_mrf_high) {
2613 break;
2614 }
2615 }
2616
2617 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2618 /* Found a SEND instruction, which means that there are
2619 * live values in MRFs from base_mrf to base_mrf +
2620 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2621 * above it.
2622 */
2623 if (mrf_low >= scan_inst->base_mrf &&
2624 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2625 break;
2626 }
2627 if (mrf_high >= scan_inst->base_mrf &&
2628 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2629 break;
2630 }
2631 }
2632 }
2633 }
2634
2635 if (progress)
2636 invalidate_live_intervals();
2637
2638 return progress;
2639 }
2640
2641 /**
2642 * Walks through basic blocks, looking for repeated MRF writes and
2643 * removing the later ones.
2644 */
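/* For example (illustrative), within one basic block:
 *
 *    mov m3:F, vgrf8:F
 *    ...
 *    mov m3:F, vgrf8:F      <- identical to the still-valid previous write
 *
 * The second MOV is removed, provided neither m3 nor vgrf8 was written in
 * between.
 */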
2645 bool
2646 fs_visitor::remove_duplicate_mrf_writes()
2647 {
2648 fs_inst *last_mrf_move[16];
2649 bool progress = false;
2650
2651 /* Need to update the MRF tracking for compressed instructions. */
2652 if (dispatch_width == 16)
2653 return false;
2654
2655 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2656
2657 foreach_list_safe(node, &this->instructions) {
2658 fs_inst *inst = (fs_inst *)node;
2659
2660 if (inst->is_control_flow()) {
2661 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2662 }
2663
2664 if (inst->opcode == BRW_OPCODE_MOV &&
2665 inst->dst.file == MRF) {
2666 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2667 if (prev_inst && inst->equals(prev_inst)) {
2668 inst->remove();
2669 progress = true;
2670 continue;
2671 }
2672 }
2673
2674 /* Clear out the last-write records for MRFs that were overwritten. */
2675 if (inst->dst.file == MRF) {
2676 last_mrf_move[inst->dst.reg] = NULL;
2677 }
2678
2679 if (inst->mlen > 0 && inst->base_mrf != -1) {
2680 /* Found a SEND instruction, which will include two or fewer
2681 * implied MRF writes. We could do better here.
2682 */
2683 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2684 last_mrf_move[inst->base_mrf + i] = NULL;
2685 }
2686 }
2687
2688 /* Clear out any MRF move records whose sources got overwritten. */
2689 if (inst->dst.file == GRF) {
2690 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2691 if (last_mrf_move[i] &&
2692 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2693 last_mrf_move[i] = NULL;
2694 }
2695 }
2696 }
2697
2698 if (inst->opcode == BRW_OPCODE_MOV &&
2699 inst->dst.file == MRF &&
2700 inst->src[0].file == GRF &&
2701 !inst->is_partial_write()) {
2702 last_mrf_move[inst->dst.reg] = inst;
2703 }
2704 }
2705
2706 if (progress)
2707 invalidate_live_intervals();
2708
2709 return progress;
2710 }
2711
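/* Helper for the gen4 SEND dependency workarounds below: clears the
 * needs-dependency flags for any GRFs in [first_grf, first_grf + grf_len)
 * that the given instruction reads, including the second register of a
 * compressed SIMD16 read.
 */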
2712 static void
2713 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2714 int first_grf, int grf_len)
2715 {
2716 bool inst_simd16 = (dispatch_width > 8 &&
2717 !inst->force_uncompressed &&
2718 !inst->force_sechalf);
2719
2720 /* Clear the flag for registers that actually got read (as expected). */
2721 for (int i = 0; i < 3; i++) {
2722 int grf;
2723 if (inst->src[i].file == GRF) {
2724 grf = inst->src[i].reg;
2725 } else if (inst->src[i].file == HW_REG &&
2726 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2727 grf = inst->src[i].fixed_hw_reg.nr;
2728 } else {
2729 continue;
2730 }
2731
2732 if (grf >= first_grf &&
2733 grf < first_grf + grf_len) {
2734 deps[grf - first_grf] = false;
2735 if (inst_simd16)
2736 deps[grf - first_grf + 1] = false;
2737 }
2738 }
2739 }
2740
2741 /**
2742 * Implements this workaround for the original 965:
2743 *
2744 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2745 * check for post destination dependencies on this instruction, software
2746 * must ensure that there is no destination hazard for the case of ‘write
2747 * followed by a posted write’ shown in the following example.
2748 *
2749 * 1. mov r3 0
2750 * 2. send r3.xy <rest of send instruction>
2751 * 3. mov r2 r3
2752 *
2753 * Due to no post-destination dependency check on the ‘send’, the above
2754 * code sequence could have two instructions (1 and 2) in flight at the
2755  *    same time that both consider ‘r3’ as the target of their final writes."
2756 */
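/* The pass below resolves this by walking backwards from the SEND and, for
 * each destination register with a potentially outstanding write, inserting a
 * dependency-resolving MOV (DEP_RESOLVE_MOV) before the SEND that reads the
 * register, forcing the earlier write to complete before the posted write is
 * issued.
 */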
2757 void
2758 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2759 {
2760 int reg_size = dispatch_width / 8;
2761 int write_len = inst->regs_written * reg_size;
2762 int first_write_grf = inst->dst.reg;
2763 bool needs_dep[BRW_MAX_MRF];
2764 assert(write_len < (int)sizeof(needs_dep) - 1);
2765
2766 memset(needs_dep, false, sizeof(needs_dep));
2767 memset(needs_dep, true, write_len);
2768
2769 clear_deps_for_inst_src(inst, dispatch_width,
2770 needs_dep, first_write_grf, write_len);
2771
2772 /* Walk backwards looking for writes to registers we're writing which
2773 * aren't read since being written. If we hit the start of the program,
2774 * we assume that there are no outstanding dependencies on entry to the
2775 * program.
2776 */
2777 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2778 scan_inst != NULL;
2779 scan_inst = (fs_inst *)scan_inst->prev) {
2780
2781 /* If we hit control flow, assume that there *are* outstanding
2782 * dependencies, and force their cleanup before our instruction.
2783 */
2784 if (scan_inst->is_control_flow()) {
2785 for (int i = 0; i < write_len; i++) {
2786 if (needs_dep[i]) {
2787 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2788 }
2789 }
2790 return;
2791 }
2792
2793 bool scan_inst_simd16 = (dispatch_width > 8 &&
2794 !scan_inst->force_uncompressed &&
2795 !scan_inst->force_sechalf);
2796
2797       /* We insert our reads as late as possible, on the assumption that any
2798        * instruction other than a MOV that might have left us an outstanding
2799        * dependency has more latency than a MOV does.
2800 */
2801 if (scan_inst->dst.file == GRF) {
2802 for (int i = 0; i < scan_inst->regs_written; i++) {
2803 int reg = scan_inst->dst.reg + i * reg_size;
2804
2805 if (reg >= first_write_grf &&
2806 reg < first_write_grf + write_len &&
2807 needs_dep[reg - first_write_grf]) {
2808 inst->insert_before(DEP_RESOLVE_MOV(reg));
2809 needs_dep[reg - first_write_grf] = false;
2810 if (scan_inst_simd16)
2811 needs_dep[reg - first_write_grf + 1] = false;
2812 }
2813 }
2814 }
2815
2816 /* Clear the flag for registers that actually got read (as expected). */
2817 clear_deps_for_inst_src(scan_inst, dispatch_width,
2818 needs_dep, first_write_grf, write_len);
2819
2820 /* Continue the loop only if we haven't resolved all the dependencies */
2821 int i;
2822 for (i = 0; i < write_len; i++) {
2823 if (needs_dep[i])
2824 break;
2825 }
2826 if (i == write_len)
2827 return;
2828 }
2829 }
2830
2831 /**
2832 * Implements this workaround for the original 965:
2833 *
2834 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2835 * used as a destination register until after it has been sourced by an
2836  *    instruction with a different destination register."
2837 */
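/* The pass below walks forward from the SEND and, before any instruction that
 * would redefine part of the SEND's destination without it having been read,
 * inserts a dependency-resolving MOV (DEP_RESOLVE_MOV) that sources the
 * register first, satisfying the errata.
 */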
2838 void
2839 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2840 {
2841 int write_len = inst->regs_written * dispatch_width / 8;
2842 int first_write_grf = inst->dst.reg;
2843 bool needs_dep[BRW_MAX_MRF];
2844 assert(write_len < (int)sizeof(needs_dep) - 1);
2845
2846 memset(needs_dep, false, sizeof(needs_dep));
2847 memset(needs_dep, true, write_len);
2848 /* Walk forwards looking for writes to registers we're writing which aren't
2849 * read before being written.
2850 */
2851 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2852 !scan_inst->is_tail_sentinel();
2853 scan_inst = (fs_inst *)scan_inst->next) {
2854 /* If we hit control flow, force resolve all remaining dependencies. */
2855 if (scan_inst->is_control_flow()) {
2856 for (int i = 0; i < write_len; i++) {
2857 if (needs_dep[i])
2858 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2859 }
2860 return;
2861 }
2862
2863 /* Clear the flag for registers that actually got read (as expected). */
2864 clear_deps_for_inst_src(scan_inst, dispatch_width,
2865 needs_dep, first_write_grf, write_len);
2866
2867 /* We insert our reads as late as possible since they're reading the
2868 * result of a SEND, which has massive latency.
2869 */
2870 if (scan_inst->dst.file == GRF &&
2871 scan_inst->dst.reg >= first_write_grf &&
2872 scan_inst->dst.reg < first_write_grf + write_len &&
2873 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2874 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2875 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2876 }
2877
2878 /* Continue the loop only if we haven't resolved all the dependencies */
2879 int i;
2880 for (i = 0; i < write_len; i++) {
2881 if (needs_dep[i])
2882 break;
2883 }
2884 if (i == write_len)
2885 return;
2886 }
2887
2888 /* If we hit the end of the program, resolve all remaining dependencies out
2889 * of paranoia.
2890 */
2891 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2892 assert(last_inst->eot);
2893 for (int i = 0; i < write_len; i++) {
2894 if (needs_dep[i])
2895 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2896 }
2897 }
2898
2899 void
2900 fs_visitor::insert_gen4_send_dependency_workarounds()
2901 {
2902 if (brw->gen != 4 || brw->is_g4x)
2903 return;
2904
2905 /* Note that we're done with register allocation, so GRF fs_regs always
2906 * have a .reg_offset of 0.
2907 */
2908
2909 foreach_list_safe(node, &this->instructions) {
2910 fs_inst *inst = (fs_inst *)node;
2911
2912 if (inst->mlen != 0 && inst->dst.file == GRF) {
2913 insert_gen4_pre_send_dependency_workarounds(inst);
2914 insert_gen4_post_send_dependency_workarounds(inst);
2915 }
2916 }
2917 }
2918
2919 /**
2920 * Turns the generic expression-style uniform pull constant load instruction
2921 * into a hardware-specific series of instructions for loading a pull
2922 * constant.
2923 *
2924 * The expression style allows the CSE pass before this to optimize out
2925 * repeated loads from the same offset, and gives the pre-register-allocation
2926 * scheduling full flexibility, while the conversion to native instructions
2927 * allows the post-register-allocation scheduler the best information
2928  * gives the post-register-allocation scheduler the best information
2929 *
2930 * Note that execution masking for setting up pull constant loads is special:
2931 * the channels that need to be written are unrelated to the current execution
2932 * mask, since a later instruction will use one of the result channels as a
2933 * source operand for all 8 or 16 of its channels.
2934 */
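/* Rough shape of the gen7 lowering performed below (offsets illustrative):
 *
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD dst, <surface>, <byte offset imm>
 *
 * becomes
 *
 *    FS_OPCODE_SET_SIMD4X2_OFFSET payload, <dword offset>
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7 dst, <surface>, payload
 *
 * Pre-gen7 hardware keeps the original opcode and is simply assigned an MRF
 * (base_mrf 14, mlen 1) for its message payload.
 */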
2935 void
2936 fs_visitor::lower_uniform_pull_constant_loads()
2937 {
2938 foreach_list(node, &this->instructions) {
2939 fs_inst *inst = (fs_inst *)node;
2940
2941 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2942 continue;
2943
2944 if (brw->gen >= 7) {
2945 /* The offset arg before was a vec4-aligned byte offset. We need to
2946 * turn it into a dword offset.
2947 */
2948 fs_reg const_offset_reg = inst->src[1];
2949 assert(const_offset_reg.file == IMM &&
2950 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2951 const_offset_reg.imm.u /= 4;
2952 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2953
2954 /* This is actually going to be a MOV, but since only the first dword
2955 * is accessed, we have a special opcode to do just that one. Note
2956 * that this needs to be an operation that will be considered a def
2957 * by live variable analysis, or register allocation will explode.
2958 */
2959 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2960 payload, const_offset_reg);
2961 setup->force_writemask_all = true;
2962
2963 setup->ir = inst->ir;
2964 setup->annotation = inst->annotation;
2965 inst->insert_before(setup);
2966
2967 /* Similarly, this will only populate the first 4 channels of the
2968 * result register (since we only use smear values from 0-3), but we
2969 * don't tell the optimizer.
2970 */
2971 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2972 inst->src[1] = payload;
2973
2974 invalidate_live_intervals();
2975 } else {
2976 /* Before register allocation, we didn't tell the scheduler about the
2977 * MRF we use. We know it's safe to use this MRF because nothing
2978 * else does except for register spill/unspill, which generates and
2979 * uses its MRF within a single IR instruction.
2980 */
2981 inst->base_mrf = 14;
2982 inst->mlen = 1;
2983 }
2984 }
2985 }
2986
2987 void
2988 fs_visitor::dump_instructions()
2989 {
2990 calculate_register_pressure();
2991
2992 int ip = 0, max_pressure = 0;
2993 foreach_list(node, &this->instructions) {
2994 backend_instruction *inst = (backend_instruction *)node;
2995 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2996 printf("{%3d} %4d: ", regs_live_at_ip[ip], ip);
2997 dump_instruction(inst);
2998 ++ip;
2999 }
3000 printf("Maximum %3d registers live at once.\n", max_pressure);
3001 }
3002
3003 void
3004 fs_visitor::dump_instruction(backend_instruction *be_inst)
3005 {
3006 fs_inst *inst = (fs_inst *)be_inst;
3007
3008 if (inst->predicate) {
3009 printf("(%cf0.%d) ",
3010 inst->predicate_inverse ? '-' : '+',
3011 inst->flag_subreg);
3012 }
3013
3014 printf("%s", brw_instruction_name(inst->opcode));
3015 if (inst->saturate)
3016 printf(".sat");
3017 if (inst->conditional_mod) {
3018 printf("%s", conditional_modifier[inst->conditional_mod]);
3019 if (!inst->predicate &&
3020 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3021 inst->opcode != BRW_OPCODE_IF &&
3022 inst->opcode != BRW_OPCODE_WHILE))) {
3023 printf(".f0.%d", inst->flag_subreg);
3024 }
3025 }
3026 printf(" ");
3027
3028
3029 switch (inst->dst.file) {
3030 case GRF:
3031 printf("vgrf%d", inst->dst.reg);
3032 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
3033 inst->dst.subreg_offset)
3034 printf("+%d.%d", inst->dst.reg_offset, inst->dst.subreg_offset);
3035 break;
3036 case MRF:
3037 printf("m%d", inst->dst.reg);
3038 break;
3039 case BAD_FILE:
3040 printf("(null)");
3041 break;
3042 case UNIFORM:
3043 printf("***u%d***", inst->dst.reg);
3044 break;
3045 case HW_REG:
3046 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3047 switch (inst->dst.fixed_hw_reg.nr) {
3048 case BRW_ARF_NULL:
3049 printf("null");
3050 break;
3051 case BRW_ARF_ADDRESS:
3052 printf("a0.%d", inst->dst.fixed_hw_reg.subnr);
3053 break;
3054 case BRW_ARF_ACCUMULATOR:
3055 printf("acc%d", inst->dst.fixed_hw_reg.subnr);
3056 break;
3057 case BRW_ARF_FLAG:
3058 printf("f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3059 inst->dst.fixed_hw_reg.subnr);
3060 break;
3061 default:
3062 printf("arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3063 inst->dst.fixed_hw_reg.subnr);
3064 break;
3065 }
3066 } else {
3067 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
3068 }
3069 if (inst->dst.fixed_hw_reg.subnr)
3070 printf("+%d", inst->dst.fixed_hw_reg.subnr);
3071 break;
3072 default:
3073 printf("???");
3074 break;
3075 }
3076 printf(":%s, ", reg_encoding[inst->dst.type]);
3077
3078 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
3079 if (inst->src[i].negate)
3080 printf("-");
3081 if (inst->src[i].abs)
3082 printf("|");
3083 switch (inst->src[i].file) {
3084 case GRF:
3085 printf("vgrf%d", inst->src[i].reg);
3086 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3087 inst->src[i].subreg_offset)
3088 printf("+%d.%d", inst->src[i].reg_offset,
3089 inst->src[i].subreg_offset);
3090 break;
3091 case MRF:
3092 printf("***m%d***", inst->src[i].reg);
3093 break;
3094 case UNIFORM:
3095 printf("u%d", inst->src[i].reg);
3096 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3097 inst->src[i].subreg_offset)
3098 printf("+%d.%d", inst->src[i].reg_offset,
3099 inst->src[i].subreg_offset);
3100 break;
3101 case BAD_FILE:
3102 printf("(null)");
3103 break;
3104 case IMM:
3105 switch (inst->src[i].type) {
3106 case BRW_REGISTER_TYPE_F:
3107 printf("%ff", inst->src[i].imm.f);
3108 break;
3109 case BRW_REGISTER_TYPE_D:
3110 printf("%dd", inst->src[i].imm.i);
3111 break;
3112 case BRW_REGISTER_TYPE_UD:
3113 printf("%uu", inst->src[i].imm.u);
3114 break;
3115 default:
3116 printf("???");
3117 break;
3118 }
3119 break;
3120 case HW_REG:
3121 if (inst->src[i].fixed_hw_reg.negate)
3122 printf("-");
3123 if (inst->src[i].fixed_hw_reg.abs)
3124 printf("|");
3125 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3126 switch (inst->src[i].fixed_hw_reg.nr) {
3127 case BRW_ARF_NULL:
3128 printf("null");
3129 break;
3130 case BRW_ARF_ADDRESS:
3131 printf("a0.%d", inst->src[i].fixed_hw_reg.subnr);
3132 break;
3133 case BRW_ARF_ACCUMULATOR:
3134 printf("acc%d", inst->src[i].fixed_hw_reg.subnr);
3135 break;
3136 case BRW_ARF_FLAG:
3137 printf("f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3138 inst->src[i].fixed_hw_reg.subnr);
3139 break;
3140 default:
3141 printf("arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3142 inst->src[i].fixed_hw_reg.subnr);
3143 break;
3144 }
3145 } else {
3146 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3147 }
3148 if (inst->src[i].fixed_hw_reg.subnr)
3149 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
3150 if (inst->src[i].fixed_hw_reg.abs)
3151 printf("|");
3152 break;
3153 default:
3154 printf("???");
3155 break;
3156 }
3157 if (inst->src[i].abs)
3158 printf("|");
3159
3160 if (inst->src[i].file != IMM) {
3161 printf(":%s", brw_reg_type_letters(inst->src[i].type));
3162 }
3163
3164 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
3165 printf(", ");
3166 }
3167
3168 printf(" ");
3169
3170 if (inst->force_uncompressed)
3171 printf("1sthalf ");
3172
3173 if (inst->force_sechalf)
3174 printf("2ndhalf ");
3175
3176 printf("\n");
3177 }
3178
3179 /**
3180 * Possibly returns an instruction that set up @param reg.
3181 *
3182 * Sometimes we want to take the result of some expression/variable
3183 * dereference tree and rewrite the instruction generating the result
3184 * of the tree. When processing the tree, we know that the
3185 * instructions generated are all writing temporaries that are dead
3186 * outside of this tree. So, if we have some instructions that write
3187 * a temporary, we're free to point that temp write somewhere else.
3188 *
3189  * Note that this doesn't guarantee that the returned instruction wrote
3190  * only reg -- it might be the size=4 destination of a texture instruction.
3191 */
3192 fs_inst *
3193 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3194 fs_inst *end,
3195 fs_reg reg)
3196 {
3197 if (end == start ||
3198 end->is_partial_write() ||
3199 reg.reladdr ||
3200 !reg.equals(end->dst)) {
3201 return NULL;
3202 } else {
3203 return end;
3204 }
3205 }
3206
3207 void
3208 fs_visitor::setup_payload_gen6()
3209 {
3210 bool uses_depth =
3211 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3212 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3213
3214 assert(brw->gen >= 6);
3215
3216 /* R0-1: masks, pixel X/Y coordinates. */
3217 c->nr_payload_regs = 2;
3218    /* R2: only for 32-pixel dispatch. */
3219
3220 /* R3-26: barycentric interpolation coordinates. These appear in the
3221 * same order that they appear in the brw_wm_barycentric_interp_mode
3222 * enum. Each set of coordinates occupies 2 registers if dispatch width
3223 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3224 * appear if they were enabled using the "Barycentric Interpolation
3225 * Mode" bits in WM_STATE.
3226 */
3227 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3228 if (barycentric_interp_modes & (1 << i)) {
3229 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3230 c->nr_payload_regs += 2;
3231 if (dispatch_width == 16) {
3232 c->nr_payload_regs += 2;
3233 }
3234 }
3235 }
3236
3237 /* R27: interpolated depth if uses source depth */
3238 if (uses_depth) {
3239 c->source_depth_reg = c->nr_payload_regs;
3240 c->nr_payload_regs++;
3241 if (dispatch_width == 16) {
3242 /* R28: interpolated depth if not SIMD8. */
3243 c->nr_payload_regs++;
3244 }
3245 }
3246 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3247 if (uses_depth) {
3248 c->source_w_reg = c->nr_payload_regs;
3249 c->nr_payload_regs++;
3250 if (dispatch_width == 16) {
3251 /* R30: interpolated W if not SIMD8. */
3252 c->nr_payload_regs++;
3253 }
3254 }
3255
3256 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3257 /* R31: MSAA position offsets. */
3258 if (c->prog_data.uses_pos_offset) {
3259 c->sample_pos_reg = c->nr_payload_regs;
3260 c->nr_payload_regs++;
3261 }
3262
3263 /* R32: MSAA input coverage mask */
3264 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3265 assert(brw->gen >= 7);
3266 c->sample_mask_reg = c->nr_payload_regs;
3267 c->nr_payload_regs++;
3268 if (dispatch_width == 16) {
3269 /* R33: input coverage mask if not SIMD8. */
3270 c->nr_payload_regs++;
3271 }
3272 }
3273
3274 /* R34-: bary for 32-pixel. */
3275 /* R58-59: interp W for 32-pixel. */
3276
3277 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3278 c->source_depth_to_render_target = true;
3279 }
3280 }
3281
3282 void
3283 fs_visitor::assign_binding_table_offsets()
3284 {
3285 uint32_t next_binding_table_offset = 0;
3286
3287 /* If there are no color regions, we still perform an FB write to a null
3288 * renderbuffer, which we place at surface index 0.
3289 */
3290 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3291 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3292
3293 assign_common_binding_table_offsets(next_binding_table_offset);
3294 }
3295
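/**
 * Computes, for each instruction position (ip), how many GRF registers are
 * live, storing the result in regs_live_at_ip.  Used for the per-instruction
 * pressure annotations printed by dump_instructions().
 */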
3296 void
3297 fs_visitor::calculate_register_pressure()
3298 {
3299 calculate_live_intervals();
3300
3301 int num_instructions = 0;
3302 foreach_list(node, &this->instructions) {
3303 ++num_instructions;
3304 }
3305
3306 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3307
3308 for (int reg = 0; reg < virtual_grf_count; reg++) {
3309 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3310 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3311 }
3312 }
3313
3314 bool
3315 fs_visitor::run()
3316 {
3317 sanity_param_count = fp->Base.Parameters->NumParameters;
3318 uint32_t orig_nr_params = stage_prog_data->nr_params;
3319 bool allocated_without_spills;
3320
3321 assign_binding_table_offsets();
3322
3323 if (brw->gen >= 6)
3324 setup_payload_gen6();
3325 else
3326 setup_payload_gen4();
3327
3328 if (0) {
3329 emit_dummy_fs();
3330 } else {
3331 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3332 emit_shader_time_begin();
3333
3334 calculate_urb_setup();
3335 if (fp->Base.InputsRead > 0) {
3336 if (brw->gen < 6)
3337 emit_interpolation_setup_gen4();
3338 else
3339 emit_interpolation_setup_gen6();
3340 }
3341
3342 /* We handle discards by keeping track of the still-live pixels in f0.1.
3343 * Initialize it with the dispatched pixels.
3344 */
3345 if (fp->UsesKill || c->key.alpha_test_func) {
3346 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3347 discard_init->flag_subreg = 1;
3348 }
3349
3350 /* Generate FS IR for main(). (the visitor only descends into
3351 * functions called "main").
3352 */
3353 if (shader) {
3354 foreach_list(node, &*shader->base.ir) {
3355 ir_instruction *ir = (ir_instruction *)node;
3356 base_ir = ir;
3357 this->result = reg_undef;
3358 ir->accept(this);
3359 }
3360 } else {
3361 emit_fragment_program_code();
3362 }
3363 base_ir = NULL;
3364 if (failed)
3365 return false;
3366
3367 emit(FS_OPCODE_PLACEHOLDER_HALT);
3368
3369 if (c->key.alpha_test_func)
3370 emit_alpha_test();
3371
3372 emit_fb_writes();
3373
3374 split_virtual_grfs();
3375
3376 move_uniform_array_access_to_pull_constants();
3377 remove_dead_constants();
3378 setup_pull_constants();
3379
3380 bool progress;
3381 do {
3382 progress = false;
3383
3384 compact_virtual_grfs();
3385
3386 progress = remove_duplicate_mrf_writes() || progress;
3387
3388 progress = opt_algebraic() || progress;
3389 progress = opt_cse() || progress;
3390 progress = opt_copy_propagate() || progress;
3391 progress = opt_peephole_predicated_break() || progress;
3392 progress = dead_code_eliminate() || progress;
3393 progress = dead_code_eliminate_local() || progress;
3394 progress = opt_peephole_sel() || progress;
3395 progress = dead_control_flow_eliminate(this) || progress;
3396 progress = opt_saturate_propagation() || progress;
3397 progress = register_coalesce() || progress;
3398 progress = compute_to_mrf() || progress;
3399 } while (progress);
3400
3401 lower_uniform_pull_constant_loads();
3402
3403 assign_curb_setup();
3404 assign_urb_setup();
3405
3406 static enum instruction_scheduler_mode pre_modes[] = {
3407 SCHEDULE_PRE,
3408 SCHEDULE_PRE_NON_LIFO,
3409 SCHEDULE_PRE_LIFO,
3410 };
3411
3412 /* Try each scheduling heuristic to see if it can successfully register
3413 * allocate without spilling. They should be ordered by decreasing
3414 * performance but increasing likelihood of allocating.
3415 */
3416 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3417 schedule_instructions(pre_modes[i]);
3418
3419 if (0) {
3420 assign_regs_trivial();
3421 allocated_without_spills = true;
3422 } else {
3423 allocated_without_spills = assign_regs(false);
3424 }
3425 if (allocated_without_spills)
3426 break;
3427 }
3428
3429 if (!allocated_without_spills) {
3430 /* We assume that any spilling is worse than just dropping back to
3431 * SIMD8. There's probably actually some intermediate point where
3432 * SIMD16 with a couple of spills is still better.
3433 */
3434 if (dispatch_width == 16) {
3435 fail("Failure to register allocate. Reduce number of "
3436 "live scalar values to avoid this.");
3437 }
3438
3439 /* Since we're out of heuristics, just go spill registers until we
3440 * get an allocation.
3441 */
3442 while (!assign_regs(true)) {
3443 if (failed)
3444 break;
3445 }
3446 }
3447 }
3448 assert(force_uncompressed_stack == 0);
3449
3450 /* This must come after all optimization and register allocation, since
3451 * it inserts dead code that happens to have side effects, and it does
3452 * so based on the actual physical registers in use.
3453 */
3454 insert_gen4_send_dependency_workarounds();
3455
3456 if (failed)
3457 return false;
3458
3459 if (!allocated_without_spills)
3460 schedule_instructions(SCHEDULE_POST);
3461
3462 if (dispatch_width == 8) {
3463 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3464 } else {
3465 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3466
3467 /* Make sure we didn't try to sneak in an extra uniform */
3468 assert(orig_nr_params == stage_prog_data->nr_params);
3469 (void) orig_nr_params;
3470 }
3471
3472 /* If any state parameters were appended, then ParameterValues could have
3473 * been realloced, in which case the driver uniform storage set up by
3474 * _mesa_associate_uniform_storage() would point to freed memory. Make
3475 * sure that didn't happen.
3476 */
3477 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3478
3479 return !failed;
3480 }
3481
3482 const unsigned *
3483 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3484 struct gl_fragment_program *fp,
3485 struct gl_shader_program *prog,
3486 unsigned *final_assembly_size)
3487 {
3488 bool start_busy = false;
3489 float start_time = 0;
3490
3491 if (unlikely(brw->perf_debug)) {
3492 start_busy = (brw->batch.last_bo &&
3493 drm_intel_bo_busy(brw->batch.last_bo));
3494 start_time = get_time();
3495 }
3496
3497 struct brw_shader *shader = NULL;
3498 if (prog)
3499 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3500
3501 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3502 if (prog) {
3503 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3504 _mesa_print_ir(shader->base.ir, NULL);
3505 printf("\n\n");
3506 } else {
3507 printf("ARB_fragment_program %d ir for native fragment shader\n",
3508 fp->Base.Id);
3509 _mesa_print_program(&fp->Base);
3510 }
3511 }
3512
3513 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3514 */
3515 fs_visitor v(brw, c, prog, fp, 8);
3516 if (!v.run()) {
3517 if (prog) {
3518 prog->LinkStatus = false;
3519 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3520 }
3521
3522 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3523 v.fail_msg);
3524
3525 return NULL;
3526 }
3527
3528 exec_list *simd16_instructions = NULL;
3529 fs_visitor v2(brw, c, prog, fp, 16);
3530 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3531 if (c->prog_data.base.nr_pull_params == 0) {
3532 /* Try a SIMD16 compile */
3533 v2.import_uniforms(&v);
3534 if (!v2.run()) {
3535 perf_debug("SIMD16 shader failed to compile, falling back to "
3536 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3537 } else {
3538 simd16_instructions = &v2.instructions;
3539 }
3540 } else {
3541 perf_debug("Skipping SIMD16 due to pull parameters.\n");
3542 }
3543 }
3544
3545 const unsigned *assembly = NULL;
3546 if (brw->gen >= 8) {
3547 gen8_fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3548 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3549 final_assembly_size);
3550 } else {
3551 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3552 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3553 final_assembly_size);
3554 }
3555
3556 if (unlikely(brw->perf_debug) && shader) {
3557 if (shader->compiled_once)
3558 brw_wm_debug_recompile(brw, prog, &c->key);
3559 shader->compiled_once = true;
3560
3561 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3562 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3563 (get_time() - start_time) * 1000);
3564 }
3565 }
3566
3567 return assembly;
3568 }
3569
3570 bool
3571 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3572 {
3573 struct brw_context *brw = brw_context(ctx);
3574 struct brw_wm_prog_key key;
3575
3576 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3577 return true;
3578
3579 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3580 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3581 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3582 bool program_uses_dfdy = fp->UsesDFdy;
3583
3584 memset(&key, 0, sizeof(key));
3585
3586 if (brw->gen < 6) {
3587 if (fp->UsesKill)
3588 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3589
3590 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3591 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3592
3593 /* Just assume depth testing. */
3594 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3595 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3596 }
3597
3598 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3599 BRW_FS_VARYING_INPUT_MASK) > 16)
3600 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3601
3602 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3603
3604 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3605 for (unsigned i = 0; i < sampler_count; i++) {
3606 if (fp->Base.ShadowSamplers & (1 << i)) {
3607 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3608 key.tex.swizzles[i] =
3609 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3610 } else {
3611 /* Color sampler: assume no swizzling. */
3612 key.tex.swizzles[i] = SWIZZLE_XYZW;
3613 }
3614 }
3615
3616 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3617 key.drawable_height = ctx->DrawBuffer->Height;
3618 }
3619
3620 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3621 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3622 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3623
3624 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3625 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3626 key.nr_color_regions > 1;
3627 }
3628
3629 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3630 * quality of the derivatives is likely to be determined by the driconf
3631 * option.
3632 */
3633 key.high_quality_derivatives = brw->disable_derivative_optimization;
3634
3635 key.program_string_id = bfp->id;
3636
3637 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3638 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3639
3640 bool success = do_wm_prog(brw, prog, bfp, &key);
3641
3642 brw->wm.base.prog_offset = old_prog_offset;
3643 brw->wm.prog_data = old_prog_data;
3644
3645 return success;
3646 }