i965/fs: New peephole optimization to flatten IF/BREAK/ENDIF.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->opcode = BRW_OPCODE_NOP;
59 this->conditional_mod = BRW_CONDITIONAL_NONE;
60
61 this->dst = reg_undef;
62 this->src[0] = reg_undef;
63 this->src[1] = reg_undef;
64 this->src[2] = reg_undef;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68 }
69
70 fs_inst::fs_inst()
71 {
72 init();
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187 ALU2(SEL)
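/* For reference, a use such as ALU2(ADD) above expands to:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */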
188
189 /** Gen4 predicated IF. */
190 fs_inst *
191 fs_visitor::IF(uint32_t predicate)
192 {
193 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195 return inst;
196 }
197
198 /** Gen6 IF with embedded comparison. */
199 fs_inst *
200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
201 {
202 assert(brw->gen == 6);
203 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
204 reg_null_d, src0, src1);
205 inst->conditional_mod = condition;
206 return inst;
207 }
208
209 /**
210 * CMP: Sets the low bit of the destination channels with the result
211 * of the comparison, while the upper bits are undefined, and updates
212 * the flag register with the packed 16 bits of the result.
213 */
214 fs_inst *
215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
216 {
217 fs_inst *inst;
218
219 /* Take the instruction:
220 *
221 * CMP null<d> src0<f> src1<f>
222 *
223 * Original gen4 does type conversion to the destination type before
224 * comparison, producing garbage results for floating point comparisons.
225 * gen5 does the comparison on the execution type (resolved source types),
226 * so dst type doesn't matter. gen6 does comparison and then uses the
227 * result as if it was the dst type with no conversion, which happens to
228 * mostly work out for float-interpreted-as-int since our comparisons are
229 * for >0, =0, <0.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 exec_list
247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
248 fs_reg varying_offset,
249 uint32_t const_offset)
250 {
251 exec_list instructions;
252 fs_inst *inst;
253
254 /* We have our constant surface use a pitch of 4 bytes, so our index can
255 * be any component of a vector, and then we load 4 contiguous
256 * components starting from that.
257 *
258 * We break down the const_offset to a portion added to the variable
259 * offset and a portion done using reg_offset, which means that if you
260 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
261 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
262 * CSE can later notice that those loads are all the same and eliminate
263 * the redundant ones.
264 */
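   /* For example, with const_offset == 6 the load below is issued at
    * varying_offset + 4, and the final MOV picks component 2 (times the
    * message scale) out of the returned vec4.
    */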
265 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
266 instructions.push_tail(ADD(vec4_offset,
267 varying_offset, const_offset & ~3));
268
269 int scale = 1;
270 if (brw->gen == 4 && dispatch_width == 8) {
271 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
272 * u, v, r) as parameters, or we can just use the SIMD16 message
273 * consisting of (header, u). We choose the second, at the cost of a
274 * longer return length.
275 */
276 scale = 2;
277 }
278
279 enum opcode op;
280 if (brw->gen >= 7)
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
282 else
283 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
284 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
285 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
286 inst->regs_written = 4 * scale;
287 instructions.push_tail(inst);
288
289 if (brw->gen < 7) {
290 inst->base_mrf = 13;
291 inst->header_present = true;
292 if (brw->gen == 4)
293 inst->mlen = 3;
294 else
295 inst->mlen = 1 + dispatch_width / 8;
296 }
297
298 vec4_result.reg_offset += (const_offset & 3) * scale;
299 instructions.push_tail(MOV(dst, vec4_result));
300
301 return instructions;
302 }
303
304 /**
305 * A helper for MOV generation for fixing up broken hardware SEND dependency
306 * handling.
307 */
308 fs_inst *
309 fs_visitor::DEP_RESOLVE_MOV(int grf)
310 {
311 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
312
313 inst->ir = NULL;
314 inst->annotation = "send dependency resolve";
315
316 /* The caller always wants uncompressed to emit the minimal extra
317 * dependencies, and to avoid having to deal with aligning its regs to 2.
318 */
319 inst->force_uncompressed = true;
320
321 return inst;
322 }
323
324 bool
325 fs_inst::equals(fs_inst *inst)
326 {
327 return (opcode == inst->opcode &&
328 dst.equals(inst->dst) &&
329 src[0].equals(inst->src[0]) &&
330 src[1].equals(inst->src[1]) &&
331 src[2].equals(inst->src[2]) &&
332 saturate == inst->saturate &&
333 predicate == inst->predicate &&
334 conditional_mod == inst->conditional_mod &&
335 mlen == inst->mlen &&
336 base_mrf == inst->base_mrf &&
337 sampler == inst->sampler &&
338 target == inst->target &&
339 eot == inst->eot &&
340 header_present == inst->header_present &&
341 shadow_compare == inst->shadow_compare &&
342 offset == inst->offset);
343 }
344
345 bool
346 fs_inst::overwrites_reg(const fs_reg &reg)
347 {
348 return (reg.file == dst.file &&
349 reg.reg == dst.reg &&
350 reg.reg_offset >= dst.reg_offset &&
351 reg.reg_offset < dst.reg_offset + regs_written);
352 }
353
354 bool
355 fs_inst::is_send_from_grf()
356 {
357 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
358 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
359 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
360 src[1].file == GRF) ||
361 (is_tex() && src[0].file == GRF));
362 }
363
364 bool
365 fs_visitor::can_do_source_mods(fs_inst *inst)
366 {
367 if (brw->gen == 6 && inst->is_math())
368 return false;
369
370 if (inst->is_send_from_grf())
371 return false;
372
373 if (!inst->can_do_source_mods())
374 return false;
375
376 return true;
377 }
378
379 void
380 fs_reg::init()
381 {
382 memset(this, 0, sizeof(*this));
383 this->smear = -1;
384 }
385
386 /** Generic unset register constructor. */
387 fs_reg::fs_reg()
388 {
389 init();
390 this->file = BAD_FILE;
391 }
392
393 /** Immediate value constructor. */
394 fs_reg::fs_reg(float f)
395 {
396 init();
397 this->file = IMM;
398 this->type = BRW_REGISTER_TYPE_F;
399 this->imm.f = f;
400 }
401
402 /** Immediate value constructor. */
403 fs_reg::fs_reg(int32_t i)
404 {
405 init();
406 this->file = IMM;
407 this->type = BRW_REGISTER_TYPE_D;
408 this->imm.i = i;
409 }
410
411 /** Immediate value constructor. */
412 fs_reg::fs_reg(uint32_t u)
413 {
414 init();
415 this->file = IMM;
416 this->type = BRW_REGISTER_TYPE_UD;
417 this->imm.u = u;
418 }
419
420 /** Fixed brw_reg. */
421 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
422 {
423 init();
424 this->file = HW_REG;
425 this->fixed_hw_reg = fixed_hw_reg;
426 this->type = fixed_hw_reg.type;
427 }
428
429 bool
430 fs_reg::equals(const fs_reg &r) const
431 {
432 return (file == r.file &&
433 reg == r.reg &&
434 reg_offset == r.reg_offset &&
435 type == r.type &&
436 negate == r.negate &&
437 abs == r.abs &&
438 !reladdr && !r.reladdr &&
439 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
440 sizeof(fixed_hw_reg)) == 0 &&
441 smear == r.smear &&
442 imm.u == r.imm.u);
443 }
444
445 fs_reg
446 fs_reg::retype(uint32_t type)
447 {
448 fs_reg result = *this;
449 result.type = type;
450 return result;
451 }
452
453 bool
454 fs_reg::is_zero() const
455 {
456 if (file != IMM)
457 return false;
458
459 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
460 }
461
462 bool
463 fs_reg::is_one() const
464 {
465 if (file != IMM)
466 return false;
467
468 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
469 }
470
471 bool
472 fs_reg::is_null() const
473 {
474 return file == HW_REG &&
475 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
476 fixed_hw_reg.nr == BRW_ARF_NULL;
477 }
478
479 bool
480 fs_reg::is_valid_3src() const
481 {
482 return file == GRF || file == UNIFORM;
483 }
484
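/* For example: a float or int occupies 1 slot, a vec4 occupies 4, a mat4
 * occupies 16, and "float a[8]" occupies 8.  Samplers and atomic counters
 * occupy 0.
 */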
485 int
486 fs_visitor::type_size(const struct glsl_type *type)
487 {
488 unsigned int size, i;
489
490 switch (type->base_type) {
491 case GLSL_TYPE_UINT:
492 case GLSL_TYPE_INT:
493 case GLSL_TYPE_FLOAT:
494 case GLSL_TYPE_BOOL:
495 return type->components();
496 case GLSL_TYPE_ARRAY:
497 return type_size(type->fields.array) * type->length;
498 case GLSL_TYPE_STRUCT:
499 size = 0;
500 for (i = 0; i < type->length; i++) {
501 size += type_size(type->fields.structure[i].type);
502 }
503 return size;
504 case GLSL_TYPE_SAMPLER:
505 /* Samplers take up no register space, since they're baked in at
506 * link time.
507 */
508 return 0;
509 case GLSL_TYPE_ATOMIC_UINT:
510 return 0;
511 case GLSL_TYPE_VOID:
512 case GLSL_TYPE_ERROR:
513 case GLSL_TYPE_INTERFACE:
514 assert(!"not reached");
515 break;
516 }
517
518 return 0;
519 }
520
521 fs_reg
522 fs_visitor::get_timestamp()
523 {
524 assert(brw->gen >= 7);
525
526 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
527 BRW_ARF_TIMESTAMP,
528 0),
529 BRW_REGISTER_TYPE_UD));
530
531 fs_reg dst = fs_reg(this, glsl_type::uint_type);
532
533 fs_inst *mov = emit(MOV(dst, ts));
534 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
535 * even if it's not enabled in the dispatch.
536 */
537 mov->force_writemask_all = true;
538 mov->force_uncompressed = true;
539
540 /* The caller wants the low 32 bits of the timestamp. Since it's running
541 * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
542 * which is plenty of time for our purposes. It is identical across the
543 * EUs, but since it's tracking GPU core speed it will increment at a
544 * varying rate as render P-states change.
545 *
546 * The caller could also check if render P-states have changed (or anything
547 * else that might disrupt timing) by setting smear to 2 and checking if
548 * that field is != 0.
549 */
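   /* (A 32-bit counter at ~1.2e9 cycles/second wraps after roughly
    * 2^32 / 1.2e9 ≈ 3.6 seconds, which is where the ~3 second estimate
    * above comes from.)
    */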
550 dst.smear = 0;
551
552 return dst;
553 }
554
555 void
556 fs_visitor::emit_shader_time_begin()
557 {
558 current_annotation = "shader time start";
559 shader_start_time = get_timestamp();
560 }
561
562 void
563 fs_visitor::emit_shader_time_end()
564 {
565 current_annotation = "shader time end";
566
567 enum shader_time_shader_type type, written_type, reset_type;
568 if (dispatch_width == 8) {
569 type = ST_FS8;
570 written_type = ST_FS8_WRITTEN;
571 reset_type = ST_FS8_RESET;
572 } else {
573 assert(dispatch_width == 16);
574 type = ST_FS16;
575 written_type = ST_FS16_WRITTEN;
576 reset_type = ST_FS16_RESET;
577 }
578
579 fs_reg shader_end_time = get_timestamp();
580
581 /* Check that there weren't any timestamp reset events (assuming these
582 * were the only two timestamp reads that happened).
583 */
584 fs_reg reset = shader_end_time;
585 reset.smear = 2;
586 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
587 test->conditional_mod = BRW_CONDITIONAL_Z;
588 emit(IF(BRW_PREDICATE_NORMAL));
589
590 push_force_uncompressed();
591 fs_reg start = shader_start_time;
592 start.negate = true;
593 fs_reg diff = fs_reg(this, glsl_type::uint_type);
594 emit(ADD(diff, start, shader_end_time));
595
596 /* If there were no instructions between the two timestamp gets, the diff
597 * is 2 cycles. Remove that overhead, so I can forget about that when
598 * trying to determine the time taken for single instructions.
599 */
600 emit(ADD(diff, diff, fs_reg(-2u)));
601
602 emit_shader_time_write(type, diff);
603 emit_shader_time_write(written_type, fs_reg(1u));
604 emit(BRW_OPCODE_ELSE);
605 emit_shader_time_write(reset_type, fs_reg(1u));
606 emit(BRW_OPCODE_ENDIF);
607
608 pop_force_uncompressed();
609 }
610
611 void
612 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
613 fs_reg value)
614 {
615 int shader_time_index =
616 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
617 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
618
619 fs_reg payload;
620 if (dispatch_width == 8)
621 payload = fs_reg(this, glsl_type::uvec2_type);
622 else
623 payload = fs_reg(this, glsl_type::uint_type);
624
625 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
626 fs_reg(), payload, offset, value));
627 }
628
629 void
630 fs_visitor::fail(const char *format, ...)
631 {
632 va_list va;
633 char *msg;
634
635 if (failed)
636 return;
637
638 failed = true;
639
640 va_start(va, format);
641 msg = ralloc_vasprintf(mem_ctx, format, va);
642 va_end(va);
643 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
644
645 this->fail_msg = msg;
646
647 if (INTEL_DEBUG & DEBUG_WM) {
648 fprintf(stderr, "%s", msg);
649 }
650 }
651
652 fs_inst *
653 fs_visitor::emit(enum opcode opcode)
654 {
655 return emit(fs_inst(opcode));
656 }
657
658 fs_inst *
659 fs_visitor::emit(enum opcode opcode, fs_reg dst)
660 {
661 return emit(fs_inst(opcode, dst));
662 }
663
664 fs_inst *
665 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
666 {
667 return emit(fs_inst(opcode, dst, src0));
668 }
669
670 fs_inst *
671 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
672 {
673 return emit(fs_inst(opcode, dst, src0, src1));
674 }
675
676 fs_inst *
677 fs_visitor::emit(enum opcode opcode, fs_reg dst,
678 fs_reg src0, fs_reg src1, fs_reg src2)
679 {
680 return emit(fs_inst(opcode, dst, src0, src1, src2));
681 }
682
683 void
684 fs_visitor::push_force_uncompressed()
685 {
686 force_uncompressed_stack++;
687 }
688
689 void
690 fs_visitor::pop_force_uncompressed()
691 {
692 force_uncompressed_stack--;
693 assert(force_uncompressed_stack >= 0);
694 }
695
696 /**
697 * Returns true if the instruction has a flag that means it won't
698 * update an entire destination register.
699 *
700 * For example, dead code elimination and live variable analysis want to know
701 * when a write to a variable screens off any preceding values that were in
702 * it.
703 */
704 bool
705 fs_inst::is_partial_write()
706 {
707 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
708 this->force_uncompressed ||
709 this->force_sechalf);
710 }
711
712 int
713 fs_inst::regs_read(fs_visitor *v, int arg)
714 {
715 if (is_tex() && arg == 0 && src[0].file == GRF) {
716 if (v->dispatch_width == 16)
717 return (mlen + 1) / 2;
718 else
719 return mlen;
720 }
721 return 1;
722 }
723
724 bool
725 fs_inst::reads_flag()
726 {
727 return predicate;
728 }
729
730 bool
731 fs_inst::writes_flag()
732 {
733 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
734 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
735 }
736
737 /**
738 * Returns how many MRFs an FS opcode will write over.
739 *
740 * Note that this is not the 0 or 1 implied writes in an actual gen
741 * instruction -- the FS opcodes often generate MOVs in addition.
742 */
743 int
744 fs_visitor::implied_mrf_writes(fs_inst *inst)
745 {
746 if (inst->mlen == 0)
747 return 0;
748
749 if (inst->base_mrf == -1)
750 return 0;
751
752 switch (inst->opcode) {
753 case SHADER_OPCODE_RCP:
754 case SHADER_OPCODE_RSQ:
755 case SHADER_OPCODE_SQRT:
756 case SHADER_OPCODE_EXP2:
757 case SHADER_OPCODE_LOG2:
758 case SHADER_OPCODE_SIN:
759 case SHADER_OPCODE_COS:
760 return 1 * dispatch_width / 8;
761 case SHADER_OPCODE_POW:
762 case SHADER_OPCODE_INT_QUOTIENT:
763 case SHADER_OPCODE_INT_REMAINDER:
764 return 2 * dispatch_width / 8;
765 case SHADER_OPCODE_TEX:
766 case FS_OPCODE_TXB:
767 case SHADER_OPCODE_TXD:
768 case SHADER_OPCODE_TXF:
769 case SHADER_OPCODE_TXF_MS:
770 case SHADER_OPCODE_TG4:
771 case SHADER_OPCODE_TG4_OFFSET:
772 case SHADER_OPCODE_TXL:
773 case SHADER_OPCODE_TXS:
774 case SHADER_OPCODE_LOD:
775 return 1;
776 case FS_OPCODE_FB_WRITE:
777 return 2;
778 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
779 case SHADER_OPCODE_GEN4_SCRATCH_READ:
780 return 1;
781 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
782 return inst->mlen;
783 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
784 return 2;
785 case SHADER_OPCODE_UNTYPED_ATOMIC:
786 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
787 return 0;
788 default:
789 assert(!"not reached");
790 return inst->mlen;
791 }
792 }
793
794 int
795 fs_visitor::virtual_grf_alloc(int size)
796 {
797 if (virtual_grf_array_size <= virtual_grf_count) {
798 if (virtual_grf_array_size == 0)
799 virtual_grf_array_size = 16;
800 else
801 virtual_grf_array_size *= 2;
802 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
803 virtual_grf_array_size);
804 }
805 virtual_grf_sizes[virtual_grf_count] = size;
806 return virtual_grf_count++;
807 }
808
809 /** Fixed HW reg constructor. */
810 fs_reg::fs_reg(enum register_file file, int reg)
811 {
812 init();
813 this->file = file;
814 this->reg = reg;
815 this->type = BRW_REGISTER_TYPE_F;
816 }
817
818 /** Fixed HW reg constructor. */
819 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
820 {
821 init();
822 this->file = file;
823 this->reg = reg;
824 this->type = type;
825 }
826
827 /** Automatic reg constructor. */
828 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
829 {
830 init();
831
832 this->file = GRF;
833 this->reg = v->virtual_grf_alloc(v->type_size(type));
834 this->reg_offset = 0;
835 this->type = brw_type_for_base_type(type);
836 }
837
838 fs_reg *
839 fs_visitor::variable_storage(ir_variable *var)
840 {
841 return (fs_reg *)hash_table_find(this->variable_ht, var);
842 }
843
844 void
845 import_uniforms_callback(const void *key,
846 void *data,
847 void *closure)
848 {
849 struct hash_table *dst_ht = (struct hash_table *)closure;
850 const fs_reg *reg = (const fs_reg *)data;
851
852 if (reg->file != UNIFORM)
853 return;
854
855 hash_table_insert(dst_ht, data, key);
856 }
857
858 /* For 16-wide, we need to follow the uniform setup of the 8-wide dispatch.
859  * This brings in those uniform definitions.
860 */
861 void
862 fs_visitor::import_uniforms(fs_visitor *v)
863 {
864 hash_table_call_foreach(v->variable_ht,
865 import_uniforms_callback,
866 variable_ht);
867 this->params_remap = v->params_remap;
868 this->nr_params_remap = v->nr_params_remap;
869 }
870
871 /* Our support for uniforms is piggy-backed on the struct
872 * gl_fragment_program, because that's where the values actually
873 * get stored, rather than in some global gl_shader_program uniform
874 * store.
875 */
876 void
877 fs_visitor::setup_uniform_values(ir_variable *ir)
878 {
879 int namelen = strlen(ir->name);
880
881 /* The data for our (non-builtin) uniforms is stored in a series of
882 * gl_uniform_driver_storage structs for each subcomponent that
883 * glGetUniformLocation() could name. We know it's been set up in the same
884 * order we'd walk the type, so walk the list of storage and find anything
885 * with our name, or the prefix of a component that starts with our name.
886 */
887 unsigned params_before = c->prog_data.nr_params;
888 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
889 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
890
891 if (strncmp(ir->name, storage->name, namelen) != 0 ||
892 (storage->name[namelen] != 0 &&
893 storage->name[namelen] != '.' &&
894 storage->name[namelen] != '[')) {
895 continue;
896 }
897
898 unsigned slots = storage->type->component_slots();
899 if (storage->array_elements)
900 slots *= storage->array_elements;
901
902 for (unsigned i = 0; i < slots; i++) {
903 c->prog_data.param[c->prog_data.nr_params++] =
904 &storage->storage[i].f;
905 }
906 }
907
908 /* Make sure we actually initialized the right amount of stuff here. */
909 assert(params_before + ir->type->component_slots() ==
910 c->prog_data.nr_params);
911 (void)params_before;
912 }
913
914
915 /* Our support for builtin uniforms is even scarier than non-builtin.
916 * It sits on top of the PROG_STATE_VAR parameters that are
917 * automatically updated from GL context state.
918 */
919 void
920 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
921 {
922 const ir_state_slot *const slots = ir->state_slots;
923 assert(ir->state_slots != NULL);
924
925 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
926 /* This state reference has already been setup by ir_to_mesa, but we'll
927 * get the same index back here.
928 */
929 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
930 (gl_state_index *)slots[i].tokens);
931
932 /* Add each of the unique swizzles of the element as a parameter.
933 * This'll end up matching the expected layout of the
934 * array/matrix/structure we're trying to fill in.
935 */
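      /* For example, a state value swizzled .xxxx contributes a single
       * param here, while .xyzw contributes four.
       */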
936 int last_swiz = -1;
937 for (unsigned int j = 0; j < 4; j++) {
938 int swiz = GET_SWZ(slots[i].swizzle, j);
939 if (swiz == last_swiz)
940 break;
941 last_swiz = swiz;
942
943 c->prog_data.param[c->prog_data.nr_params++] =
944 &fp->Base.Parameters->ParameterValues[index][swiz].f;
945 }
946 }
947 }
948
949 fs_reg *
950 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
951 {
952 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
953 fs_reg wpos = *reg;
954 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
955
956 /* gl_FragCoord.x */
957 if (ir->pixel_center_integer) {
958 emit(MOV(wpos, this->pixel_x));
959 } else {
960 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
961 }
962 wpos.reg_offset++;
963
964 /* gl_FragCoord.y */
965 if (!flip && ir->pixel_center_integer) {
966 emit(MOV(wpos, this->pixel_y));
967 } else {
968 fs_reg pixel_y = this->pixel_y;
969 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
970
971 if (flip) {
972 pixel_y.negate = true;
973 offset += c->key.drawable_height - 1.0;
974 }
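      /* With pixel_y negated, the ADD below computes
       * (drawable_height - 1 + center offset) - pixel_y, i.e. the
       * window-system-flipped Y coordinate.
       */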
975
976 emit(ADD(wpos, pixel_y, fs_reg(offset)));
977 }
978 wpos.reg_offset++;
979
980 /* gl_FragCoord.z */
981 if (brw->gen >= 6) {
982 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
983 } else {
984 emit(FS_OPCODE_LINTERP, wpos,
985 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
986 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
987 interp_reg(VARYING_SLOT_POS, 2));
988 }
989 wpos.reg_offset++;
990
991 /* gl_FragCoord.w: Already set up in emit_interpolation */
992 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
993
994 return reg;
995 }
996
997 fs_inst *
998 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
999 glsl_interp_qualifier interpolation_mode,
1000 bool is_centroid)
1001 {
1002 brw_wm_barycentric_interp_mode barycoord_mode;
1003 if (brw->gen >= 6) {
1004 if (is_centroid) {
1005 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1006 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1007 else
1008 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1009 } else {
1010 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1011 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1012 else
1013 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1014 }
1015 } else {
1016 /* On Ironlake and below, there is only one interpolation mode.
1017 * Centroid interpolation doesn't mean anything on this hardware --
1018 * there is no multisampling.
1019 */
1020 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1021 }
1022 return emit(FS_OPCODE_LINTERP, attr,
1023 this->delta_x[barycoord_mode],
1024 this->delta_y[barycoord_mode], interp);
1025 }
1026
1027 fs_reg *
1028 fs_visitor::emit_general_interpolation(ir_variable *ir)
1029 {
1030 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1031 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1032 fs_reg attr = *reg;
1033
1034 unsigned int array_elements;
1035 const glsl_type *type;
1036
1037 if (ir->type->is_array()) {
1038 array_elements = ir->type->length;
1039 if (array_elements == 0) {
1040 fail("dereferenced array '%s' has length 0\n", ir->name);
1041 }
1042 type = ir->type->fields.array;
1043 } else {
1044 array_elements = 1;
1045 type = ir->type;
1046 }
1047
1048 glsl_interp_qualifier interpolation_mode =
1049 ir->determine_interpolation_mode(c->key.flat_shade);
1050
1051 int location = ir->location;
1052 for (unsigned int i = 0; i < array_elements; i++) {
1053 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1054 if (c->prog_data.urb_setup[location] == -1) {
1055 /* If there's no incoming setup data for this slot, don't
1056 * emit interpolation for it.
1057 */
1058 attr.reg_offset += type->vector_elements;
1059 location++;
1060 continue;
1061 }
1062
1063 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1064 /* Constant interpolation (flat shading) case. The SF has
1065 * handed us defined values in only the constant offset
1066 * field of the setup reg.
1067 */
1068 for (unsigned int k = 0; k < type->vector_elements; k++) {
1069 struct brw_reg interp = interp_reg(location, k);
1070 interp = suboffset(interp, 3);
1071 interp.type = reg->type;
1072 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1073 attr.reg_offset++;
1074 }
1075 } else {
1076 /* Smooth/noperspective interpolation case. */
1077 for (unsigned int k = 0; k < type->vector_elements; k++) {
1078 /* FINISHME: At some point we probably want to push
1079 * this farther by giving similar treatment to the
1080 * other potentially constant components of the
1081 * attribute, as well as making brw_vs_constval.c
1082 * handle varyings other than gl_TexCoord.
1083 */
1084 struct brw_reg interp = interp_reg(location, k);
1085 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1086 ir->centroid);
1087 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1088 /* Get the pixel/sample mask into f0 so that we know
1089 * which pixels are lit. Then, for each channel that is
1090 * unlit, replace the centroid data with non-centroid
1091 * data.
1092 */
1093 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1094 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1095 interpolation_mode, false);
1096 inst->predicate = BRW_PREDICATE_NORMAL;
1097 inst->predicate_inverse = true;
1098 }
1099 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1100 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1101 }
1102 attr.reg_offset++;
1103 }
1104
1105 }
1106 location++;
1107 }
1108 }
1109
1110 return reg;
1111 }
1112
1113 fs_reg *
1114 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1115 {
1116 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1117
1118 /* The frontfacing comes in as a bit in the thread payload. */
1119 if (brw->gen >= 6) {
1120 emit(BRW_OPCODE_ASR, *reg,
1121 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1122 fs_reg(15));
1123 emit(BRW_OPCODE_NOT, *reg, *reg);
1124 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1125 } else {
1126 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1127 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1128        * us the front face.
1129 */
1130 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1131 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1132 }
1133
1134 return reg;
1135 }
1136
1137 void
1138 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1139 {
1140 assert(dst.type == BRW_REGISTER_TYPE_F);
1141
1142 if (c->key.compute_pos_offset) {
1143 /* Convert int_sample_pos to floating point */
1144 emit(MOV(dst, int_sample_pos));
1145 /* Scale to the range [0, 1] */
1146 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1147 }
1148 else {
1149 /* From ARB_sample_shading specification:
1150 * "When rendering to a non-multisample buffer, or if multisample
1151 * rasterization is disabled, gl_SamplePosition will always be
1152        *  (0.5, 0.5)."
1153 */
1154 emit(MOV(dst, fs_reg(0.5f)));
1155 }
1156 }
1157
1158 fs_reg *
1159 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1160 {
1161 assert(brw->gen >= 6);
1162 assert(ir->type == glsl_type::vec2_type);
1163
1164 this->current_annotation = "compute sample position";
1165 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1166 fs_reg pos = *reg;
1167 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1168 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1169
1170 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1171 * mode will be enabled.
1172 *
1173 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1174 * R31.1:0 Position Offset X/Y for Slot[3:0]
1175 * R31.3:2 Position Offset X/Y for Slot[7:4]
1176 * .....
1177 *
1178 * The X, Y sample positions come in as bytes in thread payload. So, read
1179 * the positions using vstride=16, width=8, hstride=2.
1180 */
1181 struct brw_reg sample_pos_reg =
1182 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1183 BRW_REGISTER_TYPE_B), 16, 8, 2);
1184
1185 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1186 if (dispatch_width == 16) {
1187 int_sample_x.sechalf = true;
1188 fs_inst *inst = emit(MOV(int_sample_x,
1189 fs_reg(suboffset(sample_pos_reg, 16))));
1190 inst->force_sechalf = true;
1191 int_sample_x.sechalf = false;
1192 }
1193 /* Compute gl_SamplePosition.x */
1194 compute_sample_position(pos, int_sample_x);
1195 pos.reg_offset++;
1196 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1197 if (dispatch_width == 16) {
1198 int_sample_y.sechalf = true;
1199 fs_inst *inst = emit(MOV(int_sample_y,
1200 fs_reg(suboffset(sample_pos_reg, 17))));
1201 inst->force_sechalf = true;
1202 int_sample_y.sechalf = false;
1203 }
1204 /* Compute gl_SamplePosition.y */
1205 compute_sample_position(pos, int_sample_y);
1206 return reg;
1207 }
1208
1209 fs_reg *
1210 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1211 {
1212 assert(brw->gen >= 6);
1213
1214 this->current_annotation = "compute sample id";
1215 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1216
1217 if (c->key.compute_sample_id) {
1218 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1219 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1220 t2.type = BRW_REGISTER_TYPE_UW;
1221
1222 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1223 * 8x multisampling, subspan 0 will represent sample N (where N
1224 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1225 * 7. We can find the value of N by looking at R0.0 bits 7:6
1226 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1227 * (since samples are always delivered in pairs). That is, we
1228 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1229 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1230 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1231 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1232 * populating a temporary variable with the sequence (0, 1, 2, 3),
1233 * and then reading from it using vstride=1, width=4, hstride=0.
1234 * These computations hold good for 4x multisampling as well.
1235 */
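      /* For example, with SSPI == 2 in R0.0 bits 7:6, t1 below becomes
       * (0x80 >> 5) == 4, and adding the per-subspan sequence
       * (0, 0, 0, 0, 1, 1, 1, 1) yields sample IDs (4, 4, 4, 4, 5, 5, 5, 5)
       * for a SIMD8 dispatch.
       */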
1236 emit(BRW_OPCODE_AND, t1,
1237 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1238 fs_reg(brw_imm_d(0xc0)));
1239 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1240 /* This works for both SIMD8 and SIMD16 */
1241 emit(MOV(t2, brw_imm_v(0x3210)));
1242 /* This special instruction takes care of setting vstride=1,
1243 * width=4, hstride=0 of t2 during an ADD instruction.
1244 */
1245 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1246 } else {
1247 /* As per GL_ARB_sample_shading specification:
1248 * "When rendering to a non-multisample buffer, or if multisample
1249 * rasterization is disabled, gl_SampleID will always be zero."
1250 */
1251 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1252 }
1253
1254 return reg;
1255 }
1256
1257 fs_reg
1258 fs_visitor::fix_math_operand(fs_reg src)
1259 {
1260 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1261 * might be able to do better by doing execsize = 1 math and then
1262 * expanding that result out, but we would need to be careful with
1263 * masking.
1264 *
1265 * The hardware ignores source modifiers (negate and abs) on math
1266 * instructions, so we also move to a temp to set those up.
1267 */
1268 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1269 !src.abs && !src.negate)
1270 return src;
1271
1272 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1273 * operands to math
1274 */
1275 if (brw->gen >= 7 && src.file != IMM)
1276 return src;
1277
1278 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1279 expanded.type = src.type;
1280 emit(BRW_OPCODE_MOV, expanded, src);
1281 return expanded;
1282 }
1283
1284 fs_inst *
1285 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1286 {
1287 switch (opcode) {
1288 case SHADER_OPCODE_RCP:
1289 case SHADER_OPCODE_RSQ:
1290 case SHADER_OPCODE_SQRT:
1291 case SHADER_OPCODE_EXP2:
1292 case SHADER_OPCODE_LOG2:
1293 case SHADER_OPCODE_SIN:
1294 case SHADER_OPCODE_COS:
1295 break;
1296 default:
1297 assert(!"not reached: bad math opcode");
1298 return NULL;
1299 }
1300
1301 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1302 * might be able to do better by doing execsize = 1 math and then
1303 * expanding that result out, but we would need to be careful with
1304 * masking.
1305 *
1306 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1307 * instructions, so we also move to a temp to set those up.
1308 */
1309 if (brw->gen >= 6)
1310 src = fix_math_operand(src);
1311
1312 fs_inst *inst = emit(opcode, dst, src);
1313
1314 if (brw->gen < 6) {
1315 inst->base_mrf = 2;
1316 inst->mlen = dispatch_width / 8;
1317 }
1318
1319 return inst;
1320 }
1321
1322 fs_inst *
1323 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1324 {
1325 int base_mrf = 2;
1326 fs_inst *inst;
1327
1328 switch (opcode) {
1329 case SHADER_OPCODE_INT_QUOTIENT:
1330 case SHADER_OPCODE_INT_REMAINDER:
1331 if (brw->gen >= 7 && dispatch_width == 16)
1332 fail("16-wide INTDIV unsupported\n");
1333 break;
1334 case SHADER_OPCODE_POW:
1335 break;
1336 default:
1337 assert(!"not reached: unsupported binary math opcode.");
1338 return NULL;
1339 }
1340
1341 if (brw->gen >= 6) {
1342 src0 = fix_math_operand(src0);
1343 src1 = fix_math_operand(src1);
1344
1345 inst = emit(opcode, dst, src0, src1);
1346 } else {
1347 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1348 * "Message Payload":
1349 *
1350 * "Operand0[7]. For the INT DIV functions, this operand is the
1351 * denominator."
1352 * ...
1353 * "Operand1[7]. For the INT DIV functions, this operand is the
1354 * numerator."
1355 */
1356 bool is_int_div = opcode != SHADER_OPCODE_POW;
1357 fs_reg &op0 = is_int_div ? src1 : src0;
1358 fs_reg &op1 = is_int_div ? src0 : src1;
1359
1360 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1361 inst = emit(opcode, dst, op0, reg_null_f);
1362
1363 inst->base_mrf = base_mrf;
1364 inst->mlen = 2 * dispatch_width / 8;
1365 }
1366 return inst;
1367 }
1368
1369 void
1370 fs_visitor::assign_curb_setup()
1371 {
1372 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1373 if (dispatch_width == 8) {
1374 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1375 } else {
1376 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1377 }
1378
1379 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1380 foreach_list(node, &this->instructions) {
1381 fs_inst *inst = (fs_inst *)node;
1382
1383 for (unsigned int i = 0; i < 3; i++) {
1384 if (inst->src[i].file == UNIFORM) {
1385 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1386 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1387 constant_nr / 8,
1388 constant_nr % 8);
1389
1390 inst->src[i].file = HW_REG;
1391 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1392 }
1393 }
1394 }
1395 }
1396
1397 void
1398 fs_visitor::calculate_urb_setup()
1399 {
1400 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1401 c->prog_data.urb_setup[i] = -1;
1402 }
1403
1404 int urb_next = 0;
1405 /* Figure out where each of the incoming setup attributes lands. */
1406 if (brw->gen >= 6) {
1407 if (_mesa_bitcount_64(fp->Base.InputsRead &
1408 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1409 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1410 * first 16 varying inputs, so we can put them wherever we want.
1411 * Just put them in order.
1412 *
1413 * This is useful because it means that (a) inputs not used by the
1414 * fragment shader won't take up valuable register space, and (b) we
1415 * won't have to recompile the fragment shader if it gets paired with
1416 * a different vertex (or geometry) shader.
1417 */
1418 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1419 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1420 BITFIELD64_BIT(i)) {
1421 c->prog_data.urb_setup[i] = urb_next++;
1422 }
1423 }
1424 } else {
1425 /* We have enough input varyings that the SF/SBE pipeline stage can't
1426 * arbitrarily rearrange them to suit our whim; we have to put them
1427 * in an order that matches the output of the previous pipeline stage
1428 * (geometry or vertex shader).
1429 */
1430 struct brw_vue_map prev_stage_vue_map;
1431 brw_compute_vue_map(brw, &prev_stage_vue_map,
1432 c->key.input_slots_valid);
1433 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1434 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1435 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1436 slot++) {
1437 int varying = prev_stage_vue_map.slot_to_varying[slot];
1438 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1439 * unused.
1440 */
1441 if (varying != BRW_VARYING_SLOT_COUNT &&
1442 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1443 BITFIELD64_BIT(varying))) {
1444 c->prog_data.urb_setup[varying] = slot - first_slot;
1445 }
1446 }
1447 urb_next = prev_stage_vue_map.num_slots - first_slot;
1448 }
1449 } else {
1450 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1451 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1452 /* Point size is packed into the header, not as a general attribute */
1453 if (i == VARYING_SLOT_PSIZ)
1454 continue;
1455
1456 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1457 /* The back color slot is skipped when the front color is
1458 * also written to. In addition, some slots can be
1459 * written in the vertex shader and not read in the
1460 * fragment shader. So the register number must always be
1461 * incremented, mapped or not.
1462 */
1463 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1464 c->prog_data.urb_setup[i] = urb_next;
1465 urb_next++;
1466 }
1467 }
1468
1469 /*
1470     * It's an FS-only attribute (gl_PointCoord), and the SF thread did the
1471     * interpolation for it, so count it here, too.
1472 *
1473 * See compile_sf_prog() for more info.
1474 */
1475 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1476 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1477 }
1478
1479 c->prog_data.num_varying_inputs = urb_next;
1480 }
1481
1482 void
1483 fs_visitor::assign_urb_setup()
1484 {
1485 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1486
1487 /* Offset all the urb_setup[] index by the actual position of the
1488 * setup regs, now that the location of the constants has been chosen.
1489 */
1490 foreach_list(node, &this->instructions) {
1491 fs_inst *inst = (fs_inst *)node;
1492
1493 if (inst->opcode == FS_OPCODE_LINTERP) {
1494 assert(inst->src[2].file == HW_REG);
1495 inst->src[2].fixed_hw_reg.nr += urb_start;
1496 }
1497
1498 if (inst->opcode == FS_OPCODE_CINTERP) {
1499 assert(inst->src[0].file == HW_REG);
1500 inst->src[0].fixed_hw_reg.nr += urb_start;
1501 }
1502 }
1503
1504 /* Each attribute is 4 setup channels, each of which is half a reg. */
1505 this->first_non_payload_grf =
1506 urb_start + c->prog_data.num_varying_inputs * 2;
1507 }
1508
1509 /**
1510 * Split large virtual GRFs into separate components if we can.
1511 *
1512 * This is mostly duplicated with what brw_fs_vector_splitting does,
1513 * but that's really conservative because it's afraid of doing
1514 * splitting that doesn't result in real progress after the rest of
1515 * the optimization phases, which would cause infinite looping in
1516 * optimization. We can do it once here, safely. This also has the
1517 * opportunity to split interpolated values, or maybe even uniforms,
1518 * which we don't have at the IR level.
1519 *
1520 * We want to split, because virtual GRFs are what we register
1521 * allocate and spill (due to contiguousness requirements for some
1522 * instructions), and they're what we naturally generate in the
1523 * codegen process, but most virtual GRFs don't actually need to be
1524 * contiguous sets of GRFs. If we split, we'll end up with reduced
1525 * live intervals and better dead code elimination and coalescing.
1526 */
1527 void
1528 fs_visitor::split_virtual_grfs()
1529 {
1530 int num_vars = this->virtual_grf_count;
1531 bool split_grf[num_vars];
1532 int new_virtual_grf[num_vars];
1533
1534 /* Try to split anything > 0 sized. */
1535 for (int i = 0; i < num_vars; i++) {
1536 if (this->virtual_grf_sizes[i] != 1)
1537 split_grf[i] = true;
1538 else
1539 split_grf[i] = false;
1540 }
1541
1542 if (brw->has_pln &&
1543 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1544 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1545 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1546 * Gen6, that was the only supported interpolation mode, and since Gen6,
1547 * delta_x and delta_y are in fixed hardware registers.
1548 */
1549 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1550 false;
1551 }
1552
1553 foreach_list(node, &this->instructions) {
1554 fs_inst *inst = (fs_inst *)node;
1555
1556 /* If there's a SEND message that requires contiguous destination
1557 * registers, no splitting is allowed.
1558 */
1559 if (inst->regs_written > 1) {
1560 split_grf[inst->dst.reg] = false;
1561 }
1562
1563 /* If we're sending from a GRF, don't split it, on the assumption that
1564 * the send is reading the whole thing.
1565 */
1566 if (inst->is_send_from_grf()) {
1567 for (int i = 0; i < 3; i++) {
1568 if (inst->src[i].file == GRF) {
1569 split_grf[inst->src[i].reg] = false;
1570 }
1571 }
1572 }
1573 }
1574
1575 /* Allocate new space for split regs. Note that the virtual
1576 * numbers will be contiguous.
1577 */
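   /* For example, a 4-register virtual GRF keeps reg_offset 0 under its
    * original number and gets three fresh single-register GRFs here; the
    * patching loop below then rewrites reg_offsets 1 through 3 to point at
    * them.
    */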
1578 for (int i = 0; i < num_vars; i++) {
1579 if (split_grf[i]) {
1580 new_virtual_grf[i] = virtual_grf_alloc(1);
1581 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1582 int reg = virtual_grf_alloc(1);
1583 assert(reg == new_virtual_grf[i] + j - 1);
1584 (void) reg;
1585 }
1586 this->virtual_grf_sizes[i] = 1;
1587 }
1588 }
1589
1590 foreach_list(node, &this->instructions) {
1591 fs_inst *inst = (fs_inst *)node;
1592
1593 if (inst->dst.file == GRF &&
1594 split_grf[inst->dst.reg] &&
1595 inst->dst.reg_offset != 0) {
1596 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1597 inst->dst.reg_offset - 1);
1598 inst->dst.reg_offset = 0;
1599 }
1600 for (int i = 0; i < 3; i++) {
1601 if (inst->src[i].file == GRF &&
1602 split_grf[inst->src[i].reg] &&
1603 inst->src[i].reg_offset != 0) {
1604 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1605 inst->src[i].reg_offset - 1);
1606 inst->src[i].reg_offset = 0;
1607 }
1608 }
1609 }
1610 invalidate_live_intervals();
1611 }
1612
1613 /**
1614 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1615 *
1616 * During code generation, we create tons of temporary variables, many of
1617 * which get immediately killed and are never used again. Yet, in later
1618 * optimization and analysis passes, such as compute_live_intervals, we need
1619 * to loop over all the virtual GRFs. Compacting them can save a lot of
1620 * overhead.
1621 */
1622 void
1623 fs_visitor::compact_virtual_grfs()
1624 {
1625 /* Mark which virtual GRFs are used, and count how many. */
1626 int remap_table[this->virtual_grf_count];
1627 memset(remap_table, -1, sizeof(remap_table));
1628
1629 foreach_list(node, &this->instructions) {
1630 const fs_inst *inst = (const fs_inst *) node;
1631
1632 if (inst->dst.file == GRF)
1633 remap_table[inst->dst.reg] = 0;
1634
1635 for (int i = 0; i < 3; i++) {
1636 if (inst->src[i].file == GRF)
1637 remap_table[inst->src[i].reg] = 0;
1638 }
1639 }
1640
1641 /* In addition to registers used in instructions, fs_visitor keeps
1642 * direct references to certain special values which must be patched:
1643 */
1644 fs_reg *special[] = {
1645 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1646 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1647 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1648 &delta_x[0], &delta_x[1], &delta_x[2],
1649 &delta_x[3], &delta_x[4], &delta_x[5],
1650 &delta_y[0], &delta_y[1], &delta_y[2],
1651 &delta_y[3], &delta_y[4], &delta_y[5],
1652 };
1653 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1654 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1655
1656 /* Treat all special values as used, to be conservative */
1657 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1658 if (special[i]->file == GRF)
1659 remap_table[special[i]->reg] = 0;
1660 }
1661
1662 /* Compact the GRF arrays. */
1663 int new_index = 0;
1664 for (int i = 0; i < this->virtual_grf_count; i++) {
1665 if (remap_table[i] != -1) {
1666 remap_table[i] = new_index;
1667 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1668 invalidate_live_intervals();
1669 ++new_index;
1670 }
1671 }
1672
1673 this->virtual_grf_count = new_index;
1674
1675 /* Patch all the instructions to use the newly renumbered registers */
1676 foreach_list(node, &this->instructions) {
1677 fs_inst *inst = (fs_inst *) node;
1678
1679 if (inst->dst.file == GRF)
1680 inst->dst.reg = remap_table[inst->dst.reg];
1681
1682 for (int i = 0; i < 3; i++) {
1683 if (inst->src[i].file == GRF)
1684 inst->src[i].reg = remap_table[inst->src[i].reg];
1685 }
1686 }
1687
1688 /* Patch all the references to special values */
1689 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1690 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1691 special[i]->reg = remap_table[special[i]->reg];
1692 }
1693 }
1694
1695 bool
1696 fs_visitor::remove_dead_constants()
1697 {
1698 if (dispatch_width == 8) {
1699 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1700 this->nr_params_remap = c->prog_data.nr_params;
1701
1702 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1703 this->params_remap[i] = -1;
1704
1705 /* Find which params are still in use. */
1706 foreach_list(node, &this->instructions) {
1707 fs_inst *inst = (fs_inst *)node;
1708
1709 for (int i = 0; i < 3; i++) {
1710 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1711
1712 if (inst->src[i].file != UNIFORM)
1713 continue;
1714
1715 /* Section 5.11 of the OpenGL 4.3 spec says:
1716 *
1717 * "Out-of-bounds reads return undefined values, which include
1718 * values from other variables of the active program or zero."
1719 */
1720 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1721 constant_nr = 0;
1722 }
1723
1724 /* For now, set this to non-negative. We'll give it the
1725 * actual new number in a moment, in order to keep the
1726 * register numbers nicely ordered.
1727 */
1728 this->params_remap[constant_nr] = 0;
1729 }
1730 }
1731
1732 /* Figure out what the new numbers for the params will be. At some
1733 * point when we're doing uniform array access, we're going to want
1734 * to keep the distinction between .reg and .reg_offset, but for
1735 * now we don't care.
1736 */
1737 unsigned int new_nr_params = 0;
1738 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1739 if (this->params_remap[i] != -1) {
1740 this->params_remap[i] = new_nr_params++;
1741 }
1742 }
1743
1744 /* Update the list of params to be uploaded to match our new numbering. */
1745 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1746 int remapped = this->params_remap[i];
1747
1748 if (remapped == -1)
1749 continue;
1750
1751 c->prog_data.param[remapped] = c->prog_data.param[i];
1752 }
1753
1754 c->prog_data.nr_params = new_nr_params;
1755 } else {
1756 /* This should have been generated in the 8-wide pass already. */
1757 assert(this->params_remap);
1758 }
1759
1760 /* Now do the renumbering of the shader to remove unused params. */
1761 foreach_list(node, &this->instructions) {
1762 fs_inst *inst = (fs_inst *)node;
1763
1764 for (int i = 0; i < 3; i++) {
1765 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1766
1767 if (inst->src[i].file != UNIFORM)
1768 continue;
1769
1770 /* as above alias to 0 */
1771 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1772 constant_nr = 0;
1773 }
1774 assert(this->params_remap[constant_nr] != -1);
1775 inst->src[i].reg = this->params_remap[constant_nr];
1776 inst->src[i].reg_offset = 0;
1777 }
1778 }
1779
1780 return true;
1781 }
1782
1783 /*
1784 * Implements array access of uniforms by inserting a
1785 * PULL_CONSTANT_LOAD instruction.
1786 *
1787 * Unlike temporary GRF array access (where we don't support it due to
1788 * the difficulty of doing relative addressing on instruction
1789 * destinations), we could potentially do array access of uniforms
1790 * that were loaded in GRF space as push constants. In real-world
1791 * usage we've seen, though, the arrays being used are always larger
1792 * than we could load as push constants, so just always move all
1793 * uniform array access out to a pull constant buffer.
1794 */
1795 void
1796 fs_visitor::move_uniform_array_access_to_pull_constants()
1797 {
1798 int pull_constant_loc[c->prog_data.nr_params];
1799
1800 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1801 pull_constant_loc[i] = -1;
1802 }
1803
1804 /* Walk through and find array access of uniforms. Put a copy of that
1805 * uniform in the pull constant buffer.
1806 *
1807 * Note that we don't move constant-indexed accesses to arrays. No
1808 * testing has been done of the performance impact of this choice.
1809 */
1810 foreach_list_safe(node, &this->instructions) {
1811 fs_inst *inst = (fs_inst *)node;
1812
1813 for (int i = 0 ; i < 3; i++) {
1814 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1815 continue;
1816
1817 int uniform = inst->src[i].reg;
1818
1819 /* If this array isn't already present in the pull constant buffer,
1820 * add it.
1821 */
1822 if (pull_constant_loc[uniform] == -1) {
1823 const float **values = &c->prog_data.param[uniform];
1824
1825 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1826
1827 assert(param_size[uniform]);
1828
1829 for (int j = 0; j < param_size[uniform]; j++) {
1830 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1831 values[j];
1832 }
1833 }
1834
1835 /* Set up the annotation tracking for new generated instructions. */
1836 base_ir = inst->ir;
1837 current_annotation = inst->annotation;
1838
1839 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1840 fs_reg temp = fs_reg(this, glsl_type::float_type);
1841 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1842 surf_index,
1843 *inst->src[i].reladdr,
1844 pull_constant_loc[uniform] +
1845 inst->src[i].reg_offset);
1846 inst->insert_before(&list);
1847
1848 inst->src[i].file = temp.file;
1849 inst->src[i].reg = temp.reg;
1850 inst->src[i].reg_offset = temp.reg_offset;
1851 inst->src[i].reladdr = NULL;
1852 }
1853 }
1854 }
1855
1856 /**
1857 * Choose accesses from the UNIFORM file to demote to using the pull
1858 * constant buffer.
1859 *
1860 * We allow a fragment shader to have more than the specified minimum
1861 * maximum number of fragment shader uniform components (64). If
1862 * there are too many of these, they'd fill up all of register space.
1863 * So, this will push some of them out to the pull constant buffer and
1864 * update the program to load them.
1865 */
1866 void
1867 fs_visitor::setup_pull_constants()
1868 {
1869 /* Only allow 16 registers (128 uniform components) as push constants. */
1870 unsigned int max_uniform_components = 16 * 8;
1871 if (c->prog_data.nr_params <= max_uniform_components)
1872 return;
1873
1874 if (dispatch_width == 16) {
1875 fail("Pull constants not supported in 16-wide\n");
1876 return;
1877 }
1878
1879 /* Just demote the end of the list. We could probably do better
1880 * here, demoting things that are rarely used in the program first.
1881 */
1882 unsigned int pull_uniform_base = max_uniform_components;
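   /* For example, a shader with 200 uniform components keeps the first 128
    * as push constants; the loop below demotes components 128..199 to the
    * pull constant buffer.
    */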
1883
1884 int pull_constant_loc[c->prog_data.nr_params];
1885 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1886 if (i < pull_uniform_base) {
1887 pull_constant_loc[i] = -1;
1888 } else {
1889 pull_constant_loc[i] = -1;
1890 /* If our constant is already being uploaded for reladdr purposes,
1891 * reuse it.
1892 */
1893 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1894 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1895 pull_constant_loc[i] = j;
1896 break;
1897 }
1898 }
1899 if (pull_constant_loc[i] == -1) {
1900 int pull_index = c->prog_data.nr_pull_params++;
1901 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1902 pull_constant_loc[i] = pull_index;
1903 }
1904 }
1905 }
1906 c->prog_data.nr_params = pull_uniform_base;
1907
1908 foreach_list(node, &this->instructions) {
1909 fs_inst *inst = (fs_inst *)node;
1910
1911 for (int i = 0; i < 3; i++) {
1912 if (inst->src[i].file != UNIFORM)
1913 continue;
1914
1915 int pull_index = pull_constant_loc[inst->src[i].reg +
1916 inst->src[i].reg_offset];
1917 if (pull_index == -1)
1918 continue;
1919
1920 assert(!inst->src[i].reladdr);
1921
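/* Load the aligned vec4 containing this constant; the smear value set
 * below selects the desired component within it.
 */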
1922 fs_reg dst = fs_reg(this, glsl_type::float_type);
1923 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1924 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1925 fs_inst *pull =
1926 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1927 dst, index, offset);
1928 pull->ir = inst->ir;
1929 pull->annotation = inst->annotation;
1930
1931 inst->insert_before(pull);
1932
1933 inst->src[i].file = GRF;
1934 inst->src[i].reg = dst.reg;
1935 inst->src[i].reg_offset = 0;
1936 inst->src[i].smear = pull_index & 3;
1937 }
1938 }
1939 }
1940
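/**
 * Performs simple algebraic simplifications: multiplies by 1.0, adds of 0.0,
 * and ORs of a register with itself become plain MOVs, multiplies by 0.0
 * become MOVs of 0.0, and saturated SELs against an immediate that the
 * saturate already guarantees are reduced to MOVs.
 */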
1941 bool
1942 fs_visitor::opt_algebraic()
1943 {
1944 bool progress = false;
1945
1946 foreach_list(node, &this->instructions) {
1947 fs_inst *inst = (fs_inst *)node;
1948
1949 switch (inst->opcode) {
1950 case BRW_OPCODE_MUL:
1951 if (inst->src[1].file != IMM)
1952 continue;
1953
1954 /* a * 1.0 = a */
1955 if (inst->src[1].is_one()) {
1956 inst->opcode = BRW_OPCODE_MOV;
1957 inst->src[1] = reg_undef;
1958 progress = true;
1959 break;
1960 }
1961
1962 /* a * 0.0 = 0.0 */
1963 if (inst->src[1].is_zero()) {
1964 inst->opcode = BRW_OPCODE_MOV;
1965 inst->src[0] = inst->src[1];
1966 inst->src[1] = reg_undef;
1967 progress = true;
1968 break;
1969 }
1970
1971 break;
1972 case BRW_OPCODE_ADD:
1973 if (inst->src[1].file != IMM)
1974 continue;
1975
1976 /* a + 0.0 = a */
1977 if (inst->src[1].is_zero()) {
1978 inst->opcode = BRW_OPCODE_MOV;
1979 inst->src[1] = reg_undef;
1980 progress = true;
1981 break;
1982 }
1983 break;
1984 case BRW_OPCODE_OR:
1985 if (inst->src[0].equals(inst->src[1])) {
1986 inst->opcode = BRW_OPCODE_MOV;
1987 inst->src[1] = reg_undef;
1988 progress = true;
1989 break;
1990 }
1991 break;
1992 case BRW_OPCODE_SEL:
1993 if (inst->saturate && inst->src[1].file == IMM) {
1994 switch (inst->conditional_mod) {
1995 case BRW_CONDITIONAL_LE:
1996 case BRW_CONDITIONAL_L:
1997 switch (inst->src[1].type) {
1998 case BRW_REGISTER_TYPE_F:
1999 if (inst->src[1].imm.f >= 1.0f) {
2000 inst->opcode = BRW_OPCODE_MOV;
2001 inst->src[1] = reg_undef;
2002 progress = true;
2003 }
2004 break;
2005 default:
2006 break;
2007 }
2008 break;
2009 case BRW_CONDITIONAL_GE:
2010 case BRW_CONDITIONAL_G:
2011 switch (inst->src[1].type) {
2012 case BRW_REGISTER_TYPE_F:
2013 if (inst->src[1].imm.f <= 0.0f) {
2014 inst->opcode = BRW_OPCODE_MOV;
2015 inst->src[1] = reg_undef;
2016 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2017 progress = true;
2018 }
2019 break;
2020 default:
2021 break;
2022 }
2023 default:
2024 break;
2025 }
2026 }
2027 break;
2028 default:
2029 break;
2030 }
2031 }
2032
2033 return progress;
2034 }
2035
2036 /**
2037 * Removes any instructions writing a VGRF where that VGRF is not used by any
2038 * later instruction.
2039 */
2040 bool
2041 fs_visitor::dead_code_eliminate()
2042 {
2043 bool progress = false;
2044 int pc = 0;
2045
2046 calculate_live_intervals();
2047
2048 foreach_list_safe(node, &this->instructions) {
2049 fs_inst *inst = (fs_inst *)node;
2050
2051 if (inst->dst.file == GRF && !inst->has_side_effects()) {
2052 bool dead = true;
2053
2054 for (int i = 0; i < inst->regs_written; i++) {
2055 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2056 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2057 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2058 dead = false;
2059 break;
2060 }
2061 }
2062
2063 if (dead) {
2064 /* Don't dead code eliminate instructions that write to the
2065 * accumulator as a side-effect. Instead just set the destination
2066 * to the null register to free it.
2067 */
2068 switch (inst->opcode) {
2069 case BRW_OPCODE_ADDC:
2070 case BRW_OPCODE_SUBB:
2071 case BRW_OPCODE_MACH:
2072 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2073 break;
2074 default:
2075 inst->remove();
2076 progress = true;
2077 break;
2078 }
2079 }
2080 }
2081
2082 pc++;
2083 }
2084
2085 if (progress)
2086 invalidate_live_intervals();
2087
2088 return progress;
2089 }
2090
2091 struct dead_code_hash_key
2092 {
2093 int vgrf;
2094 int reg_offset;
2095 };
2096
2097 static bool
2098 dead_code_hash_compare(const void *a, const void *b)
2099 {
2100 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2101 }
2102
2103 static void
2104 clear_dead_code_hash(struct hash_table *ht)
2105 {
2106 struct hash_entry *entry;
2107
2108 hash_table_foreach(ht, entry) {
2109 _mesa_hash_table_remove(ht, entry);
2110 }
2111 }
2112
2113 static void
2114 insert_dead_code_hash(struct hash_table *ht,
2115 int vgrf, int reg_offset, fs_inst *inst)
2116 {
2117 /* We don't bother freeing keys, because they'll be GCed with the ht. */
2118 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2119
2120 key->vgrf = vgrf;
2121 key->reg_offset = reg_offset;
2122
2123 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2124 }
2125
2126 static struct hash_entry *
2127 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2128 {
2129 struct dead_code_hash_key key;
2130
2131 key.vgrf = vgrf;
2132 key.reg_offset = reg_offset;
2133
2134 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2135 }
2136
2137 static void
2138 remove_dead_code_hash(struct hash_table *ht,
2139 int vgrf, int reg_offset)
2140 {
2141 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2142 if (!entry)
2143 return;
2144
2145 _mesa_hash_table_remove(ht, entry);
2146 }
2147
2148 /**
2149 * Walks basic blocks, removing any regs that are written but not read before
2150 * being redefined.
2151 *
2152 * The dead_code_eliminate() function implements a global dead code
2153 * elimination, but it only handles removing the last write to a register
2154 * if it's never read. This one can handle intermediate writes, but only
2155 * within a basic block.
2156 */
2157 bool
2158 fs_visitor::dead_code_eliminate_local()
2159 {
2160 struct hash_table *ht;
2161 bool progress = false;
2162
2163 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2164
2165 foreach_list_safe(node, &this->instructions) {
2166 fs_inst *inst = (fs_inst *)node;
2167
2168 /* At a basic block boundary, empty the HT since we don't track
2169 * dataflow across blocks here.
2170 */
2171 if (inst->is_control_flow()) {
2172 clear_dead_code_hash(ht);
2173 continue;
2174 }
2175
2176 /* Clear the HT of any instructions that got read. */
2177 for (int i = 0; i < 3; i++) {
2178 fs_reg src = inst->src[i];
2179 if (src.file != GRF)
2180 continue;
2181
2182 int read = 1;
2183 if (inst->is_send_from_grf())
2184 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2185
2186 for (int reg_offset = src.reg_offset;
2187 reg_offset < src.reg_offset + read;
2188 reg_offset++) {
2189 remove_dead_code_hash(ht, src.reg, reg_offset);
2190 }
2191 }
2192
2193 /* Add any update of a GRF to the HT, removing a previous write if it
2194 * wasn't read.
2195 */
2196 if (inst->dst.file == GRF) {
2197 if (inst->regs_written > 1) {
2198 /* We don't know how to trim channels from an instruction's
2199 * writes, so we can't incrementally remove unread channels from
2200 * it. Just remove whatever it overwrites from the table.
2201 */
2202 for (int i = 0; i < inst->regs_written; i++) {
2203 remove_dead_code_hash(ht,
2204 inst->dst.reg,
2205 inst->dst.reg_offset + i);
2206 }
2207 } else {
2208 struct hash_entry *entry =
2209 get_dead_code_hash_entry(ht, inst->dst.reg,
2210 inst->dst.reg_offset);
2211
2212 if (entry) {
2213 if (inst->is_partial_write()) {
2214 /* For a partial write, we can't remove any previous dead code
2215 * candidate, since we're just modifying its result.
2216 */
2217 } else {
2218 /* We're completely updating a channel, and there was a
2219 * previous write to the channel that wasn't read. Kill it!
2220 */
2221 fs_inst *inst = (fs_inst *)entry->data;
2222 inst->remove();
2223 progress = true;
2224 }
2225
2226 _mesa_hash_table_remove(ht, entry);
2227 }
2228
2229 if (!inst->has_side_effects())
2230 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2231 inst);
2232 }
2233 }
2234 }
2235
2236 _mesa_hash_table_destroy(ht, NULL);
2237
2238 if (progress)
2239 invalidate_live_intervals();
2240
2241 return progress;
2242 }
2243
2244 /**
2245 * Implements register coalescing: Checks if the two registers involved in a
2246 * raw move don't interfere, in which case they can both be stored in the same
2247 * place and the MOV removed.
2248 */
2249 bool
2250 fs_visitor::register_coalesce()
2251 {
2252 bool progress = false;
2253
2254 calculate_live_intervals();
2255
2256 foreach_list_safe(node, &this->instructions) {
2257 fs_inst *inst = (fs_inst *)node;
2258
2259 if (inst->opcode != BRW_OPCODE_MOV ||
2260 inst->is_partial_write() ||
2261 inst->saturate ||
2262 inst->src[0].file != GRF ||
2263 inst->src[0].negate ||
2264 inst->src[0].abs ||
2265 inst->src[0].smear != -1 ||
2266 inst->dst.file != GRF ||
2267 inst->dst.type != inst->src[0].type ||
2268 virtual_grf_sizes[inst->src[0].reg] != 1) {
2269 continue;
2270 }
2271
2272 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2273 int var_to = live_intervals->var_from_reg(&inst->dst);
2274
2275 if (live_intervals->vars_interfere(var_from, var_to) &&
2276 !inst->dst.equals(inst->src[0]))
2277 continue;
2278
2279 int reg_from = inst->src[0].reg;
2280 assert(inst->src[0].reg_offset == 0);
2281 int reg_to = inst->dst.reg;
2282 int reg_to_offset = inst->dst.reg_offset;
2283
2284 foreach_list(node, &this->instructions) {
2285 fs_inst *scan_inst = (fs_inst *)node;
2286
2287 if (scan_inst->dst.file == GRF &&
2288 scan_inst->dst.reg == reg_from) {
2289 scan_inst->dst.reg = reg_to;
2290 scan_inst->dst.reg_offset = reg_to_offset;
2291 }
2292 for (int i = 0; i < 3; i++) {
2293 if (scan_inst->src[i].file == GRF &&
2294 scan_inst->src[i].reg == reg_from) {
2295 scan_inst->src[i].reg = reg_to;
2296 scan_inst->src[i].reg_offset = reg_to_offset;
2297 }
2298 }
2299 }
2300
2301 inst->remove();
2302 progress = true;
2303 continue;
2304 }
2305
2306 if (progress)
2307 invalidate_live_intervals();
2308
2309 return progress;
2310 }
2311
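/**
 * Looks for MOVs from a GRF to an MRF and tries to rewrite the instruction
 * that produced the GRF value to write directly into the MRF, removing the
 * intervening MOV.
 */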
2312 bool
2313 fs_visitor::compute_to_mrf()
2314 {
2315 bool progress = false;
2316 int next_ip = 0;
2317
2318 calculate_live_intervals();
2319
2320 foreach_list_safe(node, &this->instructions) {
2321 fs_inst *inst = (fs_inst *)node;
2322
2323 int ip = next_ip;
2324 next_ip++;
2325
2326 if (inst->opcode != BRW_OPCODE_MOV ||
2327 inst->is_partial_write() ||
2328 inst->dst.file != MRF || inst->src[0].file != GRF ||
2329 inst->dst.type != inst->src[0].type ||
2330 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2331 continue;
2332
2333 /* Work out which hardware MRF registers are written by this
2334 * instruction.
2335 */
2336 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2337 int mrf_high;
2338 if (inst->dst.reg & BRW_MRF_COMPR4) {
2339 mrf_high = mrf_low + 4;
2340 } else if (dispatch_width == 16 &&
2341 (!inst->force_uncompressed && !inst->force_sechalf)) {
2342 mrf_high = mrf_low + 1;
2343 } else {
2344 mrf_high = mrf_low;
2345 }
2346
2347 /* Can't compute-to-MRF this GRF if someone else was going to
2348 * read it later.
2349 */
2350 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2351 continue;
2352
2353 /* Found a move of a GRF to an MRF. Let's see if we can
2354 * rewrite the instruction that produced this GRF to write into the MRF.
2355 */
2356 fs_inst *scan_inst;
2357 for (scan_inst = (fs_inst *)inst->prev;
2358 scan_inst->prev != NULL;
2359 scan_inst = (fs_inst *)scan_inst->prev) {
2360 if (scan_inst->dst.file == GRF &&
2361 scan_inst->dst.reg == inst->src[0].reg) {
2362 /* Found the last instruction to write the register we want to
2363 * turn into a compute-to-MRF.
2364 */
2365
2366 /* If this one instruction didn't populate all the
2367 * channels, bail. We might be able to rewrite everything
2368 * that writes that reg, but it would require smarter
2369 * tracking to delay the rewriting until complete success.
2370 */
2371 if (scan_inst->is_partial_write())
2372 break;
2373
2374 /* Instructions writing more than one register would require us to
2375 * coalesce out more than one MOV at a time.
2376 */
2377 if (scan_inst->regs_written > 1)
2378 break;
2379
2380 /* SEND instructions can't have MRF as a destination. */
2381 if (scan_inst->mlen)
2382 break;
2383
2384 if (brw->gen == 6) {
2385 /* gen6 math instructions must have the destination be
2386 * GRF, so no compute-to-MRF for them.
2387 */
2388 if (scan_inst->is_math()) {
2389 break;
2390 }
2391 }
2392
2393 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2394 /* Found the creator of our MRF's source value. */
2395 scan_inst->dst.file = MRF;
2396 scan_inst->dst.reg = inst->dst.reg;
2397 scan_inst->saturate |= inst->saturate;
2398 inst->remove();
2399 progress = true;
2400 }
2401 break;
2402 }
2403
2404 /* We don't handle control flow here. Most computation of
2405 * values that end up in MRFs happens shortly before the MRF
2406 * write anyway.
2407 */
2408 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2409 break;
2410
2411 /* You can't read from an MRF, so if someone else reads our
2412 * MRF's source GRF that we wanted to rewrite, that stops us.
2413 */
2414 bool interfered = false;
2415 for (int i = 0; i < 3; i++) {
2416 if (scan_inst->src[i].file == GRF &&
2417 scan_inst->src[i].reg == inst->src[0].reg &&
2418 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2419 interfered = true;
2420 }
2421 }
2422 if (interfered)
2423 break;
2424
2425 if (scan_inst->dst.file == MRF) {
2426 /* If somebody else writes our MRF here, we can't
2427 * compute-to-MRF before that.
2428 */
2429 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2430 int scan_mrf_high;
2431
2432 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2433 scan_mrf_high = scan_mrf_low + 4;
2434 } else if (dispatch_width == 16 &&
2435 (!scan_inst->force_uncompressed &&
2436 !scan_inst->force_sechalf)) {
2437 scan_mrf_high = scan_mrf_low + 1;
2438 } else {
2439 scan_mrf_high = scan_mrf_low;
2440 }
2441
2442 if (mrf_low == scan_mrf_low ||
2443 mrf_low == scan_mrf_high ||
2444 mrf_high == scan_mrf_low ||
2445 mrf_high == scan_mrf_high) {
2446 break;
2447 }
2448 }
2449
2450 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2451 /* Found a SEND instruction, which means that there are
2452 * live values in MRFs from base_mrf to base_mrf +
2453 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2454 * above it.
2455 */
2456 if (mrf_low >= scan_inst->base_mrf &&
2457 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2458 break;
2459 }
2460 if (mrf_high >= scan_inst->base_mrf &&
2461 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2462 break;
2463 }
2464 }
2465 }
2466 }
2467
2468 if (progress)
2469 invalidate_live_intervals();
2470
2471 return progress;
2472 }
2473
2474 /**
2475 * Walks through basic blocks, looking for repeated MRF writes and
2476 * removing the later ones.
2477 */
2478 bool
2479 fs_visitor::remove_duplicate_mrf_writes()
2480 {
2481 fs_inst *last_mrf_move[16];
2482 bool progress = false;
2483
2484 /* We'd need to update the MRF tracking to handle compressed instructions, so skip this pass in 16-wide dispatch. */
2485 if (dispatch_width == 16)
2486 return false;
2487
2488 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2489
2490 foreach_list_safe(node, &this->instructions) {
2491 fs_inst *inst = (fs_inst *)node;
2492
2493 if (inst->is_control_flow()) {
2494 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2495 }
2496
2497 if (inst->opcode == BRW_OPCODE_MOV &&
2498 inst->dst.file == MRF) {
2499 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2500 if (prev_inst && inst->equals(prev_inst)) {
2501 inst->remove();
2502 progress = true;
2503 continue;
2504 }
2505 }
2506
2507 /* Clear out the last-write records for MRFs that were overwritten. */
2508 if (inst->dst.file == MRF) {
2509 last_mrf_move[inst->dst.reg] = NULL;
2510 }
2511
2512 if (inst->mlen > 0 && inst->base_mrf != -1) {
2513 /* Found a SEND instruction, which will include two or fewer
2514 * implied MRF writes. We could do better here.
2515 */
2516 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2517 last_mrf_move[inst->base_mrf + i] = NULL;
2518 }
2519 }
2520
2521 /* Clear out any MRF move records whose sources got overwritten. */
2522 if (inst->dst.file == GRF) {
2523 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2524 if (last_mrf_move[i] &&
2525 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2526 last_mrf_move[i] = NULL;
2527 }
2528 }
2529 }
2530
2531 if (inst->opcode == BRW_OPCODE_MOV &&
2532 inst->dst.file == MRF &&
2533 inst->src[0].file == GRF &&
2534 !inst->is_partial_write()) {
2535 last_mrf_move[inst->dst.reg] = inst;
2536 }
2537 }
2538
2539 if (progress)
2540 invalidate_live_intervals();
2541
2542 return progress;
2543 }
2544
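/**
 * Clears the dependency flag for any register in the tracked range that this
 * instruction reads, since the read satisfies the outstanding dependency.
 */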
2545 static void
2546 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2547 int first_grf, int grf_len)
2548 {
2549 bool inst_16wide = (dispatch_width > 8 &&
2550 !inst->force_uncompressed &&
2551 !inst->force_sechalf);
2552
2553 /* Clear the flag for registers that actually got read (as expected). */
2554 for (int i = 0; i < 3; i++) {
2555 int grf;
2556 if (inst->src[i].file == GRF) {
2557 grf = inst->src[i].reg;
2558 } else if (inst->src[i].file == HW_REG &&
2559 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2560 grf = inst->src[i].fixed_hw_reg.nr;
2561 } else {
2562 continue;
2563 }
2564
2565 if (grf >= first_grf &&
2566 grf < first_grf + grf_len) {
2567 deps[grf - first_grf] = false;
2568 if (inst_16wide)
2569 deps[grf - first_grf + 1] = false;
2570 }
2571 }
2572 }
2573
2574 /**
2575 * Implements this workaround for the original 965:
2576 *
2577 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2578 * check for post destination dependencies on this instruction, software
2579 * must ensure that there is no destination hazard for the case of ‘write
2580 * followed by a posted write’ shown in the following example.
2581 *
2582 * 1. mov r3 0
2583 * 2. send r3.xy <rest of send instruction>
2584 * 3. mov r2 r3
2585 *
2586 * Due to no post-destination dependency check on the ‘send’, the above
2587 * code sequence could have two instructions (1 and 2) in flight at the
2588 * same time that both consider ‘r3’ as the target of their final writes.
2589 */
2590 void
2591 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2592 {
2593 int reg_size = dispatch_width / 8;
2594 int write_len = inst->regs_written * reg_size;
2595 int first_write_grf = inst->dst.reg;
2596 bool needs_dep[BRW_MAX_MRF];
2597 assert(write_len < (int)sizeof(needs_dep) - 1);
2598
2599 memset(needs_dep, false, sizeof(needs_dep));
2600 memset(needs_dep, true, write_len);
2601
2602 clear_deps_for_inst_src(inst, dispatch_width,
2603 needs_dep, first_write_grf, write_len);
2604
2605 /* Walk backwards looking for writes to registers we're writing which
2606 * aren't read since being written. If we hit the start of the program,
2607 * we assume that there are no outstanding dependencies on entry to the
2608 * program.
2609 */
2610 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2611 scan_inst != NULL;
2612 scan_inst = (fs_inst *)scan_inst->prev) {
2613
2614 /* If we hit control flow, assume that there *are* outstanding
2615 * dependencies, and force their cleanup before our instruction.
2616 */
2617 if (scan_inst->is_control_flow()) {
2618 for (int i = 0; i < write_len; i++) {
2619 if (needs_dep[i]) {
2620 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2621 }
2622 }
2623 return;
2624 }
2625
2626 bool scan_inst_16wide = (dispatch_width > 8 &&
2627 !scan_inst->force_uncompressed &&
2628 !scan_inst->force_sechalf);
2629
2630 /* We insert our reads as late as possible on the assumption that any
2631 * instruction but a MOV that might have left us an outstanding
2632 * dependency has more latency than a MOV.
2633 */
2634 if (scan_inst->dst.file == GRF) {
2635 for (int i = 0; i < scan_inst->regs_written; i++) {
2636 int reg = scan_inst->dst.reg + i * reg_size;
2637
2638 if (reg >= first_write_grf &&
2639 reg < first_write_grf + write_len &&
2640 needs_dep[reg - first_write_grf]) {
2641 inst->insert_before(DEP_RESOLVE_MOV(reg));
2642 needs_dep[reg - first_write_grf] = false;
2643 if (scan_inst_16wide)
2644 needs_dep[reg - first_write_grf + 1] = false;
2645 }
2646 }
2647 }
2648
2649 /* Clear the flag for registers that actually got read (as expected). */
2650 clear_deps_for_inst_src(scan_inst, dispatch_width,
2651 needs_dep, first_write_grf, write_len);
2652
2653 /* Continue the loop only if we haven't resolved all the dependencies */
2654 int i;
2655 for (i = 0; i < write_len; i++) {
2656 if (needs_dep[i])
2657 break;
2658 }
2659 if (i == write_len)
2660 return;
2661 }
2662 }
2663
2664 /**
2665 * Implements this workaround for the original 965:
2666 *
2667 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2668 * used as a destination register until after it has been sourced by an
2669 * instruction with a different destination register.
2670 */
2671 void
2672 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2673 {
2674 int write_len = inst->regs_written * dispatch_width / 8;
2675 int first_write_grf = inst->dst.reg;
2676 bool needs_dep[BRW_MAX_MRF];
2677 assert(write_len < (int)sizeof(needs_dep) - 1);
2678
2679 memset(needs_dep, false, sizeof(needs_dep));
2680 memset(needs_dep, true, write_len);
2681 /* Walk forwards looking for writes to registers we're writing which aren't
2682 * read before being written.
2683 */
2684 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2685 !scan_inst->is_tail_sentinel();
2686 scan_inst = (fs_inst *)scan_inst->next) {
2687 /* If we hit control flow, force resolve all remaining dependencies. */
2688 if (scan_inst->is_control_flow()) {
2689 for (int i = 0; i < write_len; i++) {
2690 if (needs_dep[i])
2691 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2692 }
2693 return;
2694 }
2695
2696 /* Clear the flag for registers that actually got read (as expected). */
2697 clear_deps_for_inst_src(scan_inst, dispatch_width,
2698 needs_dep, first_write_grf, write_len);
2699
2700 /* We insert our reads as late as possible since they're reading the
2701 * result of a SEND, which has massive latency.
2702 */
2703 if (scan_inst->dst.file == GRF &&
2704 scan_inst->dst.reg >= first_write_grf &&
2705 scan_inst->dst.reg < first_write_grf + write_len &&
2706 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2707 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2708 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2709 }
2710
2711 /* Continue the loop only if we haven't resolved all the dependencies */
2712 int i;
2713 for (i = 0; i < write_len; i++) {
2714 if (needs_dep[i])
2715 break;
2716 }
2717 if (i == write_len)
2718 return;
2719 }
2720
2721 /* If we hit the end of the program, resolve all remaining dependencies out
2722 * of paranoia.
2723 */
2724 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2725 assert(last_inst->eot);
2726 for (int i = 0; i < write_len; i++) {
2727 if (needs_dep[i])
2728 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2729 }
2730 }
2731
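/**
 * Applies the pre- and post-SEND dependency workarounds to every SEND
 * instruction that writes a GRF destination.
 */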
2732 void
2733 fs_visitor::insert_gen4_send_dependency_workarounds()
2734 {
2735 if (brw->gen != 4 || brw->is_g4x)
2736 return;
2737
2738 /* Note that we're done with register allocation, so GRF fs_regs always
2739 * have a .reg_offset of 0.
2740 */
2741
2742 foreach_list_safe(node, &this->instructions) {
2743 fs_inst *inst = (fs_inst *)node;
2744
2745 if (inst->mlen != 0 && inst->dst.file == GRF) {
2746 insert_gen4_pre_send_dependency_workarounds(inst);
2747 insert_gen4_post_send_dependency_workarounds(inst);
2748 }
2749 }
2750 }
2751
2752 /**
2753 * Turns the generic expression-style uniform pull constant load instruction
2754 * into a hardware-specific series of instructions for loading a pull
2755 * constant.
2756 *
2757 * The expression style allows the CSE pass before this to optimize out
2758 * repeated loads from the same offset, and gives the pre-register-allocation
2759 * scheduling full flexibility, while the conversion to native instructions
2760 * allows the post-register-allocation scheduler the best information
2761 * possible.
2762 *
2763 * Note that execution masking for setting up pull constant loads is special:
2764 * the channels that need to be written are unrelated to the current execution
2765 * mask, since a later instruction will use one of the result channels as a
2766 * source operand for all 8 or 16 of its channels.
2767 */
2768 void
2769 fs_visitor::lower_uniform_pull_constant_loads()
2770 {
2771 foreach_list(node, &this->instructions) {
2772 fs_inst *inst = (fs_inst *)node;
2773
2774 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2775 continue;
2776
2777 if (brw->gen >= 7) {
2778 /* The offset arg before was a vec4-aligned byte offset. We need to
2779 * turn it into a dword offset.
2780 */
2781 fs_reg const_offset_reg = inst->src[1];
2782 assert(const_offset_reg.file == IMM &&
2783 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2784 const_offset_reg.imm.u /= 4;
2785 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2786
2787 /* This is actually going to be a MOV, but since only the first dword
2788 * is accessed, we have a special opcode to do just that one. Note
2789 * that this needs to be an operation that will be considered a def
2790 * by live variable analysis, or register allocation will explode.
2791 */
2792 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2793 payload, const_offset_reg);
2794 setup->force_writemask_all = true;
2795
2796 setup->ir = inst->ir;
2797 setup->annotation = inst->annotation;
2798 inst->insert_before(setup);
2799
2800 /* Similarly, this will only populate the first 4 channels of the
2801 * result register (since we only use smear values from 0-3), but we
2802 * don't tell the optimizer.
2803 */
2804 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2805 inst->src[1] = payload;
2806
2807 invalidate_live_intervals();
2808 } else {
2809 /* Before register allocation, we didn't tell the scheduler about the
2810 * MRF we use. We know it's safe to use this MRF because nothing
2811 * else does except for register spill/unspill, which generates and
2812 * uses its MRF within a single IR instruction.
2813 */
2814 inst->base_mrf = 14;
2815 inst->mlen = 1;
2816 }
2817 }
2818 }
2819
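/**
 * Prints a human-readable form of a single instruction: predication, opcode,
 * conditional modifier, destination, and sources.
 */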
2820 void
2821 fs_visitor::dump_instruction(backend_instruction *be_inst)
2822 {
2823 fs_inst *inst = (fs_inst *)be_inst;
2824
2825 if (inst->predicate) {
2826 printf("(%cf0.%d) ",
2827 inst->predicate_inverse ? '-' : '+',
2828 inst->flag_subreg);
2829 }
2830
2831 printf("%s", brw_instruction_name(inst->opcode));
2832 if (inst->saturate)
2833 printf(".sat");
2834 if (inst->conditional_mod) {
2835 printf("%s", conditional_modifier[inst->conditional_mod]);
2836 if (!inst->predicate &&
2837 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2838 inst->opcode != BRW_OPCODE_IF &&
2839 inst->opcode != BRW_OPCODE_WHILE))) {
2840 printf(".f0.%d", inst->flag_subreg);
2841 }
2842 }
2843 printf(" ");
2844
2845
2846 switch (inst->dst.file) {
2847 case GRF:
2848 printf("vgrf%d", inst->dst.reg);
2849 if (inst->dst.reg_offset)
2850 printf("+%d", inst->dst.reg_offset);
2851 break;
2852 case MRF:
2853 printf("m%d", inst->dst.reg);
2854 break;
2855 case BAD_FILE:
2856 printf("(null)");
2857 break;
2858 case UNIFORM:
2859 printf("***u%d***", inst->dst.reg);
2860 break;
2861 case HW_REG:
2862 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2863 switch (inst->dst.fixed_hw_reg.nr) {
2864 case BRW_ARF_NULL:
2865 printf("null");
2866 break;
2867 case BRW_ARF_ADDRESS:
2868 printf("a0.%d", inst->dst.fixed_hw_reg.subnr);
2869 break;
2870 case BRW_ARF_ACCUMULATOR:
2871 printf("acc%d", inst->dst.fixed_hw_reg.subnr);
2872 break;
2873 case BRW_ARF_FLAG:
2874 printf("f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2875 inst->dst.fixed_hw_reg.subnr);
2876 break;
2877 default:
2878 printf("arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2879 inst->dst.fixed_hw_reg.subnr);
2880 break;
2881 }
2882 } else {
2883 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
2884 }
2885 if (inst->dst.fixed_hw_reg.subnr)
2886 printf("+%d", inst->dst.fixed_hw_reg.subnr);
2887 break;
2888 default:
2889 printf("???");
2890 break;
2891 }
2892 printf(":%s, ", reg_encoding[inst->dst.type]);
2893
2894 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
2895 if (inst->src[i].negate)
2896 printf("-");
2897 if (inst->src[i].abs)
2898 printf("|");
2899 switch (inst->src[i].file) {
2900 case GRF:
2901 printf("vgrf%d", inst->src[i].reg);
2902 if (inst->src[i].reg_offset)
2903 printf("+%d", inst->src[i].reg_offset);
2904 break;
2905 case MRF:
2906 printf("***m%d***", inst->src[i].reg);
2907 break;
2908 case UNIFORM:
2909 printf("u%d", inst->src[i].reg);
2910 if (inst->src[i].reg_offset)
2911 printf(".%d", inst->src[i].reg_offset);
2912 break;
2913 case BAD_FILE:
2914 printf("(null)");
2915 break;
2916 case IMM:
2917 switch (inst->src[i].type) {
2918 case BRW_REGISTER_TYPE_F:
2919 printf("%ff", inst->src[i].imm.f);
2920 break;
2921 case BRW_REGISTER_TYPE_D:
2922 printf("%dd", inst->src[i].imm.i);
2923 break;
2924 case BRW_REGISTER_TYPE_UD:
2925 printf("%uu", inst->src[i].imm.u);
2926 break;
2927 default:
2928 printf("???");
2929 break;
2930 }
2931 break;
2932 case HW_REG:
2933 if (inst->src[i].fixed_hw_reg.negate)
2934 printf("-");
2935 if (inst->src[i].fixed_hw_reg.abs)
2936 printf("|");
2937 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2938 switch (inst->src[i].fixed_hw_reg.nr) {
2939 case BRW_ARF_NULL:
2940 printf("null");
2941 break;
2942 case BRW_ARF_ADDRESS:
2943 printf("a0.%d", inst->src[i].fixed_hw_reg.subnr);
2944 break;
2945 case BRW_ARF_ACCUMULATOR:
2946 printf("acc%d", inst->src[i].fixed_hw_reg.subnr);
2947 break;
2948 case BRW_ARF_FLAG:
2949 printf("f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2950 inst->src[i].fixed_hw_reg.subnr);
2951 break;
2952 default:
2953 printf("arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2954 inst->src[i].fixed_hw_reg.subnr);
2955 break;
2956 }
2957 } else {
2958 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2959 }
2960 if (inst->src[i].fixed_hw_reg.subnr)
2961 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
2962 if (inst->src[i].fixed_hw_reg.abs)
2963 printf("|");
2964 break;
2965 default:
2966 printf("???");
2967 break;
2968 }
2969 if (inst->src[i].abs)
2970 printf("|");
2971
2972 if (inst->src[i].file != IMM) {
2973 printf(":%s", reg_encoding[inst->src[i].type]);
2974 }
2975
2976 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
2977 printf(", ");
2978 }
2979
2980 printf(" ");
2981
2982 if (inst->force_uncompressed)
2983 printf("1sthalf ");
2984
2985 if (inst->force_sechalf)
2986 printf("2ndhalf ");
2987
2988 printf("\n");
2989 }
2990
2991 /**
2992 * Possibly returns an instruction that set up @param reg.
2993 *
2994 * Sometimes we want to take the result of some expression/variable
2995 * dereference tree and rewrite the instruction generating the result
2996 * of the tree. When processing the tree, we know that the
2997 * instructions generated are all writing temporaries that are dead
2998 * outside of this tree. So, if we have some instructions that write
2999 * a temporary, we're free to point that temp write somewhere else.
3000 *
3001 * Note that this doesn't guarantee that the instruction only generated
3002 * reg -- it might be the size=4 destination of a texture instruction.
3003 */
3004 fs_inst *
3005 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3006 fs_inst *end,
3007 fs_reg reg)
3008 {
3009 if (end == start ||
3010 end->is_partial_write() ||
3011 reg.reladdr ||
3012 !reg.equals(end->dst)) {
3013 return NULL;
3014 } else {
3015 return end;
3016 }
3017 }
3018
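/**
 * Lays out the gen6+ thread payload: the fixed header registers, the enabled
 * barycentric coordinate sets, interpolated depth and W, and the MSAA
 * position offsets, recording the register location of each.
 */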
3019 void
3020 fs_visitor::setup_payload_gen6()
3021 {
3022 bool uses_depth =
3023 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3024 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3025
3026 assert(brw->gen >= 6);
3027
3028 /* R0-1: masks, pixel X/Y coordinates. */
3029 c->nr_payload_regs = 2;
3030 /* R2: only for 32-pixel dispatch. */
3031
3032 /* R3-26: barycentric interpolation coordinates. These appear in the
3033 * same order that they appear in the brw_wm_barycentric_interp_mode
3034 * enum. Each set of coordinates occupies 2 registers if dispatch width
3035 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3036 * appear if they were enabled using the "Barycentric Interpolation
3037 * Mode" bits in WM_STATE.
3038 */
3039 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3040 if (barycentric_interp_modes & (1 << i)) {
3041 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3042 c->nr_payload_regs += 2;
3043 if (dispatch_width == 16) {
3044 c->nr_payload_regs += 2;
3045 }
3046 }
3047 }
3048
3049 /* R27: interpolated depth if uses source depth */
3050 if (uses_depth) {
3051 c->source_depth_reg = c->nr_payload_regs;
3052 c->nr_payload_regs++;
3053 if (dispatch_width == 16) {
3054 /* R28: interpolated depth if not 8-wide. */
3055 c->nr_payload_regs++;
3056 }
3057 }
3058 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3059 if (uses_depth) {
3060 c->source_w_reg = c->nr_payload_regs;
3061 c->nr_payload_regs++;
3062 if (dispatch_width == 16) {
3063 /* R30: interpolated W if not 8-wide. */
3064 c->nr_payload_regs++;
3065 }
3066 }
3067
3068 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3069 /* R31: MSAA position offsets. */
3070 if (c->prog_data.uses_pos_offset) {
3071 c->sample_pos_reg = c->nr_payload_regs;
3072 c->nr_payload_regs++;
3073 }
3074
3075 /* R32-: bary for 32-pixel. */
3076 /* R58-59: interp W for 32-pixel. */
3077
3078 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3079 c->source_depth_to_render_target = true;
3080 }
3081 }
3082
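/**
 * Assigns binding table offsets, placing the render target surfaces first
 * and then the surfaces common to all stages.
 */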
3083 void
3084 fs_visitor::assign_binding_table_offsets()
3085 {
3086 uint32_t next_binding_table_offset = 0;
3087
3088 /* If there are no color regions, we still perform an FB write to a null
3089 * renderbuffer, which we place at surface index 0.
3090 */
3091 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3092 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3093
3094 assign_common_binding_table_offsets(next_binding_table_offset);
3095 }
3096
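/**
 * Runs the compile at this dispatch width: sets up the payload, emits the
 * FS IR, runs the optimization loop to a fixed point, and then schedules and
 * register allocates.
 */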
3097 bool
3098 fs_visitor::run()
3099 {
3100 sanity_param_count = fp->Base.Parameters->NumParameters;
3101 uint32_t orig_nr_params = c->prog_data.nr_params;
3102 bool allocated_without_spills;
3103
3104 assign_binding_table_offsets();
3105
3106 if (brw->gen >= 6)
3107 setup_payload_gen6();
3108 else
3109 setup_payload_gen4();
3110
3111 if (0) {
3112 emit_dummy_fs();
3113 } else {
3114 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3115 emit_shader_time_begin();
3116
3117 calculate_urb_setup();
3118 if (fp->Base.InputsRead > 0) {
3119 if (brw->gen < 6)
3120 emit_interpolation_setup_gen4();
3121 else
3122 emit_interpolation_setup_gen6();
3123 }
3124
3125 /* We handle discards by keeping track of the still-live pixels in f0.1.
3126 * Initialize it with the dispatched pixels.
3127 */
3128 if (fp->UsesKill || c->key.alpha_test_func) {
3129 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3130 discard_init->flag_subreg = 1;
3131 }
3132
3133 /* Generate FS IR for main(). (The visitor only descends into
3134 * functions called "main".)
3135 */
3136 if (shader) {
3137 foreach_list(node, &*shader->ir) {
3138 ir_instruction *ir = (ir_instruction *)node;
3139 base_ir = ir;
3140 this->result = reg_undef;
3141 ir->accept(this);
3142 }
3143 } else {
3144 emit_fragment_program_code();
3145 }
3146 base_ir = NULL;
3147 if (failed)
3148 return false;
3149
3150 emit(FS_OPCODE_PLACEHOLDER_HALT);
3151
3152 if (c->key.alpha_test_func)
3153 emit_alpha_test();
3154
3155 emit_fb_writes();
3156
3157 split_virtual_grfs();
3158
3159 move_uniform_array_access_to_pull_constants();
3160 remove_dead_constants();
3161 setup_pull_constants();
3162
3163 bool progress;
3164 do {
3165 progress = false;
3166
3167 compact_virtual_grfs();
3168
3169 progress = remove_duplicate_mrf_writes() || progress;
3170
3171 progress = opt_algebraic() || progress;
3172 progress = opt_cse() || progress;
3173 progress = opt_copy_propagate() || progress;
3174 progress = opt_peephole_sel() || progress;
3175 progress = opt_peephole_predicated_break() || progress;
3176 progress = dead_code_eliminate() || progress;
3177 progress = dead_code_eliminate_local() || progress;
3178 progress = dead_control_flow_eliminate(this) || progress;
3179 progress = register_coalesce() || progress;
3180 progress = compute_to_mrf() || progress;
3181 } while (progress);
3182
3183 lower_uniform_pull_constant_loads();
3184
3185 assign_curb_setup();
3186 assign_urb_setup();
3187
3188 static enum instruction_scheduler_mode pre_modes[] = {
3189 SCHEDULE_PRE,
3190 SCHEDULE_PRE_NON_LIFO,
3191 SCHEDULE_PRE_LIFO,
3192 };
3193
3194 /* Try each scheduling heuristic to see if it can successfully register
3195 * allocate without spilling. They should be ordered by decreasing
3196 * performance but increasing likelihood of allocating.
3197 */
3198 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3199 schedule_instructions(pre_modes[i]);
3200
3201 if (0) {
3202 assign_regs_trivial();
3203 allocated_without_spills = true;
3204 } else {
3205 allocated_without_spills = assign_regs(false);
3206 }
3207 if (allocated_without_spills)
3208 break;
3209 }
3210
3211 if (!allocated_without_spills) {
3212 /* We assume that any spilling is worse than just dropping back to
3213 * SIMD8. There's probably actually some intermediate point where
3214 * SIMD16 with a couple of spills is still better.
3215 */
3216 if (dispatch_width == 16) {
3217 fail("Failure to register allocate. Reduce number of "
3218 "live scalar values to avoid this.");
3219 }
3220
3221 /* Since we're out of heuristics, just go spill registers until we
3222 * get an allocation.
3223 */
3224 while (!assign_regs(true)) {
3225 if (failed)
3226 break;
3227 }
3228 }
3229 }
3230 assert(force_uncompressed_stack == 0);
3231
3232 /* This must come after all optimization and register allocation, since
3233 * it inserts dead code that happens to have side effects, and it does
3234 * so based on the actual physical registers in use.
3235 */
3236 insert_gen4_send_dependency_workarounds();
3237
3238 if (failed)
3239 return false;
3240
3241 if (!allocated_without_spills)
3242 schedule_instructions(SCHEDULE_POST);
3243
3244 if (dispatch_width == 8) {
3245 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3246 } else {
3247 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3248
3249 /* Make sure we didn't try to sneak in an extra uniform */
3250 assert(orig_nr_params == c->prog_data.nr_params);
3251 (void) orig_nr_params;
3252 }
3253
3254 /* If any state parameters were appended, then ParameterValues could have
3255 * been realloced, in which case the driver uniform storage set up by
3256 * _mesa_associate_uniform_storage() would point to freed memory. Make
3257 * sure that didn't happen.
3258 */
3259 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3260
3261 return !failed;
3262 }
3263
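/**
 * Compiles the fragment shader to native code, generating an 8-wide program
 * and, when possible, a 16-wide program as well.
 */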
3264 const unsigned *
3265 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3266 struct gl_fragment_program *fp,
3267 struct gl_shader_program *prog,
3268 unsigned *final_assembly_size)
3269 {
3270 bool start_busy = false;
3271 float start_time = 0;
3272
3273 if (unlikely(brw->perf_debug)) {
3274 start_busy = (brw->batch.last_bo &&
3275 drm_intel_bo_busy(brw->batch.last_bo));
3276 start_time = get_time();
3277 }
3278
3279 struct brw_shader *shader = NULL;
3280 if (prog)
3281 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3282
3283 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3284 if (prog) {
3285 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3286 _mesa_print_ir(shader->ir, NULL);
3287 printf("\n\n");
3288 } else {
3289 printf("ARB_fragment_program %d ir for native fragment shader\n",
3290 fp->Base.Id);
3291 _mesa_print_program(&fp->Base);
3292 }
3293 }
3294
3295 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3296 */
3297 fs_visitor v(brw, c, prog, fp, 8);
3298 if (!v.run()) {
3299 if (prog) {
3300 prog->LinkStatus = false;
3301 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3302 }
3303
3304 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3305 v.fail_msg);
3306
3307 return NULL;
3308 }
3309
3310 exec_list *simd16_instructions = NULL;
3311 fs_visitor v2(brw, c, prog, fp, 16);
3312 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3313 if (c->prog_data.nr_pull_params == 0) {
3314 /* Try a 16-wide compile */
3315 v2.import_uniforms(&v);
3316 if (!v2.run()) {
3317 perf_debug("16-wide shader failed to compile, falling back to "
3318 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3319 } else {
3320 simd16_instructions = &v2.instructions;
3321 }
3322 } else {
3323 perf_debug("Skipping 16-wide due to pull parameters.\n");
3324 }
3325 }
3326
3327 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3328 const unsigned *generated = g.generate_assembly(&v.instructions,
3329 simd16_instructions,
3330 final_assembly_size);
3331
3332 if (unlikely(brw->perf_debug) && shader) {
3333 if (shader->compiled_once)
3334 brw_wm_debug_recompile(brw, prog, &c->key);
3335 shader->compiled_once = true;
3336
3337 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3338 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3339 (get_time() - start_time) * 1000);
3340 }
3341 }
3342
3343 return generated;
3344 }
3345
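/**
 * Precompiles the fragment shader at link time using a guessed program key,
 * so that a likely variant is already built before first use.
 */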
3346 bool
3347 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3348 {
3349 struct brw_context *brw = brw_context(ctx);
3350 struct brw_wm_prog_key key;
3351
3352 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3353 return true;
3354
3355 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3356 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3357 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3358 bool program_uses_dfdy = fp->UsesDFdy;
3359
3360 memset(&key, 0, sizeof(key));
3361
3362 if (brw->gen < 6) {
3363 if (fp->UsesKill)
3364 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3365
3366 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3367 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3368
3369 /* Just assume depth testing. */
3370 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3371 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3372 }
3373
3374 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3375 BRW_FS_VARYING_INPUT_MASK) > 16)
3376 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3377
3378 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3379
3380 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3381 for (unsigned i = 0; i < sampler_count; i++) {
3382 if (fp->Base.ShadowSamplers & (1 << i)) {
3383 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3384 key.tex.swizzles[i] =
3385 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3386 } else {
3387 /* Color sampler: assume no swizzling. */
3388 key.tex.swizzles[i] = SWIZZLE_XYZW;
3389 }
3390 }
3391
3392 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3393 key.drawable_height = ctx->DrawBuffer->Height;
3394 }
3395
3396 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3397 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3398 }
3399
3400 key.nr_color_regions = 1;
3401
3402 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3403 * quality of the derivatives is likely to be determined by the driconf
3404 * option.
3405 */
3406 key.high_quality_derivatives = brw->disable_derivative_optimization;
3407
3408 key.program_string_id = bfp->id;
3409
3410 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3411 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3412
3413 bool success = do_wm_prog(brw, prog, bfp, &key);
3414
3415 brw->wm.base.prog_offset = old_prog_offset;
3416 brw->wm.prog_data = old_prog_data;
3417
3418 return success;
3419 }