1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->opcode = BRW_OPCODE_NOP;
59 this->conditional_mod = BRW_CONDITIONAL_NONE;
60
61 this->dst = reg_undef;
62 this->src[0] = reg_undef;
63 this->src[1] = reg_undef;
64 this->src[2] = reg_undef;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68 }
69
70 fs_inst::fs_inst()
71 {
72 init();
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187 ALU2(SEL)
188
189 /** Gen4 predicated IF. */
190 fs_inst *
191 fs_visitor::IF(uint32_t predicate)
192 {
193 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195 return inst;
196 }
197
198 /** Gen6 IF with embedded comparison. */
199 fs_inst *
200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
201 {
202 assert(brw->gen == 6);
203 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
204 reg_null_d, src0, src1);
205 inst->conditional_mod = condition;
206 return inst;
207 }
208
209 /**
210 * CMP: Sets the low bit of the destination channels with the result
211 * of the comparison, while the upper bits are undefined, and updates
212 * the flag register with the packed 16 bits of the result.
213 */
214 fs_inst *
215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
216 {
217 fs_inst *inst;
218
219 /* Take the instruction:
220 *
221 * CMP null<d> src0<f> src1<f>
222 *
223 * Original gen4 does type conversion to the destination type before
224 * comparison, producing garbage results for floating point comparisons.
225 * gen5 does the comparison on the execution type (resolved source types),
226 * so dst type doesn't matter. gen6 does comparison and then uses the
227 * result as if it was the dst type with no conversion, which happens to
228 * mostly work out for float-interpreted-as-int since our comparisons are
229 * for >0, =0, <0.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 exec_list
247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
248 fs_reg varying_offset,
249 uint32_t const_offset)
250 {
251 exec_list instructions;
252 fs_inst *inst;
253
254 /* We have our constant surface use a pitch of 4 bytes, so our index can
255 * be any component of a vector, and then we load 4 contiguous
256 * components starting from that.
257 *
258 * We break down the const_offset to a portion added to the variable
259 * offset and a portion done using reg_offset, which means that if you
260 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
261 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
262 * CSE can later notice that those loads are all the same and eliminate
263 * the redundant ones.
264 */
265 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
266 instructions.push_tail(ADD(vec4_offset,
267 varying_offset, const_offset & ~3));
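/* The low two bits of const_offset select the component within the loaded
 * vec4; they are applied further down via vec4_result.reg_offset.
 */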
268
269 int scale = 1;
270 if (brw->gen == 4 && dispatch_width == 8) {
271 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
272 * u, v, r) as parameters, or we can just use the SIMD16 message
273 * consisting of (header, u). We choose the second, at the cost of a
274 * longer return length.
275 */
276 scale = 2;
277 }
278
279 enum opcode op;
280 if (brw->gen >= 7)
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
282 else
283 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
284 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
285 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
286 inst->regs_written = 4 * scale;
287 instructions.push_tail(inst);
288
289 if (brw->gen < 7) {
290 inst->base_mrf = 13;
291 inst->header_present = true;
292 if (brw->gen == 4)
293 inst->mlen = 3;
294 else
295 inst->mlen = 1 + dispatch_width / 8;
296 }
297
298 vec4_result.reg_offset += (const_offset & 3) * scale;
299 instructions.push_tail(MOV(dst, vec4_result));
300
301 return instructions;
302 }
303
304 /**
305 * A helper that generates a MOV to work around broken hardware SEND
306 * dependency handling.
307 */
308 fs_inst *
309 fs_visitor::DEP_RESOLVE_MOV(int grf)
310 {
311 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
312
313 inst->ir = NULL;
314 inst->annotation = "send dependency resolve";
315
316 /* The caller always wants uncompressed to emit the minimal extra
317 * dependencies, and to avoid having to deal with aligning its regs to 2.
318 */
319 inst->force_uncompressed = true;
320
321 return inst;
322 }
323
324 bool
325 fs_inst::equals(fs_inst *inst)
326 {
327 return (opcode == inst->opcode &&
328 dst.equals(inst->dst) &&
329 src[0].equals(inst->src[0]) &&
330 src[1].equals(inst->src[1]) &&
331 src[2].equals(inst->src[2]) &&
332 saturate == inst->saturate &&
333 predicate == inst->predicate &&
334 conditional_mod == inst->conditional_mod &&
335 mlen == inst->mlen &&
336 base_mrf == inst->base_mrf &&
337 sampler == inst->sampler &&
338 target == inst->target &&
339 eot == inst->eot &&
340 header_present == inst->header_present &&
341 shadow_compare == inst->shadow_compare &&
342 offset == inst->offset);
343 }
344
345 bool
346 fs_inst::overwrites_reg(const fs_reg &reg)
347 {
348 return (reg.file == dst.file &&
349 reg.reg == dst.reg &&
350 reg.reg_offset >= dst.reg_offset &&
351 reg.reg_offset < dst.reg_offset + regs_written);
352 }
353
354 bool
355 fs_inst::is_send_from_grf()
356 {
357 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
358 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
359 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
360 src[1].file == GRF) ||
361 (is_tex() && src[0].file == GRF));
362 }
363
364 bool
365 fs_visitor::can_do_source_mods(fs_inst *inst)
366 {
367 if (brw->gen == 6 && inst->is_math())
368 return false;
369
370 if (inst->is_send_from_grf())
371 return false;
372
373 if (!inst->can_do_source_mods())
374 return false;
375
376 return true;
377 }
378
379 void
380 fs_reg::init()
381 {
382 memset(this, 0, sizeof(*this));
383 this->smear = -1;
384 }
385
386 /** Generic unset register constructor. */
387 fs_reg::fs_reg()
388 {
389 init();
390 this->file = BAD_FILE;
391 }
392
393 /** Immediate value constructor. */
394 fs_reg::fs_reg(float f)
395 {
396 init();
397 this->file = IMM;
398 this->type = BRW_REGISTER_TYPE_F;
399 this->imm.f = f;
400 }
401
402 /** Immediate value constructor. */
403 fs_reg::fs_reg(int32_t i)
404 {
405 init();
406 this->file = IMM;
407 this->type = BRW_REGISTER_TYPE_D;
408 this->imm.i = i;
409 }
410
411 /** Immediate value constructor. */
412 fs_reg::fs_reg(uint32_t u)
413 {
414 init();
415 this->file = IMM;
416 this->type = BRW_REGISTER_TYPE_UD;
417 this->imm.u = u;
418 }
419
420 /** Fixed brw_reg. */
421 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
422 {
423 init();
424 this->file = HW_REG;
425 this->fixed_hw_reg = fixed_hw_reg;
426 this->type = fixed_hw_reg.type;
427 }
428
429 bool
430 fs_reg::equals(const fs_reg &r) const
431 {
432 return (file == r.file &&
433 reg == r.reg &&
434 reg_offset == r.reg_offset &&
435 type == r.type &&
436 negate == r.negate &&
437 abs == r.abs &&
438 !reladdr && !r.reladdr &&
439 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
440 sizeof(fixed_hw_reg)) == 0 &&
441 smear == r.smear &&
442 imm.u == r.imm.u);
443 }
444
445 fs_reg
446 fs_reg::retype(uint32_t type)
447 {
448 fs_reg result = *this;
449 result.type = type;
450 return result;
451 }
452
453 bool
454 fs_reg::is_zero() const
455 {
456 if (file != IMM)
457 return false;
458
459 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
460 }
461
462 bool
463 fs_reg::is_one() const
464 {
465 if (file != IMM)
466 return false;
467
468 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
469 }
470
471 bool
472 fs_reg::is_null() const
473 {
474 return file == HW_REG &&
475 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
476 fixed_hw_reg.nr == BRW_ARF_NULL;
477 }
478
479 bool
480 fs_reg::is_valid_3src() const
481 {
482 return file == GRF || file == UNIFORM;
483 }
484
485 int
486 fs_visitor::type_size(const struct glsl_type *type)
487 {
488 unsigned int size, i;
489
490 switch (type->base_type) {
491 case GLSL_TYPE_UINT:
492 case GLSL_TYPE_INT:
493 case GLSL_TYPE_FLOAT:
494 case GLSL_TYPE_BOOL:
495 return type->components();
496 case GLSL_TYPE_ARRAY:
497 return type_size(type->fields.array) * type->length;
498 case GLSL_TYPE_STRUCT:
499 size = 0;
500 for (i = 0; i < type->length; i++) {
501 size += type_size(type->fields.structure[i].type);
502 }
503 return size;
504 case GLSL_TYPE_SAMPLER:
505 /* Samplers take up no register space, since they're baked in at
506 * link time.
507 */
508 return 0;
509 case GLSL_TYPE_ATOMIC_UINT:
510 return 0;
511 case GLSL_TYPE_VOID:
512 case GLSL_TYPE_ERROR:
513 case GLSL_TYPE_INTERFACE:
514 assert(!"not reached");
515 break;
516 }
517
518 return 0;
519 }
520
521 fs_reg
522 fs_visitor::get_timestamp()
523 {
524 assert(brw->gen >= 7);
525
526 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
527 BRW_ARF_TIMESTAMP,
528 0),
529 BRW_REGISTER_TYPE_UD));
530
531 fs_reg dst = fs_reg(this, glsl_type::uint_type);
532
533 fs_inst *mov = emit(MOV(dst, ts));
534 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
535 * even if it's not enabled in the dispatch.
536 */
537 mov->force_writemask_all = true;
538 mov->force_uncompressed = true;
539
540 /* The caller wants the low 32 bits of the timestamp. Since it's running
541 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
542 * which is plenty of time for our purposes. It is identical across the
543 * EUs, but since it's tracking GPU core speed it will increment at a
544 * varying rate as render P-states change.
545 *
546 * The caller could also check if render P-states have changed (or anything
547 * else that might disrupt timing) by setting smear to 2 and checking if
548 * that field is != 0.
549 */
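/* (Rough arithmetic check: 2^32 cycles at ~1.2 GHz is about 3.6 seconds,
 * consistent with the "~3 seconds" figure above.)
 */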
550 dst.smear = 0;
551
552 return dst;
553 }
554
555 void
556 fs_visitor::emit_shader_time_begin()
557 {
558 current_annotation = "shader time start";
559 shader_start_time = get_timestamp();
560 }
561
562 void
563 fs_visitor::emit_shader_time_end()
564 {
565 current_annotation = "shader time end";
566
567 enum shader_time_shader_type type, written_type, reset_type;
568 if (dispatch_width == 8) {
569 type = ST_FS8;
570 written_type = ST_FS8_WRITTEN;
571 reset_type = ST_FS8_RESET;
572 } else {
573 assert(dispatch_width == 16);
574 type = ST_FS16;
575 written_type = ST_FS16_WRITTEN;
576 reset_type = ST_FS16_RESET;
577 }
578
579 fs_reg shader_end_time = get_timestamp();
580
581 /* Check that there weren't any timestamp reset events (assuming these
582 * were the only two timestamp reads that happened).
583 */
584 fs_reg reset = shader_end_time;
585 reset.smear = 2;
586 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
587 test->conditional_mod = BRW_CONDITIONAL_Z;
588 emit(IF(BRW_PREDICATE_NORMAL));
589
590 push_force_uncompressed();
591 fs_reg start = shader_start_time;
592 start.negate = true;
593 fs_reg diff = fs_reg(this, glsl_type::uint_type);
594 emit(ADD(diff, start, shader_end_time));
595
596 /* If there were no instructions between the two timestamp gets, the diff
597 * is 2 cycles. Remove that overhead, so I can forget about that when
598 * trying to determine the time taken for single instructions.
599 */
600 emit(ADD(diff, diff, fs_reg(-2u)));
601
602 emit_shader_time_write(type, diff);
603 emit_shader_time_write(written_type, fs_reg(1u));
604 emit(BRW_OPCODE_ELSE);
605 emit_shader_time_write(reset_type, fs_reg(1u));
606 emit(BRW_OPCODE_ENDIF);
607
608 pop_force_uncompressed();
609 }
610
611 void
612 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
613 fs_reg value)
614 {
615 int shader_time_index =
616 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
617 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
618
619 fs_reg payload;
620 if (dispatch_width == 8)
621 payload = fs_reg(this, glsl_type::uvec2_type);
622 else
623 payload = fs_reg(this, glsl_type::uint_type);
624
625 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
626 fs_reg(), payload, offset, value));
627 }
628
629 void
630 fs_visitor::fail(const char *format, ...)
631 {
632 va_list va;
633 char *msg;
634
635 if (failed)
636 return;
637
638 failed = true;
639
640 va_start(va, format);
641 msg = ralloc_vasprintf(mem_ctx, format, va);
642 va_end(va);
643 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
644
645 this->fail_msg = msg;
646
647 if (INTEL_DEBUG & DEBUG_WM) {
648 fprintf(stderr, "%s", msg);
649 }
650 }
651
652 fs_inst *
653 fs_visitor::emit(enum opcode opcode)
654 {
655 return emit(fs_inst(opcode));
656 }
657
658 fs_inst *
659 fs_visitor::emit(enum opcode opcode, fs_reg dst)
660 {
661 return emit(fs_inst(opcode, dst));
662 }
663
664 fs_inst *
665 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
666 {
667 return emit(fs_inst(opcode, dst, src0));
668 }
669
670 fs_inst *
671 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
672 {
673 return emit(fs_inst(opcode, dst, src0, src1));
674 }
675
676 fs_inst *
677 fs_visitor::emit(enum opcode opcode, fs_reg dst,
678 fs_reg src0, fs_reg src1, fs_reg src2)
679 {
680 return emit(fs_inst(opcode, dst, src0, src1, src2));
681 }
682
683 void
684 fs_visitor::push_force_uncompressed()
685 {
686 force_uncompressed_stack++;
687 }
688
689 void
690 fs_visitor::pop_force_uncompressed()
691 {
692 force_uncompressed_stack--;
693 assert(force_uncompressed_stack >= 0);
694 }
695
696 /**
697 * Returns true if the instruction has a flag that means it won't
698 * update an entire destination register.
699 *
700 * For example, dead code elimination and live variable analysis want to know
701 * when a write to a variable screens off any preceding values that were in
702 * it.
703 */
704 bool
705 fs_inst::is_partial_write()
706 {
707 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
708 this->force_uncompressed ||
709 this->force_sechalf);
710 }
711
712 int
713 fs_inst::regs_read(fs_visitor *v, int arg)
714 {
715 if (is_tex() && arg == 0 && src[0].file == GRF) {
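/* For SIMD16, the texturing payload is built out of virtual GRFs that are
 * two hardware registers wide, so mlen hardware registers correspond to
 * roughly half as many virtual GRFs, rounded up (the apparent rationale
 * for the (mlen + 1) / 2 below).
 */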
716 if (v->dispatch_width == 16)
717 return (mlen + 1) / 2;
718 else
719 return mlen;
720 }
721 return 1;
722 }
723
724 bool
725 fs_inst::reads_flag()
726 {
727 return predicate;
728 }
729
730 bool
731 fs_inst::writes_flag()
732 {
733 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
734 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
735 }
736
737 /**
738 * Returns how many MRFs an FS opcode will write over.
739 *
740 * Note that this is not the 0 or 1 implied writes in an actual gen
741 * instruction -- the FS opcodes often generate MOVs in addition.
742 */
743 int
744 fs_visitor::implied_mrf_writes(fs_inst *inst)
745 {
746 if (inst->mlen == 0)
747 return 0;
748
749 if (inst->base_mrf == -1)
750 return 0;
751
752 switch (inst->opcode) {
753 case SHADER_OPCODE_RCP:
754 case SHADER_OPCODE_RSQ:
755 case SHADER_OPCODE_SQRT:
756 case SHADER_OPCODE_EXP2:
757 case SHADER_OPCODE_LOG2:
758 case SHADER_OPCODE_SIN:
759 case SHADER_OPCODE_COS:
760 return 1 * dispatch_width / 8;
761 case SHADER_OPCODE_POW:
762 case SHADER_OPCODE_INT_QUOTIENT:
763 case SHADER_OPCODE_INT_REMAINDER:
764 return 2 * dispatch_width / 8;
765 case SHADER_OPCODE_TEX:
766 case FS_OPCODE_TXB:
767 case SHADER_OPCODE_TXD:
768 case SHADER_OPCODE_TXF:
769 case SHADER_OPCODE_TXF_MS:
770 case SHADER_OPCODE_TXF_MCS:
771 case SHADER_OPCODE_TG4:
772 case SHADER_OPCODE_TG4_OFFSET:
773 case SHADER_OPCODE_TXL:
774 case SHADER_OPCODE_TXS:
775 case SHADER_OPCODE_LOD:
776 return 1;
777 case FS_OPCODE_FB_WRITE:
778 return 2;
779 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
780 case SHADER_OPCODE_GEN4_SCRATCH_READ:
781 return 1;
782 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
783 return inst->mlen;
784 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
785 return 2;
786 case SHADER_OPCODE_UNTYPED_ATOMIC:
787 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
788 return 0;
789 default:
790 assert(!"not reached");
791 return inst->mlen;
792 }
793 }
794
795 int
796 fs_visitor::virtual_grf_alloc(int size)
797 {
798 if (virtual_grf_array_size <= virtual_grf_count) {
799 if (virtual_grf_array_size == 0)
800 virtual_grf_array_size = 16;
801 else
802 virtual_grf_array_size *= 2;
803 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
804 virtual_grf_array_size);
805 }
806 virtual_grf_sizes[virtual_grf_count] = size;
807 return virtual_grf_count++;
808 }
809
810 /** Fixed HW reg constructor. */
811 fs_reg::fs_reg(enum register_file file, int reg)
812 {
813 init();
814 this->file = file;
815 this->reg = reg;
816 this->type = BRW_REGISTER_TYPE_F;
817 }
818
819 /** Fixed HW reg constructor. */
820 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
821 {
822 init();
823 this->file = file;
824 this->reg = reg;
825 this->type = type;
826 }
827
828 /** Automatic reg constructor. */
829 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
830 {
831 init();
832
833 this->file = GRF;
834 this->reg = v->virtual_grf_alloc(v->type_size(type));
835 this->reg_offset = 0;
836 this->type = brw_type_for_base_type(type);
837 }
838
839 fs_reg *
840 fs_visitor::variable_storage(ir_variable *var)
841 {
842 return (fs_reg *)hash_table_find(this->variable_ht, var);
843 }
844
845 void
846 import_uniforms_callback(const void *key,
847 void *data,
848 void *closure)
849 {
850 struct hash_table *dst_ht = (struct hash_table *)closure;
851 const fs_reg *reg = (const fs_reg *)data;
852
853 if (reg->file != UNIFORM)
854 return;
855
856 hash_table_insert(dst_ht, data, key);
857 }
858
859 /* For 16-wide, we reuse the uniform setup from the 8-wide dispatch.
860 * This brings in those uniform definitions.
861 */
862 void
863 fs_visitor::import_uniforms(fs_visitor *v)
864 {
865 hash_table_call_foreach(v->variable_ht,
866 import_uniforms_callback,
867 variable_ht);
868 this->params_remap = v->params_remap;
869 this->nr_params_remap = v->nr_params_remap;
870 }
871
872 /* Our support for uniforms is piggy-backed on the struct
873 * gl_fragment_program, because that's where the values actually
874 * get stored, rather than in some global gl_shader_program uniform
875 * store.
876 */
877 void
878 fs_visitor::setup_uniform_values(ir_variable *ir)
879 {
880 int namelen = strlen(ir->name);
881
882 /* The data for our (non-builtin) uniforms is stored in a series of
883 * gl_uniform_driver_storage structs for each subcomponent that
884 * glGetUniformLocation() could name. We know it's been set up in the same
885 * order we'd walk the type, so walk the list of storage and find anything
886 * with our name, or the prefix of a component that starts with our name.
887 */
888 unsigned params_before = c->prog_data.nr_params;
889 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
890 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
891
892 if (strncmp(ir->name, storage->name, namelen) != 0 ||
893 (storage->name[namelen] != 0 &&
894 storage->name[namelen] != '.' &&
895 storage->name[namelen] != '[')) {
896 continue;
897 }
898
899 unsigned slots = storage->type->component_slots();
900 if (storage->array_elements)
901 slots *= storage->array_elements;
902
903 for (unsigned i = 0; i < slots; i++) {
904 c->prog_data.param[c->prog_data.nr_params++] =
905 &storage->storage[i].f;
906 }
907 }
908
909 /* Make sure we actually initialized the right amount of stuff here. */
910 assert(params_before + ir->type->component_slots() ==
911 c->prog_data.nr_params);
912 (void)params_before;
913 }
914
915
916 /* Our support for builtin uniforms is even scarier than non-builtin.
917 * It sits on top of the PROG_STATE_VAR parameters that are
918 * automatically updated from GL context state.
919 */
920 void
921 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
922 {
923 const ir_state_slot *const slots = ir->state_slots;
924 assert(ir->state_slots != NULL);
925
926 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
927 /* This state reference has already been setup by ir_to_mesa, but we'll
928 * get the same index back here.
929 */
930 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
931 (gl_state_index *)slots[i].tokens);
932
933 /* Add each of the unique swizzles of the element as a parameter.
934 * This'll end up matching the expected layout of the
935 * array/matrix/structure we're trying to fill in.
936 */
937 int last_swiz = -1;
938 for (unsigned int j = 0; j < 4; j++) {
939 int swiz = GET_SWZ(slots[i].swizzle, j);
940 if (swiz == last_swiz)
941 break;
942 last_swiz = swiz;
943
944 c->prog_data.param[c->prog_data.nr_params++] =
945 &fp->Base.Parameters->ParameterValues[index][swiz].f;
946 }
947 }
948 }
949
950 fs_reg *
951 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
952 {
953 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
954 fs_reg wpos = *reg;
955 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
956
957 /* gl_FragCoord.x */
958 if (ir->pixel_center_integer) {
959 emit(MOV(wpos, this->pixel_x));
960 } else {
961 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
962 }
963 wpos.reg_offset++;
964
965 /* gl_FragCoord.y */
966 if (!flip && ir->pixel_center_integer) {
967 emit(MOV(wpos, this->pixel_y));
968 } else {
969 fs_reg pixel_y = this->pixel_y;
970 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
971
972 if (flip) {
973 pixel_y.negate = true;
974 offset += c->key.drawable_height - 1.0;
975 }
976
977 emit(ADD(wpos, pixel_y, fs_reg(offset)));
978 }
979 wpos.reg_offset++;
980
981 /* gl_FragCoord.z */
982 if (brw->gen >= 6) {
983 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
984 } else {
985 emit(FS_OPCODE_LINTERP, wpos,
986 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
987 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
988 interp_reg(VARYING_SLOT_POS, 2));
989 }
990 wpos.reg_offset++;
991
992 /* gl_FragCoord.w: Already set up in emit_interpolation */
993 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
994
995 return reg;
996 }
997
998 fs_inst *
999 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1000 glsl_interp_qualifier interpolation_mode,
1001 bool is_centroid)
1002 {
1003 brw_wm_barycentric_interp_mode barycoord_mode;
1004 if (brw->gen >= 6) {
1005 if (is_centroid) {
1006 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1007 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1008 else
1009 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1010 } else {
1011 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1012 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1013 else
1014 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1015 }
1016 } else {
1017 /* On Ironlake and below, there is only one interpolation mode.
1018 * Centroid interpolation doesn't mean anything on this hardware --
1019 * there is no multisampling.
1020 */
1021 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1022 }
1023 return emit(FS_OPCODE_LINTERP, attr,
1024 this->delta_x[barycoord_mode],
1025 this->delta_y[barycoord_mode], interp);
1026 }
1027
1028 fs_reg *
1029 fs_visitor::emit_general_interpolation(ir_variable *ir)
1030 {
1031 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1032 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1033 fs_reg attr = *reg;
1034
1035 unsigned int array_elements;
1036 const glsl_type *type;
1037
1038 if (ir->type->is_array()) {
1039 array_elements = ir->type->length;
1040 if (array_elements == 0) {
1041 fail("dereferenced array '%s' has length 0\n", ir->name);
1042 }
1043 type = ir->type->fields.array;
1044 } else {
1045 array_elements = 1;
1046 type = ir->type;
1047 }
1048
1049 glsl_interp_qualifier interpolation_mode =
1050 ir->determine_interpolation_mode(c->key.flat_shade);
1051
1052 int location = ir->location;
1053 for (unsigned int i = 0; i < array_elements; i++) {
1054 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1055 if (c->prog_data.urb_setup[location] == -1) {
1056 /* If there's no incoming setup data for this slot, don't
1057 * emit interpolation for it.
1058 */
1059 attr.reg_offset += type->vector_elements;
1060 location++;
1061 continue;
1062 }
1063
1064 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1065 /* Constant interpolation (flat shading) case. The SF has
1066 * handed us defined values in only the constant offset
1067 * field of the setup reg.
1068 */
1069 for (unsigned int k = 0; k < type->vector_elements; k++) {
1070 struct brw_reg interp = interp_reg(location, k);
1071 interp = suboffset(interp, 3);
1072 interp.type = reg->type;
1073 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1074 attr.reg_offset++;
1075 }
1076 } else {
1077 /* Smooth/noperspective interpolation case. */
1078 for (unsigned int k = 0; k < type->vector_elements; k++) {
1079 /* FINISHME: At some point we probably want to push
1080 * this farther by giving similar treatment to the
1081 * other potentially constant components of the
1082 * attribute, as well as making brw_vs_constval.c
1083 * handle varyings other than gl_TexCoord.
1084 */
1085 struct brw_reg interp = interp_reg(location, k);
1086 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1087 ir->data.centroid);
1088 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1089 /* Get the pixel/sample mask into f0 so that we know
1090 * which pixels are lit. Then, for each channel that is
1091 * unlit, replace the centroid data with non-centroid
1092 * data.
1093 */
1094 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1095 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1096 interpolation_mode, false);
1097 inst->predicate = BRW_PREDICATE_NORMAL;
1098 inst->predicate_inverse = true;
1099 }
1100 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1101 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1102 }
1103 attr.reg_offset++;
1104 }
1105
1106 }
1107 location++;
1108 }
1109 }
1110
1111 return reg;
1112 }
1113
1114 fs_reg *
1115 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1116 {
1117 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1118
1119 /* The frontfacing comes in as a bit in the thread payload. */
1120 if (brw->gen >= 6) {
1121 emit(BRW_OPCODE_ASR, *reg,
1122 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1123 fs_reg(15));
1124 emit(BRW_OPCODE_NOT, *reg, *reg);
1125 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1126 } else {
1127 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1128 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1129 * us front face
1130 */
1131 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1132 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1133 }
1134
1135 return reg;
1136 }
1137
1138 void
1139 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1140 {
1141 assert(dst.type == BRW_REGISTER_TYPE_F);
1142
1143 if (c->key.compute_pos_offset) {
1144 /* Convert int_sample_pos to floating point */
1145 emit(MOV(dst, int_sample_pos));
1146 /* Scale to the range [0, 1] */
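/* (The payload bytes appear to be subpixel offsets in 1/16-pixel units,
 * i.e. values 0..15, so multiplying by 1/16 maps them into [0, 1).)
 */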
1147 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1148 }
1149 else {
1150 /* From ARB_sample_shading specification:
1151 * "When rendering to a non-multisample buffer, or if multisample
1152 * rasterization is disabled, gl_SamplePosition will always be
1153 * (0.5, 0.5)."
1154 */
1155 emit(MOV(dst, fs_reg(0.5f)));
1156 }
1157 }
1158
1159 fs_reg *
1160 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1161 {
1162 assert(brw->gen >= 6);
1163 assert(ir->type == glsl_type::vec2_type);
1164
1165 this->current_annotation = "compute sample position";
1166 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1167 fs_reg pos = *reg;
1168 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1169 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1170
1171 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1172 * mode will be enabled.
1173 *
1174 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1175 * R31.1:0 Position Offset X/Y for Slot[3:0]
1176 * R31.3:2 Position Offset X/Y for Slot[7:4]
1177 * .....
1178 *
1179 * The X, Y sample positions come in as bytes in thread payload. So, read
1180 * the positions using vstride=16, width=8, hstride=2.
1181 */
1182 struct brw_reg sample_pos_reg =
1183 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1184 BRW_REGISTER_TYPE_B), 16, 8, 2);
1185
1186 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1187 if (dispatch_width == 16) {
1188 int_sample_x.sechalf = true;
1189 fs_inst *inst = emit(MOV(int_sample_x,
1190 fs_reg(suboffset(sample_pos_reg, 16))));
1191 inst->force_sechalf = true;
1192 int_sample_x.sechalf = false;
1193 }
1194 /* Compute gl_SamplePosition.x */
1195 compute_sample_position(pos, int_sample_x);
1196 pos.reg_offset++;
1197 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1198 if (dispatch_width == 16) {
1199 int_sample_y.sechalf = true;
1200 fs_inst *inst = emit(MOV(int_sample_y,
1201 fs_reg(suboffset(sample_pos_reg, 17))));
1202 inst->force_sechalf = true;
1203 int_sample_y.sechalf = false;
1204 }
1205 /* Compute gl_SamplePosition.y */
1206 compute_sample_position(pos, int_sample_y);
1207 return reg;
1208 }
1209
1210 fs_reg *
1211 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1212 {
1213 assert(brw->gen >= 6);
1214
1215 this->current_annotation = "compute sample id";
1216 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1217
1218 if (c->key.compute_sample_id) {
1219 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1220 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1221 t2.type = BRW_REGISTER_TYPE_UW;
1222
1223 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1224 * 8x multisampling, subspan 0 will represent sample N (where N
1225 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1226 * 7. We can find the value of N by looking at R0.0 bits 7:6
1227 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1228 * (since samples are always delivered in pairs). That is, we
1229 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1230 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1231 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1232 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1233 * populating a temporary variable with the sequence (0, 1, 2, 3),
1234 * and then reading from it using vstride=1, width=4, hstride=0.
1235 * These computations hold good for 4x multisampling as well.
1236 */
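/* Worked example (assuming R0.0 bits 7:6 hold SSPI == 2, i.e. this group
 * of subspans starts at sample 4): (R0.0 & 0xc0) >> 5 == 0x80 >> 5 == 4,
 * and adding the (0, 0, 0, 0, 1, 1, 1, 1) sequence yields sample IDs
 * 4, 4, 4, 4, 5, 5, 5, 5 for the SIMD8 channels.
 */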
1237 emit(BRW_OPCODE_AND, t1,
1238 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1239 fs_reg(brw_imm_d(0xc0)));
1240 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1241 /* This works for both SIMD8 and SIMD16 */
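/* (brw_imm_v packs signed 4-bit values, so 0x3210 is the literal
 * (0, 1, 2, 3) sequence mentioned in the comment above.)
 */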
1242 emit(MOV(t2, brw_imm_v(0x3210)));
1243 /* This special instruction takes care of setting vstride=1,
1244 * width=4, hstride=0 of t2 during an ADD instruction.
1245 */
1246 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1247 } else {
1248 /* As per GL_ARB_sample_shading specification:
1249 * "When rendering to a non-multisample buffer, or if multisample
1250 * rasterization is disabled, gl_SampleID will always be zero."
1251 */
1252 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1253 }
1254
1255 return reg;
1256 }
1257
1258 fs_reg
1259 fs_visitor::fix_math_operand(fs_reg src)
1260 {
1261 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1262 * might be able to do better by doing execsize = 1 math and then
1263 * expanding that result out, but we would need to be careful with
1264 * masking.
1265 *
1266 * The hardware ignores source modifiers (negate and abs) on math
1267 * instructions, so we also move to a temp to set those up.
1268 */
1269 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1270 !src.abs && !src.negate)
1271 return src;
1272
1273 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1274 * operands to math
1275 */
1276 if (brw->gen >= 7 && src.file != IMM)
1277 return src;
1278
1279 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1280 expanded.type = src.type;
1281 emit(BRW_OPCODE_MOV, expanded, src);
1282 return expanded;
1283 }
1284
1285 fs_inst *
1286 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1287 {
1288 switch (opcode) {
1289 case SHADER_OPCODE_RCP:
1290 case SHADER_OPCODE_RSQ:
1291 case SHADER_OPCODE_SQRT:
1292 case SHADER_OPCODE_EXP2:
1293 case SHADER_OPCODE_LOG2:
1294 case SHADER_OPCODE_SIN:
1295 case SHADER_OPCODE_COS:
1296 break;
1297 default:
1298 assert(!"not reached: bad math opcode");
1299 return NULL;
1300 }
1301
1302 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1303 * might be able to do better by doing execsize = 1 math and then
1304 * expanding that result out, but we would need to be careful with
1305 * masking.
1306 *
1307 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1308 * instructions, so we also move to a temp to set those up.
1309 */
1310 if (brw->gen >= 6)
1311 src = fix_math_operand(src);
1312
1313 fs_inst *inst = emit(opcode, dst, src);
1314
1315 if (brw->gen < 6) {
1316 inst->base_mrf = 2;
1317 inst->mlen = dispatch_width / 8;
1318 }
1319
1320 return inst;
1321 }
1322
1323 fs_inst *
1324 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1325 {
1326 int base_mrf = 2;
1327 fs_inst *inst;
1328
1329 switch (opcode) {
1330 case SHADER_OPCODE_INT_QUOTIENT:
1331 case SHADER_OPCODE_INT_REMAINDER:
1332 if (brw->gen >= 7 && dispatch_width == 16)
1333 fail("16-wide INTDIV unsupported\n");
1334 break;
1335 case SHADER_OPCODE_POW:
1336 break;
1337 default:
1338 assert(!"not reached: unsupported binary math opcode.");
1339 return NULL;
1340 }
1341
1342 if (brw->gen >= 6) {
1343 src0 = fix_math_operand(src0);
1344 src1 = fix_math_operand(src1);
1345
1346 inst = emit(opcode, dst, src0, src1);
1347 } else {
1348 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1349 * "Message Payload":
1350 *
1351 * "Operand0[7]. For the INT DIV functions, this operand is the
1352 * denominator."
1353 * ...
1354 * "Operand1[7]. For the INT DIV functions, this operand is the
1355 * numerator."
1356 */
1357 bool is_int_div = opcode != SHADER_OPCODE_POW;
1358 fs_reg &op0 = is_int_div ? src1 : src0;
1359 fs_reg &op1 = is_int_div ? src0 : src1;
1360
1361 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1362 inst = emit(opcode, dst, op0, reg_null_f);
1363
1364 inst->base_mrf = base_mrf;
1365 inst->mlen = 2 * dispatch_width / 8;
1366 }
1367 return inst;
1368 }
1369
1370 void
1371 fs_visitor::assign_curb_setup()
1372 {
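/* Each push constant register holds 8 float parameters, so round
 * nr_params up to a whole register.
 */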
1373 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1374 if (dispatch_width == 8) {
1375 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1376 } else {
1377 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1378 }
1379
1380 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1381 foreach_list(node, &this->instructions) {
1382 fs_inst *inst = (fs_inst *)node;
1383
1384 for (unsigned int i = 0; i < 3; i++) {
1385 if (inst->src[i].file == UNIFORM) {
1386 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1387 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1388 constant_nr / 8,
1389 constant_nr % 8);
1390
1391 inst->src[i].file = HW_REG;
1392 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1393 }
1394 }
1395 }
1396 }
1397
1398 void
1399 fs_visitor::calculate_urb_setup()
1400 {
1401 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1402 c->prog_data.urb_setup[i] = -1;
1403 }
1404
1405 int urb_next = 0;
1406 /* Figure out where each of the incoming setup attributes lands. */
1407 if (brw->gen >= 6) {
1408 if (_mesa_bitcount_64(fp->Base.InputsRead &
1409 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1410 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1411 * first 16 varying inputs, so we can put them wherever we want.
1412 * Just put them in order.
1413 *
1414 * This is useful because it means that (a) inputs not used by the
1415 * fragment shader won't take up valuable register space, and (b) we
1416 * won't have to recompile the fragment shader if it gets paired with
1417 * a different vertex (or geometry) shader.
1418 */
1419 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1420 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1421 BITFIELD64_BIT(i)) {
1422 c->prog_data.urb_setup[i] = urb_next++;
1423 }
1424 }
1425 } else {
1426 /* We have enough input varyings that the SF/SBE pipeline stage can't
1427 * arbitrarily rearrange them to suit our whim; we have to put them
1428 * in an order that matches the output of the previous pipeline stage
1429 * (geometry or vertex shader).
1430 */
1431 struct brw_vue_map prev_stage_vue_map;
1432 brw_compute_vue_map(brw, &prev_stage_vue_map,
1433 c->key.input_slots_valid);
1434 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1435 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1436 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1437 slot++) {
1438 int varying = prev_stage_vue_map.slot_to_varying[slot];
1439 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1440 * unused.
1441 */
1442 if (varying != BRW_VARYING_SLOT_COUNT &&
1443 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1444 BITFIELD64_BIT(varying))) {
1445 c->prog_data.urb_setup[varying] = slot - first_slot;
1446 }
1447 }
1448 urb_next = prev_stage_vue_map.num_slots - first_slot;
1449 }
1450 } else {
1451 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1452 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1453 /* Point size is packed into the header, not as a general attribute */
1454 if (i == VARYING_SLOT_PSIZ)
1455 continue;
1456
1457 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1458 /* The back color slot is skipped when the front color is
1459 * also written to. In addition, some slots can be
1460 * written in the vertex shader and not read in the
1461 * fragment shader. So the register number must always be
1462 * incremented, mapped or not.
1463 */
1464 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1465 c->prog_data.urb_setup[i] = urb_next;
1466 urb_next++;
1467 }
1468 }
1469
1470 /*
1471 * It's an FS-only attribute, and the SF thread did the interpolation
1472 * for this attribute, so count it here, too.
1473 *
1474 * See compile_sf_prog() for more info.
1475 */
1476 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1477 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1478 }
1479
1480 c->prog_data.num_varying_inputs = urb_next;
1481 }
1482
1483 void
1484 fs_visitor::assign_urb_setup()
1485 {
1486 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1487
1488 /* Offset all the urb_setup[] index by the actual position of the
1489 * setup regs, now that the location of the constants has been chosen.
1490 */
1491 foreach_list(node, &this->instructions) {
1492 fs_inst *inst = (fs_inst *)node;
1493
1494 if (inst->opcode == FS_OPCODE_LINTERP) {
1495 assert(inst->src[2].file == HW_REG);
1496 inst->src[2].fixed_hw_reg.nr += urb_start;
1497 }
1498
1499 if (inst->opcode == FS_OPCODE_CINTERP) {
1500 assert(inst->src[0].file == HW_REG);
1501 inst->src[0].fixed_hw_reg.nr += urb_start;
1502 }
1503 }
1504
1505 /* Each attribute is 4 setup channels, each of which is half a reg. */
1506 this->first_non_payload_grf =
1507 urb_start + c->prog_data.num_varying_inputs * 2;
1508 }
1509
1510 /**
1511 * Split large virtual GRFs into separate components if we can.
1512 *
1513 * This is mostly duplicated with what brw_fs_vector_splitting does,
1514 * but that's really conservative because it's afraid of doing
1515 * splitting that doesn't result in real progress after the rest of
1516 * the optimization phases, which would cause infinite looping in
1517 * optimization. We can do it once here, safely. This also has the
1518 * opportunity to split interpolated values, or maybe even uniforms,
1519 * which we don't have at the IR level.
1520 *
1521 * We want to split, because virtual GRFs are what we register
1522 * allocate and spill (due to contiguousness requirements for some
1523 * instructions), and they're what we naturally generate in the
1524 * codegen process, but most virtual GRFs don't actually need to be
1525 * contiguous sets of GRFs. If we split, we'll end up with reduced
1526 * live intervals and better dead code elimination and coalescing.
1527 */
1528 void
1529 fs_visitor::split_virtual_grfs()
1530 {
1531 int num_vars = this->virtual_grf_count;
1532 bool split_grf[num_vars];
1533 int new_virtual_grf[num_vars];
1534
1535 /* Try to split anything larger than one register. */
1536 for (int i = 0; i < num_vars; i++) {
1537 if (this->virtual_grf_sizes[i] != 1)
1538 split_grf[i] = true;
1539 else
1540 split_grf[i] = false;
1541 }
1542
1543 if (brw->has_pln &&
1544 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1545 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1546 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1547 * Gen6, that was the only supported interpolation mode, and since Gen6,
1548 * delta_x and delta_y are in fixed hardware registers.
1549 */
1550 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1551 false;
1552 }
1553
1554 foreach_list(node, &this->instructions) {
1555 fs_inst *inst = (fs_inst *)node;
1556
1557 /* If there's a SEND message that requires contiguous destination
1558 * registers, no splitting is allowed.
1559 */
1560 if (inst->regs_written > 1) {
1561 split_grf[inst->dst.reg] = false;
1562 }
1563
1564 /* If we're sending from a GRF, don't split it, on the assumption that
1565 * the send is reading the whole thing.
1566 */
1567 if (inst->is_send_from_grf()) {
1568 for (int i = 0; i < 3; i++) {
1569 if (inst->src[i].file == GRF) {
1570 split_grf[inst->src[i].reg] = false;
1571 }
1572 }
1573 }
1574 }
1575
1576 /* Allocate new space for split regs. Note that the virtual
1577 * numbers will be contiguous.
1578 */
1579 for (int i = 0; i < num_vars; i++) {
1580 if (split_grf[i]) {
1581 new_virtual_grf[i] = virtual_grf_alloc(1);
1582 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1583 int reg = virtual_grf_alloc(1);
1584 assert(reg == new_virtual_grf[i] + j - 1);
1585 (void) reg;
1586 }
1587 this->virtual_grf_sizes[i] = 1;
1588 }
1589 }
1590
1591 foreach_list(node, &this->instructions) {
1592 fs_inst *inst = (fs_inst *)node;
1593
1594 if (inst->dst.file == GRF &&
1595 split_grf[inst->dst.reg] &&
1596 inst->dst.reg_offset != 0) {
1597 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1598 inst->dst.reg_offset - 1);
1599 inst->dst.reg_offset = 0;
1600 }
1601 for (int i = 0; i < 3; i++) {
1602 if (inst->src[i].file == GRF &&
1603 split_grf[inst->src[i].reg] &&
1604 inst->src[i].reg_offset != 0) {
1605 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1606 inst->src[i].reg_offset - 1);
1607 inst->src[i].reg_offset = 0;
1608 }
1609 }
1610 }
1611 invalidate_live_intervals();
1612 }
1613
1614 /**
1615 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1616 *
1617 * During code generation, we create tons of temporary variables, many of
1618 * which get immediately killed and are never used again. Yet, in later
1619 * optimization and analysis passes, such as compute_live_intervals, we need
1620 * to loop over all the virtual GRFs. Compacting them can save a lot of
1621 * overhead.
1622 */
1623 void
1624 fs_visitor::compact_virtual_grfs()
1625 {
1626 /* Mark which virtual GRFs are used, and count how many. */
1627 int remap_table[this->virtual_grf_count];
1628 memset(remap_table, -1, sizeof(remap_table));
1629
1630 foreach_list(node, &this->instructions) {
1631 const fs_inst *inst = (const fs_inst *) node;
1632
1633 if (inst->dst.file == GRF)
1634 remap_table[inst->dst.reg] = 0;
1635
1636 for (int i = 0; i < 3; i++) {
1637 if (inst->src[i].file == GRF)
1638 remap_table[inst->src[i].reg] = 0;
1639 }
1640 }
1641
1642 /* In addition to registers used in instructions, fs_visitor keeps
1643 * direct references to certain special values which must be patched:
1644 */
1645 fs_reg *special[] = {
1646 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1647 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1648 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1649 &delta_x[0], &delta_x[1], &delta_x[2],
1650 &delta_x[3], &delta_x[4], &delta_x[5],
1651 &delta_y[0], &delta_y[1], &delta_y[2],
1652 &delta_y[3], &delta_y[4], &delta_y[5],
1653 };
1654 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1655 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1656
1657 /* Treat all special values as used, to be conservative */
1658 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1659 if (special[i]->file == GRF)
1660 remap_table[special[i]->reg] = 0;
1661 }
1662
1663 /* Compact the GRF arrays. */
1664 int new_index = 0;
1665 for (int i = 0; i < this->virtual_grf_count; i++) {
1666 if (remap_table[i] != -1) {
1667 remap_table[i] = new_index;
1668 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1669 invalidate_live_intervals();
1670 ++new_index;
1671 }
1672 }
1673
1674 this->virtual_grf_count = new_index;
1675
1676 /* Patch all the instructions to use the newly renumbered registers */
1677 foreach_list(node, &this->instructions) {
1678 fs_inst *inst = (fs_inst *) node;
1679
1680 if (inst->dst.file == GRF)
1681 inst->dst.reg = remap_table[inst->dst.reg];
1682
1683 for (int i = 0; i < 3; i++) {
1684 if (inst->src[i].file == GRF)
1685 inst->src[i].reg = remap_table[inst->src[i].reg];
1686 }
1687 }
1688
1689 /* Patch all the references to special values */
1690 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1691 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1692 special[i]->reg = remap_table[special[i]->reg];
1693 }
1694 }
1695
1696 bool
1697 fs_visitor::remove_dead_constants()
1698 {
1699 if (dispatch_width == 8) {
1700 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1701 this->nr_params_remap = c->prog_data.nr_params;
1702
1703 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1704 this->params_remap[i] = -1;
1705
1706 /* Find which params are still in use. */
1707 foreach_list(node, &this->instructions) {
1708 fs_inst *inst = (fs_inst *)node;
1709
1710 for (int i = 0; i < 3; i++) {
1711 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1712
1713 if (inst->src[i].file != UNIFORM)
1714 continue;
1715
1716 /* Section 5.11 of the OpenGL 4.3 spec says:
1717 *
1718 * "Out-of-bounds reads return undefined values, which include
1719 * values from other variables of the active program or zero."
1720 */
1721 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1722 constant_nr = 0;
1723 }
1724
1725 /* For now, set this to non-negative. We'll give it the
1726 * actual new number in a moment, in order to keep the
1727 * register numbers nicely ordered.
1728 */
1729 this->params_remap[constant_nr] = 0;
1730 }
1731 }
1732
1733 /* Figure out what the new numbers for the params will be. At some
1734 * point when we're doing uniform array access, we're going to want
1735 * to keep the distinction between .reg and .reg_offset, but for
1736 * now we don't care.
1737 */
1738 unsigned int new_nr_params = 0;
1739 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1740 if (this->params_remap[i] != -1) {
1741 this->params_remap[i] = new_nr_params++;
1742 }
1743 }
1744
1745 /* Update the list of params to be uploaded to match our new numbering. */
1746 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1747 int remapped = this->params_remap[i];
1748
1749 if (remapped == -1)
1750 continue;
1751
1752 c->prog_data.param[remapped] = c->prog_data.param[i];
1753 }
1754
1755 c->prog_data.nr_params = new_nr_params;
1756 } else {
1757 /* This should have been generated in the 8-wide pass already. */
1758 assert(this->params_remap);
1759 }
1760
1761 /* Now do the renumbering of the shader to remove unused params. */
1762 foreach_list(node, &this->instructions) {
1763 fs_inst *inst = (fs_inst *)node;
1764
1765 for (int i = 0; i < 3; i++) {
1766 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1767
1768 if (inst->src[i].file != UNIFORM)
1769 continue;
1770
1771 /* As above, alias out-of-bounds accesses to constant 0. */
1772 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1773 constant_nr = 0;
1774 }
1775 assert(this->params_remap[constant_nr] != -1);
1776 inst->src[i].reg = this->params_remap[constant_nr];
1777 inst->src[i].reg_offset = 0;
1778 }
1779 }
1780
1781 return true;
1782 }
1783
1784 /*
1785 * Implements array access of uniforms by inserting a
1786 * PULL_CONSTANT_LOAD instruction.
1787 *
1788 * Unlike temporary GRF array access (where we don't support it due to
1789 * the difficulty of doing relative addressing on instruction
1790 * destinations), we could potentially do array access of uniforms
1791 * that were loaded in GRF space as push constants. In real-world
1792 * usage we've seen, though, the arrays being used are always larger
1793 * than we could load as push constants, so just always move all
1794 * uniform array access out to a pull constant buffer.
1795 */
1796 void
1797 fs_visitor::move_uniform_array_access_to_pull_constants()
1798 {
1799 int pull_constant_loc[c->prog_data.nr_params];
1800
1801 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1802 pull_constant_loc[i] = -1;
1803 }
1804
1805 /* Walk through and find array access of uniforms. Put a copy of that
1806 * uniform in the pull constant buffer.
1807 *
1808 * Note that we don't move constant-indexed accesses to arrays. No
1809 * testing has been done of the performance impact of this choice.
1810 */
1811 foreach_list_safe(node, &this->instructions) {
1812 fs_inst *inst = (fs_inst *)node;
1813
1814 for (int i = 0 ; i < 3; i++) {
1815 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1816 continue;
1817
1818 int uniform = inst->src[i].reg;
1819
1820 /* If this array isn't already present in the pull constant buffer,
1821 * add it.
1822 */
1823 if (pull_constant_loc[uniform] == -1) {
1824 const float **values = &c->prog_data.param[uniform];
1825
1826 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1827
1828 assert(param_size[uniform]);
1829
1830 for (int j = 0; j < param_size[uniform]; j++) {
1831 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1832 values[j];
1833 }
1834 }
1835
1836 /* Set up the annotation tracking for new generated instructions. */
1837 base_ir = inst->ir;
1838 current_annotation = inst->annotation;
1839
1840 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1841 fs_reg temp = fs_reg(this, glsl_type::float_type);
1842 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1843 surf_index,
1844 *inst->src[i].reladdr,
1845 pull_constant_loc[uniform] +
1846 inst->src[i].reg_offset);
1847 inst->insert_before(&list);
1848
1849 inst->src[i].file = temp.file;
1850 inst->src[i].reg = temp.reg;
1851 inst->src[i].reg_offset = temp.reg_offset;
1852 inst->src[i].reladdr = NULL;
1853 }
1854 }
1855 }
1856
1857 /**
1858 * Choose accesses from the UNIFORM file to demote to using the pull
1859 * constant buffer.
1860 *
1861 * We allow a fragment shader to use more than the minimum required
1862 * maximum number of fragment shader uniform components (64). If there
1863 * are too many, they'd fill up all of the register space.
1864 * So, this will push some of them out to the pull constant buffer and
1865 * update the program to load them.
1866 */
1867 void
1868 fs_visitor::setup_pull_constants()
1869 {
1870 /* Only allow 16 registers (128 uniform components) as push constants. */
1871 unsigned int max_uniform_components = 16 * 8;
1872 if (c->prog_data.nr_params <= max_uniform_components)
1873 return;
1874
1875 if (dispatch_width == 16) {
1876 fail("Pull constants not supported in 16-wide\n");
1877 return;
1878 }
1879
1880 /* Just demote the end of the list. We could probably do better
1881 * here, demoting things that are rarely used in the program first.
1882 */
1883 unsigned int pull_uniform_base = max_uniform_components;
1884
1885 int pull_constant_loc[c->prog_data.nr_params];
1886 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1887 if (i < pull_uniform_base) {
1888 pull_constant_loc[i] = -1;
1889 } else {
1890 pull_constant_loc[i] = -1;
1891 /* If our constant is already being uploaded for reladdr purposes,
1892 * reuse it.
1893 */
1894 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1895 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1896 pull_constant_loc[i] = j;
1897 break;
1898 }
1899 }
1900 if (pull_constant_loc[i] == -1) {
1901 int pull_index = c->prog_data.nr_pull_params++;
1902 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1903             pull_constant_loc[i] = pull_index;
1904 }
1905 }
1906 }
1907 c->prog_data.nr_params = pull_uniform_base;
1908
1909 foreach_list(node, &this->instructions) {
1910 fs_inst *inst = (fs_inst *)node;
1911
1912 for (int i = 0; i < 3; i++) {
1913 if (inst->src[i].file != UNIFORM)
1914 continue;
1915
1916 int pull_index = pull_constant_loc[inst->src[i].reg +
1917 inst->src[i].reg_offset];
1918 if (pull_index == -1)
1919 continue;
1920
1921 assert(!inst->src[i].reladdr);
1922
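         /* The pull constant load fetches an aligned vec4, so round the byte
          * offset down to a 16-byte boundary and use .smear to select the
          * component within that vec4.
          */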
1923 fs_reg dst = fs_reg(this, glsl_type::float_type);
1924 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1925 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1926 fs_inst *pull =
1927 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1928 dst, index, offset);
1929 pull->ir = inst->ir;
1930 pull->annotation = inst->annotation;
1931
1932 inst->insert_before(pull);
1933
1934 inst->src[i].file = GRF;
1935 inst->src[i].reg = dst.reg;
1936 inst->src[i].reg_offset = 0;
1937 inst->src[i].smear = pull_index & 3;
1938 }
1939 }
1940 }
1941
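/**
 * Performs simple algebraic simplifications on the IR, e.g. rewriting
 * "mul dst, src0, 1.0f" as "mov dst, src0" and "add dst, src0, 0.0f" as
 * "mov dst, src0".
 */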
1942 bool
1943 fs_visitor::opt_algebraic()
1944 {
1945 bool progress = false;
1946
1947 foreach_list(node, &this->instructions) {
1948 fs_inst *inst = (fs_inst *)node;
1949
1950 switch (inst->opcode) {
1951 case BRW_OPCODE_MUL:
1952 if (inst->src[1].file != IMM)
1953 continue;
1954
1955 /* a * 1.0 = a */
1956 if (inst->src[1].is_one()) {
1957 inst->opcode = BRW_OPCODE_MOV;
1958 inst->src[1] = reg_undef;
1959 progress = true;
1960 break;
1961 }
1962
1963 /* a * 0.0 = 0.0 */
1964 if (inst->src[1].is_zero()) {
1965 inst->opcode = BRW_OPCODE_MOV;
1966 inst->src[0] = inst->src[1];
1967 inst->src[1] = reg_undef;
1968 progress = true;
1969 break;
1970 }
1971
1972 break;
1973 case BRW_OPCODE_ADD:
1974 if (inst->src[1].file != IMM)
1975 continue;
1976
1977 /* a + 0.0 = a */
1978 if (inst->src[1].is_zero()) {
1979 inst->opcode = BRW_OPCODE_MOV;
1980 inst->src[1] = reg_undef;
1981 progress = true;
1982 break;
1983 }
1984 break;
1985 case BRW_OPCODE_OR:
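         /* a | a = a */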
1986 if (inst->src[0].equals(inst->src[1])) {
1987 inst->opcode = BRW_OPCODE_MOV;
1988 inst->src[1] = reg_undef;
1989 progress = true;
1990 break;
1991 }
1992 break;
1993 case BRW_OPCODE_SEL:
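         /* With saturate, a SEL against an immediate can be redundant: for
          * example, sel.sat.l dst, src0, 1.0f computes sat(min(src0, 1.0)),
          * which is just sat(src0), so the SEL reduces to a saturated MOV.
          * The same holds for GE/G against an immediate <= 0.0.
          */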
1994 if (inst->saturate && inst->src[1].file == IMM) {
1995 switch (inst->conditional_mod) {
1996 case BRW_CONDITIONAL_LE:
1997 case BRW_CONDITIONAL_L:
1998 switch (inst->src[1].type) {
1999 case BRW_REGISTER_TYPE_F:
2000 if (inst->src[1].imm.f >= 1.0f) {
2001 inst->opcode = BRW_OPCODE_MOV;
2002 inst->src[1] = reg_undef;
2003 progress = true;
2004 }
2005 break;
2006 default:
2007 break;
2008 }
2009 break;
2010 case BRW_CONDITIONAL_GE:
2011 case BRW_CONDITIONAL_G:
2012 switch (inst->src[1].type) {
2013 case BRW_REGISTER_TYPE_F:
2014 if (inst->src[1].imm.f <= 0.0f) {
2015 inst->opcode = BRW_OPCODE_MOV;
2016 inst->src[1] = reg_undef;
2017 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2018 progress = true;
2019 }
2020 break;
2021 default:
2022 break;
2023 }
2024 default:
2025 break;
2026 }
2027 }
2028 break;
2029 default:
2030 break;
2031 }
2032 }
2033
2034 return progress;
2035 }
2036
2037 /**
2038 * Removes any instructions writing a VGRF where that VGRF is not used by any
2039 * later instruction.
2040 */
2041 bool
2042 fs_visitor::dead_code_eliminate()
2043 {
2044 bool progress = false;
2045 int pc = 0;
2046
2047 calculate_live_intervals();
2048
2049 foreach_list_safe(node, &this->instructions) {
2050 fs_inst *inst = (fs_inst *)node;
2051
2052 if (inst->dst.file == GRF && !inst->has_side_effects()) {
2053 bool dead = true;
2054
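         /* The write is dead only if the live interval of every register it
          * writes ends at this instruction, i.e. nothing later reads it.
          */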
2055 for (int i = 0; i < inst->regs_written; i++) {
2056 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2057 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2058 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2059 dead = false;
2060 break;
2061 }
2062 }
2063
2064 if (dead) {
2065 /* Don't dead code eliminate instructions that write to the
2066 * accumulator as a side-effect. Instead just set the destination
2067 * to the null register to free it.
2068 */
2069 switch (inst->opcode) {
2070 case BRW_OPCODE_ADDC:
2071 case BRW_OPCODE_SUBB:
2072 case BRW_OPCODE_MACH:
2073 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2074 break;
2075 default:
2076 inst->remove();
2077 progress = true;
2078 break;
2079 }
2080 }
2081 }
2082
2083 pc++;
2084 }
2085
2086 if (progress)
2087 invalidate_live_intervals();
2088
2089 return progress;
2090 }
2091
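/* Key for the local dead code elimination hash table below: identifies a
 * single register's worth of a virtual GRF by (vgrf, reg_offset).
 */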
2092 struct dead_code_hash_key
2093 {
2094 int vgrf;
2095 int reg_offset;
2096 };
2097
2098 static bool
2099 dead_code_hash_compare(const void *a, const void *b)
2100 {
2101 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2102 }
2103
2104 static void
2105 clear_dead_code_hash(struct hash_table *ht)
2106 {
2107 struct hash_entry *entry;
2108
2109 hash_table_foreach(ht, entry) {
2110 _mesa_hash_table_remove(ht, entry);
2111 }
2112 }
2113
2114 static void
2115 insert_dead_code_hash(struct hash_table *ht,
2116 int vgrf, int reg_offset, fs_inst *inst)
2117 {
2118 /* We don't bother freeing keys, because they'll be GCed with the ht. */
2119 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2120
2121 key->vgrf = vgrf;
2122 key->reg_offset = reg_offset;
2123
2124 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2125 }
2126
2127 static struct hash_entry *
2128 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2129 {
2130 struct dead_code_hash_key key;
2131
2132 key.vgrf = vgrf;
2133 key.reg_offset = reg_offset;
2134
2135 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2136 }
2137
2138 static void
2139 remove_dead_code_hash(struct hash_table *ht,
2140 int vgrf, int reg_offset)
2141 {
2142 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2143 if (!entry)
2144 return;
2145
2146 _mesa_hash_table_remove(ht, entry);
2147 }
2148
2149 /**
2150 * Walks basic blocks, removing any regs that are written but not read before
2151 * being redefined.
2152 *
2153 * The dead_code_eliminate() function implements a global dead code
2154  * elimination, but it only handles removing the last write to a register
2155 * if it's never read. This one can handle intermediate writes, but only
2156 * within a basic block.
2157 */
2158 bool
2159 fs_visitor::dead_code_eliminate_local()
2160 {
2161 struct hash_table *ht;
2162 bool progress = false;
2163
2164 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2165
2166 foreach_list_safe(node, &this->instructions) {
2167 fs_inst *inst = (fs_inst *)node;
2168
2169 /* At a basic block, empty the HT since we don't understand dataflow
2170 * here.
2171 */
2172 if (inst->is_control_flow()) {
2173 clear_dead_code_hash(ht);
2174 continue;
2175 }
2176
2177 /* Clear the HT of any instructions that got read. */
2178 for (int i = 0; i < 3; i++) {
2179 fs_reg src = inst->src[i];
2180 if (src.file != GRF)
2181 continue;
2182
2183 int read = 1;
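         /* A send from GRF reads the rest of the virtual GRF as its message
          * payload, not just the single register at reg_offset.
          */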
2184 if (inst->is_send_from_grf())
2185 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2186
2187 for (int reg_offset = src.reg_offset;
2188 reg_offset < src.reg_offset + read;
2189 reg_offset++) {
2190 remove_dead_code_hash(ht, src.reg, reg_offset);
2191 }
2192 }
2193
2194 /* Add any update of a GRF to the HT, removing a previous write if it
2195 * wasn't read.
2196 */
2197 if (inst->dst.file == GRF) {
2198 if (inst->regs_written > 1) {
2199 /* We don't know how to trim channels from an instruction's
2200 * writes, so we can't incrementally remove unread channels from
2201 * it. Just remove whatever it overwrites from the table
2202              * it.  Just remove whatever it overwrites from the table.
2203 for (int i = 0; i < inst->regs_written; i++) {
2204 remove_dead_code_hash(ht,
2205 inst->dst.reg,
2206 inst->dst.reg_offset + i);
2207 }
2208 } else {
2209 struct hash_entry *entry =
2210 get_dead_code_hash_entry(ht, inst->dst.reg,
2211 inst->dst.reg_offset);
2212
2213 if (entry) {
2214 if (inst->is_partial_write()) {
2215 /* For a partial write, we can't remove any previous dead code
2216 * candidate, since we're just modifying their result.
2217 */
2218 } else {
2219 /* We're completely updating a channel, and there was a
2220 * previous write to the channel that wasn't read. Kill it!
2221 */
2222 fs_inst *inst = (fs_inst *)entry->data;
2223 inst->remove();
2224 progress = true;
2225 }
2226
2227 _mesa_hash_table_remove(ht, entry);
2228 }
2229
2230 if (!inst->has_side_effects())
2231 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2232 inst);
2233 }
2234 }
2235 }
2236
2237 _mesa_hash_table_destroy(ht, NULL);
2238
2239 if (progress)
2240 invalidate_live_intervals();
2241
2242 return progress;
2243 }
2244
2245 /**
2246 * Implements register coalescing: Checks if the two registers involved in a
2247 * raw move don't interfere, in which case they can both be stored in the same
2248 * place and the MOV removed.
2249 */
2250 bool
2251 fs_visitor::register_coalesce()
2252 {
2253 bool progress = false;
2254
2255 calculate_live_intervals();
2256
2257 foreach_list_safe(node, &this->instructions) {
2258 fs_inst *inst = (fs_inst *)node;
2259
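      /* Only consider raw MOVs: full writes of a single-register GRF with no
       * saturate, no source modifiers, and matching types.
       */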
2260 if (inst->opcode != BRW_OPCODE_MOV ||
2261 inst->is_partial_write() ||
2262 inst->saturate ||
2263 inst->src[0].file != GRF ||
2264 inst->src[0].negate ||
2265 inst->src[0].abs ||
2266 inst->src[0].smear != -1 ||
2267 inst->dst.file != GRF ||
2268 inst->dst.type != inst->src[0].type ||
2269 virtual_grf_sizes[inst->src[0].reg] != 1) {
2270 continue;
2271 }
2272
2273 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2274 int var_to = live_intervals->var_from_reg(&inst->dst);
2275
2276 if (live_intervals->vars_interfere(var_from, var_to) &&
2277 !inst->dst.equals(inst->src[0]))
2278 continue;
2279
2280 int reg_from = inst->src[0].reg;
2281 assert(inst->src[0].reg_offset == 0);
2282 int reg_to = inst->dst.reg;
2283 int reg_to_offset = inst->dst.reg_offset;
2284
2285 foreach_list(node, &this->instructions) {
2286 fs_inst *scan_inst = (fs_inst *)node;
2287
2288 if (scan_inst->dst.file == GRF &&
2289 scan_inst->dst.reg == reg_from) {
2290 scan_inst->dst.reg = reg_to;
2291 scan_inst->dst.reg_offset = reg_to_offset;
2292 }
2293 for (int i = 0; i < 3; i++) {
2294 if (scan_inst->src[i].file == GRF &&
2295 scan_inst->src[i].reg == reg_from) {
2296 scan_inst->src[i].reg = reg_to;
2297 scan_inst->src[i].reg_offset = reg_to_offset;
2298 }
2299 }
2300 }
2301
2302 inst->remove();
2303 progress = true;
2304 continue;
2305 }
2306
2307 if (progress)
2308 invalidate_live_intervals();
2309
2310 return progress;
2311 }
2312
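/**
 * Tries to remove MOVs from a GRF to an MRF by rewriting the instruction
 * that generated the GRF value to write directly into the MRF, when that
 * can be shown to be safe.
 */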
2313 bool
2314 fs_visitor::compute_to_mrf()
2315 {
2316 bool progress = false;
2317 int next_ip = 0;
2318
2319 calculate_live_intervals();
2320
2321 foreach_list_safe(node, &this->instructions) {
2322 fs_inst *inst = (fs_inst *)node;
2323
2324 int ip = next_ip;
2325 next_ip++;
2326
2327 if (inst->opcode != BRW_OPCODE_MOV ||
2328 inst->is_partial_write() ||
2329 inst->dst.file != MRF || inst->src[0].file != GRF ||
2330 inst->dst.type != inst->src[0].type ||
2331 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2332 continue;
2333
2334 /* Work out which hardware MRF registers are written by this
2335 * instruction.
2336 */
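      /* A COMPR4 destination touches mrf_low and mrf_low + 4; an ordinary
       * compressed (16-wide) write touches mrf_low and mrf_low + 1.
       */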
2337 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2338 int mrf_high;
2339 if (inst->dst.reg & BRW_MRF_COMPR4) {
2340 mrf_high = mrf_low + 4;
2341 } else if (dispatch_width == 16 &&
2342 (!inst->force_uncompressed && !inst->force_sechalf)) {
2343 mrf_high = mrf_low + 1;
2344 } else {
2345 mrf_high = mrf_low;
2346 }
2347
2348 /* Can't compute-to-MRF this GRF if someone else was going to
2349 * read it later.
2350 */
2351 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2352 continue;
2353
2354 /* Found a move of a GRF to a MRF. Let's see if we can go
2355 * rewrite the thing that made this GRF to write into the MRF.
2356 */
2357 fs_inst *scan_inst;
2358 for (scan_inst = (fs_inst *)inst->prev;
2359 scan_inst->prev != NULL;
2360 scan_inst = (fs_inst *)scan_inst->prev) {
2361 if (scan_inst->dst.file == GRF &&
2362 scan_inst->dst.reg == inst->src[0].reg) {
2363 /* Found the last thing to write our reg we want to turn
2364 * into a compute-to-MRF.
2365 */
2366
2367 /* If this one instruction didn't populate all the
2368 * channels, bail. We might be able to rewrite everything
2369 * that writes that reg, but it would require smarter
2370 * tracking to delay the rewriting until complete success.
2371 */
2372 if (scan_inst->is_partial_write())
2373 break;
2374
2375 /* Things returning more than one register would need us to
2376 * understand coalescing out more than one MOV at a time.
2377 */
2378 if (scan_inst->regs_written > 1)
2379 break;
2380
2381             /* SEND instructions can't have an MRF as a destination. */
2382 if (scan_inst->mlen)
2383 break;
2384
2385 if (brw->gen == 6) {
2386 /* gen6 math instructions must have the destination be
2387 * GRF, so no compute-to-MRF for them.
2388 */
2389 if (scan_inst->is_math()) {
2390 break;
2391 }
2392 }
2393
2394 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2395 /* Found the creator of our MRF's source value. */
2396 scan_inst->dst.file = MRF;
2397 scan_inst->dst.reg = inst->dst.reg;
2398 scan_inst->saturate |= inst->saturate;
2399 inst->remove();
2400 progress = true;
2401 }
2402 break;
2403 }
2404
2405 /* We don't handle control flow here. Most computation of
2406 * values that end up in MRFs are shortly before the MRF
2407 * write anyway.
2408 */
2409 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2410 break;
2411
2412 /* You can't read from an MRF, so if someone else reads our
2413 * MRF's source GRF that we wanted to rewrite, that stops us.
2414 */
2415 bool interfered = false;
2416 for (int i = 0; i < 3; i++) {
2417 if (scan_inst->src[i].file == GRF &&
2418 scan_inst->src[i].reg == inst->src[0].reg &&
2419 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2420 interfered = true;
2421 }
2422 }
2423 if (interfered)
2424 break;
2425
2426 if (scan_inst->dst.file == MRF) {
2427 /* If somebody else writes our MRF here, we can't
2428 * compute-to-MRF before that.
2429 */
2430 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2431 int scan_mrf_high;
2432
2433 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2434 scan_mrf_high = scan_mrf_low + 4;
2435 } else if (dispatch_width == 16 &&
2436 (!scan_inst->force_uncompressed &&
2437 !scan_inst->force_sechalf)) {
2438 scan_mrf_high = scan_mrf_low + 1;
2439 } else {
2440 scan_mrf_high = scan_mrf_low;
2441 }
2442
2443 if (mrf_low == scan_mrf_low ||
2444 mrf_low == scan_mrf_high ||
2445 mrf_high == scan_mrf_low ||
2446 mrf_high == scan_mrf_high) {
2447 break;
2448 }
2449 }
2450
2451 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2452 /* Found a SEND instruction, which means that there are
2453 * live values in MRFs from base_mrf to base_mrf +
2454 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2455 * above it.
2456 */
2457 if (mrf_low >= scan_inst->base_mrf &&
2458 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2459 break;
2460 }
2461 if (mrf_high >= scan_inst->base_mrf &&
2462 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2463 break;
2464 }
2465 }
2466 }
2467 }
2468
2469 if (progress)
2470 invalidate_live_intervals();
2471
2472 return progress;
2473 }
2474
2475 /**
2476 * Walks through basic blocks, looking for repeated MRF writes and
2477 * removing the later ones.
2478 */
2479 bool
2480 fs_visitor::remove_duplicate_mrf_writes()
2481 {
2482 fs_inst *last_mrf_move[16];
2483 bool progress = false;
2484
2485    /* The MRF tracking below would need updating to handle compressed instructions, so skip 16-wide. */
2486 if (dispatch_width == 16)
2487 return false;
2488
2489 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2490
2491 foreach_list_safe(node, &this->instructions) {
2492 fs_inst *inst = (fs_inst *)node;
2493
2494 if (inst->is_control_flow()) {
2495 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2496 }
2497
2498 if (inst->opcode == BRW_OPCODE_MOV &&
2499 inst->dst.file == MRF) {
2500 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2501 if (prev_inst && inst->equals(prev_inst)) {
2502 inst->remove();
2503 progress = true;
2504 continue;
2505 }
2506 }
2507
2508 /* Clear out the last-write records for MRFs that were overwritten. */
2509 if (inst->dst.file == MRF) {
2510 last_mrf_move[inst->dst.reg] = NULL;
2511 }
2512
2513 if (inst->mlen > 0 && inst->base_mrf != -1) {
2514 /* Found a SEND instruction, which will include two or fewer
2515 * implied MRF writes. We could do better here.
2516 */
2517 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2518 last_mrf_move[inst->base_mrf + i] = NULL;
2519 }
2520 }
2521
2522 /* Clear out any MRF move records whose sources got overwritten. */
2523 if (inst->dst.file == GRF) {
2524 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2525 if (last_mrf_move[i] &&
2526 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2527 last_mrf_move[i] = NULL;
2528 }
2529 }
2530 }
2531
2532 if (inst->opcode == BRW_OPCODE_MOV &&
2533 inst->dst.file == MRF &&
2534 inst->src[0].file == GRF &&
2535 !inst->is_partial_write()) {
2536 last_mrf_move[inst->dst.reg] = inst;
2537 }
2538 }
2539
2540 if (progress)
2541 invalidate_live_intervals();
2542
2543 return progress;
2544 }
2545
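/* Helper for the gen4 SEND dependency workarounds below: clears the deps[]
 * flag for any GRF in [first_grf, first_grf + grf_len) that the instruction
 * reads, since reading the register resolves the outstanding dependency.
 */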
2546 static void
2547 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2548 int first_grf, int grf_len)
2549 {
2550 bool inst_16wide = (dispatch_width > 8 &&
2551 !inst->force_uncompressed &&
2552 !inst->force_sechalf);
2553
2554 /* Clear the flag for registers that actually got read (as expected). */
2555 for (int i = 0; i < 3; i++) {
2556 int grf;
2557 if (inst->src[i].file == GRF) {
2558 grf = inst->src[i].reg;
2559 } else if (inst->src[i].file == HW_REG &&
2560 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2561 grf = inst->src[i].fixed_hw_reg.nr;
2562 } else {
2563 continue;
2564 }
2565
2566 if (grf >= first_grf &&
2567 grf < first_grf + grf_len) {
2568 deps[grf - first_grf] = false;
2569 if (inst_16wide)
2570 deps[grf - first_grf + 1] = false;
2571 }
2572 }
2573 }
2574
2575 /**
2576 * Implements this workaround for the original 965:
2577 *
2578 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2579 * check for post destination dependencies on this instruction, software
2580 * must ensure that there is no destination hazard for the case of ‘write
2581 * followed by a posted write’ shown in the following example.
2582 *
2583 * 1. mov r3 0
2584 * 2. send r3.xy <rest of send instruction>
2585 * 3. mov r2 r3
2586 *
2587 * Due to no post-destination dependency check on the ‘send’, the above
2588 * code sequence could have two instructions (1 and 2) in flight at the
2589 * same time that both consider ‘r3’ as the target of their final writes.
2590 */
2591 void
2592 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2593 {
2594 int reg_size = dispatch_width / 8;
2595 int write_len = inst->regs_written * reg_size;
2596 int first_write_grf = inst->dst.reg;
2597 bool needs_dep[BRW_MAX_MRF];
2598 assert(write_len < (int)sizeof(needs_dep) - 1);
2599
2600 memset(needs_dep, false, sizeof(needs_dep));
2601 memset(needs_dep, true, write_len);
2602
2603 clear_deps_for_inst_src(inst, dispatch_width,
2604 needs_dep, first_write_grf, write_len);
2605
2606 /* Walk backwards looking for writes to registers we're writing which
2607 * aren't read since being written. If we hit the start of the program,
2608 * we assume that there are no outstanding dependencies on entry to the
2609 * program.
2610 */
2611 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2612 scan_inst != NULL;
2613 scan_inst = (fs_inst *)scan_inst->prev) {
2614
2615 /* If we hit control flow, assume that there *are* outstanding
2616 * dependencies, and force their cleanup before our instruction.
2617 */
2618 if (scan_inst->is_control_flow()) {
2619 for (int i = 0; i < write_len; i++) {
2620 if (needs_dep[i]) {
2621 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2622 }
2623 }
2624 return;
2625 }
2626
2627 bool scan_inst_16wide = (dispatch_width > 8 &&
2628 !scan_inst->force_uncompressed &&
2629 !scan_inst->force_sechalf);
2630
2631 /* We insert our reads as late as possible on the assumption that any
2632 * instruction but a MOV that might have left us an outstanding
2633 * dependency has more latency than a MOV.
2634 */
2635 if (scan_inst->dst.file == GRF) {
2636 for (int i = 0; i < scan_inst->regs_written; i++) {
2637 int reg = scan_inst->dst.reg + i * reg_size;
2638
2639 if (reg >= first_write_grf &&
2640 reg < first_write_grf + write_len &&
2641 needs_dep[reg - first_write_grf]) {
2642 inst->insert_before(DEP_RESOLVE_MOV(reg));
2643 needs_dep[reg - first_write_grf] = false;
2644 if (scan_inst_16wide)
2645 needs_dep[reg - first_write_grf + 1] = false;
2646 }
2647 }
2648 }
2649
2650 /* Clear the flag for registers that actually got read (as expected). */
2651 clear_deps_for_inst_src(scan_inst, dispatch_width,
2652 needs_dep, first_write_grf, write_len);
2653
2654 /* Continue the loop only if we haven't resolved all the dependencies */
2655 int i;
2656 for (i = 0; i < write_len; i++) {
2657 if (needs_dep[i])
2658 break;
2659 }
2660 if (i == write_len)
2661 return;
2662 }
2663 }
2664
2665 /**
2666 * Implements this workaround for the original 965:
2667 *
2668 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2669 * used as a destination register until after it has been sourced by an
2670 * instruction with a different destination register.
2671 */
2672 void
2673 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2674 {
2675 int write_len = inst->regs_written * dispatch_width / 8;
2676 int first_write_grf = inst->dst.reg;
2677 bool needs_dep[BRW_MAX_MRF];
2678 assert(write_len < (int)sizeof(needs_dep) - 1);
2679
2680 memset(needs_dep, false, sizeof(needs_dep));
2681 memset(needs_dep, true, write_len);
2682 /* Walk forwards looking for writes to registers we're writing which aren't
2683 * read before being written.
2684 */
2685 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2686 !scan_inst->is_tail_sentinel();
2687 scan_inst = (fs_inst *)scan_inst->next) {
2688 /* If we hit control flow, force resolve all remaining dependencies. */
2689 if (scan_inst->is_control_flow()) {
2690 for (int i = 0; i < write_len; i++) {
2691 if (needs_dep[i])
2692 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2693 }
2694 return;
2695 }
2696
2697 /* Clear the flag for registers that actually got read (as expected). */
2698 clear_deps_for_inst_src(scan_inst, dispatch_width,
2699 needs_dep, first_write_grf, write_len);
2700
2701 /* We insert our reads as late as possible since they're reading the
2702 * result of a SEND, which has massive latency.
2703 */
2704 if (scan_inst->dst.file == GRF &&
2705 scan_inst->dst.reg >= first_write_grf &&
2706 scan_inst->dst.reg < first_write_grf + write_len &&
2707 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2708 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2709 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2710 }
2711
2712 /* Continue the loop only if we haven't resolved all the dependencies */
2713 int i;
2714 for (i = 0; i < write_len; i++) {
2715 if (needs_dep[i])
2716 break;
2717 }
2718 if (i == write_len)
2719 return;
2720 }
2721
2722 /* If we hit the end of the program, resolve all remaining dependencies out
2723 * of paranoia.
2724 */
2725 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2726 assert(last_inst->eot);
2727 for (int i = 0; i < write_len; i++) {
2728 if (needs_dep[i])
2729 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2730 }
2731 }
2732
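/**
 * Applies both of the gen4 SEND dependency workarounds above to every
 * message send that writes a GRF destination.
 */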
2733 void
2734 fs_visitor::insert_gen4_send_dependency_workarounds()
2735 {
2736 if (brw->gen != 4 || brw->is_g4x)
2737 return;
2738
2739 /* Note that we're done with register allocation, so GRF fs_regs always
2740 * have a .reg_offset of 0.
2741 */
2742
2743 foreach_list_safe(node, &this->instructions) {
2744 fs_inst *inst = (fs_inst *)node;
2745
2746 if (inst->mlen != 0 && inst->dst.file == GRF) {
2747 insert_gen4_pre_send_dependency_workarounds(inst);
2748 insert_gen4_post_send_dependency_workarounds(inst);
2749 }
2750 }
2751 }
2752
2753 /**
2754 * Turns the generic expression-style uniform pull constant load instruction
2755 * into a hardware-specific series of instructions for loading a pull
2756 * constant.
2757 *
2758 * The expression style allows the CSE pass before this to optimize out
2759 * repeated loads from the same offset, and gives the pre-register-allocation
2760 * scheduling full flexibility, while the conversion to native instructions
2761 * allows the post-register-allocation scheduler the best information
2762 * possible.
2763 *
2764 * Note that execution masking for setting up pull constant loads is special:
2765 * the channels that need to be written are unrelated to the current execution
2766 * mask, since a later instruction will use one of the result channels as a
2767 * source operand for all 8 or 16 of its channels.
2768 */
2769 void
2770 fs_visitor::lower_uniform_pull_constant_loads()
2771 {
2772 foreach_list(node, &this->instructions) {
2773 fs_inst *inst = (fs_inst *)node;
2774
2775 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2776 continue;
2777
2778 if (brw->gen >= 7) {
2779 /* The offset arg before was a vec4-aligned byte offset. We need to
2780 * turn it into a dword offset.
2781 */
2782 fs_reg const_offset_reg = inst->src[1];
2783 assert(const_offset_reg.file == IMM &&
2784 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2785 const_offset_reg.imm.u /= 4;
2786 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2787
2788 /* This is actually going to be a MOV, but since only the first dword
2789 * is accessed, we have a special opcode to do just that one. Note
2790 * that this needs to be an operation that will be considered a def
2791 * by live variable analysis, or register allocation will explode.
2792 */
2793 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2794 payload, const_offset_reg);
2795 setup->force_writemask_all = true;
2796
2797 setup->ir = inst->ir;
2798 setup->annotation = inst->annotation;
2799 inst->insert_before(setup);
2800
2801 /* Similarly, this will only populate the first 4 channels of the
2802 * result register (since we only use smear values from 0-3), but we
2803 * don't tell the optimizer.
2804 */
2805 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2806 inst->src[1] = payload;
2807
2808 invalidate_live_intervals();
2809 } else {
2810 /* Before register allocation, we didn't tell the scheduler about the
2811 * MRF we use. We know it's safe to use this MRF because nothing
2812 * else does except for register spill/unspill, which generates and
2813 * uses its MRF within a single IR instruction.
2814 */
2815 inst->base_mrf = 14;
2816 inst->mlen = 1;
2817 }
2818 }
2819 }
2820
2821 void
2822 fs_visitor::dump_instruction(backend_instruction *be_inst)
2823 {
2824 fs_inst *inst = (fs_inst *)be_inst;
2825
2826 if (inst->predicate) {
2827 printf("(%cf0.%d) ",
2828 inst->predicate_inverse ? '-' : '+',
2829 inst->flag_subreg);
2830 }
2831
2832 printf("%s", brw_instruction_name(inst->opcode));
2833 if (inst->saturate)
2834 printf(".sat");
2835 if (inst->conditional_mod) {
2836 printf("%s", conditional_modifier[inst->conditional_mod]);
2837 if (!inst->predicate &&
2838 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2839 inst->opcode != BRW_OPCODE_IF &&
2840 inst->opcode != BRW_OPCODE_WHILE))) {
2841 printf(".f0.%d", inst->flag_subreg);
2842 }
2843 }
2844 printf(" ");
2845
2846
2847 switch (inst->dst.file) {
2848 case GRF:
2849 printf("vgrf%d", inst->dst.reg);
2850 if (inst->dst.reg_offset)
2851 printf("+%d", inst->dst.reg_offset);
2852 break;
2853 case MRF:
2854 printf("m%d", inst->dst.reg);
2855 break;
2856 case BAD_FILE:
2857 printf("(null)");
2858 break;
2859 case UNIFORM:
2860 printf("***u%d***", inst->dst.reg);
2861 break;
2862 case HW_REG:
2863 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2864 switch (inst->dst.fixed_hw_reg.nr) {
2865 case BRW_ARF_NULL:
2866 printf("null");
2867 break;
2868 case BRW_ARF_ADDRESS:
2869 printf("a0.%d", inst->dst.fixed_hw_reg.subnr);
2870 break;
2871 case BRW_ARF_ACCUMULATOR:
2872 printf("acc%d", inst->dst.fixed_hw_reg.subnr);
2873 break;
2874 case BRW_ARF_FLAG:
2875 printf("f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2876 inst->dst.fixed_hw_reg.subnr);
2877 break;
2878 default:
2879 printf("arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2880 inst->dst.fixed_hw_reg.subnr);
2881 break;
2882 }
2883 } else {
2884 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
2885 }
2886 if (inst->dst.fixed_hw_reg.subnr)
2887 printf("+%d", inst->dst.fixed_hw_reg.subnr);
2888 break;
2889 default:
2890 printf("???");
2891 break;
2892 }
2893 printf(":%s, ", reg_encoding[inst->dst.type]);
2894
2895 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
2896 if (inst->src[i].negate)
2897 printf("-");
2898 if (inst->src[i].abs)
2899 printf("|");
2900 switch (inst->src[i].file) {
2901 case GRF:
2902 printf("vgrf%d", inst->src[i].reg);
2903 if (inst->src[i].reg_offset)
2904 printf("+%d", inst->src[i].reg_offset);
2905 break;
2906 case MRF:
2907 printf("***m%d***", inst->src[i].reg);
2908 break;
2909 case UNIFORM:
2910 printf("u%d", inst->src[i].reg);
2911 if (inst->src[i].reg_offset)
2912 printf(".%d", inst->src[i].reg_offset);
2913 break;
2914 case BAD_FILE:
2915 printf("(null)");
2916 break;
2917 case IMM:
2918 switch (inst->src[i].type) {
2919 case BRW_REGISTER_TYPE_F:
2920 printf("%ff", inst->src[i].imm.f);
2921 break;
2922 case BRW_REGISTER_TYPE_D:
2923 printf("%dd", inst->src[i].imm.i);
2924 break;
2925 case BRW_REGISTER_TYPE_UD:
2926 printf("%uu", inst->src[i].imm.u);
2927 break;
2928 default:
2929 printf("???");
2930 break;
2931 }
2932 break;
2933 case HW_REG:
2934 if (inst->src[i].fixed_hw_reg.negate)
2935 printf("-");
2936 if (inst->src[i].fixed_hw_reg.abs)
2937 printf("|");
2938 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2939 switch (inst->src[i].fixed_hw_reg.nr) {
2940 case BRW_ARF_NULL:
2941 printf("null");
2942 break;
2943 case BRW_ARF_ADDRESS:
2944 printf("a0.%d", inst->src[i].fixed_hw_reg.subnr);
2945 break;
2946 case BRW_ARF_ACCUMULATOR:
2947 printf("acc%d", inst->src[i].fixed_hw_reg.subnr);
2948 break;
2949 case BRW_ARF_FLAG:
2950 printf("f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2951 inst->src[i].fixed_hw_reg.subnr);
2952 break;
2953 default:
2954 printf("arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2955 inst->src[i].fixed_hw_reg.subnr);
2956 break;
2957 }
2958 } else {
2959 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2960 }
2961 if (inst->src[i].fixed_hw_reg.subnr)
2962 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
2963 if (inst->src[i].fixed_hw_reg.abs)
2964 printf("|");
2965 break;
2966 default:
2967 printf("???");
2968 break;
2969 }
2970 if (inst->src[i].abs)
2971 printf("|");
2972
2973 if (inst->src[i].file != IMM) {
2974 printf(":%s", reg_encoding[inst->src[i].type]);
2975 }
2976
2977 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
2978 printf(", ");
2979 }
2980
2981 printf(" ");
2982
2983 if (inst->force_uncompressed)
2984 printf("1sthalf ");
2985
2986 if (inst->force_sechalf)
2987 printf("2ndhalf ");
2988
2989 printf("\n");
2990 }
2991
2992 /**
2993 * Possibly returns an instruction that set up @param reg.
2994 *
2995 * Sometimes we want to take the result of some expression/variable
2996 * dereference tree and rewrite the instruction generating the result
2997 * of the tree. When processing the tree, we know that the
2998 * instructions generated are all writing temporaries that are dead
2999 * outside of this tree. So, if we have some instructions that write
3000 * a temporary, we're free to point that temp write somewhere else.
3001 *
3002 * Note that this doesn't guarantee that the instruction generated
3003 * only reg -- it might be the size=4 destination of a texture instruction.
3004 */
3005 fs_inst *
3006 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3007 fs_inst *end,
3008 fs_reg reg)
3009 {
3010 if (end == start ||
3011 end->is_partial_write() ||
3012 reg.reladdr ||
3013 !reg.equals(end->dst)) {
3014 return NULL;
3015 } else {
3016 return end;
3017 }
3018 }
3019
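/**
 * Assigns payload register locations for gen6+: barycentric coordinates,
 * interpolated depth/W, and MSAA position offsets, counting up
 * c->nr_payload_regs as it goes.
 */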
3020 void
3021 fs_visitor::setup_payload_gen6()
3022 {
3023 bool uses_depth =
3024 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3025 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3026
3027 assert(brw->gen >= 6);
3028
3029 /* R0-1: masks, pixel X/Y coordinates. */
3030 c->nr_payload_regs = 2;
3031    /* R2: only for 32-pixel dispatch. */
3032
3033 /* R3-26: barycentric interpolation coordinates. These appear in the
3034 * same order that they appear in the brw_wm_barycentric_interp_mode
3035 * enum. Each set of coordinates occupies 2 registers if dispatch width
3036 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3037 * appear if they were enabled using the "Barycentric Interpolation
3038 * Mode" bits in WM_STATE.
3039 */
3040 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3041 if (barycentric_interp_modes & (1 << i)) {
3042 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3043 c->nr_payload_regs += 2;
3044 if (dispatch_width == 16) {
3045 c->nr_payload_regs += 2;
3046 }
3047 }
3048 }
3049
3050    /* R27: interpolated depth, if the shader uses source depth. */
3051 if (uses_depth) {
3052 c->source_depth_reg = c->nr_payload_regs;
3053 c->nr_payload_regs++;
3054 if (dispatch_width == 16) {
3055 /* R28: interpolated depth if not 8-wide. */
3056 c->nr_payload_regs++;
3057 }
3058 }
3059 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3060 if (uses_depth) {
3061 c->source_w_reg = c->nr_payload_regs;
3062 c->nr_payload_regs++;
3063 if (dispatch_width == 16) {
3064 /* R30: interpolated W if not 8-wide. */
3065 c->nr_payload_regs++;
3066 }
3067 }
3068
3069 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3070 /* R31: MSAA position offsets. */
3071 if (c->prog_data.uses_pos_offset) {
3072 c->sample_pos_reg = c->nr_payload_regs;
3073 c->nr_payload_regs++;
3074 }
3075
3076 /* R32-: bary for 32-pixel. */
3077 /* R58-59: interp W for 32-pixel. */
3078
3079 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3080 c->source_depth_to_render_target = true;
3081 }
3082 }
3083
3084 void
3085 fs_visitor::assign_binding_table_offsets()
3086 {
3087 uint32_t next_binding_table_offset = 0;
3088
3089 /* If there are no color regions, we still perform an FB write to a null
3090 * renderbuffer, which we place at surface index 0.
3091 */
3092 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3093 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3094
3095 assign_common_binding_table_offsets(next_binding_table_offset);
3096 }
3097
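/**
 * Runs the full compile for this dispatch width: generates the FS IR from
 * the shader, runs the optimization passes, schedules, and register
 * allocates.  Returns false if compilation failed.
 */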
3098 bool
3099 fs_visitor::run()
3100 {
3101 sanity_param_count = fp->Base.Parameters->NumParameters;
3102 uint32_t orig_nr_params = c->prog_data.nr_params;
3103 bool allocated_without_spills;
3104
3105 assign_binding_table_offsets();
3106
3107 if (brw->gen >= 6)
3108 setup_payload_gen6();
3109 else
3110 setup_payload_gen4();
3111
3112 if (0) {
3113 emit_dummy_fs();
3114 } else {
3115 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3116 emit_shader_time_begin();
3117
3118 calculate_urb_setup();
3119 if (fp->Base.InputsRead > 0) {
3120 if (brw->gen < 6)
3121 emit_interpolation_setup_gen4();
3122 else
3123 emit_interpolation_setup_gen6();
3124 }
3125
3126 /* We handle discards by keeping track of the still-live pixels in f0.1.
3127 * Initialize it with the dispatched pixels.
3128 */
3129 if (fp->UsesKill || c->key.alpha_test_func) {
3130 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3131 discard_init->flag_subreg = 1;
3132 }
3133
3134 /* Generate FS IR for main(). (the visitor only descends into
3135 * functions called "main").
3136 */
3137 if (shader) {
3138 foreach_list(node, &*shader->ir) {
3139 ir_instruction *ir = (ir_instruction *)node;
3140 base_ir = ir;
3141 this->result = reg_undef;
3142 ir->accept(this);
3143 }
3144 } else {
3145 emit_fragment_program_code();
3146 }
3147 base_ir = NULL;
3148 if (failed)
3149 return false;
3150
3151 emit(FS_OPCODE_PLACEHOLDER_HALT);
3152
3153 if (c->key.alpha_test_func)
3154 emit_alpha_test();
3155
3156 emit_fb_writes();
3157
3158 split_virtual_grfs();
3159
3160 move_uniform_array_access_to_pull_constants();
3161 remove_dead_constants();
3162 setup_pull_constants();
3163
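      /* Run the optimization passes to a fixed point: keep iterating as long
       * as any pass reports progress.
       */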
3164 bool progress;
3165 do {
3166 progress = false;
3167
3168 compact_virtual_grfs();
3169
3170 progress = remove_duplicate_mrf_writes() || progress;
3171
3172 progress = opt_algebraic() || progress;
3173 progress = opt_cse() || progress;
3174 progress = opt_copy_propagate() || progress;
3175 progress = opt_peephole_sel() || progress;
3176 progress = opt_peephole_predicated_break() || progress;
3177 progress = dead_code_eliminate() || progress;
3178 progress = dead_code_eliminate_local() || progress;
3179 progress = dead_control_flow_eliminate(this) || progress;
3180 progress = register_coalesce() || progress;
3181 progress = compute_to_mrf() || progress;
3182 } while (progress);
3183
3184 lower_uniform_pull_constant_loads();
3185
3186 assign_curb_setup();
3187 assign_urb_setup();
3188
3189 static enum instruction_scheduler_mode pre_modes[] = {
3190 SCHEDULE_PRE,
3191 SCHEDULE_PRE_NON_LIFO,
3192 SCHEDULE_PRE_LIFO,
3193 };
3194
3195 /* Try each scheduling heuristic to see if it can successfully register
3196 * allocate without spilling. They should be ordered by decreasing
3197 * performance but increasing likelihood of allocating.
3198 */
3199 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3200 schedule_instructions(pre_modes[i]);
3201
3202 if (0) {
3203 assign_regs_trivial();
3204 allocated_without_spills = true;
3205 } else {
3206 allocated_without_spills = assign_regs(false);
3207 }
3208 if (allocated_without_spills)
3209 break;
3210 }
3211
3212 if (!allocated_without_spills) {
3213 /* We assume that any spilling is worse than just dropping back to
3214 * SIMD8. There's probably actually some intermediate point where
3215 * SIMD16 with a couple of spills is still better.
3216 */
3217 if (dispatch_width == 16) {
3218 fail("Failure to register allocate. Reduce number of "
3219 "live scalar values to avoid this.");
3220 }
3221
3222 /* Since we're out of heuristics, just go spill registers until we
3223 * get an allocation.
3224 */
3225 while (!assign_regs(true)) {
3226 if (failed)
3227 break;
3228 }
3229 }
3230 }
3231 assert(force_uncompressed_stack == 0);
3232
3233 /* This must come after all optimization and register allocation, since
3234 * it inserts dead code that happens to have side effects, and it does
3235 * so based on the actual physical registers in use.
3236 */
3237 insert_gen4_send_dependency_workarounds();
3238
3239 if (failed)
3240 return false;
3241
3242 if (!allocated_without_spills)
3243 schedule_instructions(SCHEDULE_POST);
3244
3245 if (dispatch_width == 8) {
3246 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3247 } else {
3248 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3249
3250 /* Make sure we didn't try to sneak in an extra uniform */
3251 assert(orig_nr_params == c->prog_data.nr_params);
3252 (void) orig_nr_params;
3253 }
3254
3255 /* If any state parameters were appended, then ParameterValues could have
3256 * been realloced, in which case the driver uniform storage set up by
3257 * _mesa_associate_uniform_storage() would point to freed memory. Make
3258 * sure that didn't happen.
3259 */
3260 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3261
3262 return !failed;
3263 }
3264
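/**
 * Compiles a fragment program to native code: always runs an 8-wide compile
 * and, when possible, a 16-wide compile as well, then hands both instruction
 * lists to the generator to produce the final assembly.
 */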
3265 const unsigned *
3266 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3267 struct gl_fragment_program *fp,
3268 struct gl_shader_program *prog,
3269 unsigned *final_assembly_size)
3270 {
3271 bool start_busy = false;
3272 float start_time = 0;
3273
3274 if (unlikely(brw->perf_debug)) {
3275 start_busy = (brw->batch.last_bo &&
3276 drm_intel_bo_busy(brw->batch.last_bo));
3277 start_time = get_time();
3278 }
3279
3280 struct brw_shader *shader = NULL;
3281 if (prog)
3282 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3283
3284 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3285 if (prog) {
3286 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3287 _mesa_print_ir(shader->ir, NULL);
3288 printf("\n\n");
3289 } else {
3290 printf("ARB_fragment_program %d ir for native fragment shader\n",
3291 fp->Base.Id);
3292 _mesa_print_program(&fp->Base);
3293 }
3294 }
3295
3296 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3297 */
3298 fs_visitor v(brw, c, prog, fp, 8);
3299 if (!v.run()) {
3300 if (prog) {
3301 prog->LinkStatus = false;
3302 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3303 }
3304
3305 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3306 v.fail_msg);
3307
3308 return NULL;
3309 }
3310
3311 exec_list *simd16_instructions = NULL;
3312 fs_visitor v2(brw, c, prog, fp, 16);
3313 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3314 if (c->prog_data.nr_pull_params == 0) {
3315 /* Try a 16-wide compile */
3316 v2.import_uniforms(&v);
3317 if (!v2.run()) {
3318 perf_debug("16-wide shader failed to compile, falling back to "
3319 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3320 } else {
3321 simd16_instructions = &v2.instructions;
3322 }
3323 } else {
3324 perf_debug("Skipping 16-wide due to pull parameters.\n");
3325 }
3326 }
3327
3328 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3329 const unsigned *generated = g.generate_assembly(&v.instructions,
3330 simd16_instructions,
3331 final_assembly_size);
3332
3333 if (unlikely(brw->perf_debug) && shader) {
3334 if (shader->compiled_once)
3335 brw_wm_debug_recompile(brw, prog, &c->key);
3336 shader->compiled_once = true;
3337
3338 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3339 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3340 (get_time() - start_time) * 1000);
3341 }
3342 }
3343
3344 return generated;
3345 }
3346
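/**
 * Precompiles the fragment shader at link time using a program key filled
 * with guessed defaults, so a compiled program is likely to be available
 * before the first draw.  The previous program state is restored afterwards.
 */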
3347 bool
3348 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3349 {
3350 struct brw_context *brw = brw_context(ctx);
3351 struct brw_wm_prog_key key;
3352
3353 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3354 return true;
3355
3356 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3357 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3358 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3359 bool program_uses_dfdy = fp->UsesDFdy;
3360
3361 memset(&key, 0, sizeof(key));
3362
3363 if (brw->gen < 6) {
3364 if (fp->UsesKill)
3365 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3366
3367 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3368 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3369
3370 /* Just assume depth testing. */
3371 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3372 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3373 }
3374
3375 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3376 BRW_FS_VARYING_INPUT_MASK) > 16)
3377 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3378
3379 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3380
3381 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3382 for (unsigned i = 0; i < sampler_count; i++) {
3383 if (fp->Base.ShadowSamplers & (1 << i)) {
3384 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3385 key.tex.swizzles[i] =
3386 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3387 } else {
3388 /* Color sampler: assume no swizzling. */
3389 key.tex.swizzles[i] = SWIZZLE_XYZW;
3390 }
3391 }
3392
3393 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3394 key.drawable_height = ctx->DrawBuffer->Height;
3395 }
3396
3397 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3398 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3399 }
3400
3401 key.nr_color_regions = 1;
3402
3403 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3404 * quality of the derivatives is likely to be determined by the driconf
3405 * option.
3406 */
3407 key.high_quality_derivatives = brw->disable_derivative_optimization;
3408
3409 key.program_string_id = bfp->id;
3410
3411 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3412 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3413
3414 bool success = do_wm_prog(brw, prog, bfp, &key);
3415
3416 brw->wm.base.prog_offset = old_prog_offset;
3417 brw->wm.prog_data = old_prog_data;
3418
3419 return success;
3420 }