i965: Add support for Broadwell's new register types.
[mesa.git] / src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->opcode = BRW_OPCODE_NOP;
59 this->conditional_mod = BRW_CONDITIONAL_NONE;
60
61 this->dst = reg_undef;
62 this->src[0] = reg_undef;
63 this->src[1] = reg_undef;
64 this->src[2] = reg_undef;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68 }
69
70 fs_inst::fs_inst()
71 {
72 init();
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
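/* Convenience emitters.  Each ALUn(op) below defines fs_visitor::op(dst,
 * src...), which allocates a new fs_inst with BRW_OPCODE_##op out of
 * mem_ctx.  These only construct the instruction; the caller still has to
 * emit() it or push it onto an exec_list.
 */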
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187 ALU2(SEL)
188
189 /** Gen4 predicated IF. */
190 fs_inst *
191 fs_visitor::IF(uint32_t predicate)
192 {
193 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195 return inst;
196 }
197
198 /** Gen6 IF with embedded comparison. */
199 fs_inst *
200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
201 {
202 assert(brw->gen == 6);
203 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
204 reg_null_d, src0, src1);
205 inst->conditional_mod = condition;
206 return inst;
207 }
208
209 /**
210 * CMP: Sets the low bit of the destination channels with the result
211 * of the comparison, while the upper bits are undefined, and updates
212 * the flag register with the packed 16 bits of the result.
213 */
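/* For example, emit_frontfacing_interpolation() below does
 *
 *    emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
 *    emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
 *
 * relying on only the low bit of each destination channel being defined.
 */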
214 fs_inst *
215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
216 {
217 fs_inst *inst;
218
219 /* Take the instruction:
220 *
221 * CMP null<d> src0<f> src1<f>
222 *
223 * Original gen4 does type conversion to the destination type before
224 * comparison, producing garbage results for floating point comparisons.
225 * gen5 does the comparison on the execution type (resolved source types),
226 * so dst type doesn't matter. gen6 does comparison and then uses the
227 * result as if it was the dst type with no conversion, which happens to
228 * mostly work out for float-interpreted-as-int since our comparisons are
229 * for >0, =0, <0.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 exec_list
247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
248 fs_reg varying_offset,
249 uint32_t const_offset)
250 {
251 exec_list instructions;
252 fs_inst *inst;
253
254 /* We have our constant surface use a pitch of 4 bytes, so our index can
255 * be any component of a vector, and then we load 4 contiguous
256 * components starting from that.
257 *
258 * We break down the const_offset to a portion added to the variable
259 * offset and a portion done using reg_offset, which means that if you
260 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
261 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
262 * CSE can later notice that those loads are all the same and eliminate
263 * the redundant ones.
264 */
265 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
266 instructions.push_tail(ADD(vec4_offset,
267 varying_offset, const_offset & ~3));
268
269 int scale = 1;
270 if (brw->gen == 4 && dispatch_width == 8) {
271 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
272 * u, v, r) as parameters, or we can just use the SIMD16 message
273 * consisting of (header, u). We choose the second, at the cost of a
274 * longer return length.
275 */
276 scale = 2;
277 }
278
279 enum opcode op;
280 if (brw->gen >= 7)
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
282 else
283 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
284 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
285 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
286 inst->regs_written = 4 * scale;
287 instructions.push_tail(inst);
288
289 if (brw->gen < 7) {
290 inst->base_mrf = 13;
291 inst->header_present = true;
292 if (brw->gen == 4)
293 inst->mlen = 3;
294 else
295 inst->mlen = 1 + dispatch_width / 8;
296 }
297
298 vec4_result.reg_offset += (const_offset & 3) * scale;
299 instructions.push_tail(MOV(dst, vec4_result));
300
301 return instructions;
302 }
303
304 /**
305 * A helper for MOV generation for fixing up broken hardware SEND dependency
306 * handling.
307 */
308 fs_inst *
309 fs_visitor::DEP_RESOLVE_MOV(int grf)
310 {
311 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
312
313 inst->ir = NULL;
314 inst->annotation = "send dependency resolve";
315
316 /* The caller always wants uncompressed to emit the minimal extra
317 * dependencies, and to avoid having to deal with aligning its regs to 2.
318 */
319 inst->force_uncompressed = true;
320
321 return inst;
322 }
323
324 bool
325 fs_inst::equals(fs_inst *inst)
326 {
327 return (opcode == inst->opcode &&
328 dst.equals(inst->dst) &&
329 src[0].equals(inst->src[0]) &&
330 src[1].equals(inst->src[1]) &&
331 src[2].equals(inst->src[2]) &&
332 saturate == inst->saturate &&
333 predicate == inst->predicate &&
334 conditional_mod == inst->conditional_mod &&
335 mlen == inst->mlen &&
336 base_mrf == inst->base_mrf &&
337 sampler == inst->sampler &&
338 target == inst->target &&
339 eot == inst->eot &&
340 header_present == inst->header_present &&
341 shadow_compare == inst->shadow_compare &&
342 offset == inst->offset);
343 }
344
345 bool
346 fs_inst::overwrites_reg(const fs_reg &reg)
347 {
348 return (reg.file == dst.file &&
349 reg.reg == dst.reg &&
350 reg.reg_offset >= dst.reg_offset &&
351 reg.reg_offset < dst.reg_offset + regs_written);
352 }
353
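/**
 * Returns true if this instruction's message payload is read directly from
 * the GRF (send-from-GRF) instead of being copied through MRFs first.
 */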
354 bool
355 fs_inst::is_send_from_grf()
356 {
357 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
358 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
359 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
360 src[1].file == GRF) ||
361 (is_tex() && src[0].file == GRF));
362 }
363
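/**
 * Returns true if source modifiers (negate/abs) may be folded into this
 * instruction's sources.  Gen6 math instructions ignore source modifiers,
 * and send-from-GRF payloads can't apply them, so optimization passes must
 * leave those sources alone.
 */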
364 bool
365 fs_visitor::can_do_source_mods(fs_inst *inst)
366 {
367 if (brw->gen == 6 && inst->is_math())
368 return false;
369
370 if (inst->is_send_from_grf())
371 return false;
372
373 if (!inst->can_do_source_mods())
374 return false;
375
376 return true;
377 }
378
379 void
380 fs_reg::init()
381 {
382 memset(this, 0, sizeof(*this));
383 this->smear = -1;
384 }
385
386 /** Generic unset register constructor. */
387 fs_reg::fs_reg()
388 {
389 init();
390 this->file = BAD_FILE;
391 }
392
393 /** Immediate value constructor. */
394 fs_reg::fs_reg(float f)
395 {
396 init();
397 this->file = IMM;
398 this->type = BRW_REGISTER_TYPE_F;
399 this->imm.f = f;
400 }
401
402 /** Immediate value constructor. */
403 fs_reg::fs_reg(int32_t i)
404 {
405 init();
406 this->file = IMM;
407 this->type = BRW_REGISTER_TYPE_D;
408 this->imm.i = i;
409 }
410
411 /** Immediate value constructor. */
412 fs_reg::fs_reg(uint32_t u)
413 {
414 init();
415 this->file = IMM;
416 this->type = BRW_REGISTER_TYPE_UD;
417 this->imm.u = u;
418 }
419
420 /** Fixed brw_reg. */
421 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
422 {
423 init();
424 this->file = HW_REG;
425 this->fixed_hw_reg = fixed_hw_reg;
426 this->type = fixed_hw_reg.type;
427 }
428
429 bool
430 fs_reg::equals(const fs_reg &r) const
431 {
432 return (file == r.file &&
433 reg == r.reg &&
434 reg_offset == r.reg_offset &&
435 type == r.type &&
436 negate == r.negate &&
437 abs == r.abs &&
438 !reladdr && !r.reladdr &&
439 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
440 sizeof(fixed_hw_reg)) == 0 &&
441 smear == r.smear &&
442 imm.u == r.imm.u);
443 }
444
445 fs_reg
446 fs_reg::retype(uint32_t type)
447 {
448 fs_reg result = *this;
449 result.type = type;
450 return result;
451 }
452
453 bool
454 fs_reg::is_zero() const
455 {
456 if (file != IMM)
457 return false;
458
459 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
460 }
461
462 bool
463 fs_reg::is_one() const
464 {
465 if (file != IMM)
466 return false;
467
468 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
469 }
470
471 bool
472 fs_reg::is_null() const
473 {
474 return file == HW_REG &&
475 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
476 fixed_hw_reg.nr == BRW_ARF_NULL;
477 }
478
479 bool
480 fs_reg::is_valid_3src() const
481 {
482 return file == GRF || file == UNIFORM;
483 }
484
485 int
486 fs_visitor::type_size(const struct glsl_type *type)
487 {
488 unsigned int size, i;
489
490 switch (type->base_type) {
491 case GLSL_TYPE_UINT:
492 case GLSL_TYPE_INT:
493 case GLSL_TYPE_FLOAT:
494 case GLSL_TYPE_BOOL:
495 return type->components();
496 case GLSL_TYPE_ARRAY:
497 return type_size(type->fields.array) * type->length;
498 case GLSL_TYPE_STRUCT:
499 size = 0;
500 for (i = 0; i < type->length; i++) {
501 size += type_size(type->fields.structure[i].type);
502 }
503 return size;
504 case GLSL_TYPE_SAMPLER:
505 /* Samplers take up no register space, since they're baked in at
506 * link time.
507 */
508 return 0;
509 case GLSL_TYPE_ATOMIC_UINT:
510 return 0;
511 case GLSL_TYPE_VOID:
512 case GLSL_TYPE_ERROR:
513 case GLSL_TYPE_INTERFACE:
514 assert(!"not reached");
515 break;
516 }
517
518 return 0;
519 }
520
521 fs_reg
522 fs_visitor::get_timestamp()
523 {
524 assert(brw->gen >= 7);
525
526 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
527 BRW_ARF_TIMESTAMP,
528 0),
529 BRW_REGISTER_TYPE_UD));
530
531 fs_reg dst = fs_reg(this, glsl_type::uint_type);
532
533 fs_inst *mov = emit(MOV(dst, ts));
534 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
535 * even if it's not enabled in the dispatch.
536 */
537 mov->force_writemask_all = true;
538 mov->force_uncompressed = true;
539
540 /* The caller wants the low 32 bits of the timestamp. Since it's running
541          * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
542 * which is plenty of time for our purposes. It is identical across the
543 * EUs, but since it's tracking GPU core speed it will increment at a
544 * varying rate as render P-states change.
545 *
546 * The caller could also check if render P-states have changed (or anything
547 * else that might disrupt timing) by setting smear to 2 and checking if
548 * that field is != 0.
549 */
550 dst.smear = 0;
551
552 return dst;
553 }
554
555 void
556 fs_visitor::emit_shader_time_begin()
557 {
558 current_annotation = "shader time start";
559 shader_start_time = get_timestamp();
560 }
561
562 void
563 fs_visitor::emit_shader_time_end()
564 {
565 current_annotation = "shader time end";
566
567 enum shader_time_shader_type type, written_type, reset_type;
568 if (dispatch_width == 8) {
569 type = ST_FS8;
570 written_type = ST_FS8_WRITTEN;
571 reset_type = ST_FS8_RESET;
572 } else {
573 assert(dispatch_width == 16);
574 type = ST_FS16;
575 written_type = ST_FS16_WRITTEN;
576 reset_type = ST_FS16_RESET;
577 }
578
579 fs_reg shader_end_time = get_timestamp();
580
581 /* Check that there weren't any timestamp reset events (assuming these
582 * were the only two timestamp reads that happened).
583 */
584 fs_reg reset = shader_end_time;
585 reset.smear = 2;
586 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
587 test->conditional_mod = BRW_CONDITIONAL_Z;
588 emit(IF(BRW_PREDICATE_NORMAL));
589
590 push_force_uncompressed();
591 fs_reg start = shader_start_time;
592 start.negate = true;
593 fs_reg diff = fs_reg(this, glsl_type::uint_type);
594 emit(ADD(diff, start, shader_end_time));
595
596 /* If there were no instructions between the two timestamp gets, the diff
597 * is 2 cycles. Remove that overhead, so I can forget about that when
598 * trying to determine the time taken for single instructions.
599 */
600 emit(ADD(diff, diff, fs_reg(-2u)));
601
602 emit_shader_time_write(type, diff);
603 emit_shader_time_write(written_type, fs_reg(1u));
604 emit(BRW_OPCODE_ELSE);
605 emit_shader_time_write(reset_type, fs_reg(1u));
606 emit(BRW_OPCODE_ENDIF);
607
608 pop_force_uncompressed();
609 }
610
611 void
612 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
613 fs_reg value)
614 {
615 int shader_time_index =
616 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
617 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
618
619 fs_reg payload;
620 if (dispatch_width == 8)
621 payload = fs_reg(this, glsl_type::uvec2_type);
622 else
623 payload = fs_reg(this, glsl_type::uint_type);
624
625 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
626 fs_reg(), payload, offset, value));
627 }
628
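/**
 * Mark the compile as failed, recording a formatted message for the caller.
 * Only the first failure message is kept; with INTEL_DEBUG=wm it is also
 * printed to stderr.
 */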
629 void
630 fs_visitor::fail(const char *format, ...)
631 {
632 va_list va;
633 char *msg;
634
635 if (failed)
636 return;
637
638 failed = true;
639
640 va_start(va, format);
641 msg = ralloc_vasprintf(mem_ctx, format, va);
642 va_end(va);
643 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
644
645 this->fail_msg = msg;
646
647 if (INTEL_DEBUG & DEBUG_WM) {
648 fprintf(stderr, "%s", msg);
649 }
650 }
651
652 fs_inst *
653 fs_visitor::emit(enum opcode opcode)
654 {
655 return emit(fs_inst(opcode));
656 }
657
658 fs_inst *
659 fs_visitor::emit(enum opcode opcode, fs_reg dst)
660 {
661 return emit(fs_inst(opcode, dst));
662 }
663
664 fs_inst *
665 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
666 {
667 return emit(fs_inst(opcode, dst, src0));
668 }
669
670 fs_inst *
671 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
672 {
673 return emit(fs_inst(opcode, dst, src0, src1));
674 }
675
676 fs_inst *
677 fs_visitor::emit(enum opcode opcode, fs_reg dst,
678 fs_reg src0, fs_reg src1, fs_reg src2)
679 {
680 return emit(fs_inst(opcode, dst, src0, src1, src2));
681 }
682
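/* While force_uncompressed_stack is non-zero, instructions emitted through
 * the visitor get force_uncompressed set, so they execute SIMD8 even in a
 * SIMD16 shader.  emit_shader_time_end() above uses this around its
 * timestamp arithmetic.
 */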
683 void
684 fs_visitor::push_force_uncompressed()
685 {
686 force_uncompressed_stack++;
687 }
688
689 void
690 fs_visitor::pop_force_uncompressed()
691 {
692 force_uncompressed_stack--;
693 assert(force_uncompressed_stack >= 0);
694 }
695
696 /**
697 * Returns true if the instruction has a flag that means it won't
698 * update an entire destination register.
699 *
700 * For example, dead code elimination and live variable analysis want to know
701 * when a write to a variable screens off any preceding values that were in
702 * it.
703 */
704 bool
705 fs_inst::is_partial_write()
706 {
707 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
708 this->force_uncompressed ||
709 this->force_sechalf);
710 }
711
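/**
 * Returns the number of virtual GRF registers read by source \p arg.
 *
 * Most sources read one register.  Texture messages sent from the GRF read
 * their whole mlen-register payload through src[0]; in SIMD16 each virtual
 * GRF spans two hardware registers, so that payload covers (mlen + 1) / 2
 * virtual registers.
 */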
712 int
713 fs_inst::regs_read(fs_visitor *v, int arg)
714 {
715 if (is_tex() && arg == 0 && src[0].file == GRF) {
716 if (v->dispatch_width == 16)
717 return (mlen + 1) / 2;
718 else
719 return mlen;
720 }
721 return 1;
722 }
723
724 bool
725 fs_inst::reads_flag()
726 {
727 return predicate;
728 }
729
730 bool
731 fs_inst::writes_flag()
732 {
733 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
734 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
735 }
736
737 /**
738 * Returns how many MRFs an FS opcode will write over.
739 *
740  * Note that this is not just the 0 or 1 implied writes of the actual gen
741  * instruction -- the FS opcodes often generate additional MOVs to MRFs.
742 */
743 int
744 fs_visitor::implied_mrf_writes(fs_inst *inst)
745 {
746 if (inst->mlen == 0)
747 return 0;
748
749 if (inst->base_mrf == -1)
750 return 0;
751
752 switch (inst->opcode) {
753 case SHADER_OPCODE_RCP:
754 case SHADER_OPCODE_RSQ:
755 case SHADER_OPCODE_SQRT:
756 case SHADER_OPCODE_EXP2:
757 case SHADER_OPCODE_LOG2:
758 case SHADER_OPCODE_SIN:
759 case SHADER_OPCODE_COS:
760 return 1 * dispatch_width / 8;
761 case SHADER_OPCODE_POW:
762 case SHADER_OPCODE_INT_QUOTIENT:
763 case SHADER_OPCODE_INT_REMAINDER:
764 return 2 * dispatch_width / 8;
765 case SHADER_OPCODE_TEX:
766 case FS_OPCODE_TXB:
767 case SHADER_OPCODE_TXD:
768 case SHADER_OPCODE_TXF:
769 case SHADER_OPCODE_TXF_MS:
770 case SHADER_OPCODE_TXF_MCS:
771 case SHADER_OPCODE_TG4:
772 case SHADER_OPCODE_TG4_OFFSET:
773 case SHADER_OPCODE_TXL:
774 case SHADER_OPCODE_TXS:
775 case SHADER_OPCODE_LOD:
776 return 1;
777 case FS_OPCODE_FB_WRITE:
778 return 2;
779 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
780 case SHADER_OPCODE_GEN4_SCRATCH_READ:
781 return 1;
782 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
783 return inst->mlen;
784 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
785 return 2;
786 case SHADER_OPCODE_UNTYPED_ATOMIC:
787 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
788 return 0;
789 default:
790 assert(!"not reached");
791 return inst->mlen;
792 }
793 }
794
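/**
 * Allocate a new virtual GRF of \p size contiguous registers and return its
 * index, growing (doubling) the virtual_grf_sizes array as needed.
 */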
795 int
796 fs_visitor::virtual_grf_alloc(int size)
797 {
798 if (virtual_grf_array_size <= virtual_grf_count) {
799 if (virtual_grf_array_size == 0)
800 virtual_grf_array_size = 16;
801 else
802 virtual_grf_array_size *= 2;
803 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
804 virtual_grf_array_size);
805 }
806 virtual_grf_sizes[virtual_grf_count] = size;
807 return virtual_grf_count++;
808 }
809
810 /** Register file/index constructor, defaulting to float type. */
811 fs_reg::fs_reg(enum register_file file, int reg)
812 {
813 init();
814 this->file = file;
815 this->reg = reg;
816 this->type = BRW_REGISTER_TYPE_F;
817 }
818
819 /** Register file/index constructor with an explicit register type. */
820 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
821 {
822 init();
823 this->file = file;
824 this->reg = reg;
825 this->type = type;
826 }
827
828 /** Automatic reg constructor. */
829 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
830 {
831 init();
832
833 this->file = GRF;
834 this->reg = v->virtual_grf_alloc(v->type_size(type));
835 this->reg_offset = 0;
836 this->type = brw_type_for_base_type(type);
837 }
838
839 fs_reg *
840 fs_visitor::variable_storage(ir_variable *var)
841 {
842 return (fs_reg *)hash_table_find(this->variable_ht, var);
843 }
844
845 void
846 import_uniforms_callback(const void *key,
847 void *data,
848 void *closure)
849 {
850 struct hash_table *dst_ht = (struct hash_table *)closure;
851 const fs_reg *reg = (const fs_reg *)data;
852
853 if (reg->file != UNIFORM)
854 return;
855
856 hash_table_insert(dst_ht, data, key);
857 }
858
859 /* For 16-wide, we need to follow the uniform setup done by the 8-wide dispatch
860  * compile.  This brings in those uniform definitions.
861  */
862 void
863 fs_visitor::import_uniforms(fs_visitor *v)
864 {
865 hash_table_call_foreach(v->variable_ht,
866 import_uniforms_callback,
867 variable_ht);
868 this->params_remap = v->params_remap;
869 this->nr_params_remap = v->nr_params_remap;
870 }
871
872 /* Our support for uniforms is piggy-backed on the struct
873 * gl_fragment_program, because that's where the values actually
874 * get stored, rather than in some global gl_shader_program uniform
875 * store.
876 */
877 void
878 fs_visitor::setup_uniform_values(ir_variable *ir)
879 {
880 int namelen = strlen(ir->name);
881
882 /* The data for our (non-builtin) uniforms is stored in a series of
883 * gl_uniform_driver_storage structs for each subcomponent that
884 * glGetUniformLocation() could name. We know it's been set up in the same
885 * order we'd walk the type, so walk the list of storage and find anything
886 * with our name, or the prefix of a component that starts with our name.
887 */
888 unsigned params_before = c->prog_data.nr_params;
889 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
890 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
891
892 if (strncmp(ir->name, storage->name, namelen) != 0 ||
893 (storage->name[namelen] != 0 &&
894 storage->name[namelen] != '.' &&
895 storage->name[namelen] != '[')) {
896 continue;
897 }
898
899 unsigned slots = storage->type->component_slots();
900 if (storage->array_elements)
901 slots *= storage->array_elements;
902
903 for (unsigned i = 0; i < slots; i++) {
904 c->prog_data.param[c->prog_data.nr_params++] =
905 &storage->storage[i].f;
906 }
907 }
908
909 /* Make sure we actually initialized the right amount of stuff here. */
910 assert(params_before + ir->type->component_slots() ==
911 c->prog_data.nr_params);
912 (void)params_before;
913 }
914
915
916 /* Our support for builtin uniforms is even scarier than non-builtin.
917 * It sits on top of the PROG_STATE_VAR parameters that are
918 * automatically updated from GL context state.
919 */
920 void
921 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
922 {
923 const ir_state_slot *const slots = ir->state_slots;
924 assert(ir->state_slots != NULL);
925
926 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
927 /* This state reference has already been setup by ir_to_mesa, but we'll
928 * get the same index back here.
929 */
930 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
931 (gl_state_index *)slots[i].tokens);
932
933 /* Add each of the unique swizzles of the element as a parameter.
934 * This'll end up matching the expected layout of the
935 * array/matrix/structure we're trying to fill in.
936 */
937 int last_swiz = -1;
938 for (unsigned int j = 0; j < 4; j++) {
939 int swiz = GET_SWZ(slots[i].swizzle, j);
940 if (swiz == last_swiz)
941 break;
942 last_swiz = swiz;
943
944 c->prog_data.param[c->prog_data.nr_params++] =
945 &fp->Base.Parameters->ParameterValues[index][swiz].f;
946 }
947 }
948 }
949
950 fs_reg *
951 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
952 {
953 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
954 fs_reg wpos = *reg;
955 bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
956
957 /* gl_FragCoord.x */
958 if (ir->data.pixel_center_integer) {
959 emit(MOV(wpos, this->pixel_x));
960 } else {
961 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
962 }
963 wpos.reg_offset++;
964
965 /* gl_FragCoord.y */
966 if (!flip && ir->data.pixel_center_integer) {
967 emit(MOV(wpos, this->pixel_y));
968 } else {
969 fs_reg pixel_y = this->pixel_y;
970 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
971
972 if (flip) {
973 pixel_y.negate = true;
974 offset += c->key.drawable_height - 1.0;
975 }
976
977 emit(ADD(wpos, pixel_y, fs_reg(offset)));
978 }
979 wpos.reg_offset++;
980
981 /* gl_FragCoord.z */
982 if (brw->gen >= 6) {
983 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
984 } else {
985 emit(FS_OPCODE_LINTERP, wpos,
986 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
987 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
988 interp_reg(VARYING_SLOT_POS, 2));
989 }
990 wpos.reg_offset++;
991
992 /* gl_FragCoord.w: Already set up in emit_interpolation */
993 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
994
995 return reg;
996 }
997
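/**
 * Emit an FS_OPCODE_LINTERP for \p attr using the barycentric coordinates
 * matching the interpolation qualifier and centroid-ness.  Before gen6 only
 * perspective-correct pixel barycentrics exist, so everything maps to
 * BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC there.
 */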
998 fs_inst *
999 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1000 glsl_interp_qualifier interpolation_mode,
1001 bool is_centroid)
1002 {
1003 brw_wm_barycentric_interp_mode barycoord_mode;
1004 if (brw->gen >= 6) {
1005 if (is_centroid) {
1006 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1007 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1008 else
1009 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1010 } else {
1011 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1012 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1013 else
1014 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1015 }
1016 } else {
1017 /* On Ironlake and below, there is only one interpolation mode.
1018 * Centroid interpolation doesn't mean anything on this hardware --
1019 * there is no multisampling.
1020 */
1021 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1022 }
1023 return emit(FS_OPCODE_LINTERP, attr,
1024 this->delta_x[barycoord_mode],
1025 this->delta_y[barycoord_mode], interp);
1026 }
1027
1028 fs_reg *
1029 fs_visitor::emit_general_interpolation(ir_variable *ir)
1030 {
1031 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1032 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1033 fs_reg attr = *reg;
1034
1035 unsigned int array_elements;
1036 const glsl_type *type;
1037
1038 if (ir->type->is_array()) {
1039 array_elements = ir->type->length;
1040 if (array_elements == 0) {
1041 fail("dereferenced array '%s' has length 0\n", ir->name);
1042 }
1043 type = ir->type->fields.array;
1044 } else {
1045 array_elements = 1;
1046 type = ir->type;
1047 }
1048
1049 glsl_interp_qualifier interpolation_mode =
1050 ir->determine_interpolation_mode(c->key.flat_shade);
1051
1052 int location = ir->data.location;
1053 for (unsigned int i = 0; i < array_elements; i++) {
1054 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1055 if (c->prog_data.urb_setup[location] == -1) {
1056 /* If there's no incoming setup data for this slot, don't
1057 * emit interpolation for it.
1058 */
1059 attr.reg_offset += type->vector_elements;
1060 location++;
1061 continue;
1062 }
1063
1064 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1065 /* Constant interpolation (flat shading) case. The SF has
1066 * handed us defined values in only the constant offset
1067 * field of the setup reg.
1068 */
1069 for (unsigned int k = 0; k < type->vector_elements; k++) {
1070 struct brw_reg interp = interp_reg(location, k);
1071 interp = suboffset(interp, 3);
1072 interp.type = reg->type;
1073 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1074 attr.reg_offset++;
1075 }
1076 } else {
1077 /* Smooth/noperspective interpolation case. */
1078 for (unsigned int k = 0; k < type->vector_elements; k++) {
1079 /* FINISHME: At some point we probably want to push
1080 * this farther by giving similar treatment to the
1081 * other potentially constant components of the
1082 * attribute, as well as making brw_vs_constval.c
1083 * handle varyings other than gl_TexCoord.
1084 */
1085 struct brw_reg interp = interp_reg(location, k);
1086 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1087 ir->data.centroid);
1088 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1089 /* Get the pixel/sample mask into f0 so that we know
1090 * which pixels are lit. Then, for each channel that is
1091 * unlit, replace the centroid data with non-centroid
1092 * data.
1093 */
1094 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1095 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1096 interpolation_mode, false);
1097 inst->predicate = BRW_PREDICATE_NORMAL;
1098 inst->predicate_inverse = true;
1099 }
1100 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1101 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1102 }
1103 attr.reg_offset++;
1104 }
1105
1106 }
1107 location++;
1108 }
1109 }
1110
1111 return reg;
1112 }
1113
1114 fs_reg *
1115 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1116 {
1117 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1118
1119 /* The frontfacing comes in as a bit in the thread payload. */
1120 if (brw->gen >= 6) {
1121 emit(BRW_OPCODE_ASR, *reg,
1122 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1123 fs_reg(15));
1124 emit(BRW_OPCODE_NOT, *reg, *reg);
1125 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1126 } else {
1127 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1128 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1129 * us front face
1130 */
1131 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1132 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1133 }
1134
1135 return reg;
1136 }
1137
1138 void
1139 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1140 {
1141 assert(dst.type == BRW_REGISTER_TYPE_F);
1142
1143 if (c->key.compute_pos_offset) {
1144 /* Convert int_sample_pos to floating point */
1145 emit(MOV(dst, int_sample_pos));
1146 /* Scale to the range [0, 1] */
1147 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1148 }
1149 else {
1150 /* From ARB_sample_shading specification:
1151 * "When rendering to a non-multisample buffer, or if multisample
1152 * rasterization is disabled, gl_SamplePosition will always be
1153       *   (0.5, 0.5)."
1154 */
1155 emit(MOV(dst, fs_reg(0.5f)));
1156 }
1157 }
1158
1159 fs_reg *
1160 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1161 {
1162 assert(brw->gen >= 6);
1163 assert(ir->type == glsl_type::vec2_type);
1164
1165 this->current_annotation = "compute sample position";
1166 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1167 fs_reg pos = *reg;
1168 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1169 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1170
1171 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1172 * mode will be enabled.
1173 *
1174 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1175 * R31.1:0 Position Offset X/Y for Slot[3:0]
1176 * R31.3:2 Position Offset X/Y for Slot[7:4]
1177 * .....
1178 *
1179 * The X, Y sample positions come in as bytes in thread payload. So, read
1180 * the positions using vstride=16, width=8, hstride=2.
1181 */
1182 struct brw_reg sample_pos_reg =
1183 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1184 BRW_REGISTER_TYPE_B), 16, 8, 2);
1185
1186 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1187 if (dispatch_width == 16) {
1188 int_sample_x.sechalf = true;
1189 fs_inst *inst = emit(MOV(int_sample_x,
1190 fs_reg(suboffset(sample_pos_reg, 16))));
1191 inst->force_sechalf = true;
1192 int_sample_x.sechalf = false;
1193 }
1194 /* Compute gl_SamplePosition.x */
1195 compute_sample_position(pos, int_sample_x);
1196 pos.reg_offset++;
1197 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1198 if (dispatch_width == 16) {
1199 int_sample_y.sechalf = true;
1200 fs_inst *inst = emit(MOV(int_sample_y,
1201 fs_reg(suboffset(sample_pos_reg, 17))));
1202 inst->force_sechalf = true;
1203 int_sample_y.sechalf = false;
1204 }
1205 /* Compute gl_SamplePosition.y */
1206 compute_sample_position(pos, int_sample_y);
1207 return reg;
1208 }
1209
1210 fs_reg *
1211 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1212 {
1213 assert(brw->gen >= 6);
1214
1215 this->current_annotation = "compute sample id";
1216 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1217
1218 if (c->key.compute_sample_id) {
1219 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1220 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1221 t2.type = BRW_REGISTER_TYPE_UW;
1222
1223 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1224 * 8x multisampling, subspan 0 will represent sample N (where N
1225 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1226 * 7. We can find the value of N by looking at R0.0 bits 7:6
1227 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1228 * (since samples are always delivered in pairs). That is, we
1229 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1230 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1231 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1232 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1233 * populating a temporary variable with the sequence (0, 1, 2, 3),
1234 * and then reading from it using vstride=1, width=4, hstride=0.
1235       * These computations also hold for 4x multisampling.
1236 */
1237 emit(BRW_OPCODE_AND, t1,
1238 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1239 fs_reg(brw_imm_d(0xc0)));
1240 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1241 /* This works for both SIMD8 and SIMD16 */
1242 emit(MOV(t2, brw_imm_v(0x3210)));
1243 /* This special instruction takes care of setting vstride=1,
1244 * width=4, hstride=0 of t2 during an ADD instruction.
1245 */
1246 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1247 } else {
1248 /* As per GL_ARB_sample_shading specification:
1249 * "When rendering to a non-multisample buffer, or if multisample
1250 * rasterization is disabled, gl_SampleID will always be zero."
1251 */
1252 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1253 }
1254
1255 return reg;
1256 }
1257
1258 fs_reg *
1259 fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
1260 {
1261 assert(brw->gen >= 7);
1262 this->current_annotation = "compute gl_SampleMaskIn";
1263 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1264 emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
1265 return reg;
1266 }
1267
1268 fs_reg
1269 fs_visitor::fix_math_operand(fs_reg src)
1270 {
1271 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1272 * might be able to do better by doing execsize = 1 math and then
1273 * expanding that result out, but we would need to be careful with
1274 * masking.
1275 *
1276 * The hardware ignores source modifiers (negate and abs) on math
1277 * instructions, so we also move to a temp to set those up.
1278 */
1279 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1280 !src.abs && !src.negate)
1281 return src;
1282
1283 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1284 * operands to math
1285 */
1286 if (brw->gen >= 7 && src.file != IMM)
1287 return src;
1288
1289 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1290 expanded.type = src.type;
1291 emit(BRW_OPCODE_MOV, expanded, src);
1292 return expanded;
1293 }
1294
1295 fs_inst *
1296 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1297 {
1298 switch (opcode) {
1299 case SHADER_OPCODE_RCP:
1300 case SHADER_OPCODE_RSQ:
1301 case SHADER_OPCODE_SQRT:
1302 case SHADER_OPCODE_EXP2:
1303 case SHADER_OPCODE_LOG2:
1304 case SHADER_OPCODE_SIN:
1305 case SHADER_OPCODE_COS:
1306 break;
1307 default:
1308 assert(!"not reached: bad math opcode");
1309 return NULL;
1310 }
1311
1312 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1313 * might be able to do better by doing execsize = 1 math and then
1314 * expanding that result out, but we would need to be careful with
1315 * masking.
1316 *
1317 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1318 * instructions, so we also move to a temp to set those up.
1319 */
1320 if (brw->gen >= 6)
1321 src = fix_math_operand(src);
1322
1323 fs_inst *inst = emit(opcode, dst, src);
1324
1325 if (brw->gen < 6) {
1326 inst->base_mrf = 2;
1327 inst->mlen = dispatch_width / 8;
1328 }
1329
1330 return inst;
1331 }
1332
1333 fs_inst *
1334 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1335 {
1336 int base_mrf = 2;
1337 fs_inst *inst;
1338
1339 switch (opcode) {
1340 case SHADER_OPCODE_INT_QUOTIENT:
1341 case SHADER_OPCODE_INT_REMAINDER:
1342 if (brw->gen >= 7 && dispatch_width == 16)
1343 fail("16-wide INTDIV unsupported\n");
1344 break;
1345 case SHADER_OPCODE_POW:
1346 break;
1347 default:
1348 assert(!"not reached: unsupported binary math opcode.");
1349 return NULL;
1350 }
1351
1352 if (brw->gen >= 6) {
1353 src0 = fix_math_operand(src0);
1354 src1 = fix_math_operand(src1);
1355
1356 inst = emit(opcode, dst, src0, src1);
1357 } else {
1358 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1359 * "Message Payload":
1360 *
1361 * "Operand0[7]. For the INT DIV functions, this operand is the
1362 * denominator."
1363 * ...
1364 * "Operand1[7]. For the INT DIV functions, this operand is the
1365 * numerator."
1366 */
1367 bool is_int_div = opcode != SHADER_OPCODE_POW;
1368 fs_reg &op0 = is_int_div ? src1 : src0;
1369 fs_reg &op1 = is_int_div ? src0 : src1;
1370
1371 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1372 inst = emit(opcode, dst, op0, reg_null_f);
1373
1374 inst->base_mrf = base_mrf;
1375 inst->mlen = 2 * dispatch_width / 8;
1376 }
1377 return inst;
1378 }
1379
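/**
 * Map UNIFORM-file registers onto the push constant (CURB) payload:
 * computes curb_read_length and rewrites every UNIFORM source as a HW_REG
 * access into the payload registers following nr_payload_regs.
 */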
1380 void
1381 fs_visitor::assign_curb_setup()
1382 {
1383 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1384 if (dispatch_width == 8) {
1385 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1386 } else {
1387 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1388 }
1389
1390 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1391 foreach_list(node, &this->instructions) {
1392 fs_inst *inst = (fs_inst *)node;
1393
1394 for (unsigned int i = 0; i < 3; i++) {
1395 if (inst->src[i].file == UNIFORM) {
1396 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1397 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1398 constant_nr / 8,
1399 constant_nr % 8);
1400
1401 inst->src[i].file = HW_REG;
1402 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1403 }
1404 }
1405 }
1406 }
1407
1408 void
1409 fs_visitor::calculate_urb_setup()
1410 {
1411 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1412 c->prog_data.urb_setup[i] = -1;
1413 }
1414
1415 int urb_next = 0;
1416 /* Figure out where each of the incoming setup attributes lands. */
1417 if (brw->gen >= 6) {
1418 if (_mesa_bitcount_64(fp->Base.InputsRead &
1419 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1420 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1421 * first 16 varying inputs, so we can put them wherever we want.
1422 * Just put them in order.
1423 *
1424 * This is useful because it means that (a) inputs not used by the
1425 * fragment shader won't take up valuable register space, and (b) we
1426 * won't have to recompile the fragment shader if it gets paired with
1427 * a different vertex (or geometry) shader.
1428 */
1429 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1430 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1431 BITFIELD64_BIT(i)) {
1432 c->prog_data.urb_setup[i] = urb_next++;
1433 }
1434 }
1435 } else {
1436 /* We have enough input varyings that the SF/SBE pipeline stage can't
1437 * arbitrarily rearrange them to suit our whim; we have to put them
1438 * in an order that matches the output of the previous pipeline stage
1439 * (geometry or vertex shader).
1440 */
1441 struct brw_vue_map prev_stage_vue_map;
1442 brw_compute_vue_map(brw, &prev_stage_vue_map,
1443 c->key.input_slots_valid);
1444 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1445 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1446 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1447 slot++) {
1448 int varying = prev_stage_vue_map.slot_to_varying[slot];
1449 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1450 * unused.
1451 */
1452 if (varying != BRW_VARYING_SLOT_COUNT &&
1453 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1454 BITFIELD64_BIT(varying))) {
1455 c->prog_data.urb_setup[varying] = slot - first_slot;
1456 }
1457 }
1458 urb_next = prev_stage_vue_map.num_slots - first_slot;
1459 }
1460 } else {
1461 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1462 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1463 /* Point size is packed into the header, not as a general attribute */
1464 if (i == VARYING_SLOT_PSIZ)
1465 continue;
1466
1467 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1468 /* The back color slot is skipped when the front color is
1469 * also written to. In addition, some slots can be
1470 * written in the vertex shader and not read in the
1471 * fragment shader. So the register number must always be
1472 * incremented, mapped or not.
1473 */
1474 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1475 c->prog_data.urb_setup[i] = urb_next;
1476 urb_next++;
1477 }
1478 }
1479
1480 /*
1481     * It's an FS-only attribute, and we did interpolation for this attribute
1482     * in the SF thread.  So, count it here, too.
1483 *
1484 * See compile_sf_prog() for more info.
1485 */
1486 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1487 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1488 }
1489
1490 c->prog_data.num_varying_inputs = urb_next;
1491 }
1492
1493 void
1494 fs_visitor::assign_urb_setup()
1495 {
1496 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1497
1498 /* Offset all the urb_setup[] index by the actual position of the
1499 * setup regs, now that the location of the constants has been chosen.
1500 */
1501 foreach_list(node, &this->instructions) {
1502 fs_inst *inst = (fs_inst *)node;
1503
1504 if (inst->opcode == FS_OPCODE_LINTERP) {
1505 assert(inst->src[2].file == HW_REG);
1506 inst->src[2].fixed_hw_reg.nr += urb_start;
1507 }
1508
1509 if (inst->opcode == FS_OPCODE_CINTERP) {
1510 assert(inst->src[0].file == HW_REG);
1511 inst->src[0].fixed_hw_reg.nr += urb_start;
1512 }
1513 }
1514
1515 /* Each attribute is 4 setup channels, each of which is half a reg. */
1516 this->first_non_payload_grf =
1517 urb_start + c->prog_data.num_varying_inputs * 2;
1518 }
1519
1520 /**
1521 * Split large virtual GRFs into separate components if we can.
1522 *
1523 * This is mostly duplicated with what brw_fs_vector_splitting does,
1524 * but that's really conservative because it's afraid of doing
1525 * splitting that doesn't result in real progress after the rest of
1526 * the optimization phases, which would cause infinite looping in
1527 * optimization. We can do it once here, safely. This also has the
1528 * opportunity to split interpolated values, or maybe even uniforms,
1529 * which we don't have at the IR level.
1530 *
1531 * We want to split, because virtual GRFs are what we register
1532 * allocate and spill (due to contiguousness requirements for some
1533 * instructions), and they're what we naturally generate in the
1534 * codegen process, but most virtual GRFs don't actually need to be
1535 * contiguous sets of GRFs. If we split, we'll end up with reduced
1536 * live intervals and better dead code elimination and coalescing.
1537 */
1538 void
1539 fs_visitor::split_virtual_grfs()
1540 {
1541 int num_vars = this->virtual_grf_count;
1542 bool split_grf[num_vars];
1543 int new_virtual_grf[num_vars];
1544
1545 /* Try to split anything > 0 sized. */
1546 for (int i = 0; i < num_vars; i++) {
1547 if (this->virtual_grf_sizes[i] != 1)
1548 split_grf[i] = true;
1549 else
1550 split_grf[i] = false;
1551 }
1552
1553 if (brw->has_pln &&
1554 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1555 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1556 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1557 * Gen6, that was the only supported interpolation mode, and since Gen6,
1558 * delta_x and delta_y are in fixed hardware registers.
1559 */
1560 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1561 false;
1562 }
1563
1564 foreach_list(node, &this->instructions) {
1565 fs_inst *inst = (fs_inst *)node;
1566
1567 /* If there's a SEND message that requires contiguous destination
1568 * registers, no splitting is allowed.
1569 */
1570 if (inst->regs_written > 1) {
1571 split_grf[inst->dst.reg] = false;
1572 }
1573
1574 /* If we're sending from a GRF, don't split it, on the assumption that
1575 * the send is reading the whole thing.
1576 */
1577 if (inst->is_send_from_grf()) {
1578 for (int i = 0; i < 3; i++) {
1579 if (inst->src[i].file == GRF) {
1580 split_grf[inst->src[i].reg] = false;
1581 }
1582 }
1583 }
1584 }
1585
1586 /* Allocate new space for split regs. Note that the virtual
1587 * numbers will be contiguous.
1588 */
1589 for (int i = 0; i < num_vars; i++) {
1590 if (split_grf[i]) {
1591 new_virtual_grf[i] = virtual_grf_alloc(1);
1592 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1593 int reg = virtual_grf_alloc(1);
1594 assert(reg == new_virtual_grf[i] + j - 1);
1595 (void) reg;
1596 }
1597 this->virtual_grf_sizes[i] = 1;
1598 }
1599 }
1600
1601 foreach_list(node, &this->instructions) {
1602 fs_inst *inst = (fs_inst *)node;
1603
1604 if (inst->dst.file == GRF &&
1605 split_grf[inst->dst.reg] &&
1606 inst->dst.reg_offset != 0) {
1607 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1608 inst->dst.reg_offset - 1);
1609 inst->dst.reg_offset = 0;
1610 }
1611 for (int i = 0; i < 3; i++) {
1612 if (inst->src[i].file == GRF &&
1613 split_grf[inst->src[i].reg] &&
1614 inst->src[i].reg_offset != 0) {
1615 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1616 inst->src[i].reg_offset - 1);
1617 inst->src[i].reg_offset = 0;
1618 }
1619 }
1620 }
1621 invalidate_live_intervals();
1622 }
1623
1624 /**
1625 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1626 *
1627 * During code generation, we create tons of temporary variables, many of
1628 * which get immediately killed and are never used again. Yet, in later
1629 * optimization and analysis passes, such as compute_live_intervals, we need
1630 * to loop over all the virtual GRFs. Compacting them can save a lot of
1631 * overhead.
1632 */
1633 void
1634 fs_visitor::compact_virtual_grfs()
1635 {
1636 /* Mark which virtual GRFs are used, and count how many. */
1637 int remap_table[this->virtual_grf_count];
1638 memset(remap_table, -1, sizeof(remap_table));
1639
1640 foreach_list(node, &this->instructions) {
1641 const fs_inst *inst = (const fs_inst *) node;
1642
1643 if (inst->dst.file == GRF)
1644 remap_table[inst->dst.reg] = 0;
1645
1646 for (int i = 0; i < 3; i++) {
1647 if (inst->src[i].file == GRF)
1648 remap_table[inst->src[i].reg] = 0;
1649 }
1650 }
1651
1652 /* In addition to registers used in instructions, fs_visitor keeps
1653 * direct references to certain special values which must be patched:
1654 */
1655 fs_reg *special[] = {
1656 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1657 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1658 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1659 &delta_x[0], &delta_x[1], &delta_x[2],
1660 &delta_x[3], &delta_x[4], &delta_x[5],
1661 &delta_y[0], &delta_y[1], &delta_y[2],
1662 &delta_y[3], &delta_y[4], &delta_y[5],
1663 };
1664 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1665 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1666
1667 /* Treat all special values as used, to be conservative */
1668 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1669 if (special[i]->file == GRF)
1670 remap_table[special[i]->reg] = 0;
1671 }
1672
1673 /* Compact the GRF arrays. */
1674 int new_index = 0;
1675 for (int i = 0; i < this->virtual_grf_count; i++) {
1676 if (remap_table[i] != -1) {
1677 remap_table[i] = new_index;
1678 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1679 invalidate_live_intervals();
1680 ++new_index;
1681 }
1682 }
1683
1684 this->virtual_grf_count = new_index;
1685
1686 /* Patch all the instructions to use the newly renumbered registers */
1687 foreach_list(node, &this->instructions) {
1688 fs_inst *inst = (fs_inst *) node;
1689
1690 if (inst->dst.file == GRF)
1691 inst->dst.reg = remap_table[inst->dst.reg];
1692
1693 for (int i = 0; i < 3; i++) {
1694 if (inst->src[i].file == GRF)
1695 inst->src[i].reg = remap_table[inst->src[i].reg];
1696 }
1697 }
1698
1699 /* Patch all the references to special values */
1700 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1701 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1702 special[i]->reg = remap_table[special[i]->reg];
1703 }
1704 }
1705
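/**
 * Drop push constant (UNIFORM) params that no instruction reads anymore.
 *
 * The 8-wide compile builds params_remap by scanning UNIFORM sources and
 * compacts c->prog_data.param accordingly; the 16-wide compile reuses the
 * same remap table.  All UNIFORM sources are then renumbered to the
 * compacted locations.
 */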
1706 bool
1707 fs_visitor::remove_dead_constants()
1708 {
1709 if (dispatch_width == 8) {
1710 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1711 this->nr_params_remap = c->prog_data.nr_params;
1712
1713 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1714 this->params_remap[i] = -1;
1715
1716 /* Find which params are still in use. */
1717 foreach_list(node, &this->instructions) {
1718 fs_inst *inst = (fs_inst *)node;
1719
1720 for (int i = 0; i < 3; i++) {
1721 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1722
1723 if (inst->src[i].file != UNIFORM)
1724 continue;
1725
1726 /* Section 5.11 of the OpenGL 4.3 spec says:
1727 *
1728 * "Out-of-bounds reads return undefined values, which include
1729 * values from other variables of the active program or zero."
1730 */
1731 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1732 constant_nr = 0;
1733 }
1734
1735 /* For now, set this to non-negative. We'll give it the
1736 * actual new number in a moment, in order to keep the
1737 * register numbers nicely ordered.
1738 */
1739 this->params_remap[constant_nr] = 0;
1740 }
1741 }
1742
1743 /* Figure out what the new numbers for the params will be. At some
1744 * point when we're doing uniform array access, we're going to want
1745 * to keep the distinction between .reg and .reg_offset, but for
1746 * now we don't care.
1747 */
1748 unsigned int new_nr_params = 0;
1749 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1750 if (this->params_remap[i] != -1) {
1751 this->params_remap[i] = new_nr_params++;
1752 }
1753 }
1754
1755 /* Update the list of params to be uploaded to match our new numbering. */
1756 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1757 int remapped = this->params_remap[i];
1758
1759 if (remapped == -1)
1760 continue;
1761
1762 c->prog_data.param[remapped] = c->prog_data.param[i];
1763 }
1764
1765 c->prog_data.nr_params = new_nr_params;
1766 } else {
1767 /* This should have been generated in the 8-wide pass already. */
1768 assert(this->params_remap);
1769 }
1770
1771 /* Now do the renumbering of the shader to remove unused params. */
1772 foreach_list(node, &this->instructions) {
1773 fs_inst *inst = (fs_inst *)node;
1774
1775 for (int i = 0; i < 3; i++) {
1776 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1777
1778 if (inst->src[i].file != UNIFORM)
1779 continue;
1780
1781 /* as above alias to 0 */
1782 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1783 constant_nr = 0;
1784 }
1785 assert(this->params_remap[constant_nr] != -1);
1786 inst->src[i].reg = this->params_remap[constant_nr];
1787 inst->src[i].reg_offset = 0;
1788 }
1789 }
1790
1791 return true;
1792 }
1793
1794 /*
1795 * Implements array access of uniforms by inserting a
1796 * PULL_CONSTANT_LOAD instruction.
1797 *
1798 * Unlike temporary GRF array access (where we don't support it due to
1799 * the difficulty of doing relative addressing on instruction
1800 * destinations), we could potentially do array access of uniforms
1801 * that were loaded in GRF space as push constants. In real-world
1802 * usage we've seen, though, the arrays being used are always larger
1803 * than we could load as push constants, so just always move all
1804 * uniform array access out to a pull constant buffer.
1805 */
1806 void
1807 fs_visitor::move_uniform_array_access_to_pull_constants()
1808 {
1809 int pull_constant_loc[c->prog_data.nr_params];
1810
1811 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1812 pull_constant_loc[i] = -1;
1813 }
1814
1815 /* Walk through and find array access of uniforms. Put a copy of that
1816 * uniform in the pull constant buffer.
1817 *
1818 * Note that we don't move constant-indexed accesses to arrays. No
1819 * testing has been done of the performance impact of this choice.
1820 */
1821 foreach_list_safe(node, &this->instructions) {
1822 fs_inst *inst = (fs_inst *)node;
1823
1824 for (int i = 0 ; i < 3; i++) {
1825 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1826 continue;
1827
1828 int uniform = inst->src[i].reg;
1829
1830 /* If this array isn't already present in the pull constant buffer,
1831 * add it.
1832 */
1833 if (pull_constant_loc[uniform] == -1) {
1834 const float **values = &c->prog_data.param[uniform];
1835
1836 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1837
1838 assert(param_size[uniform]);
1839
1840 for (int j = 0; j < param_size[uniform]; j++) {
1841 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1842 values[j];
1843 }
1844 }
1845
1846 /* Set up the annotation tracking for new generated instructions. */
1847 base_ir = inst->ir;
1848 current_annotation = inst->annotation;
1849
1850 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1851 fs_reg temp = fs_reg(this, glsl_type::float_type);
1852 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1853 surf_index,
1854 *inst->src[i].reladdr,
1855 pull_constant_loc[uniform] +
1856 inst->src[i].reg_offset);
1857 inst->insert_before(&list);
1858
1859 inst->src[i].file = temp.file;
1860 inst->src[i].reg = temp.reg;
1861 inst->src[i].reg_offset = temp.reg_offset;
1862 inst->src[i].reladdr = NULL;
1863 }
1864 }
1865 }
1866
1867 /**
1868 * Choose accesses from the UNIFORM file to demote to using the pull
1869 * constant buffer.
1870 *
1871 * We allow a fragment shader to have more than the specified minimum
1872 * maximum number of fragment shader uniform components (64). If
1873 * there are too many of these, they'd fill up all of register space.
1874 * So, this will push some of them out to the pull constant buffer and
1875 * update the program to load them.
1876 */
1877 void
1878 fs_visitor::setup_pull_constants()
1879 {
1880 /* Only allow 16 registers (128 uniform components) as push constants. */
1881 unsigned int max_uniform_components = 16 * 8;
1882 if (c->prog_data.nr_params <= max_uniform_components)
1883 return;
1884
1885 if (dispatch_width == 16) {
1886 fail("Pull constants not supported in 16-wide\n");
1887 return;
1888 }
1889
1890 /* Just demote the end of the list. We could probably do better
1891 * here, demoting things that are rarely used in the program first.
1892 */
1893 unsigned int pull_uniform_base = max_uniform_components;
1894
1895 int pull_constant_loc[c->prog_data.nr_params];
1896 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1897 if (i < pull_uniform_base) {
1898 pull_constant_loc[i] = -1;
1899 } else {
1900 pull_constant_loc[i] = -1;
1901 /* If our constant is already being uploaded for reladdr purposes,
1902 * reuse it.
1903 */
1904 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1905 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1906 pull_constant_loc[i] = j;
1907 break;
1908 }
1909 }
1910 if (pull_constant_loc[i] == -1) {
1911 int pull_index = c->prog_data.nr_pull_params++;
1912 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1913             pull_constant_loc[i] = pull_index;
1914 }
1915 }
1916 }
1917 c->prog_data.nr_params = pull_uniform_base;
1918
1919 foreach_list(node, &this->instructions) {
1920 fs_inst *inst = (fs_inst *)node;
1921
1922 for (int i = 0; i < 3; i++) {
1923 if (inst->src[i].file != UNIFORM)
1924 continue;
1925
1926 int pull_index = pull_constant_loc[inst->src[i].reg +
1927 inst->src[i].reg_offset];
1928 if (pull_index == -1)
1929 continue;
1930
1931 assert(!inst->src[i].reladdr);
1932
1933 fs_reg dst = fs_reg(this, glsl_type::float_type);
1934 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1935 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1936 fs_inst *pull =
1937 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1938 dst, index, offset);
1939 pull->ir = inst->ir;
1940 pull->annotation = inst->annotation;
1941
1942 inst->insert_before(pull);
1943
1944 inst->src[i].file = GRF;
1945 inst->src[i].reg = dst.reg;
1946 inst->src[i].reg_offset = 0;
1947 inst->src[i].smear = pull_index & 3;
1948 }
1949 }
1950 }
1951
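/**
 * Applies simple algebraic simplifications to the IR: a * 1.0 -> a,
 * a * 0.0 -> 0.0, a + 0.0 -> a, a | a -> a, and saturated SELs whose
 * immediate operand is made redundant by the saturate become MOVs.
 */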
1952 bool
1953 fs_visitor::opt_algebraic()
1954 {
1955 bool progress = false;
1956
1957 foreach_list(node, &this->instructions) {
1958 fs_inst *inst = (fs_inst *)node;
1959
1960 switch (inst->opcode) {
1961 case BRW_OPCODE_MUL:
1962 if (inst->src[1].file != IMM)
1963 continue;
1964
1965 /* a * 1.0 = a */
1966 if (inst->src[1].is_one()) {
1967 inst->opcode = BRW_OPCODE_MOV;
1968 inst->src[1] = reg_undef;
1969 progress = true;
1970 break;
1971 }
1972
1973 /* a * 0.0 = 0.0 */
1974 if (inst->src[1].is_zero()) {
1975 inst->opcode = BRW_OPCODE_MOV;
1976 inst->src[0] = inst->src[1];
1977 inst->src[1] = reg_undef;
1978 progress = true;
1979 break;
1980 }
1981
1982 break;
1983 case BRW_OPCODE_ADD:
1984 if (inst->src[1].file != IMM)
1985 continue;
1986
1987 /* a + 0.0 = a */
1988 if (inst->src[1].is_zero()) {
1989 inst->opcode = BRW_OPCODE_MOV;
1990 inst->src[1] = reg_undef;
1991 progress = true;
1992 break;
1993 }
1994 break;
1995 case BRW_OPCODE_OR:
1996 if (inst->src[0].equals(inst->src[1])) {
1997 inst->opcode = BRW_OPCODE_MOV;
1998 inst->src[1] = reg_undef;
1999 progress = true;
2000 break;
2001 }
2002 break;
2003 case BRW_OPCODE_SEL:
2004 if (inst->saturate && inst->src[1].file == IMM) {
2005 switch (inst->conditional_mod) {
2006 case BRW_CONDITIONAL_LE:
2007 case BRW_CONDITIONAL_L:
2008 switch (inst->src[1].type) {
2009 case BRW_REGISTER_TYPE_F:
2010 if (inst->src[1].imm.f >= 1.0f) {
2011 inst->opcode = BRW_OPCODE_MOV;
2012 inst->src[1] = reg_undef;
2013 progress = true;
2014 }
2015 break;
2016 default:
2017 break;
2018 }
2019 break;
2020 case BRW_CONDITIONAL_GE:
2021 case BRW_CONDITIONAL_G:
2022 switch (inst->src[1].type) {
2023 case BRW_REGISTER_TYPE_F:
2024 if (inst->src[1].imm.f <= 0.0f) {
2025 inst->opcode = BRW_OPCODE_MOV;
2026 inst->src[1] = reg_undef;
2027 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2028 progress = true;
2029 }
2030 break;
2031 default:
2032 break;
2033 }
2034 default:
2035 break;
2036 }
2037 }
2038 break;
2039 default:
2040 break;
2041 }
2042 }
2043
2044 return progress;
2045 }
2046
2047 /**
2048 * Removes any instructions writing a VGRF where that VGRF is not used by any
2049 * later instruction.
2050 */
2051 bool
2052 fs_visitor::dead_code_eliminate()
2053 {
2054 bool progress = false;
2055 int pc = 0;
2056
2057 calculate_live_intervals();
2058
2059 foreach_list_safe(node, &this->instructions) {
2060 fs_inst *inst = (fs_inst *)node;
2061
2062 if (inst->dst.file == GRF && !inst->has_side_effects()) {
2063 bool dead = true;
2064
2065 for (int i = 0; i < inst->regs_written; i++) {
2066 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2067 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2068 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2069 dead = false;
2070 break;
2071 }
2072 }
2073
2074 if (dead) {
2075 /* Don't dead code eliminate instructions that write to the
2076 * accumulator as a side-effect. Instead just set the destination
2077 * to the null register to free it.
2078 */
2079 switch (inst->opcode) {
2080 case BRW_OPCODE_ADDC:
2081 case BRW_OPCODE_SUBB:
2082 case BRW_OPCODE_MACH:
2083 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2084 break;
2085 default:
2086 inst->remove();
2087 progress = true;
2088 break;
2089 }
2090 }
2091 }
2092
2093 pc++;
2094 }
2095
2096 if (progress)
2097 invalidate_live_intervals();
2098
2099 return progress;
2100 }
2101
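/* Helpers for the local dead code elimination pass below: the hash table
 * maps a (vgrf, reg_offset) pair to the last instruction that wrote it
 * without a subsequent read.
 */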
2102 struct dead_code_hash_key
2103 {
2104 int vgrf;
2105 int reg_offset;
2106 };
2107
2108 static bool
2109 dead_code_hash_compare(const void *a, const void *b)
2110 {
2111 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2112 }
2113
2114 static void
2115 clear_dead_code_hash(struct hash_table *ht)
2116 {
2117 struct hash_entry *entry;
2118
2119 hash_table_foreach(ht, entry) {
2120 _mesa_hash_table_remove(ht, entry);
2121 }
2122 }
2123
2124 static void
2125 insert_dead_code_hash(struct hash_table *ht,
2126 int vgrf, int reg_offset, fs_inst *inst)
2127 {
2128 /* We don't bother freeing keys, because they'll be GCed with the ht. */
2129 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2130
2131 key->vgrf = vgrf;
2132 key->reg_offset = reg_offset;
2133
2134 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2135 }
2136
2137 static struct hash_entry *
2138 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2139 {
2140 struct dead_code_hash_key key;
2141
2142 key.vgrf = vgrf;
2143 key.reg_offset = reg_offset;
2144
2145 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2146 }
2147
2148 static void
2149 remove_dead_code_hash(struct hash_table *ht,
2150 int vgrf, int reg_offset)
2151 {
2152 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2153 if (!entry)
2154 return;
2155
2156 _mesa_hash_table_remove(ht, entry);
2157 }
2158
2159 /**
2160 * Walks basic blocks, removing any regs that are written but not read before
2161 * being redefined.
2162 *
2163 * The dead_code_eliminate() function implements a global dead code
2164  * elimination, but it only handles removing the last write to a register
2165 * if it's never read. This one can handle intermediate writes, but only
2166 * within a basic block.
2167 */
2168 bool
2169 fs_visitor::dead_code_eliminate_local()
2170 {
2171 struct hash_table *ht;
2172 bool progress = false;
2173
2174 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2175
2176 foreach_list_safe(node, &this->instructions) {
2177 fs_inst *inst = (fs_inst *)node;
2178
2179       /* At a basic block boundary, empty the HT since we don't track
2180        * dataflow across control flow here.
2181 */
2182 if (inst->is_control_flow()) {
2183 clear_dead_code_hash(ht);
2184 continue;
2185 }
2186
2187 /* Clear the HT of any instructions that got read. */
2188 for (int i = 0; i < 3; i++) {
2189 fs_reg src = inst->src[i];
2190 if (src.file != GRF)
2191 continue;
2192
2193 int read = 1;
2194 if (inst->is_send_from_grf())
2195 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2196
2197 for (int reg_offset = src.reg_offset;
2198 reg_offset < src.reg_offset + read;
2199 reg_offset++) {
2200 remove_dead_code_hash(ht, src.reg, reg_offset);
2201 }
2202 }
2203
2204 /* Add any update of a GRF to the HT, removing a previous write if it
2205 * wasn't read.
2206 */
2207 if (inst->dst.file == GRF) {
2208 if (inst->regs_written > 1) {
2209 /* We don't know how to trim channels from an instruction's
2210 * writes, so we can't incrementally remove unread channels from
2211              * it.  Just remove whatever it overwrites from the table.
2212 */
2213 for (int i = 0; i < inst->regs_written; i++) {
2214 remove_dead_code_hash(ht,
2215 inst->dst.reg,
2216 inst->dst.reg_offset + i);
2217 }
2218 } else {
2219 struct hash_entry *entry =
2220 get_dead_code_hash_entry(ht, inst->dst.reg,
2221 inst->dst.reg_offset);
2222
2223 if (entry) {
2224 if (inst->is_partial_write()) {
2225 /* For a partial write, we can't remove any previous dead code
2226 * candidate, since we're just modifying their result.
2227 */
2228 } else {
2229 /* We're completely updating a channel, and there was a
2230 * previous write to the channel that wasn't read. Kill it!
2231 */
2232 fs_inst *inst = (fs_inst *)entry->data;
2233 inst->remove();
2234 progress = true;
2235 }
2236
2237 _mesa_hash_table_remove(ht, entry);
2238 }
2239
2240 if (!inst->has_side_effects())
2241 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2242 inst);
2243 }
2244 }
2245 }
2246
2247 _mesa_hash_table_destroy(ht, NULL);
2248
2249 if (progress)
2250 invalidate_live_intervals();
2251
2252 return progress;
2253 }
2254
2255 /**
2256 * Implements register coalescing: Checks if the two registers involved in a
2257 * raw move don't interfere, in which case they can both be stored in the same
2258 * place and the MOV removed.
2259 */
2260 bool
2261 fs_visitor::register_coalesce()
2262 {
2263 bool progress = false;
2264
2265 calculate_live_intervals();
2266
2267 foreach_list_safe(node, &this->instructions) {
2268 fs_inst *inst = (fs_inst *)node;
2269
2270 if (inst->opcode != BRW_OPCODE_MOV ||
2271 inst->is_partial_write() ||
2272 inst->saturate ||
2273 inst->src[0].file != GRF ||
2274 inst->src[0].negate ||
2275 inst->src[0].abs ||
2276 inst->src[0].smear != -1 ||
2277 inst->dst.file != GRF ||
2278 inst->dst.type != inst->src[0].type ||
2279 virtual_grf_sizes[inst->src[0].reg] != 1) {
2280 continue;
2281 }
2282
2283 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2284 int var_to = live_intervals->var_from_reg(&inst->dst);
2285
2286 if (live_intervals->vars_interfere(var_from, var_to) &&
2287 !inst->dst.equals(inst->src[0]))
2288 continue;
2289
2290 int reg_from = inst->src[0].reg;
2291 assert(inst->src[0].reg_offset == 0);
2292 int reg_to = inst->dst.reg;
2293 int reg_to_offset = inst->dst.reg_offset;
2294
2295 foreach_list(node, &this->instructions) {
2296 fs_inst *scan_inst = (fs_inst *)node;
2297
2298 if (scan_inst->dst.file == GRF &&
2299 scan_inst->dst.reg == reg_from) {
2300 scan_inst->dst.reg = reg_to;
2301 scan_inst->dst.reg_offset = reg_to_offset;
2302 }
2303 for (int i = 0; i < 3; i++) {
2304 if (scan_inst->src[i].file == GRF &&
2305 scan_inst->src[i].reg == reg_from) {
2306 scan_inst->src[i].reg = reg_to;
2307 scan_inst->src[i].reg_offset = reg_to_offset;
2308 }
2309 }
2310 }
2311
2312 inst->remove();
2313 progress = true;
2314 continue;
2315 }
2316
2317 if (progress)
2318 invalidate_live_intervals();
2319
2320 return progress;
2321 }
2322
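/**
 * Looks for MOVs from a GRF to an MRF and tries to rewrite the instruction
 * that produced the GRF value so it writes directly into the MRF instead,
 * eliminating the MOV.
 */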
2323 bool
2324 fs_visitor::compute_to_mrf()
2325 {
2326 bool progress = false;
2327 int next_ip = 0;
2328
2329 calculate_live_intervals();
2330
2331 foreach_list_safe(node, &this->instructions) {
2332 fs_inst *inst = (fs_inst *)node;
2333
2334 int ip = next_ip;
2335 next_ip++;
2336
2337 if (inst->opcode != BRW_OPCODE_MOV ||
2338 inst->is_partial_write() ||
2339 inst->dst.file != MRF || inst->src[0].file != GRF ||
2340 inst->dst.type != inst->src[0].type ||
2341 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2342 continue;
2343
2344 /* Work out which hardware MRF registers are written by this
2345 * instruction.
2346 */
2347 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2348 int mrf_high;
2349 if (inst->dst.reg & BRW_MRF_COMPR4) {
2350 mrf_high = mrf_low + 4;
2351 } else if (dispatch_width == 16 &&
2352 (!inst->force_uncompressed && !inst->force_sechalf)) {
2353 mrf_high = mrf_low + 1;
2354 } else {
2355 mrf_high = mrf_low;
2356 }
2357
2358 /* Can't compute-to-MRF this GRF if someone else was going to
2359 * read it later.
2360 */
2361 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2362 continue;
2363
2364 /* Found a move of a GRF to a MRF. Let's see if we can go
2365 * rewrite the thing that made this GRF to write into the MRF.
2366 */
2367 fs_inst *scan_inst;
2368 for (scan_inst = (fs_inst *)inst->prev;
2369 scan_inst->prev != NULL;
2370 scan_inst = (fs_inst *)scan_inst->prev) {
2371 if (scan_inst->dst.file == GRF &&
2372 scan_inst->dst.reg == inst->src[0].reg) {
2373 /* Found the last thing to write our reg we want to turn
2374 * into a compute-to-MRF.
2375 */
2376
2377 /* If this one instruction didn't populate all the
2378 * channels, bail. We might be able to rewrite everything
2379 * that writes that reg, but it would require smarter
2380 * tracking to delay the rewriting until complete success.
2381 */
2382 if (scan_inst->is_partial_write())
2383 break;
2384
2385 /* Things returning more than one register would need us to
2386 * understand coalescing out more than one MOV at a time.
2387 */
2388 if (scan_inst->regs_written > 1)
2389 break;
2390
2391 /* SEND instructions can't have MRF as a destination. */
2392 if (scan_inst->mlen)
2393 break;
2394
2395 if (brw->gen == 6) {
2396 /* gen6 math instructions must have the destination be
2397 * GRF, so no compute-to-MRF for them.
2398 */
2399 if (scan_inst->is_math()) {
2400 break;
2401 }
2402 }
2403
2404 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2405 /* Found the creator of our MRF's source value. */
2406 scan_inst->dst.file = MRF;
2407 scan_inst->dst.reg = inst->dst.reg;
2408 scan_inst->saturate |= inst->saturate;
2409 inst->remove();
2410 progress = true;
2411 }
2412 break;
2413 }
2414
2415 /* We don't handle control flow here. Most computation of
2416           * values that end up in MRFs happens shortly before the MRF
2417 * write anyway.
2418 */
2419 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2420 break;
2421
2422 /* You can't read from an MRF, so if someone else reads our
2423 * MRF's source GRF that we wanted to rewrite, that stops us.
2424 */
2425 bool interfered = false;
2426 for (int i = 0; i < 3; i++) {
2427 if (scan_inst->src[i].file == GRF &&
2428 scan_inst->src[i].reg == inst->src[0].reg &&
2429 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2430 interfered = true;
2431 }
2432 }
2433 if (interfered)
2434 break;
2435
2436 if (scan_inst->dst.file == MRF) {
2437 /* If somebody else writes our MRF here, we can't
2438 * compute-to-MRF before that.
2439 */
2440 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2441 int scan_mrf_high;
2442
2443 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2444 scan_mrf_high = scan_mrf_low + 4;
2445 } else if (dispatch_width == 16 &&
2446 (!scan_inst->force_uncompressed &&
2447 !scan_inst->force_sechalf)) {
2448 scan_mrf_high = scan_mrf_low + 1;
2449 } else {
2450 scan_mrf_high = scan_mrf_low;
2451 }
2452
2453 if (mrf_low == scan_mrf_low ||
2454 mrf_low == scan_mrf_high ||
2455 mrf_high == scan_mrf_low ||
2456 mrf_high == scan_mrf_high) {
2457 break;
2458 }
2459 }
2460
2461 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2462 /* Found a SEND instruction, which means that there are
2463 * live values in MRFs from base_mrf to base_mrf +
2464 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2465 * above it.
2466 */
2467 if (mrf_low >= scan_inst->base_mrf &&
2468 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2469 break;
2470 }
2471 if (mrf_high >= scan_inst->base_mrf &&
2472 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2473 break;
2474 }
2475 }
2476 }
2477 }
2478
2479 if (progress)
2480 invalidate_live_intervals();
2481
2482 return progress;
2483 }
2484
2485 /**
2486 * Walks through basic blocks, looking for repeated MRF writes and
2487 * removing the later ones.
2488 */
2489 bool
2490 fs_visitor::remove_duplicate_mrf_writes()
2491 {
2492 fs_inst *last_mrf_move[16];
2493 bool progress = false;
2494
2495    /* We'd need to update the MRF tracking for compressed instructions. */
2496 if (dispatch_width == 16)
2497 return false;
2498
2499 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2500
2501 foreach_list_safe(node, &this->instructions) {
2502 fs_inst *inst = (fs_inst *)node;
2503
2504 if (inst->is_control_flow()) {
2505 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2506 }
2507
2508 if (inst->opcode == BRW_OPCODE_MOV &&
2509 inst->dst.file == MRF) {
2510 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2511 if (prev_inst && inst->equals(prev_inst)) {
2512 inst->remove();
2513 progress = true;
2514 continue;
2515 }
2516 }
2517
2518 /* Clear out the last-write records for MRFs that were overwritten. */
2519 if (inst->dst.file == MRF) {
2520 last_mrf_move[inst->dst.reg] = NULL;
2521 }
2522
2523 if (inst->mlen > 0 && inst->base_mrf != -1) {
2524 /* Found a SEND instruction, which will include two or fewer
2525 * implied MRF writes. We could do better here.
2526 */
2527 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2528 last_mrf_move[inst->base_mrf + i] = NULL;
2529 }
2530 }
2531
2532 /* Clear out any MRF move records whose sources got overwritten. */
2533 if (inst->dst.file == GRF) {
2534 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2535 if (last_mrf_move[i] &&
2536 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2537 last_mrf_move[i] = NULL;
2538 }
2539 }
2540 }
2541
2542 if (inst->opcode == BRW_OPCODE_MOV &&
2543 inst->dst.file == MRF &&
2544 inst->src[0].file == GRF &&
2545 !inst->is_partial_write()) {
2546 last_mrf_move[inst->dst.reg] = inst;
2547 }
2548 }
2549
2550 if (progress)
2551 invalidate_live_intervals();
2552
2553 return progress;
2554 }
2555
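/**
 * Clears the needs_dep flags for any registers in the tracked range
 * [first_grf, first_grf + grf_len) that inst reads, since a read resolves
 * the outstanding dependency for the gen4 SEND workarounds below.
 */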
2556 static void
2557 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2558 int first_grf, int grf_len)
2559 {
2560 bool inst_16wide = (dispatch_width > 8 &&
2561 !inst->force_uncompressed &&
2562 !inst->force_sechalf);
2563
2564 /* Clear the flag for registers that actually got read (as expected). */
2565 for (int i = 0; i < 3; i++) {
2566 int grf;
2567 if (inst->src[i].file == GRF) {
2568 grf = inst->src[i].reg;
2569 } else if (inst->src[i].file == HW_REG &&
2570 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2571 grf = inst->src[i].fixed_hw_reg.nr;
2572 } else {
2573 continue;
2574 }
2575
2576 if (grf >= first_grf &&
2577 grf < first_grf + grf_len) {
2578 deps[grf - first_grf] = false;
2579 if (inst_16wide)
2580 deps[grf - first_grf + 1] = false;
2581 }
2582 }
2583 }
2584
2585 /**
2586 * Implements this workaround for the original 965:
2587 *
2588 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2589 * check for post destination dependencies on this instruction, software
2590 * must ensure that there is no destination hazard for the case of ‘write
2591 * followed by a posted write’ shown in the following example.
2592 *
2593 * 1. mov r3 0
2594 * 2. send r3.xy <rest of send instruction>
2595 * 3. mov r2 r3
2596 *
2597 * Due to no post-destination dependency check on the ‘send’, the above
2598 * code sequence could have two instructions (1 and 2) in flight at the
2599 * same time that both consider ‘r3’ as the target of their final writes.
2600 */
2601 void
2602 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2603 {
2604 int reg_size = dispatch_width / 8;
2605 int write_len = inst->regs_written * reg_size;
2606 int first_write_grf = inst->dst.reg;
2607 bool needs_dep[BRW_MAX_MRF];
2608 assert(write_len < (int)sizeof(needs_dep) - 1);
2609
2610 memset(needs_dep, false, sizeof(needs_dep));
2611 memset(needs_dep, true, write_len);
2612
2613 clear_deps_for_inst_src(inst, dispatch_width,
2614 needs_dep, first_write_grf, write_len);
2615
2616 /* Walk backwards looking for writes to registers we're writing which
2617 * aren't read since being written. If we hit the start of the program,
2618 * we assume that there are no outstanding dependencies on entry to the
2619 * program.
2620 */
2621 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2622 scan_inst != NULL;
2623 scan_inst = (fs_inst *)scan_inst->prev) {
2624
2625 /* If we hit control flow, assume that there *are* outstanding
2626 * dependencies, and force their cleanup before our instruction.
2627 */
2628 if (scan_inst->is_control_flow()) {
2629 for (int i = 0; i < write_len; i++) {
2630 if (needs_dep[i]) {
2631 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2632 }
2633 }
2634 return;
2635 }
2636
2637 bool scan_inst_16wide = (dispatch_width > 8 &&
2638 !scan_inst->force_uncompressed &&
2639 !scan_inst->force_sechalf);
2640
2641 /* We insert our reads as late as possible on the assumption that any
2642 * instruction but a MOV that might have left us an outstanding
2643 * dependency has more latency than a MOV.
2644 */
2645 if (scan_inst->dst.file == GRF) {
2646 for (int i = 0; i < scan_inst->regs_written; i++) {
2647 int reg = scan_inst->dst.reg + i * reg_size;
2648
2649 if (reg >= first_write_grf &&
2650 reg < first_write_grf + write_len &&
2651 needs_dep[reg - first_write_grf]) {
2652 inst->insert_before(DEP_RESOLVE_MOV(reg));
2653 needs_dep[reg - first_write_grf] = false;
2654 if (scan_inst_16wide)
2655 needs_dep[reg - first_write_grf + 1] = false;
2656 }
2657 }
2658 }
2659
2660 /* Clear the flag for registers that actually got read (as expected). */
2661 clear_deps_for_inst_src(scan_inst, dispatch_width,
2662 needs_dep, first_write_grf, write_len);
2663
2664 /* Continue the loop only if we haven't resolved all the dependencies */
2665 int i;
2666 for (i = 0; i < write_len; i++) {
2667 if (needs_dep[i])
2668 break;
2669 }
2670 if (i == write_len)
2671 return;
2672 }
2673 }
2674
2675 /**
2676 * Implements this workaround for the original 965:
2677 *
2678 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2679 * used as a destination register until after it has been sourced by an
2680 * instruction with a different destination register.
2681 */
2682 void
2683 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2684 {
2685 int write_len = inst->regs_written * dispatch_width / 8;
2686 int first_write_grf = inst->dst.reg;
2687 bool needs_dep[BRW_MAX_MRF];
2688 assert(write_len < (int)sizeof(needs_dep) - 1);
2689
2690 memset(needs_dep, false, sizeof(needs_dep));
2691 memset(needs_dep, true, write_len);
2692 /* Walk forwards looking for writes to registers we're writing which aren't
2693 * read before being written.
2694 */
2695 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2696 !scan_inst->is_tail_sentinel();
2697 scan_inst = (fs_inst *)scan_inst->next) {
2698 /* If we hit control flow, force resolve all remaining dependencies. */
2699 if (scan_inst->is_control_flow()) {
2700 for (int i = 0; i < write_len; i++) {
2701 if (needs_dep[i])
2702 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2703 }
2704 return;
2705 }
2706
2707 /* Clear the flag for registers that actually got read (as expected). */
2708 clear_deps_for_inst_src(scan_inst, dispatch_width,
2709 needs_dep, first_write_grf, write_len);
2710
2711 /* We insert our reads as late as possible since they're reading the
2712 * result of a SEND, which has massive latency.
2713 */
2714 if (scan_inst->dst.file == GRF &&
2715 scan_inst->dst.reg >= first_write_grf &&
2716 scan_inst->dst.reg < first_write_grf + write_len &&
2717 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2718 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2719 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2720 }
2721
2722 /* Continue the loop only if we haven't resolved all the dependencies */
2723 int i;
2724 for (i = 0; i < write_len; i++) {
2725 if (needs_dep[i])
2726 break;
2727 }
2728 if (i == write_len)
2729 return;
2730 }
2731
2732 /* If we hit the end of the program, resolve all remaining dependencies out
2733 * of paranoia.
2734 */
2735 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2736 assert(last_inst->eot);
2737 for (int i = 0; i < write_len; i++) {
2738 if (needs_dep[i])
2739 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2740 }
2741 }
2742
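/**
 * Entry point for the gen4 SEND dependency workarounds: finds SEND-like
 * instructions (mlen != 0) that write a GRF and applies the pre- and
 * post-send fixups above around each of them.
 */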
2743 void
2744 fs_visitor::insert_gen4_send_dependency_workarounds()
2745 {
2746 if (brw->gen != 4 || brw->is_g4x)
2747 return;
2748
2749 /* Note that we're done with register allocation, so GRF fs_regs always
2750 * have a .reg_offset of 0.
2751 */
2752
2753 foreach_list_safe(node, &this->instructions) {
2754 fs_inst *inst = (fs_inst *)node;
2755
2756 if (inst->mlen != 0 && inst->dst.file == GRF) {
2757 insert_gen4_pre_send_dependency_workarounds(inst);
2758 insert_gen4_post_send_dependency_workarounds(inst);
2759 }
2760 }
2761 }
2762
2763 /**
2764 * Turns the generic expression-style uniform pull constant load instruction
2765 * into a hardware-specific series of instructions for loading a pull
2766 * constant.
2767 *
2768 * The expression style allows the CSE pass before this to optimize out
2769 * repeated loads from the same offset, and gives the pre-register-allocation
2770 * scheduling full flexibility, while the conversion to native instructions
2771 * allows the post-register-allocation scheduler the best information
2772 * possible.
2773 *
2774 * Note that execution masking for setting up pull constant loads is special:
2775 * the channels that need to be written are unrelated to the current execution
2776 * mask, since a later instruction will use one of the result channels as a
2777 * source operand for all 8 or 16 of its channels.
2778 */
2779 void
2780 fs_visitor::lower_uniform_pull_constant_loads()
2781 {
2782 foreach_list(node, &this->instructions) {
2783 fs_inst *inst = (fs_inst *)node;
2784
2785 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2786 continue;
2787
2788 if (brw->gen >= 7) {
2789 /* The offset arg before was a vec4-aligned byte offset. We need to
2790 * turn it into a dword offset.
2791 */
2792 fs_reg const_offset_reg = inst->src[1];
2793 assert(const_offset_reg.file == IMM &&
2794 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2795 const_offset_reg.imm.u /= 4;
2796 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2797
2798 /* This is actually going to be a MOV, but since only the first dword
2799 * is accessed, we have a special opcode to do just that one. Note
2800 * that this needs to be an operation that will be considered a def
2801 * by live variable analysis, or register allocation will explode.
2802 */
2803 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2804 payload, const_offset_reg);
2805 setup->force_writemask_all = true;
2806
2807 setup->ir = inst->ir;
2808 setup->annotation = inst->annotation;
2809 inst->insert_before(setup);
2810
2811 /* Similarly, this will only populate the first 4 channels of the
2812 * result register (since we only use smear values from 0-3), but we
2813 * don't tell the optimizer.
2814 */
2815 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2816 inst->src[1] = payload;
2817
2818 invalidate_live_intervals();
2819 } else {
2820 /* Before register allocation, we didn't tell the scheduler about the
2821 * MRF we use. We know it's safe to use this MRF because nothing
2822 * else does except for register spill/unspill, which generates and
2823 * uses its MRF within a single IR instruction.
2824 */
2825 inst->base_mrf = 14;
2826 inst->mlen = 1;
2827 }
2828 }
2829 }
2830
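/**
 * Prints a single FS IR instruction in human-readable form: predicate,
 * opcode with saturate/conditional modifiers, destination, and up to three
 * sources, plus any uncompressed/sechalf markers.
 */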
2831 void
2832 fs_visitor::dump_instruction(backend_instruction *be_inst)
2833 {
2834 fs_inst *inst = (fs_inst *)be_inst;
2835
2836 if (inst->predicate) {
2837 printf("(%cf0.%d) ",
2838 inst->predicate_inverse ? '-' : '+',
2839 inst->flag_subreg);
2840 }
2841
2842 printf("%s", brw_instruction_name(inst->opcode));
2843 if (inst->saturate)
2844 printf(".sat");
2845 if (inst->conditional_mod) {
2846 printf("%s", conditional_modifier[inst->conditional_mod]);
2847 if (!inst->predicate &&
2848 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2849 inst->opcode != BRW_OPCODE_IF &&
2850 inst->opcode != BRW_OPCODE_WHILE))) {
2851 printf(".f0.%d", inst->flag_subreg);
2852 }
2853 }
2854 printf(" ");
2855
2856
2857 switch (inst->dst.file) {
2858 case GRF:
2859 printf("vgrf%d", inst->dst.reg);
2860 if (inst->dst.reg_offset)
2861 printf("+%d", inst->dst.reg_offset);
2862 break;
2863 case MRF:
2864 printf("m%d", inst->dst.reg);
2865 break;
2866 case BAD_FILE:
2867 printf("(null)");
2868 break;
2869 case UNIFORM:
2870 printf("***u%d***", inst->dst.reg);
2871 break;
2872 case HW_REG:
2873 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2874 switch (inst->dst.fixed_hw_reg.nr) {
2875 case BRW_ARF_NULL:
2876 printf("null");
2877 break;
2878 case BRW_ARF_ADDRESS:
2879 printf("a0.%d", inst->dst.fixed_hw_reg.subnr);
2880 break;
2881 case BRW_ARF_ACCUMULATOR:
2882 printf("acc%d", inst->dst.fixed_hw_reg.subnr);
2883 break;
2884 case BRW_ARF_FLAG:
2885 printf("f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2886 inst->dst.fixed_hw_reg.subnr);
2887 break;
2888 default:
2889 printf("arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2890 inst->dst.fixed_hw_reg.subnr);
2891 break;
2892 }
2893 } else {
2894 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
2895 }
2896 if (inst->dst.fixed_hw_reg.subnr)
2897 printf("+%d", inst->dst.fixed_hw_reg.subnr);
2898 break;
2899 default:
2900 printf("???");
2901 break;
2902 }
2903 printf(":%s, ", reg_encoding[inst->dst.type]);
2904
2905 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
2906 if (inst->src[i].negate)
2907 printf("-");
2908 if (inst->src[i].abs)
2909 printf("|");
2910 switch (inst->src[i].file) {
2911 case GRF:
2912 printf("vgrf%d", inst->src[i].reg);
2913 if (inst->src[i].reg_offset)
2914 printf("+%d", inst->src[i].reg_offset);
2915 break;
2916 case MRF:
2917 printf("***m%d***", inst->src[i].reg);
2918 break;
2919 case UNIFORM:
2920 printf("u%d", inst->src[i].reg);
2921 if (inst->src[i].reg_offset)
2922 printf(".%d", inst->src[i].reg_offset);
2923 break;
2924 case BAD_FILE:
2925 printf("(null)");
2926 break;
2927 case IMM:
2928 switch (inst->src[i].type) {
2929 case BRW_REGISTER_TYPE_F:
2930 printf("%ff", inst->src[i].imm.f);
2931 break;
2932 case BRW_REGISTER_TYPE_D:
2933 printf("%dd", inst->src[i].imm.i);
2934 break;
2935 case BRW_REGISTER_TYPE_UD:
2936 printf("%uu", inst->src[i].imm.u);
2937 break;
2938 default:
2939 printf("???");
2940 break;
2941 }
2942 break;
2943 case HW_REG:
2944 if (inst->src[i].fixed_hw_reg.negate)
2945 printf("-");
2946 if (inst->src[i].fixed_hw_reg.abs)
2947 printf("|");
2948 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2949 switch (inst->src[i].fixed_hw_reg.nr) {
2950 case BRW_ARF_NULL:
2951 printf("null");
2952 break;
2953 case BRW_ARF_ADDRESS:
2954 printf("a0.%d", inst->src[i].fixed_hw_reg.subnr);
2955 break;
2956 case BRW_ARF_ACCUMULATOR:
2957 printf("acc%d", inst->src[i].fixed_hw_reg.subnr);
2958 break;
2959 case BRW_ARF_FLAG:
2960 printf("f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2961 inst->src[i].fixed_hw_reg.subnr);
2962 break;
2963 default:
2964 printf("arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2965 inst->src[i].fixed_hw_reg.subnr);
2966 break;
2967 }
2968 } else {
2969 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2970 }
2971 if (inst->src[i].fixed_hw_reg.subnr)
2972 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
2973 if (inst->src[i].fixed_hw_reg.abs)
2974 printf("|");
2975 break;
2976 default:
2977 printf("???");
2978 break;
2979 }
2980 if (inst->src[i].abs)
2981 printf("|");
2982
2983 if (inst->src[i].file != IMM) {
2984 printf(":%s", reg_encoding[inst->src[i].type]);
2985 }
2986
2987 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
2988 printf(", ");
2989 }
2990
2991 printf(" ");
2992
2993 if (inst->force_uncompressed)
2994 printf("1sthalf ");
2995
2996 if (inst->force_sechalf)
2997 printf("2ndhalf ");
2998
2999 printf("\n");
3000 }
3001
3002 /**
3003 * Possibly returns an instruction that set up @param reg.
3004 *
3005 * Sometimes we want to take the result of some expression/variable
3006 * dereference tree and rewrite the instruction generating the result
3007 * of the tree. When processing the tree, we know that the
3008 * instructions generated are all writing temporaries that are dead
3009 * outside of this tree. So, if we have some instructions that write
3010 * a temporary, we're free to point that temp write somewhere else.
3011 *
3012  * Note that this doesn't guarantee that the returned instruction wrote
3013  * only reg -- it might be the size=4 destination of a texture instruction.
3014 */
3015 fs_inst *
3016 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3017 fs_inst *end,
3018 fs_reg reg)
3019 {
3020 if (end == start ||
3021 end->is_partial_write() ||
3022 reg.reladdr ||
3023 !reg.equals(end->dst)) {
3024 return NULL;
3025 } else {
3026 return end;
3027 }
3028 }
3029
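/**
 * Records which registers of the gen6+ thread payload hold the barycentric
 * coordinates, source depth/W, sample position offsets, and sample mask,
 * and counts the total number of payload registers.
 */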
3030 void
3031 fs_visitor::setup_payload_gen6()
3032 {
3033 bool uses_depth =
3034 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3035 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3036
3037 assert(brw->gen >= 6);
3038
3039 /* R0-1: masks, pixel X/Y coordinates. */
3040 c->nr_payload_regs = 2;
3041    /* R2: only for 32-pixel dispatch. */
3042
3043 /* R3-26: barycentric interpolation coordinates. These appear in the
3044 * same order that they appear in the brw_wm_barycentric_interp_mode
3045 * enum. Each set of coordinates occupies 2 registers if dispatch width
3046 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3047 * appear if they were enabled using the "Barycentric Interpolation
3048 * Mode" bits in WM_STATE.
3049 */
3050 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3051 if (barycentric_interp_modes & (1 << i)) {
3052 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3053 c->nr_payload_regs += 2;
3054 if (dispatch_width == 16) {
3055 c->nr_payload_regs += 2;
3056 }
3057 }
3058 }
3059
3060 /* R27: interpolated depth if uses source depth */
3061 if (uses_depth) {
3062 c->source_depth_reg = c->nr_payload_regs;
3063 c->nr_payload_regs++;
3064 if (dispatch_width == 16) {
3065 /* R28: interpolated depth if not 8-wide. */
3066 c->nr_payload_regs++;
3067 }
3068 }
3069 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3070 if (uses_depth) {
3071 c->source_w_reg = c->nr_payload_regs;
3072 c->nr_payload_regs++;
3073 if (dispatch_width == 16) {
3074 /* R30: interpolated W if not 8-wide. */
3075 c->nr_payload_regs++;
3076 }
3077 }
3078
3079 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3080 /* R31: MSAA position offsets. */
3081 if (c->prog_data.uses_pos_offset) {
3082 c->sample_pos_reg = c->nr_payload_regs;
3083 c->nr_payload_regs++;
3084 }
3085
3086 /* R32: MSAA input coverage mask */
3087 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3088 assert(brw->gen >= 7);
3089 c->sample_mask_reg = c->nr_payload_regs;
3090 c->nr_payload_regs++;
3091 if (dispatch_width == 16) {
3092 /* R33: input coverage mask if not 8-wide. */
3093 c->nr_payload_regs++;
3094 }
3095 }
3096
3097 /* R34-: bary for 32-pixel. */
3098 /* R58-59: interp W for 32-pixel. */
3099
3100 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3101 c->source_depth_to_render_target = true;
3102 }
3103 }
3104
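/**
 * Lays out the fragment shader's binding table: render targets first
 * (at least one, for the null renderbuffer case), followed by the
 * common surfaces assigned by assign_common_binding_table_offsets().
 */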
3105 void
3106 fs_visitor::assign_binding_table_offsets()
3107 {
3108 uint32_t next_binding_table_offset = 0;
3109
3110 /* If there are no color regions, we still perform an FB write to a null
3111 * renderbuffer, which we place at surface index 0.
3112 */
3113 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3114 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3115
3116 assign_common_binding_table_offsets(next_binding_table_offset);
3117 }
3118
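/**
 * Top-level driver for a single compile: sets up the payload, emits FS IR
 * from the GLSL or ARB program, runs the optimization loop, and performs
 * scheduling and register allocation, falling back to spilling only in
 * SIMD8.  Returns false if the compile failed.
 */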
3119 bool
3120 fs_visitor::run()
3121 {
3122 sanity_param_count = fp->Base.Parameters->NumParameters;
3123 uint32_t orig_nr_params = c->prog_data.nr_params;
3124 bool allocated_without_spills;
3125
3126 assign_binding_table_offsets();
3127
3128 if (brw->gen >= 6)
3129 setup_payload_gen6();
3130 else
3131 setup_payload_gen4();
3132
3133 if (0) {
3134 emit_dummy_fs();
3135 } else {
3136 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3137 emit_shader_time_begin();
3138
3139 calculate_urb_setup();
3140 if (fp->Base.InputsRead > 0) {
3141 if (brw->gen < 6)
3142 emit_interpolation_setup_gen4();
3143 else
3144 emit_interpolation_setup_gen6();
3145 }
3146
3147 /* We handle discards by keeping track of the still-live pixels in f0.1.
3148 * Initialize it with the dispatched pixels.
3149 */
3150 if (fp->UsesKill || c->key.alpha_test_func) {
3151 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3152 discard_init->flag_subreg = 1;
3153 }
3154
3155 /* Generate FS IR for main(). (the visitor only descends into
3156 * functions called "main").
3157 */
3158 if (shader) {
3159 foreach_list(node, &*shader->ir) {
3160 ir_instruction *ir = (ir_instruction *)node;
3161 base_ir = ir;
3162 this->result = reg_undef;
3163 ir->accept(this);
3164 }
3165 } else {
3166 emit_fragment_program_code();
3167 }
3168 base_ir = NULL;
3169 if (failed)
3170 return false;
3171
3172 emit(FS_OPCODE_PLACEHOLDER_HALT);
3173
3174 if (c->key.alpha_test_func)
3175 emit_alpha_test();
3176
3177 emit_fb_writes();
3178
3179 split_virtual_grfs();
3180
3181 move_uniform_array_access_to_pull_constants();
3182 remove_dead_constants();
3183 setup_pull_constants();
3184
3185 bool progress;
3186 do {
3187 progress = false;
3188
3189 compact_virtual_grfs();
3190
3191 progress = remove_duplicate_mrf_writes() || progress;
3192
3193 progress = opt_algebraic() || progress;
3194 progress = opt_cse() || progress;
3195 progress = opt_copy_propagate() || progress;
3196 progress = opt_peephole_sel() || progress;
3197 progress = opt_peephole_predicated_break() || progress;
3198 progress = dead_code_eliminate() || progress;
3199 progress = dead_code_eliminate_local() || progress;
3200 progress = dead_control_flow_eliminate(this) || progress;
3201 progress = register_coalesce() || progress;
3202 progress = compute_to_mrf() || progress;
3203 } while (progress);
3204
3205 lower_uniform_pull_constant_loads();
3206
3207 assign_curb_setup();
3208 assign_urb_setup();
3209
3210 static enum instruction_scheduler_mode pre_modes[] = {
3211 SCHEDULE_PRE,
3212 SCHEDULE_PRE_NON_LIFO,
3213 SCHEDULE_PRE_LIFO,
3214 };
3215
3216 /* Try each scheduling heuristic to see if it can successfully register
3217 * allocate without spilling. They should be ordered by decreasing
3218 * performance but increasing likelihood of allocating.
3219 */
3220 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3221 schedule_instructions(pre_modes[i]);
3222
3223 if (0) {
3224 assign_regs_trivial();
3225 allocated_without_spills = true;
3226 } else {
3227 allocated_without_spills = assign_regs(false);
3228 }
3229 if (allocated_without_spills)
3230 break;
3231 }
3232
3233 if (!allocated_without_spills) {
3234 /* We assume that any spilling is worse than just dropping back to
3235 * SIMD8. There's probably actually some intermediate point where
3236 * SIMD16 with a couple of spills is still better.
3237 */
3238 if (dispatch_width == 16) {
3239 fail("Failure to register allocate. Reduce number of "
3240 "live scalar values to avoid this.");
3241 }
3242
3243 /* Since we're out of heuristics, just go spill registers until we
3244 * get an allocation.
3245 */
3246 while (!assign_regs(true)) {
3247 if (failed)
3248 break;
3249 }
3250 }
3251 }
3252 assert(force_uncompressed_stack == 0);
3253
3254 /* This must come after all optimization and register allocation, since
3255 * it inserts dead code that happens to have side effects, and it does
3256 * so based on the actual physical registers in use.
3257 */
3258 insert_gen4_send_dependency_workarounds();
3259
3260 if (failed)
3261 return false;
3262
3263 if (!allocated_without_spills)
3264 schedule_instructions(SCHEDULE_POST);
3265
3266 if (dispatch_width == 8) {
3267 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3268 } else {
3269 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3270
3271 /* Make sure we didn't try to sneak in an extra uniform */
3272 assert(orig_nr_params == c->prog_data.nr_params);
3273 (void) orig_nr_params;
3274 }
3275
3276 /* If any state parameters were appended, then ParameterValues could have
3277 * been realloced, in which case the driver uniform storage set up by
3278 * _mesa_associate_uniform_storage() would point to freed memory. Make
3279 * sure that didn't happen.
3280 */
3281 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3282
3283 return !failed;
3284 }
3285
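/**
 * Compiles the fragment program to native code: runs an 8-wide fs_visitor,
 * optionally a 16-wide one, and then hands both instruction lists to
 * fs_generator to produce the final assembly.
 */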
3286 const unsigned *
3287 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3288 struct gl_fragment_program *fp,
3289 struct gl_shader_program *prog,
3290 unsigned *final_assembly_size)
3291 {
3292 bool start_busy = false;
3293 float start_time = 0;
3294
3295 if (unlikely(brw->perf_debug)) {
3296 start_busy = (brw->batch.last_bo &&
3297 drm_intel_bo_busy(brw->batch.last_bo));
3298 start_time = get_time();
3299 }
3300
3301 struct brw_shader *shader = NULL;
3302 if (prog)
3303 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3304
3305 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3306 if (prog) {
3307 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3308 _mesa_print_ir(shader->ir, NULL);
3309 printf("\n\n");
3310 } else {
3311 printf("ARB_fragment_program %d ir for native fragment shader\n",
3312 fp->Base.Id);
3313 _mesa_print_program(&fp->Base);
3314 }
3315 }
3316
3317 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3318 */
3319 fs_visitor v(brw, c, prog, fp, 8);
3320 if (!v.run()) {
3321 if (prog) {
3322 prog->LinkStatus = false;
3323 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3324 }
3325
3326 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3327 v.fail_msg);
3328
3329 return NULL;
3330 }
3331
3332 exec_list *simd16_instructions = NULL;
3333 fs_visitor v2(brw, c, prog, fp, 16);
3334 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3335 if (c->prog_data.nr_pull_params == 0) {
3336 /* Try a 16-wide compile */
3337 v2.import_uniforms(&v);
3338 if (!v2.run()) {
3339 perf_debug("16-wide shader failed to compile, falling back to "
3340 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3341 } else {
3342 simd16_instructions = &v2.instructions;
3343 }
3344 } else {
3345 perf_debug("Skipping 16-wide due to pull parameters.\n");
3346 }
3347 }
3348
3349 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3350 const unsigned *generated = g.generate_assembly(&v.instructions,
3351 simd16_instructions,
3352 final_assembly_size);
3353
3354 if (unlikely(brw->perf_debug) && shader) {
3355 if (shader->compiled_once)
3356 brw_wm_debug_recompile(brw, prog, &c->key);
3357 shader->compiled_once = true;
3358
3359 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3360 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3361 (get_time() - start_time) * 1000);
3362 }
3363 }
3364
3365 return generated;
3366 }
3367
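/**
 * Precompiles the fragment shader at link time using a best-guess program
 * key, then restores the previous WM program state.
 */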
3368 bool
3369 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3370 {
3371 struct brw_context *brw = brw_context(ctx);
3372 struct brw_wm_prog_key key;
3373
3374 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3375 return true;
3376
3377 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3378 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3379 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3380 bool program_uses_dfdy = fp->UsesDFdy;
3381
3382 memset(&key, 0, sizeof(key));
3383
3384 if (brw->gen < 6) {
3385 if (fp->UsesKill)
3386 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3387
3388 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3389 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3390
3391 /* Just assume depth testing. */
3392 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3393 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3394 }
3395
3396 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3397 BRW_FS_VARYING_INPUT_MASK) > 16)
3398 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3399
3400 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3401
3402 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3403 for (unsigned i = 0; i < sampler_count; i++) {
3404 if (fp->Base.ShadowSamplers & (1 << i)) {
3405 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3406 key.tex.swizzles[i] =
3407 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3408 } else {
3409 /* Color sampler: assume no swizzling. */
3410 key.tex.swizzles[i] = SWIZZLE_XYZW;
3411 }
3412 }
3413
3414 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3415 key.drawable_height = ctx->DrawBuffer->Height;
3416 }
3417
3418 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3419 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3420 }
3421
3422 key.nr_color_regions = 1;
3423
3424 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3425 * quality of the derivatives is likely to be determined by the driconf
3426 * option.
3427 */
3428 key.high_quality_derivatives = brw->disable_derivative_optimization;
3429
3430 key.program_string_id = bfp->id;
3431
3432 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3433 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3434
3435 bool success = do_wm_prog(brw, prog, bfp, &key);
3436
3437 brw->wm.base.prog_offset = old_prog_offset;
3438 brw->wm.prog_data = old_prog_data;
3439
3440 return success;
3441 }