i965: Store uniform constant values in a gl_constant_value instead of float
[mesa.git] / src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init(enum opcode opcode, const fs_reg &dst, fs_reg *src, int sources)
56 {
57 memset(this, 0, sizeof(*this));
58
59 this->opcode = opcode;
60 this->dst = dst;
61 this->src = src;
62 this->sources = sources;
63
64 this->conditional_mod = BRW_CONDITIONAL_NONE;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68
69 this->writes_accumulator = false;
70 }
71
72 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
73 {
74 fs_reg *src = ralloc_array(this, fs_reg, 3);
75 init(opcode, dst, src, 0);
76 }
77
78 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
79 {
80 fs_reg *src = ralloc_array(this, fs_reg, 3);
81 src[0] = src0;
82 init(opcode, dst, src, 1);
83 }
84
85 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
86 const fs_reg &src1)
87 {
88 fs_reg *src = ralloc_array(this, fs_reg, 3);
89 src[0] = src0;
90 src[1] = src1;
91 init(opcode, dst, src, 2);
92 }
93
94 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
95 const fs_reg &src1, const fs_reg &src2)
96 {
97 fs_reg *src = ralloc_array(this, fs_reg, 3);
98 src[0] = src0;
99 src[1] = src1;
100 src[2] = src2;
101 init(opcode, dst, src, 3);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
105 {
106 init(opcode, dst, src, sources);
107 }
108
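/* Copy constructor: the memcpy below shallow-copies every field (including
 * the src pointer), so the source array is then re-allocated against the new
 * instruction and deep-copied, leaving each fs_inst owning its own src
 * storage.
 */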
109 fs_inst::fs_inst(const fs_inst &that)
110 {
111 memcpy(this, &that, sizeof(that));
112
113 this->src = ralloc_array(this, fs_reg, that.sources);
114
115 for (int i = 0; i < that.sources; i++)
116 this->src[i] = that.src[i];
117 }
118
119 void
120 fs_inst::resize_sources(uint8_t num_sources)
121 {
122 if (this->sources != num_sources) {
123 this->src = reralloc(this, this->src, fs_reg, num_sources);
124 this->sources = num_sources;
125 }
126 }
127
128 #define ALU1(op) \
129 fs_inst * \
130 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
131 { \
132 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
133 }
134
135 #define ALU2(op) \
136 fs_inst * \
137 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
138 const fs_reg &src1) \
139 { \
140 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
141 }
142
143 #define ALU2_ACC(op) \
144 fs_inst * \
145 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
146 const fs_reg &src1) \
147 { \
148 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
149 inst->writes_accumulator = true; \
150 return inst; \
151 }
152
153 #define ALU3(op) \
154 fs_inst * \
155 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
156 const fs_reg &src1, const fs_reg &src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
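/* Each ALU* invocation below stamps out a small emit helper on fs_visitor.
 * As an illustration, ALU2(ADD) expands to:
 *
 *    fs_inst *
 *    fs_visitor::ADD(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */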
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2_ACC(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2_ACC(ADDC)
186 ALU2_ACC(SUBB)
187 ALU2(SEL)
188 ALU2(MAC)
189
190 /** Gen4 predicated IF. */
191 fs_inst *
192 fs_visitor::IF(enum brw_predicate predicate)
193 {
194 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
195 inst->predicate = predicate;
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 fs_inst *
201 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(brw->gen == 6);
205 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
206 reg_null_d, src0, src1);
207 inst->conditional_mod = condition;
208 return inst;
209 }
210
211 /**
212 * CMP: Sets the low bit of the destination channels with the result
213 * of the comparison, while the upper bits are undefined, and updates
214 * the flag register with the packed 16 bits of the result.
215 */
216 fs_inst *
217 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
218 enum brw_conditional_mod condition)
219 {
220 fs_inst *inst;
221
222 /* Take the instruction:
223 *
224 * CMP null<d> src0<f> src1<f>
225 *
226 * Original gen4 does type conversion to the destination type before
227 * comparison, producing garbage results for floating point comparisons.
228 * gen5 does the comparison on the execution type (resolved source types),
229 * so dst type doesn't matter. gen6 does comparison and then uses the
230 * result as if it was the dst type with no conversion, which happens to
231 * mostly work out for float-interpreted-as-int since our comparisons are
232 * for >0, =0, <0.
233 */
234 if (brw->gen == 4) {
235 dst.type = src0.type;
236 if (dst.file == HW_REG)
237 dst.fixed_hw_reg.type = dst.type;
238 }
239
240 resolve_ud_negate(&src0);
241 resolve_ud_negate(&src1);
242
243 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
244 inst->conditional_mod = condition;
245
246 return inst;
247 }
248
249 fs_inst *
250 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
251 {
252 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst, src,
253 sources);
254 inst->regs_written = sources;
255
256 return inst;
257 }
258
259 exec_list
260 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
261 const fs_reg &surf_index,
262 const fs_reg &varying_offset,
263 uint32_t const_offset)
264 {
265 exec_list instructions;
266 fs_inst *inst;
267
268 /* We have our constant surface use a pitch of 4 bytes, so our index can
269 * be any component of a vector, and then we load 4 contiguous
270 * components starting from that.
271 *
272 * We break down the const_offset into a portion added to the variable
273 * offset and a portion done using reg_offset, which means that if you
274 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
275 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
276 * CSE can later notice that those loads are all the same and eliminate
277 * the redundant ones.
278 */
279 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
280 instructions.push_tail(ADD(vec4_offset,
281 varying_offset, const_offset & ~3));
282
283 int scale = 1;
284 if (brw->gen == 4 && dispatch_width == 8) {
285 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
286 * u, v, r) as parameters, or we can just use the SIMD16 message
287 * consisting of (header, u). We choose the second, at the cost of a
288 * longer return length.
289 */
290 scale = 2;
291 }
292
293 enum opcode op;
294 if (brw->gen >= 7)
295 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
296 else
297 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
298 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
299 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
300 inst->regs_written = 4 * scale;
301 instructions.push_tail(inst);
302
303 if (brw->gen < 7) {
304 inst->base_mrf = 13;
305 inst->header_present = true;
306 if (brw->gen == 4)
307 inst->mlen = 3;
308 else
309 inst->mlen = 1 + dispatch_width / 8;
310 }
311
312 vec4_result.reg_offset += (const_offset & 3) * scale;
313 instructions.push_tail(MOV(dst, vec4_result));
314
315 return instructions;
316 }
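/* A usage sketch (see demote_pull_constants() below for the real call site):
 * for a uniform accessed with a variable index, the generated loads look
 * roughly like
 *
 *    exec_list list = VARYING_PULL_CONSTANT_LOAD(dst, surf_index,
 *                                                *inst->src[i].reladdr,
 *                                                pull_index);
 *    inst->insert_before(&list);
 *
 * where dst is a freshly allocated float VGRF and surf_index points at the
 * pull constant binding table entry.
 */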
317
318 /**
319 * A helper for MOV generation for fixing up broken hardware SEND dependency
320 * handling.
321 */
322 fs_inst *
323 fs_visitor::DEP_RESOLVE_MOV(int grf)
324 {
325 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
326
327 inst->ir = NULL;
328 inst->annotation = "send dependency resolve";
329
330 /* The caller always wants uncompressed to emit the minimal extra
331 * dependencies, and to avoid having to deal with aligning its regs to 2.
332 */
333 inst->force_uncompressed = true;
334
335 return inst;
336 }
337
338 bool
339 fs_inst::equals(fs_inst *inst) const
340 {
341 return (opcode == inst->opcode &&
342 dst.equals(inst->dst) &&
343 src[0].equals(inst->src[0]) &&
344 src[1].equals(inst->src[1]) &&
345 src[2].equals(inst->src[2]) &&
346 saturate == inst->saturate &&
347 predicate == inst->predicate &&
348 conditional_mod == inst->conditional_mod &&
349 mlen == inst->mlen &&
350 base_mrf == inst->base_mrf &&
351 target == inst->target &&
352 eot == inst->eot &&
353 header_present == inst->header_present &&
354 shadow_compare == inst->shadow_compare &&
355 offset == inst->offset);
356 }
357
358 bool
359 fs_inst::overwrites_reg(const fs_reg &reg) const
360 {
361 return (reg.file == dst.file &&
362 reg.reg == dst.reg &&
363 reg.reg_offset >= dst.reg_offset &&
364 reg.reg_offset < dst.reg_offset + regs_written);
365 }
366
367 bool
368 fs_inst::is_send_from_grf() const
369 {
370 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
371 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
372 opcode == FS_OPCODE_INTERPOLATE_AT_CENTROID ||
373 opcode == FS_OPCODE_INTERPOLATE_AT_SAMPLE ||
374 opcode == FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET ||
375 opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET ||
376 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
377 src[1].file == GRF) ||
378 (is_tex() && src[0].file == GRF));
379 }
380
381 bool
382 fs_inst::can_do_source_mods(struct brw_context *brw)
383 {
384 if (brw->gen == 6 && is_math())
385 return false;
386
387 if (is_send_from_grf())
388 return false;
389
390 if (!backend_instruction::can_do_source_mods())
391 return false;
392
393 return true;
394 }
395
396 void
397 fs_reg::init()
398 {
399 memset(this, 0, sizeof(*this));
400 stride = 1;
401 }
402
403 /** Generic unset register constructor. */
404 fs_reg::fs_reg()
405 {
406 init();
407 this->file = BAD_FILE;
408 }
409
410 /** Immediate value constructor. */
411 fs_reg::fs_reg(float f)
412 {
413 init();
414 this->file = IMM;
415 this->type = BRW_REGISTER_TYPE_F;
416 this->fixed_hw_reg.dw1.f = f;
417 }
418
419 /** Immediate value constructor. */
420 fs_reg::fs_reg(int32_t i)
421 {
422 init();
423 this->file = IMM;
424 this->type = BRW_REGISTER_TYPE_D;
425 this->fixed_hw_reg.dw1.d = i;
426 }
427
428 /** Immediate value constructor. */
429 fs_reg::fs_reg(uint32_t u)
430 {
431 init();
432 this->file = IMM;
433 this->type = BRW_REGISTER_TYPE_UD;
434 this->fixed_hw_reg.dw1.ud = u;
435 }
436
437 /** Fixed brw_reg. */
438 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
439 {
440 init();
441 this->file = HW_REG;
442 this->fixed_hw_reg = fixed_hw_reg;
443 this->type = fixed_hw_reg.type;
444 }
445
446 bool
447 fs_reg::equals(const fs_reg &r) const
448 {
449 return (file == r.file &&
450 reg == r.reg &&
451 reg_offset == r.reg_offset &&
452 subreg_offset == r.subreg_offset &&
453 type == r.type &&
454 negate == r.negate &&
455 abs == r.abs &&
456 !reladdr && !r.reladdr &&
457 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
458 sizeof(fixed_hw_reg)) == 0 &&
459 stride == r.stride);
460 }
461
462 fs_reg &
463 fs_reg::apply_stride(unsigned stride)
464 {
465 assert((this->stride * stride) <= 4 &&
466 (is_power_of_two(stride) || stride == 0) &&
467 file != HW_REG && file != IMM);
468 this->stride *= stride;
469 return *this;
470 }
471
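/* set_smear() picks a single component to replicate across the execution
 * size: it moves subreg_offset to the requested component and forces a
 * stride of 0.  get_timestamp() uses set_smear(0) to broadcast the low 32
 * bits of the timestamp, and emit_shader_time_end() uses set_smear(2) to
 * read the timestamp-reset field.
 */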
472 fs_reg &
473 fs_reg::set_smear(unsigned subreg)
474 {
475 assert(file != HW_REG && file != IMM);
476 subreg_offset = subreg * type_sz(type);
477 stride = 0;
478 return *this;
479 }
480
481 bool
482 fs_reg::is_contiguous() const
483 {
484 return stride == 1;
485 }
486
487 bool
488 fs_reg::is_valid_3src() const
489 {
490 return file == GRF || file == UNIFORM;
491 }
492
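/* type_size() measures GLSL types in scalar component slots.  Some example
 * values implied by the switch below: float -> 1, vec4 -> 4, mat3 -> 9,
 * vec4[20] -> 80, and samplers/atomic counters -> 0 since they take no
 * register space.
 */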
493 int
494 fs_visitor::type_size(const struct glsl_type *type)
495 {
496 unsigned int size, i;
497
498 switch (type->base_type) {
499 case GLSL_TYPE_UINT:
500 case GLSL_TYPE_INT:
501 case GLSL_TYPE_FLOAT:
502 case GLSL_TYPE_BOOL:
503 return type->components();
504 case GLSL_TYPE_ARRAY:
505 return type_size(type->fields.array) * type->length;
506 case GLSL_TYPE_STRUCT:
507 size = 0;
508 for (i = 0; i < type->length; i++) {
509 size += type_size(type->fields.structure[i].type);
510 }
511 return size;
512 case GLSL_TYPE_SAMPLER:
513 /* Samplers take up no register space, since they're baked in at
514 * link time.
515 */
516 return 0;
517 case GLSL_TYPE_ATOMIC_UINT:
518 return 0;
519 case GLSL_TYPE_IMAGE:
520 case GLSL_TYPE_VOID:
521 case GLSL_TYPE_ERROR:
522 case GLSL_TYPE_INTERFACE:
523 unreachable("not reached");
524 }
525
526 return 0;
527 }
528
529 fs_reg
530 fs_visitor::get_timestamp()
531 {
532 assert(brw->gen >= 7);
533
534 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
535 BRW_ARF_TIMESTAMP,
536 0),
537 BRW_REGISTER_TYPE_UD));
538
539 fs_reg dst = fs_reg(this, glsl_type::uint_type);
540
541 fs_inst *mov = emit(MOV(dst, ts));
542 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
543 * even if it's not enabled in the dispatch.
544 */
545 mov->force_writemask_all = true;
546 mov->force_uncompressed = true;
547
548 /* The caller wants the low 32 bits of the timestamp. Since it's running
550 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
550 * which is plenty of time for our purposes. It is identical across the
551 * EUs, but since it's tracking GPU core speed it will increment at a
552 * varying rate as render P-states change.
553 *
554 * The caller could also check if render P-states have changed (or anything
555 * else that might disrupt timing) by setting smear to 2 and checking if
556 * that field is != 0.
557 */
558 dst.set_smear(0);
559
560 return dst;
561 }
562
563 void
564 fs_visitor::emit_shader_time_begin()
565 {
566 current_annotation = "shader time start";
567 shader_start_time = get_timestamp();
568 }
569
570 void
571 fs_visitor::emit_shader_time_end()
572 {
573 current_annotation = "shader time end";
574
575 enum shader_time_shader_type type, written_type, reset_type;
576 if (dispatch_width == 8) {
577 type = ST_FS8;
578 written_type = ST_FS8_WRITTEN;
579 reset_type = ST_FS8_RESET;
580 } else {
581 assert(dispatch_width == 16);
582 type = ST_FS16;
583 written_type = ST_FS16_WRITTEN;
584 reset_type = ST_FS16_RESET;
585 }
586
587 fs_reg shader_end_time = get_timestamp();
588
589 /* Check that there weren't any timestamp reset events (assuming these
590 * were the only two timestamp reads that happened).
591 */
592 fs_reg reset = shader_end_time;
593 reset.set_smear(2);
594 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
595 test->conditional_mod = BRW_CONDITIONAL_Z;
596 emit(IF(BRW_PREDICATE_NORMAL));
597
598 push_force_uncompressed();
599 fs_reg start = shader_start_time;
600 start.negate = true;
601 fs_reg diff = fs_reg(this, glsl_type::uint_type);
602 emit(ADD(diff, start, shader_end_time));
603
604 /* If there were no instructions between the two timestamp gets, the diff
605 * is 2 cycles. Remove that overhead, so I can forget about that when
606 * trying to determine the time taken for single instructions.
607 */
608 emit(ADD(diff, diff, fs_reg(-2u)));
609
610 emit_shader_time_write(type, diff);
611 emit_shader_time_write(written_type, fs_reg(1u));
612 emit(BRW_OPCODE_ELSE);
613 emit_shader_time_write(reset_type, fs_reg(1u));
614 emit(BRW_OPCODE_ENDIF);
615
616 pop_force_uncompressed();
617 }
618
619 void
620 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
621 fs_reg value)
622 {
623 int shader_time_index =
624 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
625 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
626
627 fs_reg payload;
628 if (dispatch_width == 8)
629 payload = fs_reg(this, glsl_type::uvec2_type);
630 else
631 payload = fs_reg(this, glsl_type::uint_type);
632
633 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
634 fs_reg(), payload, offset, value));
635 }
636
637 void
638 fs_visitor::vfail(const char *format, va_list va)
639 {
640 char *msg;
641
642 if (failed)
643 return;
644
645 failed = true;
646
647 msg = ralloc_vasprintf(mem_ctx, format, va);
648 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
649
650 this->fail_msg = msg;
651
652 if (INTEL_DEBUG & DEBUG_WM) {
653 fprintf(stderr, "%s", msg);
654 }
655 }
656
657 void
658 fs_visitor::fail(const char *format, ...)
659 {
660 va_list va;
661
662 va_start(va, format);
663 vfail(format, va);
664 va_end(va);
665 }
666
667 /**
668 * Mark this program as impossible to compile in SIMD16 mode.
669 *
670 * During the SIMD8 compile (which happens first), we can detect and flag
671 * things that are unsupported in SIMD16 mode, so the compiler can skip
672 * the SIMD16 compile altogether.
673 *
674 * During a SIMD16 compile (if one happens anyway), this just calls fail().
675 */
676 void
677 fs_visitor::no16(const char *format, ...)
678 {
679 va_list va;
680
681 va_start(va, format);
682
683 if (dispatch_width == 16) {
684 vfail(format, va);
685 } else {
686 simd16_unsupported = true;
687
688 if (brw->perf_debug) {
689 if (no16_msg)
690 ralloc_vasprintf_append(&no16_msg, format, va);
691 else
692 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
693 }
694 }
695
696 va_end(va);
697 }
698
699 fs_inst *
700 fs_visitor::emit(enum opcode opcode)
701 {
702 return emit(new(mem_ctx) fs_inst(opcode));
703 }
704
705 fs_inst *
706 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
707 {
708 return emit(new(mem_ctx) fs_inst(opcode, dst));
709 }
710
711 fs_inst *
712 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
713 {
714 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
715 }
716
717 fs_inst *
718 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
719 const fs_reg &src1)
720 {
721 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
722 }
723
724 fs_inst *
725 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
726 const fs_reg &src1, const fs_reg &src2)
727 {
728 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
729 }
730
731 fs_inst *
732 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
733 fs_reg src[], int sources)
734 {
735 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
736 }
737
738 void
739 fs_visitor::push_force_uncompressed()
740 {
741 force_uncompressed_stack++;
742 }
743
744 void
745 fs_visitor::pop_force_uncompressed()
746 {
747 force_uncompressed_stack--;
748 assert(force_uncompressed_stack >= 0);
749 }
750
751 /**
752 * Returns true if the instruction has a flag that means it won't
753 * update an entire destination register.
754 *
755 * For example, dead code elimination and live variable analysis want to know
756 * when a write to a variable screens off any preceding values that were in
757 * it.
758 */
759 bool
760 fs_inst::is_partial_write() const
761 {
762 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
763 this->force_uncompressed ||
764 this->force_sechalf || !this->dst.is_contiguous());
765 }
766
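/* regs_read() reports how many virtual registers a source spans.  The only
 * special case is a texture message whose payload lives in a GRF: the
 * payload covers mlen registers, e.g. mlen == 5 reads 5 registers in SIMD8
 * and (5 + 1) / 2 == 3 in SIMD16, since a SIMD16 virtual GRF covers two
 * physical registers.
 */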
767 int
768 fs_inst::regs_read(fs_visitor *v, int arg) const
769 {
770 if (is_tex() && arg == 0 && src[0].file == GRF) {
771 if (v->dispatch_width == 16)
772 return (mlen + 1) / 2;
773 else
774 return mlen;
775 }
776 return 1;
777 }
778
779 bool
780 fs_inst::reads_flag() const
781 {
782 return predicate;
783 }
784
785 bool
786 fs_inst::writes_flag() const
787 {
788 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
789 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
790 }
791
792 /**
793 * Returns how many MRFs an FS opcode will write over.
794 *
795 * Note that this is not the 0 or 1 implied writes in an actual gen
796 * instruction -- the FS opcodes often generate MOVs in addition.
797 */
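/* For example, a SIMD16 POW needs 2 * 16 / 8 == 4 MRFs for its two operands,
 * a SIMD8 RCP needs 1, and FB writes always need 2.
 */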
798 int
799 fs_visitor::implied_mrf_writes(fs_inst *inst)
800 {
801 if (inst->mlen == 0)
802 return 0;
803
804 if (inst->base_mrf == -1)
805 return 0;
806
807 switch (inst->opcode) {
808 case SHADER_OPCODE_RCP:
809 case SHADER_OPCODE_RSQ:
810 case SHADER_OPCODE_SQRT:
811 case SHADER_OPCODE_EXP2:
812 case SHADER_OPCODE_LOG2:
813 case SHADER_OPCODE_SIN:
814 case SHADER_OPCODE_COS:
815 return 1 * dispatch_width / 8;
816 case SHADER_OPCODE_POW:
817 case SHADER_OPCODE_INT_QUOTIENT:
818 case SHADER_OPCODE_INT_REMAINDER:
819 return 2 * dispatch_width / 8;
820 case SHADER_OPCODE_TEX:
821 case FS_OPCODE_TXB:
822 case SHADER_OPCODE_TXD:
823 case SHADER_OPCODE_TXF:
824 case SHADER_OPCODE_TXF_CMS:
825 case SHADER_OPCODE_TXF_MCS:
826 case SHADER_OPCODE_TG4:
827 case SHADER_OPCODE_TG4_OFFSET:
828 case SHADER_OPCODE_TXL:
829 case SHADER_OPCODE_TXS:
830 case SHADER_OPCODE_LOD:
831 return 1;
832 case FS_OPCODE_FB_WRITE:
833 return 2;
834 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
835 case SHADER_OPCODE_GEN4_SCRATCH_READ:
836 return 1;
837 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
838 return inst->mlen;
839 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
840 return 2;
841 case SHADER_OPCODE_UNTYPED_ATOMIC:
842 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
843 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
844 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
845 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
846 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
847 return 0;
848 default:
849 unreachable("not reached");
850 }
851 }
852
853 int
854 fs_visitor::virtual_grf_alloc(int size)
855 {
856 if (virtual_grf_array_size <= virtual_grf_count) {
857 if (virtual_grf_array_size == 0)
858 virtual_grf_array_size = 16;
859 else
860 virtual_grf_array_size *= 2;
861 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
862 virtual_grf_array_size);
863 }
864 virtual_grf_sizes[virtual_grf_count] = size;
865 return virtual_grf_count++;
866 }
867
868 /** Fixed HW reg constructor. */
869 fs_reg::fs_reg(enum register_file file, int reg)
870 {
871 init();
872 this->file = file;
873 this->reg = reg;
874 this->type = BRW_REGISTER_TYPE_F;
875 }
876
877 /** Fixed HW reg constructor. */
878 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
879 {
880 init();
881 this->file = file;
882 this->reg = reg;
883 this->type = type;
884 }
885
886 /** Automatic reg constructor. */
887 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
888 {
889 init();
890
891 this->file = GRF;
892 this->reg = v->virtual_grf_alloc(v->type_size(type));
893 this->reg_offset = 0;
894 this->type = brw_type_for_base_type(type);
895 }
896
897 fs_reg *
898 fs_visitor::variable_storage(ir_variable *var)
899 {
900 return (fs_reg *)hash_table_find(this->variable_ht, var);
901 }
902
903 void
904 import_uniforms_callback(const void *key,
905 void *data,
906 void *closure)
907 {
908 struct hash_table *dst_ht = (struct hash_table *)closure;
909 const fs_reg *reg = (const fs_reg *)data;
910
911 if (reg->file != UNIFORM)
912 return;
913
914 hash_table_insert(dst_ht, data, key);
915 }
916
917 /* For SIMD16, we need to follow the uniform setup from the SIMD8 dispatch.
918 * This brings in those uniform definitions.
919 */
920 void
921 fs_visitor::import_uniforms(fs_visitor *v)
922 {
923 hash_table_call_foreach(v->variable_ht,
924 import_uniforms_callback,
925 variable_ht);
926 this->push_constant_loc = v->push_constant_loc;
927 this->pull_constant_loc = v->pull_constant_loc;
928 this->uniforms = v->uniforms;
929 this->param_size = v->param_size;
930 }
931
932 /* Our support for uniforms is piggy-backed on the struct
933 * gl_fragment_program, because that's where the values actually
934 * get stored, rather than in some global gl_shader_program uniform
935 * store.
936 */
937 void
938 fs_visitor::setup_uniform_values(ir_variable *ir)
939 {
940 int namelen = strlen(ir->name);
941
942 /* The data for our (non-builtin) uniforms is stored in a series of
943 * gl_uniform_driver_storage structs for each subcomponent that
944 * glGetUniformLocation() could name. We know it's been set up in the same
945 * order we'd walk the type, so walk the list of storage and find anything
946 * with our name, or the prefix of a component that starts with our name.
947 */
948 unsigned params_before = uniforms;
949 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
950 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
951
952 if (strncmp(ir->name, storage->name, namelen) != 0 ||
953 (storage->name[namelen] != 0 &&
954 storage->name[namelen] != '.' &&
955 storage->name[namelen] != '[')) {
956 continue;
957 }
958
959 unsigned slots = storage->type->component_slots();
960 if (storage->array_elements)
961 slots *= storage->array_elements;
962
963 for (unsigned i = 0; i < slots; i++) {
964 stage_prog_data->param[uniforms++] = &storage->storage[i];
965 }
966 }
967
968 /* Make sure we actually initialized the right amount of stuff here. */
969 assert(params_before + ir->type->component_slots() == uniforms);
970 (void)params_before;
971 }
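/* As a sketch with a hypothetical "uniform vec4 color[2]": the loop above
 * matches the storage entry named "color", computes 4 component slots times
 * 2 array elements, and appends 8 gl_constant_value pointers to
 * stage_prog_data->param[], which the closing assertion cross-checks against
 * ir->type->component_slots().
 */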
972
973
974 /* Our support for builtin uniforms is even scarier than non-builtin.
975 * It sits on top of the PROG_STATE_VAR parameters that are
976 * automatically updated from GL context state.
977 */
978 void
979 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
980 {
981 const ir_state_slot *const slots = ir->state_slots;
982 assert(ir->state_slots != NULL);
983
984 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
985 /* This state reference has already been set up by ir_to_mesa, but we'll
986 * get the same index back here.
987 */
988 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
989 (gl_state_index *)slots[i].tokens);
990
991 /* Add each of the unique swizzles of the element as a parameter.
992 * This'll end up matching the expected layout of the
993 * array/matrix/structure we're trying to fill in.
994 */
995 int last_swiz = -1;
996 for (unsigned int j = 0; j < 4; j++) {
997 int swiz = GET_SWZ(slots[i].swizzle, j);
998 if (swiz == last_swiz)
999 break;
1000 last_swiz = swiz;
1001
1002 stage_prog_data->param[uniforms++] =
1003 &fp->Base.Parameters->ParameterValues[index][swiz];
1004 }
1005 }
1006 }
1007
1008 fs_reg *
1009 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1010 {
1011 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1012 fs_reg wpos = *reg;
1013 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1014
1015 /* gl_FragCoord.x */
1016 if (ir->data.pixel_center_integer) {
1017 emit(MOV(wpos, this->pixel_x));
1018 } else {
1019 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1020 }
1021 wpos.reg_offset++;
1022
1023 /* gl_FragCoord.y */
1024 if (!flip && ir->data.pixel_center_integer) {
1025 emit(MOV(wpos, this->pixel_y));
1026 } else {
1027 fs_reg pixel_y = this->pixel_y;
1028 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1029
1030 if (flip) {
1031 pixel_y.negate = true;
1032 offset += key->drawable_height - 1.0;
1033 }
1034
1035 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1036 }
1037 wpos.reg_offset++;
1038
1039 /* gl_FragCoord.z */
1040 if (brw->gen >= 6) {
1041 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1042 } else {
1043 emit(FS_OPCODE_LINTERP, wpos,
1044 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1045 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1046 interp_reg(VARYING_SLOT_POS, 2));
1047 }
1048 wpos.reg_offset++;
1049
1050 /* gl_FragCoord.w: Already set up in emit_interpolation */
1051 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1052
1053 return reg;
1054 }
1055
1056 fs_inst *
1057 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1058 glsl_interp_qualifier interpolation_mode,
1059 bool is_centroid, bool is_sample)
1060 {
1061 brw_wm_barycentric_interp_mode barycoord_mode;
1062 if (brw->gen >= 6) {
1063 if (is_centroid) {
1064 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1065 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1066 else
1067 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1068 } else if (is_sample) {
1069 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1070 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1071 else
1072 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1073 } else {
1074 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1075 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1076 else
1077 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1078 }
1079 } else {
1080 /* On Ironlake and below, there is only one interpolation mode.
1081 * Centroid interpolation doesn't mean anything on this hardware --
1082 * there is no multisampling.
1083 */
1084 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1085 }
1086 return emit(FS_OPCODE_LINTERP, attr,
1087 this->delta_x[barycoord_mode],
1088 this->delta_y[barycoord_mode], interp);
1089 }
1090
1091 fs_reg *
1092 fs_visitor::emit_general_interpolation(ir_variable *ir)
1093 {
1094 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1095 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1096 fs_reg attr = *reg;
1097
1098 unsigned int array_elements;
1099 const glsl_type *type;
1100
1101 if (ir->type->is_array()) {
1102 array_elements = ir->type->length;
1103 if (array_elements == 0) {
1104 fail("dereferenced array '%s' has length 0\n", ir->name);
1105 }
1106 type = ir->type->fields.array;
1107 } else {
1108 array_elements = 1;
1109 type = ir->type;
1110 }
1111
1112 glsl_interp_qualifier interpolation_mode =
1113 ir->determine_interpolation_mode(key->flat_shade);
1114
1115 int location = ir->data.location;
1116 for (unsigned int i = 0; i < array_elements; i++) {
1117 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1118 if (prog_data->urb_setup[location] == -1) {
1119 /* If there's no incoming setup data for this slot, don't
1120 * emit interpolation for it.
1121 */
1122 attr.reg_offset += type->vector_elements;
1123 location++;
1124 continue;
1125 }
1126
1127 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1128 /* Constant interpolation (flat shading) case. The SF has
1129 * handed us defined values in only the constant offset
1130 * field of the setup reg.
1131 */
1132 for (unsigned int k = 0; k < type->vector_elements; k++) {
1133 struct brw_reg interp = interp_reg(location, k);
1134 interp = suboffset(interp, 3);
1135 interp.type = reg->type;
1136 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1137 attr.reg_offset++;
1138 }
1139 } else {
1140 /* Smooth/noperspective interpolation case. */
1141 for (unsigned int k = 0; k < type->vector_elements; k++) {
1142 struct brw_reg interp = interp_reg(location, k);
1143 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1144 /* Get the pixel/sample mask into f0 so that we know
1145 * which pixels are lit. Then, for each channel that is
1146 * unlit, replace the centroid data with non-centroid
1147 * data.
1148 */
1149 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1150
1151 fs_inst *inst;
1152 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1153 false, false);
1154 inst->predicate = BRW_PREDICATE_NORMAL;
1155 inst->predicate_inverse = true;
1156 if (brw->has_pln)
1157 inst->no_dd_clear = true;
1158
1159 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1160 ir->data.centroid && !key->persample_shading,
1161 ir->data.sample || key->persample_shading);
1162 inst->predicate = BRW_PREDICATE_NORMAL;
1163 inst->predicate_inverse = false;
1164 if (brw->has_pln)
1165 inst->no_dd_check = true;
1166
1167 } else {
1168 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1169 ir->data.centroid && !key->persample_shading,
1170 ir->data.sample || key->persample_shading);
1171 }
1172 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1173 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1174 }
1175 attr.reg_offset++;
1176 }
1177
1178 }
1179 location++;
1180 }
1181 }
1182
1183 return reg;
1184 }
1185
1186 fs_reg *
1187 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1188 {
1189 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1190
1191 /* The frontfacing comes in as a bit in the thread payload. */
1192 if (brw->gen >= 6) {
1193 emit(BRW_OPCODE_ASR, *reg,
1194 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1195 fs_reg(15));
1196 emit(BRW_OPCODE_NOT, *reg, *reg);
1197 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1198 } else {
1199 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1200 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1201 * us front face
1202 */
1203 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1204 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1205 }
1206
1207 return reg;
1208 }
1209
1210 void
1211 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1212 {
1213 assert(dst.type == BRW_REGISTER_TYPE_F);
1214
1215 if (key->compute_pos_offset) {
1216 /* Convert int_sample_pos to floating point */
1217 emit(MOV(dst, int_sample_pos));
1218 /* Scale to the range [0, 1] */
1219 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1220 }
1221 else {
1222 /* From ARB_sample_shading specification:
1223 * "When rendering to a non-multisample buffer, or if multisample
1224 * rasterization is disabled, gl_SamplePosition will always be
1225 * (0.5, 0.5)."
1226 */
1227 emit(MOV(dst, fs_reg(0.5f)));
1228 }
1229 }
1230
1231 fs_reg *
1232 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1233 {
1234 assert(brw->gen >= 6);
1235 assert(ir->type == glsl_type::vec2_type);
1236
1237 this->current_annotation = "compute sample position";
1238 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1239 fs_reg pos = *reg;
1240 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1241 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1242
1243 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1244 * mode will be enabled.
1245 *
1246 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1247 * R31.1:0 Position Offset X/Y for Slot[3:0]
1248 * R31.3:2 Position Offset X/Y for Slot[7:4]
1249 * .....
1250 *
1251 * The X, Y sample positions come in as bytes in thread payload. So, read
1252 * the positions using vstride=16, width=8, hstride=2.
1253 */
1254 struct brw_reg sample_pos_reg =
1255 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1256 BRW_REGISTER_TYPE_B), 16, 8, 2);
1257
1258 fs_inst *inst = emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1259 if (dispatch_width == 16) {
1260 inst->force_uncompressed = true;
1261 inst = emit(MOV(half(int_sample_x, 1),
1262 fs_reg(suboffset(sample_pos_reg, 16))));
1263 inst->force_sechalf = true;
1264 }
1265 /* Compute gl_SamplePosition.x */
1266 compute_sample_position(pos, int_sample_x);
1267 pos.reg_offset++;
1268 inst = emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1269 if (dispatch_width == 16) {
1270 inst->force_uncompressed = true;
1271 inst = emit(MOV(half(int_sample_y, 1),
1272 fs_reg(suboffset(sample_pos_reg, 17))));
1273 inst->force_sechalf = true;
1274 }
1275 /* Compute gl_SamplePosition.y */
1276 compute_sample_position(pos, int_sample_y);
1277 return reg;
1278 }
1279
1280 fs_reg *
1281 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1282 {
1283 assert(brw->gen >= 6);
1284
1285 this->current_annotation = "compute sample id";
1286 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1287
1288 if (key->compute_sample_id) {
1289 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1290 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1291 t2.type = BRW_REGISTER_TYPE_UW;
1292
1293 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1294 * 8x multisampling, subspan 0 will represent sample N (where N
1295 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1296 * 7. We can find the value of N by looking at R0.0 bits 7:6
1297 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1298 * (since samples are always delivered in pairs). That is, we
1299 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1300 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1301 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1302 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1303 * populating a temporary variable with the sequence (0, 1, 2, 3),
1304 * and then reading from it using vstride=1, width=4, hstride=0.
1305 * These computations hold good for 4x multisampling as well.
1306 *
1307 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1308 * the first four slots are sample 0 of subspan 0; the next four
1309 * are sample 1 of subspan 0; the third group is sample 0 of
1310 * subspan 1, and finally sample 1 of subspan 1.
1311 */
1312 fs_inst *inst;
1313 inst = emit(BRW_OPCODE_AND, t1,
1314 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1315 fs_reg(0xc0));
1316 inst->force_writemask_all = true;
1317 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1318 inst->force_writemask_all = true;
1319 /* This works for both SIMD8 and SIMD16 */
1320 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1321 inst->force_writemask_all = true;
1322 /* This special instruction takes care of setting vstride=1,
1323 * width=4, hstride=0 of t2 during an ADD instruction.
1324 */
1325 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1326 } else {
1327 /* As per GL_ARB_sample_shading specification:
1328 * "When rendering to a non-multisample buffer, or if multisample
1329 * rasterization is disabled, gl_SampleID will always be zero."
1330 */
1331 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1332 }
1333
1334 return reg;
1335 }
1336
1337 fs_reg
1338 fs_visitor::fix_math_operand(fs_reg src)
1339 {
1340 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1341 * might be able to do better by doing execsize = 1 math and then
1342 * expanding that result out, but we would need to be careful with
1343 * masking.
1344 *
1345 * The hardware ignores source modifiers (negate and abs) on math
1346 * instructions, so we also move to a temp to set those up.
1347 */
1348 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1349 !src.abs && !src.negate)
1350 return src;
1351
1352 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1353 * operands to math
1354 */
1355 if (brw->gen >= 7 && src.file != IMM)
1356 return src;
1357
1358 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1359 expanded.type = src.type;
1360 emit(BRW_OPCODE_MOV, expanded, src);
1361 return expanded;
1362 }
1363
1364 fs_inst *
1365 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1366 {
1367 switch (opcode) {
1368 case SHADER_OPCODE_RCP:
1369 case SHADER_OPCODE_RSQ:
1370 case SHADER_OPCODE_SQRT:
1371 case SHADER_OPCODE_EXP2:
1372 case SHADER_OPCODE_LOG2:
1373 case SHADER_OPCODE_SIN:
1374 case SHADER_OPCODE_COS:
1375 break;
1376 default:
1377 unreachable("not reached: bad math opcode");
1378 }
1379
1380 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1381 * might be able to do better by doing execsize = 1 math and then
1382 * expanding that result out, but we would need to be careful with
1383 * masking.
1384 *
1385 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1386 * instructions, so we also move to a temp to set those up.
1387 */
1388 if (brw->gen == 6 || brw->gen == 7)
1389 src = fix_math_operand(src);
1390
1391 fs_inst *inst = emit(opcode, dst, src);
1392
1393 if (brw->gen < 6) {
1394 inst->base_mrf = 2;
1395 inst->mlen = dispatch_width / 8;
1396 }
1397
1398 return inst;
1399 }
1400
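/* Two-operand math.  On Gen4/5 the operands travel in a two-register MRF
 * payload; note the operand swap below for the integer division opcodes,
 * where the PRM defines operand 0 as the denominator.  So
 * emit_math(SHADER_OPCODE_INT_QUOTIENT, dst, a, b) ends up with the
 * numerator a in the second payload register (base_mrf + 1) and the
 * denominator b as the instruction's direct source.
 */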
1401 fs_inst *
1402 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1403 {
1404 int base_mrf = 2;
1405 fs_inst *inst;
1406
1407 if (brw->gen >= 8) {
1408 inst = emit(opcode, dst, src0, src1);
1409 } else if (brw->gen >= 6) {
1410 src0 = fix_math_operand(src0);
1411 src1 = fix_math_operand(src1);
1412
1413 inst = emit(opcode, dst, src0, src1);
1414 } else {
1415 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1416 * "Message Payload":
1417 *
1418 * "Operand0[7]. For the INT DIV functions, this operand is the
1419 * denominator."
1420 * ...
1421 * "Operand1[7]. For the INT DIV functions, this operand is the
1422 * numerator."
1423 */
1424 bool is_int_div = opcode != SHADER_OPCODE_POW;
1425 fs_reg &op0 = is_int_div ? src1 : src0;
1426 fs_reg &op1 = is_int_div ? src0 : src1;
1427
1428 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1429 inst = emit(opcode, dst, op0, reg_null_f);
1430
1431 inst->base_mrf = base_mrf;
1432 inst->mlen = 2 * dispatch_width / 8;
1433 }
1434 return inst;
1435 }
1436
1437 void
1438 fs_visitor::assign_curb_setup()
1439 {
1440 if (dispatch_width == 8) {
1441 prog_data->base.dispatch_grf_start_reg = payload.num_regs;
1442 } else {
1443 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1444 }
1445
1446 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1447
1448 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1449 foreach_in_list(fs_inst, inst, &instructions) {
1450 for (unsigned int i = 0; i < inst->sources; i++) {
1451 if (inst->src[i].file == UNIFORM) {
1452 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1453 int constant_nr;
1454 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1455 constant_nr = push_constant_loc[uniform_nr];
1456 } else {
1457 /* Section 5.11 of the OpenGL 4.1 spec says:
1458 * "Out-of-bounds reads return undefined values, which include
1459 * values from other variables of the active program or zero."
1460 * Just return the first push constant.
1461 */
1462 constant_nr = 0;
1463 }
1464
1465 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1466 constant_nr / 8,
1467 constant_nr % 8);
1468
1469 inst->src[i].file = HW_REG;
1470 inst->src[i].fixed_hw_reg = byte_offset(
1471 retype(brw_reg, inst->src[i].type),
1472 inst->src[i].subreg_offset);
1473 }
1474 }
1475 }
1476 }
1477
1478 void
1479 fs_visitor::calculate_urb_setup()
1480 {
1481 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1482 prog_data->urb_setup[i] = -1;
1483 }
1484
1485 int urb_next = 0;
1486 /* Figure out where each of the incoming setup attributes lands. */
1487 if (brw->gen >= 6) {
1488 if (_mesa_bitcount_64(fp->Base.InputsRead &
1489 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1490 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1491 * first 16 varying inputs, so we can put them wherever we want.
1492 * Just put them in order.
1493 *
1494 * This is useful because it means that (a) inputs not used by the
1495 * fragment shader won't take up valuable register space, and (b) we
1496 * won't have to recompile the fragment shader if it gets paired with
1497 * a different vertex (or geometry) shader.
1498 */
1499 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1500 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1501 BITFIELD64_BIT(i)) {
1502 prog_data->urb_setup[i] = urb_next++;
1503 }
1504 }
1505 } else {
1506 /* We have enough input varyings that the SF/SBE pipeline stage can't
1507 * arbitrarily rearrange them to suit our whim; we have to put them
1508 * in an order that matches the output of the previous pipeline stage
1509 * (geometry or vertex shader).
1510 */
1511 struct brw_vue_map prev_stage_vue_map;
1512 brw_compute_vue_map(brw, &prev_stage_vue_map,
1513 key->input_slots_valid);
1514 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1515 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1516 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1517 slot++) {
1518 int varying = prev_stage_vue_map.slot_to_varying[slot];
1519 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1520 * unused.
1521 */
1522 if (varying != BRW_VARYING_SLOT_COUNT &&
1523 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1524 BITFIELD64_BIT(varying))) {
1525 prog_data->urb_setup[varying] = slot - first_slot;
1526 }
1527 }
1528 urb_next = prev_stage_vue_map.num_slots - first_slot;
1529 }
1530 } else {
1531 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1532 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1533 /* Point size is packed into the header, not as a general attribute */
1534 if (i == VARYING_SLOT_PSIZ)
1535 continue;
1536
1537 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1538 /* The back color slot is skipped when the front color is
1539 * also written to. In addition, some slots can be
1540 * written in the vertex shader and not read in the
1541 * fragment shader. So the register number must always be
1542 * incremented, mapped or not.
1543 */
1544 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1545 prog_data->urb_setup[i] = urb_next;
1546 urb_next++;
1547 }
1548 }
1549
1550 /*
1551 * It's an FS-only attribute, and we did interpolation for this attribute
1552 * in SF thread. So, count it here, too.
1553 *
1554 * See compile_sf_prog() for more info.
1555 */
1556 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1557 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1558 }
1559
1560 prog_data->num_varying_inputs = urb_next;
1561 }
1562
1563 void
1564 fs_visitor::assign_urb_setup()
1565 {
1566 int urb_start = payload.num_regs + prog_data->curb_read_length;
1567
1568 /* Offset all the urb_setup[] index by the actual position of the
1569 * setup regs, now that the location of the constants has been chosen.
1570 */
1571 foreach_in_list(fs_inst, inst, &instructions) {
1572 if (inst->opcode == FS_OPCODE_LINTERP) {
1573 assert(inst->src[2].file == HW_REG);
1574 inst->src[2].fixed_hw_reg.nr += urb_start;
1575 }
1576
1577 if (inst->opcode == FS_OPCODE_CINTERP) {
1578 assert(inst->src[0].file == HW_REG);
1579 inst->src[0].fixed_hw_reg.nr += urb_start;
1580 }
1581 }
1582
1583 /* Each attribute is 4 setup channels, each of which is half a reg. */
1584 this->first_non_payload_grf =
1585 urb_start + prog_data->num_varying_inputs * 2;
1586 }
1587
1588 /**
1589 * Split large virtual GRFs into separate components if we can.
1590 *
1591 * This is mostly duplicated with what brw_fs_vector_splitting does,
1592 * but that's really conservative because it's afraid of doing
1593 * splitting that doesn't result in real progress after the rest of
1594 * the optimization phases, which would cause infinite looping in
1595 * optimization. We can do it once here, safely. This also has the
1596 * opportunity to split interpolated values, or maybe even uniforms,
1597 * which we don't have at the IR level.
1598 *
1599 * We want to split, because virtual GRFs are what we register
1600 * allocate and spill (due to contiguousness requirements for some
1601 * instructions), and they're what we naturally generate in the
1602 * codegen process, but most virtual GRFs don't actually need to be
1603 * contiguous sets of GRFs. If we split, we'll end up with reduced
1604 * live intervals and better dead code elimination and coalescing.
1605 */
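/* For example, a size-3 virtual GRF vN is split into vN (now size 1) plus
 * two freshly allocated size-1 GRFs, and every access at reg_offset 1 or 2
 * is rewritten to point at the corresponding new GRF with reg_offset 0.
 */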
1606 void
1607 fs_visitor::split_virtual_grfs()
1608 {
1609 int num_vars = this->virtual_grf_count;
1610 bool split_grf[num_vars];
1611 int new_virtual_grf[num_vars];
1612
1613 /* Try to split anything larger than a single register. */
1614 for (int i = 0; i < num_vars; i++) {
1615 if (this->virtual_grf_sizes[i] != 1)
1616 split_grf[i] = true;
1617 else
1618 split_grf[i] = false;
1619 }
1620
1621 if (brw->has_pln &&
1622 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1623 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1624 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1625 * Gen6, that was the only supported interpolation mode, and since Gen6,
1626 * delta_x and delta_y are in fixed hardware registers.
1627 */
1628 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1629 false;
1630 }
1631
1632 foreach_in_list(fs_inst, inst, &instructions) {
1633 /* If there's a SEND message that requires contiguous destination
1634 * registers, no splitting is allowed.
1635 */
1636 if (inst->regs_written > 1) {
1637 split_grf[inst->dst.reg] = false;
1638 }
1639
1640 /* If we're sending from a GRF, don't split it, on the assumption that
1641 * the send is reading the whole thing.
1642 */
1643 if (inst->is_send_from_grf()) {
1644 for (int i = 0; i < inst->sources; i++) {
1645 if (inst->src[i].file == GRF) {
1646 split_grf[inst->src[i].reg] = false;
1647 }
1648 }
1649 }
1650 }
1651
1652 /* Allocate new space for split regs. Note that the virtual
1653 * numbers will be contiguous.
1654 */
1655 for (int i = 0; i < num_vars; i++) {
1656 if (split_grf[i]) {
1657 new_virtual_grf[i] = virtual_grf_alloc(1);
1658 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1659 int reg = virtual_grf_alloc(1);
1660 assert(reg == new_virtual_grf[i] + j - 1);
1661 (void) reg;
1662 }
1663 this->virtual_grf_sizes[i] = 1;
1664 }
1665 }
1666
1667 foreach_in_list(fs_inst, inst, &instructions) {
1668 if (inst->dst.file == GRF &&
1669 split_grf[inst->dst.reg] &&
1670 inst->dst.reg_offset != 0) {
1671 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1672 inst->dst.reg_offset - 1);
1673 inst->dst.reg_offset = 0;
1674 }
1675 for (int i = 0; i < inst->sources; i++) {
1676 if (inst->src[i].file == GRF &&
1677 split_grf[inst->src[i].reg] &&
1678 inst->src[i].reg_offset != 0) {
1679 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1680 inst->src[i].reg_offset - 1);
1681 inst->src[i].reg_offset = 0;
1682 }
1683 }
1684 }
1685 invalidate_live_intervals();
1686 }
1687
1688 /**
1689 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1690 *
1691 * During code generation, we create tons of temporary variables, many of
1692 * which get immediately killed and are never used again. Yet, in later
1693 * optimization and analysis passes, such as compute_live_intervals, we need
1694 * to loop over all the virtual GRFs. Compacting them can save a lot of
1695 * overhead.
1696 */
1697 void
1698 fs_visitor::compact_virtual_grfs()
1699 {
1700 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER))
1701 return;
1702
1703 /* Mark which virtual GRFs are used, and count how many. */
1704 int remap_table[this->virtual_grf_count];
1705 memset(remap_table, -1, sizeof(remap_table));
1706
1707 foreach_in_list(const fs_inst, inst, &instructions) {
1708 if (inst->dst.file == GRF)
1709 remap_table[inst->dst.reg] = 0;
1710
1711 for (int i = 0; i < inst->sources; i++) {
1712 if (inst->src[i].file == GRF)
1713 remap_table[inst->src[i].reg] = 0;
1714 }
1715 }
1716
1717 /* Compact the GRF arrays. */
1718 int new_index = 0;
1719 for (int i = 0; i < this->virtual_grf_count; i++) {
1720 if (remap_table[i] != -1) {
1721 remap_table[i] = new_index;
1722 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1723 invalidate_live_intervals();
1724 ++new_index;
1725 }
1726 }
1727
1728 this->virtual_grf_count = new_index;
1729
1730 /* Patch all the instructions to use the newly renumbered registers */
1731 foreach_in_list(fs_inst, inst, &instructions) {
1732 if (inst->dst.file == GRF)
1733 inst->dst.reg = remap_table[inst->dst.reg];
1734
1735 for (int i = 0; i < inst->sources; i++) {
1736 if (inst->src[i].file == GRF)
1737 inst->src[i].reg = remap_table[inst->src[i].reg];
1738 }
1739 }
1740
1741 /* Patch all the references to delta_x/delta_y, since they're used in
1742 * register allocation.
1743 */
1744 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
1745 if (delta_x[i].file == GRF && remap_table[delta_x[i].reg] != -1) {
1746 delta_x[i].reg = remap_table[delta_x[i].reg];
1747 }
1748 }
1749 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
1750 if (delta_y[i].file == GRF && remap_table[delta_y[i].reg] != -1) {
1751 delta_y[i].reg = remap_table[delta_y[i].reg];
1752 }
1753 }
1754 }
1755
1756 /*
1757 * Implements array access of uniforms by inserting a
1758 * PULL_CONSTANT_LOAD instruction.
1759 *
1760 * Unlike temporary GRF array access (where we don't support it due to
1761 * the difficulty of doing relative addressing on instruction
1762 * destinations), we could potentially do array access of uniforms
1763 * that were loaded in GRF space as push constants. In real-world
1764 * usage we've seen, though, the arrays being used are always larger
1765 * than we could load as push constants, so just always move all
1766 * uniform array access out to a pull constant buffer.
1767 */
1768 void
1769 fs_visitor::move_uniform_array_access_to_pull_constants()
1770 {
1771 if (dispatch_width != 8)
1772 return;
1773
1774 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1775
1776 for (unsigned int i = 0; i < uniforms; i++) {
1777 pull_constant_loc[i] = -1;
1778 }
1779
1780 /* Walk through and find array access of uniforms. Put a copy of that
1781 * uniform in the pull constant buffer.
1782 *
1783 * Note that we don't move constant-indexed accesses to arrays. No
1784 * testing has been done of the performance impact of this choice.
1785 */
1786 foreach_in_list_safe(fs_inst, inst, &instructions) {
1787 for (int i = 0 ; i < inst->sources; i++) {
1788 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1789 continue;
1790
1791 int uniform = inst->src[i].reg;
1792
1793 /* If this array isn't already present in the pull constant buffer,
1794 * add it.
1795 */
1796 if (pull_constant_loc[uniform] == -1) {
1797 const gl_constant_value **values = &stage_prog_data->param[uniform];
1798
1799 assert(param_size[uniform]);
1800
1801 for (int j = 0; j < param_size[uniform]; j++) {
1802 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1803
1804 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1805 values[j];
1806 }
1807 }
1808 }
1809 }
1810 }
1811
1812 /**
1813 * Assign UNIFORM file registers to either push constants or pull constants.
1814 *
1815 * We allow a fragment shader to have more than the specified minimum
1816 * maximum number of fragment shader uniform components (64). If
1817 * there are too many of these, they'd fill up all of register space.
1818 * So, this will push some of them out to the pull constant buffer and
1819 * update the program to load them.
1820 */
1821 void
1822 fs_visitor::assign_constant_locations()
1823 {
1824 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1825 if (dispatch_width != 8)
1826 return;
1827
1828 /* Find which UNIFORM registers are still in use. */
1829 bool is_live[uniforms];
1830 for (unsigned int i = 0; i < uniforms; i++) {
1831 is_live[i] = false;
1832 }
1833
1834 foreach_in_list(fs_inst, inst, &instructions) {
1835 for (int i = 0; i < inst->sources; i++) {
1836 if (inst->src[i].file != UNIFORM)
1837 continue;
1838
1839 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1840 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1841 is_live[constant_nr] = true;
1842 }
1843 }
1844
1845 /* Only allow 16 registers (128 uniform components) as push constants.
1846 *
1847 * Just demote the end of the list. We could probably do better
1848 * here, demoting things that are rarely used in the program first.
1849 *
1850 * If changing this value, note the limitation about total_regs in
1851 * brw_curbe.c.
1852 */
1853 unsigned int max_push_components = 16 * 8;
1854 unsigned int num_push_constants = 0;
1855
1856 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1857
1858 for (unsigned int i = 0; i < uniforms; i++) {
1859 if (!is_live[i] || pull_constant_loc[i] != -1) {
1860 /* This UNIFORM register is either dead, or has already been demoted
1861 * to a pull const. Mark it as no longer living in the param[] array.
1862 */
1863 push_constant_loc[i] = -1;
1864 continue;
1865 }
1866
1867 if (num_push_constants < max_push_components) {
1868 /* Retain as a push constant. Record the location in the param[]
1869 * array.
1870 */
1871 push_constant_loc[i] = num_push_constants++;
1872 } else {
1873 /* Demote to a pull constant. */
1874 push_constant_loc[i] = -1;
1875
1876 int pull_index = stage_prog_data->nr_pull_params++;
1877 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1878 pull_constant_loc[i] = pull_index;
1879 }
1880 }
1881
1882 stage_prog_data->nr_params = num_push_constants;
1883
1884 /* Up until now, the param[] array has been indexed by reg + reg_offset
1885 * of UNIFORM registers. Condense it to only contain the uniforms we
1886 * chose to upload as push constants.
1887 */
1888 for (unsigned int i = 0; i < uniforms; i++) {
1889 int remapped = push_constant_loc[i];
1890
1891 if (remapped == -1)
1892 continue;
1893
1894 assert(remapped <= (int)i);
1895 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1896 }
1897 }
1898
1899 /**
1900 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1901 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1902 */
1903 void
1904 fs_visitor::demote_pull_constants()
1905 {
1906 foreach_in_list(fs_inst, inst, &instructions) {
1907 for (int i = 0; i < inst->sources; i++) {
1908 if (inst->src[i].file != UNIFORM)
1909 continue;
1910
1911 int pull_index = pull_constant_loc[inst->src[i].reg +
1912 inst->src[i].reg_offset];
1913 if (pull_index == -1)
1914 continue;
1915
1916 /* Set up the annotation tracking for newly generated instructions. */
1917 base_ir = inst->ir;
1918 current_annotation = inst->annotation;
1919
1920 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1921 fs_reg dst = fs_reg(this, glsl_type::float_type);
1922
1923 /* Generate a pull load into dst. */
1924 if (inst->src[i].reladdr) {
1925 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
1926 surf_index,
1927 *inst->src[i].reladdr,
1928 pull_index);
1929 inst->insert_before(&list);
1930 inst->src[i].reladdr = NULL;
1931 } else {
1932 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1933 fs_inst *pull =
1934 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1935 dst, surf_index, offset);
1936 inst->insert_before(pull);
1937 inst->src[i].set_smear(pull_index & 3);
1938 }
1939
1940 /* Rewrite the instruction to use the temporary VGRF. */
1941 inst->src[i].file = GRF;
1942 inst->src[i].reg = dst.reg;
1943 inst->src[i].reg_offset = 0;
1944 }
1945 }
1946 invalidate_live_intervals();
1947 }
1948
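/**
 * Apply simple algebraic simplifications to the IR.
 *
 * Handles cases such as a * 1.0 -> a, a * 0.0 -> 0.0, a + 0.0 -> a, OR of a
 * register with itself -> MOV, LRP whose two endpoints are equal -> MOV, and
 * SEL instructions whose result is statically known (equal sources, or a
 * saturating compare against an immediate that can never take effect).
 * Returns true if any instruction was rewritten.
 */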
1949 bool
1950 fs_visitor::opt_algebraic()
1951 {
1952 bool progress = false;
1953
1954 foreach_in_list(fs_inst, inst, &instructions) {
1955 switch (inst->opcode) {
1956 case BRW_OPCODE_MUL:
1957 if (inst->src[1].file != IMM)
1958 continue;
1959
1960 /* a * 1.0 = a */
1961 if (inst->src[1].is_one()) {
1962 inst->opcode = BRW_OPCODE_MOV;
1963 inst->src[1] = reg_undef;
1964 progress = true;
1965 break;
1966 }
1967
1968 /* a * 0.0 = 0.0 */
1969 if (inst->src[1].is_zero()) {
1970 inst->opcode = BRW_OPCODE_MOV;
1971 inst->src[0] = inst->src[1];
1972 inst->src[1] = reg_undef;
1973 progress = true;
1974 break;
1975 }
1976
1977 break;
1978 case BRW_OPCODE_ADD:
1979 if (inst->src[1].file != IMM)
1980 continue;
1981
1982 /* a + 0.0 = a */
1983 if (inst->src[1].is_zero()) {
1984 inst->opcode = BRW_OPCODE_MOV;
1985 inst->src[1] = reg_undef;
1986 progress = true;
1987 break;
1988 }
1989 break;
1990 case BRW_OPCODE_OR:
1991 if (inst->src[0].equals(inst->src[1])) {
1992 inst->opcode = BRW_OPCODE_MOV;
1993 inst->src[1] = reg_undef;
1994 progress = true;
1995 break;
1996 }
1997 break;
1998 case BRW_OPCODE_LRP:
1999 if (inst->src[1].equals(inst->src[2])) {
2000 inst->opcode = BRW_OPCODE_MOV;
2001 inst->src[0] = inst->src[1];
2002 inst->src[1] = reg_undef;
2003 inst->src[2] = reg_undef;
2004 progress = true;
2005 break;
2006 }
2007 break;
2008 case BRW_OPCODE_SEL:
2009 if (inst->src[0].equals(inst->src[1])) {
2010 inst->opcode = BRW_OPCODE_MOV;
2011 inst->src[1] = reg_undef;
2012 inst->predicate = BRW_PREDICATE_NONE;
2013 inst->predicate_inverse = false;
2014 progress = true;
2015 } else if (inst->saturate && inst->src[1].file == IMM) {
2016 switch (inst->conditional_mod) {
2017 case BRW_CONDITIONAL_LE:
2018 case BRW_CONDITIONAL_L:
2019 switch (inst->src[1].type) {
2020 case BRW_REGISTER_TYPE_F:
2021 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2022 inst->opcode = BRW_OPCODE_MOV;
2023 inst->src[1] = reg_undef;
2024 progress = true;
2025 }
2026 break;
2027 default:
2028 break;
2029 }
2030 break;
2031 case BRW_CONDITIONAL_GE:
2032 case BRW_CONDITIONAL_G:
2033 switch (inst->src[1].type) {
2034 case BRW_REGISTER_TYPE_F:
2035 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2036 inst->opcode = BRW_OPCODE_MOV;
2037 inst->src[1] = reg_undef;
2038 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2039 progress = true;
2040 }
2041 break;
2042 default:
2043 break;
2044 }
2045 default:
2046 break;
2047 }
2048 }
2049 break;
2050 default:
2051 break;
2052 }
2053 }
2054
2055 return progress;
2056 }
2057
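/**
 * Attempt to rewrite computations that feed MRF-bound MOVs to write the MRF
 * directly.
 *
 * For each MOV from a GRF to an MRF, walk backwards to find the instruction
 * that produced the GRF value; if it is safe to do so (no intervening reads
 * of the GRF, no conflicting MRF writes or SENDs, no control flow), retarget
 * that instruction's destination to the MRF and delete the MOV.
 */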
2058 bool
2059 fs_visitor::compute_to_mrf()
2060 {
2061 bool progress = false;
2062 int next_ip = 0;
2063
2064 calculate_live_intervals();
2065
2066 foreach_in_list_safe(fs_inst, inst, &instructions) {
2067 int ip = next_ip;
2068 next_ip++;
2069
2070 if (inst->opcode != BRW_OPCODE_MOV ||
2071 inst->is_partial_write() ||
2072 inst->dst.file != MRF || inst->src[0].file != GRF ||
2073 inst->dst.type != inst->src[0].type ||
2074 inst->src[0].abs || inst->src[0].negate ||
2075 !inst->src[0].is_contiguous() ||
2076 inst->src[0].subreg_offset)
2077 continue;
2078
2079 /* Work out which hardware MRF registers are written by this
2080 * instruction.
2081 */
2082 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2083 int mrf_high;
2084 if (inst->dst.reg & BRW_MRF_COMPR4) {
2085 mrf_high = mrf_low + 4;
2086 } else if (dispatch_width == 16 &&
2087 (!inst->force_uncompressed && !inst->force_sechalf)) {
2088 mrf_high = mrf_low + 1;
2089 } else {
2090 mrf_high = mrf_low;
2091 }
2092
2093 /* Can't compute-to-MRF this GRF if someone else was going to
2094 * read it later.
2095 */
2096 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2097 continue;
2098
2099 /* Found a move of a GRF to a MRF. Let's see if we can go
2100 * rewrite the thing that made this GRF to write into the MRF.
2101 */
2102 fs_inst *scan_inst;
2103 for (scan_inst = (fs_inst *)inst->prev;
2104 !scan_inst->is_head_sentinel();
2105 scan_inst = (fs_inst *)scan_inst->prev) {
2106 if (scan_inst->dst.file == GRF &&
2107 scan_inst->dst.reg == inst->src[0].reg) {
2108 /* Found the last instruction that wrote the register we want to
2109 * turn into a compute-to-MRF.
2110 */
2111
2112 /* If this one instruction didn't populate all the
2113 * channels, bail. We might be able to rewrite everything
2114 * that writes that reg, but it would require smarter
2115 * tracking to delay the rewriting until complete success.
2116 */
2117 if (scan_inst->is_partial_write())
2118 break;
2119
2120 /* Instructions that write more than one register would require us
2121 * to coalesce more than one MOV at a time, which we don't handle.
2122 */
2123 if (scan_inst->regs_written > 1)
2124 break;
2125
2126 /* SEND instructions can't have MRF as a destination. */
2127 if (scan_inst->mlen)
2128 break;
2129
2130 if (brw->gen == 6) {
2131 /* gen6 math instructions must have the destination be
2132 * GRF, so no compute-to-MRF for them.
2133 */
2134 if (scan_inst->is_math()) {
2135 break;
2136 }
2137 }
2138
2139 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2140 /* Found the creator of our MRF's source value. */
2141 scan_inst->dst.file = MRF;
2142 scan_inst->dst.reg = inst->dst.reg;
2143 scan_inst->saturate |= inst->saturate;
2144 inst->remove();
2145 progress = true;
2146 }
2147 break;
2148 }
2149
2150 /* We don't handle control flow here. Most computation of
2151 * values that end up in MRFs is done shortly before the MRF
2152 * write anyway.
2153 */
2154 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2155 break;
2156
2157 /* You can't read from an MRF, so if someone else reads our
2158 * MRF's source GRF that we wanted to rewrite, that stops us.
2159 */
2160 bool interfered = false;
2161 for (int i = 0; i < scan_inst->sources; i++) {
2162 if (scan_inst->src[i].file == GRF &&
2163 scan_inst->src[i].reg == inst->src[0].reg &&
2164 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2165 interfered = true;
2166 }
2167 }
2168 if (interfered)
2169 break;
2170
2171 if (scan_inst->dst.file == MRF) {
2172 /* If somebody else writes our MRF here, we can't
2173 * compute-to-MRF before that.
2174 */
2175 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2176 int scan_mrf_high;
2177
2178 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2179 scan_mrf_high = scan_mrf_low + 4;
2180 } else if (dispatch_width == 16 &&
2181 (!scan_inst->force_uncompressed &&
2182 !scan_inst->force_sechalf)) {
2183 scan_mrf_high = scan_mrf_low + 1;
2184 } else {
2185 scan_mrf_high = scan_mrf_low;
2186 }
2187
2188 if (mrf_low == scan_mrf_low ||
2189 mrf_low == scan_mrf_high ||
2190 mrf_high == scan_mrf_low ||
2191 mrf_high == scan_mrf_high) {
2192 break;
2193 }
2194 }
2195
2196 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2197 /* Found a SEND instruction, which means that there are
2198 * live values in MRFs from base_mrf to base_mrf +
2199 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2200 * above it.
2201 */
2202 if (mrf_low >= scan_inst->base_mrf &&
2203 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2204 break;
2205 }
2206 if (mrf_high >= scan_inst->base_mrf &&
2207 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2208 break;
2209 }
2210 }
2211 }
2212 }
2213
2214 if (progress)
2215 invalidate_live_intervals();
2216
2217 return progress;
2218 }
2219
2220 /**
2221 * Walks through basic blocks, looking for repeated MRF writes and
2222 * removing the later ones.
2223 */
2224 bool
2225 fs_visitor::remove_duplicate_mrf_writes()
2226 {
2227 fs_inst *last_mrf_move[16];
2228 bool progress = false;
2229
2230 /* The MRF tracking below doesn't handle compressed (SIMD16) instructions yet. */
2231 if (dispatch_width == 16)
2232 return false;
2233
2234 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2235
2236 foreach_in_list_safe(fs_inst, inst, &instructions) {
2237 if (inst->is_control_flow()) {
2238 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2239 }
2240
2241 if (inst->opcode == BRW_OPCODE_MOV &&
2242 inst->dst.file == MRF) {
2243 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2244 if (prev_inst && inst->equals(prev_inst)) {
2245 inst->remove();
2246 progress = true;
2247 continue;
2248 }
2249 }
2250
2251 /* Clear out the last-write records for MRFs that were overwritten. */
2252 if (inst->dst.file == MRF) {
2253 last_mrf_move[inst->dst.reg] = NULL;
2254 }
2255
2256 if (inst->mlen > 0 && inst->base_mrf != -1) {
2257 /* Found a SEND instruction, which will include two or fewer
2258 * implied MRF writes. We could do better here.
2259 */
2260 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2261 last_mrf_move[inst->base_mrf + i] = NULL;
2262 }
2263 }
2264
2265 /* Clear out any MRF move records whose sources got overwritten. */
2266 if (inst->dst.file == GRF) {
2267 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2268 if (last_mrf_move[i] &&
2269 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2270 last_mrf_move[i] = NULL;
2271 }
2272 }
2273 }
2274
2275 if (inst->opcode == BRW_OPCODE_MOV &&
2276 inst->dst.file == MRF &&
2277 inst->src[0].file == GRF &&
2278 !inst->is_partial_write()) {
2279 last_mrf_move[inst->dst.reg] = inst;
2280 }
2281 }
2282
2283 if (progress)
2284 invalidate_live_intervals();
2285
2286 return progress;
2287 }
2288
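/**
 * Helper for the gen4 SEND dependency workarounds: clears the needs-dep flag
 * for any GRF in [first_grf, first_grf + grf_len) that is read as a source
 * of @inst, since a read of the register resolves the outstanding dependency.
 */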
2289 static void
2290 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2291 int first_grf, int grf_len)
2292 {
2293 bool inst_simd16 = (dispatch_width > 8 &&
2294 !inst->force_uncompressed &&
2295 !inst->force_sechalf);
2296
2297 /* Clear the flag for registers that actually got read (as expected). */
2298 for (int i = 0; i < inst->sources; i++) {
2299 int grf;
2300 if (inst->src[i].file == GRF) {
2301 grf = inst->src[i].reg;
2302 } else if (inst->src[i].file == HW_REG &&
2303 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2304 grf = inst->src[i].fixed_hw_reg.nr;
2305 } else {
2306 continue;
2307 }
2308
2309 if (grf >= first_grf &&
2310 grf < first_grf + grf_len) {
2311 deps[grf - first_grf] = false;
2312 if (inst_simd16)
2313 deps[grf - first_grf + 1] = false;
2314 }
2315 }
2316 }
2317
2318 /**
2319 * Implements this workaround for the original 965:
2320 *
2321 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2322 * check for post destination dependencies on this instruction, software
2323 * must ensure that there is no destination hazard for the case of ‘write
2324 * followed by a posted write’ shown in the following example.
2325 *
2326 * 1. mov r3 0
2327 * 2. send r3.xy <rest of send instruction>
2328 * 3. mov r2 r3
2329 *
2330 * Due to no post-destination dependency check on the ‘send’, the above
2331 * code sequence could have two instructions (1 and 2) in flight at the
2332 * same time that both consider ‘r3’ as the target of their final writes."
2333 */
2334 void
2335 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2336 {
2337 int reg_size = dispatch_width / 8;
2338 int write_len = inst->regs_written * reg_size;
2339 int first_write_grf = inst->dst.reg;
2340 bool needs_dep[BRW_MAX_MRF];
2341 assert(write_len < (int)sizeof(needs_dep) - 1);
2342
2343 memset(needs_dep, false, sizeof(needs_dep));
2344 memset(needs_dep, true, write_len);
2345
2346 clear_deps_for_inst_src(inst, dispatch_width,
2347 needs_dep, first_write_grf, write_len);
2348
2349 /* Walk backwards looking for writes to registers we're writing which
2350 * aren't read since being written. If we hit the start of the program,
2351 * we assume that there are no outstanding dependencies on entry to the
2352 * program.
2353 */
2354 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2355 !scan_inst->is_head_sentinel();
2356 scan_inst = (fs_inst *)scan_inst->prev) {
2357
2358 /* If we hit control flow, assume that there *are* outstanding
2359 * dependencies, and force their cleanup before our instruction.
2360 */
2361 if (scan_inst->is_control_flow()) {
2362 for (int i = 0; i < write_len; i++) {
2363 if (needs_dep[i]) {
2364 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2365 }
2366 }
2367 return;
2368 }
2369
2370 bool scan_inst_simd16 = (dispatch_width > 8 &&
2371 !scan_inst->force_uncompressed &&
2372 !scan_inst->force_sechalf);
2373
2374 /* We insert our reads as late as possible, on the assumption that any
2375 * non-MOV instruction that might have left us an outstanding
2376 * dependency has more latency than a MOV.
2377 */
2378 if (scan_inst->dst.file == GRF) {
2379 for (int i = 0; i < scan_inst->regs_written; i++) {
2380 int reg = scan_inst->dst.reg + i * reg_size;
2381
2382 if (reg >= first_write_grf &&
2383 reg < first_write_grf + write_len &&
2384 needs_dep[reg - first_write_grf]) {
2385 inst->insert_before(DEP_RESOLVE_MOV(reg));
2386 needs_dep[reg - first_write_grf] = false;
2387 if (scan_inst_simd16)
2388 needs_dep[reg - first_write_grf + 1] = false;
2389 }
2390 }
2391 }
2392
2393 /* Clear the flag for registers that actually got read (as expected). */
2394 clear_deps_for_inst_src(scan_inst, dispatch_width,
2395 needs_dep, first_write_grf, write_len);
2396
2397 /* Continue the loop only if we haven't resolved all the dependencies */
2398 int i;
2399 for (i = 0; i < write_len; i++) {
2400 if (needs_dep[i])
2401 break;
2402 }
2403 if (i == write_len)
2404 return;
2405 }
2406 }
2407
2408 /**
2409 * Implements this workaround for the original 965:
2410 *
2411 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2412 * used as a destination register until after it has been sourced by an
2413 * instruction with a different destination register."
2414 */
2415 void
2416 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2417 {
2418 int write_len = inst->regs_written * dispatch_width / 8;
2419 int first_write_grf = inst->dst.reg;
2420 bool needs_dep[BRW_MAX_MRF];
2421 assert(write_len < (int)sizeof(needs_dep) - 1);
2422
2423 memset(needs_dep, false, sizeof(needs_dep));
2424 memset(needs_dep, true, write_len);
2425 /* Walk forwards looking for writes to registers we're writing which aren't
2426 * read before being written.
2427 */
2428 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2429 !scan_inst->is_tail_sentinel();
2430 scan_inst = (fs_inst *)scan_inst->next) {
2431 /* If we hit control flow, force resolve all remaining dependencies. */
2432 if (scan_inst->is_control_flow()) {
2433 for (int i = 0; i < write_len; i++) {
2434 if (needs_dep[i])
2435 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2436 }
2437 return;
2438 }
2439
2440 /* Clear the flag for registers that actually got read (as expected). */
2441 clear_deps_for_inst_src(scan_inst, dispatch_width,
2442 needs_dep, first_write_grf, write_len);
2443
2444 /* We insert our reads as late as possible since they're reading the
2445 * result of a SEND, which has massive latency.
2446 */
2447 if (scan_inst->dst.file == GRF &&
2448 scan_inst->dst.reg >= first_write_grf &&
2449 scan_inst->dst.reg < first_write_grf + write_len &&
2450 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2451 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2452 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2453 }
2454
2455 /* Continue the loop only if we haven't resolved all the dependencies */
2456 int i;
2457 for (i = 0; i < write_len; i++) {
2458 if (needs_dep[i])
2459 break;
2460 }
2461 if (i == write_len)
2462 return;
2463 }
2464
2465 /* If we hit the end of the program, resolve all remaining dependencies out
2466 * of paranoia.
2467 */
2468 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2469 assert(last_inst->eot);
2470 for (int i = 0; i < write_len; i++) {
2471 if (needs_dep[i])
2472 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2473 }
2474 }
2475
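/**
 * Applies both gen4 SEND dependency workarounds (pre- and post-send) to every
 * message-sending instruction that writes a GRF destination. Only needed on
 * the original 965 (gen4, excluding G4x).
 */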
2476 void
2477 fs_visitor::insert_gen4_send_dependency_workarounds()
2478 {
2479 if (brw->gen != 4 || brw->is_g4x)
2480 return;
2481
2482 bool progress = false;
2483
2484 /* Note that we're done with register allocation, so GRF fs_regs always
2485 * have a .reg_offset of 0.
2486 */
2487
2488 foreach_in_list_safe(fs_inst, inst, &instructions) {
2489 if (inst->mlen != 0 && inst->dst.file == GRF) {
2490 insert_gen4_pre_send_dependency_workarounds(inst);
2491 insert_gen4_post_send_dependency_workarounds(inst);
2492 progress = true;
2493 }
2494 }
2495
2496 if (progress)
2497 invalidate_live_intervals();
2498 }
2499
2500 /**
2501 * Turns the generic expression-style uniform pull constant load instruction
2502 * into a hardware-specific series of instructions for loading a pull
2503 * constant.
2504 *
2505 * The expression style allows the CSE pass before this to optimize out
2506 * repeated loads from the same offset, and gives the pre-register-allocation
2507 * scheduling full flexibility, while the conversion to native instructions
2508 * allows the post-register-allocation scheduler the best information
2509 * possible.
2510 *
2511 * Note that execution masking for setting up pull constant loads is special:
2512 * the channels that need to be written are unrelated to the current execution
2513 * mask, since a later instruction will use one of the result channels as a
2514 * source operand for all 8 or 16 of its channels.
2515 */
2516 void
2517 fs_visitor::lower_uniform_pull_constant_loads()
2518 {
2519 foreach_in_list(fs_inst, inst, &instructions) {
2520 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2521 continue;
2522
2523 if (brw->gen >= 7) {
2524 /* The offset arg before was a vec4-aligned byte offset. We need to
2525 * turn it into a dword offset.
2526 */
2527 fs_reg const_offset_reg = inst->src[1];
2528 assert(const_offset_reg.file == IMM &&
2529 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2530 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2531 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2532
2533 /* This is actually going to be a MOV, but since only the first dword
2534 * is accessed, we have a special opcode to do just that one. Note
2535 * that this needs to be an operation that will be considered a def
2536 * by live variable analysis, or register allocation will explode.
2537 */
2538 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2539 payload, const_offset_reg);
2540 setup->force_writemask_all = true;
2541
2542 setup->ir = inst->ir;
2543 setup->annotation = inst->annotation;
2544 inst->insert_before(setup);
2545
2546 /* Similarly, this will only populate the first 4 channels of the
2547 * result register (since we only use smear values from 0-3), but we
2548 * don't tell the optimizer.
2549 */
2550 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2551 inst->src[1] = payload;
2552
2553 invalidate_live_intervals();
2554 } else {
2555 /* Before register allocation, we didn't tell the scheduler about the
2556 * MRF we use. We know it's safe to use this MRF because nothing
2557 * else does except for register spill/unspill, which generates and
2558 * uses its MRF within a single IR instruction.
2559 */
2560 inst->base_mrf = 14;
2561 inst->mlen = 1;
2562 }
2563 }
2564 }
2565
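/**
 * Lowers SHADER_OPCODE_LOAD_PAYLOAD into a sequence of MOVs that copy each
 * source into consecutive registers of the destination, starting with the
 * optional message header in src[0].
 */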
2566 bool
2567 fs_visitor::lower_load_payload()
2568 {
2569 bool progress = false;
2570
2571 foreach_in_list_safe(fs_inst, inst, &instructions) {
2572 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
2573 fs_reg dst = inst->dst;
2574
2575 /* src[0] represents the (optional) message header. */
2576 if (inst->src[0].file != BAD_FILE) {
2577 inst->insert_before(MOV(dst, inst->src[0]));
2578 }
2579 dst.reg_offset++;
2580
2581 for (int i = 1; i < inst->sources; i++) {
2582 inst->insert_before(MOV(dst, inst->src[i]));
2583 dst.reg_offset++;
2584 }
2585
2586 inst->remove();
2587 progress = true;
2588 }
2589 }
2590
2591 if (progress)
2592 invalidate_live_intervals();
2593
2594 return progress;
2595 }
2596
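/**
 * Dumps the instruction stream along with per-instruction register pressure,
 * either to stderr or to the named file, and reports the maximum number of
 * registers live at once.
 */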
2597 void
2598 fs_visitor::dump_instructions()
2599 {
2600 dump_instructions(NULL);
2601 }
2602
2603 void
2604 fs_visitor::dump_instructions(const char *name)
2605 {
2606 calculate_register_pressure();
2607 FILE *file = stderr;
2608 if (name && geteuid() != 0) {
2609 file = fopen(name, "w");
2610 if (!file)
2611 file = stderr;
2612 }
2613
2614 int ip = 0, max_pressure = 0;
2615 foreach_in_list(backend_instruction, inst, &instructions) {
2616 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2617 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2618 dump_instruction(inst, file);
2619 ++ip;
2620 }
2621 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
2622
2623 if (file != stderr) {
2624 fclose(file);
2625 }
2626 }
2627
2628 void
2629 fs_visitor::dump_instruction(backend_instruction *be_inst)
2630 {
2631 dump_instruction(be_inst, stderr);
2632 }
2633
2634 void
2635 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
2636 {
2637 fs_inst *inst = (fs_inst *)be_inst;
2638
2639 if (inst->predicate) {
2640 fprintf(file, "(%cf0.%d) ",
2641 inst->predicate_inverse ? '-' : '+',
2642 inst->flag_subreg);
2643 }
2644
2645 fprintf(file, "%s", brw_instruction_name(inst->opcode));
2646 if (inst->saturate)
2647 fprintf(file, ".sat");
2648 if (inst->conditional_mod) {
2649 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
2650 if (!inst->predicate &&
2651 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2652 inst->opcode != BRW_OPCODE_IF &&
2653 inst->opcode != BRW_OPCODE_WHILE))) {
2654 fprintf(file, ".f0.%d", inst->flag_subreg);
2655 }
2656 }
2657 fprintf(file, " ");
2658
2659
2660 switch (inst->dst.file) {
2661 case GRF:
2662 fprintf(file, "vgrf%d", inst->dst.reg);
2663 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
2664 inst->dst.subreg_offset)
2665 fprintf(file, "+%d.%d",
2666 inst->dst.reg_offset, inst->dst.subreg_offset);
2667 break;
2668 case MRF:
2669 fprintf(file, "m%d", inst->dst.reg);
2670 break;
2671 case BAD_FILE:
2672 fprintf(file, "(null)");
2673 break;
2674 case UNIFORM:
2675 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
2676 break;
2677 case HW_REG:
2678 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2679 switch (inst->dst.fixed_hw_reg.nr) {
2680 case BRW_ARF_NULL:
2681 fprintf(file, "null");
2682 break;
2683 case BRW_ARF_ADDRESS:
2684 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
2685 break;
2686 case BRW_ARF_ACCUMULATOR:
2687 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
2688 break;
2689 case BRW_ARF_FLAG:
2690 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2691 inst->dst.fixed_hw_reg.subnr);
2692 break;
2693 default:
2694 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2695 inst->dst.fixed_hw_reg.subnr);
2696 break;
2697 }
2698 } else {
2699 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
2700 }
2701 if (inst->dst.fixed_hw_reg.subnr)
2702 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
2703 break;
2704 default:
2705 fprintf(file, "???");
2706 break;
2707 }
2708 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
2709
2710 for (int i = 0; i < inst->sources && inst->src[i].file != BAD_FILE; i++) {
2711 if (inst->src[i].negate)
2712 fprintf(file, "-");
2713 if (inst->src[i].abs)
2714 fprintf(file, "|");
2715 switch (inst->src[i].file) {
2716 case GRF:
2717 fprintf(file, "vgrf%d", inst->src[i].reg);
2718 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2719 inst->src[i].subreg_offset)
2720 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2721 inst->src[i].subreg_offset);
2722 break;
2723 case MRF:
2724 fprintf(file, "***m%d***", inst->src[i].reg);
2725 break;
2726 case UNIFORM:
2727 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
2728 if (inst->src[i].reladdr) {
2729 fprintf(file, "+reladdr");
2730 } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2731 inst->src[i].subreg_offset) {
2732 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2733 inst->src[i].subreg_offset);
2734 }
2735 break;
2736 case BAD_FILE:
2737 fprintf(file, "(null)");
2738 break;
2739 case IMM:
2740 switch (inst->src[i].type) {
2741 case BRW_REGISTER_TYPE_F:
2742 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
2743 break;
2744 case BRW_REGISTER_TYPE_D:
2745 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
2746 break;
2747 case BRW_REGISTER_TYPE_UD:
2748 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
2749 break;
2750 default:
2751 fprintf(file, "???");
2752 break;
2753 }
2754 break;
2755 case HW_REG:
2756 if (inst->src[i].fixed_hw_reg.negate)
2757 fprintf(file, "-");
2758 if (inst->src[i].fixed_hw_reg.abs)
2759 fprintf(file, "|");
2760 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2761 switch (inst->src[i].fixed_hw_reg.nr) {
2762 case BRW_ARF_NULL:
2763 fprintf(file, "null");
2764 break;
2765 case BRW_ARF_ADDRESS:
2766 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
2767 break;
2768 case BRW_ARF_ACCUMULATOR:
2769 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
2770 break;
2771 case BRW_ARF_FLAG:
2772 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2773 inst->src[i].fixed_hw_reg.subnr);
2774 break;
2775 default:
2776 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2777 inst->src[i].fixed_hw_reg.subnr);
2778 break;
2779 }
2780 } else {
2781 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2782 }
2783 if (inst->src[i].fixed_hw_reg.subnr)
2784 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
2785 if (inst->src[i].fixed_hw_reg.abs)
2786 fprintf(file, "|");
2787 break;
2788 default:
2789 fprintf(file, "???");
2790 break;
2791 }
2792 if (inst->src[i].abs)
2793 fprintf(file, "|");
2794
2795 if (inst->src[i].file != IMM) {
2796 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
2797 }
2798
2799 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
2800 fprintf(file, ", ");
2801 }
2802
2803 fprintf(file, " ");
2804
2805 if (inst->force_uncompressed)
2806 fprintf(file, "1sthalf ");
2807
2808 if (inst->force_sechalf)
2809 fprintf(file, "2ndhalf ");
2810
2811 fprintf(file, "\n");
2812 }
2813
2814 /**
2815 * Possibly returns an instruction that set up @param reg.
2816 *
2817 * Sometimes we want to take the result of some expression/variable
2818 * dereference tree and rewrite the instruction generating the result
2819 * of the tree. When processing the tree, we know that the
2820 * instructions generated are all writing temporaries that are dead
2821 * outside of this tree. So, if we have some instructions that write
2822 * a temporary, we're free to point that temp write somewhere else.
2823 *
2824 * Note that this doesn't guarantee that the returned instruction wrote
2825 * only reg -- it might be the size=4 destination of a texture instruction.
2826 */
2827 fs_inst *
2828 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2829 fs_inst *end,
2830 const fs_reg &reg)
2831 {
2832 if (end == start ||
2833 end->is_partial_write() ||
2834 reg.reladdr ||
2835 !reg.equals(end->dst)) {
2836 return NULL;
2837 } else {
2838 return end;
2839 }
2840 }
2841
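/**
 * Lays out the gen6+ fragment shader thread payload: dispatch masks,
 * barycentric interpolation coordinates, source depth/W, MSAA position
 * offsets, and the input coverage mask, recording the starting register of
 * each section in the payload structure.
 */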
2842 void
2843 fs_visitor::setup_payload_gen6()
2844 {
2845 bool uses_depth =
2846 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2847 unsigned barycentric_interp_modes = prog_data->barycentric_interp_modes;
2848
2849 assert(brw->gen >= 6);
2850
2851 /* R0-1: masks, pixel X/Y coordinates. */
2852 payload.num_regs = 2;
2853 /* R2: only for 32-pixel dispatch. */
2854
2855 /* R3-26: barycentric interpolation coordinates. These appear in the
2856 * same order that they appear in the brw_wm_barycentric_interp_mode
2857 * enum. Each set of coordinates occupies 2 registers if dispatch width
2858 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2859 * appear if they were enabled using the "Barycentric Interpolation
2860 * Mode" bits in WM_STATE.
2861 */
2862 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2863 if (barycentric_interp_modes & (1 << i)) {
2864 payload.barycentric_coord_reg[i] = payload.num_regs;
2865 payload.num_regs += 2;
2866 if (dispatch_width == 16) {
2867 payload.num_regs += 2;
2868 }
2869 }
2870 }
2871
2872 /* R27: interpolated depth if uses source depth */
2873 if (uses_depth) {
2874 payload.source_depth_reg = payload.num_regs;
2875 payload.num_regs++;
2876 if (dispatch_width == 16) {
2877 /* R28: interpolated depth if not SIMD8. */
2878 payload.num_regs++;
2879 }
2880 }
2881 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2882 if (uses_depth) {
2883 payload.source_w_reg = payload.num_regs;
2884 payload.num_regs++;
2885 if (dispatch_width == 16) {
2886 /* R30: interpolated W if not SIMD8. */
2887 payload.num_regs++;
2888 }
2889 }
2890
2891 prog_data->uses_pos_offset = key->compute_pos_offset;
2892 /* R31: MSAA position offsets. */
2893 if (prog_data->uses_pos_offset) {
2894 payload.sample_pos_reg = payload.num_regs;
2895 payload.num_regs++;
2896 }
2897
2898 /* R32: MSAA input coverage mask */
2899 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
2900 assert(brw->gen >= 7);
2901 payload.sample_mask_in_reg = payload.num_regs;
2902 payload.num_regs++;
2903 if (dispatch_width == 16) {
2904 /* R33: input coverage mask if not SIMD8. */
2905 payload.num_regs++;
2906 }
2907 }
2908
2909 /* R34-: bary for 32-pixel. */
2910 /* R58-59: interp W for 32-pixel. */
2911
2912 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2913 source_depth_to_render_target = true;
2914 }
2915 }
2916
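/**
 * Assigns surface binding table offsets, placing the render targets first
 * (reserving at least one slot for a null renderbuffer) followed by the
 * common per-stage surfaces.
 */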
2917 void
2918 fs_visitor::assign_binding_table_offsets()
2919 {
2920 uint32_t next_binding_table_offset = 0;
2921
2922 /* If there are no color regions, we still perform an FB write to a null
2923 * renderbuffer, which we place at surface index 0.
2924 */
2925 prog_data->binding_table.render_target_start = next_binding_table_offset;
2926 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
2927
2928 assign_common_binding_table_offsets(next_binding_table_offset);
2929 }
2930
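/**
 * Computes regs_live_at_ip[], the number of virtual GRF registers live at
 * each instruction, by summing the sizes of all virtual GRFs whose live
 * range covers that IP.
 */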
2931 void
2932 fs_visitor::calculate_register_pressure()
2933 {
2934 invalidate_live_intervals();
2935 calculate_live_intervals();
2936
2937 unsigned num_instructions = instructions.length();
2938
2939 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
2940
2941 for (int reg = 0; reg < virtual_grf_count; reg++) {
2942 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
2943 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
2944 }
2945 }
2946
2947 /**
2948 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
2949 *
2950 * The needs_unlit_centroid_workaround ends up producing one of these per
2951 * channel of centroid input, so it's good to clean them up.
2952 *
2953 * An assumption here is that nothing ever modifies the dispatched pixels
2954 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
2955 * dictates that anyway.
2956 */
2957 void
2958 fs_visitor::opt_drop_redundant_mov_to_flags()
2959 {
2960 bool flag_mov_found[2] = {false};
2961
2962 foreach_in_list_safe(fs_inst, inst, &instructions) {
2963 if (inst->is_control_flow()) {
2964 memset(flag_mov_found, 0, sizeof(flag_mov_found));
2965 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
2966 if (!flag_mov_found[inst->flag_subreg])
2967 flag_mov_found[inst->flag_subreg] = true;
2968 else
2969 inst->remove();
2970 } else if (inst->writes_flag()) {
2971 flag_mov_found[inst->flag_subreg] = false;
2972 }
2973 }
2974 }
2975
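/**
 * Drives the fragment shader compile: sets up the payload, emits FS IR for
 * the program, runs the optimization loop, lowers the remaining virtual
 * opcodes, schedules, and register allocates (failing the SIMD16 compile or
 * spilling in SIMD8 if allocation doesn't succeed). Returns false on failure.
 */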
2976 bool
2977 fs_visitor::run()
2978 {
2979 sanity_param_count = fp->Base.Parameters->NumParameters;
2980 bool allocated_without_spills;
2981
2982 assign_binding_table_offsets();
2983
2984 if (brw->gen >= 6)
2985 setup_payload_gen6();
2986 else
2987 setup_payload_gen4();
2988
2989 if (0) {
2990 emit_dummy_fs();
2991 } else {
2992 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2993 emit_shader_time_begin();
2994
2995 calculate_urb_setup();
2996 if (fp->Base.InputsRead > 0) {
2997 if (brw->gen < 6)
2998 emit_interpolation_setup_gen4();
2999 else
3000 emit_interpolation_setup_gen6();
3001 }
3002
3003 /* We handle discards by keeping track of the still-live pixels in f0.1.
3004 * Initialize it with the dispatched pixels.
3005 */
3006 if (fp->UsesKill || key->alpha_test_func) {
3007 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3008 discard_init->flag_subreg = 1;
3009 }
3010
3011 /* Generate FS IR for main(). (the visitor only descends into
3012 * functions called "main").
3013 */
3014 if (shader) {
3015 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3016 base_ir = ir;
3017 this->result = reg_undef;
3018 ir->accept(this);
3019 }
3020 } else {
3021 emit_fragment_program_code();
3022 }
3023 base_ir = NULL;
3024 if (failed)
3025 return false;
3026
3027 emit(FS_OPCODE_PLACEHOLDER_HALT);
3028
3029 if (key->alpha_test_func)
3030 emit_alpha_test();
3031
3032 emit_fb_writes();
3033
3034 split_virtual_grfs();
3035
3036 move_uniform_array_access_to_pull_constants();
3037 assign_constant_locations();
3038 demote_pull_constants();
3039
3040 opt_drop_redundant_mov_to_flags();
3041
3042 #define OPT(pass, args...) do { \
3043 pass_num++; \
3044 bool this_progress = pass(args); \
3045 \
3046 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3047 char filename[64]; \
3048 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3049 dispatch_width, shader_prog->Name, iteration, pass_num); \
3050 \
3051 backend_visitor::dump_instructions(filename); \
3052 } \
3053 \
3054 progress = progress || this_progress; \
3055 } while (false)
3056
3057 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3058 char filename[64];
3059 snprintf(filename, 64, "fs%d-%04d-00-start",
3060 dispatch_width, shader_prog->Name);
3061
3062 backend_visitor::dump_instructions(filename);
3063 }
3064
3065 bool progress;
3066 int iteration = 0;
3067 do {
3068 progress = false;
3069 iteration++;
3070 int pass_num = 0;
3071
3072 compact_virtual_grfs();
3073
3074 OPT(remove_duplicate_mrf_writes);
3075
3076 OPT(opt_algebraic);
3077 OPT(opt_cse);
3078 OPT(opt_copy_propagate);
3079 OPT(opt_peephole_predicated_break);
3080 OPT(dead_code_eliminate);
3081 OPT(opt_peephole_sel);
3082 OPT(dead_control_flow_eliminate, this);
3083 OPT(opt_saturate_propagation);
3084 OPT(register_coalesce);
3085 OPT(compute_to_mrf);
3086 } while (progress);
3087
3088 if (lower_load_payload()) {
3089 register_coalesce();
3090 dead_code_eliminate();
3091 }
3092
3093 lower_uniform_pull_constant_loads();
3094
3095 assign_curb_setup();
3096 assign_urb_setup();
3097
3098 static enum instruction_scheduler_mode pre_modes[] = {
3099 SCHEDULE_PRE,
3100 SCHEDULE_PRE_NON_LIFO,
3101 SCHEDULE_PRE_LIFO,
3102 };
3103
3104 /* Try each scheduling heuristic to see if it can successfully register
3105 * allocate without spilling. They should be ordered by decreasing
3106 * performance but increasing likelihood of allocating.
3107 */
3108 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3109 schedule_instructions(pre_modes[i]);
3110
3111 if (0) {
3112 assign_regs_trivial();
3113 allocated_without_spills = true;
3114 } else {
3115 allocated_without_spills = assign_regs(false);
3116 }
3117 if (allocated_without_spills)
3118 break;
3119 }
3120
3121 if (!allocated_without_spills) {
3122 /* We assume that any spilling is worse than just dropping back to
3123 * SIMD8. There's probably actually some intermediate point where
3124 * SIMD16 with a couple of spills is still better.
3125 */
3126 if (dispatch_width == 16) {
3127 fail("Failure to register allocate. Reduce number of "
3128 "live scalar values to avoid this.");
3129 } else {
3130 perf_debug("Fragment shader triggered register spilling. "
3131 "Try reducing the number of live scalar values to "
3132 "improve performance.\n");
3133 }
3134
3135 /* Since we're out of heuristics, just go spill registers until we
3136 * get an allocation.
3137 */
3138 while (!assign_regs(true)) {
3139 if (failed)
3140 break;
3141 }
3142 }
3143 }
3144 assert(force_uncompressed_stack == 0);
3145
3146 /* This must come after all optimization and register allocation, since
3147 * it inserts dead code that happens to have side effects, and it does
3148 * so based on the actual physical registers in use.
3149 */
3150 insert_gen4_send_dependency_workarounds();
3151
3152 if (failed)
3153 return false;
3154
3155 if (!allocated_without_spills)
3156 schedule_instructions(SCHEDULE_POST);
3157
3158 if (last_scratch > 0) {
3159 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3160 }
3161
3162 if (dispatch_width == 8)
3163 prog_data->reg_blocks = brw_register_blocks(grf_used);
3164 else
3165 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3166
3167 /* If any state parameters were appended, then ParameterValues could have
3168 * been realloced, in which case the driver uniform storage set up by
3169 * _mesa_associate_uniform_storage() would point to freed memory. Make
3170 * sure that didn't happen.
3171 */
3172 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3173
3174 return !failed;
3175 }
3176
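/**
 * Compiles a fragment program to native code: always compiles a SIMD8
 * variant, attempts a SIMD16 variant on gen5+ when supported, and hands the
 * resulting instruction lists to the generator for final assembly.
 */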
3177 const unsigned *
3178 brw_wm_fs_emit(struct brw_context *brw,
3179 void *mem_ctx,
3180 const struct brw_wm_prog_key *key,
3181 struct brw_wm_prog_data *prog_data,
3182 struct gl_fragment_program *fp,
3183 struct gl_shader_program *prog,
3184 unsigned *final_assembly_size)
3185 {
3186 bool start_busy = false;
3187 double start_time = 0;
3188
3189 if (unlikely(brw->perf_debug)) {
3190 start_busy = (brw->batch.last_bo &&
3191 drm_intel_bo_busy(brw->batch.last_bo));
3192 start_time = get_time();
3193 }
3194
3195 struct brw_shader *shader = NULL;
3196 if (prog)
3197 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3198
3199 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3200 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3201
3202 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3203 */
3204 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3205 if (!v.run()) {
3206 if (prog) {
3207 prog->LinkStatus = false;
3208 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3209 }
3210
3211 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3212 v.fail_msg);
3213
3214 return NULL;
3215 }
3216
3217 exec_list *simd16_instructions = NULL;
3218 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3219 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3220 if (!v.simd16_unsupported) {
3221 /* Try a SIMD16 compile */
3222 v2.import_uniforms(&v);
3223 if (!v2.run()) {
3224 perf_debug("SIMD16 shader failed to compile, falling back to "
3225 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3226 } else {
3227 simd16_instructions = &v2.instructions;
3228 }
3229 } else {
3230 perf_debug("SIMD16 shader unsupported, falling back to "
3231 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3232 }
3233 }
3234
3235 const unsigned *assembly = NULL;
3236 fs_generator g(brw, mem_ctx, key, prog_data, prog, fp,
3237 v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3238 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3239 final_assembly_size);
3240
3241 if (unlikely(brw->perf_debug) && shader) {
3242 if (shader->compiled_once)
3243 brw_wm_debug_recompile(brw, prog, key);
3244 shader->compiled_once = true;
3245
3246 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3247 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3248 (get_time() - start_time) * 1000);
3249 }
3250 }
3251
3252 return assembly;
3253 }
3254
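/**
 * Precompiles the fragment shader at link time using a guessed program key
 * (default depth-test state, identity texture swizzles, and so on), so that
 * a likely variant is already built before the first draw.
 */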
3255 bool
3256 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3257 {
3258 struct brw_context *brw = brw_context(ctx);
3259 struct brw_wm_prog_key key;
3260
3261 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3262 return true;
3263
3264 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3265 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3266 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3267 bool program_uses_dfdy = fp->UsesDFdy;
3268
3269 memset(&key, 0, sizeof(key));
3270
3271 if (brw->gen < 6) {
3272 if (fp->UsesKill)
3273 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3274
3275 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3276 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3277
3278 /* Just assume depth testing. */
3279 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3280 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3281 }
3282
3283 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3284 BRW_FS_VARYING_INPUT_MASK) > 16)
3285 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3286
3287 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3288 for (unsigned i = 0; i < sampler_count; i++) {
3289 if (fp->Base.ShadowSamplers & (1 << i)) {
3290 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3291 key.tex.swizzles[i] =
3292 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3293 } else {
3294 /* Color sampler: assume no swizzling. */
3295 key.tex.swizzles[i] = SWIZZLE_XYZW;
3296 }
3297 }
3298
3299 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3300 key.drawable_height = ctx->DrawBuffer->Height;
3301 }
3302
3303 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3304 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3305 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3306
3307 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3308 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3309 key.nr_color_regions > 1;
3310 }
3311
3312 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3313 * quality of the derivatives is likely to be determined by the driconf
3314 * option.
3315 */
3316 key.high_quality_derivatives = brw->disable_derivative_optimization;
3317
3318 key.program_string_id = bfp->id;
3319
3320 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3321 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3322
3323 bool success = do_wm_prog(brw, prog, bfp, &key);
3324
3325 brw->wm.base.prog_offset = old_prog_offset;
3326 brw->wm.prog_data = old_prog_data;
3327
3328 return success;
3329 }