i965: Return NONE from brw_swap_cmod on unknown input.
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
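/* Common initializer shared by the fs_inst constructors below: zero the
 * whole instruction, then record the opcode, destination, source array and
 * source count, along with the defaults noted in the body.
 */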
54 void
55 fs_inst::init(enum opcode opcode, const fs_reg &dst, fs_reg *src, int sources)
56 {
57 memset(this, 0, sizeof(*this));
58
59 this->opcode = opcode;
60 this->dst = dst;
61 this->src = src;
62 this->sources = sources;
63
64 this->conditional_mod = BRW_CONDITIONAL_NONE;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68
69 this->writes_accumulator = false;
70 }
71
72 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
73 {
74 fs_reg *src = ralloc_array(this, fs_reg, 3);
75 init(opcode, dst, src, 0);
76 }
77
78 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
79 {
80 fs_reg *src = ralloc_array(this, fs_reg, 3);
81 src[0] = src0;
82 init(opcode, dst, src, 1);
83 }
84
85 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
86 const fs_reg &src1)
87 {
88 fs_reg *src = ralloc_array(this, fs_reg, 3);
89 src[0] = src0;
90 src[1] = src1;
91 init(opcode, dst, src, 2);
92 }
93
94 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
95 const fs_reg &src1, const fs_reg &src2)
96 {
97 fs_reg *src = ralloc_array(this, fs_reg, 3);
98 src[0] = src0;
99 src[1] = src1;
100 src[2] = src2;
101 init(opcode, dst, src, 3);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
105 {
106 init(opcode, dst, src, sources);
107 }
108
109 fs_inst::fs_inst(const fs_inst &that)
110 {
111 memcpy(this, &that, sizeof(that));
112
113 this->src = ralloc_array(this, fs_reg, that.sources);
114
115 for (int i = 0; i < that.sources; i++)
116 this->src[i] = that.src[i];
117 }
118
119 void
120 fs_inst::resize_sources(uint8_t num_sources)
121 {
122 if (this->sources != num_sources) {
123 this->src = reralloc(this, this->src, fs_reg, num_sources);
124 this->sources = num_sources;
125 }
126 }
127
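/* Convenience emitters: each ALU* macro expands to an fs_visitor method
 * (NOT(), ADD(), MAD(), ...) that simply allocates an fs_inst for the
 * matching BRW opcode.  The _ACC variants additionally flag the
 * instruction as writing the accumulator.
 */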
128 #define ALU1(op) \
129 fs_inst * \
130 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
131 { \
132 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
133 }
134
135 #define ALU2(op) \
136 fs_inst * \
137 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
138 const fs_reg &src1) \
139 { \
140 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
141 }
142
143 #define ALU2_ACC(op) \
144 fs_inst * \
145 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
146 const fs_reg &src1) \
147 { \
148 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
149 inst->writes_accumulator = true; \
150 return inst; \
151 }
152
153 #define ALU3(op) \
154 fs_inst * \
155 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
156 const fs_reg &src1, const fs_reg &src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2_ACC(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2_ACC(ADDC)
186 ALU2_ACC(SUBB)
187 ALU2(SEL)
188 ALU2(MAC)
189
190 /** Gen4 predicated IF. */
191 fs_inst *
192 fs_visitor::IF(enum brw_predicate predicate)
193 {
194 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
195 inst->predicate = predicate;
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 fs_inst *
201 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(brw->gen == 6);
205 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
206 reg_null_d, src0, src1);
207 inst->conditional_mod = condition;
208 return inst;
209 }
210
211 /**
212 * CMP: Sets the low bit of the destination channels with the result
213 * of the comparison, while the upper bits are undefined, and updates
214 * the flag register with the packed 16 bits of the result.
215 */
216 fs_inst *
217 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
218 enum brw_conditional_mod condition)
219 {
220 fs_inst *inst;
221
222 /* Take the instruction:
223 *
224 * CMP null<d> src0<f> src1<f>
225 *
226 * Original gen4 does type conversion to the destination type before
227 * comparison, producing garbage results for floating point comparisons.
228 * gen5 does the comparison on the execution type (resolved source types),
229 * so dst type doesn't matter. gen6 does comparison and then uses the
230 * result as if it was the dst type with no conversion, which happens to
231 * mostly work out for float-interpreted-as-int since our comparisons are
232 * for >0, =0, <0.
233 */
234 if (brw->gen == 4) {
235 dst.type = src0.type;
236 if (dst.file == HW_REG)
237 dst.fixed_hw_reg.type = dst.type;
238 }
239
240 resolve_ud_negate(&src0);
241 resolve_ud_negate(&src1);
242
243 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
244 inst->conditional_mod = condition;
245
246 return inst;
247 }
248
249 fs_inst *
250 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
251 {
252 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst, src,
253 sources);
254 inst->regs_written = sources;
255
256 return inst;
257 }
258
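/* Emit the instruction sequence for a pull-constant load whose offset is
 * only known at run time.  The sequence is returned as a list rather than
 * emitted directly, so the caller can splice it in wherever it is needed.
 */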
259 exec_list
260 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
261 const fs_reg &surf_index,
262 const fs_reg &varying_offset,
263 uint32_t const_offset)
264 {
265 exec_list instructions;
266 fs_inst *inst;
267
268 /* We have our constant surface use a pitch of 4 bytes, so our index can
269 * be any component of a vector, and then we load 4 contiguous
270 * components starting from that.
271 *
272 * We break down the const_offset to a portion added to the variable
273 * offset and a portion done using reg_offset, which means that if you
274 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
275 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
276 * CSE can later notice that those loads are all the same and eliminate
277 * the redundant ones.
278 */
279 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
280 instructions.push_tail(ADD(vec4_offset,
281 varying_offset, const_offset & ~3));
282
283 int scale = 1;
284 if (brw->gen == 4 && dispatch_width == 8) {
285 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
286 * u, v, r) as parameters, or we can just use the SIMD16 message
287 * consisting of (header, u). We choose the second, at the cost of a
288 * longer return length.
289 */
290 scale = 2;
291 }
292
293 enum opcode op;
294 if (brw->gen >= 7)
295 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
296 else
297 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
298 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
299 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
300 inst->regs_written = 4 * scale;
301 instructions.push_tail(inst);
302
303 if (brw->gen < 7) {
304 inst->base_mrf = 13;
305 inst->header_present = true;
306 if (brw->gen == 4)
307 inst->mlen = 3;
308 else
309 inst->mlen = 1 + dispatch_width / 8;
310 }
311
312 vec4_result.reg_offset += (const_offset & 3) * scale;
313 instructions.push_tail(MOV(dst, vec4_result));
314
315 return instructions;
316 }
317
318 /**
319 * A helper for MOV generation for fixing up broken hardware SEND dependency
320 * handling.
321 */
322 fs_inst *
323 fs_visitor::DEP_RESOLVE_MOV(int grf)
324 {
325 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
326
327 inst->ir = NULL;
328 inst->annotation = "send dependency resolve";
329
330 /* The caller always wants uncompressed to emit the minimal extra
331 * dependencies, and to avoid having to deal with aligning its regs to 2.
332 */
333 inst->force_uncompressed = true;
334
335 return inst;
336 }
337
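/* Field-by-field equality test used by passes that need to know whether two
 * instructions would do exactly the same thing (e.g. to eliminate one of
 * them).
 */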
338 bool
339 fs_inst::equals(fs_inst *inst) const
340 {
341 return (opcode == inst->opcode &&
342 dst.equals(inst->dst) &&
343 src[0].equals(inst->src[0]) &&
344 src[1].equals(inst->src[1]) &&
345 src[2].equals(inst->src[2]) &&
346 saturate == inst->saturate &&
347 predicate == inst->predicate &&
348 conditional_mod == inst->conditional_mod &&
349 mlen == inst->mlen &&
350 base_mrf == inst->base_mrf &&
351 target == inst->target &&
352 eot == inst->eot &&
353 header_present == inst->header_present &&
354 shadow_compare == inst->shadow_compare &&
355 offset == inst->offset);
356 }
357
358 bool
359 fs_inst::overwrites_reg(const fs_reg &reg) const
360 {
361 return (reg.file == dst.file &&
362 reg.reg == dst.reg &&
363 reg.reg_offset >= dst.reg_offset &&
364 reg.reg_offset < dst.reg_offset + regs_written);
365 }
366
367 bool
368 fs_inst::is_send_from_grf() const
369 {
370 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
371 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
372 opcode == FS_OPCODE_INTERPOLATE_AT_CENTROID ||
373 opcode == FS_OPCODE_INTERPOLATE_AT_SAMPLE ||
374 opcode == FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET ||
375 opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET ||
376 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
377 src[1].file == GRF) ||
378 (is_tex() && src[0].file == GRF));
379 }
380
381 bool
382 fs_inst::can_do_source_mods(struct brw_context *brw)
383 {
384 if (brw->gen == 6 && is_math())
385 return false;
386
387 if (is_send_from_grf())
388 return false;
389
390 if (!backend_instruction::can_do_source_mods())
391 return false;
392
393 return true;
394 }
395
396 void
397 fs_reg::init()
398 {
399 memset(this, 0, sizeof(*this));
400 stride = 1;
401 }
402
403 /** Generic unset register constructor. */
404 fs_reg::fs_reg()
405 {
406 init();
407 this->file = BAD_FILE;
408 }
409
410 /** Immediate value constructor. */
411 fs_reg::fs_reg(float f)
412 {
413 init();
414 this->file = IMM;
415 this->type = BRW_REGISTER_TYPE_F;
416 this->fixed_hw_reg.dw1.f = f;
417 }
418
419 /** Immediate value constructor. */
420 fs_reg::fs_reg(int32_t i)
421 {
422 init();
423 this->file = IMM;
424 this->type = BRW_REGISTER_TYPE_D;
425 this->fixed_hw_reg.dw1.d = i;
426 }
427
428 /** Immediate value constructor. */
429 fs_reg::fs_reg(uint32_t u)
430 {
431 init();
432 this->file = IMM;
433 this->type = BRW_REGISTER_TYPE_UD;
434 this->fixed_hw_reg.dw1.ud = u;
435 }
436
437 /** Fixed brw_reg. */
438 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
439 {
440 init();
441 this->file = HW_REG;
442 this->fixed_hw_reg = fixed_hw_reg;
443 this->type = fixed_hw_reg.type;
444 }
445
446 bool
447 fs_reg::equals(const fs_reg &r) const
448 {
449 return (file == r.file &&
450 reg == r.reg &&
451 reg_offset == r.reg_offset &&
452 subreg_offset == r.subreg_offset &&
453 type == r.type &&
454 negate == r.negate &&
455 abs == r.abs &&
456 !reladdr && !r.reladdr &&
457 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
458 sizeof(fixed_hw_reg)) == 0 &&
459 stride == r.stride);
460 }
461
462 fs_reg &
463 fs_reg::apply_stride(unsigned stride)
464 {
465 assert((this->stride * stride) <= 4 &&
466 (is_power_of_two(stride) || stride == 0) &&
467 file != HW_REG && file != IMM);
468 this->stride *= stride;
469 return *this;
470 }
471
472 fs_reg &
473 fs_reg::set_smear(unsigned subreg)
474 {
475 assert(file != HW_REG && file != IMM);
476 subreg_offset = subreg * type_sz(type);
477 stride = 0;
478 return *this;
479 }
480
481 bool
482 fs_reg::is_contiguous() const
483 {
484 return stride == 1;
485 }
486
487 bool
488 fs_reg::is_valid_3src() const
489 {
490 return file == GRF || file == UNIFORM;
491 }
492
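/* Size of a GLSL type in scalar components, which is the unit used for
 * virtual GRF allocation here.  Opaque types (samplers, atomics) take no
 * register space.
 */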
493 int
494 fs_visitor::type_size(const struct glsl_type *type)
495 {
496 unsigned int size, i;
497
498 switch (type->base_type) {
499 case GLSL_TYPE_UINT:
500 case GLSL_TYPE_INT:
501 case GLSL_TYPE_FLOAT:
502 case GLSL_TYPE_BOOL:
503 return type->components();
504 case GLSL_TYPE_ARRAY:
505 return type_size(type->fields.array) * type->length;
506 case GLSL_TYPE_STRUCT:
507 size = 0;
508 for (i = 0; i < type->length; i++) {
509 size += type_size(type->fields.structure[i].type);
510 }
511 return size;
512 case GLSL_TYPE_SAMPLER:
513 /* Samplers take up no register space, since they're baked in at
514 * link time.
515 */
516 return 0;
517 case GLSL_TYPE_ATOMIC_UINT:
518 return 0;
519 case GLSL_TYPE_IMAGE:
520 case GLSL_TYPE_VOID:
521 case GLSL_TYPE_ERROR:
522 case GLSL_TYPE_INTERFACE:
523 unreachable("not reached");
524 }
525
526 return 0;
527 }
528
529 fs_reg
530 fs_visitor::get_timestamp()
531 {
532 assert(brw->gen >= 7);
533
534 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
535 BRW_ARF_TIMESTAMP,
536 0),
537 BRW_REGISTER_TYPE_UD));
538
539 fs_reg dst = fs_reg(this, glsl_type::uint_type);
540
541 fs_inst *mov = emit(MOV(dst, ts));
542 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
543 * even if it's not enabled in the dispatch.
544 */
545 mov->force_writemask_all = true;
546 mov->force_uncompressed = true;
547
548 /* The caller wants the low 32 bits of the timestamp. Since it's running
549 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
550 * which is plenty of time for our purposes. It is identical across the
551 * EUs, but since it's tracking GPU core speed it will increment at a
552 * varying rate as render P-states change.
553 *
554 * The caller could also check if render P-states have changed (or anything
555 * else that might disrupt timing) by setting smear to 2 and checking if
556 * that field is != 0.
557 */
558 dst.set_smear(0);
559
560 return dst;
561 }
562
563 void
564 fs_visitor::emit_shader_time_begin()
565 {
566 current_annotation = "shader time start";
567 shader_start_time = get_timestamp();
568 }
569
570 void
571 fs_visitor::emit_shader_time_end()
572 {
573 current_annotation = "shader time end";
574
575 enum shader_time_shader_type type, written_type, reset_type;
576 if (dispatch_width == 8) {
577 type = ST_FS8;
578 written_type = ST_FS8_WRITTEN;
579 reset_type = ST_FS8_RESET;
580 } else {
581 assert(dispatch_width == 16);
582 type = ST_FS16;
583 written_type = ST_FS16_WRITTEN;
584 reset_type = ST_FS16_RESET;
585 }
586
587 fs_reg shader_end_time = get_timestamp();
588
589 /* Check that there weren't any timestamp reset events (assuming these
590 * were the only two timestamp reads that happened).
591 */
592 fs_reg reset = shader_end_time;
593 reset.set_smear(2);
594 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
595 test->conditional_mod = BRW_CONDITIONAL_Z;
596 emit(IF(BRW_PREDICATE_NORMAL));
597
598 push_force_uncompressed();
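/* diff = shader_end_time - shader_start_time, written as an ADD with the
 * start operand negated.
 */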
599 fs_reg start = shader_start_time;
600 start.negate = true;
601 fs_reg diff = fs_reg(this, glsl_type::uint_type);
602 emit(ADD(diff, start, shader_end_time));
603
604 /* If there were no instructions between the two timestamp gets, the diff
605 * is 2 cycles. Remove that overhead, so I can forget about that when
606 * trying to determine the time taken for single instructions.
607 */
608 emit(ADD(diff, diff, fs_reg(-2u)));
609
610 emit_shader_time_write(type, diff);
611 emit_shader_time_write(written_type, fs_reg(1u));
612 emit(BRW_OPCODE_ELSE);
613 emit_shader_time_write(reset_type, fs_reg(1u));
614 emit(BRW_OPCODE_ENDIF);
615
616 pop_force_uncompressed();
617 }
618
619 void
620 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
621 fs_reg value)
622 {
623 int shader_time_index =
624 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
625 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
626
627 fs_reg payload;
628 if (dispatch_width == 8)
629 payload = fs_reg(this, glsl_type::uvec2_type);
630 else
631 payload = fs_reg(this, glsl_type::uint_type);
632
633 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
634 fs_reg(), payload, offset, value));
635 }
636
637 void
638 fs_visitor::vfail(const char *format, va_list va)
639 {
640 char *msg;
641
642 if (failed)
643 return;
644
645 failed = true;
646
647 msg = ralloc_vasprintf(mem_ctx, format, va);
648 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
649
650 this->fail_msg = msg;
651
652 if (INTEL_DEBUG & DEBUG_WM) {
653 fprintf(stderr, "%s", msg);
654 }
655 }
656
657 void
658 fs_visitor::fail(const char *format, ...)
659 {
660 va_list va;
661
662 va_start(va, format);
663 vfail(format, va);
664 va_end(va);
665 }
666
667 /**
668 * Mark this program as impossible to compile in SIMD16 mode.
669 *
670 * During the SIMD8 compile (which happens first), we can detect and flag
671 * things that are unsupported in SIMD16 mode, so the compiler can skip
672 * the SIMD16 compile altogether.
673 *
674 * During a SIMD16 compile (if one happens anyway), this just calls fail().
675 */
676 void
677 fs_visitor::no16(const char *format, ...)
678 {
679 va_list va;
680
681 va_start(va, format);
682
683 if (dispatch_width == 16) {
684 vfail(format, va);
685 } else {
686 simd16_unsupported = true;
687
688 if (brw->perf_debug) {
689 if (no16_msg)
690 ralloc_vasprintf_append(&no16_msg, format, va);
691 else
692 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
693 }
694 }
695
696 va_end(va);
697 }
698
699 fs_inst *
700 fs_visitor::emit(enum opcode opcode)
701 {
702 return emit(new(mem_ctx) fs_inst(opcode));
703 }
704
705 fs_inst *
706 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
707 {
708 return emit(new(mem_ctx) fs_inst(opcode, dst));
709 }
710
711 fs_inst *
712 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
713 {
714 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
715 }
716
717 fs_inst *
718 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
719 const fs_reg &src1)
720 {
721 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
722 }
723
724 fs_inst *
725 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
726 const fs_reg &src1, const fs_reg &src2)
727 {
728 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
729 }
730
731 fs_inst *
732 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
733 fs_reg src[], int sources)
734 {
735 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
736 }
737
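/* Bracket a stretch of emitted code that has to run uncompressed (SIMD8
 * execution even in a SIMD16 shader).  Using a counter rather than a bool
 * lets these regions nest.
 */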
738 void
739 fs_visitor::push_force_uncompressed()
740 {
741 force_uncompressed_stack++;
742 }
743
744 void
745 fs_visitor::pop_force_uncompressed()
746 {
747 force_uncompressed_stack--;
748 assert(force_uncompressed_stack >= 0);
749 }
750
751 /**
752 * Returns true if the instruction has a flag that means it won't
753 * update an entire destination register.
754 *
755 * For example, dead code elimination and live variable analysis want to know
756 * when a write to a variable screens off any preceding values that were in
757 * it.
758 */
759 bool
760 fs_inst::is_partial_write() const
761 {
762 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
763 this->force_uncompressed ||
764 this->force_sechalf || !this->dst.is_contiguous());
765 }
766
767 int
768 fs_inst::regs_read(fs_visitor *v, int arg) const
769 {
770 if (is_tex() && arg == 0 && src[0].file == GRF) {
771 if (v->dispatch_width == 16)
772 return (mlen + 1) / 2;
773 else
774 return mlen;
775 }
776 return 1;
777 }
778
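/* Flag-register dependencies: predication implies a read, while a
 * conditional mod (except on SEL, where it only picks a source) or
 * FS_OPCODE_MOV_DISPATCH_TO_FLAGS implies a write.
 */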
779 bool
780 fs_inst::reads_flag() const
781 {
782 return predicate;
783 }
784
785 bool
786 fs_inst::writes_flag() const
787 {
788 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
789 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
790 }
791
792 /**
793 * Returns how many MRFs an FS opcode will write over.
794 *
795 * Note that this is not the 0 or 1 implied writes in an actual gen
796 * instruction -- the FS opcodes often generate MOVs in addition.
797 */
798 int
799 fs_visitor::implied_mrf_writes(fs_inst *inst)
800 {
801 if (inst->mlen == 0)
802 return 0;
803
804 if (inst->base_mrf == -1)
805 return 0;
806
807 switch (inst->opcode) {
808 case SHADER_OPCODE_RCP:
809 case SHADER_OPCODE_RSQ:
810 case SHADER_OPCODE_SQRT:
811 case SHADER_OPCODE_EXP2:
812 case SHADER_OPCODE_LOG2:
813 case SHADER_OPCODE_SIN:
814 case SHADER_OPCODE_COS:
815 return 1 * dispatch_width / 8;
816 case SHADER_OPCODE_POW:
817 case SHADER_OPCODE_INT_QUOTIENT:
818 case SHADER_OPCODE_INT_REMAINDER:
819 return 2 * dispatch_width / 8;
820 case SHADER_OPCODE_TEX:
821 case FS_OPCODE_TXB:
822 case SHADER_OPCODE_TXD:
823 case SHADER_OPCODE_TXF:
824 case SHADER_OPCODE_TXF_CMS:
825 case SHADER_OPCODE_TXF_MCS:
826 case SHADER_OPCODE_TG4:
827 case SHADER_OPCODE_TG4_OFFSET:
828 case SHADER_OPCODE_TXL:
829 case SHADER_OPCODE_TXS:
830 case SHADER_OPCODE_LOD:
831 return 1;
832 case FS_OPCODE_FB_WRITE:
833 return 2;
834 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
835 case SHADER_OPCODE_GEN4_SCRATCH_READ:
836 return 1;
837 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
838 return inst->mlen;
839 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
840 return 2;
841 case SHADER_OPCODE_UNTYPED_ATOMIC:
842 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
843 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
844 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
845 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
846 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
847 return 0;
848 default:
849 unreachable("not reached");
850 }
851 }
852
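/* Allocate a new virtual GRF of the given size (in registers), growing the
 * size-tracking array as needed, and return its index.
 */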
853 int
854 fs_visitor::virtual_grf_alloc(int size)
855 {
856 if (virtual_grf_array_size <= virtual_grf_count) {
857 if (virtual_grf_array_size == 0)
858 virtual_grf_array_size = 16;
859 else
860 virtual_grf_array_size *= 2;
861 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
862 virtual_grf_array_size);
863 }
864 virtual_grf_sizes[virtual_grf_count] = size;
865 return virtual_grf_count++;
866 }
867
868 /** Fixed HW reg constructor. */
869 fs_reg::fs_reg(enum register_file file, int reg)
870 {
871 init();
872 this->file = file;
873 this->reg = reg;
874 this->type = BRW_REGISTER_TYPE_F;
875 }
876
877 /** Fixed HW reg constructor. */
878 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
879 {
880 init();
881 this->file = file;
882 this->reg = reg;
883 this->type = type;
884 }
885
886 /** Automatic reg constructor. */
887 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
888 {
889 init();
890
891 this->file = GRF;
892 this->reg = v->virtual_grf_alloc(v->type_size(type));
893 this->reg_offset = 0;
894 this->type = brw_type_for_base_type(type);
895 }
896
897 fs_reg *
898 fs_visitor::variable_storage(ir_variable *var)
899 {
900 return (fs_reg *)hash_table_find(this->variable_ht, var);
901 }
902
903 void
904 import_uniforms_callback(const void *key,
905 void *data,
906 void *closure)
907 {
908 struct hash_table *dst_ht = (struct hash_table *)closure;
909 const fs_reg *reg = (const fs_reg *)data;
910
911 if (reg->file != UNIFORM)
912 return;
913
914 hash_table_insert(dst_ht, data, key);
915 }
916
917 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
918 * This brings in those uniform definitions.
919 */
920 void
921 fs_visitor::import_uniforms(fs_visitor *v)
922 {
923 hash_table_call_foreach(v->variable_ht,
924 import_uniforms_callback,
925 variable_ht);
926 this->push_constant_loc = v->push_constant_loc;
927 this->pull_constant_loc = v->pull_constant_loc;
928 this->uniforms = v->uniforms;
929 this->param_size = v->param_size;
930 }
931
932 /* Our support for uniforms is piggy-backed on the struct
933 * gl_fragment_program, because that's where the values actually
934 * get stored, rather than in some global gl_shader_program uniform
935 * store.
936 */
937 void
938 fs_visitor::setup_uniform_values(ir_variable *ir)
939 {
940 int namelen = strlen(ir->name);
941
942 /* The data for our (non-builtin) uniforms is stored in a series of
943 * gl_uniform_driver_storage structs for each subcomponent that
944 * glGetUniformLocation() could name. We know it's been set up in the same
945 * order we'd walk the type, so walk the list of storage and find anything
946 * with our name, or the prefix of a component that starts with our name.
947 */
948 unsigned params_before = uniforms;
949 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
950 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
951
952 if (strncmp(ir->name, storage->name, namelen) != 0 ||
953 (storage->name[namelen] != 0 &&
954 storage->name[namelen] != '.' &&
955 storage->name[namelen] != '[')) {
956 continue;
957 }
958
959 unsigned slots = storage->type->component_slots();
960 if (storage->array_elements)
961 slots *= storage->array_elements;
962
963 for (unsigned i = 0; i < slots; i++) {
964 stage_prog_data->param[uniforms++] = &storage->storage[i].f;
965 }
966 }
967
968 /* Make sure we actually initialized the right amount of stuff here. */
969 assert(params_before + ir->type->component_slots() == uniforms);
970 (void)params_before;
971 }
972
973
974 /* Our support for builtin uniforms is even scarier than non-builtin.
975 * It sits on top of the PROG_STATE_VAR parameters that are
976 * automatically updated from GL context state.
977 */
978 void
979 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
980 {
981 const ir_state_slot *const slots = ir->state_slots;
982 assert(ir->state_slots != NULL);
983
984 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
985 /* This state reference has already been setup by ir_to_mesa, but we'll
986 * get the same index back here.
987 */
988 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
989 (gl_state_index *)slots[i].tokens);
990
991 /* Add each of the unique swizzles of the element as a parameter.
992 * This'll end up matching the expected layout of the
993 * array/matrix/structure we're trying to fill in.
994 */
995 int last_swiz = -1;
996 for (unsigned int j = 0; j < 4; j++) {
997 int swiz = GET_SWZ(slots[i].swizzle, j);
998 if (swiz == last_swiz)
999 break;
1000 last_swiz = swiz;
1001
1002 stage_prog_data->param[uniforms++] =
1003 &fp->Base.Parameters->ParameterValues[index][swiz].f;
1004 }
1005 }
1006 }
1007
1008 fs_reg *
1009 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1010 {
1011 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1012 fs_reg wpos = *reg;
1013 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1014
1015 /* gl_FragCoord.x */
1016 if (ir->data.pixel_center_integer) {
1017 emit(MOV(wpos, this->pixel_x));
1018 } else {
1019 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1020 }
1021 wpos.reg_offset++;
1022
1023 /* gl_FragCoord.y */
1024 if (!flip && ir->data.pixel_center_integer) {
1025 emit(MOV(wpos, this->pixel_y));
1026 } else {
1027 fs_reg pixel_y = this->pixel_y;
1028 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1029
1030 if (flip) {
1031 pixel_y.negate = true;
1032 offset += key->drawable_height - 1.0;
1033 }
1034
1035 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1036 }
1037 wpos.reg_offset++;
1038
1039 /* gl_FragCoord.z */
1040 if (brw->gen >= 6) {
1041 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1042 } else {
1043 emit(FS_OPCODE_LINTERP, wpos,
1044 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1045 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1046 interp_reg(VARYING_SLOT_POS, 2));
1047 }
1048 wpos.reg_offset++;
1049
1050 /* gl_FragCoord.w: Already set up in emit_interpolation */
1051 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1052
1053 return reg;
1054 }
1055
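/* Emit a LINTERP for one attribute channel, picking the barycentric
 * coordinate set that matches the interpolation qualifier and the
 * centroid/sample flags on gen6+.  Earlier hardware only has the
 * perspective pixel barycentrics.
 */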
1056 fs_inst *
1057 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1058 glsl_interp_qualifier interpolation_mode,
1059 bool is_centroid, bool is_sample)
1060 {
1061 brw_wm_barycentric_interp_mode barycoord_mode;
1062 if (brw->gen >= 6) {
1063 if (is_centroid) {
1064 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1065 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1066 else
1067 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1068 } else if (is_sample) {
1069 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1070 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1071 else
1072 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1073 } else {
1074 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1075 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1076 else
1077 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1078 }
1079 } else {
1080 /* On Ironlake and below, there is only one interpolation mode.
1081 * Centroid interpolation doesn't mean anything on this hardware --
1082 * there is no multisampling.
1083 */
1084 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1085 }
1086 return emit(FS_OPCODE_LINTERP, attr,
1087 this->delta_x[barycoord_mode],
1088 this->delta_y[barycoord_mode], interp);
1089 }
1090
1091 fs_reg *
1092 fs_visitor::emit_general_interpolation(ir_variable *ir)
1093 {
1094 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1095 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1096 fs_reg attr = *reg;
1097
1098 unsigned int array_elements;
1099 const glsl_type *type;
1100
1101 if (ir->type->is_array()) {
1102 array_elements = ir->type->length;
1103 if (array_elements == 0) {
1104 fail("dereferenced array '%s' has length 0\n", ir->name);
1105 }
1106 type = ir->type->fields.array;
1107 } else {
1108 array_elements = 1;
1109 type = ir->type;
1110 }
1111
1112 glsl_interp_qualifier interpolation_mode =
1113 ir->determine_interpolation_mode(key->flat_shade);
1114
1115 int location = ir->data.location;
1116 for (unsigned int i = 0; i < array_elements; i++) {
1117 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1118 if (prog_data->urb_setup[location] == -1) {
1119 /* If there's no incoming setup data for this slot, don't
1120 * emit interpolation for it.
1121 */
1122 attr.reg_offset += type->vector_elements;
1123 location++;
1124 continue;
1125 }
1126
1127 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1128 /* Constant interpolation (flat shading) case. The SF has
1129 * handed us defined values in only the constant offset
1130 * field of the setup reg.
1131 */
1132 for (unsigned int k = 0; k < type->vector_elements; k++) {
1133 struct brw_reg interp = interp_reg(location, k);
1134 interp = suboffset(interp, 3);
1135 interp.type = reg->type;
1136 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1137 attr.reg_offset++;
1138 }
1139 } else {
1140 /* Smooth/noperspective interpolation case. */
1141 for (unsigned int k = 0; k < type->vector_elements; k++) {
1142 struct brw_reg interp = interp_reg(location, k);
1143 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1144 /* Get the pixel/sample mask into f0 so that we know
1145 * which pixels are lit. Then, for each channel that is
1146 * unlit, replace the centroid data with non-centroid
1147 * data.
1148 */
1149 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1150
1151 fs_inst *inst;
1152 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1153 false, false);
1154 inst->predicate = BRW_PREDICATE_NORMAL;
1155 inst->predicate_inverse = true;
1156 if (brw->has_pln)
1157 inst->no_dd_clear = true;
1158
1159 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1160 ir->data.centroid && !key->persample_shading,
1161 ir->data.sample || key->persample_shading);
1162 inst->predicate = BRW_PREDICATE_NORMAL;
1163 inst->predicate_inverse = false;
1164 if (brw->has_pln)
1165 inst->no_dd_check = true;
1166
1167 } else {
1168 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1169 ir->data.centroid && !key->persample_shading,
1170 ir->data.sample || key->persample_shading);
1171 }
1172 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1173 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1174 }
1175 attr.reg_offset++;
1176 }
1177
1178 }
1179 location++;
1180 }
1181 }
1182
1183 return reg;
1184 }
1185
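/* gl_FrontFacing setup.  On gen6+ the bit comes from the thread payload
 * (g0.0 bit 15, extracted with the ASR/NOT/AND below); on older hardware it
 * is derived from the "primitive is back face" bit in g1.6 with a CMP.
 */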
1186 fs_reg *
1187 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1188 {
1189 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1190
1191 /* The frontfacing comes in as a bit in the thread payload. */
1192 if (brw->gen >= 6) {
1193 emit(BRW_OPCODE_ASR, *reg,
1194 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1195 fs_reg(15));
1196 emit(BRW_OPCODE_NOT, *reg, *reg);
1197 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1198 } else {
1199 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1200 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1201 * us front face
1202 */
1203 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1204 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1205 }
1206
1207 return reg;
1208 }
1209
1210 void
1211 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1212 {
1213 assert(dst.type == BRW_REGISTER_TYPE_F);
1214
1215 if (key->compute_pos_offset) {
1216 /* Convert int_sample_pos to floating point */
1217 emit(MOV(dst, int_sample_pos));
1218 /* Scale to the range [0, 1] */
1219 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1220 }
1221 else {
1222 /* From ARB_sample_shading specification:
1223 * "When rendering to a non-multisample buffer, or if multisample
1224 * rasterization is disabled, gl_SamplePosition will always be
1225 * (0.5, 0.5).
1226 */
1227 emit(MOV(dst, fs_reg(0.5f)));
1228 }
1229 }
1230
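/* gl_SamplePosition setup: read the per-slot X/Y sample offsets from the
 * thread payload as bytes and convert them to floats in [0, 1] (or use the
 * constant (0.5, 0.5) when no per-sample position offset is needed).
 */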
1231 fs_reg *
1232 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1233 {
1234 assert(brw->gen >= 6);
1235 assert(ir->type == glsl_type::vec2_type);
1236
1237 this->current_annotation = "compute sample position";
1238 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1239 fs_reg pos = *reg;
1240 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1241 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1242
1243 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1244 * mode will be enabled.
1245 *
1246 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1247 * R31.1:0 Position Offset X/Y for Slot[3:0]
1248 * R31.3:2 Position Offset X/Y for Slot[7:4]
1249 * .....
1250 *
1251 * The X, Y sample positions come in as bytes in thread payload. So, read
1252 * the positions using vstride=16, width=8, hstride=2.
1253 */
1254 struct brw_reg sample_pos_reg =
1255 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1256 BRW_REGISTER_TYPE_B), 16, 8, 2);
1257
1258 fs_inst *inst = emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1259 if (dispatch_width == 16) {
1260 inst->force_uncompressed = true;
1261 inst = emit(MOV(half(int_sample_x, 1),
1262 fs_reg(suboffset(sample_pos_reg, 16))));
1263 inst->force_sechalf = true;
1264 }
1265 /* Compute gl_SamplePosition.x */
1266 compute_sample_position(pos, int_sample_x);
1267 pos.reg_offset++;
1268 inst = emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1269 if (dispatch_width == 16) {
1270 inst->force_uncompressed = true;
1271 inst = emit(MOV(half(int_sample_y, 1),
1272 fs_reg(suboffset(sample_pos_reg, 17))));
1273 inst->force_sechalf = true;
1274 }
1275 /* Compute gl_SamplePosition.y */
1276 compute_sample_position(pos, int_sample_y);
1277 return reg;
1278 }
1279
1280 fs_reg *
1281 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1282 {
1283 assert(brw->gen >= 6);
1284
1285 this->current_annotation = "compute sample id";
1286 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1287
1288 if (key->compute_sample_id) {
1289 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1290 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1291 t2.type = BRW_REGISTER_TYPE_UW;
1292
1293 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1294 * 8x multisampling, subspan 0 will represent sample N (where N
1295 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1296 * 7. We can find the value of N by looking at R0.0 bits 7:6
1297 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1298 * (since samples are always delivered in pairs). That is, we
1299 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1300 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1301 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1302 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1303 * populating a temporary variable with the sequence (0, 1, 2, 3),
1304 * and then reading from it using vstride=1, width=4, hstride=0.
1305 * These computations hold good for 4x multisampling as well.
1306 *
1307 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1308 * the first four slots are sample 0 of subspan 0; the next four
1309 * are sample 1 of subspan 0; the third group is sample 0 of
1310 * subspan 1, and finally sample 1 of subspan 1.
1311 */
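/* Worked example of the computation above (hypothetical payload value):
 * with 8x MSAA and R0.0 bits 7:6 = 0b10, SSPI = 2, so (R0.0 & 0xc0) >> 5
 * = 4; adding the SIMD8 sequence (0, 0, 0, 0, 1, 1, 1, 1) gives sample IDs
 * (4, 4, 4, 4, 5, 5, 5, 5) for the two subspans.
 */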
1312 fs_inst *inst;
1313 inst = emit(BRW_OPCODE_AND, t1,
1314 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1315 fs_reg(0xc0));
1316 inst->force_writemask_all = true;
1317 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1318 inst->force_writemask_all = true;
1319 /* This works for both SIMD8 and SIMD16 */
1320 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1321 inst->force_writemask_all = true;
1322 /* This special instruction takes care of setting vstride=1,
1323 * width=4, hstride=0 of t2 during an ADD instruction.
1324 */
1325 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1326 } else {
1327 /* As per GL_ARB_sample_shading specification:
1328 * "When rendering to a non-multisample buffer, or if multisample
1329 * rasterization is disabled, gl_SampleID will always be zero."
1330 */
1331 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1332 }
1333
1334 return reg;
1335 }
1336
1337 fs_reg
1338 fs_visitor::fix_math_operand(fs_reg src)
1339 {
1340 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1341 * might be able to do better by doing execsize = 1 math and then
1342 * expanding that result out, but we would need to be careful with
1343 * masking.
1344 *
1345 * The hardware ignores source modifiers (negate and abs) on math
1346 * instructions, so we also move to a temp to set those up.
1347 */
1348 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1349 !src.abs && !src.negate)
1350 return src;
1351
1352 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1353 * operands to math
1354 */
1355 if (brw->gen >= 7 && src.file != IMM)
1356 return src;
1357
1358 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1359 expanded.type = src.type;
1360 emit(BRW_OPCODE_MOV, expanded, src);
1361 return expanded;
1362 }
1363
1364 fs_inst *
1365 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1366 {
1367 switch (opcode) {
1368 case SHADER_OPCODE_RCP:
1369 case SHADER_OPCODE_RSQ:
1370 case SHADER_OPCODE_SQRT:
1371 case SHADER_OPCODE_EXP2:
1372 case SHADER_OPCODE_LOG2:
1373 case SHADER_OPCODE_SIN:
1374 case SHADER_OPCODE_COS:
1375 break;
1376 default:
1377 unreachable("not reached: bad math opcode");
1378 }
1379
1380 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1381 * might be able to do better by doing execsize = 1 math and then
1382 * expanding that result out, but we would need to be careful with
1383 * masking.
1384 *
1385 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1386 * instructions, so we also move to a temp to set those up.
1387 */
1388 if (brw->gen == 6 || brw->gen == 7)
1389 src = fix_math_operand(src);
1390
1391 fs_inst *inst = emit(opcode, dst, src);
1392
1393 if (brw->gen < 6) {
1394 inst->base_mrf = 2;
1395 inst->mlen = dispatch_width / 8;
1396 }
1397
1398 return inst;
1399 }
1400
1401 fs_inst *
1402 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1403 {
1404 int base_mrf = 2;
1405 fs_inst *inst;
1406
1407 switch (opcode) {
1408 case SHADER_OPCODE_INT_QUOTIENT:
1409 case SHADER_OPCODE_INT_REMAINDER:
1410 if (brw->gen >= 7)
1411 no16("SIMD16 INTDIV unsupported\n");
1412 break;
1413 case SHADER_OPCODE_POW:
1414 break;
1415 default:
1416 unreachable("not reached: unsupported binary math opcode.");
1417 }
1418
1419 if (brw->gen >= 8) {
1420 inst = emit(opcode, dst, src0, src1);
1421 } else if (brw->gen >= 6) {
1422 src0 = fix_math_operand(src0);
1423 src1 = fix_math_operand(src1);
1424
1425 inst = emit(opcode, dst, src0, src1);
1426 } else {
1427 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1428 * "Message Payload":
1429 *
1430 * "Operand0[7]. For the INT DIV functions, this operand is the
1431 * denominator."
1432 * ...
1433 * "Operand1[7]. For the INT DIV functions, this operand is the
1434 * numerator."
1435 */
1436 bool is_int_div = opcode != SHADER_OPCODE_POW;
1437 fs_reg &op0 = is_int_div ? src1 : src0;
1438 fs_reg &op1 = is_int_div ? src0 : src1;
1439
1440 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1441 inst = emit(opcode, dst, op0, reg_null_f);
1442
1443 inst->base_mrf = base_mrf;
1444 inst->mlen = 2 * dispatch_width / 8;
1445 }
1446 return inst;
1447 }
1448
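/* Push-constant (CURB) setup: record where the push constants start for
 * this dispatch width, size the constant read, and rewrite UNIFORM-file
 * sources into fixed GRF accesses.
 */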
1449 void
1450 fs_visitor::assign_curb_setup()
1451 {
1452 if (dispatch_width == 8) {
1453 prog_data->base.dispatch_grf_start_reg = payload.num_regs;
1454 } else {
1455 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1456 }
1457
1458 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1459
1460 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1461 foreach_in_list(fs_inst, inst, &instructions) {
1462 for (unsigned int i = 0; i < inst->sources; i++) {
1463 if (inst->src[i].file == UNIFORM) {
1464 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1465 int constant_nr;
1466 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1467 constant_nr = push_constant_loc[uniform_nr];
1468 } else {
1469 /* Section 5.11 of the OpenGL 4.1 spec says:
1470 * "Out-of-bounds reads return undefined values, which include
1471 * values from other variables of the active program or zero."
1472 * Just return the first push constant.
1473 */
1474 constant_nr = 0;
1475 }
1476
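/* Push constants are packed eight dwords to a GRF, so e.g. constant_nr 11
 * lands in GRF (payload.num_regs + 1), channel 3.
 */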
1477 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1478 constant_nr / 8,
1479 constant_nr % 8);
1480
1481 inst->src[i].file = HW_REG;
1482 inst->src[i].fixed_hw_reg = byte_offset(
1483 retype(brw_reg, inst->src[i].type),
1484 inst->src[i].subreg_offset);
1485 }
1486 }
1487 }
1488 }
1489
1490 void
1491 fs_visitor::calculate_urb_setup()
1492 {
1493 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1494 prog_data->urb_setup[i] = -1;
1495 }
1496
1497 int urb_next = 0;
1498 /* Figure out where each of the incoming setup attributes lands. */
1499 if (brw->gen >= 6) {
1500 if (_mesa_bitcount_64(fp->Base.InputsRead &
1501 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1502 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1503 * first 16 varying inputs, so we can put them wherever we want.
1504 * Just put them in order.
1505 *
1506 * This is useful because it means that (a) inputs not used by the
1507 * fragment shader won't take up valuable register space, and (b) we
1508 * won't have to recompile the fragment shader if it gets paired with
1509 * a different vertex (or geometry) shader.
1510 */
1511 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1512 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1513 BITFIELD64_BIT(i)) {
1514 prog_data->urb_setup[i] = urb_next++;
1515 }
1516 }
1517 } else {
1518 /* We have enough input varyings that the SF/SBE pipeline stage can't
1519 * arbitrarily rearrange them to suit our whim; we have to put them
1520 * in an order that matches the output of the previous pipeline stage
1521 * (geometry or vertex shader).
1522 */
1523 struct brw_vue_map prev_stage_vue_map;
1524 brw_compute_vue_map(brw, &prev_stage_vue_map,
1525 key->input_slots_valid);
1526 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1527 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1528 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1529 slot++) {
1530 int varying = prev_stage_vue_map.slot_to_varying[slot];
1531 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1532 * unused.
1533 */
1534 if (varying != BRW_VARYING_SLOT_COUNT &&
1535 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1536 BITFIELD64_BIT(varying))) {
1537 prog_data->urb_setup[varying] = slot - first_slot;
1538 }
1539 }
1540 urb_next = prev_stage_vue_map.num_slots - first_slot;
1541 }
1542 } else {
1543 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1544 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1545 /* Point size is packed into the header, not as a general attribute */
1546 if (i == VARYING_SLOT_PSIZ)
1547 continue;
1548
1549 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1550 /* The back color slot is skipped when the front color is
1551 * also written to. In addition, some slots can be
1552 * written in the vertex shader and not read in the
1553 * fragment shader. So the register number must always be
1554 * incremented, mapped or not.
1555 */
1556 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1557 prog_data->urb_setup[i] = urb_next;
1558 urb_next++;
1559 }
1560 }
1561
1562 /*
1563 * It's an FS-only attribute, and we did interpolation for this attribute
1564 * in the SF thread.  So, count it here, too.
1565 *
1566 * See compile_sf_prog() for more info.
1567 */
1568 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1569 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1570 }
1571
1572 prog_data->num_varying_inputs = urb_next;
1573 }
1574
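/* Now that the CURB size is known, turn the slot indices recorded in
 * urb_setup[] into actual setup-register numbers for the interpolation
 * instructions.
 */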
1575 void
1576 fs_visitor::assign_urb_setup()
1577 {
1578 int urb_start = payload.num_regs + prog_data->curb_read_length;
1579
1580 /* Offset all the urb_setup[] index by the actual position of the
1581 * setup regs, now that the location of the constants has been chosen.
1582 */
1583 foreach_in_list(fs_inst, inst, &instructions) {
1584 if (inst->opcode == FS_OPCODE_LINTERP) {
1585 assert(inst->src[2].file == HW_REG);
1586 inst->src[2].fixed_hw_reg.nr += urb_start;
1587 }
1588
1589 if (inst->opcode == FS_OPCODE_CINTERP) {
1590 assert(inst->src[0].file == HW_REG);
1591 inst->src[0].fixed_hw_reg.nr += urb_start;
1592 }
1593 }
1594
1595 /* Each attribute is 4 setup channels, each of which is half a reg. */
1596 this->first_non_payload_grf =
1597 urb_start + prog_data->num_varying_inputs * 2;
1598 }
1599
1600 /**
1601 * Split large virtual GRFs into separate components if we can.
1602 *
1603 * This is mostly duplicated with what brw_fs_vector_splitting does,
1604 * but that's really conservative because it's afraid of doing
1605 * splitting that doesn't result in real progress after the rest of
1606 * the optimization phases, which would cause infinite looping in
1607 * optimization. We can do it once here, safely. This also has the
1608 * opportunity to split interpolated values, or maybe even uniforms,
1609 * which we don't have at the IR level.
1610 *
1611 * We want to split, because virtual GRFs are what we register
1612 * allocate and spill (due to contiguousness requirements for some
1613 * instructions), and they're what we naturally generate in the
1614 * codegen process, but most virtual GRFs don't actually need to be
1615 * contiguous sets of GRFs. If we split, we'll end up with reduced
1616 * live intervals and better dead code elimination and coalescing.
1617 */
1618 void
1619 fs_visitor::split_virtual_grfs()
1620 {
1621 int num_vars = this->virtual_grf_count;
1622 bool split_grf[num_vars];
1623 int new_virtual_grf[num_vars];
1624
1625 /* Try to split anything > 0 sized. */
1626 for (int i = 0; i < num_vars; i++) {
1627 if (this->virtual_grf_sizes[i] != 1)
1628 split_grf[i] = true;
1629 else
1630 split_grf[i] = false;
1631 }
1632
1633 if (brw->has_pln &&
1634 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1635 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1636 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1637 * Gen6, that was the only supported interpolation mode, and since Gen6,
1638 * delta_x and delta_y are in fixed hardware registers.
1639 */
1640 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1641 false;
1642 }
1643
1644 foreach_in_list(fs_inst, inst, &instructions) {
1645 /* If there's a SEND message that requires contiguous destination
1646 * registers, no splitting is allowed.
1647 */
1648 if (inst->regs_written > 1) {
1649 split_grf[inst->dst.reg] = false;
1650 }
1651
1652 /* If we're sending from a GRF, don't split it, on the assumption that
1653 * the send is reading the whole thing.
1654 */
1655 if (inst->is_send_from_grf()) {
1656 for (int i = 0; i < inst->sources; i++) {
1657 if (inst->src[i].file == GRF) {
1658 split_grf[inst->src[i].reg] = false;
1659 }
1660 }
1661 }
1662 }
1663
1664 /* Allocate new space for split regs. Note that the virtual
1665 * numbers will be contiguous.
1666 */
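/* After splitting, a virtual GRF of size N keeps reg_offset 0 in the
 * original (now size-1) register and maps reg_offset k >= 1 to
 * new_virtual_grf + k - 1, as done in the rewrite loop below.
 */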
1667 for (int i = 0; i < num_vars; i++) {
1668 if (split_grf[i]) {
1669 new_virtual_grf[i] = virtual_grf_alloc(1);
1670 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1671 int reg = virtual_grf_alloc(1);
1672 assert(reg == new_virtual_grf[i] + j - 1);
1673 (void) reg;
1674 }
1675 this->virtual_grf_sizes[i] = 1;
1676 }
1677 }
1678
1679 foreach_in_list(fs_inst, inst, &instructions) {
1680 if (inst->dst.file == GRF &&
1681 split_grf[inst->dst.reg] &&
1682 inst->dst.reg_offset != 0) {
1683 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1684 inst->dst.reg_offset - 1);
1685 inst->dst.reg_offset = 0;
1686 }
1687 for (int i = 0; i < inst->sources; i++) {
1688 if (inst->src[i].file == GRF &&
1689 split_grf[inst->src[i].reg] &&
1690 inst->src[i].reg_offset != 0) {
1691 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1692 inst->src[i].reg_offset - 1);
1693 inst->src[i].reg_offset = 0;
1694 }
1695 }
1696 }
1697 invalidate_live_intervals();
1698 }
1699
1700 /**
1701 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1702 *
1703 * During code generation, we create tons of temporary variables, many of
1704 * which get immediately killed and are never used again. Yet, in later
1705 * optimization and analysis passes, such as compute_live_intervals, we need
1706 * to loop over all the virtual GRFs. Compacting them can save a lot of
1707 * overhead.
1708 */
1709 void
1710 fs_visitor::compact_virtual_grfs()
1711 {
1712 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER))
1713 return;
1714
1715 /* Mark which virtual GRFs are used, and count how many. */
1716 int remap_table[this->virtual_grf_count];
1717 memset(remap_table, -1, sizeof(remap_table));
1718
1719 foreach_in_list(const fs_inst, inst, &instructions) {
1720 if (inst->dst.file == GRF)
1721 remap_table[inst->dst.reg] = 0;
1722
1723 for (int i = 0; i < inst->sources; i++) {
1724 if (inst->src[i].file == GRF)
1725 remap_table[inst->src[i].reg] = 0;
1726 }
1727 }
1728
1729 /* Compact the GRF arrays. */
1730 int new_index = 0;
1731 for (int i = 0; i < this->virtual_grf_count; i++) {
1732 if (remap_table[i] != -1) {
1733 remap_table[i] = new_index;
1734 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1735 invalidate_live_intervals();
1736 ++new_index;
1737 }
1738 }
1739
1740 this->virtual_grf_count = new_index;
1741
1742 /* Patch all the instructions to use the newly renumbered registers */
1743 foreach_in_list(fs_inst, inst, &instructions) {
1744 if (inst->dst.file == GRF)
1745 inst->dst.reg = remap_table[inst->dst.reg];
1746
1747 for (int i = 0; i < inst->sources; i++) {
1748 if (inst->src[i].file == GRF)
1749 inst->src[i].reg = remap_table[inst->src[i].reg];
1750 }
1751 }
1752
1753 /* Patch all the references to delta_x/delta_y, since they're used in
1754 * register allocation.
1755 */
1756 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
1757 if (delta_x[i].file == GRF && remap_table[delta_x[i].reg] != -1) {
1758 delta_x[i].reg = remap_table[delta_x[i].reg];
1759 }
1760 }
1761 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
1762 if (delta_y[i].file == GRF && remap_table[delta_y[i].reg] != -1) {
1763 delta_y[i].reg = remap_table[delta_y[i].reg];
1764 }
1765 }
1766 }
1767
1768 /*
1769 * Implements array access of uniforms by inserting a
1770 * PULL_CONSTANT_LOAD instruction.
1771 *
1772 * Unlike temporary GRF array access (where we don't support it due to
1773 * the difficulty of doing relative addressing on instruction
1774 * destinations), we could potentially do array access of uniforms
1775 * that were loaded in GRF space as push constants. In real-world
1776 * usage we've seen, though, the arrays being used are always larger
1777 * than we could load as push constants, so just always move all
1778 * uniform array access out to a pull constant buffer.
1779 */
1780 void
1781 fs_visitor::move_uniform_array_access_to_pull_constants()
1782 {
1783 if (dispatch_width != 8)
1784 return;
1785
1786 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1787
1788 for (unsigned int i = 0; i < uniforms; i++) {
1789 pull_constant_loc[i] = -1;
1790 }
1791
1792 /* Walk through and find array access of uniforms. Put a copy of that
1793 * uniform in the pull constant buffer.
1794 *
1795 * Note that we don't move constant-indexed accesses to arrays. No
1796 * testing has been done of the performance impact of this choice.
1797 */
1798 foreach_in_list_safe(fs_inst, inst, &instructions) {
1799 for (int i = 0 ; i < inst->sources; i++) {
1800 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1801 continue;
1802
1803 int uniform = inst->src[i].reg;
1804
1805 /* If this array isn't already present in the pull constant buffer,
1806 * add it.
1807 */
1808 if (pull_constant_loc[uniform] == -1) {
1809 const float **values = &stage_prog_data->param[uniform];
1810
1811 assert(param_size[uniform]);
1812
1813 for (int j = 0; j < param_size[uniform]; j++) {
1814 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1815
1816 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1817 values[j];
1818 }
1819 }
1820 }
1821 }
1822 }
1823
1824 /**
1825 * Assign UNIFORM file registers to either push constants or pull constants.
1826 *
1827 * We allow a fragment shader to have more than the specified minimum
1828 * maximum number of fragment shader uniform components (64). If
1829 * there are too many of these, they'd fill up all of register space.
1830 * So, this will push some of them out to the pull constant buffer and
1831 * update the program to load them.
1832 */
1833 void
1834 fs_visitor::assign_constant_locations()
1835 {
1836 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1837 if (dispatch_width != 8)
1838 return;
1839
1840 /* Find which UNIFORM registers are still in use. */
1841 bool is_live[uniforms];
1842 for (unsigned int i = 0; i < uniforms; i++) {
1843 is_live[i] = false;
1844 }
1845
1846 foreach_in_list(fs_inst, inst, &instructions) {
1847 for (int i = 0; i < inst->sources; i++) {
1848 if (inst->src[i].file != UNIFORM)
1849 continue;
1850
1851 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1852 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1853 is_live[constant_nr] = true;
1854 }
1855 }
1856
1857 /* Only allow 16 registers (128 uniform components) as push constants.
1858 *
1859 * Just demote the end of the list. We could probably do better
1860 * here, demoting things that are rarely used in the program first.
1861 *
1862 * If changing this value, note the limitation about total_regs in
1863 * brw_curbe.c.
1864 */
1865 unsigned int max_push_components = 16 * 8;
1866 unsigned int num_push_constants = 0;
1867
1868 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1869
1870 for (unsigned int i = 0; i < uniforms; i++) {
1871 if (!is_live[i] || pull_constant_loc[i] != -1) {
1872 /* This UNIFORM register is either dead, or has already been demoted
1873 * to a pull const. Mark it as no longer living in the param[] array.
1874 */
1875 push_constant_loc[i] = -1;
1876 continue;
1877 }
1878
1879 if (num_push_constants < max_push_components) {
1880 /* Retain as a push constant. Record the location in the params[]
1881 * array.
1882 */
1883 push_constant_loc[i] = num_push_constants++;
1884 } else {
1885 /* Demote to a pull constant. */
1886 push_constant_loc[i] = -1;
1887
1888 int pull_index = stage_prog_data->nr_pull_params++;
1889 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1890 pull_constant_loc[i] = pull_index;
1891 }
1892 }
1893
1894 stage_prog_data->nr_params = num_push_constants;
1895
1896 /* Up until now, the param[] array has been indexed by reg + reg_offset
1897 * of UNIFORM registers. Condense it to only contain the uniforms we
1898 * chose to upload as push constants.
1899 */
1900 for (unsigned int i = 0; i < uniforms; i++) {
1901 int remapped = push_constant_loc[i];
1902
1903 if (remapped == -1)
1904 continue;
1905
1906 assert(remapped <= (int)i);
1907 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1908 }
1909 }
1910
1911 /**
1912 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1913 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1914 */
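/* Sketch of the constant-index case below, with made-up numbers: a source
 * reading a uniform whose pull_constant_loc is 5 gets a
 *
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD tmp, surf_index, 16
 *
 * inserted before it (byte offset 5 * 4 = 20, rounded down to the 16-byte
 * vec4 boundary), and the source is rewritten to read tmp with
 * set_smear(5 & 3), i.e. component 1 of that vec4.  Sources with a reladdr
 * take the VARYING_PULL_CONSTANT_LOAD path instead.
 */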
1915 void
1916 fs_visitor::demote_pull_constants()
1917 {
1918 foreach_in_list(fs_inst, inst, &instructions) {
1919 for (int i = 0; i < inst->sources; i++) {
1920 if (inst->src[i].file != UNIFORM)
1921 continue;
1922
1923 int pull_index = pull_constant_loc[inst->src[i].reg +
1924 inst->src[i].reg_offset];
1925 if (pull_index == -1)
1926 continue;
1927
1928 /* Set up the annotation tracking for new generated instructions. */
1929 base_ir = inst->ir;
1930 current_annotation = inst->annotation;
1931
1932 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1933 fs_reg dst = fs_reg(this, glsl_type::float_type);
1934
1935 /* Generate a pull load into dst. */
1936 if (inst->src[i].reladdr) {
1937 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
1938 surf_index,
1939 *inst->src[i].reladdr,
1940 pull_index);
1941 inst->insert_before(&list);
1942 inst->src[i].reladdr = NULL;
1943 } else {
1944 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1945 fs_inst *pull =
1946 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1947 dst, surf_index, offset);
1948 inst->insert_before(pull);
1949 inst->src[i].set_smear(pull_index & 3);
1950 }
1951
1952 /* Rewrite the instruction to use the temporary VGRF. */
1953 inst->src[i].file = GRF;
1954 inst->src[i].reg = dst.reg;
1955 inst->src[i].reg_offset = 0;
1956 }
1957 }
1958 invalidate_live_intervals();
1959 }
1960
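/**
 * Performs simple algebraic simplifications on the IR; for instance
 * (illustrative IR), "mul vgrf3, vgrf2, 1.0f" becomes "mov vgrf3, vgrf2",
 * and a SEL whose two sources are identical becomes an unpredicated MOV.
 */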
1961 bool
1962 fs_visitor::opt_algebraic()
1963 {
1964 bool progress = false;
1965
1966 foreach_in_list(fs_inst, inst, &instructions) {
1967 switch (inst->opcode) {
1968 case BRW_OPCODE_MUL:
1969 if (inst->src[1].file != IMM)
1970 continue;
1971
1972 /* a * 1.0 = a */
1973 if (inst->src[1].is_one()) {
1974 inst->opcode = BRW_OPCODE_MOV;
1975 inst->src[1] = reg_undef;
1976 progress = true;
1977 break;
1978 }
1979
1980 /* a * 0.0 = 0.0 */
1981 if (inst->src[1].is_zero()) {
1982 inst->opcode = BRW_OPCODE_MOV;
1983 inst->src[0] = inst->src[1];
1984 inst->src[1] = reg_undef;
1985 progress = true;
1986 break;
1987 }
1988
1989 break;
1990 case BRW_OPCODE_ADD:
1991 if (inst->src[1].file != IMM)
1992 continue;
1993
1994 /* a + 0.0 = a */
1995 if (inst->src[1].is_zero()) {
1996 inst->opcode = BRW_OPCODE_MOV;
1997 inst->src[1] = reg_undef;
1998 progress = true;
1999 break;
2000 }
2001 break;
2002 case BRW_OPCODE_OR:
2003 if (inst->src[0].equals(inst->src[1])) {
2004 inst->opcode = BRW_OPCODE_MOV;
2005 inst->src[1] = reg_undef;
2006 progress = true;
2007 break;
2008 }
2009 break;
2010 case BRW_OPCODE_LRP:
2011 if (inst->src[1].equals(inst->src[2])) {
2012 inst->opcode = BRW_OPCODE_MOV;
2013 inst->src[0] = inst->src[1];
2014 inst->src[1] = reg_undef;
2015 inst->src[2] = reg_undef;
2016 progress = true;
2017 break;
2018 }
2019 break;
2020 case BRW_OPCODE_SEL:
2021 if (inst->src[0].equals(inst->src[1])) {
2022 inst->opcode = BRW_OPCODE_MOV;
2023 inst->src[1] = reg_undef;
2024 inst->predicate = BRW_PREDICATE_NONE;
2025 inst->predicate_inverse = false;
2026 progress = true;
2027 } else if (inst->saturate && inst->src[1].file == IMM) {
2028 switch (inst->conditional_mod) {
2029 case BRW_CONDITIONAL_LE:
2030 case BRW_CONDITIONAL_L:
2031 switch (inst->src[1].type) {
2032 case BRW_REGISTER_TYPE_F:
2033 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2034 inst->opcode = BRW_OPCODE_MOV;
2035 inst->src[1] = reg_undef;
/* Don't let the new MOV inherit the SEL's conditional mod,
 * or it would start writing the flag register.
 */
inst->conditional_mod = BRW_CONDITIONAL_NONE;
2036 progress = true;
2037 }
2038 break;
2039 default:
2040 break;
2041 }
2042 break;
2043 case BRW_CONDITIONAL_GE:
2044 case BRW_CONDITIONAL_G:
2045 switch (inst->src[1].type) {
2046 case BRW_REGISTER_TYPE_F:
2047 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2048 inst->opcode = BRW_OPCODE_MOV;
2049 inst->src[1] = reg_undef;
2050 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2051 progress = true;
2052 }
2053 break;
2054 default:
2055 break;
2056 }
break;
2057 default:
2058 break;
2059 }
2060 }
2061 break;
2062 default:
2063 break;
2064 }
2065 }
2066
2067 return progress;
2068 }
2069
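/**
 * Rewrites the instruction that produced a GRF value so that it writes
 * directly into the MRF, eliminating the intervening MOV.
 *
 * A sketch with invented register numbers: given
 *
 *    mul vgrf4, vgrf2, vgrf3
 *    mov m5, vgrf4
 *
 * where vgrf4 is not read afterwards, the pass turns the pair into
 *
 *    mul m5, vgrf2, vgrf3
 *
 * subject to the restrictions checked below (no partial writes, single
 * register results, gen6 math must target a GRF, and so on).
 */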
2070 bool
2071 fs_visitor::compute_to_mrf()
2072 {
2073 bool progress = false;
2074 int next_ip = 0;
2075
2076 calculate_live_intervals();
2077
2078 foreach_in_list_safe(fs_inst, inst, &instructions) {
2079 int ip = next_ip;
2080 next_ip++;
2081
2082 if (inst->opcode != BRW_OPCODE_MOV ||
2083 inst->is_partial_write() ||
2084 inst->dst.file != MRF || inst->src[0].file != GRF ||
2085 inst->dst.type != inst->src[0].type ||
2086 inst->src[0].abs || inst->src[0].negate ||
2087 !inst->src[0].is_contiguous() ||
2088 inst->src[0].subreg_offset)
2089 continue;
2090
2091 /* Work out which hardware MRF registers are written by this
2092 * instruction.
2093 */
2094 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2095 int mrf_high;
2096 if (inst->dst.reg & BRW_MRF_COMPR4) {
2097 mrf_high = mrf_low + 4;
2098 } else if (dispatch_width == 16 &&
2099 (!inst->force_uncompressed && !inst->force_sechalf)) {
2100 mrf_high = mrf_low + 1;
2101 } else {
2102 mrf_high = mrf_low;
2103 }
2104
2105 /* Can't compute-to-MRF this GRF if someone else was going to
2106 * read it later.
2107 */
2108 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2109 continue;
2110
2111 /* Found a move of a GRF to a MRF. Let's see if we can go
2112 * rewrite the thing that made this GRF to write into the MRF.
2113 */
2114 fs_inst *scan_inst;
2115 for (scan_inst = (fs_inst *)inst->prev;
2116 !scan_inst->is_head_sentinel();
2117 scan_inst = (fs_inst *)scan_inst->prev) {
2118 if (scan_inst->dst.file == GRF &&
2119 scan_inst->dst.reg == inst->src[0].reg) {
2120 /* Found the last thing to write our reg we want to turn
2121 * into a compute-to-MRF.
2122 */
2123
2124 /* If this one instruction didn't populate all the
2125 * channels, bail. We might be able to rewrite everything
2126 * that writes that reg, but it would require smarter
2127 * tracking to delay the rewriting until complete success.
2128 */
2129 if (scan_inst->is_partial_write())
2130 break;
2131
2132 /* Things returning more than one register would need us to
2133 * understand coalescing out more than one MOV at a time.
2134 */
2135 if (scan_inst->regs_written > 1)
2136 break;
2137
2138 /* SEND instructions can't have MRF as a destination. */
2139 if (scan_inst->mlen)
2140 break;
2141
2142 if (brw->gen == 6) {
2143 /* gen6 math instructions must have the destination be
2144 * GRF, so no compute-to-MRF for them.
2145 */
2146 if (scan_inst->is_math()) {
2147 break;
2148 }
2149 }
2150
2151 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2152 /* Found the creator of our MRF's source value. */
2153 scan_inst->dst.file = MRF;
2154 scan_inst->dst.reg = inst->dst.reg;
2155 scan_inst->saturate |= inst->saturate;
2156 inst->remove();
2157 progress = true;
2158 }
2159 break;
2160 }
2161
2162 /* We don't handle control flow here. Most computation of
2163 * values that end up in MRFs happens shortly before the MRF
2164 * write anyway.
2165 */
2166 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2167 break;
2168
2169 /* You can't read from an MRF, so if someone else reads our
2170 * MRF's source GRF that we wanted to rewrite, that stops us.
2171 */
2172 bool interfered = false;
2173 for (int i = 0; i < scan_inst->sources; i++) {
2174 if (scan_inst->src[i].file == GRF &&
2175 scan_inst->src[i].reg == inst->src[0].reg &&
2176 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2177 interfered = true;
2178 }
2179 }
2180 if (interfered)
2181 break;
2182
2183 if (scan_inst->dst.file == MRF) {
2184 /* If somebody else writes our MRF here, we can't
2185 * compute-to-MRF before that.
2186 */
2187 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2188 int scan_mrf_high;
2189
2190 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2191 scan_mrf_high = scan_mrf_low + 4;
2192 } else if (dispatch_width == 16 &&
2193 (!scan_inst->force_uncompressed &&
2194 !scan_inst->force_sechalf)) {
2195 scan_mrf_high = scan_mrf_low + 1;
2196 } else {
2197 scan_mrf_high = scan_mrf_low;
2198 }
2199
2200 if (mrf_low == scan_mrf_low ||
2201 mrf_low == scan_mrf_high ||
2202 mrf_high == scan_mrf_low ||
2203 mrf_high == scan_mrf_high) {
2204 break;
2205 }
2206 }
2207
2208 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2209 /* Found a SEND instruction, which means that there are
2210 * live values in MRFs from base_mrf to base_mrf +
2211 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2212 * above it.
2213 */
2214 if (mrf_low >= scan_inst->base_mrf &&
2215 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2216 break;
2217 }
2218 if (mrf_high >= scan_inst->base_mrf &&
2219 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2220 break;
2221 }
2222 }
2223 }
2224 }
2225
2226 if (progress)
2227 invalidate_live_intervals();
2228
2229 return progress;
2230 }
2231
2232 /**
2233 * Walks through basic blocks, looking for repeated MRF writes and
2234 * removing the later ones.
2235 */
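/* For instance (hypothetical IR), two identical header setups
 *
 *    mov m2, vgrf7
 *    ...no control flow, no other write to m2 or vgrf7...
 *    mov m2, vgrf7
 *
 * leave the second MOV with nothing to do, so it is removed below.
 */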
2236 bool
2237 fs_visitor::remove_duplicate_mrf_writes()
2238 {
2239 fs_inst *last_mrf_move[16];
2240 bool progress = false;
2241
2242 /* We would need to update the MRF tracking to handle compressed (SIMD16) instructions, so just bail. */
2243 if (dispatch_width == 16)
2244 return false;
2245
2246 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2247
2248 foreach_in_list_safe(fs_inst, inst, &instructions) {
2249 if (inst->is_control_flow()) {
2250 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2251 }
2252
2253 if (inst->opcode == BRW_OPCODE_MOV &&
2254 inst->dst.file == MRF) {
2255 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2256 if (prev_inst && inst->equals(prev_inst)) {
2257 inst->remove();
2258 progress = true;
2259 continue;
2260 }
2261 }
2262
2263 /* Clear out the last-write records for MRFs that were overwritten. */
2264 if (inst->dst.file == MRF) {
2265 last_mrf_move[inst->dst.reg] = NULL;
2266 }
2267
2268 if (inst->mlen > 0 && inst->base_mrf != -1) {
2269 /* Found a SEND instruction, which will include two or fewer
2270 * implied MRF writes. We could do better here.
2271 */
2272 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2273 last_mrf_move[inst->base_mrf + i] = NULL;
2274 }
2275 }
2276
2277 /* Clear out any MRF move records whose sources got overwritten. */
2278 if (inst->dst.file == GRF) {
2279 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2280 if (last_mrf_move[i] &&
2281 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2282 last_mrf_move[i] = NULL;
2283 }
2284 }
2285 }
2286
2287 if (inst->opcode == BRW_OPCODE_MOV &&
2288 inst->dst.file == MRF &&
2289 inst->src[0].file == GRF &&
2290 !inst->is_partial_write()) {
2291 last_mrf_move[inst->dst.reg] = inst;
2292 }
2293 }
2294
2295 if (progress)
2296 invalidate_live_intervals();
2297
2298 return progress;
2299 }
2300
2301 static void
2302 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2303 int first_grf, int grf_len)
2304 {
2305 bool inst_simd16 = (dispatch_width > 8 &&
2306 !inst->force_uncompressed &&
2307 !inst->force_sechalf);
2308
2309 /* Clear the flag for registers that actually got read (as expected). */
2310 for (int i = 0; i < inst->sources; i++) {
2311 int grf;
2312 if (inst->src[i].file == GRF) {
2313 grf = inst->src[i].reg;
2314 } else if (inst->src[i].file == HW_REG &&
2315 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2316 grf = inst->src[i].fixed_hw_reg.nr;
2317 } else {
2318 continue;
2319 }
2320
2321 if (grf >= first_grf &&
2322 grf < first_grf + grf_len) {
2323 deps[grf - first_grf] = false;
2324 if (inst_simd16)
2325 deps[grf - first_grf + 1] = false;
2326 }
2327 }
2328 }
2329
2330 /**
2331 * Implements this workaround for the original 965:
2332 *
2333 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2334 * check for post destination dependencies on this instruction, software
2335 * must ensure that there is no destination hazard for the case of ‘write
2336 * followed by a posted write’ shown in the following example.
2337 *
2338 * 1. mov r3 0
2339 * 2. send r3.xy <rest of send instruction>
2340 * 3. mov r2 r3
2341 *
2342 * Due to no post-destination dependency check on the ‘send’, the above
2343 * code sequence could have two instructions (1 and 2) in flight at the
2344 * same time that both consider ‘r3’ as the target of their final writes.
2345 */
2346 void
2347 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2348 {
2349 int reg_size = dispatch_width / 8;
2350 int write_len = inst->regs_written * reg_size;
2351 int first_write_grf = inst->dst.reg;
2352 bool needs_dep[BRW_MAX_MRF];
2353 assert(write_len < (int)sizeof(needs_dep) - 1);
2354
2355 memset(needs_dep, false, sizeof(needs_dep));
2356 memset(needs_dep, true, write_len);
2357
2358 clear_deps_for_inst_src(inst, dispatch_width,
2359 needs_dep, first_write_grf, write_len);
2360
2361 /* Walk backwards looking for writes to registers we're writing which
2362 * aren't read since being written. If we hit the start of the program,
2363 * we assume that there are no outstanding dependencies on entry to the
2364 * program.
2365 */
2366 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2367 !scan_inst->is_head_sentinel();
2368 scan_inst = (fs_inst *)scan_inst->prev) {
2369
2370 /* If we hit control flow, assume that there *are* outstanding
2371 * dependencies, and force their cleanup before our instruction.
2372 */
2373 if (scan_inst->is_control_flow()) {
2374 for (int i = 0; i < write_len; i++) {
2375 if (needs_dep[i]) {
2376 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2377 }
2378 }
2379 return;
2380 }
2381
2382 bool scan_inst_simd16 = (dispatch_width > 8 &&
2383 !scan_inst->force_uncompressed &&
2384 !scan_inst->force_sechalf);
2385
2386 /* We insert our reads as late as possible on the assumption that any
2387 * instruction but a MOV that might have left us an outstanding
2388 * dependency has more latency than a MOV.
2389 */
2390 if (scan_inst->dst.file == GRF) {
2391 for (int i = 0; i < scan_inst->regs_written; i++) {
2392 int reg = scan_inst->dst.reg + i * reg_size;
2393
2394 if (reg >= first_write_grf &&
2395 reg < first_write_grf + write_len &&
2396 needs_dep[reg - first_write_grf]) {
2397 inst->insert_before(DEP_RESOLVE_MOV(reg));
2398 needs_dep[reg - first_write_grf] = false;
2399 if (scan_inst_simd16)
2400 needs_dep[reg - first_write_grf + 1] = false;
2401 }
2402 }
2403 }
2404
2405 /* Clear the flag for registers that actually got read (as expected). */
2406 clear_deps_for_inst_src(scan_inst, dispatch_width,
2407 needs_dep, first_write_grf, write_len);
2408
2409 /* Continue the loop only if we haven't resolved all the dependencies */
2410 int i;
2411 for (i = 0; i < write_len; i++) {
2412 if (needs_dep[i])
2413 break;
2414 }
2415 if (i == write_len)
2416 return;
2417 }
2418 }
2419
2420 /**
2421 * Implements this workaround for the original 965:
2422 *
2423 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2424 * used as a destination register until after it has been sourced by an
2425 * instruction with a different destination register.
2426 */
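/* In IR terms (registers invented for illustration), a sequence such as
 *
 *    send vgrf8, ...            <- posted write to vgrf8
 *    ...
 *    add vgrf8, vgrf1, vgrf2    <- reuses vgrf8 as a destination
 *
 * needs a read of vgrf8 (a DEP_RESOLVE_MOV, inserted below) before the ADD
 * so that the send's write is known to have landed first.
 */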
2427 void
2428 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2429 {
2430 int write_len = inst->regs_written * dispatch_width / 8;
2431 int first_write_grf = inst->dst.reg;
2432 bool needs_dep[BRW_MAX_MRF];
2433 assert(write_len < (int)sizeof(needs_dep) - 1);
2434
2435 memset(needs_dep, false, sizeof(needs_dep));
2436 memset(needs_dep, true, write_len);
2437 /* Walk forwards looking for writes to registers we're writing which aren't
2438 * read before being written.
2439 */
2440 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2441 !scan_inst->is_tail_sentinel();
2442 scan_inst = (fs_inst *)scan_inst->next) {
2443 /* If we hit control flow, force resolve all remaining dependencies. */
2444 if (scan_inst->is_control_flow()) {
2445 for (int i = 0; i < write_len; i++) {
2446 if (needs_dep[i])
2447 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2448 }
2449 return;
2450 }
2451
2452 /* Clear the flag for registers that actually got read (as expected). */
2453 clear_deps_for_inst_src(scan_inst, dispatch_width,
2454 needs_dep, first_write_grf, write_len);
2455
2456 /* We insert our reads as late as possible since they're reading the
2457 * result of a SEND, which has massive latency.
2458 */
2459 if (scan_inst->dst.file == GRF &&
2460 scan_inst->dst.reg >= first_write_grf &&
2461 scan_inst->dst.reg < first_write_grf + write_len &&
2462 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2463 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2464 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2465 }
2466
2467 /* Continue the loop only if we haven't resolved all the dependencies */
2468 int i;
2469 for (i = 0; i < write_len; i++) {
2470 if (needs_dep[i])
2471 break;
2472 }
2473 if (i == write_len)
2474 return;
2475 }
2476
2477 /* If we hit the end of the program, resolve all remaining dependencies out
2478 * of paranoia.
2479 */
2480 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2481 assert(last_inst->eot);
2482 for (int i = 0; i < write_len; i++) {
2483 if (needs_dep[i])
2484 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2485 }
2486 }
2487
2488 void
2489 fs_visitor::insert_gen4_send_dependency_workarounds()
2490 {
2491 if (brw->gen != 4 || brw->is_g4x)
2492 return;
2493
2494 bool progress = false;
2495
2496 /* Note that we're done with register allocation, so GRF fs_regs always
2497 * have a .reg_offset of 0.
2498 */
2499
2500 foreach_in_list_safe(fs_inst, inst, &instructions) {
2501 if (inst->mlen != 0 && inst->dst.file == GRF) {
2502 insert_gen4_pre_send_dependency_workarounds(inst);
2503 insert_gen4_post_send_dependency_workarounds(inst);
2504 progress = true;
2505 }
2506 }
2507
2508 if (progress)
2509 invalidate_live_intervals();
2510 }
2511
2512 /**
2513 * Turns the generic expression-style uniform pull constant load instruction
2514 * into a hardware-specific series of instructions for loading a pull
2515 * constant.
2516 *
2517 * The expression style allows the CSE pass before this to optimize out
2518 * repeated loads from the same offset, and gives the pre-register-allocation
2519 * scheduling full flexibility, while the conversion to native instructions
2520 * allows the post-register-allocation scheduler the best information
2521 * possible.
2522 *
2523 * Note that execution masking for setting up pull constant loads is special:
2524 * the channels that need to be written are unrelated to the current execution
2525 * mask, since a later instruction will use one of the result channels as a
2526 * source operand for all 8 or 16 of its channels.
2527 */
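/* For example, on the gen7 path below a load whose immediate src[1] is the
 * byte offset 48 (a number invented for illustration) is rewritten so that
 * FS_OPCODE_SET_SIMD4X2_OFFSET first materializes the dword offset
 * 48 / 4 = 12 in a payload register, and the load itself becomes
 * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7 sourcing that payload.
 */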
2528 void
2529 fs_visitor::lower_uniform_pull_constant_loads()
2530 {
2531 foreach_in_list(fs_inst, inst, &instructions) {
2532 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2533 continue;
2534
2535 if (brw->gen >= 7) {
2536 /* The offset arg before was a vec4-aligned byte offset. We need to
2537 * turn it into a dword offset.
2538 */
2539 fs_reg const_offset_reg = inst->src[1];
2540 assert(const_offset_reg.file == IMM &&
2541 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2542 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2543 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2544
2545 /* This is actually going to be a MOV, but since only the first dword
2546 * is accessed, we have a special opcode to do just that one dword. Note
2547 * that this needs to be an operation that will be considered a def
2548 * by live variable analysis, or register allocation will explode.
2549 */
2550 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2551 payload, const_offset_reg);
2552 setup->force_writemask_all = true;
2553
2554 setup->ir = inst->ir;
2555 setup->annotation = inst->annotation;
2556 inst->insert_before(setup);
2557
2558 /* Similarly, this will only populate the first 4 channels of the
2559 * result register (since we only use smear values from 0-3), but we
2560 * don't tell the optimizer.
2561 */
2562 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2563 inst->src[1] = payload;
2564
2565 invalidate_live_intervals();
2566 } else {
2567 /* Before register allocation, we didn't tell the scheduler about the
2568 * MRF we use. We know it's safe to use this MRF because nothing
2569 * else does except for register spill/unspill, which generates and
2570 * uses its MRF within a single IR instruction.
2571 */
2572 inst->base_mrf = 14;
2573 inst->mlen = 1;
2574 }
2575 }
2576 }
2577
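/**
 * Expands SHADER_OPCODE_LOAD_PAYLOAD into a series of plain MOVs into
 * consecutive reg_offsets of the destination.
 *
 * Roughly (register numbers are illustrative only), a LOAD_PAYLOAD with a
 * header in src[0] and two further sources becomes
 *
 *    mov vgrf10+0, src[0]    <- omitted when src[0] is BAD_FILE
 *    mov vgrf10+1, src[1]
 *    mov vgrf10+2, src[2]
 *
 * and the original instruction is removed.
 */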
2578 bool
2579 fs_visitor::lower_load_payload()
2580 {
2581 bool progress = false;
2582
2583 foreach_in_list_safe(fs_inst, inst, &instructions) {
2584 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
2585 fs_reg dst = inst->dst;
2586
2587 /* src[0] represents the (optional) message header. */
2588 if (inst->src[0].file != BAD_FILE) {
2589 inst->insert_before(MOV(dst, inst->src[0]));
2590 }
2591 dst.reg_offset++;
2592
2593 for (int i = 1; i < inst->sources; i++) {
2594 inst->insert_before(MOV(dst, inst->src[i]));
2595 dst.reg_offset++;
2596 }
2597
2598 inst->remove();
2599 progress = true;
2600 }
2601 }
2602
2603 if (progress)
2604 invalidate_live_intervals();
2605
2606 return progress;
2607 }
2608
2609 void
2610 fs_visitor::dump_instructions()
2611 {
2612 dump_instructions(NULL);
2613 }
2614
2615 void
2616 fs_visitor::dump_instructions(const char *name)
2617 {
2618 calculate_register_pressure();
2619 FILE *file = stderr;
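/* Only redirect output to the named file when not running as root,
 * presumably so that a privileged (e.g. setuid) client never creates debug
 * files; otherwise everything goes to stderr.
 */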
2620 if (name && geteuid() != 0) {
2621 file = fopen(name, "w");
2622 if (!file)
2623 file = stderr;
2624 }
2625
2626 int ip = 0, max_pressure = 0;
2627 foreach_in_list(backend_instruction, inst, &instructions) {
2628 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2629 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2630 dump_instruction(inst, file);
2631 ++ip;
2632 }
2633 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
2634
2635 if (file != stderr) {
2636 fclose(file);
2637 }
2638 }
2639
2640 void
2641 fs_visitor::dump_instruction(backend_instruction *be_inst)
2642 {
2643 dump_instruction(be_inst, stderr);
2644 }
2645
2646 void
2647 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
2648 {
2649 fs_inst *inst = (fs_inst *)be_inst;
2650
2651 if (inst->predicate) {
2652 fprintf(file, "(%cf0.%d) ",
2653 inst->predicate_inverse ? '-' : '+',
2654 inst->flag_subreg);
2655 }
2656
2657 fprintf(file, "%s", brw_instruction_name(inst->opcode));
2658 if (inst->saturate)
2659 fprintf(file, ".sat");
2660 if (inst->conditional_mod) {
2661 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
2662 if (!inst->predicate &&
2663 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2664 inst->opcode != BRW_OPCODE_IF &&
2665 inst->opcode != BRW_OPCODE_WHILE))) {
2666 fprintf(file, ".f0.%d", inst->flag_subreg);
2667 }
2668 }
2669 fprintf(file, " ");
2670
2671
2672 switch (inst->dst.file) {
2673 case GRF:
2674 fprintf(file, "vgrf%d", inst->dst.reg);
2675 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
2676 inst->dst.subreg_offset)
2677 fprintf(file, "+%d.%d",
2678 inst->dst.reg_offset, inst->dst.subreg_offset);
2679 break;
2680 case MRF:
2681 fprintf(file, "m%d", inst->dst.reg);
2682 break;
2683 case BAD_FILE:
2684 fprintf(file, "(null)");
2685 break;
2686 case UNIFORM:
2687 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
2688 break;
2689 case HW_REG:
2690 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2691 switch (inst->dst.fixed_hw_reg.nr) {
2692 case BRW_ARF_NULL:
2693 fprintf(file, "null");
2694 break;
2695 case BRW_ARF_ADDRESS:
2696 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
2697 break;
2698 case BRW_ARF_ACCUMULATOR:
2699 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
2700 break;
2701 case BRW_ARF_FLAG:
2702 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2703 inst->dst.fixed_hw_reg.subnr);
2704 break;
2705 default:
2706 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2707 inst->dst.fixed_hw_reg.subnr);
2708 break;
2709 }
2710 } else {
2711 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
2712 }
2713 if (inst->dst.fixed_hw_reg.subnr)
2714 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
2715 break;
2716 default:
2717 fprintf(file, "???");
2718 break;
2719 }
2720 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
2721
2722 for (int i = 0; i < inst->sources && inst->src[i].file != BAD_FILE; i++) {
2723 if (inst->src[i].negate)
2724 fprintf(file, "-");
2725 if (inst->src[i].abs)
2726 fprintf(file, "|");
2727 switch (inst->src[i].file) {
2728 case GRF:
2729 fprintf(file, "vgrf%d", inst->src[i].reg);
2730 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2731 inst->src[i].subreg_offset)
2732 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2733 inst->src[i].subreg_offset);
2734 break;
2735 case MRF:
2736 fprintf(file, "***m%d***", inst->src[i].reg);
2737 break;
2738 case UNIFORM:
2739 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
2740 if (inst->src[i].reladdr) {
2741 fprintf(file, "+reladdr");
2742 } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2743 inst->src[i].subreg_offset) {
2744 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2745 inst->src[i].subreg_offset);
2746 }
2747 break;
2748 case BAD_FILE:
2749 fprintf(file, "(null)");
2750 break;
2751 case IMM:
2752 switch (inst->src[i].type) {
2753 case BRW_REGISTER_TYPE_F:
2754 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
2755 break;
2756 case BRW_REGISTER_TYPE_D:
2757 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
2758 break;
2759 case BRW_REGISTER_TYPE_UD:
2760 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
2761 break;
2762 default:
2763 fprintf(file, "???");
2764 break;
2765 }
2766 break;
2767 case HW_REG:
2768 if (inst->src[i].fixed_hw_reg.negate)
2769 fprintf(file, "-");
2770 if (inst->src[i].fixed_hw_reg.abs)
2771 fprintf(file, "|");
2772 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2773 switch (inst->src[i].fixed_hw_reg.nr) {
2774 case BRW_ARF_NULL:
2775 fprintf(file, "null");
2776 break;
2777 case BRW_ARF_ADDRESS:
2778 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
2779 break;
2780 case BRW_ARF_ACCUMULATOR:
2781 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
2782 break;
2783 case BRW_ARF_FLAG:
2784 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2785 inst->src[i].fixed_hw_reg.subnr);
2786 break;
2787 default:
2788 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2789 inst->src[i].fixed_hw_reg.subnr);
2790 break;
2791 }
2792 } else {
2793 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2794 }
2795 if (inst->src[i].fixed_hw_reg.subnr)
2796 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
2797 if (inst->src[i].fixed_hw_reg.abs)
2798 fprintf(file, "|");
2799 break;
2800 default:
2801 fprintf(file, "???");
2802 break;
2803 }
2804 if (inst->src[i].abs)
2805 fprintf(file, "|");
2806
2807 if (inst->src[i].file != IMM) {
2808 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
2809 }
2810
2811 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
2812 fprintf(file, ", ");
2813 }
2814
2815 fprintf(file, " ");
2816
2817 if (inst->force_uncompressed)
2818 fprintf(file, "1sthalf ");
2819
2820 if (inst->force_sechalf)
2821 fprintf(file, "2ndhalf ");
2822
2823 fprintf(file, "\n");
2824 }
2825
2826 /**
2827 * Possibly returns an instruction that set up @param reg.
2828 *
2829 * Sometimes we want to take the result of some expression/variable
2830 * dereference tree and rewrite the instruction generating the result
2831 * of the tree. When processing the tree, we know that the
2832 * instructions generated are all writing temporaries that are dead
2833 * outside of this tree. So, if we have some instructions that write
2834 * a temporary, we're free to point that temp write somewhere else.
2835 *
2836 * Note that this doesn't guarantee that the returned instruction wrote only
2837 * reg -- it might be the size=4 destination of a texture instruction.
2838 */
2839 fs_inst *
2840 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2841 fs_inst *end,
2842 const fs_reg &reg)
2843 {
2844 if (end == start ||
2845 end->is_partial_write() ||
2846 reg.reladdr ||
2847 !reg.equals(end->dst)) {
2848 return NULL;
2849 } else {
2850 return end;
2851 }
2852 }
2853
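/* A worked example of how payload.num_regs accumulates below, for a
 * hypothetical SIMD8 shader with one barycentric mode enabled, source depth
 * in use, and neither position offsets nor an input coverage mask:
 *
 *    masks, pixel X/Y coordinates    regs 0-1
 *    barycentric coordinates         regs 2-3   (2 regs per mode in SIMD8)
 *    interpolated depth              reg  4
 *    interpolated W                  reg  5
 *
 * giving payload.num_regs == 6.  In SIMD16 the barycentric, depth and W
 * entries each occupy twice as many registers.
 */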
2854 void
2855 fs_visitor::setup_payload_gen6()
2856 {
2857 bool uses_depth =
2858 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2859 unsigned barycentric_interp_modes = prog_data->barycentric_interp_modes;
2860
2861 assert(brw->gen >= 6);
2862
2863 /* R0-1: masks, pixel X/Y coordinates. */
2864 payload.num_regs = 2;
2865 /* R2: only for 32-pixel dispatch. */
2866
2867 /* R3-26: barycentric interpolation coordinates. These appear in the
2868 * same order that they appear in the brw_wm_barycentric_interp_mode
2869 * enum. Each set of coordinates occupies 2 registers if dispatch width
2870 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2871 * appear if they were enabled using the "Barycentric Interpolation
2872 * Mode" bits in WM_STATE.
2873 */
2874 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2875 if (barycentric_interp_modes & (1 << i)) {
2876 payload.barycentric_coord_reg[i] = payload.num_regs;
2877 payload.num_regs += 2;
2878 if (dispatch_width == 16) {
2879 payload.num_regs += 2;
2880 }
2881 }
2882 }
2883
2884 /* R27: interpolated depth if the shader uses source depth */
2885 if (uses_depth) {
2886 payload.source_depth_reg = payload.num_regs;
2887 payload.num_regs++;
2888 if (dispatch_width == 16) {
2889 /* R28: interpolated depth if not SIMD8. */
2890 payload.num_regs++;
2891 }
2892 }
2893 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2894 if (uses_depth) {
2895 payload.source_w_reg = payload.num_regs;
2896 payload.num_regs++;
2897 if (dispatch_width == 16) {
2898 /* R30: interpolated W if not SIMD8. */
2899 payload.num_regs++;
2900 }
2901 }
2902
2903 prog_data->uses_pos_offset = key->compute_pos_offset;
2904 /* R31: MSAA position offsets. */
2905 if (prog_data->uses_pos_offset) {
2906 payload.sample_pos_reg = payload.num_regs;
2907 payload.num_regs++;
2908 }
2909
2910 /* R32: MSAA input coverage mask */
2911 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
2912 assert(brw->gen >= 7);
2913 payload.sample_mask_in_reg = payload.num_regs;
2914 payload.num_regs++;
2915 if (dispatch_width == 16) {
2916 /* R33: input coverage mask if not SIMD8. */
2917 payload.num_regs++;
2918 }
2919 }
2920
2921 /* R34-: bary for 32-pixel. */
2922 /* R58-59: interp W for 32-pixel. */
2923
2924 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2925 source_depth_to_render_target = true;
2926 }
2927 }
2928
2929 void
2930 fs_visitor::assign_binding_table_offsets()
2931 {
2932 uint32_t next_binding_table_offset = 0;
2933
2934 /* If there are no color regions, we still perform an FB write to a null
2935 * renderbuffer, which we place at surface index 0.
2936 */
2937 prog_data->binding_table.render_target_start = next_binding_table_offset;
2938 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
2939
2940 assign_common_binding_table_offsets(next_binding_table_offset);
2941 }
2942
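/**
 * Estimates register pressure by recording, for each instruction IP, the
 * total size of every virtual GRF whose live range covers that IP.
 */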
2943 void
2944 fs_visitor::calculate_register_pressure()
2945 {
2946 invalidate_live_intervals();
2947 calculate_live_intervals();
2948
2949 unsigned num_instructions = instructions.length();
2950
2951 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
2952
2953 for (int reg = 0; reg < virtual_grf_count; reg++) {
2954 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
2955 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
2956 }
2957 }
2958
2959 /**
2960 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
2961 *
2962 * The needs_unlit_centroid_workaround ends up producing one of these per
2963 * channel of centroid input, so it's good to clean them up.
2964 *
2965 * An assumption here is that nothing ever modifies the dispatched pixels
2966 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
2967 * dictates that anyway.
2968 */
2969 void
2970 fs_visitor::opt_drop_redundant_mov_to_flags()
2971 {
2972 bool flag_mov_found[2] = {false};
2973
2974 foreach_in_list_safe(fs_inst, inst, &instructions) {
2975 if (inst->is_control_flow()) {
2976 memset(flag_mov_found, 0, sizeof(flag_mov_found));
2977 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
2978 if (!flag_mov_found[inst->flag_subreg])
2979 flag_mov_found[inst->flag_subreg] = true;
2980 else
2981 inst->remove();
2982 } else if (inst->writes_flag()) {
2983 flag_mov_found[inst->flag_subreg] = false;
2984 }
2985 }
2986 }
2987
2988 bool
2989 fs_visitor::run()
2990 {
2991 sanity_param_count = fp->Base.Parameters->NumParameters;
2992 bool allocated_without_spills;
2993
2994 assign_binding_table_offsets();
2995
2996 if (brw->gen >= 6)
2997 setup_payload_gen6();
2998 else
2999 setup_payload_gen4();
3000
3001 if (0) {
3002 emit_dummy_fs();
3003 } else {
3004 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3005 emit_shader_time_begin();
3006
3007 calculate_urb_setup();
3008 if (fp->Base.InputsRead > 0) {
3009 if (brw->gen < 6)
3010 emit_interpolation_setup_gen4();
3011 else
3012 emit_interpolation_setup_gen6();
3013 }
3014
3015 /* We handle discards by keeping track of the still-live pixels in f0.1.
3016 * Initialize it with the dispatched pixels.
3017 */
3018 if (fp->UsesKill || key->alpha_test_func) {
3019 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3020 discard_init->flag_subreg = 1;
3021 }
3022
3023 /* Generate FS IR for main(). (the visitor only descends into
3024 * functions called "main").
3025 */
3026 if (shader) {
3027 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3028 base_ir = ir;
3029 this->result = reg_undef;
3030 ir->accept(this);
3031 }
3032 } else {
3033 emit_fragment_program_code();
3034 }
3035 base_ir = NULL;
3036 if (failed)
3037 return false;
3038
3039 emit(FS_OPCODE_PLACEHOLDER_HALT);
3040
3041 if (key->alpha_test_func)
3042 emit_alpha_test();
3043
3044 emit_fb_writes();
3045
3046 split_virtual_grfs();
3047
3048 move_uniform_array_access_to_pull_constants();
3049 assign_constant_locations();
3050 demote_pull_constants();
3051
3052 opt_drop_redundant_mov_to_flags();
3053
3054 #define OPT(pass, args...) do { \
3055 pass_num++; \
3056 bool this_progress = pass(args); \
3057 \
3058 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3059 char filename[64]; \
3060 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3061 dispatch_width, shader_prog->Name, iteration, pass_num); \
3062 \
3063 backend_visitor::dump_instructions(filename); \
3064 } \
3065 \
3066 progress = progress || this_progress; \
3067 } while (false)
3068
3069 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3070 char filename[64];
3071 snprintf(filename, 64, "fs%d-%04d-00-start",
3072 dispatch_width, shader_prog->Name);
3073
3074 backend_visitor::dump_instructions(filename);
3075 }
3076
3077 bool progress;
3078 int iteration = 0;
3079 do {
3080 progress = false;
3081 iteration++;
3082 int pass_num = 0;
3083
3084 compact_virtual_grfs();
3085
3086 OPT(remove_duplicate_mrf_writes);
3087
3088 OPT(opt_algebraic);
3089 OPT(opt_cse);
3090 OPT(opt_copy_propagate);
3091 OPT(opt_peephole_predicated_break);
3092 OPT(dead_code_eliminate);
3093 OPT(opt_peephole_sel);
3094 OPT(dead_control_flow_eliminate, this);
3095 OPT(opt_saturate_propagation);
3096 OPT(register_coalesce);
3097 OPT(compute_to_mrf);
3098 } while (progress);
3099
3100 if (lower_load_payload()) {
3101 register_coalesce();
3102 dead_code_eliminate();
3103 }
3104
3105 lower_uniform_pull_constant_loads();
3106
3107 assign_curb_setup();
3108 assign_urb_setup();
3109
3110 static enum instruction_scheduler_mode pre_modes[] = {
3111 SCHEDULE_PRE,
3112 SCHEDULE_PRE_NON_LIFO,
3113 SCHEDULE_PRE_LIFO,
3114 };
3115
3116 /* Try each scheduling heuristic to see if it can successfully register
3117 * allocate without spilling. They should be ordered by decreasing
3118 * performance but increasing likelihood of allocating.
3119 */
3120 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3121 schedule_instructions(pre_modes[i]);
3122
3123 if (0) {
3124 assign_regs_trivial();
3125 allocated_without_spills = true;
3126 } else {
3127 allocated_without_spills = assign_regs(false);
3128 }
3129 if (allocated_without_spills)
3130 break;
3131 }
3132
3133 if (!allocated_without_spills) {
3134 /* We assume that any spilling is worse than just dropping back to
3135 * SIMD8. There's probably actually some intermediate point where
3136 * SIMD16 with a couple of spills is still better.
3137 */
3138 if (dispatch_width == 16) {
3139 fail("Failure to register allocate. Reduce number of "
3140 "live scalar values to avoid this.");
3141 } else {
3142 perf_debug("Fragment shader triggered register spilling. "
3143 "Try reducing the number of live scalar values to "
3144 "improve performance.\n");
3145 }
3146
3147 /* Since we're out of heuristics, just go spill registers until we
3148 * get an allocation.
3149 */
3150 while (!assign_regs(true)) {
3151 if (failed)
3152 break;
3153 }
3154 }
3155 }
3156 assert(force_uncompressed_stack == 0);
3157
3158 /* This must come after all optimization and register allocation, since
3159 * it inserts dead code that happens to have side effects, and it does
3160 * so based on the actual physical registers in use.
3161 */
3162 insert_gen4_send_dependency_workarounds();
3163
3164 if (failed)
3165 return false;
3166
3167 if (!allocated_without_spills)
3168 schedule_instructions(SCHEDULE_POST);
3169
3170 if (last_scratch > 0) {
3171 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3172 }
3173
3174 if (dispatch_width == 8)
3175 prog_data->reg_blocks = brw_register_blocks(grf_used);
3176 else
3177 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3178
3179 /* If any state parameters were appended, then ParameterValues could have
3180 * been realloced, in which case the driver uniform storage set up by
3181 * _mesa_associate_uniform_storage() would point to freed memory. Make
3182 * sure that didn't happen.
3183 */
3184 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3185
3186 return !failed;
3187 }
3188
3189 const unsigned *
3190 brw_wm_fs_emit(struct brw_context *brw,
3191 void *mem_ctx,
3192 const struct brw_wm_prog_key *key,
3193 struct brw_wm_prog_data *prog_data,
3194 struct gl_fragment_program *fp,
3195 struct gl_shader_program *prog,
3196 unsigned *final_assembly_size)
3197 {
3198 bool start_busy = false;
3199 double start_time = 0;
3200
3201 if (unlikely(brw->perf_debug)) {
3202 start_busy = (brw->batch.last_bo &&
3203 drm_intel_bo_busy(brw->batch.last_bo));
3204 start_time = get_time();
3205 }
3206
3207 struct brw_shader *shader = NULL;
3208 if (prog)
3209 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3210
3211 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3212 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3213
3214 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3215 */
3216 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3217 if (!v.run()) {
3218 if (prog) {
3219 prog->LinkStatus = false;
3220 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3221 }
3222
3223 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3224 v.fail_msg);
3225
3226 return NULL;
3227 }
3228
3229 exec_list *simd16_instructions = NULL;
3230 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3231 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3232 if (!v.simd16_unsupported) {
3233 /* Try a SIMD16 compile */
3234 v2.import_uniforms(&v);
3235 if (!v2.run()) {
3236 perf_debug("SIMD16 shader failed to compile, falling back to "
3237 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3238 } else {
3239 simd16_instructions = &v2.instructions;
3240 }
3241 } else {
3242 perf_debug("SIMD16 shader unsupported, falling back to "
3243 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3244 }
3245 }
3246
3247 const unsigned *assembly = NULL;
3248 if (brw->gen >= 8) {
3249 gen8_fs_generator g(brw, mem_ctx, key, prog_data, prog, fp, v.do_dual_src);
3250 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3251 final_assembly_size);
3252 } else {
3253 fs_generator g(brw, mem_ctx, key, prog_data, prog, fp, v.do_dual_src,
3254 v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3255 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3256 final_assembly_size);
3257 }
3258
3259 if (unlikely(brw->perf_debug) && shader) {
3260 if (shader->compiled_once)
3261 brw_wm_debug_recompile(brw, prog, key);
3262 shader->compiled_once = true;
3263
3264 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3265 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3266 (get_time() - start_time) * 1000);
3267 }
3268 }
3269
3270 return assembly;
3271 }
3272
3273 bool
3274 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3275 {
3276 struct brw_context *brw = brw_context(ctx);
3277 struct brw_wm_prog_key key;
3278
3279 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3280 return true;
3281
3282 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3283 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3284 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3285 bool program_uses_dfdy = fp->UsesDFdy;
3286
3287 memset(&key, 0, sizeof(key));
3288
3289 if (brw->gen < 6) {
3290 if (fp->UsesKill)
3291 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3292
3293 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3294 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3295
3296 /* Just assume depth testing. */
3297 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3298 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3299 }
3300
3301 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3302 BRW_FS_VARYING_INPUT_MASK) > 16)
3303 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3304
3305 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3306 for (unsigned i = 0; i < sampler_count; i++) {
3307 if (fp->Base.ShadowSamplers & (1 << i)) {
3308 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3309 key.tex.swizzles[i] =
3310 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3311 } else {
3312 /* Color sampler: assume no swizzling. */
3313 key.tex.swizzles[i] = SWIZZLE_XYZW;
3314 }
3315 }
3316
3317 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3318 key.drawable_height = ctx->DrawBuffer->Height;
3319 }
3320
3321 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3322 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3323 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3324
3325 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3326 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3327 key.nr_color_regions > 1;
3328 }
3329
3330 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3331 * quality of the derivatives is likely to be determined by the driconf
3332 * option.
3333 */
3334 key.high_quality_derivatives = brw->disable_derivative_optimization;
3335
3336 key.program_string_id = bfp->id;
3337
3338 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3339 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3340
3341 bool success = do_wm_prog(brw, prog, bfp, &key);
3342
3343 brw->wm.base.prog_offset = old_prog_offset;
3344 brw->wm.prog_data = old_prog_data;
3345
3346 return success;
3347 }