i965: Use unreachable() instead of unconditional assert().
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init(enum opcode opcode, const fs_reg &dst, fs_reg *src, int sources)
56 {
57 memset(this, 0, sizeof(*this));
58
59 this->opcode = opcode;
60 this->dst = dst;
61 this->src = src;
62 this->sources = sources;
63
64 this->conditional_mod = BRW_CONDITIONAL_NONE;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68
69 this->writes_accumulator = false;
70 }
71
72 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
73 {
74 fs_reg *src = ralloc_array(this, fs_reg, 3);
75 init(opcode, dst, src, 0);
76 }
77
78 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
79 {
80 fs_reg *src = ralloc_array(this, fs_reg, 3);
81 src[0] = src0;
82 init(opcode, dst, src, 1);
83 }
84
85 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
86 const fs_reg &src1)
87 {
88 fs_reg *src = ralloc_array(this, fs_reg, 3);
89 src[0] = src0;
90 src[1] = src1;
91 init(opcode, dst, src, 2);
92 }
93
94 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
95 const fs_reg &src1, const fs_reg &src2)
96 {
97 fs_reg *src = ralloc_array(this, fs_reg, 3);
98 src[0] = src0;
99 src[1] = src1;
100 src[2] = src2;
101 init(opcode, dst, src, 3);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
105 {
106 init(opcode, dst, src, sources);
107 }
108
109 fs_inst::fs_inst(const fs_inst &that)
110 {
111 memcpy(this, &that, sizeof(that));
112
113 this->src = ralloc_array(this, fs_reg, that.sources);
114
115 for (int i = 0; i < that.sources; i++)
116 this->src[i] = that.src[i];
117 }
118
119 void
120 fs_inst::resize_sources(uint8_t num_sources)
121 {
122 if (this->sources != num_sources) {
123 this->src = reralloc(this, this->src, fs_reg, num_sources);
124 this->sources = num_sources;
125 }
126 }
127
128 #define ALU1(op) \
129 fs_inst * \
130 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
131 { \
132 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
133 }
134
135 #define ALU2(op) \
136 fs_inst * \
137 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
138 const fs_reg &src1) \
139 { \
140 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
141 }
142
143 #define ALU2_ACC(op) \
144 fs_inst * \
145 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
146 const fs_reg &src1) \
147 { \
148 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
149 inst->writes_accumulator = true; \
150 return inst; \
151 }
152
153 #define ALU3(op) \
154 fs_inst * \
155 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
156 const fs_reg &src1, const fs_reg &src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2_ACC(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2_ACC(ADDC)
186 ALU2_ACC(SUBB)
187 ALU2(SEL)
188 ALU2(MAC)
189
190 /** Gen4 predicated IF. */
191 fs_inst *
192 fs_visitor::IF(uint32_t predicate)
193 {
194 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
195 inst->predicate = predicate;
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 fs_inst *
201 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1, uint32_t condition)
202 {
203 assert(brw->gen == 6);
204 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
205 reg_null_d, src0, src1);
206 inst->conditional_mod = condition;
207 return inst;
208 }
209
210 /**
211 * CMP: Sets the low bit of the destination channels with the result
212 * of the comparison, while the upper bits are undefined, and updates
213 * the flag register with the packed 16 bits of the result.
214 */
215 fs_inst *
216 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
217 {
218 fs_inst *inst;
219
220 /* Take the instruction:
221 *
222 * CMP null<d> src0<f> src1<f>
223 *
224 * Original gen4 does type conversion to the destination type before
225 * comparison, producing garbage results for floating point comparisons.
226 * gen5 does the comparison on the execution type (resolved source types),
227 * so dst type doesn't matter. gen6 does comparison and then uses the
228 * result as if it was the dst type with no conversion, which happens to
229 * mostly work out for float-interpreted-as-int since our comparisons are
230 * for >0, =0, <0.
231 */
232 if (brw->gen == 4) {
233 dst.type = src0.type;
234 if (dst.file == HW_REG)
235 dst.fixed_hw_reg.type = dst.type;
236 }
237
238 resolve_ud_negate(&src0);
239 resolve_ud_negate(&src1);
240
241 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
242 inst->conditional_mod = condition;
243
244 return inst;
245 }
246
247 fs_inst *
248 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
249 {
250 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst, src,
251 sources);
252 inst->regs_written = sources;
253
254 return inst;
255 }
256
257 exec_list
258 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
259 const fs_reg &surf_index,
260 const fs_reg &varying_offset,
261 uint32_t const_offset)
262 {
263 exec_list instructions;
264 fs_inst *inst;
265
266 /* We have our constant surface use a pitch of 4 bytes, so our index can
267 * be any component of a vector, and then we load 4 contiguous
268 * components starting from that.
269 *
270 * We break down the const_offset to a portion added to the variable
271 * offset and a portion done using reg_offset, which means that if you
272 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
273 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
274 * CSE can later notice that those loads are all the same and eliminate
275 * the redundant ones.
276 */
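   /* A worked illustration (editorial, derived from the arithmetic below,
    * not part of the upstream comment): with const_offset == 6, the aligned
    * part (const_offset & ~3 == 4) is added to varying_offset and a whole
    * vec4 is pulled from there, while the component part
    * (const_offset & 3 == 2) later advances vec4_result.reg_offset so the
    * final MOV copies the third loaded component into dst.
    */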
277 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
278 instructions.push_tail(ADD(vec4_offset,
279 varying_offset, const_offset & ~3));
280
281 int scale = 1;
282 if (brw->gen == 4 && dispatch_width == 8) {
283 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
284 * u, v, r) as parameters, or we can just use the SIMD16 message
285 * consisting of (header, u). We choose the second, at the cost of a
286 * longer return length.
287 */
288 scale = 2;
289 }
290
291 enum opcode op;
292 if (brw->gen >= 7)
293 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
294 else
295 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
296 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
297 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
298 inst->regs_written = 4 * scale;
299 instructions.push_tail(inst);
300
301 if (brw->gen < 7) {
302 inst->base_mrf = 13;
303 inst->header_present = true;
304 if (brw->gen == 4)
305 inst->mlen = 3;
306 else
307 inst->mlen = 1 + dispatch_width / 8;
308 }
309
310 vec4_result.reg_offset += (const_offset & 3) * scale;
311 instructions.push_tail(MOV(dst, vec4_result));
312
313 return instructions;
314 }
315
316 /**
317 * A helper for MOV generation for fixing up broken hardware SEND dependency
318 * handling.
319 */
320 fs_inst *
321 fs_visitor::DEP_RESOLVE_MOV(int grf)
322 {
323 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
324
325 inst->ir = NULL;
326 inst->annotation = "send dependency resolve";
327
328 /* The caller always wants uncompressed to emit the minimal extra
329 * dependencies, and to avoid having to deal with aligning its regs to 2.
330 */
331 inst->force_uncompressed = true;
332
333 return inst;
334 }
335
336 bool
337 fs_inst::equals(fs_inst *inst) const
338 {
339 return (opcode == inst->opcode &&
340 dst.equals(inst->dst) &&
341 src[0].equals(inst->src[0]) &&
342 src[1].equals(inst->src[1]) &&
343 src[2].equals(inst->src[2]) &&
344 saturate == inst->saturate &&
345 predicate == inst->predicate &&
346 conditional_mod == inst->conditional_mod &&
347 mlen == inst->mlen &&
348 base_mrf == inst->base_mrf &&
349 sampler == inst->sampler &&
350 target == inst->target &&
351 eot == inst->eot &&
352 header_present == inst->header_present &&
353 shadow_compare == inst->shadow_compare &&
354 offset == inst->offset);
355 }
356
357 bool
358 fs_inst::overwrites_reg(const fs_reg &reg) const
359 {
360 return (reg.file == dst.file &&
361 reg.reg == dst.reg &&
362 reg.reg_offset >= dst.reg_offset &&
363 reg.reg_offset < dst.reg_offset + regs_written);
364 }
365
366 bool
367 fs_inst::is_send_from_grf() const
368 {
369 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
370 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
371 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
372 src[1].file == GRF) ||
373 (is_tex() && src[0].file == GRF));
374 }
375
376 bool
377 fs_inst::can_do_source_mods(struct brw_context *brw)
378 {
379 if (brw->gen == 6 && is_math())
380 return false;
381
382 if (is_send_from_grf())
383 return false;
384
385 if (!backend_instruction::can_do_source_mods())
386 return false;
387
388 return true;
389 }
390
391 void
392 fs_reg::init()
393 {
394 memset(this, 0, sizeof(*this));
395 stride = 1;
396 }
397
398 /** Generic unset register constructor. */
399 fs_reg::fs_reg()
400 {
401 init();
402 this->file = BAD_FILE;
403 }
404
405 /** Immediate value constructor. */
406 fs_reg::fs_reg(float f)
407 {
408 init();
409 this->file = IMM;
410 this->type = BRW_REGISTER_TYPE_F;
411 this->imm.f = f;
412 }
413
414 /** Immediate value constructor. */
415 fs_reg::fs_reg(int32_t i)
416 {
417 init();
418 this->file = IMM;
419 this->type = BRW_REGISTER_TYPE_D;
420 this->imm.i = i;
421 }
422
423 /** Immediate value constructor. */
424 fs_reg::fs_reg(uint32_t u)
425 {
426 init();
427 this->file = IMM;
428 this->type = BRW_REGISTER_TYPE_UD;
429 this->imm.u = u;
430 }
431
432 /** Fixed brw_reg. */
433 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
434 {
435 init();
436 this->file = HW_REG;
437 this->fixed_hw_reg = fixed_hw_reg;
438 this->type = fixed_hw_reg.type;
439 }
440
441 bool
442 fs_reg::equals(const fs_reg &r) const
443 {
444 return (file == r.file &&
445 reg == r.reg &&
446 reg_offset == r.reg_offset &&
447 subreg_offset == r.subreg_offset &&
448 type == r.type &&
449 negate == r.negate &&
450 abs == r.abs &&
451 !reladdr && !r.reladdr &&
452 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
453 sizeof(fixed_hw_reg)) == 0 &&
454 stride == r.stride &&
455 imm.u == r.imm.u);
456 }
457
458 fs_reg &
459 fs_reg::apply_stride(unsigned stride)
460 {
461 assert((this->stride * stride) <= 4 &&
462 (is_power_of_two(stride) || stride == 0) &&
463 file != HW_REG && file != IMM);
464 this->stride *= stride;
465 return *this;
466 }
467
468 fs_reg &
469 fs_reg::set_smear(unsigned subreg)
470 {
471 assert(file != HW_REG && file != IMM);
472 subreg_offset = subreg * type_sz(type);
473 stride = 0;
474 return *this;
475 }
476
477 bool
478 fs_reg::is_contiguous() const
479 {
480 return stride == 1;
481 }
482
483 bool
484 fs_reg::is_zero() const
485 {
486 if (file != IMM)
487 return false;
488
489 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
490 }
491
492 bool
493 fs_reg::is_one() const
494 {
495 if (file != IMM)
496 return false;
497
498 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
499 }
500
501 bool
502 fs_reg::is_null() const
503 {
504 return file == HW_REG &&
505 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
506 fixed_hw_reg.nr == BRW_ARF_NULL;
507 }
508
509 bool
510 fs_reg::is_valid_3src() const
511 {
512 return file == GRF || file == UNIFORM;
513 }
514
515 bool
516 fs_reg::is_accumulator() const
517 {
518 return file == HW_REG &&
519 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
520 fixed_hw_reg.nr == BRW_ARF_ACCUMULATOR;
521 }
522
523 int
524 fs_visitor::type_size(const struct glsl_type *type)
525 {
526 unsigned int size, i;
527
528 switch (type->base_type) {
529 case GLSL_TYPE_UINT:
530 case GLSL_TYPE_INT:
531 case GLSL_TYPE_FLOAT:
532 case GLSL_TYPE_BOOL:
533 return type->components();
534 case GLSL_TYPE_ARRAY:
535 return type_size(type->fields.array) * type->length;
536 case GLSL_TYPE_STRUCT:
537 size = 0;
538 for (i = 0; i < type->length; i++) {
539 size += type_size(type->fields.structure[i].type);
540 }
541 return size;
542 case GLSL_TYPE_SAMPLER:
543 /* Samplers take up no register space, since they're baked in at
544 * link time.
545 */
546 return 0;
547 case GLSL_TYPE_ATOMIC_UINT:
548 return 0;
549 case GLSL_TYPE_IMAGE:
550 case GLSL_TYPE_VOID:
551 case GLSL_TYPE_ERROR:
552 case GLSL_TYPE_INTERFACE:
553 unreachable("not reached");
554 }
555
556 return 0;
557 }
558
559 fs_reg
560 fs_visitor::get_timestamp()
561 {
562 assert(brw->gen >= 7);
563
564 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
565 BRW_ARF_TIMESTAMP,
566 0),
567 BRW_REGISTER_TYPE_UD));
568
569 fs_reg dst = fs_reg(this, glsl_type::uint_type);
570
571 fs_inst *mov = emit(MOV(dst, ts));
572 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
573 * even if it's not enabled in the dispatch.
574 */
575 mov->force_writemask_all = true;
576 mov->force_uncompressed = true;
577
578 /* The caller wants the low 32 bits of the timestamp. Since it's running
579 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
580 * which is plenty of time for our purposes. It is identical across the
581 * EUs, but since it's tracking GPU core speed it will increment at a
582 * varying rate as render P-states change.
583 *
584 * The caller could also check if render P-states have changed (or anything
585 * else that might disrupt timing) by setting smear to 2 and checking if
586 * that field is != 0.
587 */
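   /* For reference (approximate, derived from the figures above): a 32-bit
    * counter ticking at ~1.2 GHz wraps after 2^32 / 1.2e9 ~= 3.6 seconds,
    * which is where the "every ~3 seconds" estimate comes from.
    */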
588 dst.set_smear(0);
589
590 return dst;
591 }
592
593 void
594 fs_visitor::emit_shader_time_begin()
595 {
596 current_annotation = "shader time start";
597 shader_start_time = get_timestamp();
598 }
599
600 void
601 fs_visitor::emit_shader_time_end()
602 {
603 current_annotation = "shader time end";
604
605 enum shader_time_shader_type type, written_type, reset_type;
606 if (dispatch_width == 8) {
607 type = ST_FS8;
608 written_type = ST_FS8_WRITTEN;
609 reset_type = ST_FS8_RESET;
610 } else {
611 assert(dispatch_width == 16);
612 type = ST_FS16;
613 written_type = ST_FS16_WRITTEN;
614 reset_type = ST_FS16_RESET;
615 }
616
617 fs_reg shader_end_time = get_timestamp();
618
619 /* Check that there weren't any timestamp reset events (assuming these
620 * were the only two timestamp reads that happened).
621 */
622 fs_reg reset = shader_end_time;
623 reset.set_smear(2);
624 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
625 test->conditional_mod = BRW_CONDITIONAL_Z;
626 emit(IF(BRW_PREDICATE_NORMAL));
627
628 push_force_uncompressed();
629 fs_reg start = shader_start_time;
630 start.negate = true;
631 fs_reg diff = fs_reg(this, glsl_type::uint_type);
632 emit(ADD(diff, start, shader_end_time));
633
634 /* If there were no instructions between the two timestamp gets, the diff
635 * is 2 cycles. Remove that overhead, so I can forget about that when
636 * trying to determine the time taken for single instructions.
637 */
638 emit(ADD(diff, diff, fs_reg(-2u)));
639
640 emit_shader_time_write(type, diff);
641 emit_shader_time_write(written_type, fs_reg(1u));
642 emit(BRW_OPCODE_ELSE);
643 emit_shader_time_write(reset_type, fs_reg(1u));
644 emit(BRW_OPCODE_ENDIF);
645
646 pop_force_uncompressed();
647 }
648
649 void
650 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
651 fs_reg value)
652 {
653 int shader_time_index =
654 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
655 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
656
657 fs_reg payload;
658 if (dispatch_width == 8)
659 payload = fs_reg(this, glsl_type::uvec2_type);
660 else
661 payload = fs_reg(this, glsl_type::uint_type);
662
663 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
664 fs_reg(), payload, offset, value));
665 }
666
667 void
668 fs_visitor::vfail(const char *format, va_list va)
669 {
670 char *msg;
671
672 if (failed)
673 return;
674
675 failed = true;
676
677 msg = ralloc_vasprintf(mem_ctx, format, va);
678 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
679
680 this->fail_msg = msg;
681
682 if (INTEL_DEBUG & DEBUG_WM) {
683 fprintf(stderr, "%s", msg);
684 }
685 }
686
687 void
688 fs_visitor::fail(const char *format, ...)
689 {
690 va_list va;
691
692 va_start(va, format);
693 vfail(format, va);
694 va_end(va);
695 }
696
697 /**
698 * Mark this program as impossible to compile in SIMD16 mode.
699 *
700 * During the SIMD8 compile (which happens first), we can detect and flag
701 * things that are unsupported in SIMD16 mode, so the compiler can skip
702 * the SIMD16 compile altogether.
703 *
704 * During a SIMD16 compile (if one happens anyway), this just calls fail().
705 */
706 void
707 fs_visitor::no16(const char *format, ...)
708 {
709 va_list va;
710
711 va_start(va, format);
712
713 if (dispatch_width == 16) {
714 vfail(format, va);
715 } else {
716 simd16_unsupported = true;
717
718 if (brw->perf_debug) {
719 if (no16_msg)
720 ralloc_vasprintf_append(&no16_msg, format, va);
721 else
722 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
723 }
724 }
725
726 va_end(va);
727 }
728
729 fs_inst *
730 fs_visitor::emit(enum opcode opcode)
731 {
732 return emit(new(mem_ctx) fs_inst(opcode));
733 }
734
735 fs_inst *
736 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
737 {
738 return emit(new(mem_ctx) fs_inst(opcode, dst));
739 }
740
741 fs_inst *
742 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
743 {
744 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
745 }
746
747 fs_inst *
748 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
749 const fs_reg &src1)
750 {
751 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
752 }
753
754 fs_inst *
755 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
756 const fs_reg &src1, const fs_reg &src2)
757 {
758 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
759 }
760
761 fs_inst *
762 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
763 fs_reg src[], int sources)
764 {
765 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
766 }
767
768 void
769 fs_visitor::push_force_uncompressed()
770 {
771 force_uncompressed_stack++;
772 }
773
774 void
775 fs_visitor::pop_force_uncompressed()
776 {
777 force_uncompressed_stack--;
778 assert(force_uncompressed_stack >= 0);
779 }
780
781 /**
782 * Returns true if the instruction has a flag that means it won't
783 * update an entire destination register.
784 *
785 * For example, dead code elimination and live variable analysis want to know
786 * when a write to a variable screens off any preceding values that were in
787 * it.
788 */
789 bool
790 fs_inst::is_partial_write() const
791 {
792 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
793 this->force_uncompressed ||
794 this->force_sechalf || !this->dst.is_contiguous());
795 }
796
797 int
798 fs_inst::regs_read(fs_visitor *v, int arg) const
799 {
800 if (is_tex() && arg == 0 && src[0].file == GRF) {
801 if (v->dispatch_width == 16)
802 return (mlen + 1) / 2;
803 else
804 return mlen;
805 }
806 return 1;
807 }
808
809 bool
810 fs_inst::reads_flag() const
811 {
812 return predicate;
813 }
814
815 bool
816 fs_inst::writes_flag() const
817 {
818 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
819 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
820 }
821
822 /**
823 * Returns how many MRFs an FS opcode will write over.
824 *
825 * Note that this is not the 0 or 1 implied writes in an actual gen
826 * instruction -- the FS opcodes often generate MOVs in addition.
827 */
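/* For example (illustrative, values read off the switch below): a
 * SHADER_OPCODE_POW in SIMD16 dispatch implies 2 * 16 / 8 = 4 MRF writes,
 * while the texturing opcodes report a single MRF.
 */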
828 int
829 fs_visitor::implied_mrf_writes(fs_inst *inst)
830 {
831 if (inst->mlen == 0)
832 return 0;
833
834 if (inst->base_mrf == -1)
835 return 0;
836
837 switch (inst->opcode) {
838 case SHADER_OPCODE_RCP:
839 case SHADER_OPCODE_RSQ:
840 case SHADER_OPCODE_SQRT:
841 case SHADER_OPCODE_EXP2:
842 case SHADER_OPCODE_LOG2:
843 case SHADER_OPCODE_SIN:
844 case SHADER_OPCODE_COS:
845 return 1 * dispatch_width / 8;
846 case SHADER_OPCODE_POW:
847 case SHADER_OPCODE_INT_QUOTIENT:
848 case SHADER_OPCODE_INT_REMAINDER:
849 return 2 * dispatch_width / 8;
850 case SHADER_OPCODE_TEX:
851 case FS_OPCODE_TXB:
852 case SHADER_OPCODE_TXD:
853 case SHADER_OPCODE_TXF:
854 case SHADER_OPCODE_TXF_CMS:
855 case SHADER_OPCODE_TXF_MCS:
856 case SHADER_OPCODE_TG4:
857 case SHADER_OPCODE_TG4_OFFSET:
858 case SHADER_OPCODE_TXL:
859 case SHADER_OPCODE_TXS:
860 case SHADER_OPCODE_LOD:
861 return 1;
862 case FS_OPCODE_FB_WRITE:
863 return 2;
864 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
865 case SHADER_OPCODE_GEN4_SCRATCH_READ:
866 return 1;
867 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
868 return inst->mlen;
869 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
870 return 2;
871 case SHADER_OPCODE_UNTYPED_ATOMIC:
872 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
873 return 0;
874 default:
875 unreachable("not reached");
876 }
877 }
878
879 int
880 fs_visitor::virtual_grf_alloc(int size)
881 {
882 if (virtual_grf_array_size <= virtual_grf_count) {
883 if (virtual_grf_array_size == 0)
884 virtual_grf_array_size = 16;
885 else
886 virtual_grf_array_size *= 2;
887 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
888 virtual_grf_array_size);
889 }
890 virtual_grf_sizes[virtual_grf_count] = size;
891 return virtual_grf_count++;
892 }
893
894 /** Fixed HW reg constructor. */
895 fs_reg::fs_reg(enum register_file file, int reg)
896 {
897 init();
898 this->file = file;
899 this->reg = reg;
900 this->type = BRW_REGISTER_TYPE_F;
901 }
902
903 /** Fixed HW reg constructor. */
904 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
905 {
906 init();
907 this->file = file;
908 this->reg = reg;
909 this->type = type;
910 }
911
912 /** Automatic reg constructor. */
913 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
914 {
915 init();
916
917 this->file = GRF;
918 this->reg = v->virtual_grf_alloc(v->type_size(type));
919 this->reg_offset = 0;
920 this->type = brw_type_for_base_type(type);
921 }
922
923 fs_reg *
924 fs_visitor::variable_storage(ir_variable *var)
925 {
926 return (fs_reg *)hash_table_find(this->variable_ht, var);
927 }
928
929 void
930 import_uniforms_callback(const void *key,
931 void *data,
932 void *closure)
933 {
934 struct hash_table *dst_ht = (struct hash_table *)closure;
935 const fs_reg *reg = (const fs_reg *)data;
936
937 if (reg->file != UNIFORM)
938 return;
939
940 hash_table_insert(dst_ht, data, key);
941 }
942
943 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
944 * This brings in those uniform definitions.
945 */
946 void
947 fs_visitor::import_uniforms(fs_visitor *v)
948 {
949 hash_table_call_foreach(v->variable_ht,
950 import_uniforms_callback,
951 variable_ht);
952 this->push_constant_loc = v->push_constant_loc;
953 this->pull_constant_loc = v->pull_constant_loc;
954 this->uniforms = v->uniforms;
955 this->param_size = v->param_size;
956 }
957
958 /* Our support for uniforms is piggy-backed on the struct
959 * gl_fragment_program, because that's where the values actually
960 * get stored, rather than in some global gl_shader_program uniform
961 * store.
962 */
963 void
964 fs_visitor::setup_uniform_values(ir_variable *ir)
965 {
966 int namelen = strlen(ir->name);
967
968 /* The data for our (non-builtin) uniforms is stored in a series of
969 * gl_uniform_driver_storage structs for each subcomponent that
970 * glGetUniformLocation() could name. We know it's been set up in the same
971 * order we'd walk the type, so walk the list of storage and find anything
972 * with our name, or the prefix of a component that starts with our name.
973 */
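   /* For example (illustrative only): with ir->name == "light", storage
    * entries named "light", "light.position" or "light[3]" pass the check
    * below, while "lighting" is rejected because the character following
    * the prefix is none of '\0', '.' or '['.
    */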
974 unsigned params_before = uniforms;
975 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
976 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
977
978 if (strncmp(ir->name, storage->name, namelen) != 0 ||
979 (storage->name[namelen] != 0 &&
980 storage->name[namelen] != '.' &&
981 storage->name[namelen] != '[')) {
982 continue;
983 }
984
985 unsigned slots = storage->type->component_slots();
986 if (storage->array_elements)
987 slots *= storage->array_elements;
988
989 for (unsigned i = 0; i < slots; i++) {
990 stage_prog_data->param[uniforms++] = &storage->storage[i].f;
991 }
992 }
993
994 /* Make sure we actually initialized the right amount of stuff here. */
995 assert(params_before + ir->type->component_slots() == uniforms);
996 (void)params_before;
997 }
998
999
1000 /* Our support for builtin uniforms is even scarier than non-builtin.
1001 * It sits on top of the PROG_STATE_VAR parameters that are
1002 * automatically updated from GL context state.
1003 */
1004 void
1005 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1006 {
1007 const ir_state_slot *const slots = ir->state_slots;
1008 assert(ir->state_slots != NULL);
1009
1010 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
1011 /* This state reference has already been setup by ir_to_mesa, but we'll
1012 * get the same index back here.
1013 */
1014 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
1015 (gl_state_index *)slots[i].tokens);
1016
1017 /* Add each of the unique swizzles of the element as a parameter.
1018 * This'll end up matching the expected layout of the
1019 * array/matrix/structure we're trying to fill in.
1020 */
1021 int last_swiz = -1;
1022 for (unsigned int j = 0; j < 4; j++) {
1023 int swiz = GET_SWZ(slots[i].swizzle, j);
1024 if (swiz == last_swiz)
1025 break;
1026 last_swiz = swiz;
1027
1028 stage_prog_data->param[uniforms++] =
1029 &fp->Base.Parameters->ParameterValues[index][swiz].f;
1030 }
1031 }
1032 }
1033
1034 fs_reg *
1035 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1036 {
1037 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1038 fs_reg wpos = *reg;
1039 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1040
1041 /* gl_FragCoord.x */
1042 if (ir->data.pixel_center_integer) {
1043 emit(MOV(wpos, this->pixel_x));
1044 } else {
1045 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1046 }
1047 wpos.reg_offset++;
1048
1049 /* gl_FragCoord.y */
1050 if (!flip && ir->data.pixel_center_integer) {
1051 emit(MOV(wpos, this->pixel_y));
1052 } else {
1053 fs_reg pixel_y = this->pixel_y;
1054 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1055
1056 if (flip) {
1057 pixel_y.negate = true;
1058 offset += key->drawable_height - 1.0;
1059 }
1060
1061 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1062 }
1063 wpos.reg_offset++;
1064
1065 /* gl_FragCoord.z */
1066 if (brw->gen >= 6) {
1067 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1068 } else {
1069 emit(FS_OPCODE_LINTERP, wpos,
1070 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1071 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1072 interp_reg(VARYING_SLOT_POS, 2));
1073 }
1074 wpos.reg_offset++;
1075
1076 /* gl_FragCoord.w: Already set up in emit_interpolation */
1077 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1078
1079 return reg;
1080 }
1081
1082 fs_inst *
1083 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1084 glsl_interp_qualifier interpolation_mode,
1085 bool is_centroid, bool is_sample)
1086 {
1087 brw_wm_barycentric_interp_mode barycoord_mode;
1088 if (brw->gen >= 6) {
1089 if (is_centroid) {
1090 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1091 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1092 else
1093 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1094 } else if (is_sample) {
1095 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1096 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1097 else
1098 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1099 } else {
1100 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1101 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1102 else
1103 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1104 }
1105 } else {
1106 /* On Ironlake and below, there is only one interpolation mode.
1107 * Centroid interpolation doesn't mean anything on this hardware --
1108 * there is no multisampling.
1109 */
1110 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1111 }
1112 return emit(FS_OPCODE_LINTERP, attr,
1113 this->delta_x[barycoord_mode],
1114 this->delta_y[barycoord_mode], interp);
1115 }
1116
1117 fs_reg *
1118 fs_visitor::emit_general_interpolation(ir_variable *ir)
1119 {
1120 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1121 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1122 fs_reg attr = *reg;
1123
1124 unsigned int array_elements;
1125 const glsl_type *type;
1126
1127 if (ir->type->is_array()) {
1128 array_elements = ir->type->length;
1129 if (array_elements == 0) {
1130 fail("dereferenced array '%s' has length 0\n", ir->name);
1131 }
1132 type = ir->type->fields.array;
1133 } else {
1134 array_elements = 1;
1135 type = ir->type;
1136 }
1137
1138 glsl_interp_qualifier interpolation_mode =
1139 ir->determine_interpolation_mode(key->flat_shade);
1140
1141 int location = ir->data.location;
1142 for (unsigned int i = 0; i < array_elements; i++) {
1143 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1144 if (prog_data->urb_setup[location] == -1) {
1145 /* If there's no incoming setup data for this slot, don't
1146 * emit interpolation for it.
1147 */
1148 attr.reg_offset += type->vector_elements;
1149 location++;
1150 continue;
1151 }
1152
1153 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1154 /* Constant interpolation (flat shading) case. The SF has
1155 * handed us defined values in only the constant offset
1156 * field of the setup reg.
1157 */
1158 for (unsigned int k = 0; k < type->vector_elements; k++) {
1159 struct brw_reg interp = interp_reg(location, k);
1160 interp = suboffset(interp, 3);
1161 interp.type = reg->type;
1162 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1163 attr.reg_offset++;
1164 }
1165 } else {
1166 /* Smooth/noperspective interpolation case. */
1167 for (unsigned int k = 0; k < type->vector_elements; k++) {
1168 struct brw_reg interp = interp_reg(location, k);
1169 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1170 /* Get the pixel/sample mask into f0 so that we know
1171 * which pixels are lit. Then, for each channel that is
1172 * unlit, replace the centroid data with non-centroid
1173 * data.
1174 */
1175 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1176
1177 fs_inst *inst;
1178 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1179 false, false);
1180 inst->predicate = BRW_PREDICATE_NORMAL;
1181 inst->predicate_inverse = true;
1182 if (brw->has_pln)
1183 inst->no_dd_clear = true;
1184
1185 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1186 ir->data.centroid && !key->persample_shading,
1187 ir->data.sample || key->persample_shading);
1188 inst->predicate = BRW_PREDICATE_NORMAL;
1189 inst->predicate_inverse = false;
1190 if (brw->has_pln)
1191 inst->no_dd_check = true;
1192
1193 } else {
1194 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1195 ir->data.centroid && !key->persample_shading,
1196 ir->data.sample || key->persample_shading);
1197 }
1198 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1199 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1200 }
1201 attr.reg_offset++;
1202 }
1203
1204 }
1205 location++;
1206 }
1207 }
1208
1209 return reg;
1210 }
1211
1212 fs_reg *
1213 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1214 {
1215 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1216
1217 /* The frontfacing comes in as a bit in the thread payload. */
1218 if (brw->gen >= 6) {
1219 emit(BRW_OPCODE_ASR, *reg,
1220 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1221 fs_reg(15));
1222 emit(BRW_OPCODE_NOT, *reg, *reg);
1223 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1224 } else {
1225 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1226 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1227 * us the front face.
1228 */
1229 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1230 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1231 }
1232
1233 return reg;
1234 }
1235
1236 void
1237 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1238 {
1239 assert(dst.type == BRW_REGISTER_TYPE_F);
1240
1241 if (key->compute_pos_offset) {
1242 /* Convert int_sample_pos to floating point */
1243 emit(MOV(dst, int_sample_pos));
1244 /* Scale to the range [0, 1] */
1245 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1246 }
1247 else {
1248 /* From ARB_sample_shading specification:
1249 * "When rendering to a non-multisample buffer, or if multisample
1250 * rasterization is disabled, gl_SamplePosition will always be
1251 * (0.5, 0.5).
1252 * (0.5, 0.5)."
1253 emit(MOV(dst, fs_reg(0.5f)));
1254 }
1255 }
1256
1257 fs_reg *
1258 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1259 {
1260 assert(brw->gen >= 6);
1261 assert(ir->type == glsl_type::vec2_type);
1262
1263 this->current_annotation = "compute sample position";
1264 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1265 fs_reg pos = *reg;
1266 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1267 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1268
1269 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1270 * mode will be enabled.
1271 *
1272 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1273 * R31.1:0 Position Offset X/Y for Slot[3:0]
1274 * R31.3:2 Position Offset X/Y for Slot[7:4]
1275 * .....
1276 *
1277 * The X, Y sample positions come in as bytes in thread payload. So, read
1278 * the positions using vstride=16, width=8, hstride=2.
1279 */
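   /* How that region reads (editorial note): with a byte-typed register and
    * vstride=16, width=8, hstride=2, channel i fetches byte 2*i, i.e. the X
    * offsets; the suboffset(..., 1) used for int_sample_y below picks the
    * interleaved Y bytes instead.
    */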
1280 struct brw_reg sample_pos_reg =
1281 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1282 BRW_REGISTER_TYPE_B), 16, 8, 2);
1283
1284 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1285 if (dispatch_width == 16) {
1286 fs_inst *inst = emit(MOV(half(int_sample_x, 1),
1287 fs_reg(suboffset(sample_pos_reg, 16))));
1288 inst->force_sechalf = true;
1289 }
1290 /* Compute gl_SamplePosition.x */
1291 compute_sample_position(pos, int_sample_x);
1292 pos.reg_offset++;
1293 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1294 if (dispatch_width == 16) {
1295 fs_inst *inst = emit(MOV(half(int_sample_y, 1),
1296 fs_reg(suboffset(sample_pos_reg, 17))));
1297 inst->force_sechalf = true;
1298 }
1299 /* Compute gl_SamplePosition.y */
1300 compute_sample_position(pos, int_sample_y);
1301 return reg;
1302 }
1303
1304 fs_reg *
1305 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1306 {
1307 assert(brw->gen >= 6);
1308
1309 this->current_annotation = "compute sample id";
1310 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1311
1312 if (key->compute_sample_id) {
1313 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1314 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1315 t2.type = BRW_REGISTER_TYPE_UW;
1316
1317 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1318 * 8x multisampling, subspan 0 will represent sample N (where N
1319 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1320 * 7. We can find the value of N by looking at R0.0 bits 7:6
1321 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1322 * (since samples are always delivered in pairs). That is, we
1323 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1324 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1325 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1326 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1327 * populating a temporary variable with the sequence (0, 1, 2, 3),
1328 * and then reading from it using vstride=1, width=4, hstride=0.
1329 * These computations hold for 4x multisampling as well.
1330 */
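      /* A worked example of the above (illustrative only): if R0.0 bits 7:6
       * read 0b10, then (R0.0 & 0xc0) >> 5 == 4, so subspan 0 represents
       * sample 4 and subspan 1 sample 5; adding the (0, 0, 0, 0, 1, 1, 1, 1)
       * sequence to 4 yields per-channel sample IDs 4, 4, 4, 4, 5, 5, 5, 5
       * in SIMD8.
       */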
1331 emit(BRW_OPCODE_AND, t1,
1332 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1333 fs_reg(0xc0));
1334 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1335 /* This works for both SIMD8 and SIMD16 */
1336 emit(MOV(t2, brw_imm_v(0x3210)));
1337 /* This special instruction takes care of setting vstride=1,
1338 * width=4, hstride=0 of t2 during an ADD instruction.
1339 */
1340 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1341 } else {
1342 /* As per GL_ARB_sample_shading specification:
1343 * "When rendering to a non-multisample buffer, or if multisample
1344 * rasterization is disabled, gl_SampleID will always be zero."
1345 */
1346 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1347 }
1348
1349 return reg;
1350 }
1351
1352 fs_reg
1353 fs_visitor::fix_math_operand(fs_reg src)
1354 {
1355 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1356 * might be able to do better by doing execsize = 1 math and then
1357 * expanding that result out, but we would need to be careful with
1358 * masking.
1359 *
1360 * The hardware ignores source modifiers (negate and abs) on math
1361 * instructions, so we also move to a temp to set those up.
1362 */
1363 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1364 !src.abs && !src.negate)
1365 return src;
1366
1367 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1368 * operands to math
1369 */
1370 if (brw->gen >= 7 && src.file != IMM)
1371 return src;
1372
1373 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1374 expanded.type = src.type;
1375 emit(BRW_OPCODE_MOV, expanded, src);
1376 return expanded;
1377 }
1378
1379 fs_inst *
1380 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1381 {
1382 switch (opcode) {
1383 case SHADER_OPCODE_RCP:
1384 case SHADER_OPCODE_RSQ:
1385 case SHADER_OPCODE_SQRT:
1386 case SHADER_OPCODE_EXP2:
1387 case SHADER_OPCODE_LOG2:
1388 case SHADER_OPCODE_SIN:
1389 case SHADER_OPCODE_COS:
1390 break;
1391 default:
1392 unreachable("not reached: bad math opcode");
1393 }
1394
1395 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1396 * might be able to do better by doing execsize = 1 math and then
1397 * expanding that result out, but we would need to be careful with
1398 * masking.
1399 *
1400 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1401 * instructions, so we also move to a temp to set those up.
1402 */
1403 if (brw->gen == 6 || brw->gen == 7)
1404 src = fix_math_operand(src);
1405
1406 fs_inst *inst = emit(opcode, dst, src);
1407
1408 if (brw->gen < 6) {
1409 inst->base_mrf = 2;
1410 inst->mlen = dispatch_width / 8;
1411 }
1412
1413 return inst;
1414 }
1415
1416 fs_inst *
1417 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1418 {
1419 int base_mrf = 2;
1420 fs_inst *inst;
1421
1422 switch (opcode) {
1423 case SHADER_OPCODE_INT_QUOTIENT:
1424 case SHADER_OPCODE_INT_REMAINDER:
1425 if (brw->gen >= 7)
1426 no16("SIMD16 INTDIV unsupported\n");
1427 break;
1428 case SHADER_OPCODE_POW:
1429 break;
1430 default:
1431 unreachable("not reached: unsupported binary math opcode.");
1432 }
1433
1434 if (brw->gen >= 8) {
1435 inst = emit(opcode, dst, src0, src1);
1436 } else if (brw->gen >= 6) {
1437 src0 = fix_math_operand(src0);
1438 src1 = fix_math_operand(src1);
1439
1440 inst = emit(opcode, dst, src0, src1);
1441 } else {
1442 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1443 * "Message Payload":
1444 *
1445 * "Operand0[7]. For the INT DIV functions, this operand is the
1446 * denominator."
1447 * ...
1448 * "Operand1[7]. For the INT DIV functions, this operand is the
1449 * numerator."
1450 */
1451 bool is_int_div = opcode != SHADER_OPCODE_POW;
1452 fs_reg &op0 = is_int_div ? src1 : src0;
1453 fs_reg &op1 = is_int_div ? src0 : src1;
1454
1455 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1456 inst = emit(opcode, dst, op0, reg_null_f);
1457
1458 inst->base_mrf = base_mrf;
1459 inst->mlen = 2 * dispatch_width / 8;
1460 }
1461 return inst;
1462 }
1463
1464 void
1465 fs_visitor::assign_curb_setup()
1466 {
1467 if (dispatch_width == 8) {
1468 prog_data->first_curbe_grf = payload.num_regs;
1469 } else {
1470 prog_data->first_curbe_grf_16 = payload.num_regs;
1471 }
1472
1473 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1474
1475 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1476 foreach_in_list(fs_inst, inst, &instructions) {
1477 for (unsigned int i = 0; i < inst->sources; i++) {
1478 if (inst->src[i].file == UNIFORM) {
1479 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1480 int constant_nr;
1481 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1482 constant_nr = push_constant_loc[uniform_nr];
1483 } else {
1484 /* Section 5.11 of the OpenGL 4.1 spec says:
1485 * "Out-of-bounds reads return undefined values, which include
1486 * values from other variables of the active program or zero."
1487 * Just return the first push constant.
1488 */
1489 constant_nr = 0;
1490 }
1491
1492 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1493 constant_nr / 8,
1494 constant_nr % 8);
1495
1496 inst->src[i].file = HW_REG;
1497 inst->src[i].fixed_hw_reg = byte_offset(
1498 retype(brw_reg, inst->src[i].type),
1499 inst->src[i].subreg_offset);
1500 }
1501 }
1502 }
1503 }
1504
1505 void
1506 fs_visitor::calculate_urb_setup()
1507 {
1508 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1509 prog_data->urb_setup[i] = -1;
1510 }
1511
1512 int urb_next = 0;
1513 /* Figure out where each of the incoming setup attributes lands. */
1514 if (brw->gen >= 6) {
1515 if (_mesa_bitcount_64(fp->Base.InputsRead &
1516 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1517 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1518 * first 16 varying inputs, so we can put them wherever we want.
1519 * Just put them in order.
1520 *
1521 * This is useful because it means that (a) inputs not used by the
1522 * fragment shader won't take up valuable register space, and (b) we
1523 * won't have to recompile the fragment shader if it gets paired with
1524 * a different vertex (or geometry) shader.
1525 */
1526 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1527 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1528 BITFIELD64_BIT(i)) {
1529 prog_data->urb_setup[i] = urb_next++;
1530 }
1531 }
1532 } else {
1533 /* We have enough input varyings that the SF/SBE pipeline stage can't
1534 * arbitrarily rearrange them to suit our whim; we have to put them
1535 * in an order that matches the output of the previous pipeline stage
1536 * (geometry or vertex shader).
1537 */
1538 struct brw_vue_map prev_stage_vue_map;
1539 brw_compute_vue_map(brw, &prev_stage_vue_map,
1540 key->input_slots_valid);
1541 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1542 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1543 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1544 slot++) {
1545 int varying = prev_stage_vue_map.slot_to_varying[slot];
1546 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1547 * unused.
1548 */
1549 if (varying != BRW_VARYING_SLOT_COUNT &&
1550 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1551 BITFIELD64_BIT(varying))) {
1552 prog_data->urb_setup[varying] = slot - first_slot;
1553 }
1554 }
1555 urb_next = prev_stage_vue_map.num_slots - first_slot;
1556 }
1557 } else {
1558 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1559 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1560 /* Point size is packed into the header, not as a general attribute */
1561 if (i == VARYING_SLOT_PSIZ)
1562 continue;
1563
1564 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1565 /* The back color slot is skipped when the front color is
1566 * also written to. In addition, some slots can be
1567 * written in the vertex shader and not read in the
1568 * fragment shader. So the register number must always be
1569 * incremented, mapped or not.
1570 */
1571 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1572 prog_data->urb_setup[i] = urb_next;
1573 urb_next++;
1574 }
1575 }
1576
1577 /*
1578 * It's an FS-only attribute, and we did interpolation for this attribute
1579 * in the SF thread. So, count it here, too.
1580 *
1581 * See compile_sf_prog() for more info.
1582 */
1583 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1584 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1585 }
1586
1587 prog_data->num_varying_inputs = urb_next;
1588 }
1589
1590 void
1591 fs_visitor::assign_urb_setup()
1592 {
1593 int urb_start = payload.num_regs + prog_data->curb_read_length;
1594
1595 /* Offset all the urb_setup[] index by the actual position of the
1596 * setup regs, now that the location of the constants has been chosen.
1597 */
1598 foreach_in_list(fs_inst, inst, &instructions) {
1599 if (inst->opcode == FS_OPCODE_LINTERP) {
1600 assert(inst->src[2].file == HW_REG);
1601 inst->src[2].fixed_hw_reg.nr += urb_start;
1602 }
1603
1604 if (inst->opcode == FS_OPCODE_CINTERP) {
1605 assert(inst->src[0].file == HW_REG);
1606 inst->src[0].fixed_hw_reg.nr += urb_start;
1607 }
1608 }
1609
1610 /* Each attribute is 4 setup channels, each of which is half a reg. */
1611 this->first_non_payload_grf =
1612 urb_start + prog_data->num_varying_inputs * 2;
1613 }
1614
1615 /**
1616 * Split large virtual GRFs into separate components if we can.
1617 *
1618 * This is mostly duplicated with what brw_fs_vector_splitting does,
1619 * but that's really conservative because it's afraid of doing
1620 * splitting that doesn't result in real progress after the rest of
1621 * the optimization phases, which would cause infinite looping in
1622 * optimization. We can do it once here, safely. This also has the
1623 * opportunity to split interpolated values, or maybe even uniforms,
1624 * which we don't have at the IR level.
1625 *
1626 * We want to split, because virtual GRFs are what we register
1627 * allocate and spill (due to contiguousness requirements for some
1628 * instructions), and they're what we naturally generate in the
1629 * codegen process, but most virtual GRFs don't actually need to be
1630 * contiguous sets of GRFs. If we split, we'll end up with reduced
1631 * live intervals and better dead code elimination and coalescing.
1632 */
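/* A small illustration (editorial, not part of the original comment): a
 * virtual GRF of size 4 that stays marked in split_grf[] becomes a size-1
 * GRF plus three freshly allocated size-1 GRFs with consecutive numbers;
 * any access at reg_offset N > 0 is then rewritten to the (N-1)th new
 * register with reg_offset 0.
 */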
1633 void
1634 fs_visitor::split_virtual_grfs()
1635 {
1636 int num_vars = this->virtual_grf_count;
1637 bool split_grf[num_vars];
1638 int new_virtual_grf[num_vars];
1639
1640 /* Try to split anything > 0 sized. */
1641 for (int i = 0; i < num_vars; i++) {
1642 if (this->virtual_grf_sizes[i] != 1)
1643 split_grf[i] = true;
1644 else
1645 split_grf[i] = false;
1646 }
1647
1648 if (brw->has_pln &&
1649 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1650 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1651 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1652 * Gen6, that was the only supported interpolation mode, and since Gen6,
1653 * delta_x and delta_y are in fixed hardware registers.
1654 */
1655 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1656 false;
1657 }
1658
1659 foreach_in_list(fs_inst, inst, &instructions) {
1660 /* If there's a SEND message that requires contiguous destination
1661 * registers, no splitting is allowed.
1662 */
1663 if (inst->regs_written > 1) {
1664 split_grf[inst->dst.reg] = false;
1665 }
1666
1667 /* If we're sending from a GRF, don't split it, on the assumption that
1668 * the send is reading the whole thing.
1669 */
1670 if (inst->is_send_from_grf()) {
1671 for (int i = 0; i < inst->sources; i++) {
1672 if (inst->src[i].file == GRF) {
1673 split_grf[inst->src[i].reg] = false;
1674 }
1675 }
1676 }
1677 }
1678
1679 /* Allocate new space for split regs. Note that the virtual
1680 * numbers will be contiguous.
1681 */
1682 for (int i = 0; i < num_vars; i++) {
1683 if (split_grf[i]) {
1684 new_virtual_grf[i] = virtual_grf_alloc(1);
1685 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1686 int reg = virtual_grf_alloc(1);
1687 assert(reg == new_virtual_grf[i] + j - 1);
1688 (void) reg;
1689 }
1690 this->virtual_grf_sizes[i] = 1;
1691 }
1692 }
1693
1694 foreach_in_list(fs_inst, inst, &instructions) {
1695 if (inst->dst.file == GRF &&
1696 split_grf[inst->dst.reg] &&
1697 inst->dst.reg_offset != 0) {
1698 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1699 inst->dst.reg_offset - 1);
1700 inst->dst.reg_offset = 0;
1701 }
1702 for (int i = 0; i < inst->sources; i++) {
1703 if (inst->src[i].file == GRF &&
1704 split_grf[inst->src[i].reg] &&
1705 inst->src[i].reg_offset != 0) {
1706 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1707 inst->src[i].reg_offset - 1);
1708 inst->src[i].reg_offset = 0;
1709 }
1710 }
1711 }
1712 invalidate_live_intervals();
1713 }
1714
1715 /**
1716 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1717 *
1718 * During code generation, we create tons of temporary variables, many of
1719 * which get immediately killed and are never used again. Yet, in later
1720 * optimization and analysis passes, such as compute_live_intervals, we need
1721 * to loop over all the virtual GRFs. Compacting them can save a lot of
1722 * overhead.
1723 */
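/* A sketch of the remap (illustrative only): with four virtual GRFs of which
 * only #1 is unreferenced, remap_table ends up as { 0, -1, 1, 2 }, the
 * virtual_grf_sizes[] entries slide down to close the gap, and every dst/src
 * register number below is rewritten through the table.
 */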
1724 void
1725 fs_visitor::compact_virtual_grfs()
1726 {
1727 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER))
1728 return;
1729
1730 /* Mark which virtual GRFs are used, and count how many. */
1731 int remap_table[this->virtual_grf_count];
1732 memset(remap_table, -1, sizeof(remap_table));
1733
1734 foreach_in_list(const fs_inst, inst, &instructions) {
1735 if (inst->dst.file == GRF)
1736 remap_table[inst->dst.reg] = 0;
1737
1738 for (int i = 0; i < inst->sources; i++) {
1739 if (inst->src[i].file == GRF)
1740 remap_table[inst->src[i].reg] = 0;
1741 }
1742 }
1743
1744 /* Compact the GRF arrays. */
1745 int new_index = 0;
1746 for (int i = 0; i < this->virtual_grf_count; i++) {
1747 if (remap_table[i] != -1) {
1748 remap_table[i] = new_index;
1749 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1750 invalidate_live_intervals();
1751 ++new_index;
1752 }
1753 }
1754
1755 this->virtual_grf_count = new_index;
1756
1757 /* Patch all the instructions to use the newly renumbered registers */
1758 foreach_in_list(fs_inst, inst, &instructions) {
1759 if (inst->dst.file == GRF)
1760 inst->dst.reg = remap_table[inst->dst.reg];
1761
1762 for (int i = 0; i < inst->sources; i++) {
1763 if (inst->src[i].file == GRF)
1764 inst->src[i].reg = remap_table[inst->src[i].reg];
1765 }
1766 }
1767 }
1768
1769 /*
1770 * Implements array access of uniforms by inserting a
1771 * PULL_CONSTANT_LOAD instruction.
1772 *
1773 * Unlike temporary GRF array access (where we don't support it due to
1774 * the difficulty of doing relative addressing on instruction
1775 * destinations), we could potentially do array access of uniforms
1776 * that were loaded in GRF space as push constants. In real-world
1777 * usage we've seen, though, the arrays being used are always larger
1778 * than we could load as push constants, so just always move all
1779 * uniform array access out to a pull constant buffer.
1780 */
1781 void
1782 fs_visitor::move_uniform_array_access_to_pull_constants()
1783 {
1784 if (dispatch_width != 8)
1785 return;
1786
1787 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1788
1789 for (unsigned int i = 0; i < uniforms; i++) {
1790 pull_constant_loc[i] = -1;
1791 }
1792
1793 /* Walk through and find array access of uniforms. Put a copy of that
1794 * uniform in the pull constant buffer.
1795 *
1796 * Note that we don't move constant-indexed accesses to arrays. No
1797 * testing has been done of the performance impact of this choice.
1798 */
1799 foreach_in_list_safe(fs_inst, inst, &instructions) {
1800 for (int i = 0 ; i < inst->sources; i++) {
1801 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1802 continue;
1803
1804 int uniform = inst->src[i].reg;
1805
1806 /* If this array isn't already present in the pull constant buffer,
1807 * add it.
1808 */
1809 if (pull_constant_loc[uniform] == -1) {
1810 const float **values = &stage_prog_data->param[uniform];
1811
1812 assert(param_size[uniform]);
1813
1814 for (int j = 0; j < param_size[uniform]; j++) {
1815 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1816
1817 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1818 values[j];
1819 }
1820 }
1821 }
1822 }
1823 }
1824
1825 /**
1826 * Assign UNIFORM file registers to either push constants or pull constants.
1827 *
1828 * We allow a fragment shader to have more than the specified minimum
1829 * maximum number of fragment shader uniform components (64). If
1830 * there are too many of these, they'd fill up all of the register space.
1831 * So, this will push some of them out to the pull constant buffer and
1832 * update the program to load them.
1833 */
1834 void
1835 fs_visitor::assign_constant_locations()
1836 {
1837 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1838 if (dispatch_width != 8)
1839 return;
1840
1841 /* Find which UNIFORM registers are still in use. */
1842 bool is_live[uniforms];
1843 for (unsigned int i = 0; i < uniforms; i++) {
1844 is_live[i] = false;
1845 }
1846
1847 foreach_in_list(fs_inst, inst, &instructions) {
1848 for (int i = 0; i < inst->sources; i++) {
1849 if (inst->src[i].file != UNIFORM)
1850 continue;
1851
1852 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1853 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1854 is_live[constant_nr] = true;
1855 }
1856 }
1857
1858 /* Only allow 16 registers (128 uniform components) as push constants.
1859 *
1860 * Just demote the end of the list. We could probably do better
1861 * here, demoting things that are rarely used in the program first.
1862 */
1863 unsigned int max_push_components = 16 * 8;
1864 unsigned int num_push_constants = 0;
1865
1866 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1867
1868 for (unsigned int i = 0; i < uniforms; i++) {
1869 if (!is_live[i] || pull_constant_loc[i] != -1) {
1870 /* This UNIFORM register is either dead, or has already been demoted
1871 * to a pull const. Mark it as no longer living in the param[] array.
1872 */
1873 push_constant_loc[i] = -1;
1874 continue;
1875 }
1876
1877 if (num_push_constants < max_push_components) {
1878 /* Retain as a push constant. Record the location in the params[]
1879 * array.
1880 */
1881 push_constant_loc[i] = num_push_constants++;
1882 } else {
1883 /* Demote to a pull constant. */
1884 push_constant_loc[i] = -1;
1885
1886 int pull_index = stage_prog_data->nr_pull_params++;
1887 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1888 pull_constant_loc[i] = pull_index;
1889 }
1890 }
1891
1892 stage_prog_data->nr_params = num_push_constants;
1893
1894 /* Up until now, the param[] array has been indexed by reg + reg_offset
1895 * of UNIFORM registers. Condense it to only contain the uniforms we
1896 * chose to upload as push constants.
1897 */
1898 for (unsigned int i = 0; i < uniforms; i++) {
1899 int remapped = push_constant_loc[i];
1900
1901 if (remapped == -1)
1902 continue;
1903
1904 assert(remapped <= (int)i);
1905 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1906 }
1907 }
1908
1909 /**
1910 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1911 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1912 */
1913 void
1914 fs_visitor::demote_pull_constants()
1915 {
1916 foreach_in_list(fs_inst, inst, &instructions) {
1917 for (int i = 0; i < inst->sources; i++) {
1918 if (inst->src[i].file != UNIFORM)
1919 continue;
1920
1921 int pull_index = pull_constant_loc[inst->src[i].reg +
1922 inst->src[i].reg_offset];
1923 if (pull_index == -1)
1924 continue;
1925
1926          /* Set up the annotation tracking for newly generated instructions. */
1927 base_ir = inst->ir;
1928 current_annotation = inst->annotation;
1929
1930 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1931 fs_reg dst = fs_reg(this, glsl_type::float_type);
1932
1933 /* Generate a pull load into dst. */
1934 if (inst->src[i].reladdr) {
1935 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
1936 surf_index,
1937 *inst->src[i].reladdr,
1938 pull_index);
1939 inst->insert_before(&list);
1940 inst->src[i].reladdr = NULL;
1941 } else {
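            /* The load returns an aligned vec4, so round the byte offset down
             * to a 16-byte boundary here and use set_smear() below to select
             * the desired component within that vec4.
             */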
1942 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1943 fs_inst *pull =
1944 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1945 dst, surf_index, offset);
1946 inst->insert_before(pull);
1947 inst->src[i].set_smear(pull_index & 3);
1948 }
1949
1950 /* Rewrite the instruction to use the temporary VGRF. */
1951 inst->src[i].file = GRF;
1952 inst->src[i].reg = dst.reg;
1953 inst->src[i].reg_offset = 0;
1954 }
1955 }
1956 invalidate_live_intervals();
1957 }
1958
1959 bool
1960 fs_visitor::opt_algebraic()
1961 {
1962 bool progress = false;
1963
1964 foreach_in_list(fs_inst, inst, &instructions) {
1965 switch (inst->opcode) {
1966 case BRW_OPCODE_MUL:
1967 if (inst->src[1].file != IMM)
1968 continue;
1969
1970 /* a * 1.0 = a */
1971 if (inst->src[1].is_one()) {
1972 inst->opcode = BRW_OPCODE_MOV;
1973 inst->src[1] = reg_undef;
1974 progress = true;
1975 break;
1976 }
1977
1978 /* a * 0.0 = 0.0 */
1979 if (inst->src[1].is_zero()) {
1980 inst->opcode = BRW_OPCODE_MOV;
1981 inst->src[0] = inst->src[1];
1982 inst->src[1] = reg_undef;
1983 progress = true;
1984 break;
1985 }
1986
1987 break;
1988 case BRW_OPCODE_ADD:
1989 if (inst->src[1].file != IMM)
1990 continue;
1991
1992 /* a + 0.0 = a */
1993 if (inst->src[1].is_zero()) {
1994 inst->opcode = BRW_OPCODE_MOV;
1995 inst->src[1] = reg_undef;
1996 progress = true;
1997 break;
1998 }
1999 break;
2000 case BRW_OPCODE_OR:
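      /* a | a = a */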
2001 if (inst->src[0].equals(inst->src[1])) {
2002 inst->opcode = BRW_OPCODE_MOV;
2003 inst->src[1] = reg_undef;
2004 progress = true;
2005 break;
2006 }
2007 break;
2008 case BRW_OPCODE_LRP:
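      /* If both blend inputs are equal, the result is that value no matter
       * what the interpolation factor is.
       */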
2009 if (inst->src[1].equals(inst->src[2])) {
2010 inst->opcode = BRW_OPCODE_MOV;
2011 inst->src[0] = inst->src[1];
2012 inst->src[1] = reg_undef;
2013 inst->src[2] = reg_undef;
2014 progress = true;
2015 break;
2016 }
2017 break;
2018 case BRW_OPCODE_SEL:
2019 if (inst->src[0].equals(inst->src[1])) {
2020 inst->opcode = BRW_OPCODE_MOV;
2021 inst->src[1] = reg_undef;
2022 inst->predicate = BRW_PREDICATE_NONE;
2023 inst->predicate_inverse = false;
2024 progress = true;
2025 } else if (inst->saturate && inst->src[1].file == IMM) {
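         /* With saturate set, a sel.l/sel.le (min) against an immediate
          * >= 1.0 reduces to a saturated MOV of src0, since the saturate
          * already clamps the result to 1.0.  Likewise a sel.g/sel.ge (max)
          * against an immediate <= 0.0 reduces to a saturated MOV, since
          * the saturate clamps to 0.0.
          */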
2026 switch (inst->conditional_mod) {
2027 case BRW_CONDITIONAL_LE:
2028 case BRW_CONDITIONAL_L:
2029 switch (inst->src[1].type) {
2030 case BRW_REGISTER_TYPE_F:
2031 if (inst->src[1].imm.f >= 1.0f) {
2032 inst->opcode = BRW_OPCODE_MOV;
2033 inst->src[1] = reg_undef;
2034 progress = true;
2035 }
2036 break;
2037 default:
2038 break;
2039 }
2040 break;
2041 case BRW_CONDITIONAL_GE:
2042 case BRW_CONDITIONAL_G:
2043 switch (inst->src[1].type) {
2044 case BRW_REGISTER_TYPE_F:
2045 if (inst->src[1].imm.f <= 0.0f) {
2046 inst->opcode = BRW_OPCODE_MOV;
2047 inst->src[1] = reg_undef;
2048 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2049 progress = true;
2050 }
2051 break;
2052 default:
2053 break;
2054 }
2055 default:
2056 break;
2057 }
2058 }
2059 break;
2060 default:
2061 break;
2062 }
2063 }
2064
2065 return progress;
2066 }
2067
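/**
 * Attempts to rewrite "GRF = ...; MOV MRF, GRF" sequences so that the
 * computation writes directly into the MRF, eliminating the copy.
 */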
2068 bool
2069 fs_visitor::compute_to_mrf()
2070 {
2071 bool progress = false;
2072 int next_ip = 0;
2073
2074 calculate_live_intervals();
2075
2076 foreach_in_list_safe(fs_inst, inst, &instructions) {
2077 int ip = next_ip;
2078 next_ip++;
2079
2080 if (inst->opcode != BRW_OPCODE_MOV ||
2081 inst->is_partial_write() ||
2082 inst->dst.file != MRF || inst->src[0].file != GRF ||
2083 inst->dst.type != inst->src[0].type ||
2084 inst->src[0].abs || inst->src[0].negate ||
2085 !inst->src[0].is_contiguous() ||
2086 inst->src[0].subreg_offset)
2087 continue;
2088
2089 /* Work out which hardware MRF registers are written by this
2090 * instruction.
2091 */
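      /* A compressed write with BRW_MRF_COMPR4 set lands in MRF m(n) and
       * m(n+4) rather than m(n) and m(n+1), hence the +4 below.
       */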
2092 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2093 int mrf_high;
2094 if (inst->dst.reg & BRW_MRF_COMPR4) {
2095 mrf_high = mrf_low + 4;
2096 } else if (dispatch_width == 16 &&
2097 (!inst->force_uncompressed && !inst->force_sechalf)) {
2098 mrf_high = mrf_low + 1;
2099 } else {
2100 mrf_high = mrf_low;
2101 }
2102
2103 /* Can't compute-to-MRF this GRF if someone else was going to
2104 * read it later.
2105 */
2106 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2107 continue;
2108
2109       /* Found a move of a GRF to an MRF.  Let's see if we can
2110        * rewrite the instruction that produced this GRF to write into the MRF.
2111 */
2112 fs_inst *scan_inst;
2113 for (scan_inst = (fs_inst *)inst->prev;
2114 !scan_inst->is_head_sentinel();
2115 scan_inst = (fs_inst *)scan_inst->prev) {
2116 if (scan_inst->dst.file == GRF &&
2117 scan_inst->dst.reg == inst->src[0].reg) {
2118             /* Found the last instruction to write the reg we want to turn
2119 * into a compute-to-MRF.
2120 */
2121
2122 /* If this one instruction didn't populate all the
2123 * channels, bail. We might be able to rewrite everything
2124 * that writes that reg, but it would require smarter
2125 * tracking to delay the rewriting until complete success.
2126 */
2127 if (scan_inst->is_partial_write())
2128 break;
2129
2130 /* Things returning more than one register would need us to
2131 * understand coalescing out more than one MOV at a time.
2132 */
2133 if (scan_inst->regs_written > 1)
2134 break;
2135
2136 /* SEND instructions can't have MRF as a destination. */
2137 if (scan_inst->mlen)
2138 break;
2139
2140 if (brw->gen == 6) {
2141 /* gen6 math instructions must have the destination be
2142 * GRF, so no compute-to-MRF for them.
2143 */
2144 if (scan_inst->is_math()) {
2145 break;
2146 }
2147 }
2148
2149 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2150 /* Found the creator of our MRF's source value. */
2151 scan_inst->dst.file = MRF;
2152 scan_inst->dst.reg = inst->dst.reg;
2153 scan_inst->saturate |= inst->saturate;
2154 inst->remove();
2155 progress = true;
2156 }
2157 break;
2158 }
2159
2160 /* We don't handle control flow here. Most computation of
2161           * values that end up in MRFs happens shortly before the MRF
2162 * write anyway.
2163 */
2164 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2165 break;
2166
2167 /* You can't read from an MRF, so if someone else reads our
2168 * MRF's source GRF that we wanted to rewrite, that stops us.
2169 */
2170 bool interfered = false;
2171 for (int i = 0; i < scan_inst->sources; i++) {
2172 if (scan_inst->src[i].file == GRF &&
2173 scan_inst->src[i].reg == inst->src[0].reg &&
2174 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2175 interfered = true;
2176 }
2177 }
2178 if (interfered)
2179 break;
2180
2181 if (scan_inst->dst.file == MRF) {
2182 /* If somebody else writes our MRF here, we can't
2183 * compute-to-MRF before that.
2184 */
2185 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2186 int scan_mrf_high;
2187
2188 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2189 scan_mrf_high = scan_mrf_low + 4;
2190 } else if (dispatch_width == 16 &&
2191 (!scan_inst->force_uncompressed &&
2192 !scan_inst->force_sechalf)) {
2193 scan_mrf_high = scan_mrf_low + 1;
2194 } else {
2195 scan_mrf_high = scan_mrf_low;
2196 }
2197
2198 if (mrf_low == scan_mrf_low ||
2199 mrf_low == scan_mrf_high ||
2200 mrf_high == scan_mrf_low ||
2201 mrf_high == scan_mrf_high) {
2202 break;
2203 }
2204 }
2205
2206 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2207 /* Found a SEND instruction, which means that there are
2208 * live values in MRFs from base_mrf to base_mrf +
2209 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2210 * above it.
2211 */
2212 if (mrf_low >= scan_inst->base_mrf &&
2213 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2214 break;
2215 }
2216 if (mrf_high >= scan_inst->base_mrf &&
2217 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2218 break;
2219 }
2220 }
2221 }
2222 }
2223
2224 if (progress)
2225 invalidate_live_intervals();
2226
2227 return progress;
2228 }
2229
2230 /**
2231 * Walks through basic blocks, looking for repeated MRF writes and
2232 * removing the later ones.
2233 */
2234 bool
2235 fs_visitor::remove_duplicate_mrf_writes()
2236 {
2237 fs_inst *last_mrf_move[16];
2238 bool progress = false;
2239
2240    /* The MRF tracking below doesn't handle compressed instructions, so skip SIMD16. */
2241 if (dispatch_width == 16)
2242 return false;
2243
2244 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2245
2246 foreach_in_list_safe(fs_inst, inst, &instructions) {
2247 if (inst->is_control_flow()) {
2248 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2249 }
2250
2251 if (inst->opcode == BRW_OPCODE_MOV &&
2252 inst->dst.file == MRF) {
2253 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2254 if (prev_inst && inst->equals(prev_inst)) {
2255 inst->remove();
2256 progress = true;
2257 continue;
2258 }
2259 }
2260
2261 /* Clear out the last-write records for MRFs that were overwritten. */
2262 if (inst->dst.file == MRF) {
2263 last_mrf_move[inst->dst.reg] = NULL;
2264 }
2265
2266 if (inst->mlen > 0 && inst->base_mrf != -1) {
2267 /* Found a SEND instruction, which will include two or fewer
2268 * implied MRF writes. We could do better here.
2269 */
2270 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2271 last_mrf_move[inst->base_mrf + i] = NULL;
2272 }
2273 }
2274
2275 /* Clear out any MRF move records whose sources got overwritten. */
2276 if (inst->dst.file == GRF) {
2277 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2278 if (last_mrf_move[i] &&
2279 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2280 last_mrf_move[i] = NULL;
2281 }
2282 }
2283 }
2284
2285 if (inst->opcode == BRW_OPCODE_MOV &&
2286 inst->dst.file == MRF &&
2287 inst->src[0].file == GRF &&
2288 !inst->is_partial_write()) {
2289 last_mrf_move[inst->dst.reg] = inst;
2290 }
2291 }
2292
2293 if (progress)
2294 invalidate_live_intervals();
2295
2296 return progress;
2297 }
2298
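/**
 * Helper for the gen4 SEND dependency workarounds below: clears the
 * dependency flag for any GRF in [first_grf, first_grf + grf_len) that this
 * instruction sources, since sourcing a register satisfies the
 * outstanding-write hazard.
 */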
2299 static void
2300 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2301 int first_grf, int grf_len)
2302 {
2303 bool inst_simd16 = (dispatch_width > 8 &&
2304 !inst->force_uncompressed &&
2305 !inst->force_sechalf);
2306
2307 /* Clear the flag for registers that actually got read (as expected). */
2308 for (int i = 0; i < inst->sources; i++) {
2309 int grf;
2310 if (inst->src[i].file == GRF) {
2311 grf = inst->src[i].reg;
2312 } else if (inst->src[i].file == HW_REG &&
2313 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2314 grf = inst->src[i].fixed_hw_reg.nr;
2315 } else {
2316 continue;
2317 }
2318
2319 if (grf >= first_grf &&
2320 grf < first_grf + grf_len) {
2321 deps[grf - first_grf] = false;
2322 if (inst_simd16)
2323 deps[grf - first_grf + 1] = false;
2324 }
2325 }
2326 }
2327
2328 /**
2329 * Implements this workaround for the original 965:
2330 *
2331 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2332 * check for post destination dependencies on this instruction, software
2333 * must ensure that there is no destination hazard for the case of ‘write
2334 * followed by a posted write’ shown in the following example.
2335 *
2336 * 1. mov r3 0
2337 * 2. send r3.xy <rest of send instruction>
2338 * 3. mov r2 r3
2339 *
2340 * Due to no post-destination dependency check on the ‘send’, the above
2341 * code sequence could have two instructions (1 and 2) in flight at the
2342 * same time that both consider ‘r3’ as the target of their final writes.
2343 */
2344 void
2345 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2346 {
2347 int reg_size = dispatch_width / 8;
2348 int write_len = inst->regs_written * reg_size;
2349 int first_write_grf = inst->dst.reg;
2350 bool needs_dep[BRW_MAX_MRF];
2351 assert(write_len < (int)sizeof(needs_dep) - 1);
2352
2353 memset(needs_dep, false, sizeof(needs_dep));
2354 memset(needs_dep, true, write_len);
2355
2356 clear_deps_for_inst_src(inst, dispatch_width,
2357 needs_dep, first_write_grf, write_len);
2358
2359 /* Walk backwards looking for writes to registers we're writing which
2360 * aren't read since being written. If we hit the start of the program,
2361 * we assume that there are no outstanding dependencies on entry to the
2362 * program.
2363 */
2364 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2365 !scan_inst->is_head_sentinel();
2366 scan_inst = (fs_inst *)scan_inst->prev) {
2367
2368 /* If we hit control flow, assume that there *are* outstanding
2369 * dependencies, and force their cleanup before our instruction.
2370 */
2371 if (scan_inst->is_control_flow()) {
2372 for (int i = 0; i < write_len; i++) {
2373 if (needs_dep[i]) {
2374 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2375 }
2376 }
2377 return;
2378 }
2379
2380 bool scan_inst_simd16 = (dispatch_width > 8 &&
2381 !scan_inst->force_uncompressed &&
2382 !scan_inst->force_sechalf);
2383
2384 /* We insert our reads as late as possible on the assumption that any
2385 * instruction but a MOV that might have left us an outstanding
2386 * dependency has more latency than a MOV.
2387 */
2388 if (scan_inst->dst.file == GRF) {
2389 for (int i = 0; i < scan_inst->regs_written; i++) {
2390 int reg = scan_inst->dst.reg + i * reg_size;
2391
2392 if (reg >= first_write_grf &&
2393 reg < first_write_grf + write_len &&
2394 needs_dep[reg - first_write_grf]) {
2395 inst->insert_before(DEP_RESOLVE_MOV(reg));
2396 needs_dep[reg - first_write_grf] = false;
2397 if (scan_inst_simd16)
2398 needs_dep[reg - first_write_grf + 1] = false;
2399 }
2400 }
2401 }
2402
2403 /* Clear the flag for registers that actually got read (as expected). */
2404 clear_deps_for_inst_src(scan_inst, dispatch_width,
2405 needs_dep, first_write_grf, write_len);
2406
2407 /* Continue the loop only if we haven't resolved all the dependencies */
2408 int i;
2409 for (i = 0; i < write_len; i++) {
2410 if (needs_dep[i])
2411 break;
2412 }
2413 if (i == write_len)
2414 return;
2415 }
2416 }
2417
2418 /**
2419 * Implements this workaround for the original 965:
2420 *
2421 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2422 * used as a destination register until after it has been sourced by an
2423 * instruction with a different destination register.
2424 */
2425 void
2426 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2427 {
2428 int write_len = inst->regs_written * dispatch_width / 8;
2429 int first_write_grf = inst->dst.reg;
2430 bool needs_dep[BRW_MAX_MRF];
2431 assert(write_len < (int)sizeof(needs_dep) - 1);
2432
2433 memset(needs_dep, false, sizeof(needs_dep));
2434 memset(needs_dep, true, write_len);
2435 /* Walk forwards looking for writes to registers we're writing which aren't
2436 * read before being written.
2437 */
2438 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2439 !scan_inst->is_tail_sentinel();
2440 scan_inst = (fs_inst *)scan_inst->next) {
2441 /* If we hit control flow, force resolve all remaining dependencies. */
2442 if (scan_inst->is_control_flow()) {
2443 for (int i = 0; i < write_len; i++) {
2444 if (needs_dep[i])
2445 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2446 }
2447 return;
2448 }
2449
2450 /* Clear the flag for registers that actually got read (as expected). */
2451 clear_deps_for_inst_src(scan_inst, dispatch_width,
2452 needs_dep, first_write_grf, write_len);
2453
2454 /* We insert our reads as late as possible since they're reading the
2455 * result of a SEND, which has massive latency.
2456 */
2457 if (scan_inst->dst.file == GRF &&
2458 scan_inst->dst.reg >= first_write_grf &&
2459 scan_inst->dst.reg < first_write_grf + write_len &&
2460 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2461 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2462 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2463 }
2464
2465 /* Continue the loop only if we haven't resolved all the dependencies */
2466 int i;
2467 for (i = 0; i < write_len; i++) {
2468 if (needs_dep[i])
2469 break;
2470 }
2471 if (i == write_len)
2472 return;
2473 }
2474
2475 /* If we hit the end of the program, resolve all remaining dependencies out
2476 * of paranoia.
2477 */
2478 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2479 assert(last_inst->eot);
2480 for (int i = 0; i < write_len; i++) {
2481 if (needs_dep[i])
2482 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2483 }
2484 }
2485
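/**
 * Entry point for the two gen4 SEND dependency workarounds above.  This runs
 * after register allocation (see the note below), so the GRF numbers seen
 * here are the actual hardware registers.
 */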
2486 void
2487 fs_visitor::insert_gen4_send_dependency_workarounds()
2488 {
2489 if (brw->gen != 4 || brw->is_g4x)
2490 return;
2491
2492 bool progress = false;
2493
2494 /* Note that we're done with register allocation, so GRF fs_regs always
2495 * have a .reg_offset of 0.
2496 */
2497
2498 foreach_in_list_safe(fs_inst, inst, &instructions) {
2499 if (inst->mlen != 0 && inst->dst.file == GRF) {
2500 insert_gen4_pre_send_dependency_workarounds(inst);
2501 insert_gen4_post_send_dependency_workarounds(inst);
2502 progress = true;
2503 }
2504 }
2505
2506 if (progress)
2507 invalidate_live_intervals();
2508 }
2509
2510 /**
2511 * Turns the generic expression-style uniform pull constant load instruction
2512 * into a hardware-specific series of instructions for loading a pull
2513 * constant.
2514 *
2515 * The expression style allows the CSE pass before this to optimize out
2516 * repeated loads from the same offset, and gives the pre-register-allocation
2517 * scheduling full flexibility, while the conversion to native instructions
2518 * allows the post-register-allocation scheduler the best information
2519 * possible.
2520 *
2521 * Note that execution masking for setting up pull constant loads is special:
2522 * the channels that need to be written are unrelated to the current execution
2523 * mask, since a later instruction will use one of the result channels as a
2524 * source operand for all 8 or 16 of its channels.
2525 */
2526 void
2527 fs_visitor::lower_uniform_pull_constant_loads()
2528 {
2529 foreach_in_list(fs_inst, inst, &instructions) {
2530 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2531 continue;
2532
2533 if (brw->gen >= 7) {
2534 /* The offset arg before was a vec4-aligned byte offset. We need to
2535 * turn it into a dword offset.
2536 */
2537 fs_reg const_offset_reg = inst->src[1];
2538 assert(const_offset_reg.file == IMM &&
2539 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2540 const_offset_reg.imm.u /= 4;
2541 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2542
2543 /* This is actually going to be a MOV, but since only the first dword
2544 * is accessed, we have a special opcode to do just that one. Note
2545 * that this needs to be an operation that will be considered a def
2546 * by live variable analysis, or register allocation will explode.
2547 */
2548 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2549 payload, const_offset_reg);
2550 setup->force_writemask_all = true;
2551
2552 setup->ir = inst->ir;
2553 setup->annotation = inst->annotation;
2554 inst->insert_before(setup);
2555
2556 /* Similarly, this will only populate the first 4 channels of the
2557 * result register (since we only use smear values from 0-3), but we
2558 * don't tell the optimizer.
2559 */
2560 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2561 inst->src[1] = payload;
2562
2563 invalidate_live_intervals();
2564 } else {
2565 /* Before register allocation, we didn't tell the scheduler about the
2566 * MRF we use. We know it's safe to use this MRF because nothing
2567 * else does except for register spill/unspill, which generates and
2568 * uses its MRF within a single IR instruction.
2569 */
2570 inst->base_mrf = 14;
2571 inst->mlen = 1;
2572 }
2573 }
2574 }
2575
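/**
 * Lowers SHADER_OPCODE_LOAD_PAYLOAD into a series of MOVs that copy each
 * source into consecutive reg_offsets of the destination, with src[0]
 * optionally providing a message header.
 */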
2576 bool
2577 fs_visitor::lower_load_payload()
2578 {
2579 bool progress = false;
2580
2581 foreach_in_list_safe(fs_inst, inst, &instructions) {
2582 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
2583 fs_reg dst = inst->dst;
2584
2585 /* src[0] represents the (optional) message header. */
2586 if (inst->src[0].file != BAD_FILE) {
2587 inst->insert_before(MOV(dst, inst->src[0]));
2588 }
2589 dst.reg_offset++;
2590
2591 for (int i = 1; i < inst->sources; i++) {
2592 inst->insert_before(MOV(dst, inst->src[i]));
2593 dst.reg_offset++;
2594 }
2595
2596 inst->remove();
2597 progress = true;
2598 }
2599 }
2600
2601 if (progress)
2602 invalidate_live_intervals();
2603
2604 return progress;
2605 }
2606
2607 void
2608 fs_visitor::dump_instructions()
2609 {
2610 dump_instructions(NULL);
2611 }
2612
2613 void
2614 fs_visitor::dump_instructions(const char *name)
2615 {
2616 calculate_register_pressure();
2617 FILE *file = stderr;
2618 if (name && geteuid() != 0) {
2619 file = fopen(name, "w");
2620 if (!file)
2621 file = stderr;
2622 }
2623
2624 int ip = 0, max_pressure = 0;
2625 foreach_in_list(backend_instruction, inst, &instructions) {
2626 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2627 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2628 dump_instruction(inst, file);
2629 ++ip;
2630 }
2631 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
2632
2633 if (file != stderr) {
2634 fclose(file);
2635 }
2636 }
2637
2638 void
2639 fs_visitor::dump_instruction(backend_instruction *be_inst)
2640 {
2641 dump_instruction(be_inst, stderr);
2642 }
2643
2644 void
2645 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
2646 {
2647 fs_inst *inst = (fs_inst *)be_inst;
2648
2649 if (inst->predicate) {
2650 fprintf(file, "(%cf0.%d) ",
2651 inst->predicate_inverse ? '-' : '+',
2652 inst->flag_subreg);
2653 }
2654
2655 fprintf(file, "%s", brw_instruction_name(inst->opcode));
2656 if (inst->saturate)
2657 fprintf(file, ".sat");
2658 if (inst->conditional_mod) {
2659 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
2660 if (!inst->predicate &&
2661 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2662 inst->opcode != BRW_OPCODE_IF &&
2663 inst->opcode != BRW_OPCODE_WHILE))) {
2664 fprintf(file, ".f0.%d", inst->flag_subreg);
2665 }
2666 }
2667 fprintf(file, " ");
2668
2669
2670 switch (inst->dst.file) {
2671 case GRF:
2672 fprintf(file, "vgrf%d", inst->dst.reg);
2673 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
2674 inst->dst.subreg_offset)
2675 fprintf(file, "+%d.%d",
2676 inst->dst.reg_offset, inst->dst.subreg_offset);
2677 break;
2678 case MRF:
2679 fprintf(file, "m%d", inst->dst.reg);
2680 break;
2681 case BAD_FILE:
2682 fprintf(file, "(null)");
2683 break;
2684 case UNIFORM:
2685 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
2686 break;
2687 case HW_REG:
2688 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2689 switch (inst->dst.fixed_hw_reg.nr) {
2690 case BRW_ARF_NULL:
2691 fprintf(file, "null");
2692 break;
2693 case BRW_ARF_ADDRESS:
2694 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
2695 break;
2696 case BRW_ARF_ACCUMULATOR:
2697 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
2698 break;
2699 case BRW_ARF_FLAG:
2700 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2701 inst->dst.fixed_hw_reg.subnr);
2702 break;
2703 default:
2704 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2705 inst->dst.fixed_hw_reg.subnr);
2706 break;
2707 }
2708 } else {
2709 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
2710 }
2711 if (inst->dst.fixed_hw_reg.subnr)
2712 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
2713 break;
2714 default:
2715 fprintf(file, "???");
2716 break;
2717 }
2718 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
2719
2720 for (int i = 0; i < inst->sources && inst->src[i].file != BAD_FILE; i++) {
2721 if (inst->src[i].negate)
2722 fprintf(file, "-");
2723 if (inst->src[i].abs)
2724 fprintf(file, "|");
2725 switch (inst->src[i].file) {
2726 case GRF:
2727 fprintf(file, "vgrf%d", inst->src[i].reg);
2728 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2729 inst->src[i].subreg_offset)
2730 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2731 inst->src[i].subreg_offset);
2732 break;
2733 case MRF:
2734 fprintf(file, "***m%d***", inst->src[i].reg);
2735 break;
2736 case UNIFORM:
2737 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
2738 if (inst->src[i].reladdr) {
2739 fprintf(file, "+reladdr");
2740 } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2741 inst->src[i].subreg_offset) {
2742 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2743 inst->src[i].subreg_offset);
2744 }
2745 break;
2746 case BAD_FILE:
2747 fprintf(file, "(null)");
2748 break;
2749 case IMM:
2750 switch (inst->src[i].type) {
2751 case BRW_REGISTER_TYPE_F:
2752 fprintf(file, "%ff", inst->src[i].imm.f);
2753 break;
2754 case BRW_REGISTER_TYPE_D:
2755 fprintf(file, "%dd", inst->src[i].imm.i);
2756 break;
2757 case BRW_REGISTER_TYPE_UD:
2758 fprintf(file, "%uu", inst->src[i].imm.u);
2759 break;
2760 default:
2761 fprintf(file, "???");
2762 break;
2763 }
2764 break;
2765 case HW_REG:
2766 if (inst->src[i].fixed_hw_reg.negate)
2767 fprintf(file, "-");
2768 if (inst->src[i].fixed_hw_reg.abs)
2769 fprintf(file, "|");
2770 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2771 switch (inst->src[i].fixed_hw_reg.nr) {
2772 case BRW_ARF_NULL:
2773 fprintf(file, "null");
2774 break;
2775 case BRW_ARF_ADDRESS:
2776 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
2777 break;
2778 case BRW_ARF_ACCUMULATOR:
2779 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
2780 break;
2781 case BRW_ARF_FLAG:
2782 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2783 inst->src[i].fixed_hw_reg.subnr);
2784 break;
2785 default:
2786 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2787 inst->src[i].fixed_hw_reg.subnr);
2788 break;
2789 }
2790 } else {
2791 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2792 }
2793 if (inst->src[i].fixed_hw_reg.subnr)
2794 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
2795 if (inst->src[i].fixed_hw_reg.abs)
2796 fprintf(file, "|");
2797 break;
2798 default:
2799 fprintf(file, "???");
2800 break;
2801 }
2802 if (inst->src[i].abs)
2803 fprintf(file, "|");
2804
2805 if (inst->src[i].file != IMM) {
2806 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
2807 }
2808
2809 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
2810 fprintf(file, ", ");
2811 }
2812
2813 fprintf(file, " ");
2814
2815 if (inst->force_uncompressed)
2816 fprintf(file, "1sthalf ");
2817
2818 if (inst->force_sechalf)
2819 fprintf(file, "2ndhalf ");
2820
2821 fprintf(file, "\n");
2822 }
2823
2824 /**
2825 * Possibly returns an instruction that set up @param reg.
2826 *
2827 * Sometimes we want to take the result of some expression/variable
2828 * dereference tree and rewrite the instruction generating the result
2829 * of the tree. When processing the tree, we know that the
2830 * instructions generated are all writing temporaries that are dead
2831 * outside of this tree. So, if we have some instructions that write
2832 * a temporary, we're free to point that temp write somewhere else.
2833 *
2834 * Note that this doesn't guarantee that the instruction generated
2835 * only reg -- it might be the size=4 destination of a texture instruction.
2836 */
2837 fs_inst *
2838 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2839 fs_inst *end,
2840 const fs_reg &reg)
2841 {
2842 if (end == start ||
2843 end->is_partial_write() ||
2844 reg.reladdr ||
2845 !reg.equals(end->dst)) {
2846 return NULL;
2847 } else {
2848 return end;
2849 }
2850 }
2851
2852 void
2853 fs_visitor::setup_payload_gen6()
2854 {
2855 bool uses_depth =
2856 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2857 unsigned barycentric_interp_modes = prog_data->barycentric_interp_modes;
2858
2859 assert(brw->gen >= 6);
2860
2861 /* R0-1: masks, pixel X/Y coordinates. */
2862 payload.num_regs = 2;
2863    /* R2: only for 32-pixel dispatch. */
2864
2865 /* R3-26: barycentric interpolation coordinates. These appear in the
2866 * same order that they appear in the brw_wm_barycentric_interp_mode
2867 * enum. Each set of coordinates occupies 2 registers if dispatch width
2868 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2869 * appear if they were enabled using the "Barycentric Interpolation
2870 * Mode" bits in WM_STATE.
2871 */
2872 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2873 if (barycentric_interp_modes & (1 << i)) {
2874 payload.barycentric_coord_reg[i] = payload.num_regs;
2875 payload.num_regs += 2;
2876 if (dispatch_width == 16) {
2877 payload.num_regs += 2;
2878 }
2879 }
2880 }
2881
2882 /* R27: interpolated depth if uses source depth */
2883 if (uses_depth) {
2884 payload.source_depth_reg = payload.num_regs;
2885 payload.num_regs++;
2886 if (dispatch_width == 16) {
2887 /* R28: interpolated depth if not SIMD8. */
2888 payload.num_regs++;
2889 }
2890 }
2891 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2892 if (uses_depth) {
2893 payload.source_w_reg = payload.num_regs;
2894 payload.num_regs++;
2895 if (dispatch_width == 16) {
2896 /* R30: interpolated W if not SIMD8. */
2897 payload.num_regs++;
2898 }
2899 }
2900
2901 prog_data->uses_pos_offset = key->compute_pos_offset;
2902 /* R31: MSAA position offsets. */
2903 if (prog_data->uses_pos_offset) {
2904 payload.sample_pos_reg = payload.num_regs;
2905 payload.num_regs++;
2906 }
2907
2908 /* R32: MSAA input coverage mask */
2909 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
2910 assert(brw->gen >= 7);
2911 payload.sample_mask_in_reg = payload.num_regs;
2912 payload.num_regs++;
2913 if (dispatch_width == 16) {
2914 /* R33: input coverage mask if not SIMD8. */
2915 payload.num_regs++;
2916 }
2917 }
2918
2919 /* R34-: bary for 32-pixel. */
2920 /* R58-59: interp W for 32-pixel. */
2921
2922 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2923 source_depth_to_render_target = true;
2924 }
2925 }
2926
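/**
 * Lays out the fragment shader's binding table: render targets come first
 * (starting at surface index 0), followed by the per-stage entries handled
 * by assign_common_binding_table_offsets().
 */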
2927 void
2928 fs_visitor::assign_binding_table_offsets()
2929 {
2930 uint32_t next_binding_table_offset = 0;
2931
2932 /* If there are no color regions, we still perform an FB write to a null
2933 * renderbuffer, which we place at surface index 0.
2934 */
2935 prog_data->binding_table.render_target_start = next_binding_table_offset;
2936 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
2937
2938 assign_common_binding_table_offsets(next_binding_table_offset);
2939 }
2940
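/**
 * Computes regs_live_at_ip[]: for each instruction, the number of virtual
 * GRF registers whose live ranges cover that IP, weighted by their size.
 */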
2941 void
2942 fs_visitor::calculate_register_pressure()
2943 {
2944 invalidate_live_intervals();
2945 calculate_live_intervals();
2946
2947 int num_instructions = 0;
2948 foreach_in_list(fs_inst, inst, &instructions) {
2949 ++num_instructions;
2950 }
2951
2952 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
2953
2954 for (int reg = 0; reg < virtual_grf_count; reg++) {
2955 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
2956 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
2957 }
2958 }
2959
2960 /**
2961 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
2962 *
2963 * The needs_unlit_centroid_workaround ends up producing one of these per
2964 * channel of centroid input, so it's good to clean them up.
2965 *
2966 * An assumption here is that nothing ever modifies the dispatched pixels
2967 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
2968 * dictates that anyway.
2969 */
2970 void
2971 fs_visitor::opt_drop_redundant_mov_to_flags()
2972 {
2973 bool flag_mov_found[2] = {false};
2974
2975 foreach_in_list_safe(fs_inst, inst, &instructions) {
2976 if (inst->is_control_flow()) {
2977 memset(flag_mov_found, 0, sizeof(flag_mov_found));
2978 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
2979 if (!flag_mov_found[inst->flag_subreg])
2980 flag_mov_found[inst->flag_subreg] = true;
2981 else
2982 inst->remove();
2983 } else if (inst->writes_flag()) {
2984 flag_mov_found[inst->flag_subreg] = false;
2985 }
2986 }
2987 }
2988
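/**
 * Drives a single compile at the visitor's dispatch width: emits the FS IR
 * for the program, runs the optimization loop, and then schedules and
 * register-allocates the result.
 */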
2989 bool
2990 fs_visitor::run()
2991 {
2992 sanity_param_count = fp->Base.Parameters->NumParameters;
2993 bool allocated_without_spills;
2994
2995 assign_binding_table_offsets();
2996
2997 if (brw->gen >= 6)
2998 setup_payload_gen6();
2999 else
3000 setup_payload_gen4();
3001
3002 if (0) {
3003 emit_dummy_fs();
3004 } else {
3005 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3006 emit_shader_time_begin();
3007
3008 calculate_urb_setup();
3009 if (fp->Base.InputsRead > 0) {
3010 if (brw->gen < 6)
3011 emit_interpolation_setup_gen4();
3012 else
3013 emit_interpolation_setup_gen6();
3014 }
3015
3016 /* We handle discards by keeping track of the still-live pixels in f0.1.
3017 * Initialize it with the dispatched pixels.
3018 */
3019 if (fp->UsesKill || key->alpha_test_func) {
3020 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3021 discard_init->flag_subreg = 1;
3022 }
3023
3024 /* Generate FS IR for main(). (the visitor only descends into
3025 * functions called "main").
3026 */
3027 if (shader) {
3028 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3029 base_ir = ir;
3030 this->result = reg_undef;
3031 ir->accept(this);
3032 }
3033 } else {
3034 emit_fragment_program_code();
3035 }
3036 base_ir = NULL;
3037 if (failed)
3038 return false;
3039
3040 emit(FS_OPCODE_PLACEHOLDER_HALT);
3041
3042 if (key->alpha_test_func)
3043 emit_alpha_test();
3044
3045 emit_fb_writes();
3046
3047 split_virtual_grfs();
3048
3049 move_uniform_array_access_to_pull_constants();
3050 assign_constant_locations();
3051 demote_pull_constants();
3052
3053 opt_drop_redundant_mov_to_flags();
3054
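      /* Runs an optimization pass, accumulates whether it made progress,
       * and, when DEBUG_OPTIMIZER is set in INTEL_DEBUG, dumps the IR after
       * every pass that changed something so each pass's effect can be
       * inspected.
       */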
3055 #define OPT(pass, args...) do { \
3056 pass_num++; \
3057 bool this_progress = pass(args); \
3058 \
3059 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3060 char filename[64]; \
3061 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3062 dispatch_width, shader_prog->Name, iteration, pass_num); \
3063 \
3064 backend_visitor::dump_instructions(filename); \
3065 } \
3066 \
3067 progress = progress || this_progress; \
3068 } while (false)
3069
3070 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3071 char filename[64];
3072 snprintf(filename, 64, "fs%d-%04d-00-start",
3073 dispatch_width, shader_prog->Name);
3074
3075 backend_visitor::dump_instructions(filename);
3076 }
3077
3078 bool progress;
3079 int iteration = 0;
3080 do {
3081 progress = false;
3082 iteration++;
3083 int pass_num = 0;
3084
3085 compact_virtual_grfs();
3086
3087 OPT(remove_duplicate_mrf_writes);
3088
3089 OPT(opt_algebraic);
3090 OPT(opt_cse);
3091 OPT(opt_copy_propagate);
3092 OPT(opt_peephole_predicated_break);
3093 OPT(dead_code_eliminate);
3094 OPT(opt_peephole_sel);
3095 OPT(dead_control_flow_eliminate, this);
3096 OPT(opt_saturate_propagation);
3097 OPT(register_coalesce);
3098 OPT(compute_to_mrf);
3099 } while (progress);
3100
3101 if (lower_load_payload()) {
3102 register_coalesce();
3103 dead_code_eliminate();
3104 }
3105
3106 lower_uniform_pull_constant_loads();
3107
3108 assign_curb_setup();
3109 assign_urb_setup();
3110
3111 static enum instruction_scheduler_mode pre_modes[] = {
3112 SCHEDULE_PRE,
3113 SCHEDULE_PRE_NON_LIFO,
3114 SCHEDULE_PRE_LIFO,
3115 };
3116
3117 /* Try each scheduling heuristic to see if it can successfully register
3118 * allocate without spilling. They should be ordered by decreasing
3119 * performance but increasing likelihood of allocating.
3120 */
3121 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3122 schedule_instructions(pre_modes[i]);
3123
3124 if (0) {
3125 assign_regs_trivial();
3126 allocated_without_spills = true;
3127 } else {
3128 allocated_without_spills = assign_regs(false);
3129 }
3130 if (allocated_without_spills)
3131 break;
3132 }
3133
3134 if (!allocated_without_spills) {
3135 /* We assume that any spilling is worse than just dropping back to
3136 * SIMD8. There's probably actually some intermediate point where
3137 * SIMD16 with a couple of spills is still better.
3138 */
3139 if (dispatch_width == 16) {
3140 fail("Failure to register allocate. Reduce number of "
3141 "live scalar values to avoid this.");
3142 } else {
3143 perf_debug("Fragment shader triggered register spilling. "
3144 "Try reducing the number of live scalar values to "
3145 "improve performance.\n");
3146 }
3147
3148 /* Since we're out of heuristics, just go spill registers until we
3149 * get an allocation.
3150 */
3151 while (!assign_regs(true)) {
3152 if (failed)
3153 break;
3154 }
3155 }
3156 }
3157 assert(force_uncompressed_stack == 0);
3158
3159 /* This must come after all optimization and register allocation, since
3160 * it inserts dead code that happens to have side effects, and it does
3161 * so based on the actual physical registers in use.
3162 */
3163 insert_gen4_send_dependency_workarounds();
3164
3165 if (failed)
3166 return false;
3167
3168 if (!allocated_without_spills)
3169 schedule_instructions(SCHEDULE_POST);
3170
3171 if (last_scratch > 0) {
3172 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3173 }
3174
3175 if (dispatch_width == 8)
3176 prog_data->reg_blocks = brw_register_blocks(grf_used);
3177 else
3178 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3179
3180 /* If any state parameters were appended, then ParameterValues could have
3181 * been realloced, in which case the driver uniform storage set up by
3182 * _mesa_associate_uniform_storage() would point to freed memory. Make
3183 * sure that didn't happen.
3184 */
3185 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3186
3187 return !failed;
3188 }
3189
3190 const unsigned *
3191 brw_wm_fs_emit(struct brw_context *brw,
3192 void *mem_ctx,
3193 const struct brw_wm_prog_key *key,
3194 struct brw_wm_prog_data *prog_data,
3195 struct gl_fragment_program *fp,
3196 struct gl_shader_program *prog,
3197 unsigned *final_assembly_size)
3198 {
3199 bool start_busy = false;
3200 double start_time = 0;
3201
3202 if (unlikely(brw->perf_debug)) {
3203 start_busy = (brw->batch.last_bo &&
3204 drm_intel_bo_busy(brw->batch.last_bo));
3205 start_time = get_time();
3206 }
3207
3208 struct brw_shader *shader = NULL;
3209 if (prog)
3210 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3211
3212 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3213 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3214
3215 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3216 */
3217 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3218 if (!v.run()) {
3219 if (prog) {
3220 prog->LinkStatus = false;
3221 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3222 }
3223
3224 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3225 v.fail_msg);
3226
3227 return NULL;
3228 }
3229
3230 exec_list *simd16_instructions = NULL;
3231 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3232 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3233 if (!v.simd16_unsupported) {
3234 /* Try a SIMD16 compile */
3235 v2.import_uniforms(&v);
3236 if (!v2.run()) {
3237 perf_debug("SIMD16 shader failed to compile, falling back to "
3238 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3239 } else {
3240 simd16_instructions = &v2.instructions;
3241 }
3242 } else {
3243 perf_debug("SIMD16 shader unsupported, falling back to "
3244 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3245 }
3246 }
3247
3248 const unsigned *assembly = NULL;
3249 if (brw->gen >= 8) {
3250 gen8_fs_generator g(brw, mem_ctx, key, prog_data, prog, fp, v.do_dual_src);
3251 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3252 final_assembly_size);
3253 } else {
3254 fs_generator g(brw, mem_ctx, key, prog_data, prog, fp, v.do_dual_src,
3255 v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3256 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3257 final_assembly_size);
3258 }
3259
3260 if (unlikely(brw->perf_debug) && shader) {
3261 if (shader->compiled_once)
3262 brw_wm_debug_recompile(brw, prog, key);
3263 shader->compiled_once = true;
3264
3265 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3266 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3267 (get_time() - start_time) * 1000);
3268 }
3269 }
3270
3271 return assembly;
3272 }
3273
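/**
 * Compiles the fragment shader ahead of time with a best-guess program key
 * (the guesses are noted inline below), so that a usable variant is likely
 * already in the program cache before the first draw.  The previous
 * prog_offset/prog_data are saved and restored around the compile.
 */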
3274 bool
3275 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3276 {
3277 struct brw_context *brw = brw_context(ctx);
3278 struct brw_wm_prog_key key;
3279
3280 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3281 return true;
3282
3283 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3284 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3285 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3286 bool program_uses_dfdy = fp->UsesDFdy;
3287
3288 memset(&key, 0, sizeof(key));
3289
3290 if (brw->gen < 6) {
3291 if (fp->UsesKill)
3292 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3293
3294 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3295 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3296
3297 /* Just assume depth testing. */
3298 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3299 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3300 }
3301
3302 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3303 BRW_FS_VARYING_INPUT_MASK) > 16)
3304 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3305
3306 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3307 for (unsigned i = 0; i < sampler_count; i++) {
3308 if (fp->Base.ShadowSamplers & (1 << i)) {
3309 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3310 key.tex.swizzles[i] =
3311 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3312 } else {
3313 /* Color sampler: assume no swizzling. */
3314 key.tex.swizzles[i] = SWIZZLE_XYZW;
3315 }
3316 }
3317
3318 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3319 key.drawable_height = ctx->DrawBuffer->Height;
3320 }
3321
3322 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3323 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3324 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3325
3326 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3327 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3328 key.nr_color_regions > 1;
3329 }
3330
3331 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3332 * quality of the derivatives is likely to be determined by the driconf
3333 * option.
3334 */
3335 key.high_quality_derivatives = brw->disable_derivative_optimization;
3336
3337 key.program_string_id = bfp->id;
3338
3339 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3340 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3341
3342 bool success = do_wm_prog(brw, prog, bfp, &key);
3343
3344 brw->wm.base.prog_offset = old_prog_offset;
3345 brw->wm.prog_data = old_prog_data;
3346
3347 return success;
3348 }