i965/fs: Add SHADER_OPCODE_LOAD_PAYLOAD.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init(enum opcode opcode, const fs_reg &dst, fs_reg *src, int sources)
56 {
57 memset(this, 0, sizeof(*this));
58
59 this->opcode = opcode;
60 this->dst = dst;
61 this->src = src;
62 this->sources = sources;
63
64 this->conditional_mod = BRW_CONDITIONAL_NONE;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68
69 this->writes_accumulator = false;
70 }
71
72 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
73 {
74 fs_reg *src = ralloc_array(this, fs_reg, 3);
75 init(opcode, dst, src, 0);
76 }
77
78 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
79 {
80 fs_reg *src = ralloc_array(this, fs_reg, 3);
81 src[0] = src0;
82 init(opcode, dst, src, 1);
83 }
84
85 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
86 const fs_reg &src1)
87 {
88 fs_reg *src = ralloc_array(this, fs_reg, 3);
89 src[0] = src0;
90 src[1] = src1;
91 init(opcode, dst, src, 2);
92 }
93
94 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
95 const fs_reg &src1, const fs_reg &src2)
96 {
97 fs_reg *src = ralloc_array(this, fs_reg, 3);
98 src[0] = src0;
99 src[1] = src1;
100 src[2] = src2;
101 init(opcode, dst, src, 3);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
105 {
106 init(opcode, dst, src, sources);
107 }
108
109 fs_inst::fs_inst(const fs_inst &that)
110 {
111 memcpy(this, &that, sizeof(that));
112
113 this->src = ralloc_array(this, fs_reg, that.sources);
114
115 for (int i = 0; i < that.sources; i++)
116 this->src[i] = that.src[i];
117 }
118
119 void
120 fs_inst::resize_sources(uint8_t num_sources)
121 {
122 if (this->sources != num_sources) {
123 this->src = reralloc(this, this->src, fs_reg, num_sources);
124 this->sources = num_sources;
125 }
126 }
127
128 #define ALU1(op) \
129 fs_inst * \
130 fs_visitor::op(fs_reg dst, fs_reg src0) \
131 { \
132 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
133 }
134
135 #define ALU2(op) \
136 fs_inst * \
137 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
138 { \
139 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
140 }
141
142 #define ALU2_ACC(op) \
143 fs_inst * \
144 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
145 { \
146 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
147 inst->writes_accumulator = true; \
148 return inst; \
149 }
150
151 #define ALU3(op) \
152 fs_inst * \
153 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
154 { \
155 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
156 }
157
158 ALU1(NOT)
159 ALU1(MOV)
160 ALU1(FRC)
161 ALU1(RNDD)
162 ALU1(RNDE)
163 ALU1(RNDZ)
164 ALU2(ADD)
165 ALU2(MUL)
166 ALU2_ACC(MACH)
167 ALU2(AND)
168 ALU2(OR)
169 ALU2(XOR)
170 ALU2(SHL)
171 ALU2(SHR)
172 ALU2(ASR)
173 ALU3(LRP)
174 ALU1(BFREV)
175 ALU3(BFE)
176 ALU2(BFI1)
177 ALU3(BFI2)
178 ALU1(FBH)
179 ALU1(FBL)
180 ALU1(CBIT)
181 ALU3(MAD)
182 ALU2_ACC(ADDC)
183 ALU2_ACC(SUBB)
184 ALU2(SEL)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 fs_inst *
189 fs_visitor::IF(uint32_t predicate)
190 {
191 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
192 inst->predicate = predicate;
193 return inst;
194 }
195
196 /** Gen6 IF with embedded comparison. */
197 fs_inst *
198 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
199 {
200 assert(brw->gen == 6);
201 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
202 reg_null_d, src0, src1);
203 inst->conditional_mod = condition;
204 return inst;
205 }
206
207 /**
208 * CMP: Sets the low bit of the destination channels with the result
209 * of the comparison, while the upper bits are undefined, and updates
210 * the flag register with the packed 16 bits of the result.
211 */
212 fs_inst *
213 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
214 {
215 fs_inst *inst;
216
217 /* Take the instruction:
218 *
219 * CMP null<d> src0<f> src1<f>
220 *
221 * Original gen4 does type conversion to the destination type before
222 * comparison, producing garbage results for floating point comparisons.
223 * gen5 does the comparison on the execution type (resolved source types),
224 * so dst type doesn't matter. gen6 does comparison and then uses the
225 * result as if it was the dst type with no conversion, which happens to
226 * mostly work out for float-interpreted-as-int since our comparisons are
227 * for >0, =0, <0.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
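/* Editor's sketch (not part of the original file): the usual pattern is to
 * CMP into the null register to set the flag, then predicate an IF on it:
 *
 *    emit(CMP(reg_null_d, src0, src1, BRW_CONDITIONAL_L));
 *    emit(IF(BRW_PREDICATE_NORMAL));
 *    ...
 *    emit(BRW_OPCODE_ENDIF);
 *
 * On Gen6 the IF(src0, src1, condition) overload above embeds the comparison
 * in the IF itself, so the separate CMP can be skipped.
 */
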
244 fs_inst *
245 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
246 {
247 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst, src,
248 sources);
249 inst->regs_written = sources;
250
251 return inst;
252 }
253
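/* Editor's sketch (illustrative, not in the original source): LOAD_PAYLOAD
 * collects several registers into one contiguous virtual GRF, e.g. to build
 * a message payload out of two hypothetical registers:
 *
 *    fs_reg components[2];
 *    components[0] = header_reg;   // hypothetical
 *    components[1] = data_reg;     // hypothetical
 *    fs_reg payload(GRF, virtual_grf_alloc(2));
 *    emit(LOAD_PAYLOAD(payload, components, 2));
 *
 * Setting regs_written to the number of sources makes later passes treat the
 * whole destination range as written by this one instruction.
 */
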
254 exec_list
255 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
256 const fs_reg &surf_index,
257 const fs_reg &varying_offset,
258 uint32_t const_offset)
259 {
260 exec_list instructions;
261 fs_inst *inst;
262
263 /* We have our constant surface use a pitch of 4 bytes, so our index can
264 * be any component of a vector, and then we load 4 contiguous
265 * components starting from that.
266 *
267 * We break down the const_offset to a portion added to the variable
268 * offset and a portion done using reg_offset, which means that if you
269 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
270 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
271 * CSE can later notice that those loads are all the same and eliminate
272 * the redundant ones.
273 */
274 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
275 instructions.push_tail(ADD(vec4_offset,
276 varying_offset, const_offset & ~3));
277
278 int scale = 1;
279 if (brw->gen == 4 && dispatch_width == 8) {
280 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
281 * u, v, r) as parameters, or we can just use the SIMD16 message
282 * consisting of (header, u). We choose the second, at the cost of a
283 * longer return length.
284 */
285 scale = 2;
286 }
287
288 enum opcode op;
289 if (brw->gen >= 7)
290 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
291 else
292 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
293 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
294 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
295 inst->regs_written = 4 * scale;
296 instructions.push_tail(inst);
297
298 if (brw->gen < 7) {
299 inst->base_mrf = 13;
300 inst->header_present = true;
301 if (brw->gen == 4)
302 inst->mlen = 3;
303 else
304 inst->mlen = 1 + dispatch_width / 8;
305 }
306
307 vec4_result.reg_offset += (const_offset & 3) * scale;
308 instructions.push_tail(MOV(dst, vec4_result));
309
310 return instructions;
311 }
312
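/* Editor's worked example: for "uniform vec4 a[20]" and an access a[i].z
 * (assuming a[] starts at pull-constant location 0), const_offset is 2, so
 * the ADD above computes vec4_offset = varying_offset + (2 & ~3) =
 * varying_offset, the send pulls the whole vec4 into vec4_result, and the
 * final MOV reads component (2 & 3) == 2 via reg_offset.  Accesses to
 * a[i].x and a[i].z therefore issue identical loads, which CSE can merge as
 * the comment at the top of the function describes.
 */
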
313 /**
314 * A helper for generating MOVs that work around broken hardware SEND
315 * dependency handling.
316 */
317 fs_inst *
318 fs_visitor::DEP_RESOLVE_MOV(int grf)
319 {
320 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
321
322 inst->ir = NULL;
323 inst->annotation = "send dependency resolve";
324
325 /* The caller always wants an uncompressed instruction, to emit the minimal
326 * extra dependencies and to avoid having to deal with aligning its regs to 2.
327 */
328 inst->force_uncompressed = true;
329
330 return inst;
331 }
332
333 bool
334 fs_inst::equals(fs_inst *inst) const
335 {
336 return (opcode == inst->opcode &&
337 dst.equals(inst->dst) &&
338 src[0].equals(inst->src[0]) &&
339 src[1].equals(inst->src[1]) &&
340 src[2].equals(inst->src[2]) &&
341 saturate == inst->saturate &&
342 predicate == inst->predicate &&
343 conditional_mod == inst->conditional_mod &&
344 mlen == inst->mlen &&
345 base_mrf == inst->base_mrf &&
346 sampler == inst->sampler &&
347 target == inst->target &&
348 eot == inst->eot &&
349 header_present == inst->header_present &&
350 shadow_compare == inst->shadow_compare &&
351 offset == inst->offset);
352 }
353
354 bool
355 fs_inst::overwrites_reg(const fs_reg &reg) const
356 {
357 return (reg.file == dst.file &&
358 reg.reg == dst.reg &&
359 reg.reg_offset >= dst.reg_offset &&
360 reg.reg_offset < dst.reg_offset + regs_written);
361 }
362
363 bool
364 fs_inst::is_send_from_grf() const
365 {
366 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
367 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
368 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
369 src[1].file == GRF) ||
370 (is_tex() && src[0].file == GRF));
371 }
372
373 bool
374 fs_visitor::can_do_source_mods(fs_inst *inst)
375 {
376 if (brw->gen == 6 && inst->is_math())
377 return false;
378
379 if (inst->is_send_from_grf())
380 return false;
381
382 if (!inst->can_do_source_mods())
383 return false;
384
385 return true;
386 }
387
388 void
389 fs_reg::init()
390 {
391 memset(this, 0, sizeof(*this));
392 stride = 1;
393 }
394
395 /** Generic unset register constructor. */
396 fs_reg::fs_reg()
397 {
398 init();
399 this->file = BAD_FILE;
400 }
401
402 /** Immediate value constructor. */
403 fs_reg::fs_reg(float f)
404 {
405 init();
406 this->file = IMM;
407 this->type = BRW_REGISTER_TYPE_F;
408 this->imm.f = f;
409 }
410
411 /** Immediate value constructor. */
412 fs_reg::fs_reg(int32_t i)
413 {
414 init();
415 this->file = IMM;
416 this->type = BRW_REGISTER_TYPE_D;
417 this->imm.i = i;
418 }
419
420 /** Immediate value constructor. */
421 fs_reg::fs_reg(uint32_t u)
422 {
423 init();
424 this->file = IMM;
425 this->type = BRW_REGISTER_TYPE_UD;
426 this->imm.u = u;
427 }
428
429 /** Fixed brw_reg. */
430 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
431 {
432 init();
433 this->file = HW_REG;
434 this->fixed_hw_reg = fixed_hw_reg;
435 this->type = fixed_hw_reg.type;
436 }
437
438 bool
439 fs_reg::equals(const fs_reg &r) const
440 {
441 return (file == r.file &&
442 reg == r.reg &&
443 reg_offset == r.reg_offset &&
444 subreg_offset == r.subreg_offset &&
445 type == r.type &&
446 negate == r.negate &&
447 abs == r.abs &&
448 !reladdr && !r.reladdr &&
449 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
450 sizeof(fixed_hw_reg)) == 0 &&
451 stride == r.stride &&
452 imm.u == r.imm.u);
453 }
454
455 fs_reg &
456 fs_reg::apply_stride(unsigned stride)
457 {
458 assert((this->stride * stride) <= 4 &&
459 (is_power_of_two(stride) || stride == 0) &&
460 file != HW_REG && file != IMM);
461 this->stride *= stride;
462 return *this;
463 }
464
465 fs_reg &
466 fs_reg::set_smear(unsigned subreg)
467 {
468 assert(file != HW_REG && file != IMM);
469 subreg_offset = subreg * type_sz(type);
470 stride = 0;
471 return *this;
472 }
473
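/* Editor's note (sketch): set_smear() turns a register into a per-channel
 * broadcast of one component: stride becomes 0 and subreg_offset selects the
 * component, e.g.
 *
 *    fs_reg ts = get_timestamp();
 *    ts.set_smear(2);   // every channel now reads dword 2 of the timestamp
 *
 * which is how emit_shader_time_end() below checks the reset field.
 * apply_stride() instead scales the existing stride, asserting that the
 * result is still a valid region description.
 */
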
474 bool
475 fs_reg::is_contiguous() const
476 {
477 return stride == 1;
478 }
479
480 bool
481 fs_reg::is_zero() const
482 {
483 if (file != IMM)
484 return false;
485
486 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
487 }
488
489 bool
490 fs_reg::is_one() const
491 {
492 if (file != IMM)
493 return false;
494
495 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
496 }
497
498 bool
499 fs_reg::is_null() const
500 {
501 return file == HW_REG &&
502 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
503 fixed_hw_reg.nr == BRW_ARF_NULL;
504 }
505
506 bool
507 fs_reg::is_valid_3src() const
508 {
509 return file == GRF || file == UNIFORM;
510 }
511
512 bool
513 fs_reg::is_accumulator() const
514 {
515 return file == HW_REG &&
516 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
517 fixed_hw_reg.nr == BRW_ARF_ACCUMULATOR;
518 }
519
520 int
521 fs_visitor::type_size(const struct glsl_type *type)
522 {
523 unsigned int size, i;
524
525 switch (type->base_type) {
526 case GLSL_TYPE_UINT:
527 case GLSL_TYPE_INT:
528 case GLSL_TYPE_FLOAT:
529 case GLSL_TYPE_BOOL:
530 return type->components();
531 case GLSL_TYPE_ARRAY:
532 return type_size(type->fields.array) * type->length;
533 case GLSL_TYPE_STRUCT:
534 size = 0;
535 for (i = 0; i < type->length; i++) {
536 size += type_size(type->fields.structure[i].type);
537 }
538 return size;
539 case GLSL_TYPE_SAMPLER:
540 /* Samplers take up no register space, since they're baked in at
541 * link time.
542 */
543 return 0;
544 case GLSL_TYPE_ATOMIC_UINT:
545 return 0;
546 case GLSL_TYPE_IMAGE:
547 case GLSL_TYPE_VOID:
548 case GLSL_TYPE_ERROR:
549 case GLSL_TYPE_INTERFACE:
550 assert(!"not reached");
551 break;
552 }
553
554 return 0;
555 }
556
557 fs_reg
558 fs_visitor::get_timestamp()
559 {
560 assert(brw->gen >= 7);
561
562 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
563 BRW_ARF_TIMESTAMP,
564 0),
565 BRW_REGISTER_TYPE_UD));
566
567 fs_reg dst = fs_reg(this, glsl_type::uint_type);
568
569 fs_inst *mov = emit(MOV(dst, ts));
570 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
571 * even if it's not enabled in the dispatch.
572 */
573 mov->force_writemask_all = true;
574 mov->force_uncompressed = true;
575
576 /* The caller wants the low 32 bits of the timestamp. Since it's running
577 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
578 * which is plenty of time for our purposes. It is identical across the
579 * EUs, but since it's tracking GPU core speed it will increment at a
580 * varying rate as render P-states change.
581 *
582 * The caller could also check if render P-states have changed (or anything
583 * else that might disrupt timing) by setting smear to 2 and checking if
584 * that field is != 0.
585 */
586 dst.set_smear(0);
587
588 return dst;
589 }
590
591 void
592 fs_visitor::emit_shader_time_begin()
593 {
594 current_annotation = "shader time start";
595 shader_start_time = get_timestamp();
596 }
597
598 void
599 fs_visitor::emit_shader_time_end()
600 {
601 current_annotation = "shader time end";
602
603 enum shader_time_shader_type type, written_type, reset_type;
604 if (dispatch_width == 8) {
605 type = ST_FS8;
606 written_type = ST_FS8_WRITTEN;
607 reset_type = ST_FS8_RESET;
608 } else {
609 assert(dispatch_width == 16);
610 type = ST_FS16;
611 written_type = ST_FS16_WRITTEN;
612 reset_type = ST_FS16_RESET;
613 }
614
615 fs_reg shader_end_time = get_timestamp();
616
617 /* Check that there weren't any timestamp reset events (assuming these
618 * were the only two timestamp reads that happened).
619 */
620 fs_reg reset = shader_end_time;
621 reset.set_smear(2);
622 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
623 test->conditional_mod = BRW_CONDITIONAL_Z;
624 emit(IF(BRW_PREDICATE_NORMAL));
625
626 push_force_uncompressed();
627 fs_reg start = shader_start_time;
628 start.negate = true;
629 fs_reg diff = fs_reg(this, glsl_type::uint_type);
630 emit(ADD(diff, start, shader_end_time));
631
632 /* If there were no instructions between the two timestamp gets, the diff
633 * is 2 cycles. Remove that overhead, so I can forget about that when
634 * trying to determine the time taken for single instructions.
635 */
636 emit(ADD(diff, diff, fs_reg(-2u)));
637
638 emit_shader_time_write(type, diff);
639 emit_shader_time_write(written_type, fs_reg(1u));
640 emit(BRW_OPCODE_ELSE);
641 emit_shader_time_write(reset_type, fs_reg(1u));
642 emit(BRW_OPCODE_ENDIF);
643
644 pop_force_uncompressed();
645 }
646
647 void
648 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
649 fs_reg value)
650 {
651 int shader_time_index =
652 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
653 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
654
655 fs_reg payload;
656 if (dispatch_width == 8)
657 payload = fs_reg(this, glsl_type::uvec2_type);
658 else
659 payload = fs_reg(this, glsl_type::uint_type);
660
661 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
662 fs_reg(), payload, offset, value));
663 }
664
665 void
666 fs_visitor::vfail(const char *format, va_list va)
667 {
668 char *msg;
669
670 if (failed)
671 return;
672
673 failed = true;
674
675 msg = ralloc_vasprintf(mem_ctx, format, va);
676 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
677
678 this->fail_msg = msg;
679
680 if (INTEL_DEBUG & DEBUG_WM) {
681 fprintf(stderr, "%s", msg);
682 }
683 }
684
685 void
686 fs_visitor::fail(const char *format, ...)
687 {
688 va_list va;
689
690 va_start(va, format);
691 vfail(format, va);
692 va_end(va);
693 }
694
695 /**
696 * Mark this program as impossible to compile in SIMD16 mode.
697 *
698 * During the SIMD8 compile (which happens first), we can detect and flag
699 * things that are unsupported in SIMD16 mode, so the compiler can skip
700 * the SIMD16 compile altogether.
701 *
702 * During a SIMD16 compile (if one happens anyway), this just calls fail().
703 */
704 void
705 fs_visitor::no16(const char *format, ...)
706 {
707 va_list va;
708
709 va_start(va, format);
710
711 if (dispatch_width == 16) {
712 vfail(format, va);
713 } else {
714 simd16_unsupported = true;
715
716 if (brw->perf_debug) {
717 if (no16_msg)
718 ralloc_vasprintf_append(&no16_msg, format, va);
719 else
720 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
721 }
722 }
723
724 va_end(va);
725 }
726
727 fs_inst *
728 fs_visitor::emit(enum opcode opcode)
729 {
730 return emit(new(mem_ctx) fs_inst(opcode));
731 }
732
733 fs_inst *
734 fs_visitor::emit(enum opcode opcode, fs_reg dst)
735 {
736 return emit(new(mem_ctx) fs_inst(opcode, dst));
737 }
738
739 fs_inst *
740 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
741 {
742 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
743 }
744
745 fs_inst *
746 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
747 {
748 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
749 }
750
751 fs_inst *
752 fs_visitor::emit(enum opcode opcode, fs_reg dst,
753 fs_reg src0, fs_reg src1, fs_reg src2)
754 {
755 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
756 }
757
758 fs_inst *
759 fs_visitor::emit(enum opcode opcode, fs_reg dst,
760 fs_reg src[], int sources)
761 {
762 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
763 }
764
765 void
766 fs_visitor::push_force_uncompressed()
767 {
768 force_uncompressed_stack++;
769 }
770
771 void
772 fs_visitor::pop_force_uncompressed()
773 {
774 force_uncompressed_stack--;
775 assert(force_uncompressed_stack >= 0);
776 }
777
778 /**
779 * Returns true if the instruction has a flag that means it won't
780 * update an entire destination register.
781 *
782 * For example, dead code elimination and live variable analysis want to know
783 * when a write to a variable screens off any preceding values that were in
784 * it.
785 */
786 bool
787 fs_inst::is_partial_write() const
788 {
789 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
790 this->force_uncompressed ||
791 this->force_sechalf || !this->dst.is_contiguous());
792 }
793
794 int
795 fs_inst::regs_read(fs_visitor *v, int arg) const
796 {
797 if (is_tex() && arg == 0 && src[0].file == GRF) {
798 if (v->dispatch_width == 16)
799 return (mlen + 1) / 2;
800 else
801 return mlen;
802 }
803 return 1;
804 }
805
806 bool
807 fs_inst::reads_flag() const
808 {
809 return predicate;
810 }
811
812 bool
813 fs_inst::writes_flag() const
814 {
815 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
816 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
817 }
818
819 /**
820 * Returns how many MRFs an FS opcode will write over.
821 *
822 * Note that this is not the 0 or 1 implied writes in an actual gen
823 * instruction -- the FS opcodes often generate MOVs in addition.
824 */
825 int
826 fs_visitor::implied_mrf_writes(fs_inst *inst)
827 {
828 if (inst->mlen == 0)
829 return 0;
830
831 if (inst->base_mrf == -1)
832 return 0;
833
834 switch (inst->opcode) {
835 case SHADER_OPCODE_RCP:
836 case SHADER_OPCODE_RSQ:
837 case SHADER_OPCODE_SQRT:
838 case SHADER_OPCODE_EXP2:
839 case SHADER_OPCODE_LOG2:
840 case SHADER_OPCODE_SIN:
841 case SHADER_OPCODE_COS:
842 return 1 * dispatch_width / 8;
843 case SHADER_OPCODE_POW:
844 case SHADER_OPCODE_INT_QUOTIENT:
845 case SHADER_OPCODE_INT_REMAINDER:
846 return 2 * dispatch_width / 8;
847 case SHADER_OPCODE_TEX:
848 case FS_OPCODE_TXB:
849 case SHADER_OPCODE_TXD:
850 case SHADER_OPCODE_TXF:
851 case SHADER_OPCODE_TXF_CMS:
852 case SHADER_OPCODE_TXF_MCS:
853 case SHADER_OPCODE_TG4:
854 case SHADER_OPCODE_TG4_OFFSET:
855 case SHADER_OPCODE_TXL:
856 case SHADER_OPCODE_TXS:
857 case SHADER_OPCODE_LOD:
858 return 1;
859 case FS_OPCODE_FB_WRITE:
860 return 2;
861 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
862 case SHADER_OPCODE_GEN4_SCRATCH_READ:
863 return 1;
864 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
865 return inst->mlen;
866 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
867 return 2;
868 case SHADER_OPCODE_UNTYPED_ATOMIC:
869 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
870 return 0;
871 default:
872 assert(!"not reached");
873 return inst->mlen;
874 }
875 }
876
877 int
878 fs_visitor::virtual_grf_alloc(int size)
879 {
880 if (virtual_grf_array_size <= virtual_grf_count) {
881 if (virtual_grf_array_size == 0)
882 virtual_grf_array_size = 16;
883 else
884 virtual_grf_array_size *= 2;
885 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
886 virtual_grf_array_size);
887 }
888 virtual_grf_sizes[virtual_grf_count] = size;
889 return virtual_grf_count++;
890 }
891
892 /** Register file/index constructor (defaults to float type). */
893 fs_reg::fs_reg(enum register_file file, int reg)
894 {
895 init();
896 this->file = file;
897 this->reg = reg;
898 this->type = BRW_REGISTER_TYPE_F;
899 }
900
901 /** Register file/index constructor with an explicit register type. */
902 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
903 {
904 init();
905 this->file = file;
906 this->reg = reg;
907 this->type = type;
908 }
909
910 /** Automatic reg constructor. */
911 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
912 {
913 init();
914
915 this->file = GRF;
916 this->reg = v->virtual_grf_alloc(v->type_size(type));
917 this->reg_offset = 0;
918 this->type = brw_type_for_base_type(type);
919 }
920
921 fs_reg *
922 fs_visitor::variable_storage(ir_variable *var)
923 {
924 return (fs_reg *)hash_table_find(this->variable_ht, var);
925 }
926
927 void
928 import_uniforms_callback(const void *key,
929 void *data,
930 void *closure)
931 {
932 struct hash_table *dst_ht = (struct hash_table *)closure;
933 const fs_reg *reg = (const fs_reg *)data;
934
935 if (reg->file != UNIFORM)
936 return;
937
938 hash_table_insert(dst_ht, data, key);
939 }
940
941 /* For SIMD16, we need to follow the uniform setup from the SIMD8 dispatch.
942 * This brings in those uniform definitions.
943 */
944 void
945 fs_visitor::import_uniforms(fs_visitor *v)
946 {
947 hash_table_call_foreach(v->variable_ht,
948 import_uniforms_callback,
949 variable_ht);
950 this->push_constant_loc = v->push_constant_loc;
951 this->pull_constant_loc = v->pull_constant_loc;
952 this->uniforms = v->uniforms;
953 this->param_size = v->param_size;
954 }
955
956 /* Our support for uniforms is piggy-backed on the struct
957 * gl_fragment_program, because that's where the values actually
958 * get stored, rather than in some global gl_shader_program uniform
959 * store.
960 */
961 void
962 fs_visitor::setup_uniform_values(ir_variable *ir)
963 {
964 int namelen = strlen(ir->name);
965
966 /* The data for our (non-builtin) uniforms is stored in a series of
967 * gl_uniform_driver_storage structs for each subcomponent that
968 * glGetUniformLocation() could name. We know it's been set up in the same
969 * order we'd walk the type, so walk the list of storage and find anything
970 * with our name, or the prefix of a component that starts with our name.
971 */
972 unsigned params_before = uniforms;
973 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
974 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
975
976 if (strncmp(ir->name, storage->name, namelen) != 0 ||
977 (storage->name[namelen] != 0 &&
978 storage->name[namelen] != '.' &&
979 storage->name[namelen] != '[')) {
980 continue;
981 }
982
983 unsigned slots = storage->type->component_slots();
984 if (storage->array_elements)
985 slots *= storage->array_elements;
986
987 for (unsigned i = 0; i < slots; i++) {
988 stage_prog_data->param[uniforms++] = &storage->storage[i].f;
989 }
990 }
991
992 /* Make sure we actually initialized the right amount of stuff here. */
993 assert(params_before + ir->type->component_slots() == uniforms);
994 (void)params_before;
995 }
996
997
998 /* Our support for builtin uniforms is even scarier than non-builtin.
999 * It sits on top of the PROG_STATE_VAR parameters that are
1000 * automatically updated from GL context state.
1001 */
1002 void
1003 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1004 {
1005 const ir_state_slot *const slots = ir->state_slots;
1006 assert(ir->state_slots != NULL);
1007
1008 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
1009 /* This state reference has already been setup by ir_to_mesa, but we'll
1010 * get the same index back here.
1011 */
1012 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
1013 (gl_state_index *)slots[i].tokens);
1014
1015 /* Add each of the unique swizzles of the element as a parameter.
1016 * This'll end up matching the expected layout of the
1017 * array/matrix/structure we're trying to fill in.
1018 */
1019 int last_swiz = -1;
1020 for (unsigned int j = 0; j < 4; j++) {
1021 int swiz = GET_SWZ(slots[i].swizzle, j);
1022 if (swiz == last_swiz)
1023 break;
1024 last_swiz = swiz;
1025
1026 stage_prog_data->param[uniforms++] =
1027 &fp->Base.Parameters->ParameterValues[index][swiz].f;
1028 }
1029 }
1030 }
1031
1032 fs_reg *
1033 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1034 {
1035 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1036 fs_reg wpos = *reg;
1037 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1038
1039 /* gl_FragCoord.x */
1040 if (ir->data.pixel_center_integer) {
1041 emit(MOV(wpos, this->pixel_x));
1042 } else {
1043 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1044 }
1045 wpos.reg_offset++;
1046
1047 /* gl_FragCoord.y */
1048 if (!flip && ir->data.pixel_center_integer) {
1049 emit(MOV(wpos, this->pixel_y));
1050 } else {
1051 fs_reg pixel_y = this->pixel_y;
1052 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1053
1054 if (flip) {
1055 pixel_y.negate = true;
1056 offset += key->drawable_height - 1.0;
1057 }
1058
1059 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1060 }
1061 wpos.reg_offset++;
1062
1063 /* gl_FragCoord.z */
1064 if (brw->gen >= 6) {
1065 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1066 } else {
1067 emit(FS_OPCODE_LINTERP, wpos,
1068 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1069 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1070 interp_reg(VARYING_SLOT_POS, 2));
1071 }
1072 wpos.reg_offset++;
1073
1074 /* gl_FragCoord.w: Already set up in emit_interpolation */
1075 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1076
1077 return reg;
1078 }
1079
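/* Editor's worked example: with flip set and half-integer pixel centers, the
 * ADD above computes
 *
 *    gl_FragCoord.y = -pixel_y + (drawable_height - 1.0 + 0.5)
 *
 * so for a 480-pixel-high drawable a fragment with pixel_y == 0 lands at
 * y == 479.5, giving the lower-left origin GL expects when rendering to a
 * window-system framebuffer.
 */
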
1080 fs_inst *
1081 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1082 glsl_interp_qualifier interpolation_mode,
1083 bool is_centroid, bool is_sample)
1084 {
1085 brw_wm_barycentric_interp_mode barycoord_mode;
1086 if (brw->gen >= 6) {
1087 if (is_centroid) {
1088 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1089 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1090 else
1091 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1092 } else if (is_sample) {
1093 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1094 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1095 else
1096 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1097 } else {
1098 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1099 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1100 else
1101 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1102 }
1103 } else {
1104 /* On Ironlake and below, there is only one interpolation mode.
1105 * Centroid interpolation doesn't mean anything on this hardware --
1106 * there is no multisampling.
1107 */
1108 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1109 }
1110 return emit(FS_OPCODE_LINTERP, attr,
1111 this->delta_x[barycoord_mode],
1112 this->delta_y[barycoord_mode], interp);
1113 }
1114
1115 fs_reg *
1116 fs_visitor::emit_general_interpolation(ir_variable *ir)
1117 {
1118 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1119 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1120 fs_reg attr = *reg;
1121
1122 unsigned int array_elements;
1123 const glsl_type *type;
1124
1125 if (ir->type->is_array()) {
1126 array_elements = ir->type->length;
1127 if (array_elements == 0) {
1128 fail("dereferenced array '%s' has length 0\n", ir->name);
1129 }
1130 type = ir->type->fields.array;
1131 } else {
1132 array_elements = 1;
1133 type = ir->type;
1134 }
1135
1136 glsl_interp_qualifier interpolation_mode =
1137 ir->determine_interpolation_mode(key->flat_shade);
1138
1139 int location = ir->data.location;
1140 for (unsigned int i = 0; i < array_elements; i++) {
1141 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1142 if (prog_data->urb_setup[location] == -1) {
1143 /* If there's no incoming setup data for this slot, don't
1144 * emit interpolation for it.
1145 */
1146 attr.reg_offset += type->vector_elements;
1147 location++;
1148 continue;
1149 }
1150
1151 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1152 /* Constant interpolation (flat shading) case. The SF has
1153 * handed us defined values in only the constant offset
1154 * field of the setup reg.
1155 */
1156 for (unsigned int k = 0; k < type->vector_elements; k++) {
1157 struct brw_reg interp = interp_reg(location, k);
1158 interp = suboffset(interp, 3);
1159 interp.type = reg->type;
1160 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1161 attr.reg_offset++;
1162 }
1163 } else {
1164 /* Smooth/noperspective interpolation case. */
1165 for (unsigned int k = 0; k < type->vector_elements; k++) {
1166 struct brw_reg interp = interp_reg(location, k);
1167 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1168 ir->data.centroid && !key->persample_shading,
1169 ir->data.sample || key->persample_shading);
1170 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1171 /* Get the pixel/sample mask into f0 so that we know
1172 * which pixels are lit. Then, for each channel that is
1173 * unlit, replace the centroid data with non-centroid
1174 * data.
1175 */
1176 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1177 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1178 interpolation_mode,
1179 false, false);
1180 inst->predicate = BRW_PREDICATE_NORMAL;
1181 inst->predicate_inverse = true;
1182 }
1183 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1184 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1185 }
1186 attr.reg_offset++;
1187 }
1188
1189 }
1190 location++;
1191 }
1192 }
1193
1194 return reg;
1195 }
1196
1197 fs_reg *
1198 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1199 {
1200 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1201
1202 /* The frontfacing comes in as a bit in the thread payload. */
1203 if (brw->gen >= 6) {
1204 emit(BRW_OPCODE_ASR, *reg,
1205 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1206 fs_reg(15));
1207 emit(BRW_OPCODE_NOT, *reg, *reg);
1208 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1209 } else {
1210 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1211 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1212 * us the front face.
1213 */
1214 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1215 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1216 }
1217
1218 return reg;
1219 }
1220
1221 void
1222 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1223 {
1224 assert(dst.type == BRW_REGISTER_TYPE_F);
1225
1226 if (key->compute_pos_offset) {
1227 /* Convert int_sample_pos to floating point */
1228 emit(MOV(dst, int_sample_pos));
1229 /* Scale to the range [0, 1] */
1230 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1231 }
1232 else {
1233 /* From ARB_sample_shading specification:
1234 * "When rendering to a non-multisample buffer, or if multisample
1235 * rasterization is disabled, gl_SamplePosition will always be
1236 * (0.5, 0.5).
1237 */
1238 emit(MOV(dst, fs_reg(0.5f)));
1239 }
1240 }
1241
1242 fs_reg *
1243 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1244 {
1245 assert(brw->gen >= 6);
1246 assert(ir->type == glsl_type::vec2_type);
1247
1248 this->current_annotation = "compute sample position";
1249 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1250 fs_reg pos = *reg;
1251 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1252 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1253
1254 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1255 * mode will be enabled.
1256 *
1257 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1258 * R31.1:0 Position Offset X/Y for Slot[3:0]
1259 * R31.3:2 Position Offset X/Y for Slot[7:4]
1260 * .....
1261 *
1262 * The X, Y sample positions come in as bytes in thread payload. So, read
1263 * the positions using vstride=16, width=8, hstride=2.
1264 */
1265 struct brw_reg sample_pos_reg =
1266 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1267 BRW_REGISTER_TYPE_B), 16, 8, 2);
1268
1269 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1270 if (dispatch_width == 16) {
1271 fs_inst *inst = emit(MOV(half(int_sample_x, 1),
1272 fs_reg(suboffset(sample_pos_reg, 16))));
1273 inst->force_sechalf = true;
1274 }
1275 /* Compute gl_SamplePosition.x */
1276 compute_sample_position(pos, int_sample_x);
1277 pos.reg_offset++;
1278 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1279 if (dispatch_width == 16) {
1280 fs_inst *inst = emit(MOV(half(int_sample_y, 1),
1281 fs_reg(suboffset(sample_pos_reg, 17))));
1282 inst->force_sechalf = true;
1283 }
1284 /* Compute gl_SamplePosition.y */
1285 compute_sample_position(pos, int_sample_y);
1286 return reg;
1287 }
1288
1289 fs_reg *
1290 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1291 {
1292 assert(brw->gen >= 6);
1293
1294 this->current_annotation = "compute sample id";
1295 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1296
1297 if (key->compute_sample_id) {
1298 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1299 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1300 t2.type = BRW_REGISTER_TYPE_UW;
1301
1302 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1303 * 8x multisampling, subspan 0 will represent sample N (where N
1304 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1305 * 7. We can find the value of N by looking at R0.0 bits 7:6
1306 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1307 * (since samples are always delivered in pairs). That is, we
1308 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1309 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1310 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1311 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1312 * populating a temporary variable with the sequence (0, 1, 2, 3),
1313 * and then reading from it using vstride=1, width=4, hstride=0.
1314 * These computations hold good for 4x multisampling as well.
1315 */
1316 emit(BRW_OPCODE_AND, t1,
1317 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1318 fs_reg(0xc0));
1319 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1320 /* This works for both SIMD8 and SIMD16 */
1321 emit(MOV(t2, brw_imm_v(0x3210)));
1322 /* This special instruction takes care of setting vstride=1,
1323 * width=4, hstride=0 of t2 during an ADD instruction.
1324 */
1325 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1326 } else {
1327 /* As per GL_ARB_sample_shading specification:
1328 * "When rendering to a non-multisample buffer, or if multisample
1329 * rasterization is disabled, gl_SampleID will always be zero."
1330 */
1331 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1332 }
1333
1334 return reg;
1335 }
1336
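/* Editor's worked example: if R0.0 bits 7:6 (SSPI) contain 0b10, then
 * (R0.0 & 0xc0) >> 5 == 4, so t1 holds N == 4.  Adding the (0, 0, 0, 0,
 * 1, 1, 1, 1) sequence read from t2 gives sample IDs 4,4,4,4,5,5,5,5 for
 * the SIMD8 case, matching the sample pairing described in the comment
 * above.
 */
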
1337 fs_reg
1338 fs_visitor::fix_math_operand(fs_reg src)
1339 {
1340 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1341 * might be able to do better by doing execsize = 1 math and then
1342 * expanding that result out, but we would need to be careful with
1343 * masking.
1344 *
1345 * The hardware ignores source modifiers (negate and abs) on math
1346 * instructions, so we also move to a temp to set those up.
1347 */
1348 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1349 !src.abs && !src.negate)
1350 return src;
1351
1352 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1353 * operands to math instructions.
1354 */
1355 if (brw->gen >= 7 && src.file != IMM)
1356 return src;
1357
1358 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1359 expanded.type = src.type;
1360 emit(BRW_OPCODE_MOV, expanded, src);
1361 return expanded;
1362 }
1363
1364 fs_inst *
1365 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1366 {
1367 switch (opcode) {
1368 case SHADER_OPCODE_RCP:
1369 case SHADER_OPCODE_RSQ:
1370 case SHADER_OPCODE_SQRT:
1371 case SHADER_OPCODE_EXP2:
1372 case SHADER_OPCODE_LOG2:
1373 case SHADER_OPCODE_SIN:
1374 case SHADER_OPCODE_COS:
1375 break;
1376 default:
1377 assert(!"not reached: bad math opcode");
1378 return NULL;
1379 }
1380
1381 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1382 * might be able to do better by doing execsize = 1 math and then
1383 * expanding that result out, but we would need to be careful with
1384 * masking.
1385 *
1386 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1387 * instructions, so we also move to a temp to set those up.
1388 */
1389 if (brw->gen >= 6)
1390 src = fix_math_operand(src);
1391
1392 fs_inst *inst = emit(opcode, dst, src);
1393
1394 if (brw->gen < 6) {
1395 inst->base_mrf = 2;
1396 inst->mlen = dispatch_width / 8;
1397 }
1398
1399 return inst;
1400 }
1401
1402 fs_inst *
1403 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1404 {
1405 int base_mrf = 2;
1406 fs_inst *inst;
1407
1408 switch (opcode) {
1409 case SHADER_OPCODE_INT_QUOTIENT:
1410 case SHADER_OPCODE_INT_REMAINDER:
1411 if (brw->gen >= 7)
1412 no16("SIMD16 INTDIV unsupported\n");
1413 break;
1414 case SHADER_OPCODE_POW:
1415 break;
1416 default:
1417 assert(!"not reached: unsupported binary math opcode.");
1418 return NULL;
1419 }
1420
1421 if (brw->gen >= 6) {
1422 src0 = fix_math_operand(src0);
1423 src1 = fix_math_operand(src1);
1424
1425 inst = emit(opcode, dst, src0, src1);
1426 } else {
1427 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1428 * "Message Payload":
1429 *
1430 * "Operand0[7]. For the INT DIV functions, this operand is the
1431 * denominator."
1432 * ...
1433 * "Operand1[7]. For the INT DIV functions, this operand is the
1434 * numerator."
1435 */
1436 bool is_int_div = opcode != SHADER_OPCODE_POW;
1437 fs_reg &op0 = is_int_div ? src1 : src0;
1438 fs_reg &op1 = is_int_div ? src0 : src1;
1439
1440 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1441 inst = emit(opcode, dst, op0, reg_null_f);
1442
1443 inst->base_mrf = base_mrf;
1444 inst->mlen = 2 * dispatch_width / 8;
1445 }
1446 return inst;
1447 }
1448
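/* Editor's sketch (assuming callers pass the numerator as src0 and the
 * denominator as src1): a remainder such as
 *
 *    emit_math(SHADER_OPCODE_INT_REMAINDER, dst, a, b);
 *
 * on pre-Gen6 hardware MOVs the numerator a into MRF base_mrf + 1 and issues
 * the math instruction with the denominator b as operand 0, which is the
 * ordering the Ironlake PRM quote above requires.
 */
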
1449 void
1450 fs_visitor::assign_curb_setup()
1451 {
1452 if (dispatch_width == 8) {
1453 prog_data->first_curbe_grf = payload.num_regs;
1454 } else {
1455 prog_data->first_curbe_grf_16 = payload.num_regs;
1456 }
1457
1458 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1459
1460 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1461 foreach_list(node, &this->instructions) {
1462 fs_inst *inst = (fs_inst *)node;
1463
1464 for (unsigned int i = 0; i < inst->sources; i++) {
1465 if (inst->src[i].file == UNIFORM) {
1466 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1467 int constant_nr;
1468 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1469 constant_nr = push_constant_loc[uniform_nr];
1470 } else {
1471 /* Section 5.11 of the OpenGL 4.1 spec says:
1472 * "Out-of-bounds reads return undefined values, which include
1473 * values from other variables of the active program or zero."
1474 * Just return the first push constant.
1475 */
1476 constant_nr = 0;
1477 }
1478
1479 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1480 constant_nr / 8,
1481 constant_nr % 8);
1482
1483 inst->src[i].file = HW_REG;
1484 inst->src[i].fixed_hw_reg = byte_offset(
1485 retype(brw_reg, inst->src[i].type),
1486 inst->src[i].subreg_offset);
1487 }
1488 }
1489 }
1490 }
1491
1492 void
1493 fs_visitor::calculate_urb_setup()
1494 {
1495 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1496 prog_data->urb_setup[i] = -1;
1497 }
1498
1499 int urb_next = 0;
1500 /* Figure out where each of the incoming setup attributes lands. */
1501 if (brw->gen >= 6) {
1502 if (_mesa_bitcount_64(fp->Base.InputsRead &
1503 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1504 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1505 * first 16 varying inputs, so we can put them wherever we want.
1506 * Just put them in order.
1507 *
1508 * This is useful because it means that (a) inputs not used by the
1509 * fragment shader won't take up valuable register space, and (b) we
1510 * won't have to recompile the fragment shader if it gets paired with
1511 * a different vertex (or geometry) shader.
1512 */
1513 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1514 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1515 BITFIELD64_BIT(i)) {
1516 prog_data->urb_setup[i] = urb_next++;
1517 }
1518 }
1519 } else {
1520 /* We have enough input varyings that the SF/SBE pipeline stage can't
1521 * arbitrarily rearrange them to suit our whim; we have to put them
1522 * in an order that matches the output of the previous pipeline stage
1523 * (geometry or vertex shader).
1524 */
1525 struct brw_vue_map prev_stage_vue_map;
1526 brw_compute_vue_map(brw, &prev_stage_vue_map,
1527 key->input_slots_valid);
1528 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1529 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1530 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1531 slot++) {
1532 int varying = prev_stage_vue_map.slot_to_varying[slot];
1533 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1534 * unused.
1535 */
1536 if (varying != BRW_VARYING_SLOT_COUNT &&
1537 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1538 BITFIELD64_BIT(varying))) {
1539 prog_data->urb_setup[varying] = slot - first_slot;
1540 }
1541 }
1542 urb_next = prev_stage_vue_map.num_slots - first_slot;
1543 }
1544 } else {
1545 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1546 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1547 /* Point size is packed into the header, not as a general attribute */
1548 if (i == VARYING_SLOT_PSIZ)
1549 continue;
1550
1551 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1552 /* The back color slot is skipped when the front color is
1553 * also written to. In addition, some slots can be
1554 * written in the vertex shader and not read in the
1555 * fragment shader. So the register number must always be
1556 * incremented, mapped or not.
1557 */
1558 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1559 prog_data->urb_setup[i] = urb_next;
1560 urb_next++;
1561 }
1562 }
1563
1564 /*
1565 * It's an FS-only attribute, and we did the interpolation for this attribute
1566 * in the SF thread. So, count it here, too.
1567 *
1568 * See compile_sf_prog() for more info.
1569 */
1570 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1571 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1572 }
1573
1574 prog_data->num_varying_inputs = urb_next;
1575 }
1576
1577 void
1578 fs_visitor::assign_urb_setup()
1579 {
1580 int urb_start = payload.num_regs + prog_data->curb_read_length;
1581
1582 /* Offset all the urb_setup[] index by the actual position of the
1583 * setup regs, now that the location of the constants has been chosen.
1584 */
1585 foreach_list(node, &this->instructions) {
1586 fs_inst *inst = (fs_inst *)node;
1587
1588 if (inst->opcode == FS_OPCODE_LINTERP) {
1589 assert(inst->src[2].file == HW_REG);
1590 inst->src[2].fixed_hw_reg.nr += urb_start;
1591 }
1592
1593 if (inst->opcode == FS_OPCODE_CINTERP) {
1594 assert(inst->src[0].file == HW_REG);
1595 inst->src[0].fixed_hw_reg.nr += urb_start;
1596 }
1597 }
1598
1599 /* Each attribute is 4 setup channels, each of which is half a reg. */
1600 this->first_non_payload_grf =
1601 urb_start + prog_data->num_varying_inputs * 2;
1602 }
1603
1604 /**
1605 * Split large virtual GRFs into separate components if we can.
1606 *
1607 * This is mostly duplicated with what brw_fs_vector_splitting does,
1608 * but that's really conservative because it's afraid of doing
1609 * splitting that doesn't result in real progress after the rest of
1610 * the optimization phases, which would cause infinite looping in
1611 * optimization. We can do it once here, safely. This also has the
1612 * opportunity to split interpolated values, or maybe even uniforms,
1613 * which we don't have at the IR level.
1614 *
1615 * We want to split, because virtual GRFs are what we register
1616 * allocate and spill (due to contiguousness requirements for some
1617 * instructions), and they're what we naturally generate in the
1618 * codegen process, but most virtual GRFs don't actually need to be
1619 * contiguous sets of GRFs. If we split, we'll end up with reduced
1620 * live intervals and better dead code elimination and coalescing.
1621 */
1622 void
1623 fs_visitor::split_virtual_grfs()
1624 {
1625 int num_vars = this->virtual_grf_count;
1626 bool split_grf[num_vars];
1627 int new_virtual_grf[num_vars];
1628
1629 /* Try to split anything > 0 sized. */
1630 for (int i = 0; i < num_vars; i++) {
1631 if (this->virtual_grf_sizes[i] != 1)
1632 split_grf[i] = true;
1633 else
1634 split_grf[i] = false;
1635 }
1636
1637 if (brw->has_pln &&
1638 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1639 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1640 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1641 * Gen6, that was the only supported interpolation mode, and since Gen6,
1642 * delta_x and delta_y are in fixed hardware registers.
1643 */
1644 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1645 false;
1646 }
1647
1648 foreach_list(node, &this->instructions) {
1649 fs_inst *inst = (fs_inst *)node;
1650
1651 /* If there's a SEND message that requires contiguous destination
1652 * registers, no splitting is allowed.
1653 */
1654 if (inst->regs_written > 1) {
1655 split_grf[inst->dst.reg] = false;
1656 }
1657
1658 /* If we're sending from a GRF, don't split it, on the assumption that
1659 * the send is reading the whole thing.
1660 */
1661 if (inst->is_send_from_grf()) {
1662 for (int i = 0; i < inst->sources; i++) {
1663 if (inst->src[i].file == GRF) {
1664 split_grf[inst->src[i].reg] = false;
1665 }
1666 }
1667 }
1668 }
1669
1670 /* Allocate new space for split regs. Note that the virtual
1671 * numbers will be contiguous.
1672 */
1673 for (int i = 0; i < num_vars; i++) {
1674 if (split_grf[i]) {
1675 new_virtual_grf[i] = virtual_grf_alloc(1);
1676 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1677 int reg = virtual_grf_alloc(1);
1678 assert(reg == new_virtual_grf[i] + j - 1);
1679 (void) reg;
1680 }
1681 this->virtual_grf_sizes[i] = 1;
1682 }
1683 }
1684
1685 foreach_list(node, &this->instructions) {
1686 fs_inst *inst = (fs_inst *)node;
1687
1688 if (inst->dst.file == GRF &&
1689 split_grf[inst->dst.reg] &&
1690 inst->dst.reg_offset != 0) {
1691 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1692 inst->dst.reg_offset - 1);
1693 inst->dst.reg_offset = 0;
1694 }
1695 for (int i = 0; i < inst->sources; i++) {
1696 if (inst->src[i].file == GRF &&
1697 split_grf[inst->src[i].reg] &&
1698 inst->src[i].reg_offset != 0) {
1699 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1700 inst->src[i].reg_offset - 1);
1701 inst->src[i].reg_offset = 0;
1702 }
1703 }
1704 }
1705 invalidate_live_intervals();
1706 }
1707
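/* Editor's example (sketch): a size-4 virtual GRF that survives the checks
 * above is split into the original register (now size 1, holding reg_offset
 * 0) plus three newly allocated size-1 registers.  An access at reg_offset 2
 * is rewritten to new_virtual_grf[reg] + 2 - 1 with reg_offset 0, so every
 * component ends up with its own live interval for the register allocator.
 */
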
1708 /**
1709 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1710 *
1711 * During code generation, we create tons of temporary variables, many of
1712 * which get immediately killed and are never used again. Yet, in later
1713 * optimization and analysis passes, such as compute_live_intervals, we need
1714 * to loop over all the virtual GRFs. Compacting them can save a lot of
1715 * overhead.
1716 */
1717 void
1718 fs_visitor::compact_virtual_grfs()
1719 {
1720 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER))
1721 return;
1722
1723 /* Mark which virtual GRFs are used, and count how many. */
1724 int remap_table[this->virtual_grf_count];
1725 memset(remap_table, -1, sizeof(remap_table));
1726
1727 foreach_list(node, &this->instructions) {
1728 const fs_inst *inst = (const fs_inst *) node;
1729
1730 if (inst->dst.file == GRF)
1731 remap_table[inst->dst.reg] = 0;
1732
1733 for (int i = 0; i < inst->sources; i++) {
1734 if (inst->src[i].file == GRF)
1735 remap_table[inst->src[i].reg] = 0;
1736 }
1737 }
1738
1739 /* Compact the GRF arrays. */
1740 int new_index = 0;
1741 for (int i = 0; i < this->virtual_grf_count; i++) {
1742 if (remap_table[i] != -1) {
1743 remap_table[i] = new_index;
1744 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1745 invalidate_live_intervals();
1746 ++new_index;
1747 }
1748 }
1749
1750 this->virtual_grf_count = new_index;
1751
1752 /* Patch all the instructions to use the newly renumbered registers */
1753 foreach_list(node, &this->instructions) {
1754 fs_inst *inst = (fs_inst *) node;
1755
1756 if (inst->dst.file == GRF)
1757 inst->dst.reg = remap_table[inst->dst.reg];
1758
1759 for (int i = 0; i < inst->sources; i++) {
1760 if (inst->src[i].file == GRF)
1761 inst->src[i].reg = remap_table[inst->src[i].reg];
1762 }
1763 }
1764 }
1765
1766 /*
1767 * Implements array access of uniforms by inserting a
1768 * PULL_CONSTANT_LOAD instruction.
1769 *
1770 * Unlike temporary GRF array access (where we don't support it due to
1771 * the difficulty of doing relative addressing on instruction
1772 * destinations), we could potentially do array access of uniforms
1773 * that were loaded in GRF space as push constants. In real-world
1774 * usage we've seen, though, the arrays being used are always larger
1775 * than we could load as push constants, so just always move all
1776 * uniform array access out to a pull constant buffer.
1777 */
1778 void
1779 fs_visitor::move_uniform_array_access_to_pull_constants()
1780 {
1781 if (dispatch_width != 8)
1782 return;
1783
1784 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1785
1786 for (unsigned int i = 0; i < uniforms; i++) {
1787 pull_constant_loc[i] = -1;
1788 }
1789
1790 /* Walk through and find array access of uniforms. Put a copy of that
1791 * uniform in the pull constant buffer.
1792 *
1793 * Note that we don't move constant-indexed accesses to arrays. No
1794 * testing has been done of the performance impact of this choice.
1795 */
1796 foreach_list_safe(node, &this->instructions) {
1797 fs_inst *inst = (fs_inst *)node;
1798
1799 for (int i = 0 ; i < inst->sources; i++) {
1800 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1801 continue;
1802
1803 int uniform = inst->src[i].reg;
1804
1805 /* If this array isn't already present in the pull constant buffer,
1806 * add it.
1807 */
1808 if (pull_constant_loc[uniform] == -1) {
1809 const float **values = &stage_prog_data->param[uniform];
1810
1811 assert(param_size[uniform]);
1812
1813 for (int j = 0; j < param_size[uniform]; j++) {
1814 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1815
1816 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1817 values[j];
1818 }
1819 }
1820 }
1821 }
1822 }
1823
1824 /**
1825 * Assign UNIFORM file registers to either push constants or pull constants.
1826 *
1827 * We allow a fragment shader to have more than the specified minimum
1828 * maximum number of fragment shader uniform components (64). If
1829 * there are too many of these, they'd fill up all of the register space.
1830 * So, this will push some of them out to the pull constant buffer and
1831 * update the program to load them.
1832 */
1833 void
1834 fs_visitor::assign_constant_locations()
1835 {
1836 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1837 if (dispatch_width != 8)
1838 return;
1839
1840 /* Find which UNIFORM registers are still in use. */
1841 bool is_live[uniforms];
1842 for (unsigned int i = 0; i < uniforms; i++) {
1843 is_live[i] = false;
1844 }
1845
1846 foreach_list(node, &this->instructions) {
1847 fs_inst *inst = (fs_inst *) node;
1848
1849 for (int i = 0; i < inst->sources; i++) {
1850 if (inst->src[i].file != UNIFORM)
1851 continue;
1852
1853 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1854 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1855 is_live[constant_nr] = true;
1856 }
1857 }
1858
1859 /* Only allow 16 registers (128 uniform components) as push constants.
1860 *
1861 * Just demote the end of the list. We could probably do better
1862 * here, demoting things that are rarely used in the program first.
1863 */
1864 unsigned int max_push_components = 16 * 8;
1865 unsigned int num_push_constants = 0;
1866
1867 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1868
1869 for (unsigned int i = 0; i < uniforms; i++) {
1870 if (!is_live[i] || pull_constant_loc[i] != -1) {
1871 /* This UNIFORM register is either dead, or has already been demoted
1872 * to a pull const. Mark it as no longer living in the param[] array.
1873 */
1874 push_constant_loc[i] = -1;
1875 continue;
1876 }
1877
1878 if (num_push_constants < max_push_components) {
1879 /* Retain as a push constant. Record the location in the params[]
1880 * array.
1881 */
1882 push_constant_loc[i] = num_push_constants++;
1883 } else {
1884 /* Demote to a pull constant. */
1885 push_constant_loc[i] = -1;
1886
1887 int pull_index = stage_prog_data->nr_pull_params++;
1888 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1889 pull_constant_loc[i] = pull_index;
1890 }
1891 }
1892
1893 stage_prog_data->nr_params = num_push_constants;
1894
1895 /* Up until now, the param[] array has been indexed by reg + reg_offset
1896 * of UNIFORM registers. Condense it to only contain the uniforms we
1897 * chose to upload as push constants.
1898 */
1899 for (unsigned int i = 0; i < uniforms; i++) {
1900 int remapped = push_constant_loc[i];
1901
1902 if (remapped == -1)
1903 continue;
1904
1905 assert(remapped <= (int)i);
1906 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1907 }
1908 }
1909
1910 /**
1911 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1912 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1913 */
1914 void
1915 fs_visitor::demote_pull_constants()
1916 {
1917 foreach_list(node, &this->instructions) {
1918 fs_inst *inst = (fs_inst *)node;
1919
1920 for (int i = 0; i < inst->sources; i++) {
1921 if (inst->src[i].file != UNIFORM)
1922 continue;
1923
1924 int pull_index = pull_constant_loc[inst->src[i].reg +
1925 inst->src[i].reg_offset];
1926 if (pull_index == -1)
1927 continue;
1928
1929 /* Set up the annotation tracking for newly generated instructions. */
1930 base_ir = inst->ir;
1931 current_annotation = inst->annotation;
1932
1933 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1934 fs_reg dst = fs_reg(this, glsl_type::float_type);
1935
1936 /* Generate a pull load into dst. */
1937 if (inst->src[i].reladdr) {
1938 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
1939 surf_index,
1940 *inst->src[i].reladdr,
1941 pull_index);
1942 inst->insert_before(&list);
1943 inst->src[i].reladdr = NULL;
1944 } else {
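            /* Constant-offset case: the pull load fetches a whole aligned
             * vec4 (the byte offset below is rounded down to 16), so use a
             * smear to pick out the single component we actually want.
             */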
1945 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1946 fs_inst *pull =
1947 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1948 dst, surf_index, offset);
1949 inst->insert_before(pull);
1950 inst->src[i].set_smear(pull_index & 3);
1951 }
1952
1953 /* Rewrite the instruction to use the temporary VGRF. */
1954 inst->src[i].file = GRF;
1955 inst->src[i].reg = dst.reg;
1956 inst->src[i].reg_offset = 0;
1957 }
1958 }
1959 invalidate_live_intervals();
1960 }
1961
1962 bool
1963 fs_visitor::opt_algebraic()
1964 {
1965 bool progress = false;
1966
1967 foreach_list(node, &this->instructions) {
1968 fs_inst *inst = (fs_inst *)node;
1969
1970 switch (inst->opcode) {
1971 case BRW_OPCODE_MUL:
1972 if (inst->src[1].file != IMM)
1973 continue;
1974
1975 /* a * 1.0 = a */
1976 if (inst->src[1].is_one()) {
1977 inst->opcode = BRW_OPCODE_MOV;
1978 inst->src[1] = reg_undef;
1979 progress = true;
1980 break;
1981 }
1982
1983 /* a * 0.0 = 0.0 */
1984 if (inst->src[1].is_zero()) {
1985 inst->opcode = BRW_OPCODE_MOV;
1986 inst->src[0] = inst->src[1];
1987 inst->src[1] = reg_undef;
1988 progress = true;
1989 break;
1990 }
1991
1992 break;
1993 case BRW_OPCODE_ADD:
1994 if (inst->src[1].file != IMM)
1995 continue;
1996
1997 /* a + 0.0 = a */
1998 if (inst->src[1].is_zero()) {
1999 inst->opcode = BRW_OPCODE_MOV;
2000 inst->src[1] = reg_undef;
2001 progress = true;
2002 break;
2003 }
2004 break;
2005 case BRW_OPCODE_OR:
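         /* a | a = a */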
2006 if (inst->src[0].equals(inst->src[1])) {
2007 inst->opcode = BRW_OPCODE_MOV;
2008 inst->src[1] = reg_undef;
2009 progress = true;
2010 break;
2011 }
2012 break;
2013 case BRW_OPCODE_LRP:
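         /* If the two interpolation end points are the same value, the
          * result is that value regardless of the interpolation factor.
          */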
2014 if (inst->src[1].equals(inst->src[2])) {
2015 inst->opcode = BRW_OPCODE_MOV;
2016 inst->src[0] = inst->src[1];
2017 inst->src[1] = reg_undef;
2018 inst->src[2] = reg_undef;
2019 progress = true;
2020 break;
2021 }
2022 break;
2023 case BRW_OPCODE_SEL:
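         /* A saturating SEL against an immediate can be redundant: with
          * .l/.le and an immediate >= 1.0 (or .g/.ge and an immediate
          * <= 0.0), the saturate already clamps src0 to the same result,
          * so the SEL can become a saturating MOV of src0.
          */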
2024 if (inst->saturate && inst->src[1].file == IMM) {
2025 switch (inst->conditional_mod) {
2026 case BRW_CONDITIONAL_LE:
2027 case BRW_CONDITIONAL_L:
2028 switch (inst->src[1].type) {
2029 case BRW_REGISTER_TYPE_F:
2030 if (inst->src[1].imm.f >= 1.0f) {
2031 inst->opcode = BRW_OPCODE_MOV;
2032 inst->src[1] = reg_undef;
2033 progress = true;
2034 }
2035 break;
2036 default:
2037 break;
2038 }
2039 break;
2040 case BRW_CONDITIONAL_GE:
2041 case BRW_CONDITIONAL_G:
2042 switch (inst->src[1].type) {
2043 case BRW_REGISTER_TYPE_F:
2044 if (inst->src[1].imm.f <= 0.0f) {
2045 inst->opcode = BRW_OPCODE_MOV;
2046 inst->src[1] = reg_undef;
2047 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2048 progress = true;
2049 }
2050 break;
2051 default:
2052 break;
2053 }
2054 default:
2055 break;
2056 }
2057 }
2058 break;
2059 default:
2060 break;
2061 }
2062 }
2063
2064 return progress;
2065 }
2066
2067 bool
2068 fs_visitor::compute_to_mrf()
2069 {
2070 bool progress = false;
2071 int next_ip = 0;
2072
2073 calculate_live_intervals();
2074
2075 foreach_list_safe(node, &this->instructions) {
2076 fs_inst *inst = (fs_inst *)node;
2077
2078 int ip = next_ip;
2079 next_ip++;
2080
2081 if (inst->opcode != BRW_OPCODE_MOV ||
2082 inst->is_partial_write() ||
2083 inst->dst.file != MRF || inst->src[0].file != GRF ||
2084 inst->dst.type != inst->src[0].type ||
2085 inst->src[0].abs || inst->src[0].negate ||
2086 !inst->src[0].is_contiguous() ||
2087 inst->src[0].subreg_offset)
2088 continue;
2089
2090 /* Work out which hardware MRF registers are written by this
2091 * instruction.
2092 */
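      /* A COMPR4 write lands in MRFs m and m+4; a full SIMD16 write (not
       * force_uncompressed or force_sechalf) covers two consecutive MRFs;
       * otherwise only a single MRF is written.
       */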
2093 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2094 int mrf_high;
2095 if (inst->dst.reg & BRW_MRF_COMPR4) {
2096 mrf_high = mrf_low + 4;
2097 } else if (dispatch_width == 16 &&
2098 (!inst->force_uncompressed && !inst->force_sechalf)) {
2099 mrf_high = mrf_low + 1;
2100 } else {
2101 mrf_high = mrf_low;
2102 }
2103
2104 /* Can't compute-to-MRF this GRF if someone else was going to
2105 * read it later.
2106 */
2107 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2108 continue;
2109
2110 /* Found a move of a GRF to a MRF. Let's see if we can rewrite
2111 * the instruction that produced this GRF so that it writes into the MRF.
2112 */
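      /* For example:
       *    add vgrf4, vgrf2, vgrf3
       *    mov m2, vgrf4
       * can become:
       *    add m2, vgrf2, vgrf3
       */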
2113 fs_inst *scan_inst;
2114 for (scan_inst = (fs_inst *)inst->prev;
2115 scan_inst->prev != NULL;
2116 scan_inst = (fs_inst *)scan_inst->prev) {
2117 if (scan_inst->dst.file == GRF &&
2118 scan_inst->dst.reg == inst->src[0].reg) {
2119 /* Found the last instruction to write the register we want to
2120 * turn into a compute-to-MRF.
2121 */
2122
2123 /* If this one instruction didn't populate all the
2124 * channels, bail. We might be able to rewrite everything
2125 * that writes that reg, but it would require smarter
2126 * tracking to delay the rewriting until complete success.
2127 */
2128 if (scan_inst->is_partial_write())
2129 break;
2130
2131 /* Things returning more than one register would need us to
2132 * understand coalescing out more than one MOV at a time.
2133 */
2134 if (scan_inst->regs_written > 1)
2135 break;
2136
2137 /* SEND instructions can't have MRF as a destination. */
2138 if (scan_inst->mlen)
2139 break;
2140
2141 if (brw->gen == 6) {
2142 /* gen6 math instructions must have the destination be
2143 * GRF, so no compute-to-MRF for them.
2144 */
2145 if (scan_inst->is_math()) {
2146 break;
2147 }
2148 }
2149
2150 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2151 /* Found the creator of our MRF's source value. */
2152 scan_inst->dst.file = MRF;
2153 scan_inst->dst.reg = inst->dst.reg;
2154 scan_inst->saturate |= inst->saturate;
2155 inst->remove();
2156 progress = true;
2157 }
2158 break;
2159 }
2160
2161 /* We don't handle control flow here. Most computation of
2162 * values that end up in MRFs happens shortly before the MRF
2163 * write anyway.
2164 */
2165 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2166 break;
2167
2168 /* You can't read from an MRF, so if someone else reads our
2169 * MRF's source GRF that we wanted to rewrite, that stops us.
2170 */
2171 bool interfered = false;
2172 for (int i = 0; i < scan_inst->sources; i++) {
2173 if (scan_inst->src[i].file == GRF &&
2174 scan_inst->src[i].reg == inst->src[0].reg &&
2175 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2176 interfered = true;
2177 }
2178 }
2179 if (interfered)
2180 break;
2181
2182 if (scan_inst->dst.file == MRF) {
2183 /* If somebody else writes our MRF here, we can't
2184 * compute-to-MRF before that.
2185 */
2186 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2187 int scan_mrf_high;
2188
2189 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2190 scan_mrf_high = scan_mrf_low + 4;
2191 } else if (dispatch_width == 16 &&
2192 (!scan_inst->force_uncompressed &&
2193 !scan_inst->force_sechalf)) {
2194 scan_mrf_high = scan_mrf_low + 1;
2195 } else {
2196 scan_mrf_high = scan_mrf_low;
2197 }
2198
2199 if (mrf_low == scan_mrf_low ||
2200 mrf_low == scan_mrf_high ||
2201 mrf_high == scan_mrf_low ||
2202 mrf_high == scan_mrf_high) {
2203 break;
2204 }
2205 }
2206
2207 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2208 /* Found a SEND instruction, which means that there are
2209 * live values in MRFs from base_mrf to base_mrf +
2210 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2211 * above it.
2212 */
2213 if (mrf_low >= scan_inst->base_mrf &&
2214 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2215 break;
2216 }
2217 if (mrf_high >= scan_inst->base_mrf &&
2218 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2219 break;
2220 }
2221 }
2222 }
2223 }
2224
2225 if (progress)
2226 invalidate_live_intervals();
2227
2228 return progress;
2229 }
2230
2231 /**
2232 * Walks through basic blocks, looking for repeated MRF writes and
2233 * removing the later ones.
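 *
 * For example, if two identical "mov m3, vgrf5" instructions appear with no
 * intervening write to m3 or to vgrf5, the second MOV can be removed.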
2234 */
2235 bool
2236 fs_visitor::remove_duplicate_mrf_writes()
2237 {
2238 fs_inst *last_mrf_move[16];
2239 bool progress = false;
2240
2241 /* The MRF tracking below doesn't yet handle compressed (SIMD16) instructions, so bail. */
2242 if (dispatch_width == 16)
2243 return false;
2244
2245 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2246
2247 foreach_list_safe(node, &this->instructions) {
2248 fs_inst *inst = (fs_inst *)node;
2249
2250 if (inst->is_control_flow()) {
2251 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2252 }
2253
2254 if (inst->opcode == BRW_OPCODE_MOV &&
2255 inst->dst.file == MRF) {
2256 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2257 if (prev_inst && inst->equals(prev_inst)) {
2258 inst->remove();
2259 progress = true;
2260 continue;
2261 }
2262 }
2263
2264 /* Clear out the last-write records for MRFs that were overwritten. */
2265 if (inst->dst.file == MRF) {
2266 last_mrf_move[inst->dst.reg] = NULL;
2267 }
2268
2269 if (inst->mlen > 0 && inst->base_mrf != -1) {
2270 /* Found a SEND instruction, which will include two or fewer
2271 * implied MRF writes. We could do better here.
2272 */
2273 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2274 last_mrf_move[inst->base_mrf + i] = NULL;
2275 }
2276 }
2277
2278 /* Clear out any MRF move records whose sources got overwritten. */
2279 if (inst->dst.file == GRF) {
2280 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2281 if (last_mrf_move[i] &&
2282 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2283 last_mrf_move[i] = NULL;
2284 }
2285 }
2286 }
2287
2288 if (inst->opcode == BRW_OPCODE_MOV &&
2289 inst->dst.file == MRF &&
2290 inst->src[0].file == GRF &&
2291 !inst->is_partial_write()) {
2292 last_mrf_move[inst->dst.reg] = inst;
2293 }
2294 }
2295
2296 if (progress)
2297 invalidate_live_intervals();
2298
2299 return progress;
2300 }
2301
2302 static void
2303 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2304 int first_grf, int grf_len)
2305 {
2306 bool inst_simd16 = (dispatch_width > 8 &&
2307 !inst->force_uncompressed &&
2308 !inst->force_sechalf);
2309
2310 /* Clear the flag for registers that actually got read (as expected). */
2311 for (int i = 0; i < inst->sources; i++) {
2312 int grf;
2313 if (inst->src[i].file == GRF) {
2314 grf = inst->src[i].reg;
2315 } else if (inst->src[i].file == HW_REG &&
2316 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2317 grf = inst->src[i].fixed_hw_reg.nr;
2318 } else {
2319 continue;
2320 }
2321
2322 if (grf >= first_grf &&
2323 grf < first_grf + grf_len) {
2324 deps[grf - first_grf] = false;
2325 if (inst_simd16)
2326 deps[grf - first_grf + 1] = false;
2327 }
2328 }
2329 }
2330
2331 /**
2332 * Implements this workaround for the original 965:
2333 *
2334 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2335 * check for post destination dependencies on this instruction, software
2336 * must ensure that there is no destination hazard for the case of ‘write
2337 * followed by a posted write’ shown in the following example.
2338 *
2339 * 1. mov r3 0
2340 * 2. send r3.xy <rest of send instruction>
2341 * 3. mov r2 r3
2342 *
2343 * Due to no post-destination dependency check on the ‘send’, the above
2344 * code sequence could have two instructions (1 and 2) in flight at the
2345 * same time that both consider ‘r3’ as the target of their final writes.
2346 */
2347 void
2348 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2349 {
2350 int reg_size = dispatch_width / 8;
2351 int write_len = inst->regs_written * reg_size;
2352 int first_write_grf = inst->dst.reg;
2353 bool needs_dep[BRW_MAX_MRF];
2354 assert(write_len < (int)sizeof(needs_dep) - 1);
2355
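   /* Start by assuming every register this instruction writes still needs a
    * dependency resolved; the flags are cleared below as we find reads and
    * writes of those registers.
    */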
2356 memset(needs_dep, false, sizeof(needs_dep));
2357 memset(needs_dep, true, write_len);
2358
2359 clear_deps_for_inst_src(inst, dispatch_width,
2360 needs_dep, first_write_grf, write_len);
2361
2362 /* Walk backwards looking for writes to registers we're writing which
2363 * aren't read since being written. If we hit the start of the program,
2364 * we assume that there are no outstanding dependencies on entry to the
2365 * program.
2366 */
2367 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2368 !scan_inst->is_head_sentinel();
2369 scan_inst = (fs_inst *)scan_inst->prev) {
2370
2371 /* If we hit control flow, assume that there *are* outstanding
2372 * dependencies, and force their cleanup before our instruction.
2373 */
2374 if (scan_inst->is_control_flow()) {
2375 for (int i = 0; i < write_len; i++) {
2376 if (needs_dep[i]) {
2377 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2378 }
2379 }
2380 return;
2381 }
2382
2383 bool scan_inst_simd16 = (dispatch_width > 8 &&
2384 !scan_inst->force_uncompressed &&
2385 !scan_inst->force_sechalf);
2386
2387 /* We insert our reads as late as possible on the assumption that any
2388 * non-MOV instruction that might have left us an outstanding
2389 * dependency has more latency than a MOV.
2390 */
2391 if (scan_inst->dst.file == GRF) {
2392 for (int i = 0; i < scan_inst->regs_written; i++) {
2393 int reg = scan_inst->dst.reg + i * reg_size;
2394
2395 if (reg >= first_write_grf &&
2396 reg < first_write_grf + write_len &&
2397 needs_dep[reg - first_write_grf]) {
2398 inst->insert_before(DEP_RESOLVE_MOV(reg));
2399 needs_dep[reg - first_write_grf] = false;
2400 if (scan_inst_simd16)
2401 needs_dep[reg - first_write_grf + 1] = false;
2402 }
2403 }
2404 }
2405
2406 /* Clear the flag for registers that actually got read (as expected). */
2407 clear_deps_for_inst_src(scan_inst, dispatch_width,
2408 needs_dep, first_write_grf, write_len);
2409
2410 /* Continue the loop only if we haven't resolved all the dependencies */
2411 int i;
2412 for (i = 0; i < write_len; i++) {
2413 if (needs_dep[i])
2414 break;
2415 }
2416 if (i == write_len)
2417 return;
2418 }
2419 }
2420
2421 /**
2422 * Implements this workaround for the original 965:
2423 *
2424 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2425 * used as a destination register until after it has been sourced by an
2426 * instruction with a different destination register.
2427 */
2428 void
2429 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2430 {
2431 int write_len = inst->regs_written * dispatch_width / 8;
2432 int first_write_grf = inst->dst.reg;
2433 bool needs_dep[BRW_MAX_MRF];
2434 assert(write_len < (int)sizeof(needs_dep) - 1);
2435
2436 memset(needs_dep, false, sizeof(needs_dep));
2437 memset(needs_dep, true, write_len);
2438 /* Walk forwards looking for writes to registers we're writing which aren't
2439 * read before being written.
2440 */
2441 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2442 !scan_inst->is_tail_sentinel();
2443 scan_inst = (fs_inst *)scan_inst->next) {
2444 /* If we hit control flow, force resolve all remaining dependencies. */
2445 if (scan_inst->is_control_flow()) {
2446 for (int i = 0; i < write_len; i++) {
2447 if (needs_dep[i])
2448 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2449 }
2450 return;
2451 }
2452
2453 /* Clear the flag for registers that actually got read (as expected). */
2454 clear_deps_for_inst_src(scan_inst, dispatch_width,
2455 needs_dep, first_write_grf, write_len);
2456
2457 /* We insert our reads as late as possible since they're reading the
2458 * result of a SEND, which has massive latency.
2459 */
2460 if (scan_inst->dst.file == GRF &&
2461 scan_inst->dst.reg >= first_write_grf &&
2462 scan_inst->dst.reg < first_write_grf + write_len &&
2463 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2464 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2465 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2466 }
2467
2468 /* Continue the loop only if we haven't resolved all the dependencies */
2469 int i;
2470 for (i = 0; i < write_len; i++) {
2471 if (needs_dep[i])
2472 break;
2473 }
2474 if (i == write_len)
2475 return;
2476 }
2477
2478 /* If we hit the end of the program, resolve all remaining dependencies out
2479 * of paranoia.
2480 */
2481 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2482 assert(last_inst->eot);
2483 for (int i = 0; i < write_len; i++) {
2484 if (needs_dep[i])
2485 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2486 }
2487 }
2488
2489 void
2490 fs_visitor::insert_gen4_send_dependency_workarounds()
2491 {
2492 if (brw->gen != 4 || brw->is_g4x)
2493 return;
2494
2495 bool progress = false;
2496
2497 /* Note that we're done with register allocation, so GRF fs_regs always
2498 * have a .reg_offset of 0.
2499 */
2500
2501 foreach_list_safe(node, &this->instructions) {
2502 fs_inst *inst = (fs_inst *)node;
2503
2504 if (inst->mlen != 0 && inst->dst.file == GRF) {
2505 insert_gen4_pre_send_dependency_workarounds(inst);
2506 insert_gen4_post_send_dependency_workarounds(inst);
2507 progress = true;
2508 }
2509 }
2510
2511 if (progress)
2512 invalidate_live_intervals();
2513 }
2514
2515 /**
2516 * Turns the generic expression-style uniform pull constant load instruction
2517 * into a hardware-specific series of instructions for loading a pull
2518 * constant.
2519 *
2520 * The expression style allows the CSE pass before this to optimize out
2521 * repeated loads from the same offset, and gives the pre-register-allocation
2522 * scheduling full flexibility, while the conversion to native instructions
2523 * allows the post-register-allocation scheduler the best information
2524 * possible.
2525 *
2526 * Note that execution masking for setting up pull constant loads is special:
2527 * the channels that need to be written are unrelated to the current execution
2528 * mask, since a later instruction will use one of the result channels as a
2529 * source operand for all 8 or 16 of its channels.
2530 */
2531 void
2532 fs_visitor::lower_uniform_pull_constant_loads()
2533 {
2534 foreach_list(node, &this->instructions) {
2535 fs_inst *inst = (fs_inst *)node;
2536
2537 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2538 continue;
2539
2540 if (brw->gen >= 7) {
2541 /* The offset arg before was a vec4-aligned byte offset. We need to
2542 * turn it into a dword offset.
2543 */
2544 fs_reg const_offset_reg = inst->src[1];
2545 assert(const_offset_reg.file == IMM &&
2546 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2547 const_offset_reg.imm.u /= 4;
2548 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2549
2550 /* This is actually going to be a MOV, but since only the first dword
2551 * is accessed, we have a special opcode to do just that one. Note
2552 * that this needs to be an operation that will be considered a def
2553 * by live variable analysis, or register allocation will explode.
2554 */
2555 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2556 payload, const_offset_reg);
2557 setup->force_writemask_all = true;
2558
2559 setup->ir = inst->ir;
2560 setup->annotation = inst->annotation;
2561 inst->insert_before(setup);
2562
2563 /* Similarly, this will only populate the first 4 channels of the
2564 * result register (since we only use smear values from 0-3), but we
2565 * don't tell the optimizer.
2566 */
2567 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2568 inst->src[1] = payload;
2569
2570 invalidate_live_intervals();
2571 } else {
2572 /* Before register allocation, we didn't tell the scheduler about the
2573 * MRF we use. We know it's safe to use this MRF because nothing
2574 * else does except for register spill/unspill, which generates and
2575 * uses its MRF within a single IR instruction.
2576 */
2577 inst->base_mrf = 14;
2578 inst->mlen = 1;
2579 }
2580 }
2581 }
2582
2583 void
2584 fs_visitor::dump_instructions()
2585 {
2586 dump_instructions(NULL);
2587 }
2588
2589 void
2590 fs_visitor::dump_instructions(const char *name)
2591 {
2592 calculate_register_pressure();
2593 FILE *file = stderr;
2594 if (name && geteuid() != 0) {
2595 file = fopen(name, "w");
2596 if (!file)
2597 file = stderr;
2598 }
2599
2600 int ip = 0, max_pressure = 0;
2601 foreach_list(node, &this->instructions) {
2602 backend_instruction *inst = (backend_instruction *)node;
2603 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2604 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2605 dump_instruction(inst, file);
2606 ++ip;
2607 }
2608 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
2609
2610 if (file != stderr) {
2611 fclose(file);
2612 }
2613 }
2614
2615 void
2616 fs_visitor::dump_instruction(backend_instruction *be_inst)
2617 {
2618 dump_instruction(be_inst, stderr);
2619 }
2620
2621 void
2622 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
2623 {
2624 fs_inst *inst = (fs_inst *)be_inst;
2625
2626 if (inst->predicate) {
2627 fprintf(file, "(%cf0.%d) ",
2628 inst->predicate_inverse ? '-' : '+',
2629 inst->flag_subreg);
2630 }
2631
2632 fprintf(file, "%s", brw_instruction_name(inst->opcode));
2633 if (inst->saturate)
2634 fprintf(file, ".sat");
2635 if (inst->conditional_mod) {
2636 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
2637 if (!inst->predicate &&
2638 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2639 inst->opcode != BRW_OPCODE_IF &&
2640 inst->opcode != BRW_OPCODE_WHILE))) {
2641 fprintf(file, ".f0.%d", inst->flag_subreg);
2642 }
2643 }
2644 fprintf(file, " ");
2645
2646
2647 switch (inst->dst.file) {
2648 case GRF:
2649 fprintf(file, "vgrf%d", inst->dst.reg);
2650 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
2651 inst->dst.subreg_offset)
2652 fprintf(file, "+%d.%d",
2653 inst->dst.reg_offset, inst->dst.subreg_offset);
2654 break;
2655 case MRF:
2656 fprintf(file, "m%d", inst->dst.reg);
2657 break;
2658 case BAD_FILE:
2659 fprintf(file, "(null)");
2660 break;
2661 case UNIFORM:
2662 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
2663 break;
2664 case HW_REG:
2665 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2666 switch (inst->dst.fixed_hw_reg.nr) {
2667 case BRW_ARF_NULL:
2668 fprintf(file, "null");
2669 break;
2670 case BRW_ARF_ADDRESS:
2671 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
2672 break;
2673 case BRW_ARF_ACCUMULATOR:
2674 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
2675 break;
2676 case BRW_ARF_FLAG:
2677 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2678 inst->dst.fixed_hw_reg.subnr);
2679 break;
2680 default:
2681 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2682 inst->dst.fixed_hw_reg.subnr);
2683 break;
2684 }
2685 } else {
2686 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
2687 }
2688 if (inst->dst.fixed_hw_reg.subnr)
2689 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
2690 break;
2691 default:
2692 fprintf(file, "???");
2693 break;
2694 }
2695 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
2696
2697 for (int i = 0; i < inst->sources && inst->src[i].file != BAD_FILE; i++) {
2698 if (inst->src[i].negate)
2699 fprintf(file, "-");
2700 if (inst->src[i].abs)
2701 fprintf(file, "|");
2702 switch (inst->src[i].file) {
2703 case GRF:
2704 fprintf(file, "vgrf%d", inst->src[i].reg);
2705 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2706 inst->src[i].subreg_offset)
2707 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2708 inst->src[i].subreg_offset);
2709 break;
2710 case MRF:
2711 fprintf(file, "***m%d***", inst->src[i].reg);
2712 break;
2713 case UNIFORM:
2714 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
2715 if (inst->src[i].reladdr) {
2716 fprintf(file, "+reladdr");
2717 } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2718 inst->src[i].subreg_offset) {
2719 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2720 inst->src[i].subreg_offset);
2721 }
2722 break;
2723 case BAD_FILE:
2724 fprintf(file, "(null)");
2725 break;
2726 case IMM:
2727 switch (inst->src[i].type) {
2728 case BRW_REGISTER_TYPE_F:
2729 fprintf(file, "%ff", inst->src[i].imm.f);
2730 break;
2731 case BRW_REGISTER_TYPE_D:
2732 fprintf(file, "%dd", inst->src[i].imm.i);
2733 break;
2734 case BRW_REGISTER_TYPE_UD:
2735 fprintf(file, "%uu", inst->src[i].imm.u);
2736 break;
2737 default:
2738 fprintf(file, "???");
2739 break;
2740 }
2741 break;
2742 case HW_REG:
2743 if (inst->src[i].fixed_hw_reg.negate)
2744 fprintf(file, "-");
2745 if (inst->src[i].fixed_hw_reg.abs)
2746 fprintf(file, "|");
2747 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2748 switch (inst->src[i].fixed_hw_reg.nr) {
2749 case BRW_ARF_NULL:
2750 fprintf(file, "null");
2751 break;
2752 case BRW_ARF_ADDRESS:
2753 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
2754 break;
2755 case BRW_ARF_ACCUMULATOR:
2756 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
2757 break;
2758 case BRW_ARF_FLAG:
2759 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2760 inst->src[i].fixed_hw_reg.subnr);
2761 break;
2762 default:
2763 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2764 inst->src[i].fixed_hw_reg.subnr);
2765 break;
2766 }
2767 } else {
2768 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2769 }
2770 if (inst->src[i].fixed_hw_reg.subnr)
2771 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
2772 if (inst->src[i].fixed_hw_reg.abs)
2773 fprintf(file, "|");
2774 break;
2775 default:
2776 fprintf(file, "???");
2777 break;
2778 }
2779 if (inst->src[i].abs)
2780 fprintf(file, "|");
2781
2782 if (inst->src[i].file != IMM) {
2783 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
2784 }
2785
2786 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
2787 fprintf(file, ", ");
2788 }
2789
2790 fprintf(file, " ");
2791
2792 if (inst->force_uncompressed)
2793 fprintf(file, "1sthalf ");
2794
2795 if (inst->force_sechalf)
2796 fprintf(file, "2ndhalf ");
2797
2798 fprintf(file, "\n");
2799 }
2800
2801 /**
2802 * Possibly returns an instruction that set up @param reg.
2803 *
2804 * Sometimes we want to take the result of some expression/variable
2805 * dereference tree and rewrite the instruction generating the result
2806 * of the tree. When processing the tree, we know that the
2807 * instructions generated are all writing temporaries that are dead
2808 * outside of this tree. So, if we have some instructions that write
2809 * a temporary, we're free to point that temp write somewhere else.
2810 *
2811 * Note that this doesn't guarantee that the instruction generated
2812 * only wrote reg -- it might be the size=4 destination of a texture instruction.
2813 */
2814 fs_inst *
2815 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2816 fs_inst *end,
2817 const fs_reg &reg)
2818 {
2819 if (end == start ||
2820 end->is_partial_write() ||
2821 reg.reladdr ||
2822 !reg.equals(end->dst)) {
2823 return NULL;
2824 } else {
2825 return end;
2826 }
2827 }
2828
2829 void
2830 fs_visitor::setup_payload_gen6()
2831 {
2832 bool uses_depth =
2833 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2834 unsigned barycentric_interp_modes = prog_data->barycentric_interp_modes;
2835
2836 assert(brw->gen >= 6);
2837
2838 /* R0-1: masks, pixel X/Y coordinates. */
2839 payload.num_regs = 2;
2840 /* R2: only for 32-pixel dispatch. */
2841
2842 /* R3-26: barycentric interpolation coordinates. These appear in the
2843 * same order that they appear in the brw_wm_barycentric_interp_mode
2844 * enum. Each set of coordinates occupies 2 registers if dispatch width
2845 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2846 * appear if they were enabled using the "Barycentric Interpolation
2847 * Mode" bits in WM_STATE.
2848 */
2849 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2850 if (barycentric_interp_modes & (1 << i)) {
2851 payload.barycentric_coord_reg[i] = payload.num_regs;
2852 payload.num_regs += 2;
2853 if (dispatch_width == 16) {
2854 payload.num_regs += 2;
2855 }
2856 }
2857 }
2858
2859 /* R27: interpolated depth if uses source depth */
2860 if (uses_depth) {
2861 payload.source_depth_reg = payload.num_regs;
2862 payload.num_regs++;
2863 if (dispatch_width == 16) {
2864 /* R28: interpolated depth if not SIMD8. */
2865 payload.num_regs++;
2866 }
2867 }
2868 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2869 if (uses_depth) {
2870 payload.source_w_reg = payload.num_regs;
2871 payload.num_regs++;
2872 if (dispatch_width == 16) {
2873 /* R30: interpolated W if not SIMD8. */
2874 payload.num_regs++;
2875 }
2876 }
2877
2878 prog_data->uses_pos_offset = key->compute_pos_offset;
2879 /* R31: MSAA position offsets. */
2880 if (prog_data->uses_pos_offset) {
2881 payload.sample_pos_reg = payload.num_regs;
2882 payload.num_regs++;
2883 }
2884
2885 /* R32: MSAA input coverage mask */
2886 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
2887 assert(brw->gen >= 7);
2888 payload.sample_mask_in_reg = payload.num_regs;
2889 payload.num_regs++;
2890 if (dispatch_width == 16) {
2891 /* R33: input coverage mask if not SIMD8. */
2892 payload.num_regs++;
2893 }
2894 }
2895
2896 /* R34-: bary for 32-pixel. */
2897 /* R58-59: interp W for 32-pixel. */
2898
2899 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2900 source_depth_to_render_target = true;
2901 }
2902 }
2903
2904 void
2905 fs_visitor::assign_binding_table_offsets()
2906 {
2907 uint32_t next_binding_table_offset = 0;
2908
2909 /* If there are no color regions, we still perform an FB write to a null
2910 * renderbuffer, which we place at surface index 0.
2911 */
2912 prog_data->binding_table.render_target_start = next_binding_table_offset;
2913 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
2914
2915 assign_common_binding_table_offsets(next_binding_table_offset);
2916 }
2917
2918 void
2919 fs_visitor::calculate_register_pressure()
2920 {
2921 invalidate_live_intervals();
2922 calculate_live_intervals();
2923
2924 int num_instructions = 0;
2925 foreach_list(node, &this->instructions) {
2926 ++num_instructions;
2927 }
2928
2929 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
2930
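   /* For each virtual GRF, add its register size to the pressure at every
    * instruction IP where it is live.
    */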
2931 for (int reg = 0; reg < virtual_grf_count; reg++) {
2932 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
2933 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
2934 }
2935 }
2936
2937 /**
2938 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
2939 *
2940 * The needs_unlit_centroid_workaround ends up producing one of these per
2941 * channel of centroid input, so it's good to clean them up.
2942 *
2943 * An assumption here is that nothing ever modifies the dispatched pixels
2944 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
2945 * dictates that anyway.
2946 */
2947 void
2948 fs_visitor::opt_drop_redundant_mov_to_flags()
2949 {
2950 bool flag_mov_found[2] = {false};
2951
2952 foreach_list_safe(node, &this->instructions) {
2953 fs_inst *inst = (fs_inst *)node;
2954
2955 if (inst->is_control_flow()) {
2956 memset(flag_mov_found, 0, sizeof(flag_mov_found));
2957 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
2958 if (!flag_mov_found[inst->flag_subreg])
2959 flag_mov_found[inst->flag_subreg] = true;
2960 else
2961 inst->remove();
2962 } else if (inst->writes_flag()) {
2963 flag_mov_found[inst->flag_subreg] = false;
2964 }
2965 }
2966 }
2967
2968 bool
2969 fs_visitor::run()
2970 {
2971 sanity_param_count = fp->Base.Parameters->NumParameters;
2972 bool allocated_without_spills;
2973
2974 assign_binding_table_offsets();
2975
2976 if (brw->gen >= 6)
2977 setup_payload_gen6();
2978 else
2979 setup_payload_gen4();
2980
2981 if (0) {
2982 emit_dummy_fs();
2983 } else {
2984 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2985 emit_shader_time_begin();
2986
2987 calculate_urb_setup();
2988 if (fp->Base.InputsRead > 0) {
2989 if (brw->gen < 6)
2990 emit_interpolation_setup_gen4();
2991 else
2992 emit_interpolation_setup_gen6();
2993 }
2994
2995 /* We handle discards by keeping track of the still-live pixels in f0.1.
2996 * Initialize it with the dispatched pixels.
2997 */
2998 if (fp->UsesKill || key->alpha_test_func) {
2999 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3000 discard_init->flag_subreg = 1;
3001 }
3002
3003 /* Generate FS IR for main(). (the visitor only descends into
3004 * functions called "main").
3005 */
3006 if (shader) {
3007 foreach_list(node, &*shader->base.ir) {
3008 ir_instruction *ir = (ir_instruction *)node;
3009 base_ir = ir;
3010 this->result = reg_undef;
3011 ir->accept(this);
3012 }
3013 } else {
3014 emit_fragment_program_code();
3015 }
3016 base_ir = NULL;
3017 if (failed)
3018 return false;
3019
3020 emit(FS_OPCODE_PLACEHOLDER_HALT);
3021
3022 if (key->alpha_test_func)
3023 emit_alpha_test();
3024
3025 emit_fb_writes();
3026
3027 split_virtual_grfs();
3028
3029 move_uniform_array_access_to_pull_constants();
3030 assign_constant_locations();
3031 demote_pull_constants();
3032
3033 opt_drop_redundant_mov_to_flags();
3034
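   /* Run a pass, dump the instruction list afterwards if it made progress
    * and the optimizer debug flag is set, and accumulate overall progress.
    */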
3035 #define OPT(pass, args...) do { \
3036 pass_num++; \
3037 bool this_progress = pass(args); \
3038 \
3039 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3040 char filename[64]; \
3041 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3042 dispatch_width, shader_prog->Name, iteration, pass_num); \
3043 \
3044 backend_visitor::dump_instructions(filename); \
3045 } \
3046 \
3047 progress = progress || this_progress; \
3048 } while (false)
3049
3050 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3051 char filename[64];
3052 snprintf(filename, 64, "fs%d-%04d-00-start",
3053 dispatch_width, shader_prog->Name);
3054
3055 backend_visitor::dump_instructions(filename);
3056 }
3057
3058 bool progress;
3059 int iteration = 0;
3060 do {
3061 progress = false;
3062 iteration++;
3063 int pass_num = 0;
3064
3065 compact_virtual_grfs();
3066
3067 OPT(remove_duplicate_mrf_writes);
3068
3069 OPT(opt_algebraic);
3070 OPT(opt_cse);
3071 OPT(opt_copy_propagate);
3072 OPT(opt_peephole_predicated_break);
3073 OPT(dead_code_eliminate);
3074 OPT(opt_peephole_sel);
3075 OPT(dead_control_flow_eliminate, this);
3076 OPT(opt_saturate_propagation);
3077 OPT(register_coalesce);
3078 OPT(compute_to_mrf);
3079 } while (progress);
3080
3081 lower_uniform_pull_constant_loads();
3082
3083 assign_curb_setup();
3084 assign_urb_setup();
3085
3086 static enum instruction_scheduler_mode pre_modes[] = {
3087 SCHEDULE_PRE,
3088 SCHEDULE_PRE_NON_LIFO,
3089 SCHEDULE_PRE_LIFO,
3090 };
3091
3092 /* Try each scheduling heuristic to see if it can successfully register
3093 * allocate without spilling. They should be ordered by decreasing
3094 * performance but increasing likelihood of allocating.
3095 */
3096 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3097 schedule_instructions(pre_modes[i]);
3098
3099 if (0) {
3100 assign_regs_trivial();
3101 allocated_without_spills = true;
3102 } else {
3103 allocated_without_spills = assign_regs(false);
3104 }
3105 if (allocated_without_spills)
3106 break;
3107 }
3108
3109 if (!allocated_without_spills) {
3110 /* We assume that any spilling is worse than just dropping back to
3111 * SIMD8. There's probably actually some intermediate point where
3112 * SIMD16 with a couple of spills is still better.
3113 */
3114 if (dispatch_width == 16) {
3115 fail("Failure to register allocate. Reduce number of "
3116 "live scalar values to avoid this.");
3117 } else {
3118 perf_debug("Fragment shader triggered register spilling. "
3119 "Try reducing the number of live scalar values to "
3120 "improve performance.\n");
3121 }
3122
3123 /* Since we're out of heuristics, just go spill registers until we
3124 * get an allocation.
3125 */
3126 while (!assign_regs(true)) {
3127 if (failed)
3128 break;
3129 }
3130 }
3131 }
3132 assert(force_uncompressed_stack == 0);
3133
3134 /* This must come after all optimization and register allocation, since
3135 * it inserts dead code that happens to have side effects, and it does
3136 * so based on the actual physical registers in use.
3137 */
3138 insert_gen4_send_dependency_workarounds();
3139
3140 if (failed)
3141 return false;
3142
3143 if (!allocated_without_spills)
3144 schedule_instructions(SCHEDULE_POST);
3145
3146 if (last_scratch > 0) {
3147 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3148 }
3149
3150 if (dispatch_width == 8)
3151 prog_data->reg_blocks = brw_register_blocks(grf_used);
3152 else
3153 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3154
3155 /* If any state parameters were appended, then ParameterValues could have
3156 * been realloced, in which case the driver uniform storage set up by
3157 * _mesa_associate_uniform_storage() would point to freed memory. Make
3158 * sure that didn't happen.
3159 */
3160 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3161
3162 return !failed;
3163 }
3164
3165 const unsigned *
3166 brw_wm_fs_emit(struct brw_context *brw,
3167 void *mem_ctx,
3168 const struct brw_wm_prog_key *key,
3169 struct brw_wm_prog_data *prog_data,
3170 struct gl_fragment_program *fp,
3171 struct gl_shader_program *prog,
3172 unsigned *final_assembly_size)
3173 {
3174 bool start_busy = false;
3175 double start_time = 0;
3176
3177 if (unlikely(brw->perf_debug)) {
3178 start_busy = (brw->batch.last_bo &&
3179 drm_intel_bo_busy(brw->batch.last_bo));
3180 start_time = get_time();
3181 }
3182
3183 struct brw_shader *shader = NULL;
3184 if (prog)
3185 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3186
3187 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3188 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3189
3190 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3191 */
3192 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3193 if (!v.run()) {
3194 if (prog) {
3195 prog->LinkStatus = false;
3196 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3197 }
3198
3199 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3200 v.fail_msg);
3201
3202 return NULL;
3203 }
3204
3205 exec_list *simd16_instructions = NULL;
3206 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3207 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3208 if (!v.simd16_unsupported) {
3209 /* Try a SIMD16 compile */
3210 v2.import_uniforms(&v);
3211 if (!v2.run()) {
3212 perf_debug("SIMD16 shader failed to compile, falling back to "
3213 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3214 } else {
3215 simd16_instructions = &v2.instructions;
3216 }
3217 } else {
3218 perf_debug("SIMD16 shader unsupported, falling back to "
3219 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3220 }
3221 }
3222
3223 const unsigned *assembly = NULL;
3224 if (brw->gen >= 8) {
3225 gen8_fs_generator g(brw, mem_ctx, key, prog_data, prog, fp, v.do_dual_src);
3226 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3227 final_assembly_size);
3228 } else {
3229 fs_generator g(brw, mem_ctx, key, prog_data, prog, fp, v.do_dual_src,
3230 v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3231 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3232 final_assembly_size);
3233 }
3234
3235 if (unlikely(brw->perf_debug) && shader) {
3236 if (shader->compiled_once)
3237 brw_wm_debug_recompile(brw, prog, key);
3238 shader->compiled_once = true;
3239
3240 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3241 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3242 (get_time() - start_time) * 1000);
3243 }
3244 }
3245
3246 return assembly;
3247 }
3248
3249 bool
3250 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3251 {
3252 struct brw_context *brw = brw_context(ctx);
3253 struct brw_wm_prog_key key;
3254
3255 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3256 return true;
3257
3258 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3259 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3260 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3261 bool program_uses_dfdy = fp->UsesDFdy;
3262
3263 memset(&key, 0, sizeof(key));
3264
3265 if (brw->gen < 6) {
3266 if (fp->UsesKill)
3267 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3268
3269 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3270 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3271
3272 /* Just assume depth testing. */
3273 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3274 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3275 }
3276
3277 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3278 BRW_FS_VARYING_INPUT_MASK) > 16)
3279 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3280
3281 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3282 for (unsigned i = 0; i < sampler_count; i++) {
3283 if (fp->Base.ShadowSamplers & (1 << i)) {
3284 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3285 key.tex.swizzles[i] =
3286 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3287 } else {
3288 /* Color sampler: assume no swizzling. */
3289 key.tex.swizzles[i] = SWIZZLE_XYZW;
3290 }
3291 }
3292
3293 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3294 key.drawable_height = ctx->DrawBuffer->Height;
3295 }
3296
3297 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3298 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3299 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3300
3301 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3302 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3303 key.nr_color_regions > 1;
3304 }
3305
3306 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3307 * quality of the derivatives is likely to be determined by the driconf
3308 * option.
3309 */
3310 key.high_quality_derivatives = brw->disable_derivative_optimization;
3311
3312 key.program_string_id = bfp->id;
3313
3314 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3315 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3316
3317 bool success = do_wm_prog(brw, prog, bfp, &key);
3318
3319 brw->wm.base.prog_offset = old_prog_offset;
3320 brw->wm.prog_data = old_prog_data;
3321
3322 return success;
3323 }