i965: Add FS backend for builtin gl_SampleID
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "main/uniforms.h"
50 #include "brw_fs_live_variables.h"
51 #include "glsl/glsl_types.h"
52
53 void
54 fs_inst::init()
55 {
56 memset(this, 0, sizeof(*this));
57 this->opcode = BRW_OPCODE_NOP;
58 this->conditional_mod = BRW_CONDITIONAL_NONE;
59
60 this->dst = reg_undef;
61 this->src[0] = reg_undef;
62 this->src[1] = reg_undef;
63 this->src[2] = reg_undef;
64
65 /* This will be the case for almost all instructions. */
66 this->regs_written = 1;
67 }
68
69 fs_inst::fs_inst()
70 {
71 init();
72 }
73
74 fs_inst::fs_inst(enum opcode opcode)
75 {
76 init();
77 this->opcode = opcode;
78 }
79
80 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
81 {
82 init();
83 this->opcode = opcode;
84 this->dst = dst;
85
86 if (dst.file == GRF)
87 assert(dst.reg_offset >= 0);
88 }
89
90 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
91 {
92 init();
93 this->opcode = opcode;
94 this->dst = dst;
95 this->src[0] = src0;
96
97 if (dst.file == GRF)
98 assert(dst.reg_offset >= 0);
99 if (src[0].file == GRF)
100 assert(src[0].reg_offset >= 0);
101 }
102
103 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
104 {
105 init();
106 this->opcode = opcode;
107 this->dst = dst;
108 this->src[0] = src0;
109 this->src[1] = src1;
110
111 if (dst.file == GRF)
112 assert(dst.reg_offset >= 0);
113 if (src[0].file == GRF)
114 assert(src[0].reg_offset >= 0);
115 if (src[1].file == GRF)
116 assert(src[1].reg_offset >= 0);
117 }
118
119 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
120 fs_reg src0, fs_reg src1, fs_reg src2)
121 {
122 init();
123 this->opcode = opcode;
124 this->dst = dst;
125 this->src[0] = src0;
126 this->src[1] = src1;
127 this->src[2] = src2;
128
129 if (dst.file == GRF)
130 assert(dst.reg_offset >= 0);
131 if (src[0].file == GRF)
132 assert(src[0].reg_offset >= 0);
133 if (src[1].file == GRF)
134 assert(src[1].reg_offset >= 0);
135 if (src[2].file == GRF)
136 assert(src[2].reg_offset >= 0);
137 }
138
139 #define ALU1(op) \
140 fs_inst * \
141 fs_visitor::op(fs_reg dst, fs_reg src0) \
142 { \
143 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
144 }
145
146 #define ALU2(op) \
147 fs_inst * \
148 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
149 { \
150 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
151 }
152
153 #define ALU3(op) \
154 fs_inst * \
155 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
156 { \
157 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
158 }
159
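/* For instance, the ALU2(ADD) line below expands to a small factory method:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * so visitor code can build and emit instructions as, e.g.,
 * emit(ADD(dst, src0, src1)).
 */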
160 ALU1(NOT)
161 ALU1(MOV)
162 ALU1(FRC)
163 ALU1(RNDD)
164 ALU1(RNDE)
165 ALU1(RNDZ)
166 ALU2(ADD)
167 ALU2(MUL)
168 ALU2(MACH)
169 ALU2(AND)
170 ALU2(OR)
171 ALU2(XOR)
172 ALU2(SHL)
173 ALU2(SHR)
174 ALU2(ASR)
175 ALU3(LRP)
176 ALU1(BFREV)
177 ALU3(BFE)
178 ALU2(BFI1)
179 ALU3(BFI2)
180 ALU1(FBH)
181 ALU1(FBL)
182 ALU1(CBIT)
183 ALU3(MAD)
184 ALU2(ADDC)
185 ALU2(SUBB)
186
187 /** Gen4 predicated IF. */
188 fs_inst *
189 fs_visitor::IF(uint32_t predicate)
190 {
191 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
192 inst->predicate = predicate;
193 return inst;
194 }
195
196 /** Gen6+ IF with embedded comparison. */
197 fs_inst *
198 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
199 {
200 assert(brw->gen >= 6);
201 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
202 reg_null_d, src0, src1);
203 inst->conditional_mod = condition;
204 return inst;
205 }
206
207 /**
208 * CMP: Sets the low bit of the destination channels with the result
209 * of the comparison, while the upper bits are undefined, and updates
210 * the flag register with the packed 16 bits of the result.
211 */
212 fs_inst *
213 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
214 {
215 fs_inst *inst;
216
217 /* Take the instruction:
218 *
219 * CMP null<d> src0<f> src1<f>
220 *
221 * Original gen4 does type conversion to the destination type before
222 * comparison, producing garbage results for floating point comparisons.
223 * gen5 does the comparison on the execution type (resolved source types),
224 * so dst type doesn't matter. gen6 does comparison and then uses the
225 * result as if it was the dst type with no conversion, which happens to
226 * mostly work out for float-interpreted-as-int since our comparisons are
227 * for >0, =0, <0.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
244 exec_list
245 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
246 fs_reg varying_offset,
247 uint32_t const_offset)
248 {
249 exec_list instructions;
250 fs_inst *inst;
251
252 /* We have our constant surface use a pitch of 4 bytes, so our index can
253 * be any component of a vector, and then we load 4 contiguous
254 * components starting from that.
255 *
256 * We break down the const_offset to a portion added to the variable
257 * offset and a portion done using reg_offset, which means that if you
258 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
259 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
260 * CSE can later notice that those loads are all the same and eliminate
261 * the redundant ones.
262 */
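/* For example, with const_offset == 6 the vec4-aligned part (6 & ~3 == 4)
 * is folded into vec4_offset here, while the remaining component
 * (6 & 3 == 2) is applied through reg_offset further down, so the final
 * MOV copies component 2 of the loaded vec4 into dst.
 */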
263 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
264 instructions.push_tail(ADD(vec4_offset,
265 varying_offset, const_offset & ~3));
266
267 int scale = 1;
268 if (brw->gen == 4 && dispatch_width == 8) {
269 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
270 * u, v, r) as parameters, or we can just use the SIMD16 message
271 * consisting of (header, u). We choose the second, at the cost of a
272 * longer return length.
273 */
274 scale = 2;
275 }
276
277 enum opcode op;
278 if (brw->gen >= 7)
279 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
280 else
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
282 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
283 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
284 inst->regs_written = 4 * scale;
285 instructions.push_tail(inst);
286
287 if (brw->gen < 7) {
288 inst->base_mrf = 13;
289 inst->header_present = true;
290 if (brw->gen == 4)
291 inst->mlen = 3;
292 else
293 inst->mlen = 1 + dispatch_width / 8;
294 }
295
296 vec4_result.reg_offset += (const_offset & 3) * scale;
297 instructions.push_tail(MOV(dst, vec4_result));
298
299 return instructions;
300 }
301
302 /**
303 * A helper for MOV generation for fixing up broken hardware SEND dependency
304 * handling.
305 */
306 fs_inst *
307 fs_visitor::DEP_RESOLVE_MOV(int grf)
308 {
309 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
310
311 inst->ir = NULL;
312 inst->annotation = "send dependency resolve";
313
314 /* The caller always wants uncompressed to emit the minimal extra
315 * dependencies, and to avoid having to deal with aligning its regs to 2.
316 */
317 inst->force_uncompressed = true;
318
319 return inst;
320 }
321
322 bool
323 fs_inst::equals(fs_inst *inst)
324 {
325 return (opcode == inst->opcode &&
326 dst.equals(inst->dst) &&
327 src[0].equals(inst->src[0]) &&
328 src[1].equals(inst->src[1]) &&
329 src[2].equals(inst->src[2]) &&
330 saturate == inst->saturate &&
331 predicate == inst->predicate &&
332 conditional_mod == inst->conditional_mod &&
333 mlen == inst->mlen &&
334 base_mrf == inst->base_mrf &&
335 sampler == inst->sampler &&
336 target == inst->target &&
337 eot == inst->eot &&
338 header_present == inst->header_present &&
339 shadow_compare == inst->shadow_compare &&
340 offset == inst->offset);
341 }
342
343 bool
344 fs_inst::overwrites_reg(const fs_reg &reg)
345 {
346 return (reg.file == dst.file &&
347 reg.reg == dst.reg &&
348 reg.reg_offset >= dst.reg_offset &&
349 reg.reg_offset < dst.reg_offset + regs_written);
350 }
351
352 bool
353 fs_inst::is_send_from_grf()
354 {
355 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
356 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
357 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
358 src[1].file == GRF) ||
359 (is_tex() && src[0].file == GRF));
360 }
361
362 bool
363 fs_visitor::can_do_source_mods(fs_inst *inst)
364 {
365 if (brw->gen == 6 && inst->is_math())
366 return false;
367
368 if (inst->is_send_from_grf())
369 return false;
370
371 if (!inst->can_do_source_mods())
372 return false;
373
374 return true;
375 }
376
377 void
378 fs_reg::init()
379 {
380 memset(this, 0, sizeof(*this));
381 this->smear = -1;
382 }
383
384 /** Generic unset register constructor. */
385 fs_reg::fs_reg()
386 {
387 init();
388 this->file = BAD_FILE;
389 }
390
391 /** Immediate value constructor. */
392 fs_reg::fs_reg(float f)
393 {
394 init();
395 this->file = IMM;
396 this->type = BRW_REGISTER_TYPE_F;
397 this->imm.f = f;
398 }
399
400 /** Immediate value constructor. */
401 fs_reg::fs_reg(int32_t i)
402 {
403 init();
404 this->file = IMM;
405 this->type = BRW_REGISTER_TYPE_D;
406 this->imm.i = i;
407 }
408
409 /** Immediate value constructor. */
410 fs_reg::fs_reg(uint32_t u)
411 {
412 init();
413 this->file = IMM;
414 this->type = BRW_REGISTER_TYPE_UD;
415 this->imm.u = u;
416 }
417
418 /** Fixed brw_reg Immediate value constructor. */
419 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
420 {
421 init();
422 this->file = HW_REG;
423 this->fixed_hw_reg = fixed_hw_reg;
424 this->type = fixed_hw_reg.type;
425 }
426
427 bool
428 fs_reg::equals(const fs_reg &r) const
429 {
430 return (file == r.file &&
431 reg == r.reg &&
432 reg_offset == r.reg_offset &&
433 type == r.type &&
434 negate == r.negate &&
435 abs == r.abs &&
436 !reladdr && !r.reladdr &&
437 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
438 sizeof(fixed_hw_reg)) == 0 &&
439 smear == r.smear &&
440 imm.u == r.imm.u);
441 }
442
443 fs_reg
444 fs_reg::retype(uint32_t type)
445 {
446 fs_reg result = *this;
447 result.type = type;
448 return result;
449 }
450
451 bool
452 fs_reg::is_zero() const
453 {
454 if (file != IMM)
455 return false;
456
457 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
458 }
459
460 bool
461 fs_reg::is_one() const
462 {
463 if (file != IMM)
464 return false;
465
466 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
467 }
468
469 bool
470 fs_reg::is_null() const
471 {
472 return file == HW_REG &&
473 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
474 fixed_hw_reg.nr == BRW_ARF_NULL;
475 }
476
477 bool
478 fs_reg::is_valid_3src() const
479 {
480 return file == GRF || file == UNIFORM;
481 }
482
483 int
484 fs_visitor::type_size(const struct glsl_type *type)
485 {
486 unsigned int size, i;
487
488 switch (type->base_type) {
489 case GLSL_TYPE_UINT:
490 case GLSL_TYPE_INT:
491 case GLSL_TYPE_FLOAT:
492 case GLSL_TYPE_BOOL:
493 return type->components();
494 case GLSL_TYPE_ARRAY:
495 return type_size(type->fields.array) * type->length;
496 case GLSL_TYPE_STRUCT:
497 size = 0;
498 for (i = 0; i < type->length; i++) {
499 size += type_size(type->fields.structure[i].type);
500 }
501 return size;
502 case GLSL_TYPE_SAMPLER:
503 /* Samplers take up no register space, since they're baked in at
504 * link time.
505 */
506 return 0;
507 case GLSL_TYPE_ATOMIC_UINT:
508 return 0;
509 case GLSL_TYPE_VOID:
510 case GLSL_TYPE_ERROR:
511 case GLSL_TYPE_INTERFACE:
512 assert(!"not reached");
513 break;
514 }
515
516 return 0;
517 }
518
519 fs_reg
520 fs_visitor::get_timestamp()
521 {
522 assert(brw->gen >= 7);
523
524 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
525 BRW_ARF_TIMESTAMP,
526 0),
527 BRW_REGISTER_TYPE_UD));
528
529 fs_reg dst = fs_reg(this, glsl_type::uint_type);
530
531 fs_inst *mov = emit(MOV(dst, ts));
532 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
533 * even if it's not enabled in the dispatch.
534 */
535 mov->force_writemask_all = true;
536 mov->force_uncompressed = true;
537
538 /* The caller wants the low 32 bits of the timestamp. Since it's running
539 * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
540 * which is plenty of time for our purposes. It is identical across the
541 * EUs, but since it's tracking GPU core speed it will increment at a
542 * varying rate as render P-states change.
543 *
544 * The caller could also check if render P-states have changed (or anything
545 * else that might disrupt timing) by setting smear to 2 and checking if
546 * that field is != 0.
547 */
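/* Rough arithmetic behind the "~3 seconds" above: the counter is 32 bits,
 * so at that clock rate it wraps after about 2^32 / 1.2e9 ≈ 3.6 seconds.
 */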
548 dst.smear = 0;
549
550 return dst;
551 }
552
553 void
554 fs_visitor::emit_shader_time_begin()
555 {
556 current_annotation = "shader time start";
557 shader_start_time = get_timestamp();
558 }
559
560 void
561 fs_visitor::emit_shader_time_end()
562 {
563 current_annotation = "shader time end";
564
565 enum shader_time_shader_type type, written_type, reset_type;
566 if (dispatch_width == 8) {
567 type = ST_FS8;
568 written_type = ST_FS8_WRITTEN;
569 reset_type = ST_FS8_RESET;
570 } else {
571 assert(dispatch_width == 16);
572 type = ST_FS16;
573 written_type = ST_FS16_WRITTEN;
574 reset_type = ST_FS16_RESET;
575 }
576
577 fs_reg shader_end_time = get_timestamp();
578
579 /* Check that there weren't any timestamp reset events (assuming these
580 * were the only two timestamp reads that happened).
581 */
582 fs_reg reset = shader_end_time;
583 reset.smear = 2;
584 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
585 test->conditional_mod = BRW_CONDITIONAL_Z;
586 emit(IF(BRW_PREDICATE_NORMAL));
587
588 push_force_uncompressed();
589 fs_reg start = shader_start_time;
590 start.negate = true;
591 fs_reg diff = fs_reg(this, glsl_type::uint_type);
592 emit(ADD(diff, start, shader_end_time));
593
594 /* If there were no instructions between the two timestamp gets, the diff
595 * is 2 cycles. Remove that overhead, so I can forget about that when
596 * trying to determine the time taken for single instructions.
597 */
598 emit(ADD(diff, diff, fs_reg(-2u)));
599
600 emit_shader_time_write(type, diff);
601 emit_shader_time_write(written_type, fs_reg(1u));
602 emit(BRW_OPCODE_ELSE);
603 emit_shader_time_write(reset_type, fs_reg(1u));
604 emit(BRW_OPCODE_ENDIF);
605
606 pop_force_uncompressed();
607 }
608
609 void
610 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
611 fs_reg value)
612 {
613 int shader_time_index =
614 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
615 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
616
617 fs_reg payload;
618 if (dispatch_width == 8)
619 payload = fs_reg(this, glsl_type::uvec2_type);
620 else
621 payload = fs_reg(this, glsl_type::uint_type);
622
623 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
624 fs_reg(), payload, offset, value));
625 }
626
627 void
628 fs_visitor::fail(const char *format, ...)
629 {
630 va_list va;
631 char *msg;
632
633 if (failed)
634 return;
635
636 failed = true;
637
638 va_start(va, format);
639 msg = ralloc_vasprintf(mem_ctx, format, va);
640 va_end(va);
641 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
642
643 this->fail_msg = msg;
644
645 if (INTEL_DEBUG & DEBUG_WM) {
646 fprintf(stderr, "%s", msg);
647 }
648 }
649
650 fs_inst *
651 fs_visitor::emit(enum opcode opcode)
652 {
653 return emit(fs_inst(opcode));
654 }
655
656 fs_inst *
657 fs_visitor::emit(enum opcode opcode, fs_reg dst)
658 {
659 return emit(fs_inst(opcode, dst));
660 }
661
662 fs_inst *
663 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
664 {
665 return emit(fs_inst(opcode, dst, src0));
666 }
667
668 fs_inst *
669 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
670 {
671 return emit(fs_inst(opcode, dst, src0, src1));
672 }
673
674 fs_inst *
675 fs_visitor::emit(enum opcode opcode, fs_reg dst,
676 fs_reg src0, fs_reg src1, fs_reg src2)
677 {
678 return emit(fs_inst(opcode, dst, src0, src1, src2));
679 }
680
681 void
682 fs_visitor::push_force_uncompressed()
683 {
684 force_uncompressed_stack++;
685 }
686
687 void
688 fs_visitor::pop_force_uncompressed()
689 {
690 force_uncompressed_stack--;
691 assert(force_uncompressed_stack >= 0);
692 }
693
694 void
695 fs_visitor::push_force_sechalf()
696 {
697 force_sechalf_stack++;
698 }
699
700 void
701 fs_visitor::pop_force_sechalf()
702 {
703 force_sechalf_stack--;
704 assert(force_sechalf_stack >= 0);
705 }
706
707 /**
708 * Returns true if the instruction has a flag that means it won't
709 * update an entire destination register.
710 *
711 * For example, dead code elimination and live variable analysis want to know
712 * when a write to a variable screens off any preceding values that were in
713 * it.
714 */
715 bool
716 fs_inst::is_partial_write()
717 {
718 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
719 this->force_uncompressed ||
720 this->force_sechalf);
721 }
722
723 int
724 fs_inst::regs_read(fs_visitor *v, int arg)
725 {
726 if (is_tex() && arg == 0 && src[0].file == GRF) {
727 if (v->dispatch_width == 16)
728 return (mlen + 1) / 2;
729 else
730 return mlen;
731 }
732 return 1;
733 }
734
735 bool
736 fs_inst::reads_flag()
737 {
738 return predicate;
739 }
740
741 bool
742 fs_inst::writes_flag()
743 {
744 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
745 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
746 }
747
748 /**
749 * Returns how many MRFs an FS opcode will write over.
750 *
751 * Note that this is not the 0 or 1 implied writes in an actual gen
752 * instruction -- the FS opcodes often generate MOVs in addition.
753 */
754 int
755 fs_visitor::implied_mrf_writes(fs_inst *inst)
756 {
757 if (inst->mlen == 0)
758 return 0;
759
760 if (inst->base_mrf == -1)
761 return 0;
762
763 switch (inst->opcode) {
764 case SHADER_OPCODE_RCP:
765 case SHADER_OPCODE_RSQ:
766 case SHADER_OPCODE_SQRT:
767 case SHADER_OPCODE_EXP2:
768 case SHADER_OPCODE_LOG2:
769 case SHADER_OPCODE_SIN:
770 case SHADER_OPCODE_COS:
771 return 1 * dispatch_width / 8;
772 case SHADER_OPCODE_POW:
773 case SHADER_OPCODE_INT_QUOTIENT:
774 case SHADER_OPCODE_INT_REMAINDER:
775 return 2 * dispatch_width / 8;
776 case SHADER_OPCODE_TEX:
777 case FS_OPCODE_TXB:
778 case SHADER_OPCODE_TXD:
779 case SHADER_OPCODE_TXF:
780 case SHADER_OPCODE_TXF_MS:
781 case SHADER_OPCODE_TG4:
782 case SHADER_OPCODE_TG4_OFFSET:
783 case SHADER_OPCODE_TXL:
784 case SHADER_OPCODE_TXS:
785 case SHADER_OPCODE_LOD:
786 return 1;
787 case FS_OPCODE_FB_WRITE:
788 return 2;
789 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
790 case SHADER_OPCODE_GEN4_SCRATCH_READ:
791 return 1;
792 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
793 return inst->mlen;
794 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
795 return 2;
796 case SHADER_OPCODE_UNTYPED_ATOMIC:
797 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
798 return 0;
799 default:
800 assert(!"not reached");
801 return inst->mlen;
802 }
803 }
804
805 int
806 fs_visitor::virtual_grf_alloc(int size)
807 {
808 if (virtual_grf_array_size <= virtual_grf_count) {
809 if (virtual_grf_array_size == 0)
810 virtual_grf_array_size = 16;
811 else
812 virtual_grf_array_size *= 2;
813 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
814 virtual_grf_array_size);
815 }
816 virtual_grf_sizes[virtual_grf_count] = size;
817 return virtual_grf_count++;
818 }
819
820 /** Fixed HW reg constructor. */
821 fs_reg::fs_reg(enum register_file file, int reg)
822 {
823 init();
824 this->file = file;
825 this->reg = reg;
826 this->type = BRW_REGISTER_TYPE_F;
827 }
828
829 /** Fixed HW reg constructor. */
830 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
831 {
832 init();
833 this->file = file;
834 this->reg = reg;
835 this->type = type;
836 }
837
838 /** Automatic reg constructor. */
839 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
840 {
841 init();
842
843 this->file = GRF;
844 this->reg = v->virtual_grf_alloc(v->type_size(type));
845 this->reg_offset = 0;
846 this->type = brw_type_for_base_type(type);
847 }
848
849 fs_reg *
850 fs_visitor::variable_storage(ir_variable *var)
851 {
852 return (fs_reg *)hash_table_find(this->variable_ht, var);
853 }
854
855 void
856 import_uniforms_callback(const void *key,
857 void *data,
858 void *closure)
859 {
860 struct hash_table *dst_ht = (struct hash_table *)closure;
861 const fs_reg *reg = (const fs_reg *)data;
862
863 if (reg->file != UNIFORM)
864 return;
865
866 hash_table_insert(dst_ht, data, key);
867 }
868
869 /* For 16-wide, we need to follow the uniform setup from the 8-wide dispatch.
870 * This brings in those uniform definitions.
871 */
872 void
873 fs_visitor::import_uniforms(fs_visitor *v)
874 {
875 hash_table_call_foreach(v->variable_ht,
876 import_uniforms_callback,
877 variable_ht);
878 this->params_remap = v->params_remap;
879 this->nr_params_remap = v->nr_params_remap;
880 }
881
882 /* Our support for uniforms is piggy-backed on the struct
883 * gl_fragment_program, because that's where the values actually
884 * get stored, rather than in some global gl_shader_program uniform
885 * store.
886 */
887 void
888 fs_visitor::setup_uniform_values(ir_variable *ir)
889 {
890 int namelen = strlen(ir->name);
891
892 /* The data for our (non-builtin) uniforms is stored in a series of
893 * gl_uniform_driver_storage structs for each subcomponent that
894 * glGetUniformLocation() could name. We know it's been set up in the same
895 * order we'd walk the type, so walk the list of storage and find anything
896 * with our name, or the prefix of a component that starts with our name.
897 */
898 unsigned params_before = c->prog_data.nr_params;
899 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
900 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
901
902 if (strncmp(ir->name, storage->name, namelen) != 0 ||
903 (storage->name[namelen] != 0 &&
904 storage->name[namelen] != '.' &&
905 storage->name[namelen] != '[')) {
906 continue;
907 }
908
909 unsigned slots = storage->type->component_slots();
910 if (storage->array_elements)
911 slots *= storage->array_elements;
912
913 for (unsigned i = 0; i < slots; i++) {
914 c->prog_data.param[c->prog_data.nr_params++] =
915 &storage->storage[i].f;
916 }
917 }
918
919 /* Make sure we actually initialized the right amount of stuff here. */
920 assert(params_before + ir->type->component_slots() ==
921 c->prog_data.nr_params);
922 (void)params_before;
923 }
924
925
926 /* Our support for builtin uniforms is even scarier than non-builtin.
927 * It sits on top of the PROG_STATE_VAR parameters that are
928 * automatically updated from GL context state.
929 */
930 void
931 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
932 {
933 const ir_state_slot *const slots = ir->state_slots;
934 assert(ir->state_slots != NULL);
935
936 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
937 /* This state reference has already been set up by ir_to_mesa, but we'll
938 * get the same index back here.
939 */
940 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
941 (gl_state_index *)slots[i].tokens);
942
943 /* Add each of the unique swizzles of the element as a parameter.
944 * This'll end up matching the expected layout of the
945 * array/matrix/structure we're trying to fill in.
946 */
947 int last_swiz = -1;
948 for (unsigned int j = 0; j < 4; j++) {
949 int swiz = GET_SWZ(slots[i].swizzle, j);
950 if (swiz == last_swiz)
951 break;
952 last_swiz = swiz;
953
954 c->prog_data.param[c->prog_data.nr_params++] =
955 &fp->Base.Parameters->ParameterValues[index][swiz].f;
956 }
957 }
958 }
959
960 fs_reg *
961 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
962 {
963 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
964 fs_reg wpos = *reg;
965 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
966
967 /* gl_FragCoord.x */
968 if (ir->pixel_center_integer) {
969 emit(MOV(wpos, this->pixel_x));
970 } else {
971 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
972 }
973 wpos.reg_offset++;
974
975 /* gl_FragCoord.y */
976 if (!flip && ir->pixel_center_integer) {
977 emit(MOV(wpos, this->pixel_y));
978 } else {
979 fs_reg pixel_y = this->pixel_y;
980 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
981
982 if (flip) {
983 pixel_y.negate = true;
984 offset += c->key.drawable_height - 1.0;
985 }
986
987 emit(ADD(wpos, pixel_y, fs_reg(offset)));
988 }
989 wpos.reg_offset++;
990
991 /* gl_FragCoord.z */
992 if (brw->gen >= 6) {
993 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
994 } else {
995 emit(FS_OPCODE_LINTERP, wpos,
996 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
997 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
998 interp_reg(VARYING_SLOT_POS, 2));
999 }
1000 wpos.reg_offset++;
1001
1002 /* gl_FragCoord.w: Already set up in emit_interpolation */
1003 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1004
1005 return reg;
1006 }
1007
1008 fs_inst *
1009 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1010 glsl_interp_qualifier interpolation_mode,
1011 bool is_centroid)
1012 {
1013 brw_wm_barycentric_interp_mode barycoord_mode;
1014 if (brw->gen >= 6) {
1015 if (is_centroid) {
1016 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1017 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1018 else
1019 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1020 } else {
1021 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1022 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1023 else
1024 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1025 }
1026 } else {
1027 /* On Ironlake and below, there is only one interpolation mode.
1028 * Centroid interpolation doesn't mean anything on this hardware --
1029 * there is no multisampling.
1030 */
1031 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1032 }
1033 return emit(FS_OPCODE_LINTERP, attr,
1034 this->delta_x[barycoord_mode],
1035 this->delta_y[barycoord_mode], interp);
1036 }
1037
1038 fs_reg *
1039 fs_visitor::emit_general_interpolation(ir_variable *ir)
1040 {
1041 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1042 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1043 fs_reg attr = *reg;
1044
1045 unsigned int array_elements;
1046 const glsl_type *type;
1047
1048 if (ir->type->is_array()) {
1049 array_elements = ir->type->length;
1050 if (array_elements == 0) {
1051 fail("dereferenced array '%s' has length 0\n", ir->name);
1052 }
1053 type = ir->type->fields.array;
1054 } else {
1055 array_elements = 1;
1056 type = ir->type;
1057 }
1058
1059 glsl_interp_qualifier interpolation_mode =
1060 ir->determine_interpolation_mode(c->key.flat_shade);
1061
1062 int location = ir->location;
1063 for (unsigned int i = 0; i < array_elements; i++) {
1064 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1065 if (c->prog_data.urb_setup[location] == -1) {
1066 /* If there's no incoming setup data for this slot, don't
1067 * emit interpolation for it.
1068 */
1069 attr.reg_offset += type->vector_elements;
1070 location++;
1071 continue;
1072 }
1073
1074 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1075 /* Constant interpolation (flat shading) case. The SF has
1076 * handed us defined values in only the constant offset
1077 * field of the setup reg.
1078 */
1079 for (unsigned int k = 0; k < type->vector_elements; k++) {
1080 struct brw_reg interp = interp_reg(location, k);
1081 interp = suboffset(interp, 3);
1082 interp.type = reg->type;
1083 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1084 attr.reg_offset++;
1085 }
1086 } else {
1087 /* Smooth/noperspective interpolation case. */
1088 for (unsigned int k = 0; k < type->vector_elements; k++) {
1089 /* FINISHME: At some point we probably want to push
1090 * this farther by giving similar treatment to the
1091 * other potentially constant components of the
1092 * attribute, as well as making brw_vs_constval.c
1093 * handle varyings other than gl_TexCoord.
1094 */
1095 struct brw_reg interp = interp_reg(location, k);
1096 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1097 ir->centroid);
1098 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1099 /* Get the pixel/sample mask into f0 so that we know
1100 * which pixels are lit. Then, for each channel that is
1101 * unlit, replace the centroid data with non-centroid
1102 * data.
1103 */
1104 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1105 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1106 interpolation_mode, false);
1107 inst->predicate = BRW_PREDICATE_NORMAL;
1108 inst->predicate_inverse = true;
1109 }
1110 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1111 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1112 }
1113 attr.reg_offset++;
1114 }
1115
1116 }
1117 location++;
1118 }
1119 }
1120
1121 return reg;
1122 }
1123
1124 fs_reg *
1125 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1126 {
1127 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1128
1129 /* The frontfacing comes in as a bit in the thread payload. */
1130 if (brw->gen >= 6) {
1131 emit(BRW_OPCODE_ASR, *reg,
1132 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1133 fs_reg(15));
1134 emit(BRW_OPCODE_NOT, *reg, *reg);
1135 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1136 } else {
1137 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1138 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1139 * us front face
1140 */
1141 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1142 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1143 }
1144
1145 return reg;
1146 }
1147
1148 void
1149 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1150 {
1151 assert(dst.type == BRW_REGISTER_TYPE_F);
1152
1153 if (c->key.compute_pos_offset) {
1154 /* Convert int_sample_pos to floating point */
1155 emit(MOV(dst, int_sample_pos));
1156 /* Scale to the range [0, 1] */
1157 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1158 }
1159 else {
1160 /* From ARB_sample_shading specification:
1161 * "When rendering to a non-multisample buffer, or if multisample
1162 * rasterization is disabled, gl_SamplePosition will always be
1163 * (0.5, 0.5)."
1164 */
1165 emit(MOV(dst, fs_reg(0.5f)));
1166 }
1167 }
1168
1169 fs_reg *
1170 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1171 {
1172 assert(brw->gen >= 6);
1173 assert(ir->type == glsl_type::vec2_type);
1174
1175 this->current_annotation = "compute sample position";
1176 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1177 fs_reg pos = *reg;
1178 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1179 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1180
1181 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1182 * mode will be enabled.
1183 *
1184 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1185 * R31.1:0 Position Offset X/Y for Slot[3:0]
1186 * R31.3:2 Position Offset X/Y for Slot[7:4]
1187 * .....
1188 *
1189 * The X, Y sample positions come in as bytes in the thread payload. So, read
1190 * the positions using vstride=16, width=8, hstride=2.
1191 */
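/* Given the payload layout quoted above, the bytes of each DWord pair go
 * X0 Y0 X1 Y1 ..., so even bytes hold the X offsets and odd bytes the Y
 * offsets. The (vstride, width, hstride) = (16, 8, 2) byte region below
 * therefore reads bytes 0, 2, ..., 14 (the eight X values of a SIMD8
 * dispatch); suboffset(..., 1) picks up the Y values and
 * suboffset(..., 16) reaches the second half of a SIMD16 dispatch.
 */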
1192 struct brw_reg sample_pos_reg =
1193 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1194 BRW_REGISTER_TYPE_B), 16, 8, 2);
1195
1196 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1197 if (dispatch_width == 16) {
1198 int_sample_x.sechalf = true;
1199 fs_inst *inst = emit(MOV(int_sample_x,
1200 fs_reg(suboffset(sample_pos_reg, 16))));
1201 inst->force_sechalf = true;
1202 int_sample_x.sechalf = false;
1203 }
1204 /* Compute gl_SamplePosition.x */
1205 compute_sample_position(pos, int_sample_x);
1206 pos.reg_offset++;
1207 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1208 if (dispatch_width == 16) {
1209 int_sample_y.sechalf = true;
1210 fs_inst *inst = emit(MOV(int_sample_y,
1211 fs_reg(suboffset(sample_pos_reg, 17))));
1212 inst->force_sechalf = true;
1213 int_sample_y.sechalf = false;
1214 }
1215 /* Compute gl_SamplePosition.y */
1216 compute_sample_position(pos, int_sample_y);
1217 return reg;
1218 }
1219
1220 fs_reg *
1221 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1222 {
1223 assert(brw->gen >= 6);
1224
1225 this->current_annotation = "compute sample id";
1226 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1227
1228 if (c->key.compute_sample_id) {
1229 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1230 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1231 t2.type = BRW_REGISTER_TYPE_UW;
1232
1233 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1234 * 8x multisampling, subspan 0 will represent sample N (where N
1235 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1236 * 7. We can find the value of N by looking at R0.0 bits 7:6
1237 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1238 * (since samples are always delivered in pairs). That is, we
1239 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1240 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1241 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1242 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1243 * populating a temporary variable with the sequence (0, 1, 2, 3),
1244 * and then reading from it using vstride=1, width=4, hstride=0.
1245 * These computations hold good for 4x multisampling as well.
1246 */
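/* Worked example: if SSPI (R0.0 bits 7:6) reads 1, then
 * (R0.0 & 0xc0) >> 5 == 0x40 >> 5 == 2, i.e. N == 2, and after adding the
 * (0, 0, 0, 0, 1, 1, 1, 1) sequence held in t2 the SIMD8 channels end up
 * with gl_SampleID == 2, 2, 2, 2, 3, 3, 3, 3.
 */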
1247 emit(BRW_OPCODE_AND, t1,
1248 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1249 fs_reg(brw_imm_d(0xc0)));
1250 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1251 /* This works for both SIMD8 and SIMD16 */
1252 emit(MOV(t2, brw_imm_v(0x3210)));
1253 /* This special instruction takes care of setting vstride=1,
1254 * width=4, hstride=0 of t2 during an ADD instruction.
1255 */
1256 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1257 } else {
1258 /* As per GL_ARB_sample_shading specification:
1259 * "When rendering to a non-multisample buffer, or if multisample
1260 * rasterization is disabled, gl_SampleID will always be zero."
1261 */
1262 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1263 }
1264
1265 return reg;
1266 }
1267
1268 fs_reg
1269 fs_visitor::fix_math_operand(fs_reg src)
1270 {
1271 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1272 * might be able to do better by doing execsize = 1 math and then
1273 * expanding that result out, but we would need to be careful with
1274 * masking.
1275 *
1276 * The hardware ignores source modifiers (negate and abs) on math
1277 * instructions, so we also move to a temp to set those up.
1278 */
1279 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1280 !src.abs && !src.negate)
1281 return src;
1282
1283 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1284 * operands to math
1285 */
1286 if (brw->gen >= 7 && src.file != IMM)
1287 return src;
1288
1289 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1290 expanded.type = src.type;
1291 emit(BRW_OPCODE_MOV, expanded, src);
1292 return expanded;
1293 }
1294
1295 fs_inst *
1296 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1297 {
1298 switch (opcode) {
1299 case SHADER_OPCODE_RCP:
1300 case SHADER_OPCODE_RSQ:
1301 case SHADER_OPCODE_SQRT:
1302 case SHADER_OPCODE_EXP2:
1303 case SHADER_OPCODE_LOG2:
1304 case SHADER_OPCODE_SIN:
1305 case SHADER_OPCODE_COS:
1306 break;
1307 default:
1308 assert(!"not reached: bad math opcode");
1309 return NULL;
1310 }
1311
1312 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1313 * might be able to do better by doing execsize = 1 math and then
1314 * expanding that result out, but we would need to be careful with
1315 * masking.
1316 *
1317 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1318 * instructions, so we also move to a temp to set those up.
1319 */
1320 if (brw->gen >= 6)
1321 src = fix_math_operand(src);
1322
1323 fs_inst *inst = emit(opcode, dst, src);
1324
1325 if (brw->gen < 6) {
1326 inst->base_mrf = 2;
1327 inst->mlen = dispatch_width / 8;
1328 }
1329
1330 return inst;
1331 }
1332
1333 fs_inst *
1334 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1335 {
1336 int base_mrf = 2;
1337 fs_inst *inst;
1338
1339 switch (opcode) {
1340 case SHADER_OPCODE_INT_QUOTIENT:
1341 case SHADER_OPCODE_INT_REMAINDER:
1342 if (brw->gen >= 7 && dispatch_width == 16)
1343 fail("16-wide INTDIV unsupported\n");
1344 break;
1345 case SHADER_OPCODE_POW:
1346 break;
1347 default:
1348 assert(!"not reached: unsupported binary math opcode.");
1349 return NULL;
1350 }
1351
1352 if (brw->gen >= 6) {
1353 src0 = fix_math_operand(src0);
1354 src1 = fix_math_operand(src1);
1355
1356 inst = emit(opcode, dst, src0, src1);
1357 } else {
1358 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1359 * "Message Payload":
1360 *
1361 * "Operand0[7]. For the INT DIV functions, this operand is the
1362 * denominator."
1363 * ...
1364 * "Operand1[7]. For the INT DIV functions, this operand is the
1365 * numerator."
1366 */
1367 bool is_int_div = opcode != SHADER_OPCODE_POW;
1368 fs_reg &op0 = is_int_div ? src1 : src0;
1369 fs_reg &op1 = is_int_div ? src0 : src1;
1370
1371 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1372 inst = emit(opcode, dst, op0, reg_null_f);
1373
1374 inst->base_mrf = base_mrf;
1375 inst->mlen = 2 * dispatch_width / 8;
1376 }
1377 return inst;
1378 }
1379
1380 void
1381 fs_visitor::assign_curb_setup()
1382 {
1383 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1384 if (dispatch_width == 8) {
1385 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1386 } else {
1387 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1388 }
1389
1390 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1391 foreach_list(node, &this->instructions) {
1392 fs_inst *inst = (fs_inst *)node;
1393
1394 for (unsigned int i = 0; i < 3; i++) {
1395 if (inst->src[i].file == UNIFORM) {
1396 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1397 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1398 constant_nr / 8,
1399 constant_nr % 8);
1400
1401 inst->src[i].file = HW_REG;
1402 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1403 }
1404 }
1405 }
1406 }
1407
1408 void
1409 fs_visitor::calculate_urb_setup()
1410 {
1411 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1412 c->prog_data.urb_setup[i] = -1;
1413 }
1414
1415 int urb_next = 0;
1416 /* Figure out where each of the incoming setup attributes lands. */
1417 if (brw->gen >= 6) {
1418 if (_mesa_bitcount_64(fp->Base.InputsRead &
1419 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1420 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1421 * first 16 varying inputs, so we can put them wherever we want.
1422 * Just put them in order.
1423 *
1424 * This is useful because it means that (a) inputs not used by the
1425 * fragment shader won't take up valuable register space, and (b) we
1426 * won't have to recompile the fragment shader if it gets paired with
1427 * a different vertex (or geometry) shader.
1428 */
1429 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1430 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1431 BITFIELD64_BIT(i)) {
1432 c->prog_data.urb_setup[i] = urb_next++;
1433 }
1434 }
1435 } else {
1436 /* We have enough input varyings that the SF/SBE pipeline stage can't
1437 * arbitrarily rearrange them to suit our whim; we have to put them
1438 * in an order that matches the output of the previous pipeline stage
1439 * (geometry or vertex shader).
1440 */
1441 struct brw_vue_map prev_stage_vue_map;
1442 brw_compute_vue_map(brw, &prev_stage_vue_map,
1443 c->key.input_slots_valid);
1444 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1445 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1446 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1447 slot++) {
1448 int varying = prev_stage_vue_map.slot_to_varying[slot];
1449 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1450 * unused.
1451 */
1452 if (varying != BRW_VARYING_SLOT_COUNT &&
1453 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1454 BITFIELD64_BIT(varying))) {
1455 c->prog_data.urb_setup[varying] = slot - first_slot;
1456 }
1457 }
1458 urb_next = prev_stage_vue_map.num_slots - first_slot;
1459 }
1460 } else {
1461 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1462 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1463 /* Point size is packed into the header, not as a general attribute */
1464 if (i == VARYING_SLOT_PSIZ)
1465 continue;
1466
1467 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1468 /* The back color slot is skipped when the front color is
1469 * also written to. In addition, some slots can be
1470 * written in the vertex shader and not read in the
1471 * fragment shader. So the register number must always be
1472 * incremented, mapped or not.
1473 */
1474 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1475 c->prog_data.urb_setup[i] = urb_next;
1476 urb_next++;
1477 }
1478 }
1479
1480 /*
1481 * It's an FS-only attribute, and we did interpolation for this attribute
1482 * in the SF thread. So, count it here, too.
1483 *
1484 * See compile_sf_prog() for more info.
1485 */
1486 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1487 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1488 }
1489
1490 c->prog_data.num_varying_inputs = urb_next;
1491 }
1492
1493 void
1494 fs_visitor::assign_urb_setup()
1495 {
1496 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1497
1498 /* Offset all the urb_setup[] index by the actual position of the
1499 * setup regs, now that the location of the constants has been chosen.
1500 */
1501 foreach_list(node, &this->instructions) {
1502 fs_inst *inst = (fs_inst *)node;
1503
1504 if (inst->opcode == FS_OPCODE_LINTERP) {
1505 assert(inst->src[2].file == HW_REG);
1506 inst->src[2].fixed_hw_reg.nr += urb_start;
1507 }
1508
1509 if (inst->opcode == FS_OPCODE_CINTERP) {
1510 assert(inst->src[0].file == HW_REG);
1511 inst->src[0].fixed_hw_reg.nr += urb_start;
1512 }
1513 }
1514
1515 /* Each attribute is 4 setup channels, each of which is half a reg. */
1516 this->first_non_payload_grf =
1517 urb_start + c->prog_data.num_varying_inputs * 2;
1518 }
1519
1520 /**
1521 * Split large virtual GRFs into separate components if we can.
1522 *
1523 * This is mostly duplicated with what brw_fs_vector_splitting does,
1524 * but that's really conservative because it's afraid of doing
1525 * splitting that doesn't result in real progress after the rest of
1526 * the optimization phases, which would cause infinite looping in
1527 * optimization. We can do it once here, safely. This also has the
1528 * opportunity to split interpolated values, or maybe even uniforms,
1529 * which we don't have at the IR level.
1530 *
1531 * We want to split, because virtual GRFs are what we register
1532 * allocate and spill (due to contiguousness requirements for some
1533 * instructions), and they're what we naturally generate in the
1534 * codegen process, but most virtual GRFs don't actually need to be
1535 * contiguous sets of GRFs. If we split, we'll end up with reduced
1536 * live intervals and better dead code elimination and coalescing.
1537 */
1538 void
1539 fs_visitor::split_virtual_grfs()
1540 {
1541 int num_vars = this->virtual_grf_count;
1542 bool split_grf[num_vars];
1543 int new_virtual_grf[num_vars];
1544
1545 /* Try to split anything > 0 sized. */
1546 for (int i = 0; i < num_vars; i++) {
1547 if (this->virtual_grf_sizes[i] != 1)
1548 split_grf[i] = true;
1549 else
1550 split_grf[i] = false;
1551 }
1552
1553 if (brw->has_pln &&
1554 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1555 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1556 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1557 * Gen6, that was the only supported interpolation mode, and since Gen6,
1558 * delta_x and delta_y are in fixed hardware registers.
1559 */
1560 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1561 false;
1562 }
1563
1564 foreach_list(node, &this->instructions) {
1565 fs_inst *inst = (fs_inst *)node;
1566
1567 /* If there's a SEND message that requires contiguous destination
1568 * registers, no splitting is allowed.
1569 */
1570 if (inst->regs_written > 1) {
1571 split_grf[inst->dst.reg] = false;
1572 }
1573
1574 /* If we're sending from a GRF, don't split it, on the assumption that
1575 * the send is reading the whole thing.
1576 */
1577 if (inst->is_send_from_grf()) {
1578 for (int i = 0; i < 3; i++) {
1579 if (inst->src[i].file == GRF) {
1580 split_grf[inst->src[i].reg] = false;
1581 }
1582 }
1583 }
1584 }
1585
1586 /* Allocate new space for split regs. Note that the virtual
1587 * numbers will be contiguous.
1588 */
1589 for (int i = 0; i < num_vars; i++) {
1590 if (split_grf[i]) {
1591 new_virtual_grf[i] = virtual_grf_alloc(1);
1592 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1593 int reg = virtual_grf_alloc(1);
1594 assert(reg == new_virtual_grf[i] + j - 1);
1595 (void) reg;
1596 }
1597 this->virtual_grf_sizes[i] = 1;
1598 }
1599 }
1600
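/* As a concrete case: a split virtual GRF of size 3 ends up as three
 * size-1 registers. reg_offset 0 stays in the original register (its size
 * was just trimmed to 1 above), while offsets 1 and 2 are redirected by
 * the loop below to new_virtual_grf[i] and new_virtual_grf[i] + 1.
 */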
1601 foreach_list(node, &this->instructions) {
1602 fs_inst *inst = (fs_inst *)node;
1603
1604 if (inst->dst.file == GRF &&
1605 split_grf[inst->dst.reg] &&
1606 inst->dst.reg_offset != 0) {
1607 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1608 inst->dst.reg_offset - 1);
1609 inst->dst.reg_offset = 0;
1610 }
1611 for (int i = 0; i < 3; i++) {
1612 if (inst->src[i].file == GRF &&
1613 split_grf[inst->src[i].reg] &&
1614 inst->src[i].reg_offset != 0) {
1615 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1616 inst->src[i].reg_offset - 1);
1617 inst->src[i].reg_offset = 0;
1618 }
1619 }
1620 }
1621 invalidate_live_intervals();
1622 }
1623
1624 /**
1625 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1626 *
1627 * During code generation, we create tons of temporary variables, many of
1628 * which get immediately killed and are never used again. Yet, in later
1629 * optimization and analysis passes, such as compute_live_intervals, we need
1630 * to loop over all the virtual GRFs. Compacting them can save a lot of
1631 * overhead.
1632 */
1633 void
1634 fs_visitor::compact_virtual_grfs()
1635 {
1636 /* Mark which virtual GRFs are used, and count how many. */
1637 int remap_table[this->virtual_grf_count];
1638 memset(remap_table, -1, sizeof(remap_table));
1639
1640 foreach_list(node, &this->instructions) {
1641 const fs_inst *inst = (const fs_inst *) node;
1642
1643 if (inst->dst.file == GRF)
1644 remap_table[inst->dst.reg] = 0;
1645
1646 for (int i = 0; i < 3; i++) {
1647 if (inst->src[i].file == GRF)
1648 remap_table[inst->src[i].reg] = 0;
1649 }
1650 }
1651
1652 /* In addition to registers used in instructions, fs_visitor keeps
1653 * direct references to certain special values which must be patched:
1654 */
1655 fs_reg *special[] = {
1656 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1657 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1658 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1659 &delta_x[0], &delta_x[1], &delta_x[2],
1660 &delta_x[3], &delta_x[4], &delta_x[5],
1661 &delta_y[0], &delta_y[1], &delta_y[2],
1662 &delta_y[3], &delta_y[4], &delta_y[5],
1663 };
1664 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1665 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1666
1667 /* Treat all special values as used, to be conservative */
1668 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1669 if (special[i]->file == GRF)
1670 remap_table[special[i]->reg] = 0;
1671 }
1672
1673 /* Compact the GRF arrays. */
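/* E.g. if only virtual GRFs #0 and #3 out of five were marked used above,
 * remap_table becomes { 0, -1, -1, 1, -1 } after this loop and
 * virtual_grf_count drops to 2; the patch loops below then rewrite every
 * reference accordingly.
 */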
1674 int new_index = 0;
1675 for (int i = 0; i < this->virtual_grf_count; i++) {
1676 if (remap_table[i] != -1) {
1677 remap_table[i] = new_index;
1678 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1679 invalidate_live_intervals();
1680 ++new_index;
1681 }
1682 }
1683
1684 this->virtual_grf_count = new_index;
1685
1686 /* Patch all the instructions to use the newly renumbered registers */
1687 foreach_list(node, &this->instructions) {
1688 fs_inst *inst = (fs_inst *) node;
1689
1690 if (inst->dst.file == GRF)
1691 inst->dst.reg = remap_table[inst->dst.reg];
1692
1693 for (int i = 0; i < 3; i++) {
1694 if (inst->src[i].file == GRF)
1695 inst->src[i].reg = remap_table[inst->src[i].reg];
1696 }
1697 }
1698
1699 /* Patch all the references to special values */
1700 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1701 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1702 special[i]->reg = remap_table[special[i]->reg];
1703 }
1704 }
1705
1706 bool
1707 fs_visitor::remove_dead_constants()
1708 {
1709 if (dispatch_width == 8) {
1710 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1711 this->nr_params_remap = c->prog_data.nr_params;
1712
1713 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1714 this->params_remap[i] = -1;
1715
1716 /* Find which params are still in use. */
1717 foreach_list(node, &this->instructions) {
1718 fs_inst *inst = (fs_inst *)node;
1719
1720 for (int i = 0; i < 3; i++) {
1721 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1722
1723 if (inst->src[i].file != UNIFORM)
1724 continue;
1725
1726 /* Section 5.11 of the OpenGL 4.3 spec says:
1727 *
1728 * "Out-of-bounds reads return undefined values, which include
1729 * values from other variables of the active program or zero."
1730 */
1731 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1732 constant_nr = 0;
1733 }
1734
1735 /* For now, set this to non-negative. We'll give it the
1736 * actual new number in a moment, in order to keep the
1737 * register numbers nicely ordered.
1738 */
1739 this->params_remap[constant_nr] = 0;
1740 }
1741 }
1742
1743 /* Figure out what the new numbers for the params will be. At some
1744 * point when we're doing uniform array access, we're going to want
1745 * to keep the distinction between .reg and .reg_offset, but for
1746 * now we don't care.
1747 */
1748 unsigned int new_nr_params = 0;
1749 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1750 if (this->params_remap[i] != -1) {
1751 this->params_remap[i] = new_nr_params++;
1752 }
1753 }
1754
1755 /* Update the list of params to be uploaded to match our new numbering. */
1756 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1757 int remapped = this->params_remap[i];
1758
1759 if (remapped == -1)
1760 continue;
1761
1762 c->prog_data.param[remapped] = c->prog_data.param[i];
1763 }
1764
1765 c->prog_data.nr_params = new_nr_params;
1766 } else {
1767 /* This should have been generated in the 8-wide pass already. */
1768 assert(this->params_remap);
1769 }
1770
1771 /* Now do the renumbering of the shader to remove unused params. */
1772 foreach_list(node, &this->instructions) {
1773 fs_inst *inst = (fs_inst *)node;
1774
1775 for (int i = 0; i < 3; i++) {
1776 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1777
1778 if (inst->src[i].file != UNIFORM)
1779 continue;
1780
1781 /* As above, alias out-of-bounds reads to constant 0. */
1782 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1783 constant_nr = 0;
1784 }
1785 assert(this->params_remap[constant_nr] != -1);
1786 inst->src[i].reg = this->params_remap[constant_nr];
1787 inst->src[i].reg_offset = 0;
1788 }
1789 }
1790
1791 return true;
1792 }
1793
1794 /*
1795 * Implements array access of uniforms by inserting a
1796 * PULL_CONSTANT_LOAD instruction.
1797 *
1798 * Unlike temporary GRF array access (where we don't support it due to
1799 * the difficulty of doing relative addressing on instruction
1800 * destinations), we could potentially do array access of uniforms
1801 * that were loaded in GRF space as push constants. In real-world
1802 * usage we've seen, though, the arrays being used are always larger
1803 * than we could load as push constants, so just always move all
1804 * uniform array access out to a pull constant buffer.
1805 */
1806 void
1807 fs_visitor::move_uniform_array_access_to_pull_constants()
1808 {
1809 int pull_constant_loc[c->prog_data.nr_params];
1810
1811 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1812 pull_constant_loc[i] = -1;
1813 }
1814
1815 /* Walk through and find array access of uniforms. Put a copy of that
1816 * uniform in the pull constant buffer.
1817 *
1818 * Note that we don't move constant-indexed accesses to arrays. No
1819 * testing has been done of the performance impact of this choice.
1820 */
1821 foreach_list_safe(node, &this->instructions) {
1822 fs_inst *inst = (fs_inst *)node;
1823
1824 for (int i = 0 ; i < 3; i++) {
1825 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1826 continue;
1827
1828 int uniform = inst->src[i].reg;
1829
1830 /* If this array isn't already present in the pull constant buffer,
1831 * add it.
1832 */
1833 if (pull_constant_loc[uniform] == -1) {
1834 const float **values = &c->prog_data.param[uniform];
1835
1836 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1837
1838 assert(param_size[uniform]);
1839
1840 for (int j = 0; j < param_size[uniform]; j++) {
1841 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1842 values[j];
1843 }
1844 }
1845
1846 /* Set up the annotation tracking for newly generated instructions. */
1847 base_ir = inst->ir;
1848 current_annotation = inst->annotation;
1849
1850 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1851 fs_reg temp = fs_reg(this, glsl_type::float_type);
1852 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1853 surf_index,
1854 *inst->src[i].reladdr,
1855 pull_constant_loc[uniform] +
1856 inst->src[i].reg_offset);
1857 inst->insert_before(&list);
1858
1859 inst->src[i].file = temp.file;
1860 inst->src[i].reg = temp.reg;
1861 inst->src[i].reg_offset = temp.reg_offset;
1862 inst->src[i].reladdr = NULL;
1863 }
1864 }
1865 }
1866
1867 /**
1868 * Choose accesses from the UNIFORM file to demote to using the pull
1869 * constant buffer.
1870 *
1871 * We allow a fragment shader to have more than the specified minimum
1872 * maximum number of fragment shader uniform components (64). If
1873 * there are too many of these, they'd fill up all of register space.
1874 * So, this will push some of them out to the pull constant buffer and
1875 * update the program to load them.
1876 */
1877 void
1878 fs_visitor::setup_pull_constants()
1879 {
1880 /* Only allow 16 registers (128 uniform components) as push constants. */
1881 unsigned int max_uniform_components = 16 * 8;
1882 if (c->prog_data.nr_params <= max_uniform_components)
1883 return;
1884
1885 if (dispatch_width == 16) {
1886 fail("Pull constants not supported in 16-wide\n");
1887 return;
1888 }
1889
1890 /* Just demote the end of the list. We could probably do better
1891 * here, demoting things that are rarely used in the program first.
1892 */
1893 unsigned int pull_uniform_base = max_uniform_components;
1894
1895 int pull_constant_loc[c->prog_data.nr_params];
1896 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1897 if (i < pull_uniform_base) {
1898 pull_constant_loc[i] = -1;
1899 } else {
1900 pull_constant_loc[i] = -1;
1901 /* If our constant is already being uploaded for reladdr purposes,
1902 * reuse it.
1903 */
1904 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1905 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1906 pull_constant_loc[i] = j;
1907 break;
1908 }
1909 }
1910 if (pull_constant_loc[i] == -1) {
1911 int pull_index = c->prog_data.nr_pull_params++;
1912 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1913             pull_constant_loc[i] = pull_index;
1914 }
1915 }
1916 }
1917 c->prog_data.nr_params = pull_uniform_base;
1918
1919 foreach_list(node, &this->instructions) {
1920 fs_inst *inst = (fs_inst *)node;
1921
1922 for (int i = 0; i < 3; i++) {
1923 if (inst->src[i].file != UNIFORM)
1924 continue;
1925
1926 int pull_index = pull_constant_loc[inst->src[i].reg +
1927 inst->src[i].reg_offset];
1928 if (pull_index == -1)
1929 continue;
1930
1931 assert(!inst->src[i].reladdr);
1932
1933 fs_reg dst = fs_reg(this, glsl_type::float_type);
1934 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1935 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1936 fs_inst *pull =
1937 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1938 dst, index, offset);
1939 pull->ir = inst->ir;
1940 pull->annotation = inst->annotation;
1941
1942 inst->insert_before(pull);
1943
1944 inst->src[i].file = GRF;
1945 inst->src[i].reg = dst.reg;
1946 inst->src[i].reg_offset = 0;
1947 inst->src[i].smear = pull_index & 3;
1948 }
1949 }
1950 }
1951
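/**
 * Applies simple algebraic simplifications to instructions whose operands
 * make them trivial, e.g. (sketch, not actual IR):
 *
 *    mul vgrf4, vgrf3, 1.0f    ->   mov vgrf4, vgrf3
 *    add vgrf4, vgrf3, 0.0f    ->   mov vgrf4, vgrf3
 *    or  vgrf4, vgrf3, vgrf3   ->   mov vgrf4, vgrf3
 */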
1952 bool
1953 fs_visitor::opt_algebraic()
1954 {
1955 bool progress = false;
1956
1957 foreach_list(node, &this->instructions) {
1958 fs_inst *inst = (fs_inst *)node;
1959
1960 switch (inst->opcode) {
1961 case BRW_OPCODE_MUL:
1962 if (inst->src[1].file != IMM)
1963 continue;
1964
1965 /* a * 1.0 = a */
1966 if (inst->src[1].is_one()) {
1967 inst->opcode = BRW_OPCODE_MOV;
1968 inst->src[1] = reg_undef;
1969 progress = true;
1970 break;
1971 }
1972
1973 /* a * 0.0 = 0.0 */
1974 if (inst->src[1].is_zero()) {
1975 inst->opcode = BRW_OPCODE_MOV;
1976 inst->src[0] = inst->src[1];
1977 inst->src[1] = reg_undef;
1978 progress = true;
1979 break;
1980 }
1981
1982 break;
1983 case BRW_OPCODE_ADD:
1984 if (inst->src[1].file != IMM)
1985 continue;
1986
1987 /* a + 0.0 = a */
1988 if (inst->src[1].is_zero()) {
1989 inst->opcode = BRW_OPCODE_MOV;
1990 inst->src[1] = reg_undef;
1991 progress = true;
1992 break;
1993 }
1994 break;
1995 case BRW_OPCODE_OR:
1996 if (inst->src[0].equals(inst->src[1])) {
1997 inst->opcode = BRW_OPCODE_MOV;
1998 inst->src[1] = reg_undef;
1999 progress = true;
2000 break;
2001 }
2002 break;
2003 case BRW_OPCODE_SEL:
2004 if (inst->saturate && inst->src[1].file == IMM) {
2005 switch (inst->conditional_mod) {
2006 case BRW_CONDITIONAL_LE:
2007 case BRW_CONDITIONAL_L:
2008 switch (inst->src[1].type) {
2009 case BRW_REGISTER_TYPE_F:
2010 if (inst->src[1].imm.f >= 1.0f) {
2011 inst->opcode = BRW_OPCODE_MOV;
2012 inst->src[1] = reg_undef;
2013 progress = true;
2014 }
2015 break;
2016 default:
2017 break;
2018 }
2019 break;
2020 case BRW_CONDITIONAL_GE:
2021 case BRW_CONDITIONAL_G:
2022 switch (inst->src[1].type) {
2023 case BRW_REGISTER_TYPE_F:
2024 if (inst->src[1].imm.f <= 0.0f) {
2025 inst->opcode = BRW_OPCODE_MOV;
2026 inst->src[1] = reg_undef;
2027 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2028 progress = true;
2029 }
2030 break;
2031 default:
2032 break;
2033 }
2034 default:
2035 break;
2036 }
2037 }
2038 break;
2039 default:
2040 break;
2041 }
2042 }
2043
2044 return progress;
2045 }
2046
2047 /**
2048 * Removes any instructions writing a VGRF where that VGRF is not used by any
2049 * later instruction.
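 *
 * Sketch (not actual IR): the ADD below is removed because vgrf5's live
 * range ends at the instruction that wrote it:
 *
 *    add vgrf5, vgrf3, vgrf4
 *    mov m2, vgrf3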
2050 */
2051 bool
2052 fs_visitor::dead_code_eliminate()
2053 {
2054 bool progress = false;
2055 int pc = 0;
2056
2057 calculate_live_intervals();
2058
2059 foreach_list_safe(node, &this->instructions) {
2060 fs_inst *inst = (fs_inst *)node;
2061
2062 if (inst->dst.file == GRF) {
2063 bool dead = true;
2064
2065 for (int i = 0; i < inst->regs_written; i++) {
2066 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2067 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2068 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2069 dead = false;
2070 break;
2071 }
2072 }
2073
2074 if (dead) {
2075 /* Don't dead code eliminate instructions that write to the
2076 * accumulator as a side-effect. Instead just set the destination
2077 * to the null register to free it.
2078 */
2079 switch (inst->opcode) {
2080 case BRW_OPCODE_ADDC:
2081 case BRW_OPCODE_SUBB:
2082 case BRW_OPCODE_MACH:
2083 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2084 break;
2085 default:
2086 inst->remove();
2087 progress = true;
2088 break;
2089 }
2090 }
2091 }
2092
2093 pc++;
2094 }
2095
2096 if (progress)
2097 invalidate_live_intervals();
2098
2099 return progress;
2100 }
2101
2102 struct dead_code_hash_key
2103 {
2104 int vgrf;
2105 int reg_offset;
2106 };
2107
2108 static bool
2109 dead_code_hash_compare(const void *a, const void *b)
2110 {
2111 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2112 }
2113
2114 static void
2115 clear_dead_code_hash(struct hash_table *ht)
2116 {
2117 struct hash_entry *entry;
2118
2119 hash_table_foreach(ht, entry) {
2120 _mesa_hash_table_remove(ht, entry);
2121 }
2122 }
2123
2124 static void
2125 insert_dead_code_hash(struct hash_table *ht,
2126 int vgrf, int reg_offset, fs_inst *inst)
2127 {
2128 /* We don't bother freeing keys, because they'll be GCed with the ht. */
2129 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2130
2131 key->vgrf = vgrf;
2132 key->reg_offset = reg_offset;
2133
2134 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2135 }
2136
2137 static struct hash_entry *
2138 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2139 {
2140 struct dead_code_hash_key key;
2141
2142 key.vgrf = vgrf;
2143 key.reg_offset = reg_offset;
2144
2145 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2146 }
2147
2148 static void
2149 remove_dead_code_hash(struct hash_table *ht,
2150 int vgrf, int reg_offset)
2151 {
2152 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2153 if (!entry)
2154 return;
2155
2156 _mesa_hash_table_remove(ht, entry);
2157 }
2158
2159 /**
2160 * Walks basic blocks, removing any regs that are written but not read before
2161 * being redefined.
2162 *
2163 * The dead_code_eliminate() function implements a global dead code
2164  * elimination, but it only handles removing the last write to a register
2165 * if it's never read. This one can handle intermediate writes, but only
2166 * within a basic block.
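 *
 * Sketch (not actual IR): within one block, the first MOV below is removed
 * because vgrf4 is fully rewritten before ever being read:
 *
 *    mov vgrf4, vgrf1
 *    mov vgrf4, vgrf2
 *    add vgrf5, vgrf4, vgrf3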
2167 */
2168 bool
2169 fs_visitor::dead_code_eliminate_local()
2170 {
2171 struct hash_table *ht;
2172 bool progress = false;
2173
2174 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2175
2176 foreach_list_safe(node, &this->instructions) {
2177 fs_inst *inst = (fs_inst *)node;
2178
2179       /* At a basic block boundary, empty the HT since we don't understand dataflow
2180 * here.
2181 */
2182 if (inst->is_control_flow()) {
2183 clear_dead_code_hash(ht);
2184 continue;
2185 }
2186
2187 /* Clear the HT of any instructions that got read. */
2188 for (int i = 0; i < 3; i++) {
2189 fs_reg src = inst->src[i];
2190 if (src.file != GRF)
2191 continue;
2192
2193 int read = 1;
2194 if (inst->is_send_from_grf())
2195 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2196
2197 for (int reg_offset = src.reg_offset;
2198 reg_offset < src.reg_offset + read;
2199 reg_offset++) {
2200 remove_dead_code_hash(ht, src.reg, reg_offset);
2201 }
2202 }
2203
2204 /* Add any update of a GRF to the HT, removing a previous write if it
2205 * wasn't read.
2206 */
2207 if (inst->dst.file == GRF) {
2208 if (inst->regs_written > 1) {
2209 /* We don't know how to trim channels from an instruction's
2210 * writes, so we can't incrementally remove unread channels from
2211              * it.  Just remove whatever it overwrites from the table.
2212 */
2213 for (int i = 0; i < inst->regs_written; i++) {
2214 remove_dead_code_hash(ht,
2215 inst->dst.reg,
2216 inst->dst.reg_offset + i);
2217 }
2218 } else {
2219 struct hash_entry *entry =
2220 get_dead_code_hash_entry(ht, inst->dst.reg,
2221 inst->dst.reg_offset);
2222
2223 if (inst->is_partial_write()) {
2224 /* For a partial write, we can't remove any previous dead code
2225              * candidate, since we're just modifying its result, but we can
2226              * be dead code eliminated ourselves.
2227 */
2228 if (entry) {
2229 entry->data = inst;
2230 } else {
2231 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2232 inst);
2233 }
2234 } else {
2235 if (entry) {
2236 /* We're completely updating a channel, and there was a
2237 * previous write to the channel that wasn't read. Kill it!
2238 */
2239 fs_inst *inst = (fs_inst *)entry->data;
2240 inst->remove();
2241 progress = true;
2242 _mesa_hash_table_remove(ht, entry);
2243 }
2244
2245 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2246 inst);
2247 }
2248 }
2249 }
2250 }
2251
2252 _mesa_hash_table_destroy(ht, NULL);
2253
2254 if (progress)
2255 invalidate_live_intervals();
2256
2257 return progress;
2258 }
2259
2260 /**
2261 * Implements a second type of register coalescing: This one checks if
2262 * the two regs involved in a raw move don't interfere, in which case
2263  * they can both be stored in the same place and the MOV removed.
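 *
 * Sketch (not actual IR): if vgrf3 and vgrf4 don't interfere, then given
 *
 *    add vgrf3, vgrf1, vgrf2
 *    mov vgrf4, vgrf3
 *
 * every reference to vgrf3 is redirected to vgrf4 and the MOV is removed.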
2264 */
2265 bool
2266 fs_visitor::register_coalesce_2()
2267 {
2268 bool progress = false;
2269
2270 calculate_live_intervals();
2271
2272 foreach_list_safe(node, &this->instructions) {
2273 fs_inst *inst = (fs_inst *)node;
2274
2275 if (inst->opcode != BRW_OPCODE_MOV ||
2276 inst->is_partial_write() ||
2277 inst->saturate ||
2278 inst->src[0].file != GRF ||
2279 inst->src[0].negate ||
2280 inst->src[0].abs ||
2281 inst->src[0].smear != -1 ||
2282 inst->dst.file != GRF ||
2283 inst->dst.type != inst->src[0].type ||
2284 virtual_grf_sizes[inst->src[0].reg] != 1) {
2285 continue;
2286 }
2287
2288 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2289 int var_to = live_intervals->var_from_reg(&inst->dst);
2290
2291 if (live_intervals->vars_interfere(var_from, var_to))
2292 continue;
2293
2294 int reg_from = inst->src[0].reg;
2295 assert(inst->src[0].reg_offset == 0);
2296 int reg_to = inst->dst.reg;
2297 int reg_to_offset = inst->dst.reg_offset;
2298
2299 foreach_list(node, &this->instructions) {
2300 fs_inst *scan_inst = (fs_inst *)node;
2301
2302 if (scan_inst->dst.file == GRF &&
2303 scan_inst->dst.reg == reg_from) {
2304 scan_inst->dst.reg = reg_to;
2305 scan_inst->dst.reg_offset = reg_to_offset;
2306 }
2307 for (int i = 0; i < 3; i++) {
2308 if (scan_inst->src[i].file == GRF &&
2309 scan_inst->src[i].reg == reg_from) {
2310 scan_inst->src[i].reg = reg_to;
2311 scan_inst->src[i].reg_offset = reg_to_offset;
2312 }
2313 }
2314 }
2315
2316 inst->remove();
2317 progress = true;
2318 continue;
2319 }
2320
2321 if (progress)
2322 invalidate_live_intervals();
2323
2324 return progress;
2325 }
2326
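/**
 * Coalesces a raw MOV out of a GRF (or uniform) into its later uses, when
 * neither the source nor the destination is overwritten before the end of
 * the program.  Sketch (not actual IR):
 *
 *    mov vgrf4, vgrf3
 *    mul vgrf5, vgrf4, vgrf2   ->   mul vgrf5, vgrf3, vgrf2
 *
 * after which the MOV itself is removed.
 */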
2327 bool
2328 fs_visitor::register_coalesce()
2329 {
2330 bool progress = false;
2331 int if_depth = 0;
2332 int loop_depth = 0;
2333
2334 foreach_list_safe(node, &this->instructions) {
2335 fs_inst *inst = (fs_inst *)node;
2336
2337 /* Make sure that we dominate the instructions we're going to
2338 * scan for interfering with our coalescing, or we won't have
2339 * scanned enough to see if anything interferes with our
2340 * coalescing. We don't dominate the following instructions if
2341 * we're in a loop or an if block.
2342 */
2343 switch (inst->opcode) {
2344 case BRW_OPCODE_DO:
2345 loop_depth++;
2346 break;
2347 case BRW_OPCODE_WHILE:
2348 loop_depth--;
2349 break;
2350 case BRW_OPCODE_IF:
2351 if_depth++;
2352 break;
2353 case BRW_OPCODE_ENDIF:
2354 if_depth--;
2355 break;
2356 default:
2357 break;
2358 }
2359 if (loop_depth || if_depth)
2360 continue;
2361
2362 if (inst->opcode != BRW_OPCODE_MOV ||
2363 inst->is_partial_write() ||
2364 inst->saturate ||
2365 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2366                                  inst->src[0].file != UNIFORM) ||
2367 inst->dst.type != inst->src[0].type)
2368 continue;
2369
2370 bool has_source_modifiers = (inst->src[0].abs ||
2371 inst->src[0].negate ||
2372 inst->src[0].smear != -1 ||
2373 inst->src[0].file == UNIFORM);
2374
2375 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2376 * them: check for no writes to either one until the exit of the
2377 * program.
2378 */
2379 bool interfered = false;
2380
2381 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2382 !scan_inst->is_tail_sentinel();
2383 scan_inst = (fs_inst *)scan_inst->next) {
2384 if (scan_inst->dst.file == GRF) {
2385 if (scan_inst->overwrites_reg(inst->dst) ||
2386 scan_inst->overwrites_reg(inst->src[0])) {
2387 interfered = true;
2388 break;
2389 }
2390 }
2391
2392 if (has_source_modifiers) {
2393 for (int i = 0; i < 3; i++) {
2394 if (scan_inst->src[i].file == GRF &&
2395 scan_inst->src[i].reg == inst->dst.reg &&
2396 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2397 inst->dst.type != scan_inst->src[i].type)
2398 {
2399 interfered = true;
2400 break;
2401 }
2402 }
2403 }
2404
2405
2406 /* The gen6 MATH instruction can't handle source modifiers or
2407 * unusual register regions, so avoid coalescing those for
2408 * now. We should do something more specific.
2409 */
2410 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2411 interfered = true;
2412 break;
2413 }
2414
2415 if (scan_inst->mlen > 0 && scan_inst->base_mrf == -1 &&
2416 scan_inst->src[0].file == GRF &&
2417 scan_inst->src[0].reg == inst->dst.reg) {
2418 interfered = true;
2419 break;
2420 }
2421
2422 /* The accumulator result appears to get used for the
2423 * conditional modifier generation. When negating a UD
2424 * value, there is a 33rd bit generated for the sign in the
2425 * accumulator value, so now you can't check, for example,
2426 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2427 */
2428 if (scan_inst->conditional_mod &&
2429 inst->src[0].negate &&
2430 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2431 interfered = true;
2432 break;
2433 }
2434 }
2435 if (interfered) {
2436 continue;
2437 }
2438
2439 /* Rewrite the later usage to point at the source of the move to
2440 * be removed.
2441 */
2442 for (fs_inst *scan_inst = inst;
2443 !scan_inst->is_tail_sentinel();
2444 scan_inst = (fs_inst *)scan_inst->next) {
2445 for (int i = 0; i < 3; i++) {
2446 if (scan_inst->src[i].file == GRF &&
2447 scan_inst->src[i].reg == inst->dst.reg &&
2448 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2449 fs_reg new_src = inst->src[0];
2450 if (scan_inst->src[i].abs) {
2451 new_src.negate = 0;
2452 new_src.abs = 1;
2453 }
2454 new_src.negate ^= scan_inst->src[i].negate;
2455 new_src.sechalf = scan_inst->src[i].sechalf;
2456 scan_inst->src[i] = new_src;
2457 }
2458 }
2459 }
2460
2461 inst->remove();
2462 progress = true;
2463 }
2464
2465 if (progress)
2466 invalidate_live_intervals();
2467
2468 return progress;
2469 }
2470
2471
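/**
 * Rewrites the instruction that computed a GRF value so that it writes
 * directly into the MRF a later MOV copies it to, then removes the MOV.
 * Sketch (not actual IR):
 *
 *    add vgrf3, vgrf1, vgrf2
 *    mov m4, vgrf3
 *
 * becomes a single "add m4, vgrf1, vgrf2".
 */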
2472 bool
2473 fs_visitor::compute_to_mrf()
2474 {
2475 bool progress = false;
2476 int next_ip = 0;
2477
2478 calculate_live_intervals();
2479
2480 foreach_list_safe(node, &this->instructions) {
2481 fs_inst *inst = (fs_inst *)node;
2482
2483 int ip = next_ip;
2484 next_ip++;
2485
2486 if (inst->opcode != BRW_OPCODE_MOV ||
2487 inst->is_partial_write() ||
2488 inst->dst.file != MRF || inst->src[0].file != GRF ||
2489 inst->dst.type != inst->src[0].type ||
2490 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2491 continue;
2492
2493 /* Work out which hardware MRF registers are written by this
2494 * instruction.
2495 */
2496 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2497 int mrf_high;
2498 if (inst->dst.reg & BRW_MRF_COMPR4) {
2499 mrf_high = mrf_low + 4;
2500 } else if (dispatch_width == 16 &&
2501 (!inst->force_uncompressed && !inst->force_sechalf)) {
2502 mrf_high = mrf_low + 1;
2503 } else {
2504 mrf_high = mrf_low;
2505 }
2506
2507 /* Can't compute-to-MRF this GRF if someone else was going to
2508 * read it later.
2509 */
2510 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2511 continue;
2512
2513 /* Found a move of a GRF to a MRF. Let's see if we can go
2514 * rewrite the thing that made this GRF to write into the MRF.
2515 */
2516 fs_inst *scan_inst;
2517 for (scan_inst = (fs_inst *)inst->prev;
2518 scan_inst->prev != NULL;
2519 scan_inst = (fs_inst *)scan_inst->prev) {
2520 if (scan_inst->dst.file == GRF &&
2521 scan_inst->dst.reg == inst->src[0].reg) {
2522 /* Found the last thing to write our reg we want to turn
2523 * into a compute-to-MRF.
2524 */
2525
2526 /* If this one instruction didn't populate all the
2527 * channels, bail. We might be able to rewrite everything
2528 * that writes that reg, but it would require smarter
2529 * tracking to delay the rewriting until complete success.
2530 */
2531 if (scan_inst->is_partial_write())
2532 break;
2533
2534 /* Things returning more than one register would need us to
2535 * understand coalescing out more than one MOV at a time.
2536 */
2537 if (scan_inst->regs_written > 1)
2538 break;
2539
2540 /* SEND instructions can't have MRF as a destination. */
2541 if (scan_inst->mlen)
2542 break;
2543
2544 if (brw->gen == 6) {
2545 /* gen6 math instructions must have the destination be
2546 * GRF, so no compute-to-MRF for them.
2547 */
2548 if (scan_inst->is_math()) {
2549 break;
2550 }
2551 }
2552
2553 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2554 /* Found the creator of our MRF's source value. */
2555 scan_inst->dst.file = MRF;
2556 scan_inst->dst.reg = inst->dst.reg;
2557 scan_inst->saturate |= inst->saturate;
2558 inst->remove();
2559 progress = true;
2560 }
2561 break;
2562 }
2563
2564 /* We don't handle control flow here. Most computation of
2565 * values that end up in MRFs are shortly before the MRF
2566 * write anyway.
2567 */
2568 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2569 break;
2570
2571 /* You can't read from an MRF, so if someone else reads our
2572 * MRF's source GRF that we wanted to rewrite, that stops us.
2573 */
2574 bool interfered = false;
2575 for (int i = 0; i < 3; i++) {
2576 if (scan_inst->src[i].file == GRF &&
2577 scan_inst->src[i].reg == inst->src[0].reg &&
2578 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2579 interfered = true;
2580 }
2581 }
2582 if (interfered)
2583 break;
2584
2585 if (scan_inst->dst.file == MRF) {
2586 /* If somebody else writes our MRF here, we can't
2587 * compute-to-MRF before that.
2588 */
2589 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2590 int scan_mrf_high;
2591
2592 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2593 scan_mrf_high = scan_mrf_low + 4;
2594 } else if (dispatch_width == 16 &&
2595 (!scan_inst->force_uncompressed &&
2596 !scan_inst->force_sechalf)) {
2597 scan_mrf_high = scan_mrf_low + 1;
2598 } else {
2599 scan_mrf_high = scan_mrf_low;
2600 }
2601
2602 if (mrf_low == scan_mrf_low ||
2603 mrf_low == scan_mrf_high ||
2604 mrf_high == scan_mrf_low ||
2605 mrf_high == scan_mrf_high) {
2606 break;
2607 }
2608 }
2609
2610 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2611 /* Found a SEND instruction, which means that there are
2612 * live values in MRFs from base_mrf to base_mrf +
2613 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2614 * above it.
2615 */
2616 if (mrf_low >= scan_inst->base_mrf &&
2617 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2618 break;
2619 }
2620 if (mrf_high >= scan_inst->base_mrf &&
2621 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2622 break;
2623 }
2624 }
2625 }
2626 }
2627
2628 if (progress)
2629 invalidate_live_intervals();
2630
2631 return progress;
2632 }
2633
2634 /**
2635 * Walks through basic blocks, looking for repeated MRF writes and
2636 * removing the later ones.
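 *
 * Sketch (not actual IR): the second MOV below is removed, provided neither
 * m2 nor vgrf3 is written in between:
 *
 *    mov m2, vgrf3
 *    ...
 *    mov m2, vgrf3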
2637 */
2638 bool
2639 fs_visitor::remove_duplicate_mrf_writes()
2640 {
2641 fs_inst *last_mrf_move[16];
2642 bool progress = false;
2643
2644    /* We'd need to update the MRF tracking to handle compressed instructions. */
2645 if (dispatch_width == 16)
2646 return false;
2647
2648 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2649
2650 foreach_list_safe(node, &this->instructions) {
2651 fs_inst *inst = (fs_inst *)node;
2652
2653 if (inst->is_control_flow()) {
2654 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2655 }
2656
2657 if (inst->opcode == BRW_OPCODE_MOV &&
2658 inst->dst.file == MRF) {
2659 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2660 if (prev_inst && inst->equals(prev_inst)) {
2661 inst->remove();
2662 progress = true;
2663 continue;
2664 }
2665 }
2666
2667 /* Clear out the last-write records for MRFs that were overwritten. */
2668 if (inst->dst.file == MRF) {
2669 last_mrf_move[inst->dst.reg] = NULL;
2670 }
2671
2672 if (inst->mlen > 0 && inst->base_mrf != -1) {
2673 /* Found a SEND instruction, which will include two or fewer
2674 * implied MRF writes. We could do better here.
2675 */
2676 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2677 last_mrf_move[inst->base_mrf + i] = NULL;
2678 }
2679 }
2680
2681 /* Clear out any MRF move records whose sources got overwritten. */
2682 if (inst->dst.file == GRF) {
2683 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2684 if (last_mrf_move[i] &&
2685 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2686 last_mrf_move[i] = NULL;
2687 }
2688 }
2689 }
2690
2691 if (inst->opcode == BRW_OPCODE_MOV &&
2692 inst->dst.file == MRF &&
2693 inst->src[0].file == GRF &&
2694 !inst->is_partial_write()) {
2695 last_mrf_move[inst->dst.reg] = inst;
2696 }
2697 }
2698
2699 if (progress)
2700 invalidate_live_intervals();
2701
2702 return progress;
2703 }
2704
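/**
 * Clears the needs-dependency flag for any register in [first_grf,
 * first_grf + grf_len) that this instruction reads, since a read of the
 * register satisfies the dependency the gen4 send workarounds are tracking.
 */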
2705 static void
2706 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2707 int first_grf, int grf_len)
2708 {
2709 bool inst_16wide = (dispatch_width > 8 &&
2710 !inst->force_uncompressed &&
2711 !inst->force_sechalf);
2712
2713 /* Clear the flag for registers that actually got read (as expected). */
2714 for (int i = 0; i < 3; i++) {
2715 int grf;
2716 if (inst->src[i].file == GRF) {
2717 grf = inst->src[i].reg;
2718 } else if (inst->src[i].file == HW_REG &&
2719 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2720 grf = inst->src[i].fixed_hw_reg.nr;
2721 } else {
2722 continue;
2723 }
2724
2725 if (grf >= first_grf &&
2726 grf < first_grf + grf_len) {
2727 deps[grf - first_grf] = false;
2728 if (inst_16wide)
2729 deps[grf - first_grf + 1] = false;
2730 }
2731 }
2732 }
2733
2734 /**
2735 * Implements this workaround for the original 965:
2736 *
2737 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2738 * check for post destination dependencies on this instruction, software
2739 * must ensure that there is no destination hazard for the case of ‘write
2740 * followed by a posted write’ shown in the following example.
2741 *
2742 * 1. mov r3 0
2743 * 2. send r3.xy <rest of send instruction>
2744 * 3. mov r2 r3
2745 *
2746 * Due to no post-destination dependency check on the ‘send’, the above
2747 * code sequence could have two instructions (1 and 2) in flight at the
2748 * same time that both consider ‘r3’ as the target of their final writes.
2749 */
2750 void
2751 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2752 {
2753 int reg_size = dispatch_width / 8;
2754 int write_len = inst->regs_written * reg_size;
2755 int first_write_grf = inst->dst.reg;
2756 bool needs_dep[BRW_MAX_MRF];
2757 assert(write_len < (int)sizeof(needs_dep) - 1);
2758
2759 memset(needs_dep, false, sizeof(needs_dep));
2760 memset(needs_dep, true, write_len);
2761
2762 clear_deps_for_inst_src(inst, dispatch_width,
2763 needs_dep, first_write_grf, write_len);
2764
2765 /* Walk backwards looking for writes to registers we're writing which
2766 * aren't read since being written. If we hit the start of the program,
2767 * we assume that there are no outstanding dependencies on entry to the
2768 * program.
2769 */
2770 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2771 scan_inst != NULL;
2772 scan_inst = (fs_inst *)scan_inst->prev) {
2773
2774 /* If we hit control flow, assume that there *are* outstanding
2775 * dependencies, and force their cleanup before our instruction.
2776 */
2777 if (scan_inst->is_control_flow()) {
2778 for (int i = 0; i < write_len; i++) {
2779 if (needs_dep[i]) {
2780 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2781 }
2782 }
2783 return;
2784 }
2785
2786 bool scan_inst_16wide = (dispatch_width > 8 &&
2787 !scan_inst->force_uncompressed &&
2788 !scan_inst->force_sechalf);
2789
2790 /* We insert our reads as late as possible on the assumption that any
2791 * instruction but a MOV that might have left us an outstanding
2792 * dependency has more latency than a MOV.
2793 */
2794 if (scan_inst->dst.file == GRF) {
2795 for (int i = 0; i < scan_inst->regs_written; i++) {
2796 int reg = scan_inst->dst.reg + i * reg_size;
2797
2798 if (reg >= first_write_grf &&
2799 reg < first_write_grf + write_len &&
2800 needs_dep[reg - first_write_grf]) {
2801 inst->insert_before(DEP_RESOLVE_MOV(reg));
2802 needs_dep[reg - first_write_grf] = false;
2803 if (scan_inst_16wide)
2804 needs_dep[reg - first_write_grf + 1] = false;
2805 }
2806 }
2807 }
2808
2809 /* Clear the flag for registers that actually got read (as expected). */
2810 clear_deps_for_inst_src(scan_inst, dispatch_width,
2811 needs_dep, first_write_grf, write_len);
2812
2813 /* Continue the loop only if we haven't resolved all the dependencies */
2814 int i;
2815 for (i = 0; i < write_len; i++) {
2816 if (needs_dep[i])
2817 break;
2818 }
2819 if (i == write_len)
2820 return;
2821 }
2822 }
2823
2824 /**
2825 * Implements this workaround for the original 965:
2826 *
2827 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2828 * used as a destination register until after it has been sourced by an
2829 * instruction with a different destination register.
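 *
 * The pass below walks forward from each send and inserts a
 * dependency-resolving MOV that sources any still-pending destination
 * register before a later instruction is allowed to overwrite it.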
2830 */
2831 void
2832 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2833 {
2834 int write_len = inst->regs_written * dispatch_width / 8;
2835 int first_write_grf = inst->dst.reg;
2836 bool needs_dep[BRW_MAX_MRF];
2837 assert(write_len < (int)sizeof(needs_dep) - 1);
2838
2839 memset(needs_dep, false, sizeof(needs_dep));
2840 memset(needs_dep, true, write_len);
2841 /* Walk forwards looking for writes to registers we're writing which aren't
2842 * read before being written.
2843 */
2844 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2845 !scan_inst->is_tail_sentinel();
2846 scan_inst = (fs_inst *)scan_inst->next) {
2847 /* If we hit control flow, force resolve all remaining dependencies. */
2848 if (scan_inst->is_control_flow()) {
2849 for (int i = 0; i < write_len; i++) {
2850 if (needs_dep[i])
2851 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2852 }
2853 return;
2854 }
2855
2856 /* Clear the flag for registers that actually got read (as expected). */
2857 clear_deps_for_inst_src(scan_inst, dispatch_width,
2858 needs_dep, first_write_grf, write_len);
2859
2860 /* We insert our reads as late as possible since they're reading the
2861 * result of a SEND, which has massive latency.
2862 */
2863 if (scan_inst->dst.file == GRF &&
2864 scan_inst->dst.reg >= first_write_grf &&
2865 scan_inst->dst.reg < first_write_grf + write_len &&
2866 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2867 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2868 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2869 }
2870
2871 /* Continue the loop only if we haven't resolved all the dependencies */
2872 int i;
2873 for (i = 0; i < write_len; i++) {
2874 if (needs_dep[i])
2875 break;
2876 }
2877 if (i == write_len)
2878 return;
2879 }
2880
2881 /* If we hit the end of the program, resolve all remaining dependencies out
2882 * of paranoia.
2883 */
2884 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2885 assert(last_inst->eot);
2886 for (int i = 0; i < write_len; i++) {
2887 if (needs_dep[i])
2888 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2889 }
2890 }
2891
2892 void
2893 fs_visitor::insert_gen4_send_dependency_workarounds()
2894 {
2895 if (brw->gen != 4 || brw->is_g4x)
2896 return;
2897
2898 /* Note that we're done with register allocation, so GRF fs_regs always
2899 * have a .reg_offset of 0.
2900 */
2901
2902 foreach_list_safe(node, &this->instructions) {
2903 fs_inst *inst = (fs_inst *)node;
2904
2905 if (inst->mlen != 0 && inst->dst.file == GRF) {
2906 insert_gen4_pre_send_dependency_workarounds(inst);
2907 insert_gen4_post_send_dependency_workarounds(inst);
2908 }
2909 }
2910 }
2911
2912 /**
2913 * Turns the generic expression-style uniform pull constant load instruction
2914 * into a hardware-specific series of instructions for loading a pull
2915 * constant.
2916 *
2917 * The expression style allows the CSE pass before this to optimize out
2918 * repeated loads from the same offset, and gives the pre-register-allocation
2919 * scheduling full flexibility, while the conversion to native instructions
2920 * allows the post-register-allocation scheduler the best information
2921 * possible.
2922 *
2923 * Note that execution masking for setting up pull constant loads is special:
2924 * the channels that need to be written are unrelated to the current execution
2925 * mask, since a later instruction will use one of the result channels as a
2926 * source operand for all 8 or 16 of its channels.
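 *
 * Sketch of the gen7 lowering (not actual IR):
 *
 *    uniform_pull_constant_load dst, surf_index, byte_offset
 *
 * becomes roughly
 *
 *    set_simd4x2_offset payload, byte_offset / 4
 *    uniform_pull_constant_load_gen7 dst, surf_index, payload
 *
 * while pre-gen7 hardware keeps the original opcode and just gets an MRF
 * assigned for its message.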
2927 */
2928 void
2929 fs_visitor::lower_uniform_pull_constant_loads()
2930 {
2931 foreach_list(node, &this->instructions) {
2932 fs_inst *inst = (fs_inst *)node;
2933
2934 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2935 continue;
2936
2937 if (brw->gen >= 7) {
2938 /* The offset arg before was a vec4-aligned byte offset. We need to
2939 * turn it into a dword offset.
2940 */
2941 fs_reg const_offset_reg = inst->src[1];
2942 assert(const_offset_reg.file == IMM &&
2943 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2944 const_offset_reg.imm.u /= 4;
2945 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2946
2947 /* This is actually going to be a MOV, but since only the first dword
2948 * is accessed, we have a special opcode to do just that one. Note
2949 * that this needs to be an operation that will be considered a def
2950 * by live variable analysis, or register allocation will explode.
2951 */
2952 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2953 payload, const_offset_reg);
2954 setup->force_writemask_all = true;
2955
2956 setup->ir = inst->ir;
2957 setup->annotation = inst->annotation;
2958 inst->insert_before(setup);
2959
2960 /* Similarly, this will only populate the first 4 channels of the
2961 * result register (since we only use smear values from 0-3), but we
2962 * don't tell the optimizer.
2963 */
2964 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2965 inst->src[1] = payload;
2966
2967 invalidate_live_intervals();
2968 } else {
2969 /* Before register allocation, we didn't tell the scheduler about the
2970 * MRF we use. We know it's safe to use this MRF because nothing
2971 * else does except for register spill/unspill, which generates and
2972 * uses its MRF within a single IR instruction.
2973 */
2974 inst->base_mrf = 14;
2975 inst->mlen = 1;
2976 }
2977 }
2978 }
2979
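/**
 * Prints one instruction in a human-readable form for debugging; the output
 * is roughly of the form (sketch):
 *
 *    (+f0.0) add.sat vgrf5, vgrf3, -vgrf4, (null)
 */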
2980 void
2981 fs_visitor::dump_instruction(backend_instruction *be_inst)
2982 {
2983 fs_inst *inst = (fs_inst *)be_inst;
2984
2985 if (inst->predicate) {
2986 printf("(%cf0.%d) ",
2987 inst->predicate_inverse ? '-' : '+',
2988 inst->flag_subreg);
2989 }
2990
2991 printf("%s", brw_instruction_name(inst->opcode));
2992 if (inst->saturate)
2993 printf(".sat");
2994 if (inst->conditional_mod) {
2995 printf(".cmod");
2996 if (!inst->predicate &&
2997 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2998 inst->opcode != BRW_OPCODE_IF &&
2999 inst->opcode != BRW_OPCODE_WHILE))) {
3000 printf(".f0.%d", inst->flag_subreg);
3001 }
3002 }
3003 printf(" ");
3004
3005
3006 switch (inst->dst.file) {
3007 case GRF:
3008 printf("vgrf%d", inst->dst.reg);
3009 if (inst->dst.reg_offset)
3010 printf("+%d", inst->dst.reg_offset);
3011 break;
3012 case MRF:
3013 printf("m%d", inst->dst.reg);
3014 break;
3015 case BAD_FILE:
3016 printf("(null)");
3017 break;
3018 case UNIFORM:
3019 printf("***u%d***", inst->dst.reg);
3020 break;
3021 case HW_REG:
3022 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
3023 if (inst->dst.fixed_hw_reg.subnr)
3024 printf("+%d", inst->dst.fixed_hw_reg.subnr);
3025 break;
3026 default:
3027 printf("???");
3028 break;
3029 }
3030 printf(", ");
3031
3032 for (int i = 0; i < 3; i++) {
3033 if (inst->src[i].negate)
3034 printf("-");
3035 if (inst->src[i].abs)
3036 printf("|");
3037 switch (inst->src[i].file) {
3038 case GRF:
3039 printf("vgrf%d", inst->src[i].reg);
3040 if (inst->src[i].reg_offset)
3041 printf("+%d", inst->src[i].reg_offset);
3042 break;
3043 case MRF:
3044 printf("***m%d***", inst->src[i].reg);
3045 break;
3046 case UNIFORM:
3047 printf("u%d", inst->src[i].reg);
3048 if (inst->src[i].reg_offset)
3049 printf(".%d", inst->src[i].reg_offset);
3050 break;
3051 case BAD_FILE:
3052 printf("(null)");
3053 break;
3054 case IMM:
3055 switch (inst->src[i].type) {
3056 case BRW_REGISTER_TYPE_F:
3057 printf("%ff", inst->src[i].imm.f);
3058 break;
3059 case BRW_REGISTER_TYPE_D:
3060 printf("%dd", inst->src[i].imm.i);
3061 break;
3062 case BRW_REGISTER_TYPE_UD:
3063 printf("%uu", inst->src[i].imm.u);
3064 break;
3065 default:
3066 printf("???");
3067 break;
3068 }
3069 break;
3070 case HW_REG:
3071 if (inst->src[i].fixed_hw_reg.negate)
3072 printf("-");
3073 if (inst->src[i].fixed_hw_reg.abs)
3074 printf("|");
3075 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3076 if (inst->src[i].fixed_hw_reg.subnr)
3077 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
3078 if (inst->src[i].fixed_hw_reg.abs)
3079 printf("|");
3080 break;
3081 default:
3082 printf("???");
3083 break;
3084 }
3085 if (inst->src[i].abs)
3086 printf("|");
3087
3088 if (i < 3)
3089 printf(", ");
3090 }
3091
3092 printf(" ");
3093
3094 if (inst->force_uncompressed)
3095 printf("1sthalf ");
3096
3097 if (inst->force_sechalf)
3098 printf("2ndhalf ");
3099
3100 printf("\n");
3101 }
3102
3103 /**
3104 * Possibly returns an instruction that set up @param reg.
3105 *
3106 * Sometimes we want to take the result of some expression/variable
3107 * dereference tree and rewrite the instruction generating the result
3108 * of the tree. When processing the tree, we know that the
3109 * instructions generated are all writing temporaries that are dead
3110 * outside of this tree. So, if we have some instructions that write
3111 * a temporary, we're free to point that temp write somewhere else.
3112 *
3113  * Note that this doesn't guarantee that the returned instruction wrote
3114  * only reg -- it might be the size=4 destination of a texture instruction.
3115 */
3116 fs_inst *
3117 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3118 fs_inst *end,
3119 fs_reg reg)
3120 {
3121 if (end == start ||
3122 end->is_partial_write() ||
3123 reg.reladdr ||
3124 !reg.equals(end->dst)) {
3125 return NULL;
3126 } else {
3127 return end;
3128 }
3129 }
3130
3131 void
3132 fs_visitor::setup_payload_gen6()
3133 {
3134 bool uses_depth =
3135 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3136 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3137
3138 assert(brw->gen >= 6);
3139
3140 /* R0-1: masks, pixel X/Y coordinates. */
3141 c->nr_payload_regs = 2;
3142    /* R2: only for 32-pixel dispatch. */
3143
3144 /* R3-26: barycentric interpolation coordinates. These appear in the
3145 * same order that they appear in the brw_wm_barycentric_interp_mode
3146 * enum. Each set of coordinates occupies 2 registers if dispatch width
3147 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3148 * appear if they were enabled using the "Barycentric Interpolation
3149 * Mode" bits in WM_STATE.
3150 */
3151 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3152 if (barycentric_interp_modes & (1 << i)) {
3153 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3154 c->nr_payload_regs += 2;
3155 if (dispatch_width == 16) {
3156 c->nr_payload_regs += 2;
3157 }
3158 }
3159 }
3160
3161 /* R27: interpolated depth if uses source depth */
3162 if (uses_depth) {
3163 c->source_depth_reg = c->nr_payload_regs;
3164 c->nr_payload_regs++;
3165 if (dispatch_width == 16) {
3166 /* R28: interpolated depth if not 8-wide. */
3167 c->nr_payload_regs++;
3168 }
3169 }
3170 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3171 if (uses_depth) {
3172 c->source_w_reg = c->nr_payload_regs;
3173 c->nr_payload_regs++;
3174 if (dispatch_width == 16) {
3175 /* R30: interpolated W if not 8-wide. */
3176 c->nr_payload_regs++;
3177 }
3178 }
3179
3180 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3181 /* R31: MSAA position offsets. */
3182 if (c->prog_data.uses_pos_offset) {
3183 c->sample_pos_reg = c->nr_payload_regs;
3184 c->nr_payload_regs++;
3185 }
3186
3187 /* R32-: bary for 32-pixel. */
3188 /* R58-59: interp W for 32-pixel. */
3189
3190 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3191 c->source_depth_to_render_target = true;
3192 }
3193 }
3194
3195 void
3196 fs_visitor::assign_binding_table_offsets()
3197 {
3198 uint32_t next_binding_table_offset = 0;
3199
3200 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3201 next_binding_table_offset += c->key.nr_color_regions;
3202
3203 assign_common_binding_table_offsets(next_binding_table_offset);
3204 }
3205
3206 bool
3207 fs_visitor::run()
3208 {
3209 sanity_param_count = fp->Base.Parameters->NumParameters;
3210 uint32_t orig_nr_params = c->prog_data.nr_params;
3211
3212 assign_binding_table_offsets();
3213
3214 if (brw->gen >= 6)
3215 setup_payload_gen6();
3216 else
3217 setup_payload_gen4();
3218
3219 if (0) {
3220 emit_dummy_fs();
3221 } else {
3222 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3223 emit_shader_time_begin();
3224
3225 calculate_urb_setup();
3226 if (fp->Base.InputsRead > 0) {
3227 if (brw->gen < 6)
3228 emit_interpolation_setup_gen4();
3229 else
3230 emit_interpolation_setup_gen6();
3231 }
3232
3233 /* We handle discards by keeping track of the still-live pixels in f0.1.
3234 * Initialize it with the dispatched pixels.
3235 */
3236 if (fp->UsesKill) {
3237 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3238 discard_init->flag_subreg = 1;
3239 }
3240
3241 /* Generate FS IR for main(). (the visitor only descends into
3242 * functions called "main").
3243 */
3244 if (shader) {
3245 foreach_list(node, &*shader->ir) {
3246 ir_instruction *ir = (ir_instruction *)node;
3247 base_ir = ir;
3248 this->result = reg_undef;
3249 ir->accept(this);
3250 }
3251 } else {
3252 emit_fragment_program_code();
3253 }
3254 base_ir = NULL;
3255 if (failed)
3256 return false;
3257
3258 emit(FS_OPCODE_PLACEHOLDER_HALT);
3259
3260 emit_fb_writes();
3261
3262 split_virtual_grfs();
3263
3264 move_uniform_array_access_to_pull_constants();
3265 remove_dead_constants();
3266 setup_pull_constants();
3267
3268 bool progress;
3269 do {
3270 progress = false;
3271
3272 compact_virtual_grfs();
3273
3274 progress = remove_duplicate_mrf_writes() || progress;
3275
3276 progress = opt_algebraic() || progress;
3277 progress = opt_cse() || progress;
3278 progress = opt_copy_propagate() || progress;
3279 progress = dead_code_eliminate() || progress;
3280 progress = dead_code_eliminate_local() || progress;
3281 progress = register_coalesce() || progress;
3282 progress = register_coalesce_2() || progress;
3283 progress = compute_to_mrf() || progress;
3284 } while (progress);
3285
3286 schedule_instructions(false);
3287
3288 lower_uniform_pull_constant_loads();
3289
3290 assign_curb_setup();
3291 assign_urb_setup();
3292
3293 if (0)
3294 assign_regs_trivial();
3295 else {
3296 while (!assign_regs()) {
3297 if (failed)
3298 break;
3299 }
3300 }
3301 }
3302 assert(force_uncompressed_stack == 0);
3303 assert(force_sechalf_stack == 0);
3304
3305 /* This must come after all optimization and register allocation, since
3306 * it inserts dead code that happens to have side effects, and it does
3307 * so based on the actual physical registers in use.
3308 */
3309 insert_gen4_send_dependency_workarounds();
3310
3311 if (failed)
3312 return false;
3313
3314 schedule_instructions(true);
3315
3316 if (dispatch_width == 8) {
3317 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3318 } else {
3319 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3320
3321 /* Make sure we didn't try to sneak in an extra uniform */
3322 assert(orig_nr_params == c->prog_data.nr_params);
3323 (void) orig_nr_params;
3324 }
3325
3326 /* If any state parameters were appended, then ParameterValues could have
3327 * been realloced, in which case the driver uniform storage set up by
3328 * _mesa_associate_uniform_storage() would point to freed memory. Make
3329 * sure that didn't happen.
3330 */
3331 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3332
3333 return !failed;
3334 }
3335
3336 const unsigned *
3337 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3338 struct gl_fragment_program *fp,
3339 struct gl_shader_program *prog,
3340 unsigned *final_assembly_size)
3341 {
3342 bool start_busy = false;
3343 float start_time = 0;
3344
3345 if (unlikely(brw->perf_debug)) {
3346 start_busy = (brw->batch.last_bo &&
3347 drm_intel_bo_busy(brw->batch.last_bo));
3348 start_time = get_time();
3349 }
3350
3351 struct brw_shader *shader = NULL;
3352 if (prog)
3353 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3354
3355 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3356 if (prog) {
3357 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3358 _mesa_print_ir(shader->ir, NULL);
3359 printf("\n\n");
3360 } else {
3361 printf("ARB_fragment_program %d ir for native fragment shader\n",
3362 fp->Base.Id);
3363 _mesa_print_program(&fp->Base);
3364 }
3365 }
3366
3367 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3368 */
3369 fs_visitor v(brw, c, prog, fp, 8);
3370 if (!v.run()) {
3371 if (prog) {
3372 prog->LinkStatus = false;
3373 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3374 }
3375
3376 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3377 v.fail_msg);
3378
3379 return NULL;
3380 }
3381
3382 exec_list *simd16_instructions = NULL;
3383 fs_visitor v2(brw, c, prog, fp, 16);
3384 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3385 if (c->prog_data.nr_pull_params == 0) {
3386 /* Try a 16-wide compile */
3387 v2.import_uniforms(&v);
3388 if (!v2.run()) {
3389 perf_debug("16-wide shader failed to compile, falling back to "
3390 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3391 } else {
3392 simd16_instructions = &v2.instructions;
3393 }
3394 } else {
3395 perf_debug("Skipping 16-wide due to pull parameters.\n");
3396 }
3397 }
3398
3399 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3400 const unsigned *generated = g.generate_assembly(&v.instructions,
3401 simd16_instructions,
3402 final_assembly_size);
3403
3404 if (unlikely(brw->perf_debug) && shader) {
3405 if (shader->compiled_once)
3406 brw_wm_debug_recompile(brw, prog, &c->key);
3407 shader->compiled_once = true;
3408
3409 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3410 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3411 (get_time() - start_time) * 1000);
3412 }
3413 }
3414
3415 return generated;
3416 }
3417
3418 bool
3419 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3420 {
3421 struct brw_context *brw = brw_context(ctx);
3422 struct brw_wm_prog_key key;
3423
3424 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3425 return true;
3426
3427 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3428 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3429 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3430 bool program_uses_dfdy = fp->UsesDFdy;
3431
3432 memset(&key, 0, sizeof(key));
3433
3434 if (brw->gen < 6) {
3435 if (fp->UsesKill)
3436 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3437
3438 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3439 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3440
3441 /* Just assume depth testing. */
3442 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3443 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3444 }
3445
3446 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3447 BRW_FS_VARYING_INPUT_MASK) > 16)
3448 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3449
3450 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3451
3452 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3453 for (unsigned i = 0; i < sampler_count; i++) {
3454 if (fp->Base.ShadowSamplers & (1 << i)) {
3455 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3456 key.tex.swizzles[i] =
3457 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3458 } else {
3459 /* Color sampler: assume no swizzling. */
3460 key.tex.swizzles[i] = SWIZZLE_XYZW;
3461 }
3462 }
3463
3464 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3465 key.drawable_height = ctx->DrawBuffer->Height;
3466 }
3467
3468 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3469 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3470 }
3471
3472 key.nr_color_regions = 1;
3473
3474 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3475 * quality of the derivatives is likely to be determined by the driconf
3476 * option.
3477 */
3478 key.high_quality_derivatives = brw->disable_derivative_optimization;
3479
3480 key.program_string_id = bfp->id;
3481
3482 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3483 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3484
3485 bool success = do_wm_prog(brw, prog, bfp, &key);
3486
3487 brw->wm.base.prog_offset = old_prog_offset;
3488 brw->wm.prog_data = old_prog_data;
3489
3490 return success;
3491 }