i965: Use sample barycentric coordinates with per sample shading
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->opcode = BRW_OPCODE_NOP;
59 this->conditional_mod = BRW_CONDITIONAL_NONE;
60
61 this->dst = reg_undef;
62 this->src[0] = reg_undef;
63 this->src[1] = reg_undef;
64 this->src[2] = reg_undef;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68 }
69
70 fs_inst::fs_inst()
71 {
72 init();
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187 ALU2(SEL)
188
189 /** Gen4 predicated IF. */
190 fs_inst *
191 fs_visitor::IF(uint32_t predicate)
192 {
193 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195 return inst;
196 }
197
198 /** Gen6 IF with embedded comparison. */
199 fs_inst *
200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
201 {
202 assert(brw->gen == 6);
203 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
204 reg_null_d, src0, src1);
205 inst->conditional_mod = condition;
206 return inst;
207 }
208
209 /**
210 * CMP: Sets the low bit of the destination channels with the result
211 * of the comparison, while the upper bits are undefined, and updates
212 * the flag register with the packed 16 bits of the result.
213 */
214 fs_inst *
215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
216 {
217 fs_inst *inst;
218
219 /* Take the instruction:
220 *
221 * CMP null<d> src0<f> src1<f>
222 *
223 * Original gen4 does type conversion to the destination type before
224 * comparison, producing garbage results for floating point comparisons.
225 * gen5 does the comparison on the execution type (resolved source types),
226 * so dst type doesn't matter. gen6 does comparison and then uses the
227 * result as if it was the dst type with no conversion, which happens to
228 * mostly work out for float-interpreted-as-int since our comparisons are
229 * for >0, =0, <0.
230 */
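   /* For illustration (a hedged reading of the note above, not a statement
    * from the PRM): if dst were left D-typed on gen4, float sources such as
    * 0.3f and 0.7f would both be converted to integer 0 before the compare,
    * so a CMP.L would report "not less than" even though 0.3f < 0.7f.
    * Retyping dst to the source type below keeps the comparison in float.
    */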
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 exec_list
247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
248 fs_reg varying_offset,
249 uint32_t const_offset)
250 {
251 exec_list instructions;
252 fs_inst *inst;
253
254 /* We have our constant surface use a pitch of 4 bytes, so our index can
255 * be any component of a vector, and then we load 4 contiguous
256 * components starting from that.
257 *
258 * We break down the const_offset to a portion added to the variable
259 * offset and a portion done using reg_offset, which means that if you
260 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
261 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
262 * CSE can later notice that those loads are all the same and eliminate
263 * the redundant ones.
264 */
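   /* For example, with const_offset = 6 the aligned part (6 & ~3 = 4) is
    * added to the variable offset here, so the load fetches the vec4 that
    * contains the value, and the remainder (6 & 3 = 2) is applied to
    * vec4_result.reg_offset below to select the right component.
    */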
265 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
266 instructions.push_tail(ADD(vec4_offset,
267 varying_offset, const_offset & ~3));
268
269 int scale = 1;
270 if (brw->gen == 4 && dispatch_width == 8) {
271 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
272 * u, v, r) as parameters, or we can just use the SIMD16 message
273 * consisting of (header, u). We choose the second, at the cost of a
274 * longer return length.
275 */
276 scale = 2;
277 }
278
279 enum opcode op;
280 if (brw->gen >= 7)
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
282 else
283 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
284 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
285 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
286 inst->regs_written = 4 * scale;
287 instructions.push_tail(inst);
288
289 if (brw->gen < 7) {
290 inst->base_mrf = 13;
291 inst->header_present = true;
292 if (brw->gen == 4)
293 inst->mlen = 3;
294 else
295 inst->mlen = 1 + dispatch_width / 8;
296 }
297
298 vec4_result.reg_offset += (const_offset & 3) * scale;
299 instructions.push_tail(MOV(dst, vec4_result));
300
301 return instructions;
302 }
303
304 /**
305 * A helper for MOV generation for fixing up broken hardware SEND dependency
306 * handling.
307 */
308 fs_inst *
309 fs_visitor::DEP_RESOLVE_MOV(int grf)
310 {
311 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
312
313 inst->ir = NULL;
314 inst->annotation = "send dependency resolve";
315
316 /* The caller always wants uncompressed to emit the minimal extra
317 * dependencies, and to avoid having to deal with aligning its regs to 2.
318 */
319 inst->force_uncompressed = true;
320
321 return inst;
322 }
323
324 bool
325 fs_inst::equals(fs_inst *inst)
326 {
327 return (opcode == inst->opcode &&
328 dst.equals(inst->dst) &&
329 src[0].equals(inst->src[0]) &&
330 src[1].equals(inst->src[1]) &&
331 src[2].equals(inst->src[2]) &&
332 saturate == inst->saturate &&
333 predicate == inst->predicate &&
334 conditional_mod == inst->conditional_mod &&
335 mlen == inst->mlen &&
336 base_mrf == inst->base_mrf &&
337 sampler == inst->sampler &&
338 target == inst->target &&
339 eot == inst->eot &&
340 header_present == inst->header_present &&
341 shadow_compare == inst->shadow_compare &&
342 offset == inst->offset);
343 }
344
345 bool
346 fs_inst::overwrites_reg(const fs_reg &reg)
347 {
348 return (reg.file == dst.file &&
349 reg.reg == dst.reg &&
350 reg.reg_offset >= dst.reg_offset &&
351 reg.reg_offset < dst.reg_offset + regs_written);
352 }
353
354 bool
355 fs_inst::is_send_from_grf()
356 {
357 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
358 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
359 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
360 src[1].file == GRF) ||
361 (is_tex() && src[0].file == GRF));
362 }
363
364 bool
365 fs_visitor::can_do_source_mods(fs_inst *inst)
366 {
367 if (brw->gen == 6 && inst->is_math())
368 return false;
369
370 if (inst->is_send_from_grf())
371 return false;
372
373 if (!inst->can_do_source_mods())
374 return false;
375
376 return true;
377 }
378
379 void
380 fs_reg::init()
381 {
382 memset(this, 0, sizeof(*this));
383 this->smear = -1;
384 }
385
386 /** Generic unset register constructor. */
387 fs_reg::fs_reg()
388 {
389 init();
390 this->file = BAD_FILE;
391 }
392
393 /** Immediate value constructor. */
394 fs_reg::fs_reg(float f)
395 {
396 init();
397 this->file = IMM;
398 this->type = BRW_REGISTER_TYPE_F;
399 this->imm.f = f;
400 }
401
402 /** Immediate value constructor. */
403 fs_reg::fs_reg(int32_t i)
404 {
405 init();
406 this->file = IMM;
407 this->type = BRW_REGISTER_TYPE_D;
408 this->imm.i = i;
409 }
410
411 /** Immediate value constructor. */
412 fs_reg::fs_reg(uint32_t u)
413 {
414 init();
415 this->file = IMM;
416 this->type = BRW_REGISTER_TYPE_UD;
417 this->imm.u = u;
418 }
419
420 /** Fixed brw_reg. */
421 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
422 {
423 init();
424 this->file = HW_REG;
425 this->fixed_hw_reg = fixed_hw_reg;
426 this->type = fixed_hw_reg.type;
427 }
428
429 bool
430 fs_reg::equals(const fs_reg &r) const
431 {
432 return (file == r.file &&
433 reg == r.reg &&
434 reg_offset == r.reg_offset &&
435 type == r.type &&
436 negate == r.negate &&
437 abs == r.abs &&
438 !reladdr && !r.reladdr &&
439 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
440 sizeof(fixed_hw_reg)) == 0 &&
441 smear == r.smear &&
442 imm.u == r.imm.u);
443 }
444
445 fs_reg
446 fs_reg::retype(uint32_t type)
447 {
448 fs_reg result = *this;
449 result.type = type;
450 return result;
451 }
452
453 bool
454 fs_reg::is_zero() const
455 {
456 if (file != IMM)
457 return false;
458
459 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
460 }
461
462 bool
463 fs_reg::is_one() const
464 {
465 if (file != IMM)
466 return false;
467
468 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
469 }
470
471 bool
472 fs_reg::is_null() const
473 {
474 return file == HW_REG &&
475 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
476 fixed_hw_reg.nr == BRW_ARF_NULL;
477 }
478
479 bool
480 fs_reg::is_valid_3src() const
481 {
482 return file == GRF || file == UNIFORM;
483 }
484
485 int
486 fs_visitor::type_size(const struct glsl_type *type)
487 {
488 unsigned int size, i;
489
490 switch (type->base_type) {
491 case GLSL_TYPE_UINT:
492 case GLSL_TYPE_INT:
493 case GLSL_TYPE_FLOAT:
494 case GLSL_TYPE_BOOL:
495 return type->components();
496 case GLSL_TYPE_ARRAY:
497 return type_size(type->fields.array) * type->length;
498 case GLSL_TYPE_STRUCT:
499 size = 0;
500 for (i = 0; i < type->length; i++) {
501 size += type_size(type->fields.structure[i].type);
502 }
503 return size;
504 case GLSL_TYPE_SAMPLER:
505 /* Samplers take up no register space, since they're baked in at
506 * link time.
507 */
508 return 0;
509 case GLSL_TYPE_ATOMIC_UINT:
510 return 0;
511 case GLSL_TYPE_VOID:
512 case GLSL_TYPE_ERROR:
513 case GLSL_TYPE_INTERFACE:
514 assert(!"not reached");
515 break;
516 }
517
518 return 0;
519 }
520
521 fs_reg
522 fs_visitor::get_timestamp()
523 {
524 assert(brw->gen >= 7);
525
526 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
527 BRW_ARF_TIMESTAMP,
528 0),
529 BRW_REGISTER_TYPE_UD));
530
531 fs_reg dst = fs_reg(this, glsl_type::uint_type);
532
533 fs_inst *mov = emit(MOV(dst, ts));
534 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
535 * even if it's not enabled in the dispatch.
536 */
537 mov->force_writemask_all = true;
538 mov->force_uncompressed = true;
539
540 /* The caller wants the low 32 bits of the timestamp. Since it's running
541 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
542 * which is plenty of time for our purposes. It is identical across the
543 * EUs, but since it's tracking GPU core speed it will increment at a
544 * varying rate as render P-states change.
545 *
546 * The caller could also check if render P-states have changed (or anything
547 * else that might disrupt timing) by setting smear to 2 and checking if
548 * that field is != 0.
549 */
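   /* Rough arithmetic behind the "~3 seconds" figure: 2^32 cycles at
    * ~1.2e9 cycles per second is about 3.6 seconds between rollovers.
    */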
550 dst.smear = 0;
551
552 return dst;
553 }
554
555 void
556 fs_visitor::emit_shader_time_begin()
557 {
558 current_annotation = "shader time start";
559 shader_start_time = get_timestamp();
560 }
561
562 void
563 fs_visitor::emit_shader_time_end()
564 {
565 current_annotation = "shader time end";
566
567 enum shader_time_shader_type type, written_type, reset_type;
568 if (dispatch_width == 8) {
569 type = ST_FS8;
570 written_type = ST_FS8_WRITTEN;
571 reset_type = ST_FS8_RESET;
572 } else {
573 assert(dispatch_width == 16);
574 type = ST_FS16;
575 written_type = ST_FS16_WRITTEN;
576 reset_type = ST_FS16_RESET;
577 }
578
579 fs_reg shader_end_time = get_timestamp();
580
581 /* Check that there weren't any timestamp reset events (assuming these
582 * were the only two timestamp reads that happened).
583 */
584 fs_reg reset = shader_end_time;
585 reset.smear = 2;
586 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
587 test->conditional_mod = BRW_CONDITIONAL_Z;
588 emit(IF(BRW_PREDICATE_NORMAL));
589
590 push_force_uncompressed();
591 fs_reg start = shader_start_time;
592 start.negate = true;
593 fs_reg diff = fs_reg(this, glsl_type::uint_type);
594 emit(ADD(diff, start, shader_end_time));
595
596 /* If there were no instructions between the two timestamp gets, the diff
597 * is 2 cycles. Remove that overhead, so I can forget about that when
598 * trying to determine the time taken for single instructions.
599 */
600 emit(ADD(diff, diff, fs_reg(-2u)));
601
602 emit_shader_time_write(type, diff);
603 emit_shader_time_write(written_type, fs_reg(1u));
604 emit(BRW_OPCODE_ELSE);
605 emit_shader_time_write(reset_type, fs_reg(1u));
606 emit(BRW_OPCODE_ENDIF);
607
608 pop_force_uncompressed();
609 }
610
611 void
612 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
613 fs_reg value)
614 {
615 int shader_time_index =
616 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
617 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
618
619 fs_reg payload;
620 if (dispatch_width == 8)
621 payload = fs_reg(this, glsl_type::uvec2_type);
622 else
623 payload = fs_reg(this, glsl_type::uint_type);
624
625 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
626 fs_reg(), payload, offset, value));
627 }
628
629 void
630 fs_visitor::fail(const char *format, ...)
631 {
632 va_list va;
633 char *msg;
634
635 if (failed)
636 return;
637
638 failed = true;
639
640 va_start(va, format);
641 msg = ralloc_vasprintf(mem_ctx, format, va);
642 va_end(va);
643 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
644
645 this->fail_msg = msg;
646
647 if (INTEL_DEBUG & DEBUG_WM) {
648 fprintf(stderr, "%s", msg);
649 }
650 }
651
652 fs_inst *
653 fs_visitor::emit(enum opcode opcode)
654 {
655 return emit(fs_inst(opcode));
656 }
657
658 fs_inst *
659 fs_visitor::emit(enum opcode opcode, fs_reg dst)
660 {
661 return emit(fs_inst(opcode, dst));
662 }
663
664 fs_inst *
665 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
666 {
667 return emit(fs_inst(opcode, dst, src0));
668 }
669
670 fs_inst *
671 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
672 {
673 return emit(fs_inst(opcode, dst, src0, src1));
674 }
675
676 fs_inst *
677 fs_visitor::emit(enum opcode opcode, fs_reg dst,
678 fs_reg src0, fs_reg src1, fs_reg src2)
679 {
680 return emit(fs_inst(opcode, dst, src0, src1, src2));
681 }
682
683 void
684 fs_visitor::push_force_uncompressed()
685 {
686 force_uncompressed_stack++;
687 }
688
689 void
690 fs_visitor::pop_force_uncompressed()
691 {
692 force_uncompressed_stack--;
693 assert(force_uncompressed_stack >= 0);
694 }
695
696 /**
697 * Returns true if the instruction has a flag that means it won't
698 * update an entire destination register.
699 *
700 * For example, dead code elimination and live variable analysis want to know
701 * when a write to a variable screens off any preceding values that were in
702 * it.
703 */
704 bool
705 fs_inst::is_partial_write()
706 {
707 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
708 this->force_uncompressed ||
709 this->force_sechalf);
710 }
711
712 int
713 fs_inst::regs_read(fs_visitor *v, int arg)
714 {
715 if (is_tex() && arg == 0 && src[0].file == GRF) {
716 if (v->dispatch_width == 16)
717 return (mlen + 1) / 2;
718 else
719 return mlen;
720 }
721 return 1;
722 }
723
724 bool
725 fs_inst::reads_flag()
726 {
727 return predicate;
728 }
729
730 bool
731 fs_inst::writes_flag()
732 {
733 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
734 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
735 }
736
737 /**
738 * Returns how many MRFs an FS opcode will write over.
739 *
740 * Note that this is not the 0 or 1 implied writes in an actual gen
741 * instruction -- the FS opcodes often generate MOVs in addition.
742 */
743 int
744 fs_visitor::implied_mrf_writes(fs_inst *inst)
745 {
746 if (inst->mlen == 0)
747 return 0;
748
749 if (inst->base_mrf == -1)
750 return 0;
751
752 switch (inst->opcode) {
753 case SHADER_OPCODE_RCP:
754 case SHADER_OPCODE_RSQ:
755 case SHADER_OPCODE_SQRT:
756 case SHADER_OPCODE_EXP2:
757 case SHADER_OPCODE_LOG2:
758 case SHADER_OPCODE_SIN:
759 case SHADER_OPCODE_COS:
760 return 1 * dispatch_width / 8;
761 case SHADER_OPCODE_POW:
762 case SHADER_OPCODE_INT_QUOTIENT:
763 case SHADER_OPCODE_INT_REMAINDER:
764 return 2 * dispatch_width / 8;
765 case SHADER_OPCODE_TEX:
766 case FS_OPCODE_TXB:
767 case SHADER_OPCODE_TXD:
768 case SHADER_OPCODE_TXF:
769 case SHADER_OPCODE_TXF_MS:
770 case SHADER_OPCODE_TXF_MCS:
771 case SHADER_OPCODE_TG4:
772 case SHADER_OPCODE_TG4_OFFSET:
773 case SHADER_OPCODE_TXL:
774 case SHADER_OPCODE_TXS:
775 case SHADER_OPCODE_LOD:
776 return 1;
777 case FS_OPCODE_FB_WRITE:
778 return 2;
779 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
780 case SHADER_OPCODE_GEN4_SCRATCH_READ:
781 return 1;
782 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
783 return inst->mlen;
784 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
785 return 2;
786 case SHADER_OPCODE_UNTYPED_ATOMIC:
787 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
788 return 0;
789 default:
790 assert(!"not reached");
791 return inst->mlen;
792 }
793 }
794
795 int
796 fs_visitor::virtual_grf_alloc(int size)
797 {
798 if (virtual_grf_array_size <= virtual_grf_count) {
799 if (virtual_grf_array_size == 0)
800 virtual_grf_array_size = 16;
801 else
802 virtual_grf_array_size *= 2;
803 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
804 virtual_grf_array_size);
805 }
806 virtual_grf_sizes[virtual_grf_count] = size;
807 return virtual_grf_count++;
808 }
809
810 /** Fixed HW reg constructor. */
811 fs_reg::fs_reg(enum register_file file, int reg)
812 {
813 init();
814 this->file = file;
815 this->reg = reg;
816 this->type = BRW_REGISTER_TYPE_F;
817 }
818
819 /** Fixed HW reg constructor. */
820 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
821 {
822 init();
823 this->file = file;
824 this->reg = reg;
825 this->type = type;
826 }
827
828 /** Automatic reg constructor. */
829 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
830 {
831 init();
832
833 this->file = GRF;
834 this->reg = v->virtual_grf_alloc(v->type_size(type));
835 this->reg_offset = 0;
836 this->type = brw_type_for_base_type(type);
837 }
838
839 fs_reg *
840 fs_visitor::variable_storage(ir_variable *var)
841 {
842 return (fs_reg *)hash_table_find(this->variable_ht, var);
843 }
844
845 void
846 import_uniforms_callback(const void *key,
847 void *data,
848 void *closure)
849 {
850 struct hash_table *dst_ht = (struct hash_table *)closure;
851 const fs_reg *reg = (const fs_reg *)data;
852
853 if (reg->file != UNIFORM)
854 return;
855
856 hash_table_insert(dst_ht, data, key);
857 }
858
859 /* For SIMD16, we need to follow from the uniform setup of the SIMD8 dispatch.
860 * This brings in those uniform definitions.
861 */
862 void
863 fs_visitor::import_uniforms(fs_visitor *v)
864 {
865 hash_table_call_foreach(v->variable_ht,
866 import_uniforms_callback,
867 variable_ht);
868 this->params_remap = v->params_remap;
869 this->nr_params_remap = v->nr_params_remap;
870 }
871
872 /* Our support for uniforms is piggy-backed on the struct
873 * gl_fragment_program, because that's where the values actually
874 * get stored, rather than in some global gl_shader_program uniform
875 * store.
876 */
877 void
878 fs_visitor::setup_uniform_values(ir_variable *ir)
879 {
880 int namelen = strlen(ir->name);
881
882 /* The data for our (non-builtin) uniforms is stored in a series of
883 * gl_uniform_driver_storage structs for each subcomponent that
884 * glGetUniformLocation() could name. We know it's been set up in the same
885 * order we'd walk the type, so walk the list of storage and find anything
886 * with our name, or the prefix of a component that starts with our name.
887 */
888 unsigned params_before = c->prog_data.nr_params;
889 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
890 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
891
892 if (strncmp(ir->name, storage->name, namelen) != 0 ||
893 (storage->name[namelen] != 0 &&
894 storage->name[namelen] != '.' &&
895 storage->name[namelen] != '[')) {
896 continue;
897 }
898
899 unsigned slots = storage->type->component_slots();
900 if (storage->array_elements)
901 slots *= storage->array_elements;
902
903 for (unsigned i = 0; i < slots; i++) {
904 c->prog_data.param[c->prog_data.nr_params++] =
905 &storage->storage[i].f;
906 }
907 }
908
909 /* Make sure we actually initialized the right amount of stuff here. */
910 assert(params_before + ir->type->component_slots() ==
911 c->prog_data.nr_params);
912 (void)params_before;
913 }
914
915
916 /* Our support for builtin uniforms is even scarier than non-builtin.
917 * It sits on top of the PROG_STATE_VAR parameters that are
918 * automatically updated from GL context state.
919 */
920 void
921 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
922 {
923 const ir_state_slot *const slots = ir->state_slots;
924 assert(ir->state_slots != NULL);
925
926 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
927 /* This state reference has already been setup by ir_to_mesa, but we'll
928 * get the same index back here.
929 */
930 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
931 (gl_state_index *)slots[i].tokens);
932
933 /* Add each of the unique swizzles of the element as a parameter.
934 * This'll end up matching the expected layout of the
935 * array/matrix/structure we're trying to fill in.
936 */
937 int last_swiz = -1;
938 for (unsigned int j = 0; j < 4; j++) {
939 int swiz = GET_SWZ(slots[i].swizzle, j);
940 if (swiz == last_swiz)
941 break;
942 last_swiz = swiz;
943
944 c->prog_data.param[c->prog_data.nr_params++] =
945 &fp->Base.Parameters->ParameterValues[index][swiz].f;
946 }
947 }
948 }
949
950 fs_reg *
951 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
952 {
953 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
954 fs_reg wpos = *reg;
955 bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
956
957 /* gl_FragCoord.x */
958 if (ir->data.pixel_center_integer) {
959 emit(MOV(wpos, this->pixel_x));
960 } else {
961 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
962 }
963 wpos.reg_offset++;
964
965 /* gl_FragCoord.y */
966 if (!flip && ir->data.pixel_center_integer) {
967 emit(MOV(wpos, this->pixel_y));
968 } else {
969 fs_reg pixel_y = this->pixel_y;
970 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
971
972 if (flip) {
973 pixel_y.negate = true;
974 offset += c->key.drawable_height - 1.0;
975 }
976
977 emit(ADD(wpos, pixel_y, fs_reg(offset)));
978 }
979 wpos.reg_offset++;
980
981 /* gl_FragCoord.z */
982 if (brw->gen >= 6) {
983 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
984 } else {
985 emit(FS_OPCODE_LINTERP, wpos,
986 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
987 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
988 interp_reg(VARYING_SLOT_POS, 2));
989 }
990 wpos.reg_offset++;
991
992 /* gl_FragCoord.w: Already set up in emit_interpolation */
993 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
994
995 return reg;
996 }
997
998 fs_inst *
999 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1000 glsl_interp_qualifier interpolation_mode,
1001 bool is_centroid, bool is_sample)
1002 {
1003 brw_wm_barycentric_interp_mode barycoord_mode;
1004 if (brw->gen >= 6) {
1005 if (is_centroid) {
1006 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1007 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1008 else
1009 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1010 } else if (is_sample) {
1011 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1012 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1013 else
1014 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1015 } else {
1016 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1017 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1018 else
1019 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1020 }
1021 } else {
1022 /* On Ironlake and below, there is only one interpolation mode.
1023 * Centroid interpolation doesn't mean anything on this hardware --
1024 * there is no multisampling.
1025 */
1026 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1027 }
1028 return emit(FS_OPCODE_LINTERP, attr,
1029 this->delta_x[barycoord_mode],
1030 this->delta_y[barycoord_mode], interp);
1031 }
1032
1033 fs_reg *
1034 fs_visitor::emit_general_interpolation(ir_variable *ir)
1035 {
1036 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1037 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1038 fs_reg attr = *reg;
1039
1040 unsigned int array_elements;
1041 const glsl_type *type;
1042
1043 if (ir->type->is_array()) {
1044 array_elements = ir->type->length;
1045 if (array_elements == 0) {
1046 fail("dereferenced array '%s' has length 0\n", ir->name);
1047 }
1048 type = ir->type->fields.array;
1049 } else {
1050 array_elements = 1;
1051 type = ir->type;
1052 }
1053
1054 glsl_interp_qualifier interpolation_mode =
1055 ir->determine_interpolation_mode(c->key.flat_shade);
1056
1057 int location = ir->data.location;
1058 for (unsigned int i = 0; i < array_elements; i++) {
1059 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1060 if (c->prog_data.urb_setup[location] == -1) {
1061 /* If there's no incoming setup data for this slot, don't
1062 * emit interpolation for it.
1063 */
1064 attr.reg_offset += type->vector_elements;
1065 location++;
1066 continue;
1067 }
1068
1069 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1070 /* Constant interpolation (flat shading) case. The SF has
1071 * handed us defined values in only the constant offset
1072 * field of the setup reg.
1073 */
1074 for (unsigned int k = 0; k < type->vector_elements; k++) {
1075 struct brw_reg interp = interp_reg(location, k);
1076 interp = suboffset(interp, 3);
1077 interp.type = reg->type;
1078 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1079 attr.reg_offset++;
1080 }
1081 } else {
1082 /* Smooth/noperspective interpolation case. */
1083 for (unsigned int k = 0; k < type->vector_elements; k++) {
1084 /* FINISHME: At some point we probably want to push
1085 * this farther by giving similar treatment to the
1086 * other potentially constant components of the
1087 * attribute, as well as making brw_vs_constval.c
1088 * handle varyings other than gl_TexCoord.
1089 */
1090 struct brw_reg interp = interp_reg(location, k);
1091 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1092 ir->data.centroid,
1093 ir->data.sample || c->key.persample_shading);
1094 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1095 /* Get the pixel/sample mask into f0 so that we know
1096 * which pixels are lit. Then, for each channel that is
1097 * unlit, replace the centroid data with non-centroid
1098 * data.
1099 */
1100 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1101 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1102 interpolation_mode,
1103 false, false);
1104 inst->predicate = BRW_PREDICATE_NORMAL;
1105 inst->predicate_inverse = true;
1106 }
1107 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1108 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1109 }
1110 attr.reg_offset++;
1111 }
1112
1113 }
1114 location++;
1115 }
1116 }
1117
1118 return reg;
1119 }
1120
1121 fs_reg *
1122 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1123 {
1124 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1125
1126 /* The frontfacing comes in as a bit in the thread payload. */
1127 if (brw->gen >= 6) {
1128 emit(BRW_OPCODE_ASR, *reg,
1129 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1130 fs_reg(15));
1131 emit(BRW_OPCODE_NOT, *reg, *reg);
1132 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1133 } else {
1134 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1135 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1136 * us front face
1137 */
1138 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1139 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1140 }
1141
1142 return reg;
1143 }
1144
1145 void
1146 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1147 {
1148 assert(dst.type == BRW_REGISTER_TYPE_F);
1149
1150 if (c->key.compute_pos_offset) {
1151 /* Convert int_sample_pos to floating point */
1152 emit(MOV(dst, int_sample_pos));
1153 /* Scale to the range [0, 1] */
1154 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
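      /* For example, a payload value of 8 (sixteenths of a pixel, given the
       * 1/16 scale above) maps to the pixel-center position 0.5.
       */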
1155 }
1156 else {
1157 /* From ARB_sample_shading specification:
1158 * "When rendering to a non-multisample buffer, or if multisample
1159 * rasterization is disabled, gl_SamplePosition will always be
1160 * (0.5, 0.5)."
1161 */
1162 emit(MOV(dst, fs_reg(0.5f)));
1163 }
1164 }
1165
1166 fs_reg *
1167 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1168 {
1169 assert(brw->gen >= 6);
1170 assert(ir->type == glsl_type::vec2_type);
1171
1172 this->current_annotation = "compute sample position";
1173 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1174 fs_reg pos = *reg;
1175 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1176 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1177
1178 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1179 * mode will be enabled.
1180 *
1181 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1182 * R31.1:0 Position Offset X/Y for Slot[3:0]
1183 * R31.3:2 Position Offset X/Y for Slot[7:4]
1184 * .....
1185 *
1186 * The X, Y sample positions come in as bytes in thread payload. So, read
1187 * the positions using vstride=16, width=8, hstride=2.
1188 */
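   /* A sketch of what that region does, assuming X/Y byte pairs per slot as
    * the PRM excerpt above suggests: with vstride=16, width=8, hstride=2,
    * channel i reads payload byte 2*i, i.e. the X offsets at even bytes; the
    * suboffset(..., 1) used for int_sample_y below then picks up the Y
    * offsets at odd bytes.
    */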
1189 struct brw_reg sample_pos_reg =
1190 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1191 BRW_REGISTER_TYPE_B), 16, 8, 2);
1192
1193 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1194 if (dispatch_width == 16) {
1195 int_sample_x.sechalf = true;
1196 fs_inst *inst = emit(MOV(int_sample_x,
1197 fs_reg(suboffset(sample_pos_reg, 16))));
1198 inst->force_sechalf = true;
1199 int_sample_x.sechalf = false;
1200 }
1201 /* Compute gl_SamplePosition.x */
1202 compute_sample_position(pos, int_sample_x);
1203 pos.reg_offset++;
1204 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1205 if (dispatch_width == 16) {
1206 int_sample_y.sechalf = true;
1207 fs_inst *inst = emit(MOV(int_sample_y,
1208 fs_reg(suboffset(sample_pos_reg, 17))));
1209 inst->force_sechalf = true;
1210 int_sample_y.sechalf = false;
1211 }
1212 /* Compute gl_SamplePosition.y */
1213 compute_sample_position(pos, int_sample_y);
1214 return reg;
1215 }
1216
1217 fs_reg *
1218 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1219 {
1220 assert(brw->gen >= 6);
1221
1222 this->current_annotation = "compute sample id";
1223 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1224
1225 if (c->key.compute_sample_id) {
1226 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1227 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1228 t2.type = BRW_REGISTER_TYPE_UW;
1229
1230 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1231 * 8x multisampling, subspan 0 will represent sample N (where N
1232 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1233 * 7. We can find the value of N by looking at R0.0 bits 7:6
1234 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1235 * (since samples are always delivered in pairs). That is, we
1236 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1237 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1238 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1239 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1240 * populating a temporary variable with the sequence (0, 1, 2, 3),
1241 * and then reading from it using vstride=1, width=4, hstride=0.
1242 * These computations hold good for 4x multisampling as well.
1243 */
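   /* Worked example of the arithmetic above: if R0.0 bits 7:6 (SSPI) read 2,
    * then (R0.0 & 0xc0) >> 5 = 0x80 >> 5 = 4 = 2*SSPI, and adding the
    * SIMD8 sequence (0, 0, 0, 0, 1, 1, 1, 1) gives sample IDs
    * 4, 4, 4, 4, 5, 5, 5, 5 for the two subspans.
    */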
1244 emit(BRW_OPCODE_AND, t1,
1245 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1246 fs_reg(brw_imm_d(0xc0)));
1247 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1248 /* This works for both SIMD8 and SIMD16 */
1249 emit(MOV(t2, brw_imm_v(0x3210)));
1250 /* This special instruction takes care of setting vstride=1,
1251 * width=4, hstride=0 of t2 during an ADD instruction.
1252 */
1253 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1254 } else {
1255 /* As per GL_ARB_sample_shading specification:
1256 * "When rendering to a non-multisample buffer, or if multisample
1257 * rasterization is disabled, gl_SampleID will always be zero."
1258 */
1259 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1260 }
1261
1262 return reg;
1263 }
1264
1265 fs_reg *
1266 fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
1267 {
1268 assert(brw->gen >= 7);
1269 this->current_annotation = "compute gl_SampleMaskIn";
1270 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1271 emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
1272 return reg;
1273 }
1274
1275 fs_reg
1276 fs_visitor::fix_math_operand(fs_reg src)
1277 {
1278 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1279 * might be able to do better by doing execsize = 1 math and then
1280 * expanding that result out, but we would need to be careful with
1281 * masking.
1282 *
1283 * The hardware ignores source modifiers (negate and abs) on math
1284 * instructions, so we also move to a temp to set those up.
1285 */
1286 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1287 !src.abs && !src.negate)
1288 return src;
1289
1290 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1291 * operands to math
1292 */
1293 if (brw->gen >= 7 && src.file != IMM)
1294 return src;
1295
1296 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1297 expanded.type = src.type;
1298 emit(BRW_OPCODE_MOV, expanded, src);
1299 return expanded;
1300 }
1301
1302 fs_inst *
1303 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1304 {
1305 switch (opcode) {
1306 case SHADER_OPCODE_RCP:
1307 case SHADER_OPCODE_RSQ:
1308 case SHADER_OPCODE_SQRT:
1309 case SHADER_OPCODE_EXP2:
1310 case SHADER_OPCODE_LOG2:
1311 case SHADER_OPCODE_SIN:
1312 case SHADER_OPCODE_COS:
1313 break;
1314 default:
1315 assert(!"not reached: bad math opcode");
1316 return NULL;
1317 }
1318
1319 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1320 * might be able to do better by doing execsize = 1 math and then
1321 * expanding that result out, but we would need to be careful with
1322 * masking.
1323 *
1324 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1325 * instructions, so we also move to a temp to set those up.
1326 */
1327 if (brw->gen >= 6)
1328 src = fix_math_operand(src);
1329
1330 fs_inst *inst = emit(opcode, dst, src);
1331
1332 if (brw->gen < 6) {
1333 inst->base_mrf = 2;
1334 inst->mlen = dispatch_width / 8;
1335 }
1336
1337 return inst;
1338 }
1339
1340 fs_inst *
1341 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1342 {
1343 int base_mrf = 2;
1344 fs_inst *inst;
1345
1346 switch (opcode) {
1347 case SHADER_OPCODE_INT_QUOTIENT:
1348 case SHADER_OPCODE_INT_REMAINDER:
1349 if (brw->gen >= 7 && dispatch_width == 16)
1350 fail("SIMD16 INTDIV unsupported\n");
1351 break;
1352 case SHADER_OPCODE_POW:
1353 break;
1354 default:
1355 assert(!"not reached: unsupported binary math opcode.");
1356 return NULL;
1357 }
1358
1359 if (brw->gen >= 6) {
1360 src0 = fix_math_operand(src0);
1361 src1 = fix_math_operand(src1);
1362
1363 inst = emit(opcode, dst, src0, src1);
1364 } else {
1365 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1366 * "Message Payload":
1367 *
1368 * "Operand0[7]. For the INT DIV functions, this operand is the
1369 * denominator."
1370 * ...
1371 * "Operand1[7]. For the INT DIV functions, this operand is the
1372 * numerator."
1373 */
1374 bool is_int_div = opcode != SHADER_OPCODE_POW;
1375 fs_reg &op0 = is_int_div ? src1 : src0;
1376 fs_reg &op1 = is_int_div ? src0 : src1;
1377
1378 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1379 inst = emit(opcode, dst, op0, reg_null_f);
1380
1381 inst->base_mrf = base_mrf;
1382 inst->mlen = 2 * dispatch_width / 8;
1383 }
1384 return inst;
1385 }
1386
1387 void
1388 fs_visitor::assign_curb_setup()
1389 {
1390 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1391 if (dispatch_width == 8) {
1392 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1393 } else {
1394 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1395 }
1396
1397 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1398 foreach_list(node, &this->instructions) {
1399 fs_inst *inst = (fs_inst *)node;
1400
1401 for (unsigned int i = 0; i < 3; i++) {
1402 if (inst->src[i].file == UNIFORM) {
1403 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1404 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1405 constant_nr / 8,
1406 constant_nr % 8);
1407
1408 inst->src[i].file = HW_REG;
1409 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1410 }
1411 }
1412 }
1413 }
1414
1415 void
1416 fs_visitor::calculate_urb_setup()
1417 {
1418 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1419 c->prog_data.urb_setup[i] = -1;
1420 }
1421
1422 int urb_next = 0;
1423 /* Figure out where each of the incoming setup attributes lands. */
1424 if (brw->gen >= 6) {
1425 if (_mesa_bitcount_64(fp->Base.InputsRead &
1426 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1427 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1428 * first 16 varying inputs, so we can put them wherever we want.
1429 * Just put them in order.
1430 *
1431 * This is useful because it means that (a) inputs not used by the
1432 * fragment shader won't take up valuable register space, and (b) we
1433 * won't have to recompile the fragment shader if it gets paired with
1434 * a different vertex (or geometry) shader.
1435 */
1436 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1437 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1438 BITFIELD64_BIT(i)) {
1439 c->prog_data.urb_setup[i] = urb_next++;
1440 }
1441 }
1442 } else {
1443 /* We have enough input varyings that the SF/SBE pipeline stage can't
1444 * arbitrarily rearrange them to suit our whim; we have to put them
1445 * in an order that matches the output of the previous pipeline stage
1446 * (geometry or vertex shader).
1447 */
1448 struct brw_vue_map prev_stage_vue_map;
1449 brw_compute_vue_map(brw, &prev_stage_vue_map,
1450 c->key.input_slots_valid);
1451 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1452 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1453 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1454 slot++) {
1455 int varying = prev_stage_vue_map.slot_to_varying[slot];
1456 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1457 * unused.
1458 */
1459 if (varying != BRW_VARYING_SLOT_COUNT &&
1460 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1461 BITFIELD64_BIT(varying))) {
1462 c->prog_data.urb_setup[varying] = slot - first_slot;
1463 }
1464 }
1465 urb_next = prev_stage_vue_map.num_slots - first_slot;
1466 }
1467 } else {
1468 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1469 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1470 /* Point size is packed into the header, not as a general attribute */
1471 if (i == VARYING_SLOT_PSIZ)
1472 continue;
1473
1474 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1475 /* The back color slot is skipped when the front color is
1476 * also written to. In addition, some slots can be
1477 * written in the vertex shader and not read in the
1478 * fragment shader. So the register number must always be
1479 * incremented, mapped or not.
1480 */
1481 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1482 c->prog_data.urb_setup[i] = urb_next;
1483 urb_next++;
1484 }
1485 }
1486
1487 /*
1488 * It's an FS-only attribute, and we did the interpolation for this
1489 * attribute in the SF thread. So, count it here, too.
1490 *
1491 * See compile_sf_prog() for more info.
1492 */
1493 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1494 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1495 }
1496
1497 c->prog_data.num_varying_inputs = urb_next;
1498 }
1499
1500 void
1501 fs_visitor::assign_urb_setup()
1502 {
1503 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1504
1505 /* Offset all the urb_setup[] index by the actual position of the
1506 * setup regs, now that the location of the constants has been chosen.
1507 */
1508 foreach_list(node, &this->instructions) {
1509 fs_inst *inst = (fs_inst *)node;
1510
1511 if (inst->opcode == FS_OPCODE_LINTERP) {
1512 assert(inst->src[2].file == HW_REG);
1513 inst->src[2].fixed_hw_reg.nr += urb_start;
1514 }
1515
1516 if (inst->opcode == FS_OPCODE_CINTERP) {
1517 assert(inst->src[0].file == HW_REG);
1518 inst->src[0].fixed_hw_reg.nr += urb_start;
1519 }
1520 }
1521
1522 /* Each attribute is 4 setup channels, each of which is half a reg. */
1523 this->first_non_payload_grf =
1524 urb_start + c->prog_data.num_varying_inputs * 2;
1525 }
1526
1527 /**
1528 * Split large virtual GRFs into separate components if we can.
1529 *
1530 * This is mostly duplicated with what brw_fs_vector_splitting does,
1531 * but that's really conservative because it's afraid of doing
1532 * splitting that doesn't result in real progress after the rest of
1533 * the optimization phases, which would cause infinite looping in
1534 * optimization. We can do it once here, safely. This also has the
1535 * opportunity to split interpolated values, or maybe even uniforms,
1536 * which we don't have at the IR level.
1537 *
1538 * We want to split, because virtual GRFs are what we register
1539 * allocate and spill (due to contiguousness requirements for some
1540 * instructions), and they're what we naturally generate in the
1541 * codegen process, but most virtual GRFs don't actually need to be
1542 * contiguous sets of GRFs. If we split, we'll end up with reduced
1543 * live intervals and better dead code elimination and coalescing.
1544 */
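/* For example, a virtual GRF of size 4 holding a texture result that is only
 * read one component at a time can become four size-1 virtual GRFs, each with
 * its own shorter live interval, unless a SEND below requires the destination
 * to stay contiguous.
 */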
1545 void
1546 fs_visitor::split_virtual_grfs()
1547 {
1548 int num_vars = this->virtual_grf_count;
1549 bool split_grf[num_vars];
1550 int new_virtual_grf[num_vars];
1551
1552 /* Try to split anything > 0 sized. */
1553 for (int i = 0; i < num_vars; i++) {
1554 if (this->virtual_grf_sizes[i] != 1)
1555 split_grf[i] = true;
1556 else
1557 split_grf[i] = false;
1558 }
1559
1560 if (brw->has_pln &&
1561 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1562 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1563 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1564 * Gen6, that was the only supported interpolation mode, and since Gen6,
1565 * delta_x and delta_y are in fixed hardware registers.
1566 */
1567 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1568 false;
1569 }
1570
1571 foreach_list(node, &this->instructions) {
1572 fs_inst *inst = (fs_inst *)node;
1573
1574 /* If there's a SEND message that requires contiguous destination
1575 * registers, no splitting is allowed.
1576 */
1577 if (inst->regs_written > 1) {
1578 split_grf[inst->dst.reg] = false;
1579 }
1580
1581 /* If we're sending from a GRF, don't split it, on the assumption that
1582 * the send is reading the whole thing.
1583 */
1584 if (inst->is_send_from_grf()) {
1585 for (int i = 0; i < 3; i++) {
1586 if (inst->src[i].file == GRF) {
1587 split_grf[inst->src[i].reg] = false;
1588 }
1589 }
1590 }
1591 }
1592
1593 /* Allocate new space for split regs. Note that the virtual
1594 * numbers will be contiguous.
1595 */
1596 for (int i = 0; i < num_vars; i++) {
1597 if (split_grf[i]) {
1598 new_virtual_grf[i] = virtual_grf_alloc(1);
1599 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1600 int reg = virtual_grf_alloc(1);
1601 assert(reg == new_virtual_grf[i] + j - 1);
1602 (void) reg;
1603 }
1604 this->virtual_grf_sizes[i] = 1;
1605 }
1606 }
1607
1608 foreach_list(node, &this->instructions) {
1609 fs_inst *inst = (fs_inst *)node;
1610
1611 if (inst->dst.file == GRF &&
1612 split_grf[inst->dst.reg] &&
1613 inst->dst.reg_offset != 0) {
1614 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1615 inst->dst.reg_offset - 1);
1616 inst->dst.reg_offset = 0;
1617 }
1618 for (int i = 0; i < 3; i++) {
1619 if (inst->src[i].file == GRF &&
1620 split_grf[inst->src[i].reg] &&
1621 inst->src[i].reg_offset != 0) {
1622 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1623 inst->src[i].reg_offset - 1);
1624 inst->src[i].reg_offset = 0;
1625 }
1626 }
1627 }
1628 invalidate_live_intervals();
1629 }
1630
1631 /**
1632 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1633 *
1634 * During code generation, we create tons of temporary variables, many of
1635 * which get immediately killed and are never used again. Yet, in later
1636 * optimization and analysis passes, such as compute_live_intervals, we need
1637 * to loop over all the virtual GRFs. Compacting them can save a lot of
1638 * overhead.
1639 */
1640 void
1641 fs_visitor::compact_virtual_grfs()
1642 {
1643 /* Mark which virtual GRFs are used, and count how many. */
1644 int remap_table[this->virtual_grf_count];
1645 memset(remap_table, -1, sizeof(remap_table));
1646
1647 foreach_list(node, &this->instructions) {
1648 const fs_inst *inst = (const fs_inst *) node;
1649
1650 if (inst->dst.file == GRF)
1651 remap_table[inst->dst.reg] = 0;
1652
1653 for (int i = 0; i < 3; i++) {
1654 if (inst->src[i].file == GRF)
1655 remap_table[inst->src[i].reg] = 0;
1656 }
1657 }
1658
1659 /* In addition to registers used in instructions, fs_visitor keeps
1660 * direct references to certain special values which must be patched:
1661 */
1662 fs_reg *special[] = {
1663 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1664 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1665 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1666 &delta_x[0], &delta_x[1], &delta_x[2],
1667 &delta_x[3], &delta_x[4], &delta_x[5],
1668 &delta_y[0], &delta_y[1], &delta_y[2],
1669 &delta_y[3], &delta_y[4], &delta_y[5],
1670 };
1671 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1672 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1673
1674 /* Treat all special values as used, to be conservative */
1675 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1676 if (special[i]->file == GRF)
1677 remap_table[special[i]->reg] = 0;
1678 }
1679
1680 /* Compact the GRF arrays. */
1681 int new_index = 0;
1682 for (int i = 0; i < this->virtual_grf_count; i++) {
1683 if (remap_table[i] != -1) {
1684 remap_table[i] = new_index;
1685 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1686 invalidate_live_intervals();
1687 ++new_index;
1688 }
1689 }
1690
1691 this->virtual_grf_count = new_index;
1692
1693 /* Patch all the instructions to use the newly renumbered registers */
1694 foreach_list(node, &this->instructions) {
1695 fs_inst *inst = (fs_inst *) node;
1696
1697 if (inst->dst.file == GRF)
1698 inst->dst.reg = remap_table[inst->dst.reg];
1699
1700 for (int i = 0; i < 3; i++) {
1701 if (inst->src[i].file == GRF)
1702 inst->src[i].reg = remap_table[inst->src[i].reg];
1703 }
1704 }
1705
1706 /* Patch all the references to special values */
1707 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1708 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1709 special[i]->reg = remap_table[special[i]->reg];
1710 }
1711 }
1712
1713 bool
1714 fs_visitor::remove_dead_constants()
1715 {
1716 if (dispatch_width == 8) {
1717 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1718 this->nr_params_remap = c->prog_data.nr_params;
1719
1720 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1721 this->params_remap[i] = -1;
1722
1723 /* Find which params are still in use. */
1724 foreach_list(node, &this->instructions) {
1725 fs_inst *inst = (fs_inst *)node;
1726
1727 for (int i = 0; i < 3; i++) {
1728 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1729
1730 if (inst->src[i].file != UNIFORM)
1731 continue;
1732
1733 /* Section 5.11 of the OpenGL 4.3 spec says:
1734 *
1735 * "Out-of-bounds reads return undefined values, which include
1736 * values from other variables of the active program or zero."
1737 */
1738 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1739 constant_nr = 0;
1740 }
1741
1742 /* For now, set this to non-negative. We'll give it the
1743 * actual new number in a moment, in order to keep the
1744 * register numbers nicely ordered.
1745 */
1746 this->params_remap[constant_nr] = 0;
1747 }
1748 }
1749
1750 /* Figure out what the new numbers for the params will be. At some
1751 * point when we're doing uniform array access, we're going to want
1752 * to keep the distinction between .reg and .reg_offset, but for
1753 * now we don't care.
1754 */
1755 unsigned int new_nr_params = 0;
1756 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1757 if (this->params_remap[i] != -1) {
1758 this->params_remap[i] = new_nr_params++;
1759 }
1760 }
1761
1762 /* Update the list of params to be uploaded to match our new numbering. */
1763 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1764 int remapped = this->params_remap[i];
1765
1766 if (remapped == -1)
1767 continue;
1768
1769 c->prog_data.param[remapped] = c->prog_data.param[i];
1770 }
1771
1772 c->prog_data.nr_params = new_nr_params;
1773 } else {
1774 /* This should have been generated in the SIMD8 pass already. */
1775 assert(this->params_remap);
1776 }
1777
1778 /* Now do the renumbering of the shader to remove unused params. */
1779 foreach_list(node, &this->instructions) {
1780 fs_inst *inst = (fs_inst *)node;
1781
1782 for (int i = 0; i < 3; i++) {
1783 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1784
1785 if (inst->src[i].file != UNIFORM)
1786 continue;
1787
1788 /* as above alias to 0 */
1789 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1790 constant_nr = 0;
1791 }
1792 assert(this->params_remap[constant_nr] != -1);
1793 inst->src[i].reg = this->params_remap[constant_nr];
1794 inst->src[i].reg_offset = 0;
1795 }
1796 }
1797
1798 return true;
1799 }
1800
1801 /*
1802 * Implements array access of uniforms by inserting a
1803 * PULL_CONSTANT_LOAD instruction.
1804 *
1805 * Unlike temporary GRF array access (where we don't support it due to
1806 * the difficulty of doing relative addressing on instruction
1807 * destinations), we could potentially do array access of uniforms
1808 * that were loaded in GRF space as push constants. In real-world
1809 * usage we've seen, though, the arrays being used are always larger
1810 * than we could load as push constants, so just always move all
1811 * uniform array access out to a pull constant buffer.
1812 */
1813 void
1814 fs_visitor::move_uniform_array_access_to_pull_constants()
1815 {
1816 int pull_constant_loc[c->prog_data.nr_params];
1817
1818 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1819 pull_constant_loc[i] = -1;
1820 }
1821
1822 /* Walk through and find array access of uniforms. Put a copy of that
1823 * uniform in the pull constant buffer.
1824 *
1825 * Note that we don't move constant-indexed accesses to arrays. No
1826 * testing has been done of the performance impact of this choice.
1827 */
1828 foreach_list_safe(node, &this->instructions) {
1829 fs_inst *inst = (fs_inst *)node;
1830
1831 for (int i = 0 ; i < 3; i++) {
1832 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1833 continue;
1834
1835 int uniform = inst->src[i].reg;
1836
1837 /* If this array isn't already present in the pull constant buffer,
1838 * add it.
1839 */
1840 if (pull_constant_loc[uniform] == -1) {
1841 const float **values = &c->prog_data.param[uniform];
1842
1843 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1844
1845 assert(param_size[uniform]);
1846
1847 for (int j = 0; j < param_size[uniform]; j++) {
1848 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1849 values[j];
1850 }
1851 }
1852
1853 /* Set up the annotation tracking for new generated instructions. */
1854 base_ir = inst->ir;
1855 current_annotation = inst->annotation;
1856
1857 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1858 fs_reg temp = fs_reg(this, glsl_type::float_type);
1859 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1860 surf_index,
1861 *inst->src[i].reladdr,
1862 pull_constant_loc[uniform] +
1863 inst->src[i].reg_offset);
1864 inst->insert_before(&list);
1865
1866 inst->src[i].file = temp.file;
1867 inst->src[i].reg = temp.reg;
1868 inst->src[i].reg_offset = temp.reg_offset;
1869 inst->src[i].reladdr = NULL;
1870 }
1871 }
1872 }
1873
1874 /**
1875 * Choose accesses from the UNIFORM file to demote to using the pull
1876 * constant buffer.
1877 *
1878 * We allow a fragment shader to have more than the specified minimum
1879 * maximum number of fragment shader uniform components (64). If
1880 * there are too many of these, they'd fill up all of register space.
1881 * So, this will push some of them out to the pull constant buffer and
1882 * update the program to load them.
1883 */
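/* Illustrative sketch (register numbers invented): once the first 128
 * components are kept as push constants, a read of a later uniform such as
 *
 *    add vgrf6:F, vgrf2:F, u130:F
 *
 * gets a UNIFORM_PULL_CONSTANT_LOAD of the containing vec4 inserted before
 * it and reads the loaded GRF instead, with .smear picking the right dword:
 *
 *    <uniform pull constant load> vgrf8:F, <surf_index>, <aligned offset>
 *    add vgrf6:F, vgrf2:F, vgrf8:F
 */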
1884 void
1885 fs_visitor::setup_pull_constants()
1886 {
1887 /* Only allow 16 registers (128 uniform components) as push constants. */
1888 unsigned int max_uniform_components = 16 * 8;
1889 if (c->prog_data.nr_params <= max_uniform_components)
1890 return;
1891
1892 if (dispatch_width == 16) {
1893 fail("Pull constants not supported in SIMD16\n");
1894 return;
1895 }
1896
1897 /* Just demote the end of the list. We could probably do better
1898 * here, demoting things that are rarely used in the program first.
1899 */
1900 unsigned int pull_uniform_base = max_uniform_components;
1901
1902 int pull_constant_loc[c->prog_data.nr_params];
1903 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1904 if (i < pull_uniform_base) {
1905 pull_constant_loc[i] = -1;
1906 } else {
1907 pull_constant_loc[i] = -1;
1908 /* If our constant is already being uploaded for reladdr purposes,
1909 * reuse it.
1910 */
1911 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1912 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1913 pull_constant_loc[i] = j;
1914 break;
1915 }
1916 }
1917 if (pull_constant_loc[i] == -1) {
1918 int pull_index = c->prog_data.nr_pull_params++;
1919 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1920 pull_constant_loc[i] = pull_index;
1921 }
1922 }
1923 }
1924 c->prog_data.nr_params = pull_uniform_base;
1925
1926 foreach_list(node, &this->instructions) {
1927 fs_inst *inst = (fs_inst *)node;
1928
1929 for (int i = 0; i < 3; i++) {
1930 if (inst->src[i].file != UNIFORM)
1931 continue;
1932
1933 int pull_index = pull_constant_loc[inst->src[i].reg +
1934 inst->src[i].reg_offset];
1935 if (pull_index == -1)
1936 continue;
1937
1938 assert(!inst->src[i].reladdr);
1939
1940 fs_reg dst = fs_reg(this, glsl_type::float_type);
1941 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1942 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1943 fs_inst *pull =
1944 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1945 dst, index, offset);
1946 pull->ir = inst->ir;
1947 pull->annotation = inst->annotation;
1948
1949 inst->insert_before(pull);
1950
1951 inst->src[i].file = GRF;
1952 inst->src[i].reg = dst.reg;
1953 inst->src[i].reg_offset = 0;
1954 inst->src[i].smear = pull_index & 3;
1955 }
1956 }
1957 }
1958
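/**
 * Performs simple algebraic simplifications on instructions whose sources
 * make them trivial, e.g. (illustrative):
 *
 *    mul vgrf4:F, vgrf2:F, 1.0f   ->  mov vgrf4:F, vgrf2:F
 *    mul vgrf4:F, vgrf2:F, 0.0f   ->  mov vgrf4:F, 0.0f
 *    add vgrf4:F, vgrf2:F, 0.0f   ->  mov vgrf4:F, vgrf2:F
 *    or  vgrf4:UD, vgrf2:UD, vgrf2:UD  ->  mov vgrf4:UD, vgrf2:UD
 *
 * plus similar rewrites for LRP with identical interpolants and for
 * saturating SEL against an immediate.
 */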
1959 bool
1960 fs_visitor::opt_algebraic()
1961 {
1962 bool progress = false;
1963
1964 foreach_list(node, &this->instructions) {
1965 fs_inst *inst = (fs_inst *)node;
1966
1967 switch (inst->opcode) {
1968 case BRW_OPCODE_MUL:
1969 if (inst->src[1].file != IMM)
1970 continue;
1971
1972 /* a * 1.0 = a */
1973 if (inst->src[1].is_one()) {
1974 inst->opcode = BRW_OPCODE_MOV;
1975 inst->src[1] = reg_undef;
1976 progress = true;
1977 break;
1978 }
1979
1980 /* a * 0.0 = 0.0 */
1981 if (inst->src[1].is_zero()) {
1982 inst->opcode = BRW_OPCODE_MOV;
1983 inst->src[0] = inst->src[1];
1984 inst->src[1] = reg_undef;
1985 progress = true;
1986 break;
1987 }
1988
1989 break;
1990 case BRW_OPCODE_ADD:
1991 if (inst->src[1].file != IMM)
1992 continue;
1993
1994 /* a + 0.0 = a */
1995 if (inst->src[1].is_zero()) {
1996 inst->opcode = BRW_OPCODE_MOV;
1997 inst->src[1] = reg_undef;
1998 progress = true;
1999 break;
2000 }
2001 break;
2002 case BRW_OPCODE_OR:
2003 if (inst->src[0].equals(inst->src[1])) {
2004 inst->opcode = BRW_OPCODE_MOV;
2005 inst->src[1] = reg_undef;
2006 progress = true;
2007 break;
2008 }
2009 break;
2010 case BRW_OPCODE_LRP:
2011 if (inst->src[1].equals(inst->src[2])) {
2012 inst->opcode = BRW_OPCODE_MOV;
2013 inst->src[0] = inst->src[1];
2014 inst->src[1] = reg_undef;
2015 inst->src[2] = reg_undef;
2016 progress = true;
2017 break;
2018 }
2019 break;
2020 case BRW_OPCODE_SEL:
2021 if (inst->saturate && inst->src[1].file == IMM) {
2022 switch (inst->conditional_mod) {
2023 case BRW_CONDITIONAL_LE:
2024 case BRW_CONDITIONAL_L:
2025 switch (inst->src[1].type) {
2026 case BRW_REGISTER_TYPE_F:
2027 if (inst->src[1].imm.f >= 1.0f) {
2028 inst->opcode = BRW_OPCODE_MOV;
2029 inst->src[1] = reg_undef;
inst->conditional_mod = BRW_CONDITIONAL_NONE;
2030 progress = true;
2031 }
2032 break;
2033 default:
2034 break;
2035 }
2036 break;
2037 case BRW_CONDITIONAL_GE:
2038 case BRW_CONDITIONAL_G:
2039 switch (inst->src[1].type) {
2040 case BRW_REGISTER_TYPE_F:
2041 if (inst->src[1].imm.f <= 0.0f) {
2042 inst->opcode = BRW_OPCODE_MOV;
2043 inst->src[1] = reg_undef;
2044 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2045 progress = true;
2046 }
2047 break;
2048 default:
2049 break;
2050 }
2051 default:
2052 break;
2053 }
2054 }
2055 break;
2056 default:
2057 break;
2058 }
2059 }
2060
2061 return progress;
2062 }
2063
2064 /**
2065 * Removes any instructions writing a VGRF where that VGRF is not used by any
2066 * later instruction.
2067 */
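/* For example (illustrative), if vgrf7 is not read by any later instruction,
 *
 *    add vgrf7:F, vgrf1:F, vgrf2:F
 *
 * is removed outright. ADDC/SUBB/MACH are kept but retargeted to the null
 * register so their implicit accumulator write is preserved.
 */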
2068 bool
2069 fs_visitor::dead_code_eliminate()
2070 {
2071 bool progress = false;
2072 int pc = 0;
2073
2074 calculate_live_intervals();
2075
2076 foreach_list_safe(node, &this->instructions) {
2077 fs_inst *inst = (fs_inst *)node;
2078
2079 if (inst->dst.file == GRF && !inst->has_side_effects()) {
2080 bool dead = true;
2081
2082 for (int i = 0; i < inst->regs_written; i++) {
2083 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2084 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2085 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2086 dead = false;
2087 break;
2088 }
2089 }
2090
2091 if (dead) {
2092 /* Don't dead code eliminate instructions that write to the
2093 * accumulator as a side-effect. Instead just set the destination
2094 * to the null register to free it.
2095 */
2096 switch (inst->opcode) {
2097 case BRW_OPCODE_ADDC:
2098 case BRW_OPCODE_SUBB:
2099 case BRW_OPCODE_MACH:
2100 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2101 break;
2102 default:
2103 inst->remove();
2104 progress = true;
2105 break;
2106 }
2107 }
2108 }
2109
2110 pc++;
2111 }
2112
2113 if (progress)
2114 invalidate_live_intervals();
2115
2116 return progress;
2117 }
2118
2119 struct dead_code_hash_key
2120 {
2121 int vgrf;
2122 int reg_offset;
2123 };
2124
2125 static bool
2126 dead_code_hash_compare(const void *a, const void *b)
2127 {
2128 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2129 }
2130
2131 static void
2132 clear_dead_code_hash(struct hash_table *ht)
2133 {
2134 struct hash_entry *entry;
2135
2136 hash_table_foreach(ht, entry) {
2137 _mesa_hash_table_remove(ht, entry);
2138 }
2139 }
2140
2141 static void
2142 insert_dead_code_hash(struct hash_table *ht,
2143 int vgrf, int reg_offset, fs_inst *inst)
2144 {
2145 /* We don't bother freeing keys; they're ralloc'd off the ht and freed with it. */
2146 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2147
2148 key->vgrf = vgrf;
2149 key->reg_offset = reg_offset;
2150
2151 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2152 }
2153
2154 static struct hash_entry *
2155 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2156 {
2157 struct dead_code_hash_key key;
2158
2159 key.vgrf = vgrf;
2160 key.reg_offset = reg_offset;
2161
2162 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2163 }
2164
2165 static void
2166 remove_dead_code_hash(struct hash_table *ht,
2167 int vgrf, int reg_offset)
2168 {
2169 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2170 if (!entry)
2171 return;
2172
2173 _mesa_hash_table_remove(ht, entry);
2174 }
2175
2176 /**
2177 * Walks basic blocks, removing any regs that are written but not read before
2178 * being redefined.
2179 *
2180 * The dead_code_eliminate() function implements a global dead code
2181 * elimination, but it only handles removing the last write to a register
2182 * if it's never read. This one can handle intermediate writes, but only
2183 * within a basic block.
2184 */
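/* Illustrative example: within a single basic block,
 *
 *    mov vgrf4:F, vgrf1:F
 *    mov vgrf4:F, vgrf2:F
 *
 * the first MOV is removed, since vgrf4 is completely overwritten before
 * ever being read.
 */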
2185 bool
2186 fs_visitor::dead_code_eliminate_local()
2187 {
2188 struct hash_table *ht;
2189 bool progress = false;
2190
2191 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2192
2193 foreach_list_safe(node, &this->instructions) {
2194 fs_inst *inst = (fs_inst *)node;
2195
2196 /* At a basic block boundary, empty the HT since we don't track dataflow
2197 * across blocks.
2198 */
2199 if (inst->is_control_flow()) {
2200 clear_dead_code_hash(ht);
2201 continue;
2202 }
2203
2204 /* Clear the HT of any instructions that got read. */
2205 for (int i = 0; i < 3; i++) {
2206 fs_reg src = inst->src[i];
2207 if (src.file != GRF)
2208 continue;
2209
2210 int read = 1;
2211 if (inst->is_send_from_grf())
2212 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2213
2214 for (int reg_offset = src.reg_offset;
2215 reg_offset < src.reg_offset + read;
2216 reg_offset++) {
2217 remove_dead_code_hash(ht, src.reg, reg_offset);
2218 }
2219 }
2220
2221 /* Add any update of a GRF to the HT, removing a previous write if it
2222 * wasn't read.
2223 */
2224 if (inst->dst.file == GRF) {
2225 if (inst->regs_written > 1) {
2226 /* We don't know how to trim channels from an instruction's
2227 * writes, so we can't incrementally remove unread channels from
2228 * it. Just remove whatever it overwrites from the table.
2229 */
2230 for (int i = 0; i < inst->regs_written; i++) {
2231 remove_dead_code_hash(ht,
2232 inst->dst.reg,
2233 inst->dst.reg_offset + i);
2234 }
2235 } else {
2236 struct hash_entry *entry =
2237 get_dead_code_hash_entry(ht, inst->dst.reg,
2238 inst->dst.reg_offset);
2239
2240 if (entry) {
2241 if (inst->is_partial_write()) {
2242 /* For a partial write, we can't remove any previous dead code
2243 * candidate, since we're just modifying its result.
2244 */
2245 } else {
2246 /* We're completely updating a channel, and there was a
2247 * previous write to the channel that wasn't read. Kill it!
2248 */
2249 fs_inst *dead_inst = (fs_inst *)entry->data;
2250 dead_inst->remove();
2251 progress = true;
2252 }
2253
2254 _mesa_hash_table_remove(ht, entry);
2255 }
2256
2257 if (!inst->has_side_effects())
2258 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2259 inst);
2260 }
2261 }
2262 }
2263
2264 _mesa_hash_table_destroy(ht, NULL);
2265
2266 if (progress)
2267 invalidate_live_intervals();
2268
2269 return progress;
2270 }
2271
2272 /**
2273 * Implements register coalescing: Checks if the two registers involved in a
2274 * raw move don't interfere, in which case they can both be stored in the same
2275 * place and the MOV removed.
2276 *
2277 * To do this, all uses of the source of the MOV in the shader are replaced
2278 * with the destination of the MOV. For example:
2279 *
2280 * add vgrf3:F, vgrf1:F, vgrf2:F
2281 * mov vgrf4:F, vgrf3:F
2282 * mul vgrf5:F, vgrf5:F, vgrf4:F
2283 *
2284 * becomes
2285 *
2286 * add vgrf4:F, vgrf1:F, vgrf2:F
2287 * mul vgrf5:F, vgrf5:F, vgrf4:F
2288 */
2289 bool
2290 fs_visitor::register_coalesce()
2291 {
2292 bool progress = false;
2293
2294 calculate_live_intervals();
2295
2296 int src_size = 0;
2297 int channels_remaining = 0;
2298 int reg_from = -1, reg_to = -1;
2299 int reg_to_offset[MAX_SAMPLER_MESSAGE_SIZE];
2300 fs_inst *mov[MAX_SAMPLER_MESSAGE_SIZE];
2301
2302 foreach_list(node, &this->instructions) {
2303 fs_inst *inst = (fs_inst *)node;
2304
2305 if (inst->opcode != BRW_OPCODE_MOV ||
2306 inst->is_partial_write() ||
2307 inst->saturate ||
2308 inst->src[0].file != GRF ||
2309 inst->src[0].negate ||
2310 inst->src[0].abs ||
2311 inst->src[0].smear != -1 ||
2312 inst->dst.file != GRF ||
2313 inst->dst.type != inst->src[0].type) {
2314 continue;
2315 }
2316
2317 if (virtual_grf_sizes[inst->src[0].reg] >
2318 virtual_grf_sizes[inst->dst.reg])
2319 continue;
2320
2321 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2322 int var_to = live_intervals->var_from_reg(&inst->dst);
2323
2324 if (live_intervals->vars_interfere(var_from, var_to) &&
2325 !inst->dst.equals(inst->src[0])) {
2326
2327 /* We know that the live ranges of A (var_from) and B (var_to)
2328 * interfere because of the ->vars_interfere() call above. If the end
2329 * of B's live range is after the end of A's range, then we know two
2330 * things:
2331 * - the start of B's live range must be in A's live range (since we
2332 * already know the two ranges interfere, this is the only remaining
2333 * possibility)
2334 * - the interference isn't of the form we're looking for (where B is
2335 * entirely inside A)
2336 */
2337 if (live_intervals->end[var_to] > live_intervals->end[var_from])
2338 continue;
2339
2340 bool overwritten = false;
2341 int scan_ip = -1;
2342
2343 foreach_list(n, &this->instructions) {
2344 fs_inst *scan_inst = (fs_inst *)n;
2345 scan_ip++;
2346
2347 if (scan_inst->is_control_flow()) {
2348 overwritten = true;
2349 break;
2350 }
2351
2352 if (scan_ip <= live_intervals->start[var_to])
2353 continue;
2354
2355 if (scan_ip > live_intervals->end[var_to])
2356 break;
2357
2358 if (scan_inst->dst.equals(inst->dst) ||
2359 scan_inst->dst.equals(inst->src[0])) {
2360 overwritten = true;
2361 break;
2362 }
2363 }
2364
2365 if (overwritten)
2366 continue;
2367 }
2368
2369 if (reg_from != inst->src[0].reg) {
2370 reg_from = inst->src[0].reg;
2371
2372 src_size = virtual_grf_sizes[inst->src[0].reg];
2373 assert(src_size <= MAX_SAMPLER_MESSAGE_SIZE);
2374
2375 channels_remaining = src_size;
2376 memset(mov, 0, sizeof(mov));
2377
2378 reg_to = inst->dst.reg;
2379 }
2380
2381 if (reg_to != inst->dst.reg)
2382 continue;
2383
2384 const int offset = inst->src[0].reg_offset;
2385 reg_to_offset[offset] = inst->dst.reg_offset;
2386 mov[offset] = inst;
2387 channels_remaining--;
2388
2389 if (channels_remaining)
2390 continue;
2391
2392 bool removed = false;
2393 for (int i = 0; i < src_size; i++) {
2394 if (mov[i]) {
2395 removed = true;
2396
2397 mov[i]->opcode = BRW_OPCODE_NOP;
2398 mov[i]->conditional_mod = BRW_CONDITIONAL_NONE;
2399 mov[i]->dst = reg_undef;
2400 mov[i]->src[0] = reg_undef;
2401 mov[i]->src[1] = reg_undef;
2402 mov[i]->src[2] = reg_undef;
2403 }
2404 }
2405
2406 foreach_list(node, &this->instructions) {
2407 fs_inst *scan_inst = (fs_inst *)node;
2408
2409 for (int i = 0; i < src_size; i++) {
2410 if (mov[i]) {
2411 if (scan_inst->dst.file == GRF &&
2412 scan_inst->dst.reg == reg_from &&
2413 scan_inst->dst.reg_offset == i) {
2414 scan_inst->dst.reg = reg_to;
2415 scan_inst->dst.reg_offset = reg_to_offset[i];
2416 }
2417 for (int j = 0; j < 3; j++) {
2418 if (scan_inst->src[j].file == GRF &&
2419 scan_inst->src[j].reg == reg_from &&
2420 scan_inst->src[j].reg_offset == i) {
2421 scan_inst->src[j].reg = reg_to;
2422 scan_inst->src[j].reg_offset = reg_to_offset[i];
2423 }
2424 }
2425 }
2426 }
2427 }
2428
2429 if (removed) {
2430 live_intervals->start[var_to] = MIN2(live_intervals->start[var_to],
2431 live_intervals->start[var_from]);
2432 live_intervals->end[var_to] = MAX2(live_intervals->end[var_to],
2433 live_intervals->end[var_from]);
2434 reg_from = -1;
2435 }
2436 }
2437
2438 foreach_list_safe(node, &this->instructions) {
2439 fs_inst *inst = (fs_inst *)node;
2440
2441 if (inst->opcode == BRW_OPCODE_NOP) {
2442 inst->remove();
2443 progress = true;
2444 }
2445 }
2446
2447 if (progress)
2448 invalidate_live_intervals();
2449
2450 return progress;
2451 }
2452
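/**
 * Tries to rewrite the instruction that produced a GRF so that it writes an
 * MRF directly, eliminating the GRF-to-MRF MOV. For example (illustrative):
 *
 *    add vgrf4:F, vgrf1:F, vgrf2:F
 *    mov m3:F, vgrf4:F
 *
 * becomes
 *
 *    add m3:F, vgrf1:F, vgrf2:F
 *
 * provided vgrf4 is not read afterwards and no conflicting MRF write or read
 * of vgrf4 intervenes.
 */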
2453 bool
2454 fs_visitor::compute_to_mrf()
2455 {
2456 bool progress = false;
2457 int next_ip = 0;
2458
2459 calculate_live_intervals();
2460
2461 foreach_list_safe(node, &this->instructions) {
2462 fs_inst *inst = (fs_inst *)node;
2463
2464 int ip = next_ip;
2465 next_ip++;
2466
2467 if (inst->opcode != BRW_OPCODE_MOV ||
2468 inst->is_partial_write() ||
2469 inst->dst.file != MRF || inst->src[0].file != GRF ||
2470 inst->dst.type != inst->src[0].type ||
2471 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2472 continue;
2473
2474 /* Work out which hardware MRF registers are written by this
2475 * instruction.
2476 */
2477 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2478 int mrf_high;
2479 if (inst->dst.reg & BRW_MRF_COMPR4) {
2480 mrf_high = mrf_low + 4;
2481 } else if (dispatch_width == 16 &&
2482 (!inst->force_uncompressed && !inst->force_sechalf)) {
2483 mrf_high = mrf_low + 1;
2484 } else {
2485 mrf_high = mrf_low;
2486 }
2487
2488 /* Can't compute-to-MRF this GRF if someone else was going to
2489 * read it later.
2490 */
2491 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2492 continue;
2493
2494 /* Found a move of a GRF to a MRF. Let's see if we can go
2495 * rewrite the thing that made this GRF to write into the MRF.
2496 */
2497 fs_inst *scan_inst;
2498 for (scan_inst = (fs_inst *)inst->prev;
2499 scan_inst->prev != NULL;
2500 scan_inst = (fs_inst *)scan_inst->prev) {
2501 if (scan_inst->dst.file == GRF &&
2502 scan_inst->dst.reg == inst->src[0].reg) {
2503 /* Found the last thing to write our reg we want to turn
2504 * into a compute-to-MRF.
2505 */
2506
2507 /* If this one instruction didn't populate all the
2508 * channels, bail. We might be able to rewrite everything
2509 * that writes that reg, but it would require smarter
2510 * tracking to delay the rewriting until complete success.
2511 */
2512 if (scan_inst->is_partial_write())
2513 break;
2514
2515 /* Things returning more than one register would need us to
2516 * understand coalescing out more than one MOV at a time.
2517 */
2518 if (scan_inst->regs_written > 1)
2519 break;
2520
2521 /* SEND instructions can't have MRF as a destination. */
2522 if (scan_inst->mlen)
2523 break;
2524
2525 if (brw->gen == 6) {
2526 /* gen6 math instructions must have the destination be
2527 * GRF, so no compute-to-MRF for them.
2528 */
2529 if (scan_inst->is_math()) {
2530 break;
2531 }
2532 }
2533
2534 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2535 /* Found the creator of our MRF's source value. */
2536 scan_inst->dst.file = MRF;
2537 scan_inst->dst.reg = inst->dst.reg;
2538 scan_inst->saturate |= inst->saturate;
2539 inst->remove();
2540 progress = true;
2541 }
2542 break;
2543 }
2544
2545 /* We don't handle control flow here. Most computation of
2546 * values that end up in MRFs happens shortly before the MRF
2547 * write anyway.
2548 */
2549 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2550 break;
2551
2552 /* You can't read from an MRF, so if someone else reads our
2553 * MRF's source GRF that we wanted to rewrite, that stops us.
2554 */
2555 bool interfered = false;
2556 for (int i = 0; i < 3; i++) {
2557 if (scan_inst->src[i].file == GRF &&
2558 scan_inst->src[i].reg == inst->src[0].reg &&
2559 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2560 interfered = true;
2561 }
2562 }
2563 if (interfered)
2564 break;
2565
2566 if (scan_inst->dst.file == MRF) {
2567 /* If somebody else writes our MRF here, we can't
2568 * compute-to-MRF before that.
2569 */
2570 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2571 int scan_mrf_high;
2572
2573 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2574 scan_mrf_high = scan_mrf_low + 4;
2575 } else if (dispatch_width == 16 &&
2576 (!scan_inst->force_uncompressed &&
2577 !scan_inst->force_sechalf)) {
2578 scan_mrf_high = scan_mrf_low + 1;
2579 } else {
2580 scan_mrf_high = scan_mrf_low;
2581 }
2582
2583 if (mrf_low == scan_mrf_low ||
2584 mrf_low == scan_mrf_high ||
2585 mrf_high == scan_mrf_low ||
2586 mrf_high == scan_mrf_high) {
2587 break;
2588 }
2589 }
2590
2591 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2592 /* Found a SEND instruction, which means that there are
2593 * live values in MRFs from base_mrf to base_mrf +
2594 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2595 * above it.
2596 */
2597 if (mrf_low >= scan_inst->base_mrf &&
2598 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2599 break;
2600 }
2601 if (mrf_high >= scan_inst->base_mrf &&
2602 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2603 break;
2604 }
2605 }
2606 }
2607 }
2608
2609 if (progress)
2610 invalidate_live_intervals();
2611
2612 return progress;
2613 }
2614
2615 /**
2616 * Walks through basic blocks, looking for repeated MRF writes and
2617 * removing the later ones.
2618 */
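/* Illustrative example: with no intervening control flow and no write to
 * m3 or vgrf2 in between,
 *
 *    mov m3:F, vgrf2:F
 *    ...
 *    mov m3:F, vgrf2:F
 *
 * the second MOV is redundant and is removed.
 */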
2619 bool
2620 fs_visitor::remove_duplicate_mrf_writes()
2621 {
2622 fs_inst *last_mrf_move[16];
2623 bool progress = false;
2624
2625 /* We'd need to update the MRF tracking for compressed instructions; bail on SIMD16. */
2626 if (dispatch_width == 16)
2627 return false;
2628
2629 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2630
2631 foreach_list_safe(node, &this->instructions) {
2632 fs_inst *inst = (fs_inst *)node;
2633
2634 if (inst->is_control_flow()) {
2635 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2636 }
2637
2638 if (inst->opcode == BRW_OPCODE_MOV &&
2639 inst->dst.file == MRF) {
2640 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2641 if (prev_inst && inst->equals(prev_inst)) {
2642 inst->remove();
2643 progress = true;
2644 continue;
2645 }
2646 }
2647
2648 /* Clear out the last-write records for MRFs that were overwritten. */
2649 if (inst->dst.file == MRF) {
2650 last_mrf_move[inst->dst.reg] = NULL;
2651 }
2652
2653 if (inst->mlen > 0 && inst->base_mrf != -1) {
2654 /* Found a SEND instruction, which will include two or fewer
2655 * implied MRF writes. We could do better here.
2656 */
2657 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2658 last_mrf_move[inst->base_mrf + i] = NULL;
2659 }
2660 }
2661
2662 /* Clear out any MRF move records whose sources got overwritten. */
2663 if (inst->dst.file == GRF) {
2664 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2665 if (last_mrf_move[i] &&
2666 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2667 last_mrf_move[i] = NULL;
2668 }
2669 }
2670 }
2671
2672 if (inst->opcode == BRW_OPCODE_MOV &&
2673 inst->dst.file == MRF &&
2674 inst->src[0].file == GRF &&
2675 !inst->is_partial_write()) {
2676 last_mrf_move[inst->dst.reg] = inst;
2677 }
2678 }
2679
2680 if (progress)
2681 invalidate_live_intervals();
2682
2683 return progress;
2684 }
2685
2686 static void
2687 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2688 int first_grf, int grf_len)
2689 {
2690 bool inst_simd16 = (dispatch_width > 8 &&
2691 !inst->force_uncompressed &&
2692 !inst->force_sechalf);
2693
2694 /* Clear the flag for registers that actually got read (as expected). */
2695 for (int i = 0; i < 3; i++) {
2696 int grf;
2697 if (inst->src[i].file == GRF) {
2698 grf = inst->src[i].reg;
2699 } else if (inst->src[i].file == HW_REG &&
2700 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2701 grf = inst->src[i].fixed_hw_reg.nr;
2702 } else {
2703 continue;
2704 }
2705
2706 if (grf >= first_grf &&
2707 grf < first_grf + grf_len) {
2708 deps[grf - first_grf] = false;
2709 if (inst_simd16)
2710 deps[grf - first_grf + 1] = false;
2711 }
2712 }
2713 }
2714
2715 /**
2716 * Implements this workaround for the original 965:
2717 *
2718 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2719 * check for post destination dependencies on this instruction, software
2720 * must ensure that there is no destination hazard for the case of ‘write
2721 * followed by a posted write’ shown in the following example.
2722 *
2723 * 1. mov r3 0
2724 * 2. send r3.xy <rest of send instruction>
2725 * 3. mov r2 r3
2726 *
2727 * Due to no post-destination dependency check on the ‘send’, the above
2728 * code sequence could have two instructions (1 and 2) in flight at the
2729 * same time that both consider ‘r3’ as the target of their final writes.
2730 */
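/* Concretely (illustrative register numbers): for a SEND that writes g13-g14,
 * this walks backwards from the SEND; any of those registers written by an
 * earlier instruction and not read since gets a dependency-resolving MOV
 * (DEP_RESOLVE_MOV) for that register inserted immediately before the SEND,
 * forcing the older write to complete first.
 */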
2731 void
2732 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2733 {
2734 int reg_size = dispatch_width / 8;
2735 int write_len = inst->regs_written * reg_size;
2736 int first_write_grf = inst->dst.reg;
2737 bool needs_dep[BRW_MAX_MRF];
2738 assert(write_len < (int)sizeof(needs_dep) - 1);
2739
2740 memset(needs_dep, false, sizeof(needs_dep));
2741 memset(needs_dep, true, write_len);
2742
2743 clear_deps_for_inst_src(inst, dispatch_width,
2744 needs_dep, first_write_grf, write_len);
2745
2746 /* Walk backwards looking for writes to registers we're writing which
2747 * aren't read since being written. If we hit the start of the program,
2748 * we assume that there are no outstanding dependencies on entry to the
2749 * program.
2750 */
2751 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2752 scan_inst != NULL;
2753 scan_inst = (fs_inst *)scan_inst->prev) {
2754
2755 /* If we hit control flow, assume that there *are* outstanding
2756 * dependencies, and force their cleanup before our instruction.
2757 */
2758 if (scan_inst->is_control_flow()) {
2759 for (int i = 0; i < write_len; i++) {
2760 if (needs_dep[i]) {
2761 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2762 }
2763 }
2764 return;
2765 }
2766
2767 bool scan_inst_simd16 = (dispatch_width > 8 &&
2768 !scan_inst->force_uncompressed &&
2769 !scan_inst->force_sechalf);
2770
2771 /* We insert our reads as late as possible on the assumption that any
2772 * instruction but a MOV that might have left us an outstanding
2773 * dependency has more latency than a MOV.
2774 */
2775 if (scan_inst->dst.file == GRF) {
2776 for (int i = 0; i < scan_inst->regs_written; i++) {
2777 int reg = scan_inst->dst.reg + i * reg_size;
2778
2779 if (reg >= first_write_grf &&
2780 reg < first_write_grf + write_len &&
2781 needs_dep[reg - first_write_grf]) {
2782 inst->insert_before(DEP_RESOLVE_MOV(reg));
2783 needs_dep[reg - first_write_grf] = false;
2784 if (scan_inst_simd16)
2785 needs_dep[reg - first_write_grf + 1] = false;
2786 }
2787 }
2788 }
2789
2790 /* Clear the flag for registers that actually got read (as expected). */
2791 clear_deps_for_inst_src(scan_inst, dispatch_width,
2792 needs_dep, first_write_grf, write_len);
2793
2794 /* Continue the loop only if we haven't resolved all the dependencies */
2795 int i;
2796 for (i = 0; i < write_len; i++) {
2797 if (needs_dep[i])
2798 break;
2799 }
2800 if (i == write_len)
2801 return;
2802 }
2803 }
2804
2805 /**
2806 * Implements this workaround for the original 965:
2807 *
2808 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2809 * used as a destination register until after it has been sourced by an
2810 * instruction with a different destination register.
2811 */
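/* Concretely (illustrative): after
 *
 *    send g14, <message>
 *
 * if a later instruction would write g14 before anything has sourced it, a
 * dependency-resolving MOV (DEP_RESOLVE_MOV) for g14 is inserted just before
 * that write; control flow or the end of the program likewise forces any
 * remaining dependencies to be resolved.
 */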
2812 void
2813 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2814 {
2815 int write_len = inst->regs_written * dispatch_width / 8;
2816 int first_write_grf = inst->dst.reg;
2817 bool needs_dep[BRW_MAX_MRF];
2818 assert(write_len < (int)sizeof(needs_dep) - 1);
2819
2820 memset(needs_dep, false, sizeof(needs_dep));
2821 memset(needs_dep, true, write_len);
2822 /* Walk forwards looking for writes to registers we're writing which aren't
2823 * read before being written.
2824 */
2825 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2826 !scan_inst->is_tail_sentinel();
2827 scan_inst = (fs_inst *)scan_inst->next) {
2828 /* If we hit control flow, force resolve all remaining dependencies. */
2829 if (scan_inst->is_control_flow()) {
2830 for (int i = 0; i < write_len; i++) {
2831 if (needs_dep[i])
2832 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2833 }
2834 return;
2835 }
2836
2837 /* Clear the flag for registers that actually got read (as expected). */
2838 clear_deps_for_inst_src(scan_inst, dispatch_width,
2839 needs_dep, first_write_grf, write_len);
2840
2841 /* We insert our reads as late as possible since they're reading the
2842 * result of a SEND, which has massive latency.
2843 */
2844 if (scan_inst->dst.file == GRF &&
2845 scan_inst->dst.reg >= first_write_grf &&
2846 scan_inst->dst.reg < first_write_grf + write_len &&
2847 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2848 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2849 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2850 }
2851
2852 /* Continue the loop only if we haven't resolved all the dependencies */
2853 int i;
2854 for (i = 0; i < write_len; i++) {
2855 if (needs_dep[i])
2856 break;
2857 }
2858 if (i == write_len)
2859 return;
2860 }
2861
2862 /* If we hit the end of the program, resolve all remaining dependencies out
2863 * of paranoia.
2864 */
2865 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2866 assert(last_inst->eot);
2867 for (int i = 0; i < write_len; i++) {
2868 if (needs_dep[i])
2869 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2870 }
2871 }
2872
2873 void
2874 fs_visitor::insert_gen4_send_dependency_workarounds()
2875 {
2876 if (brw->gen != 4 || brw->is_g4x)
2877 return;
2878
2879 /* Note that we're done with register allocation, so GRF fs_regs always
2880 * have a .reg_offset of 0.
2881 */
2882
2883 foreach_list_safe(node, &this->instructions) {
2884 fs_inst *inst = (fs_inst *)node;
2885
2886 if (inst->mlen != 0 && inst->dst.file == GRF) {
2887 insert_gen4_pre_send_dependency_workarounds(inst);
2888 insert_gen4_post_send_dependency_workarounds(inst);
2889 }
2890 }
2891 }
2892
2893 /**
2894 * Turns the generic expression-style uniform pull constant load instruction
2895 * into a hardware-specific series of instructions for loading a pull
2896 * constant.
2897 *
2898 * The expression style allows the CSE pass before this to optimize out
2899 * repeated loads from the same offset, and gives the pre-register-allocation
2900 * scheduling full flexibility, while the conversion to native instructions
2901 * gives the post-register-allocation scheduler the best information
2902 * possible.
2903 *
2904 * Note that execution masking for setting up pull constant loads is special:
2905 * the channels that need to be written are unrelated to the current execution
2906 * mask, since a later instruction will use one of the result channels as a
2907 * source operand for all 8 or 16 of its channels.
2908 */
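/* Illustrative sketch of the gen7 path (register numbers invented): the
 * generic load
 *
 *    <uniform pull constant load> vgrf6:F, <surf_index>, 32u
 *
 * has its vec4-aligned byte offset converted to a dword offset (32 -> 8),
 * which FS_OPCODE_SET_SIMD4X2_OFFSET places in a payload register, and the
 * load itself becomes FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7 sourcing
 * that payload. On gen4-6 the instruction instead just gets an MRF payload
 * (base_mrf 14, mlen 1).
 */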
2909 void
2910 fs_visitor::lower_uniform_pull_constant_loads()
2911 {
2912 foreach_list(node, &this->instructions) {
2913 fs_inst *inst = (fs_inst *)node;
2914
2915 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2916 continue;
2917
2918 if (brw->gen >= 7) {
2919 /* The offset arg before was a vec4-aligned byte offset. We need to
2920 * turn it into a dword offset.
2921 */
2922 fs_reg const_offset_reg = inst->src[1];
2923 assert(const_offset_reg.file == IMM &&
2924 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2925 const_offset_reg.imm.u /= 4;
2926 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2927
2928 /* This is actually going to be a MOV, but since only the first dword
2929 * is accessed, we have a special opcode to do just that one. Note
2930 * that this needs to be an operation that will be considered a def
2931 * by live variable analysis, or register allocation will explode.
2932 */
2933 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2934 payload, const_offset_reg);
2935 setup->force_writemask_all = true;
2936
2937 setup->ir = inst->ir;
2938 setup->annotation = inst->annotation;
2939 inst->insert_before(setup);
2940
2941 /* Similarly, this will only populate the first 4 channels of the
2942 * result register (since we only use smear values from 0-3), but we
2943 * don't tell the optimizer.
2944 */
2945 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2946 inst->src[1] = payload;
2947
2948 invalidate_live_intervals();
2949 } else {
2950 /* Before register allocation, we didn't tell the scheduler about the
2951 * MRF we use. We know it's safe to use this MRF because nothing
2952 * else does except for register spill/unspill, which generates and
2953 * uses its MRF within a single IR instruction.
2954 */
2955 inst->base_mrf = 14;
2956 inst->mlen = 1;
2957 }
2958 }
2959 }
2960
2961 void
2962 fs_visitor::dump_instructions()
2963 {
2964 calculate_register_pressure();
2965
2966 int ip = 0, max_pressure = 0;
2967 foreach_list(node, &this->instructions) {
2968 backend_instruction *inst = (backend_instruction *)node;
2969 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2970 printf("{%3d} %4d: ", regs_live_at_ip[ip], ip);
2971 dump_instruction(inst);
2972 ++ip;
2973 }
2974 printf("Maximum %3d registers live at once.\n", max_pressure);
2975 }
2976
2977 void
2978 fs_visitor::dump_instruction(backend_instruction *be_inst)
2979 {
2980 fs_inst *inst = (fs_inst *)be_inst;
2981
2982 if (inst->predicate) {
2983 printf("(%cf0.%d) ",
2984 inst->predicate_inverse ? '-' : '+',
2985 inst->flag_subreg);
2986 }
2987
2988 printf("%s", brw_instruction_name(inst->opcode));
2989 if (inst->saturate)
2990 printf(".sat");
2991 if (inst->conditional_mod) {
2992 printf("%s", conditional_modifier[inst->conditional_mod]);
2993 if (!inst->predicate &&
2994 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2995 inst->opcode != BRW_OPCODE_IF &&
2996 inst->opcode != BRW_OPCODE_WHILE))) {
2997 printf(".f0.%d", inst->flag_subreg);
2998 }
2999 }
3000 printf(" ");
3001
3002
3003 switch (inst->dst.file) {
3004 case GRF:
3005 printf("vgrf%d", inst->dst.reg);
3006 if (virtual_grf_sizes[inst->dst.reg] != 1)
3007 printf("+%d", inst->dst.reg_offset);
3008 break;
3009 case MRF:
3010 printf("m%d", inst->dst.reg);
3011 break;
3012 case BAD_FILE:
3013 printf("(null)");
3014 break;
3015 case UNIFORM:
3016 printf("***u%d***", inst->dst.reg);
3017 break;
3018 case HW_REG:
3019 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3020 switch (inst->dst.fixed_hw_reg.nr) {
3021 case BRW_ARF_NULL:
3022 printf("null");
3023 break;
3024 case BRW_ARF_ADDRESS:
3025 printf("a0.%d", inst->dst.fixed_hw_reg.subnr);
3026 break;
3027 case BRW_ARF_ACCUMULATOR:
3028 printf("acc%d", inst->dst.fixed_hw_reg.subnr);
3029 break;
3030 case BRW_ARF_FLAG:
3031 printf("f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3032 inst->dst.fixed_hw_reg.subnr);
3033 break;
3034 default:
3035 printf("arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3036 inst->dst.fixed_hw_reg.subnr);
3037 break;
3038 }
3039 } else {
3040 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
3041 }
3042 if (inst->dst.fixed_hw_reg.subnr)
3043 printf("+%d", inst->dst.fixed_hw_reg.subnr);
3044 break;
3045 default:
3046 printf("???");
3047 break;
3048 }
3049 printf(":%s, ", reg_encoding[inst->dst.type]);
3050
3051 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
3052 if (inst->src[i].negate)
3053 printf("-");
3054 if (inst->src[i].abs)
3055 printf("|");
3056 switch (inst->src[i].file) {
3057 case GRF:
3058 printf("vgrf%d", inst->src[i].reg);
3059 if (virtual_grf_sizes[inst->src[i].reg] != 1)
3060 printf("+%d", inst->src[i].reg_offset);
3061 break;
3062 case MRF:
3063 printf("***m%d***", inst->src[i].reg);
3064 break;
3065 case UNIFORM:
3066 printf("u%d", inst->src[i].reg);
3067 if (virtual_grf_sizes[inst->src[i].reg] != 1)
3068 printf(".%d", inst->src[i].reg_offset);
3069 break;
3070 case BAD_FILE:
3071 printf("(null)");
3072 break;
3073 case IMM:
3074 switch (inst->src[i].type) {
3075 case BRW_REGISTER_TYPE_F:
3076 printf("%ff", inst->src[i].imm.f);
3077 break;
3078 case BRW_REGISTER_TYPE_D:
3079 printf("%dd", inst->src[i].imm.i);
3080 break;
3081 case BRW_REGISTER_TYPE_UD:
3082 printf("%uu", inst->src[i].imm.u);
3083 break;
3084 default:
3085 printf("???");
3086 break;
3087 }
3088 break;
3089 case HW_REG:
3090 if (inst->src[i].fixed_hw_reg.negate)
3091 printf("-");
3092 if (inst->src[i].fixed_hw_reg.abs)
3093 printf("|");
3094 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3095 switch (inst->src[i].fixed_hw_reg.nr) {
3096 case BRW_ARF_NULL:
3097 printf("null");
3098 break;
3099 case BRW_ARF_ADDRESS:
3100 printf("a0.%d", inst->src[i].fixed_hw_reg.subnr);
3101 break;
3102 case BRW_ARF_ACCUMULATOR:
3103 printf("acc%d", inst->src[i].fixed_hw_reg.subnr);
3104 break;
3105 case BRW_ARF_FLAG:
3106 printf("f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3107 inst->src[i].fixed_hw_reg.subnr);
3108 break;
3109 default:
3110 printf("arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3111 inst->src[i].fixed_hw_reg.subnr);
3112 break;
3113 }
3114 } else {
3115 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3116 }
3117 if (inst->src[i].fixed_hw_reg.subnr)
3118 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
3119 if (inst->src[i].fixed_hw_reg.abs)
3120 printf("|");
3121 break;
3122 default:
3123 printf("???");
3124 break;
3125 }
3126 if (inst->src[i].abs)
3127 printf("|");
3128
3129 if (inst->src[i].file != IMM) {
3130 printf(":%s", reg_encoding[inst->src[i].type]);
3131 }
3132
3133 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
3134 printf(", ");
3135 }
3136
3137 printf(" ");
3138
3139 if (inst->force_uncompressed)
3140 printf("1sthalf ");
3141
3142 if (inst->force_sechalf)
3143 printf("2ndhalf ");
3144
3145 printf("\n");
3146 }
3147
3148 /**
3149 * Possibly returns an instruction that set up @param reg.
3150 *
3151 * Sometimes we want to take the result of some expression/variable
3152 * dereference tree and rewrite the instruction generating the result
3153 * of the tree. When processing the tree, we know that the
3154 * instructions generated are all writing temporaries that are dead
3155 * outside of this tree. So, if we have some instructions that write
3156 * a temporary, we're free to point that temp write somewhere else.
3157 *
3158 * Note that this doesn't guarantee that the returned instruction wrote
3159 * only reg -- it might be the size=4 destination of a texture instruction.
3160 */
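/* Hypothetical usage sketch: a caller that has just emitted the instructions
 * for an expression can do
 *
 *    fs_inst *modify = get_instruction_generating_reg(before, last, result);
 *    if (modify)
 *       modify->saturate = true;
 *
 * to fold a saturate into the producing instruction rather than emitting a
 * separate MOV.
 */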
3161 fs_inst *
3162 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3163 fs_inst *end,
3164 fs_reg reg)
3165 {
3166 if (end == start ||
3167 end->is_partial_write() ||
3168 reg.reladdr ||
3169 !reg.equals(end->dst)) {
3170 return NULL;
3171 } else {
3172 return end;
3173 }
3174 }
3175
3176 void
3177 fs_visitor::setup_payload_gen6()
3178 {
3179 bool uses_depth =
3180 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3181 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3182
3183 assert(brw->gen >= 6);
3184
3185 /* R0-1: masks, pixel X/Y coordinates. */
3186 c->nr_payload_regs = 2;
3187 /* R2: only for 32-pixel dispatch. */
3188
3189 /* R3-26: barycentric interpolation coordinates. These appear in the
3190 * same order that they appear in the brw_wm_barycentric_interp_mode
3191 * enum. Each set of coordinates occupies 2 registers if dispatch width
3192 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3193 * appear if they were enabled using the "Barycentric Interpolation
3194 * Mode" bits in WM_STATE.
3195 */
3196 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3197 if (barycentric_interp_modes & (1 << i)) {
3198 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3199 c->nr_payload_regs += 2;
3200 if (dispatch_width == 16) {
3201 c->nr_payload_regs += 2;
3202 }
3203 }
3204 }
3205
3206 /* R27: interpolated depth if uses source depth */
3207 if (uses_depth) {
3208 c->source_depth_reg = c->nr_payload_regs;
3209 c->nr_payload_regs++;
3210 if (dispatch_width == 16) {
3211 /* R28: interpolated depth if not SIMD8. */
3212 c->nr_payload_regs++;
3213 }
3214 }
3215 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3216 if (uses_depth) {
3217 c->source_w_reg = c->nr_payload_regs;
3218 c->nr_payload_regs++;
3219 if (dispatch_width == 16) {
3220 /* R30: interpolated W if not SIMD8. */
3221 c->nr_payload_regs++;
3222 }
3223 }
3224
3225 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3226 /* R31: MSAA position offsets. */
3227 if (c->prog_data.uses_pos_offset) {
3228 c->sample_pos_reg = c->nr_payload_regs;
3229 c->nr_payload_regs++;
3230 }
3231
3232 /* R32: MSAA input coverage mask */
3233 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3234 assert(brw->gen >= 7);
3235 c->sample_mask_reg = c->nr_payload_regs;
3236 c->nr_payload_regs++;
3237 if (dispatch_width == 16) {
3238 /* R33: input coverage mask if not SIMD8. */
3239 c->nr_payload_regs++;
3240 }
3241 }
3242
3243 /* R34-: bary for 32-pixel. */
3244 /* R58-59: interp W for 32-pixel. */
3245
3246 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3247 c->source_depth_to_render_target = true;
3248 }
3249 }
3250
3251 void
3252 fs_visitor::assign_binding_table_offsets()
3253 {
3254 uint32_t next_binding_table_offset = 0;
3255
3256 /* If there are no color regions, we still perform an FB write to a null
3257 * renderbuffer, which we place at surface index 0.
3258 */
3259 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3260 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3261
3262 assign_common_binding_table_offsets(next_binding_table_offset);
3263 }
3264
3265 void
3266 fs_visitor::calculate_register_pressure()
3267 {
3268 calculate_live_intervals();
3269
3270 int num_instructions = 0;
3271 foreach_list(node, &this->instructions) {
3272 ++num_instructions;
3273 }
3274
3275 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3276
3277 for (int reg = 0; reg < virtual_grf_count; reg++) {
3278 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3279 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3280 }
3281 }
3282
3283 bool
3284 fs_visitor::run()
3285 {
3286 sanity_param_count = fp->Base.Parameters->NumParameters;
3287 uint32_t orig_nr_params = c->prog_data.nr_params;
3288 bool allocated_without_spills;
3289
3290 assign_binding_table_offsets();
3291
3292 if (brw->gen >= 6)
3293 setup_payload_gen6();
3294 else
3295 setup_payload_gen4();
3296
3297 if (0) {
3298 emit_dummy_fs();
3299 } else {
3300 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3301 emit_shader_time_begin();
3302
3303 calculate_urb_setup();
3304 if (fp->Base.InputsRead > 0) {
3305 if (brw->gen < 6)
3306 emit_interpolation_setup_gen4();
3307 else
3308 emit_interpolation_setup_gen6();
3309 }
3310
3311 /* We handle discards by keeping track of the still-live pixels in f0.1.
3312 * Initialize it with the dispatched pixels.
3313 */
3314 if (fp->UsesKill || c->key.alpha_test_func) {
3315 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3316 discard_init->flag_subreg = 1;
3317 }
3318
3319 /* Generate FS IR for main(). (the visitor only descends into
3320 * functions called "main").
3321 */
3322 if (shader) {
3323 foreach_list(node, &*shader->base.ir) {
3324 ir_instruction *ir = (ir_instruction *)node;
3325 base_ir = ir;
3326 this->result = reg_undef;
3327 ir->accept(this);
3328 }
3329 } else {
3330 emit_fragment_program_code();
3331 }
3332 base_ir = NULL;
3333 if (failed)
3334 return false;
3335
3336 emit(FS_OPCODE_PLACEHOLDER_HALT);
3337
3338 if (c->key.alpha_test_func)
3339 emit_alpha_test();
3340
3341 emit_fb_writes();
3342
3343 split_virtual_grfs();
3344
3345 move_uniform_array_access_to_pull_constants();
3346 remove_dead_constants();
3347 setup_pull_constants();
3348
3349 bool progress;
3350 do {
3351 progress = false;
3352
3353 compact_virtual_grfs();
3354
3355 progress = remove_duplicate_mrf_writes() || progress;
3356
3357 progress = opt_algebraic() || progress;
3358 progress = opt_cse() || progress;
3359 progress = opt_copy_propagate() || progress;
3360 progress = opt_peephole_predicated_break() || progress;
3361 progress = dead_code_eliminate() || progress;
3362 progress = dead_code_eliminate_local() || progress;
3363 progress = opt_peephole_sel() || progress;
3364 progress = dead_control_flow_eliminate(this) || progress;
3365 progress = register_coalesce() || progress;
3366 progress = compute_to_mrf() || progress;
3367 } while (progress);
3368
3369 lower_uniform_pull_constant_loads();
3370
3371 assign_curb_setup();
3372 assign_urb_setup();
3373
3374 static enum instruction_scheduler_mode pre_modes[] = {
3375 SCHEDULE_PRE,
3376 SCHEDULE_PRE_NON_LIFO,
3377 SCHEDULE_PRE_LIFO,
3378 };
3379
3380 /* Try each scheduling heuristic to see if it can successfully register
3381 * allocate without spilling. They should be ordered by decreasing
3382 * performance but increasing likelihood of allocating.
3383 */
3384 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3385 schedule_instructions(pre_modes[i]);
3386
3387 if (0) {
3388 assign_regs_trivial();
3389 allocated_without_spills = true;
3390 } else {
3391 allocated_without_spills = assign_regs(false);
3392 }
3393 if (allocated_without_spills)
3394 break;
3395 }
3396
3397 if (!allocated_without_spills) {
3398 /* We assume that any spilling is worse than just dropping back to
3399 * SIMD8. There's probably actually some intermediate point where
3400 * SIMD16 with a couple of spills is still better.
3401 */
3402 if (dispatch_width == 16) {
3403 fail("Failure to register allocate. Reduce number of "
3404 "live scalar values to avoid this.");
3405 }
3406
3407 /* Since we're out of heuristics, just go spill registers until we
3408 * get an allocation.
3409 */
3410 while (!assign_regs(true)) {
3411 if (failed)
3412 break;
3413 }
3414 }
3415 }
3416 assert(force_uncompressed_stack == 0);
3417
3418 /* This must come after all optimization and register allocation, since
3419 * it inserts dead code that happens to have side effects, and it does
3420 * so based on the actual physical registers in use.
3421 */
3422 insert_gen4_send_dependency_workarounds();
3423
3424 if (failed)
3425 return false;
3426
3427 if (!allocated_without_spills)
3428 schedule_instructions(SCHEDULE_POST);
3429
3430 if (dispatch_width == 8) {
3431 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3432 } else {
3433 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3434
3435 /* Make sure we didn't try to sneak in an extra uniform */
3436 assert(orig_nr_params == c->prog_data.nr_params);
3437 (void) orig_nr_params;
3438 }
3439
3440 /* If any state parameters were appended, then ParameterValues could have
3441 * been realloced, in which case the driver uniform storage set up by
3442 * _mesa_associate_uniform_storage() would point to freed memory. Make
3443 * sure that didn't happen.
3444 */
3445 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3446
3447 return !failed;
3448 }
3449
3450 const unsigned *
3451 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3452 struct gl_fragment_program *fp,
3453 struct gl_shader_program *prog,
3454 unsigned *final_assembly_size)
3455 {
3456 bool start_busy = false;
3457 float start_time = 0;
3458
3459 if (unlikely(brw->perf_debug)) {
3460 start_busy = (brw->batch.last_bo &&
3461 drm_intel_bo_busy(brw->batch.last_bo));
3462 start_time = get_time();
3463 }
3464
3465 struct brw_shader *shader = NULL;
3466 if (prog)
3467 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3468
3469 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3470 if (prog) {
3471 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3472 _mesa_print_ir(shader->base.ir, NULL);
3473 printf("\n\n");
3474 } else {
3475 printf("ARB_fragment_program %d ir for native fragment shader\n",
3476 fp->Base.Id);
3477 _mesa_print_program(&fp->Base);
3478 }
3479 }
3480
3481 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3482 */
3483 fs_visitor v(brw, c, prog, fp, 8);
3484 if (!v.run()) {
3485 if (prog) {
3486 prog->LinkStatus = false;
3487 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3488 }
3489
3490 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3491 v.fail_msg);
3492
3493 return NULL;
3494 }
3495
3496 exec_list *simd16_instructions = NULL;
3497 fs_visitor v2(brw, c, prog, fp, 16);
3498 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3499 if (c->prog_data.nr_pull_params == 0) {
3500 /* Try a SIMD16 compile */
3501 v2.import_uniforms(&v);
3502 if (!v2.run()) {
3503 perf_debug("SIMD16 shader failed to compile, falling back to "
3504 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3505 } else {
3506 simd16_instructions = &v2.instructions;
3507 }
3508 } else {
3509 perf_debug("Skipping SIMD16 due to pull parameters.\n");
3510 }
3511 }
3512
3513 const unsigned *assembly = NULL;
3514 if (brw->gen >= 8) {
3515 gen8_fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3516 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3517 final_assembly_size);
3518 } else {
3519 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3520 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3521 final_assembly_size);
3522 }
3523
3524 if (unlikely(brw->perf_debug) && shader) {
3525 if (shader->compiled_once)
3526 brw_wm_debug_recompile(brw, prog, &c->key);
3527 shader->compiled_once = true;
3528
3529 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3530 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3531 (get_time() - start_time) * 1000);
3532 }
3533 }
3534
3535 return assembly;
3536 }
3537
3538 bool
3539 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3540 {
3541 struct brw_context *brw = brw_context(ctx);
3542 struct brw_wm_prog_key key;
3543
3544 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3545 return true;
3546
3547 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3548 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3549 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3550 bool program_uses_dfdy = fp->UsesDFdy;
3551
3552 memset(&key, 0, sizeof(key));
3553
3554 if (brw->gen < 6) {
3555 if (fp->UsesKill)
3556 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3557
3558 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3559 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3560
3561 /* Just assume depth testing. */
3562 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3563 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3564 }
3565
3566 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3567 BRW_FS_VARYING_INPUT_MASK) > 16)
3568 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3569
3570 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3571
3572 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3573 for (unsigned i = 0; i < sampler_count; i++) {
3574 if (fp->Base.ShadowSamplers & (1 << i)) {
3575 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3576 key.tex.swizzles[i] =
3577 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3578 } else {
3579 /* Color sampler: assume no swizzling. */
3580 key.tex.swizzles[i] = SWIZZLE_XYZW;
3581 }
3582 }
3583
3584 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3585 key.drawable_height = ctx->DrawBuffer->Height;
3586 }
3587
3588 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3589 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3590 }
3591
3592 key.nr_color_regions = 1;
3593
3594 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3595 * quality of the derivatives is likely to be determined by the driconf
3596 * option.
3597 */
3598 key.high_quality_derivatives = brw->disable_derivative_optimization;
3599
3600 key.program_string_id = bfp->id;
3601
3602 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3603 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3604
3605 bool success = do_wm_prog(brw, prog, bfp, &key);
3606
3607 brw->wm.base.prog_offset = old_prog_offset;
3608 brw->wm.prog_data = old_prog_data;
3609
3610 return success;
3611 }