i965/fs: Use a single instance of the pull_constant_loc[] array.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->conditional_mod = BRW_CONDITIONAL_NONE;
59
60 this->dst = reg_undef;
61 this->src[0] = reg_undef;
62 this->src[1] = reg_undef;
63 this->src[2] = reg_undef;
64
65 /* This will be the case for almost all instructions. */
66 this->regs_written = 1;
67 }
68
69 fs_inst::fs_inst()
70 {
71 init();
72 this->opcode = BRW_OPCODE_NOP;
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187 ALU2(SEL)
188
189 /** Gen4 predicated IF. */
190 fs_inst *
191 fs_visitor::IF(uint32_t predicate)
192 {
193 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195 return inst;
196 }
197
198 /** Gen6 IF with embedded comparison. */
199 fs_inst *
200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
201 {
202 assert(brw->gen == 6);
203 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
204 reg_null_d, src0, src1);
205 inst->conditional_mod = condition;
206 return inst;
207 }
208
209 /**
210 * CMP: Sets the low bit of the destination channels with the result
211 * of the comparison, while the upper bits are undefined, and updates
212 * the flag register with the packed 16 bits of the result.
213 */
214 fs_inst *
215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
216 {
217 fs_inst *inst;
218
219 /* Take the instruction:
220 *
221 * CMP null<d> src0<f> src1<f>
222 *
223 * Original gen4 does type conversion to the destination type before
224 * comparison, producing garbage results for floating point comparisons.
225 * gen5 does the comparison on the execution type (resolved source types),
226 * so dst type doesn't matter. gen6 does comparison and then uses the
227 * result as if it was the dst type with no conversion, which happens to
228 * mostly work out for float-interpreted-as-int since our comparisons are
229 * for >0, =0, <0.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 exec_list
247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
248 const fs_reg &surf_index,
249 const fs_reg &varying_offset,
250 uint32_t const_offset)
251 {
252 exec_list instructions;
253 fs_inst *inst;
254
255 /* We have our constant surface use a pitch of 4 bytes, so our index can
256 * be any component of a vector, and then we load 4 contiguous
257 * components starting from that.
258 *
259 * We break down the const_offset to a portion added to the variable
260 * offset and a portion done using reg_offset, which means that if you
261 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
262 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
263 * CSE can later notice that those loads are all the same and eliminate
264 * the redundant ones.
265 */
266 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
267 instructions.push_tail(ADD(vec4_offset,
268 varying_offset, const_offset & ~3));
269
270 int scale = 1;
271 if (brw->gen == 4 && dispatch_width == 8) {
272 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
273 * u, v, r) as parameters, or we can just use the SIMD16 message
274 * consisting of (header, u). We choose the second, at the cost of a
275 * longer return length.
276 */
277 scale = 2;
278 }
279
280 enum opcode op;
281 if (brw->gen >= 7)
282 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
283 else
284 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
285 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
286 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
287 inst->regs_written = 4 * scale;
288 instructions.push_tail(inst);
289
290 if (brw->gen < 7) {
291 inst->base_mrf = 13;
292 inst->header_present = true;
293 if (brw->gen == 4)
294 inst->mlen = 3;
295 else
296 inst->mlen = 1 + dispatch_width / 8;
297 }
298
299 vec4_result.reg_offset += (const_offset & 3) * scale;
300 instructions.push_tail(MOV(dst, vec4_result));
301
302 return instructions;
303 }
304
305 /**
306 * A helper for MOV generation for fixing up broken hardware SEND dependency
307 * handling.
308 */
309 fs_inst *
310 fs_visitor::DEP_RESOLVE_MOV(int grf)
311 {
312 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
313
314 inst->ir = NULL;
315 inst->annotation = "send dependency resolve";
316
317 /* The caller always wants uncompressed to emit the minimal extra
318 * dependencies, and to avoid having to deal with aligning its regs to 2.
319 */
320 inst->force_uncompressed = true;
321
322 return inst;
323 }
324
325 bool
326 fs_inst::equals(fs_inst *inst)
327 {
328 return (opcode == inst->opcode &&
329 dst.equals(inst->dst) &&
330 src[0].equals(inst->src[0]) &&
331 src[1].equals(inst->src[1]) &&
332 src[2].equals(inst->src[2]) &&
333 saturate == inst->saturate &&
334 predicate == inst->predicate &&
335 conditional_mod == inst->conditional_mod &&
336 mlen == inst->mlen &&
337 base_mrf == inst->base_mrf &&
338 sampler == inst->sampler &&
339 target == inst->target &&
340 eot == inst->eot &&
341 header_present == inst->header_present &&
342 shadow_compare == inst->shadow_compare &&
343 offset == inst->offset);
344 }
345
346 bool
347 fs_inst::overwrites_reg(const fs_reg &reg)
348 {
349 return (reg.file == dst.file &&
350 reg.reg == dst.reg &&
351 reg.reg_offset >= dst.reg_offset &&
352 reg.reg_offset < dst.reg_offset + regs_written);
353 }
354
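/**
 * Returns true for instructions whose SEND message payload is sourced
 * directly from the GRF instead of being copied through MRFs first.
 */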
355 bool
356 fs_inst::is_send_from_grf()
357 {
358 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
359 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
360 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
361 src[1].file == GRF) ||
362 (is_tex() && src[0].file == GRF));
363 }
364
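/**
 * Returns true if negate/abs source modifiers can be applied to the sources
 * of \p inst: Gen6 math, sends from the GRF, and a few opcode-specific
 * cases cannot take them.
 */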
365 bool
366 fs_visitor::can_do_source_mods(fs_inst *inst)
367 {
368 if (brw->gen == 6 && inst->is_math())
369 return false;
370
371 if (inst->is_send_from_grf())
372 return false;
373
374 if (!inst->can_do_source_mods())
375 return false;
376
377 return true;
378 }
379
380 void
381 fs_reg::init()
382 {
383 memset(this, 0, sizeof(*this));
384 stride = 1;
385 }
386
387 /** Generic unset register constructor. */
388 fs_reg::fs_reg()
389 {
390 init();
391 this->file = BAD_FILE;
392 }
393
394 /** Immediate value constructor. */
395 fs_reg::fs_reg(float f)
396 {
397 init();
398 this->file = IMM;
399 this->type = BRW_REGISTER_TYPE_F;
400 this->imm.f = f;
401 }
402
403 /** Immediate value constructor. */
404 fs_reg::fs_reg(int32_t i)
405 {
406 init();
407 this->file = IMM;
408 this->type = BRW_REGISTER_TYPE_D;
409 this->imm.i = i;
410 }
411
412 /** Immediate value constructor. */
413 fs_reg::fs_reg(uint32_t u)
414 {
415 init();
416 this->file = IMM;
417 this->type = BRW_REGISTER_TYPE_UD;
418 this->imm.u = u;
419 }
420
421 /** Fixed brw_reg. */
422 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
423 {
424 init();
425 this->file = HW_REG;
426 this->fixed_hw_reg = fixed_hw_reg;
427 this->type = fixed_hw_reg.type;
428 }
429
430 bool
431 fs_reg::equals(const fs_reg &r) const
432 {
433 return (file == r.file &&
434 reg == r.reg &&
435 reg_offset == r.reg_offset &&
436 subreg_offset == r.subreg_offset &&
437 type == r.type &&
438 negate == r.negate &&
439 abs == r.abs &&
440 !reladdr && !r.reladdr &&
441 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
442 sizeof(fixed_hw_reg)) == 0 &&
443 stride == r.stride &&
444 imm.u == r.imm.u);
445 }
446
447 fs_reg &
448 fs_reg::apply_stride(unsigned stride)
449 {
450 assert((this->stride * stride) <= 4 &&
451 (is_power_of_two(stride) || stride == 0) &&
452 file != HW_REG && file != IMM);
453 this->stride *= stride;
454 return *this;
455 }
456
457 fs_reg &
458 fs_reg::set_smear(unsigned subreg)
459 {
460 assert(file != HW_REG && file != IMM);
461 subreg_offset = subreg * type_sz(type);
462 stride = 0;
463 return *this;
464 }
465
466 bool
467 fs_reg::is_contiguous() const
468 {
469 return stride == 1;
470 }
471
472 bool
473 fs_reg::is_zero() const
474 {
475 if (file != IMM)
476 return false;
477
478 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
479 }
480
481 bool
482 fs_reg::is_one() const
483 {
484 if (file != IMM)
485 return false;
486
487 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
488 }
489
490 bool
491 fs_reg::is_null() const
492 {
493 return file == HW_REG &&
494 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
495 fixed_hw_reg.nr == BRW_ARF_NULL;
496 }
497
498 bool
499 fs_reg::is_valid_3src() const
500 {
501 return file == GRF || file == UNIFORM;
502 }
503
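/**
 * Returns the number of scalar slots a GLSL type occupies in this backend:
 * one per component, with samplers and atomic counters taking no space.
 */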
504 int
505 fs_visitor::type_size(const struct glsl_type *type)
506 {
507 unsigned int size, i;
508
509 switch (type->base_type) {
510 case GLSL_TYPE_UINT:
511 case GLSL_TYPE_INT:
512 case GLSL_TYPE_FLOAT:
513 case GLSL_TYPE_BOOL:
514 return type->components();
515 case GLSL_TYPE_ARRAY:
516 return type_size(type->fields.array) * type->length;
517 case GLSL_TYPE_STRUCT:
518 size = 0;
519 for (i = 0; i < type->length; i++) {
520 size += type_size(type->fields.structure[i].type);
521 }
522 return size;
523 case GLSL_TYPE_SAMPLER:
524 /* Samplers take up no register space, since they're baked in at
525 * link time.
526 */
527 return 0;
528 case GLSL_TYPE_ATOMIC_UINT:
529 return 0;
530 case GLSL_TYPE_IMAGE:
531 case GLSL_TYPE_VOID:
532 case GLSL_TYPE_ERROR:
533 case GLSL_TYPE_INTERFACE:
534 assert(!"not reached");
535 break;
536 }
537
538 return 0;
539 }
540
541 fs_reg
542 fs_visitor::get_timestamp()
543 {
544 assert(brw->gen >= 7);
545
546 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
547 BRW_ARF_TIMESTAMP,
548 0),
549 BRW_REGISTER_TYPE_UD));
550
551 fs_reg dst = fs_reg(this, glsl_type::uint_type);
552
553 fs_inst *mov = emit(MOV(dst, ts));
554 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
555 * even if it's not enabled in the dispatch.
556 */
557 mov->force_writemask_all = true;
558 mov->force_uncompressed = true;
559
560 /* The caller wants the low 32 bits of the timestamp. Since it's running
561 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
562 * which is plenty of time for our purposes. It is identical across the
563 * EUs, but since it's tracking GPU core speed it will increment at a
564 * varying rate as render P-states change.
565 *
566 * The caller could also check if render P-states have changed (or anything
567 * else that might disrupt timing) by setting smear to 2 and checking if
568 * that field is != 0.
569 */
570 dst.set_smear(0);
571
572 return dst;
573 }
574
575 void
576 fs_visitor::emit_shader_time_begin()
577 {
578 current_annotation = "shader time start";
579 shader_start_time = get_timestamp();
580 }
581
582 void
583 fs_visitor::emit_shader_time_end()
584 {
585 current_annotation = "shader time end";
586
587 enum shader_time_shader_type type, written_type, reset_type;
588 if (dispatch_width == 8) {
589 type = ST_FS8;
590 written_type = ST_FS8_WRITTEN;
591 reset_type = ST_FS8_RESET;
592 } else {
593 assert(dispatch_width == 16);
594 type = ST_FS16;
595 written_type = ST_FS16_WRITTEN;
596 reset_type = ST_FS16_RESET;
597 }
598
599 fs_reg shader_end_time = get_timestamp();
600
601 /* Check that there weren't any timestamp reset events (assuming these
602 * were the only two timestamp reads that happened).
603 */
604 fs_reg reset = shader_end_time;
605 reset.set_smear(2);
606 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
607 test->conditional_mod = BRW_CONDITIONAL_Z;
608 emit(IF(BRW_PREDICATE_NORMAL));
609
610 push_force_uncompressed();
611 fs_reg start = shader_start_time;
612 start.negate = true;
613 fs_reg diff = fs_reg(this, glsl_type::uint_type);
614 emit(ADD(diff, start, shader_end_time));
615
616 /* If there were no instructions between the two timestamp gets, the diff
617 * is 2 cycles. Remove that overhead, so I can forget about that when
618 * trying to determine the time taken for single instructions.
619 */
620 emit(ADD(diff, diff, fs_reg(-2u)));
621
622 emit_shader_time_write(type, diff);
623 emit_shader_time_write(written_type, fs_reg(1u));
624 emit(BRW_OPCODE_ELSE);
625 emit_shader_time_write(reset_type, fs_reg(1u));
626 emit(BRW_OPCODE_ENDIF);
627
628 pop_force_uncompressed();
629 }
630
631 void
632 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
633 fs_reg value)
634 {
635 int shader_time_index =
636 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
637 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
638
639 fs_reg payload;
640 if (dispatch_width == 8)
641 payload = fs_reg(this, glsl_type::uvec2_type);
642 else
643 payload = fs_reg(this, glsl_type::uint_type);
644
645 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
646 fs_reg(), payload, offset, value));
647 }
648
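/**
 * Marks the compile as failed and records a printf-style message describing
 * why; the message is also printed when the DEBUG_WM flag is set.
 */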
649 void
650 fs_visitor::fail(const char *format, ...)
651 {
652 va_list va;
653 char *msg;
654
655 if (failed)
656 return;
657
658 failed = true;
659
660 va_start(va, format);
661 msg = ralloc_vasprintf(mem_ctx, format, va);
662 va_end(va);
663 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
664
665 this->fail_msg = msg;
666
667 if (INTEL_DEBUG & DEBUG_WM) {
668 fprintf(stderr, "%s", msg);
669 }
670 }
671
672 fs_inst *
673 fs_visitor::emit(enum opcode opcode)
674 {
675 return emit(new(mem_ctx) fs_inst(opcode));
676 }
677
678 fs_inst *
679 fs_visitor::emit(enum opcode opcode, fs_reg dst)
680 {
681 return emit(new(mem_ctx) fs_inst(opcode, dst));
682 }
683
684 fs_inst *
685 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
686 {
687 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
688 }
689
690 fs_inst *
691 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
692 {
693 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
694 }
695
696 fs_inst *
697 fs_visitor::emit(enum opcode opcode, fs_reg dst,
698 fs_reg src0, fs_reg src1, fs_reg src2)
699 {
700 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
701 }
702
703 void
704 fs_visitor::push_force_uncompressed()
705 {
706 force_uncompressed_stack++;
707 }
708
709 void
710 fs_visitor::pop_force_uncompressed()
711 {
712 force_uncompressed_stack--;
713 assert(force_uncompressed_stack >= 0);
714 }
715
716 /**
717 * Returns true if the instruction has a flag that means it won't
718 * update an entire destination register.
719 *
720 * For example, dead code elimination and live variable analysis want to know
721 * when a write to a variable screens off any preceding values that were in
722 * it.
723 */
724 bool
725 fs_inst::is_partial_write()
726 {
727 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
728 this->force_uncompressed ||
729 this->force_sechalf || !this->dst.is_contiguous());
730 }
731
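/**
 * Returns how many virtual GRF registers are read through source \p arg.
 * Texture messages sourcing their payload from the GRF read the whole
 * payload through src[0]; everything else reads a single register.
 */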
732 int
733 fs_inst::regs_read(fs_visitor *v, int arg)
734 {
735 if (is_tex() && arg == 0 && src[0].file == GRF) {
736 if (v->dispatch_width == 16)
737 return (mlen + 1) / 2;
738 else
739 return mlen;
740 }
741 return 1;
742 }
743
744 bool
745 fs_inst::reads_flag()
746 {
747 return predicate;
748 }
749
750 bool
751 fs_inst::writes_flag()
752 {
753 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
754 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
755 }
756
757 /**
758 * Returns how many MRFs an FS opcode will write over.
759 *
760 * Note that this is not the 0 or 1 implied writes in an actual gen
761 * instruction -- the FS opcodes often generate MOVs in addition.
762 */
763 int
764 fs_visitor::implied_mrf_writes(fs_inst *inst)
765 {
766 if (inst->mlen == 0)
767 return 0;
768
769 if (inst->base_mrf == -1)
770 return 0;
771
772 switch (inst->opcode) {
773 case SHADER_OPCODE_RCP:
774 case SHADER_OPCODE_RSQ:
775 case SHADER_OPCODE_SQRT:
776 case SHADER_OPCODE_EXP2:
777 case SHADER_OPCODE_LOG2:
778 case SHADER_OPCODE_SIN:
779 case SHADER_OPCODE_COS:
780 return 1 * dispatch_width / 8;
781 case SHADER_OPCODE_POW:
782 case SHADER_OPCODE_INT_QUOTIENT:
783 case SHADER_OPCODE_INT_REMAINDER:
784 return 2 * dispatch_width / 8;
785 case SHADER_OPCODE_TEX:
786 case FS_OPCODE_TXB:
787 case SHADER_OPCODE_TXD:
788 case SHADER_OPCODE_TXF:
789 case SHADER_OPCODE_TXF_CMS:
790 case SHADER_OPCODE_TXF_MCS:
791 case SHADER_OPCODE_TG4:
792 case SHADER_OPCODE_TG4_OFFSET:
793 case SHADER_OPCODE_TXL:
794 case SHADER_OPCODE_TXS:
795 case SHADER_OPCODE_LOD:
796 return 1;
797 case FS_OPCODE_FB_WRITE:
798 return 2;
799 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
800 case SHADER_OPCODE_GEN4_SCRATCH_READ:
801 return 1;
802 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
803 return inst->mlen;
804 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
805 return 2;
806 case SHADER_OPCODE_UNTYPED_ATOMIC:
807 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
808 return 0;
809 default:
810 assert(!"not reached");
811 return inst->mlen;
812 }
813 }
814
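/**
 * Allocates a new virtual GRF of \p size registers, growing the
 * virtual_grf_sizes[] array as needed, and returns its index.
 */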
815 int
816 fs_visitor::virtual_grf_alloc(int size)
817 {
818 if (virtual_grf_array_size <= virtual_grf_count) {
819 if (virtual_grf_array_size == 0)
820 virtual_grf_array_size = 16;
821 else
822 virtual_grf_array_size *= 2;
823 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
824 virtual_grf_array_size);
825 }
826 virtual_grf_sizes[virtual_grf_count] = size;
827 return virtual_grf_count++;
828 }
829
830 /** Register file and register number constructor (defaults to float type). */
831 fs_reg::fs_reg(enum register_file file, int reg)
832 {
833 init();
834 this->file = file;
835 this->reg = reg;
836 this->type = BRW_REGISTER_TYPE_F;
837 }
838
839 /** Register file, register number and type constructor. */
840 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
841 {
842 init();
843 this->file = file;
844 this->reg = reg;
845 this->type = type;
846 }
847
848 /** Automatic reg constructor. */
849 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
850 {
851 init();
852
853 this->file = GRF;
854 this->reg = v->virtual_grf_alloc(v->type_size(type));
855 this->reg_offset = 0;
856 this->type = brw_type_for_base_type(type);
857 }
858
859 fs_reg *
860 fs_visitor::variable_storage(ir_variable *var)
861 {
862 return (fs_reg *)hash_table_find(this->variable_ht, var);
863 }
864
865 void
866 import_uniforms_callback(const void *key,
867 void *data,
868 void *closure)
869 {
870 struct hash_table *dst_ht = (struct hash_table *)closure;
871 const fs_reg *reg = (const fs_reg *)data;
872
873 if (reg->file != UNIFORM)
874 return;
875
876 hash_table_insert(dst_ht, data, key);
877 }
878
879 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
880 * This brings in those uniform definitions.
881 */
882 void
883 fs_visitor::import_uniforms(fs_visitor *v)
884 {
885 hash_table_call_foreach(v->variable_ht,
886 import_uniforms_callback,
887 variable_ht);
888 this->push_constant_loc = v->push_constant_loc;
889 this->uniforms = v->uniforms;
890 }
891
892 /* Our support for uniforms is piggy-backed on the struct
893 * gl_fragment_program, because that's where the values actually
894 * get stored, rather than in some global gl_shader_program uniform
895 * store.
896 */
897 void
898 fs_visitor::setup_uniform_values(ir_variable *ir)
899 {
900 int namelen = strlen(ir->name);
901
902 /* The data for our (non-builtin) uniforms is stored in a series of
903 * gl_uniform_driver_storage structs for each subcomponent that
904 * glGetUniformLocation() could name. We know it's been set up in the same
905 * order we'd walk the type, so walk the list of storage and find anything
906 * with our name, or the prefix of a component that starts with our name.
907 */
908 unsigned params_before = uniforms;
909 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
910 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
911
912 if (strncmp(ir->name, storage->name, namelen) != 0 ||
913 (storage->name[namelen] != 0 &&
914 storage->name[namelen] != '.' &&
915 storage->name[namelen] != '[')) {
916 continue;
917 }
918
919 unsigned slots = storage->type->component_slots();
920 if (storage->array_elements)
921 slots *= storage->array_elements;
922
923 for (unsigned i = 0; i < slots; i++) {
924 stage_prog_data->param[uniforms++] = &storage->storage[i].f;
925 }
926 }
927
928 /* Make sure we actually initialized the right amount of stuff here. */
929 assert(params_before + ir->type->component_slots() == uniforms);
930 (void)params_before;
931 }
932
933
934 /* Our support for builtin uniforms is even scarier than non-builtin.
935 * It sits on top of the PROG_STATE_VAR parameters that are
936 * automatically updated from GL context state.
937 */
938 void
939 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
940 {
941 const ir_state_slot *const slots = ir->state_slots;
942 assert(ir->state_slots != NULL);
943
944 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
945 /* This state reference has already been setup by ir_to_mesa, but we'll
946 * get the same index back here.
947 */
948 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
949 (gl_state_index *)slots[i].tokens);
950
951 /* Add each of the unique swizzles of the element as a parameter.
952 * This'll end up matching the expected layout of the
953 * array/matrix/structure we're trying to fill in.
954 */
955 int last_swiz = -1;
956 for (unsigned int j = 0; j < 4; j++) {
957 int swiz = GET_SWZ(slots[i].swizzle, j);
958 if (swiz == last_swiz)
959 break;
960 last_swiz = swiz;
961
962 stage_prog_data->param[uniforms++] =
963 &fp->Base.Parameters->ParameterValues[index][swiz].f;
964 }
965 }
966 }
967
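/**
 * Sets up gl_FragCoord, honoring the shader's pixel-center and origin
 * conventions and the render-to-FBO state for the Y coordinate.
 */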
968 fs_reg *
969 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
970 {
971 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
972 fs_reg wpos = *reg;
973 bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
974
975 /* gl_FragCoord.x */
976 if (ir->data.pixel_center_integer) {
977 emit(MOV(wpos, this->pixel_x));
978 } else {
979 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
980 }
981 wpos.reg_offset++;
982
983 /* gl_FragCoord.y */
984 if (!flip && ir->data.pixel_center_integer) {
985 emit(MOV(wpos, this->pixel_y));
986 } else {
987 fs_reg pixel_y = this->pixel_y;
988 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
989
990 if (flip) {
991 pixel_y.negate = true;
992 offset += c->key.drawable_height - 1.0;
993 }
994
995 emit(ADD(wpos, pixel_y, fs_reg(offset)));
996 }
997 wpos.reg_offset++;
998
999 /* gl_FragCoord.z */
1000 if (brw->gen >= 6) {
1001 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
1002 } else {
1003 emit(FS_OPCODE_LINTERP, wpos,
1004 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1005 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1006 interp_reg(VARYING_SLOT_POS, 2));
1007 }
1008 wpos.reg_offset++;
1009
1010 /* gl_FragCoord.w: Already set up in emit_interpolation */
1011 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1012
1013 return reg;
1014 }
1015
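/**
 * Emits a LINTERP for \p attr using the barycentric coordinates that match
 * the interpolation qualifier and the centroid/sample request.
 */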
1016 fs_inst *
1017 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1018 glsl_interp_qualifier interpolation_mode,
1019 bool is_centroid, bool is_sample)
1020 {
1021 brw_wm_barycentric_interp_mode barycoord_mode;
1022 if (brw->gen >= 6) {
1023 if (is_centroid) {
1024 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1025 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1026 else
1027 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1028 } else if (is_sample) {
1029 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1030 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1031 else
1032 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1033 } else {
1034 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1035 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1036 else
1037 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1038 }
1039 } else {
1040 /* On Ironlake and below, there is only one interpolation mode.
1041 * Centroid interpolation doesn't mean anything on this hardware --
1042 * there is no multisampling.
1043 */
1044 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1045 }
1046 return emit(FS_OPCODE_LINTERP, attr,
1047 this->delta_x[barycoord_mode],
1048 this->delta_y[barycoord_mode], interp);
1049 }
1050
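/**
 * Emits interpolation (or flat-shaded constant moves) for a generic varying
 * input, walking arrays and matrix columns one vector at a time.
 */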
1051 fs_reg *
1052 fs_visitor::emit_general_interpolation(ir_variable *ir)
1053 {
1054 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1055 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1056 fs_reg attr = *reg;
1057
1058 unsigned int array_elements;
1059 const glsl_type *type;
1060
1061 if (ir->type->is_array()) {
1062 array_elements = ir->type->length;
1063 if (array_elements == 0) {
1064 fail("dereferenced array '%s' has length 0\n", ir->name);
1065 }
1066 type = ir->type->fields.array;
1067 } else {
1068 array_elements = 1;
1069 type = ir->type;
1070 }
1071
1072 glsl_interp_qualifier interpolation_mode =
1073 ir->determine_interpolation_mode(c->key.flat_shade);
1074
1075 int location = ir->data.location;
1076 for (unsigned int i = 0; i < array_elements; i++) {
1077 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1078 if (c->prog_data.urb_setup[location] == -1) {
1079 /* If there's no incoming setup data for this slot, don't
1080 * emit interpolation for it.
1081 */
1082 attr.reg_offset += type->vector_elements;
1083 location++;
1084 continue;
1085 }
1086
1087 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1088 /* Constant interpolation (flat shading) case. The SF has
1089 * handed us defined values in only the constant offset
1090 * field of the setup reg.
1091 */
1092 for (unsigned int k = 0; k < type->vector_elements; k++) {
1093 struct brw_reg interp = interp_reg(location, k);
1094 interp = suboffset(interp, 3);
1095 interp.type = reg->type;
1096 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1097 attr.reg_offset++;
1098 }
1099 } else {
1100 /* Smooth/noperspective interpolation case. */
1101 for (unsigned int k = 0; k < type->vector_elements; k++) {
1102 struct brw_reg interp = interp_reg(location, k);
1103 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1104 ir->data.centroid && !c->key.persample_shading,
1105 ir->data.sample || c->key.persample_shading);
1106 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1107 /* Get the pixel/sample mask into f0 so that we know
1108 * which pixels are lit. Then, for each channel that is
1109 * unlit, replace the centroid data with non-centroid
1110 * data.
1111 */
1112 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1113 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1114 interpolation_mode,
1115 false, false);
1116 inst->predicate = BRW_PREDICATE_NORMAL;
1117 inst->predicate_inverse = true;
1118 }
1119 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1120 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1121 }
1122 attr.reg_offset++;
1123 }
1124
1125 }
1126 location++;
1127 }
1128 }
1129
1130 return reg;
1131 }
1132
1133 fs_reg *
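/**
 * Computes gl_FrontFacing from the "back face" bit delivered in the thread
 * payload (g0 on Gen6+, g1.6 on earlier hardware).
 */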
1134 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1135 {
1136 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1137
1138 /* The frontfacing comes in as a bit in the thread payload. */
1139 if (brw->gen >= 6) {
1140 emit(BRW_OPCODE_ASR, *reg,
1141 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1142 fs_reg(15));
1143 emit(BRW_OPCODE_NOT, *reg, *reg);
1144 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1145 } else {
1146 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1147 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1148 * us the front face.
1149 */
1150 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1151 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1152 }
1153
1154 return reg;
1155 }
1156
1157 void
1158 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1159 {
1160 assert(dst.type == BRW_REGISTER_TYPE_F);
1161
1162 if (c->key.compute_pos_offset) {
1163 /* Convert int_sample_pos to floating point */
1164 emit(MOV(dst, int_sample_pos));
1165 /* Scale to the range [0, 1] */
1166 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1167 }
1168 else {
1169 /* From ARB_sample_shading specification:
1170 * "When rendering to a non-multisample buffer, or if multisample
1171 * rasterization is disabled, gl_SamplePosition will always be
1172 * (0.5, 0.5)."
1173 */
1174 emit(MOV(dst, fs_reg(0.5f)));
1175 }
1176 }
1177
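/**
 * Builds gl_SamplePosition from the per-slot X/Y sample offsets delivered
 * as bytes in the thread payload.
 */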
1178 fs_reg *
1179 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1180 {
1181 assert(brw->gen >= 6);
1182 assert(ir->type == glsl_type::vec2_type);
1183
1184 this->current_annotation = "compute sample position";
1185 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1186 fs_reg pos = *reg;
1187 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1188 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1189
1190 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1191 * mode will be enabled.
1192 *
1193 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1194 * R31.1:0 Position Offset X/Y for Slot[3:0]
1195 * R31.3:2 Position Offset X/Y for Slot[7:4]
1196 * .....
1197 *
1198 * The X, Y sample positions come in as bytes in thread payload. So, read
1199 * the positions using vstride=16, width=8, hstride=2.
1200 */
1201 struct brw_reg sample_pos_reg =
1202 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1203 BRW_REGISTER_TYPE_B), 16, 8, 2);
1204
1205 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1206 if (dispatch_width == 16) {
1207 fs_inst *inst = emit(MOV(half(int_sample_x, 1),
1208 fs_reg(suboffset(sample_pos_reg, 16))));
1209 inst->force_sechalf = true;
1210 }
1211 /* Compute gl_SamplePosition.x */
1212 compute_sample_position(pos, int_sample_x);
1213 pos.reg_offset++;
1214 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1215 if (dispatch_width == 16) {
1216 fs_inst *inst = emit(MOV(half(int_sample_y, 1),
1217 fs_reg(suboffset(sample_pos_reg, 17))));
1218 inst->force_sechalf = true;
1219 }
1220 /* Compute gl_SamplePosition.y */
1221 compute_sample_position(pos, int_sample_y);
1222 return reg;
1223 }
1224
1225 fs_reg *
1226 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1227 {
1228 assert(brw->gen >= 6);
1229
1230 this->current_annotation = "compute sample id";
1231 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1232
1233 if (c->key.compute_sample_id) {
1234 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1235 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1236 t2.type = BRW_REGISTER_TYPE_UW;
1237
1238 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1239 * 8x multisampling, subspan 0 will represent sample N (where N
1240 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1241 * 7. We can find the value of N by looking at R0.0 bits 7:6
1242 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1243 * (since samples are always delivered in pairs). That is, we
1244 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1245 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1246 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1247 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1248 * populating a temporary variable with the sequence (0, 1, 2, 3),
1249 * and then reading from it using vstride=1, width=4, hstride=0.
1250 * These computations hold good for 4x multisampling as well.
1251 */
1252 emit(BRW_OPCODE_AND, t1,
1253 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1254 fs_reg(brw_imm_d(0xc0)));
1255 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1256 /* This works for both SIMD8 and SIMD16 */
1257 emit(MOV(t2, brw_imm_v(0x3210)));
1258 /* This special instruction takes care of setting vstride=1,
1259 * width=4, hstride=0 of t2 during an ADD instruction.
1260 */
1261 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1262 } else {
1263 /* As per GL_ARB_sample_shading specification:
1264 * "When rendering to a non-multisample buffer, or if multisample
1265 * rasterization is disabled, gl_SampleID will always be zero."
1266 */
1267 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1268 }
1269
1270 return reg;
1271 }
1272
1273 fs_reg *
1274 fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
1275 {
1276 assert(brw->gen >= 7);
1277 this->current_annotation = "compute gl_SampleMaskIn";
1278 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1279 emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
1280 return reg;
1281 }
1282
1283 fs_reg
1284 fs_visitor::fix_math_operand(fs_reg src)
1285 {
1286 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1287 * might be able to do better by doing execsize = 1 math and then
1288 * expanding that result out, but we would need to be careful with
1289 * masking.
1290 *
1291 * The hardware ignores source modifiers (negate and abs) on math
1292 * instructions, so we also move to a temp to set those up.
1293 */
1294 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1295 !src.abs && !src.negate)
1296 return src;
1297
1298 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1299 * operands to math
1300 */
1301 if (brw->gen >= 7 && src.file != IMM)
1302 return src;
1303
1304 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1305 expanded.type = src.type;
1306 emit(BRW_OPCODE_MOV, expanded, src);
1307 return expanded;
1308 }
1309
1310 fs_inst *
1311 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1312 {
1313 switch (opcode) {
1314 case SHADER_OPCODE_RCP:
1315 case SHADER_OPCODE_RSQ:
1316 case SHADER_OPCODE_SQRT:
1317 case SHADER_OPCODE_EXP2:
1318 case SHADER_OPCODE_LOG2:
1319 case SHADER_OPCODE_SIN:
1320 case SHADER_OPCODE_COS:
1321 break;
1322 default:
1323 assert(!"not reached: bad math opcode");
1324 return NULL;
1325 }
1326
1327 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1328 * might be able to do better by doing execsize = 1 math and then
1329 * expanding that result out, but we would need to be careful with
1330 * masking.
1331 *
1332 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1333 * instructions, so we also move to a temp to set those up.
1334 */
1335 if (brw->gen >= 6)
1336 src = fix_math_operand(src);
1337
1338 fs_inst *inst = emit(opcode, dst, src);
1339
1340 if (brw->gen < 6) {
1341 inst->base_mrf = 2;
1342 inst->mlen = dispatch_width / 8;
1343 }
1344
1345 return inst;
1346 }
1347
1348 fs_inst *
1349 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1350 {
1351 int base_mrf = 2;
1352 fs_inst *inst;
1353
1354 switch (opcode) {
1355 case SHADER_OPCODE_INT_QUOTIENT:
1356 case SHADER_OPCODE_INT_REMAINDER:
1357 if (brw->gen >= 7 && dispatch_width == 16)
1358 fail("SIMD16 INTDIV unsupported\n");
1359 break;
1360 case SHADER_OPCODE_POW:
1361 break;
1362 default:
1363 assert(!"not reached: unsupported binary math opcode.");
1364 return NULL;
1365 }
1366
1367 if (brw->gen >= 6) {
1368 src0 = fix_math_operand(src0);
1369 src1 = fix_math_operand(src1);
1370
1371 inst = emit(opcode, dst, src0, src1);
1372 } else {
1373 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1374 * "Message Payload":
1375 *
1376 * "Operand0[7]. For the INT DIV functions, this operand is the
1377 * denominator."
1378 * ...
1379 * "Operand1[7]. For the INT DIV functions, this operand is the
1380 * numerator."
1381 */
1382 bool is_int_div = opcode != SHADER_OPCODE_POW;
1383 fs_reg &op0 = is_int_div ? src1 : src0;
1384 fs_reg &op1 = is_int_div ? src0 : src1;
1385
1386 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1387 inst = emit(opcode, dst, op0, reg_null_f);
1388
1389 inst->base_mrf = base_mrf;
1390 inst->mlen = 2 * dispatch_width / 8;
1391 }
1392 return inst;
1393 }
1394
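/**
 * Maps UNIFORM file registers to the fixed CURBE (push constant) GRFs that
 * follow the thread payload, using the locations chosen in
 * assign_constant_locations().
 */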
1395 void
1396 fs_visitor::assign_curb_setup()
1397 {
1398 if (dispatch_width == 8) {
1399 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1400 } else {
1401 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1402 }
1403
1404 c->prog_data.curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1405
1406 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1407 foreach_list(node, &this->instructions) {
1408 fs_inst *inst = (fs_inst *)node;
1409
1410 for (unsigned int i = 0; i < 3; i++) {
1411 if (inst->src[i].file == UNIFORM) {
1412 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1413 int constant_nr;
1414 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1415 constant_nr = push_constant_loc[uniform_nr];
1416 } else {
1417 /* Section 5.11 of the OpenGL 4.1 spec says:
1418 * "Out-of-bounds reads return undefined values, which include
1419 * values from other variables of the active program or zero."
1420 * Just return the first push constant.
1421 */
1422 constant_nr = 0;
1423 }
1424
1425 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1426 constant_nr / 8,
1427 constant_nr % 8);
1428
1429 inst->src[i].file = HW_REG;
1430 inst->src[i].fixed_hw_reg = byte_offset(
1431 retype(brw_reg, inst->src[i].type),
1432 inst->src[i].subreg_offset);
1433 }
1434 }
1435 }
1436 }
1437
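/**
 * Decides which URB slot each incoming varying lands in, filling
 * c->prog_data.urb_setup[] and counting the varyings the shader actually
 * uses.
 */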
1438 void
1439 fs_visitor::calculate_urb_setup()
1440 {
1441 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1442 c->prog_data.urb_setup[i] = -1;
1443 }
1444
1445 int urb_next = 0;
1446 /* Figure out where each of the incoming setup attributes lands. */
1447 if (brw->gen >= 6) {
1448 if (_mesa_bitcount_64(fp->Base.InputsRead &
1449 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1450 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1451 * first 16 varying inputs, so we can put them wherever we want.
1452 * Just put them in order.
1453 *
1454 * This is useful because it means that (a) inputs not used by the
1455 * fragment shader won't take up valuable register space, and (b) we
1456 * won't have to recompile the fragment shader if it gets paired with
1457 * a different vertex (or geometry) shader.
1458 */
1459 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1460 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1461 BITFIELD64_BIT(i)) {
1462 c->prog_data.urb_setup[i] = urb_next++;
1463 }
1464 }
1465 } else {
1466 /* We have enough input varyings that the SF/SBE pipeline stage can't
1467 * arbitrarily rearrange them to suit our whim; we have to put them
1468 * in an order that matches the output of the previous pipeline stage
1469 * (geometry or vertex shader).
1470 */
1471 struct brw_vue_map prev_stage_vue_map;
1472 brw_compute_vue_map(brw, &prev_stage_vue_map,
1473 c->key.input_slots_valid);
1474 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1475 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1476 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1477 slot++) {
1478 int varying = prev_stage_vue_map.slot_to_varying[slot];
1479 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1480 * unused.
1481 */
1482 if (varying != BRW_VARYING_SLOT_COUNT &&
1483 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1484 BITFIELD64_BIT(varying))) {
1485 c->prog_data.urb_setup[varying] = slot - first_slot;
1486 }
1487 }
1488 urb_next = prev_stage_vue_map.num_slots - first_slot;
1489 }
1490 } else {
1491 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1492 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1493 /* Point size is packed into the header, not as a general attribute */
1494 if (i == VARYING_SLOT_PSIZ)
1495 continue;
1496
1497 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1498 /* The back color slot is skipped when the front color is
1499 * also written to. In addition, some slots can be
1500 * written in the vertex shader and not read in the
1501 * fragment shader. So the register number must always be
1502 * incremented, mapped or not.
1503 */
1504 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1505 c->prog_data.urb_setup[i] = urb_next;
1506 urb_next++;
1507 }
1508 }
1509
1510 /*
1511 * It's an FS-only attribute, and we did the interpolation for this attribute
1512 * in the SF thread. So, count it here, too.
1513 *
1514 * See compile_sf_prog() for more info.
1515 */
1516 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1517 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1518 }
1519
1520 c->prog_data.num_varying_inputs = urb_next;
1521 }
1522
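/**
 * Rewrites LINTERP/CINTERP sources to the hardware registers where the
 * setup data actually lands, now that the push constant size is known.
 */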
1523 void
1524 fs_visitor::assign_urb_setup()
1525 {
1526 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1527
1528 /* Offset all the urb_setup[] indices by the actual position of the
1529 * setup regs, now that the location of the constants has been chosen.
1530 */
1531 foreach_list(node, &this->instructions) {
1532 fs_inst *inst = (fs_inst *)node;
1533
1534 if (inst->opcode == FS_OPCODE_LINTERP) {
1535 assert(inst->src[2].file == HW_REG);
1536 inst->src[2].fixed_hw_reg.nr += urb_start;
1537 }
1538
1539 if (inst->opcode == FS_OPCODE_CINTERP) {
1540 assert(inst->src[0].file == HW_REG);
1541 inst->src[0].fixed_hw_reg.nr += urb_start;
1542 }
1543 }
1544
1545 /* Each attribute is 4 setup channels, each of which is half a reg. */
1546 this->first_non_payload_grf =
1547 urb_start + c->prog_data.num_varying_inputs * 2;
1548 }
1549
1550 /**
1551 * Split large virtual GRFs into separate components if we can.
1552 *
1553 * This is mostly duplicated with what brw_fs_vector_splitting does,
1554 * but that's really conservative because it's afraid of doing
1555 * splitting that doesn't result in real progress after the rest of
1556 * the optimization phases, which would cause infinite looping in
1557 * optimization. We can do it once here, safely. This also has the
1558 * opportunity to split interpolated values, or maybe even uniforms,
1559 * which we don't have at the IR level.
1560 *
1561 * We want to split, because virtual GRFs are what we register
1562 * allocate and spill (due to contiguousness requirements for some
1563 * instructions), and they're what we naturally generate in the
1564 * codegen process, but most virtual GRFs don't actually need to be
1565 * contiguous sets of GRFs. If we split, we'll end up with reduced
1566 * live intervals and better dead code elimination and coalescing.
1567 */
1568 void
1569 fs_visitor::split_virtual_grfs()
1570 {
1571 int num_vars = this->virtual_grf_count;
1572 bool split_grf[num_vars];
1573 int new_virtual_grf[num_vars];
1574
1575 /* Try to split anything > 0 sized. */
1576 for (int i = 0; i < num_vars; i++) {
1577 if (this->virtual_grf_sizes[i] != 1)
1578 split_grf[i] = true;
1579 else
1580 split_grf[i] = false;
1581 }
1582
1583 if (brw->has_pln &&
1584 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1585 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1586 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1587 * Gen6, that was the only supported interpolation mode, and since Gen6,
1588 * delta_x and delta_y are in fixed hardware registers.
1589 */
1590 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1591 false;
1592 }
1593
1594 foreach_list(node, &this->instructions) {
1595 fs_inst *inst = (fs_inst *)node;
1596
1597 /* If there's a SEND message that requires contiguous destination
1598 * registers, no splitting is allowed.
1599 */
1600 if (inst->regs_written > 1) {
1601 split_grf[inst->dst.reg] = false;
1602 }
1603
1604 /* If we're sending from a GRF, don't split it, on the assumption that
1605 * the send is reading the whole thing.
1606 */
1607 if (inst->is_send_from_grf()) {
1608 for (int i = 0; i < 3; i++) {
1609 if (inst->src[i].file == GRF) {
1610 split_grf[inst->src[i].reg] = false;
1611 }
1612 }
1613 }
1614 }
1615
1616 /* Allocate new space for split regs. Note that the virtual
1617 * numbers will be contiguous.
1618 */
1619 for (int i = 0; i < num_vars; i++) {
1620 if (split_grf[i]) {
1621 new_virtual_grf[i] = virtual_grf_alloc(1);
1622 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1623 int reg = virtual_grf_alloc(1);
1624 assert(reg == new_virtual_grf[i] + j - 1);
1625 (void) reg;
1626 }
1627 this->virtual_grf_sizes[i] = 1;
1628 }
1629 }
1630
1631 foreach_list(node, &this->instructions) {
1632 fs_inst *inst = (fs_inst *)node;
1633
1634 if (inst->dst.file == GRF &&
1635 split_grf[inst->dst.reg] &&
1636 inst->dst.reg_offset != 0) {
1637 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1638 inst->dst.reg_offset - 1);
1639 inst->dst.reg_offset = 0;
1640 }
1641 for (int i = 0; i < 3; i++) {
1642 if (inst->src[i].file == GRF &&
1643 split_grf[inst->src[i].reg] &&
1644 inst->src[i].reg_offset != 0) {
1645 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1646 inst->src[i].reg_offset - 1);
1647 inst->src[i].reg_offset = 0;
1648 }
1649 }
1650 }
1651 invalidate_live_intervals();
1652 }
1653
1654 /**
1655 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1656 *
1657 * During code generation, we create tons of temporary variables, many of
1658 * which get immediately killed and are never used again. Yet, in later
1659 * optimization and analysis passes, such as compute_live_intervals, we need
1660 * to loop over all the virtual GRFs. Compacting them can save a lot of
1661 * overhead.
1662 */
1663 void
1664 fs_visitor::compact_virtual_grfs()
1665 {
1666 /* Mark which virtual GRFs are used, and count how many. */
1667 int remap_table[this->virtual_grf_count];
1668 memset(remap_table, -1, sizeof(remap_table));
1669
1670 foreach_list(node, &this->instructions) {
1671 const fs_inst *inst = (const fs_inst *) node;
1672
1673 if (inst->dst.file == GRF)
1674 remap_table[inst->dst.reg] = 0;
1675
1676 for (int i = 0; i < 3; i++) {
1677 if (inst->src[i].file == GRF)
1678 remap_table[inst->src[i].reg] = 0;
1679 }
1680 }
1681
1682 /* In addition to registers used in instructions, fs_visitor keeps
1683 * direct references to certain special values which must be patched:
1684 */
1685 fs_reg *special[] = {
1686 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1687 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1688 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1689 &delta_x[0], &delta_x[1], &delta_x[2],
1690 &delta_x[3], &delta_x[4], &delta_x[5],
1691 &delta_y[0], &delta_y[1], &delta_y[2],
1692 &delta_y[3], &delta_y[4], &delta_y[5],
1693 };
1694 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1695 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1696
1697 /* Treat all special values as used, to be conservative */
1698 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1699 if (special[i]->file == GRF)
1700 remap_table[special[i]->reg] = 0;
1701 }
1702
1703 /* Compact the GRF arrays. */
1704 int new_index = 0;
1705 for (int i = 0; i < this->virtual_grf_count; i++) {
1706 if (remap_table[i] != -1) {
1707 remap_table[i] = new_index;
1708 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1709 invalidate_live_intervals();
1710 ++new_index;
1711 }
1712 }
1713
1714 this->virtual_grf_count = new_index;
1715
1716 /* Patch all the instructions to use the newly renumbered registers */
1717 foreach_list(node, &this->instructions) {
1718 fs_inst *inst = (fs_inst *) node;
1719
1720 if (inst->dst.file == GRF)
1721 inst->dst.reg = remap_table[inst->dst.reg];
1722
1723 for (int i = 0; i < 3; i++) {
1724 if (inst->src[i].file == GRF)
1725 inst->src[i].reg = remap_table[inst->src[i].reg];
1726 }
1727 }
1728
1729 /* Patch all the references to special values */
1730 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1731 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1732 special[i]->reg = remap_table[special[i]->reg];
1733 }
1734 }
1735
1736 /*
1737 * Implements array access of uniforms by inserting a
1738 * PULL_CONSTANT_LOAD instruction.
1739 *
1740 * Unlike temporary GRF array access (where we don't support it due to
1741 * the difficulty of doing relative addressing on instruction
1742 * destinations), we could potentially do array access of uniforms
1743 * that were loaded in GRF space as push constants. In real-world
1744 * usage we've seen, though, the arrays being used are always larger
1745 * than we could load as push constants, so just always move all
1746 * uniform array access out to a pull constant buffer.
1747 */
1748 void
1749 fs_visitor::move_uniform_array_access_to_pull_constants()
1750 {
1751 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1752
1753 for (unsigned int i = 0; i < uniforms; i++) {
1754 pull_constant_loc[i] = -1;
1755 }
1756
1757 /* Walk through and find array access of uniforms. Put a copy of that
1758 * uniform in the pull constant buffer.
1759 *
1760 * Note that we don't move constant-indexed accesses to arrays. No
1761 * testing has been done of the performance impact of this choice.
1762 */
1763 foreach_list_safe(node, &this->instructions) {
1764 fs_inst *inst = (fs_inst *)node;
1765
1766 for (int i = 0 ; i < 3; i++) {
1767 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1768 continue;
1769
1770 int uniform = inst->src[i].reg;
1771
1772 /* If this array isn't already present in the pull constant buffer,
1773 * add it.
1774 */
1775 if (pull_constant_loc[uniform] == -1) {
1776 const float **values = &stage_prog_data->param[uniform];
1777
1778 assert(param_size[uniform]);
1779
1780 for (int j = 0; j < param_size[uniform]; j++) {
1781 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1782
1783 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1784 values[j];
1785 }
1786 }
1787 }
1788 }
1789 }
1790
1791 /**
1792 * Assign UNIFORM file registers to either push constants or pull constants.
1793 *
1794 * We allow a fragment shader to have more than the specified minimum
1795 * maximum number of fragment shader uniform components (64). If
1796 * there are too many of these, they'd fill up all of the register space.
1797 * So, this will push some of them out to the pull constant buffer and
1798 * update the program to load them.
1799 */
1800 void
1801 fs_visitor::assign_constant_locations()
1802 {
1803 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1804 if (dispatch_width != 8)
1805 return;
1806
1807 /* Find which UNIFORM registers are still in use. */
1808 bool is_live[uniforms];
1809 for (unsigned int i = 0; i < uniforms; i++) {
1810 is_live[i] = false;
1811 }
1812
1813 foreach_list(node, &this->instructions) {
1814 fs_inst *inst = (fs_inst *) node;
1815
1816 for (int i = 0; i < 3; i++) {
1817 if (inst->src[i].file != UNIFORM)
1818 continue;
1819
1820 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1821 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1822 is_live[constant_nr] = true;
1823 }
1824 }
1825
1826 /* Only allow 16 registers (128 uniform components) as push constants.
1827 *
1828 * Just demote the end of the list. We could probably do better
1829 * here, demoting things that are rarely used in the program first.
1830 */
1831 unsigned int max_push_components = 16 * 8;
1832 unsigned int num_push_constants = 0;
1833
1834 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1835
1836 for (unsigned int i = 0; i < uniforms; i++) {
1837 if (!is_live[i] || pull_constant_loc[i] != -1) {
1838 /* This UNIFORM register is either dead, or has already been demoted
1839 * to a pull const. Mark it as no longer living in the param[] array.
1840 */
1841 push_constant_loc[i] = -1;
1842 continue;
1843 }
1844
1845 if (num_push_constants < max_push_components) {
1846 /* Retain as a push constant. Record the location in the params[]
1847 * array.
1848 */
1849 push_constant_loc[i] = num_push_constants++;
1850 } else {
1851 /* Demote to a pull constant. */
1852 push_constant_loc[i] = -1;
1853
1854 int pull_index = stage_prog_data->nr_pull_params++;
1855 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1856 pull_constant_loc[i] = pull_index;
1857 }
1858 }
1859
1860 stage_prog_data->nr_params = num_push_constants;
1861
1862 /* Up until now, the param[] array has been indexed by reg + reg_offset
1863 * of UNIFORM registers. Condense it to only contain the uniforms we
1864 * chose to upload as push constants.
1865 */
1866 for (unsigned int i = 0; i < uniforms; i++) {
1867 int remapped = push_constant_loc[i];
1868
1869 if (remapped == -1)
1870 continue;
1871
1872 assert(remapped <= i);
1873 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1874 }
1875 }
1876
1877 /**
1878 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1879 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1880 */
1881 void
1882 fs_visitor::demote_pull_constants()
1883 {
1884 foreach_list(node, &this->instructions) {
1885 fs_inst *inst = (fs_inst *)node;
1886
1887 for (int i = 0; i < 3; i++) {
1888 if (inst->src[i].file != UNIFORM)
1889 continue;
1890
1891 int pull_index = pull_constant_loc[inst->src[i].reg +
1892 inst->src[i].reg_offset];
1893 if (pull_index == -1)
1894 continue;
1895
1896 /* Set up the annotation tracking for new generated instructions. */
1897 base_ir = inst->ir;
1898 current_annotation = inst->annotation;
1899
1900 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1901 fs_reg dst = fs_reg(this, glsl_type::float_type);
1902
1903 /* Generate a pull load into dst. */
1904 if (inst->src[i].reladdr) {
1905 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
1906 surf_index,
1907 *inst->src[i].reladdr,
1908 pull_index);
1909 inst->insert_before(&list);
1910 inst->src[i].reladdr = NULL;
1911 } else {
1912 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1913 fs_inst *pull =
1914 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1915 dst, surf_index, offset);
1916 inst->insert_before(pull);
1917 inst->src[i].set_smear(pull_index & 3);
1918 }
1919
1920 /* Rewrite the instruction to use the temporary VGRF. */
1921 inst->src[i].file = GRF;
1922 inst->src[i].reg = dst.reg;
1923 inst->src[i].reg_offset = 0;
1924 }
1925 }
1926 invalidate_live_intervals();
1927 }
1928
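/**
 * Applies simple local algebraic simplifications, rewriting instructions in
 * place.  For example (illustrative):
 *
 *    mul vgrf2:F, vgrf1:F, 1.0f   ->  mov vgrf2:F, vgrf1:F
 *    add vgrf2:F, vgrf1:F, 0.0f   ->  mov vgrf2:F, vgrf1:F
 */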
1929 bool
1930 fs_visitor::opt_algebraic()
1931 {
1932 bool progress = false;
1933
1934 foreach_list(node, &this->instructions) {
1935 fs_inst *inst = (fs_inst *)node;
1936
1937 switch (inst->opcode) {
1938 case BRW_OPCODE_MUL:
1939 if (inst->src[1].file != IMM)
1940 continue;
1941
1942 /* a * 1.0 = a */
1943 if (inst->src[1].is_one()) {
1944 inst->opcode = BRW_OPCODE_MOV;
1945 inst->src[1] = reg_undef;
1946 progress = true;
1947 break;
1948 }
1949
1950 /* a * 0.0 = 0.0 */
1951 if (inst->src[1].is_zero()) {
1952 inst->opcode = BRW_OPCODE_MOV;
1953 inst->src[0] = inst->src[1];
1954 inst->src[1] = reg_undef;
1955 progress = true;
1956 break;
1957 }
1958
1959 break;
1960 case BRW_OPCODE_ADD:
1961 if (inst->src[1].file != IMM)
1962 continue;
1963
1964 /* a + 0.0 = a */
1965 if (inst->src[1].is_zero()) {
1966 inst->opcode = BRW_OPCODE_MOV;
1967 inst->src[1] = reg_undef;
1968 progress = true;
1969 break;
1970 }
1971 break;
1972 case BRW_OPCODE_OR:
1973 if (inst->src[0].equals(inst->src[1])) {
1974 inst->opcode = BRW_OPCODE_MOV;
1975 inst->src[1] = reg_undef;
1976 progress = true;
1977 break;
1978 }
1979 break;
1980 case BRW_OPCODE_LRP:
1981 if (inst->src[1].equals(inst->src[2])) {
1982 inst->opcode = BRW_OPCODE_MOV;
1983 inst->src[0] = inst->src[1];
1984 inst->src[1] = reg_undef;
1985 inst->src[2] = reg_undef;
1986 progress = true;
1987 break;
1988 }
1989 break;
1990 case BRW_OPCODE_SEL:
1991 if (inst->saturate && inst->src[1].file == IMM) {
1992 switch (inst->conditional_mod) {
1993 case BRW_CONDITIONAL_LE:
1994 case BRW_CONDITIONAL_L:
1995 switch (inst->src[1].type) {
1996 case BRW_REGISTER_TYPE_F:
1997 if (inst->src[1].imm.f >= 1.0f) {
1998 inst->opcode = BRW_OPCODE_MOV;
1999 inst->src[1] = reg_undef;
2000 progress = true;
2001 }
2002 break;
2003 default:
2004 break;
2005 }
2006 break;
2007 case BRW_CONDITIONAL_GE:
2008 case BRW_CONDITIONAL_G:
2009 switch (inst->src[1].type) {
2010 case BRW_REGISTER_TYPE_F:
2011 if (inst->src[1].imm.f <= 0.0f) {
2012 inst->opcode = BRW_OPCODE_MOV;
2013 inst->src[1] = reg_undef;
2014 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2015 progress = true;
2016 }
2017 break;
2018 default:
2019 break;
2020 }
2021 default:
2022 break;
2023 }
2024 }
2025 break;
2026 default:
2027 break;
2028 }
2029 }
2030
2031 return progress;
2032 }
2033
2034 /**
2035 * Removes any instructions writing a VGRF where that VGRF is not used by any
2036 * later instruction.
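 *
 * For example (illustrative): a MOV whose destination's live interval ends
 * at the MOV itself is never read afterwards and is removed; ADDC, SUBB and
 * MACH are instead retargeted to the null register to preserve their
 * accumulator write.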
2037 */
2038 bool
2039 fs_visitor::dead_code_eliminate()
2040 {
2041 bool progress = false;
2042 int pc = 0;
2043
2044 calculate_live_intervals();
2045
2046 foreach_list_safe(node, &this->instructions) {
2047 fs_inst *inst = (fs_inst *)node;
2048
2049 if (inst->dst.file == GRF && !inst->has_side_effects()) {
2050 bool dead = true;
2051
2052 for (int i = 0; i < inst->regs_written; i++) {
2053 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2054 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2055 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2056 dead = false;
2057 break;
2058 }
2059 }
2060
2061 if (dead) {
2062 /* Don't dead code eliminate instructions that write to the
2063 * accumulator as a side-effect. Instead just set the destination
2064 * to the null register to free it.
2065 */
2066 switch (inst->opcode) {
2067 case BRW_OPCODE_ADDC:
2068 case BRW_OPCODE_SUBB:
2069 case BRW_OPCODE_MACH:
2070 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2071 break;
2072 default:
2073 inst->remove();
2074 progress = true;
2075 break;
2076 }
2077 }
2078 }
2079
2080 pc++;
2081 }
2082
2083 if (progress)
2084 invalidate_live_intervals();
2085
2086 return progress;
2087 }
2088
2089 struct dead_code_hash_key
2090 {
2091 int vgrf;
2092 int reg_offset;
2093 };
2094
2095 static bool
2096 dead_code_hash_compare(const void *a, const void *b)
2097 {
2098 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2099 }
2100
2101 static void
2102 clear_dead_code_hash(struct hash_table *ht)
2103 {
2104 struct hash_entry *entry;
2105
2106 hash_table_foreach(ht, entry) {
2107 _mesa_hash_table_remove(ht, entry);
2108 }
2109 }
2110
2111 static void
2112 insert_dead_code_hash(struct hash_table *ht,
2113 int vgrf, int reg_offset, fs_inst *inst)
2114 {
2115 /* We don't bother freeing keys, because they'll be GCed with the ht. */
2116 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2117
2118 key->vgrf = vgrf;
2119 key->reg_offset = reg_offset;
2120
2121 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2122 }
2123
2124 static struct hash_entry *
2125 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2126 {
2127 struct dead_code_hash_key key;
2128
2129 key.vgrf = vgrf;
2130 key.reg_offset = reg_offset;
2131
2132 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2133 }
2134
2135 static void
2136 remove_dead_code_hash(struct hash_table *ht,
2137 int vgrf, int reg_offset)
2138 {
2139 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2140 if (!entry)
2141 return;
2142
2143 _mesa_hash_table_remove(ht, entry);
2144 }
2145
2146 /**
2147 * Walks basic blocks, removing any regs that are written but not read before
2148 * being redefined.
2149 *
2150 * The dead_code_eliminate() function implements a global dead code
2151 * elimination, but it only handles removing the last write to a register
2152 * if it's never read. This one can handle intermediate writes, but only
2153 * within a basic block.
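 *
 * For example (illustrative), within one basic block:
 *
 *    mov vgrf4:F, vgrf1:F    <- never read before the complete rewrite
 *    mov vgrf4:F, vgrf2:F       below, so it is removed
 *    mul vgrf5:F, vgrf5:F, vgrf4:F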
2154 */
2155 bool
2156 fs_visitor::dead_code_eliminate_local()
2157 {
2158 struct hash_table *ht;
2159 bool progress = false;
2160
2161 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2162
2163 if (ht == NULL) {
2164 return false;
2165 }
2166
2167 foreach_list_safe(node, &this->instructions) {
2168 fs_inst *inst = (fs_inst *)node;
2169
2170 /* At a basic block boundary, empty the HT since we don't understand
2171 * dataflow here.
2172 */
2173 if (inst->is_control_flow()) {
2174 clear_dead_code_hash(ht);
2175 continue;
2176 }
2177
2178 /* Clear the HT of any instructions that got read. */
2179 for (int i = 0; i < 3; i++) {
2180 fs_reg src = inst->src[i];
2181 if (src.file != GRF)
2182 continue;
2183
2184 int read = 1;
2185 if (inst->is_send_from_grf())
2186 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2187
2188 for (int reg_offset = src.reg_offset;
2189 reg_offset < src.reg_offset + read;
2190 reg_offset++) {
2191 remove_dead_code_hash(ht, src.reg, reg_offset);
2192 }
2193 }
2194
2195 /* Add any update of a GRF to the HT, removing a previous write if it
2196 * wasn't read.
2197 */
2198 if (inst->dst.file == GRF) {
2199 if (inst->regs_written > 1) {
2200 /* We don't know how to trim channels from an instruction's
2201 * writes, so we can't incrementally remove unread channels from
2202 * it. Just remove whatever it overwrites from the table.
2203 */
2204 for (int i = 0; i < inst->regs_written; i++) {
2205 remove_dead_code_hash(ht,
2206 inst->dst.reg,
2207 inst->dst.reg_offset + i);
2208 }
2209 } else {
2210 struct hash_entry *entry =
2211 get_dead_code_hash_entry(ht, inst->dst.reg,
2212 inst->dst.reg_offset);
2213
2214 if (entry) {
2215 if (inst->is_partial_write()) {
2216 /* For a partial write, we can't remove any previous dead code
2217 * candidate, since we're just modifying their result.
2218 */
2219 } else {
2220 /* We're completely updating a channel, and there was a
2221 * previous write to the channel that wasn't read. Kill it!
2222 */
2223 fs_inst *inst = (fs_inst *)entry->data;
2224 inst->remove();
2225 progress = true;
2226 }
2227
2228 _mesa_hash_table_remove(ht, entry);
2229 }
2230
2231 if (!inst->has_side_effects())
2232 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2233 inst);
2234 }
2235 }
2236 }
2237
2238 _mesa_hash_table_destroy(ht, NULL);
2239
2240 if (progress)
2241 invalidate_live_intervals();
2242
2243 return progress;
2244 }
2245
2246 /**
2247 * Implements register coalescing: Checks if the two registers involved in a
2248 * raw move don't interfere, in which case they can both be stored in the same
2249 * place and the MOV removed.
2250 *
2251 * To do this, all uses of the source of the MOV in the shader are replaced
2252 * with the destination of the MOV. For example:
2253 *
2254 * add vgrf3:F, vgrf1:F, vgrf2:F
2255 * mov vgrf4:F, vgrf3:F
2256 * mul vgrf5:F, vgrf5:F, vgrf4:F
2257 *
2258 * becomes
2259 *
2260 * add vgrf4:F, vgrf1:F, vgrf2:F
2261 * mul vgrf5:F, vgrf5:F, vgrf4:F
2262 */
2263 bool
2264 fs_visitor::register_coalesce()
2265 {
2266 bool progress = false;
2267
2268 calculate_live_intervals();
2269
2270 int src_size = 0;
2271 int channels_remaining = 0;
2272 int reg_from = -1, reg_to = -1;
2273 int reg_to_offset[MAX_SAMPLER_MESSAGE_SIZE];
2274 fs_inst *mov[MAX_SAMPLER_MESSAGE_SIZE];
2275
2276 foreach_list(node, &this->instructions) {
2277 fs_inst *inst = (fs_inst *)node;
2278
2279 if (inst->opcode != BRW_OPCODE_MOV ||
2280 inst->is_partial_write() ||
2281 inst->saturate ||
2282 inst->src[0].file != GRF ||
2283 inst->src[0].negate ||
2284 inst->src[0].abs ||
2285 !inst->src[0].is_contiguous() ||
2286 inst->dst.file != GRF ||
2287 inst->dst.type != inst->src[0].type) {
2288 continue;
2289 }
2290
2291 if (virtual_grf_sizes[inst->src[0].reg] >
2292 virtual_grf_sizes[inst->dst.reg])
2293 continue;
2294
2295 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2296 int var_to = live_intervals->var_from_reg(&inst->dst);
2297
2298 if (live_intervals->vars_interfere(var_from, var_to) &&
2299 !inst->dst.equals(inst->src[0])) {
2300
2301 /* We know that the live ranges of A (var_from) and B (var_to)
2302 * interfere because of the ->vars_interfere() call above. If the end
2303 * of B's live range is after the end of A's range, then we know two
2304 * things:
2305 * - the start of B's live range must be in A's live range (since we
2306 * already know the two ranges interfere, this is the only remaining
2307 * possibility)
2308 * - the interference isn't of the form we're looking for (where B is
2309 * entirely inside A)
2310 */
2311 if (live_intervals->end[var_to] > live_intervals->end[var_from])
2312 continue;
2313
2314 bool overwritten = false;
2315 int scan_ip = -1;
2316
2317 foreach_list(n, &this->instructions) {
2318 fs_inst *scan_inst = (fs_inst *)n;
2319 scan_ip++;
2320
2321 if (scan_inst->is_control_flow()) {
2322 overwritten = true;
2323 break;
2324 }
2325
2326 if (scan_ip <= live_intervals->start[var_to])
2327 continue;
2328
2329 if (scan_ip > live_intervals->end[var_to])
2330 break;
2331
2332 if (scan_inst->dst.equals(inst->dst) ||
2333 scan_inst->dst.equals(inst->src[0])) {
2334 overwritten = true;
2335 break;
2336 }
2337 }
2338
2339 if (overwritten)
2340 continue;
2341 }
2342
2343 if (reg_from != inst->src[0].reg) {
2344 reg_from = inst->src[0].reg;
2345
2346 src_size = virtual_grf_sizes[inst->src[0].reg];
2347 assert(src_size <= MAX_SAMPLER_MESSAGE_SIZE);
2348
2349 channels_remaining = src_size;
2350 memset(mov, 0, sizeof(mov));
2351
2352 reg_to = inst->dst.reg;
2353 }
2354
2355 if (reg_to != inst->dst.reg)
2356 continue;
2357
2358 const int offset = inst->src[0].reg_offset;
2359 reg_to_offset[offset] = inst->dst.reg_offset;
2360 mov[offset] = inst;
2361 channels_remaining--;
2362
2363 if (channels_remaining)
2364 continue;
2365
2366 bool removed = false;
2367 for (int i = 0; i < src_size; i++) {
2368 if (mov[i]) {
2369 removed = true;
2370
2371 mov[i]->opcode = BRW_OPCODE_NOP;
2372 mov[i]->conditional_mod = BRW_CONDITIONAL_NONE;
2373 mov[i]->dst = reg_undef;
2374 mov[i]->src[0] = reg_undef;
2375 mov[i]->src[1] = reg_undef;
2376 mov[i]->src[2] = reg_undef;
2377 }
2378 }
2379
2380 foreach_list(node, &this->instructions) {
2381 fs_inst *scan_inst = (fs_inst *)node;
2382
2383 for (int i = 0; i < src_size; i++) {
2384 if (mov[i]) {
2385 if (scan_inst->dst.file == GRF &&
2386 scan_inst->dst.reg == reg_from &&
2387 scan_inst->dst.reg_offset == i) {
2388 scan_inst->dst.reg = reg_to;
2389 scan_inst->dst.reg_offset = reg_to_offset[i];
2390 }
2391 for (int j = 0; j < 3; j++) {
2392 if (scan_inst->src[j].file == GRF &&
2393 scan_inst->src[j].reg == reg_from &&
2394 scan_inst->src[j].reg_offset == i) {
2395 scan_inst->src[j].reg = reg_to;
2396 scan_inst->src[j].reg_offset = reg_to_offset[i];
2397 }
2398 }
2399 }
2400 }
2401 }
2402
2403 if (removed) {
2404 live_intervals->start[var_to] = MIN2(live_intervals->start[var_to],
2405 live_intervals->start[var_from]);
2406 live_intervals->end[var_to] = MAX2(live_intervals->end[var_to],
2407 live_intervals->end[var_from]);
2408 reg_from = -1;
2409 }
2410 }
2411
2412 foreach_list_safe(node, &this->instructions) {
2413 fs_inst *inst = (fs_inst *)node;
2414
2415 if (inst->opcode == BRW_OPCODE_NOP) {
2416 inst->remove();
2417 progress = true;
2418 }
2419 }
2420
2421 if (progress)
2422 invalidate_live_intervals();
2423
2424 return progress;
2425 }
2426
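/**
 * Rewrites the instruction that computed a GRF value so it writes directly
 * to the MRF that a following raw MOV copied it to, and removes the MOV.
 * For example (register numbers illustrative):
 *
 *    add vgrf3:F, vgrf1:F, vgrf2:F
 *    mov m4:F, vgrf3:F
 *
 * becomes
 *
 *    add m4:F, vgrf1:F, vgrf2:F
 */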
2427 bool
2428 fs_visitor::compute_to_mrf()
2429 {
2430 bool progress = false;
2431 int next_ip = 0;
2432
2433 calculate_live_intervals();
2434
2435 foreach_list_safe(node, &this->instructions) {
2436 fs_inst *inst = (fs_inst *)node;
2437
2438 int ip = next_ip;
2439 next_ip++;
2440
2441 if (inst->opcode != BRW_OPCODE_MOV ||
2442 inst->is_partial_write() ||
2443 inst->dst.file != MRF || inst->src[0].file != GRF ||
2444 inst->dst.type != inst->src[0].type ||
2445 inst->src[0].abs || inst->src[0].negate ||
2446 !inst->src[0].is_contiguous() ||
2447 inst->src[0].subreg_offset)
2448 continue;
2449
2450 /* Work out which hardware MRF registers are written by this
2451 * instruction.
2452 */
2453 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2454 int mrf_high;
2455 if (inst->dst.reg & BRW_MRF_COMPR4) {
2456 mrf_high = mrf_low + 4;
2457 } else if (dispatch_width == 16 &&
2458 (!inst->force_uncompressed && !inst->force_sechalf)) {
2459 mrf_high = mrf_low + 1;
2460 } else {
2461 mrf_high = mrf_low;
2462 }
2463
2464 /* Can't compute-to-MRF this GRF if someone else was going to
2465 * read it later.
2466 */
2467 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2468 continue;
2469
2470 /* Found a move of a GRF to a MRF. Let's see if we can go
2471 * rewrite the thing that made this GRF to write into the MRF.
2472 */
2473 fs_inst *scan_inst;
2474 for (scan_inst = (fs_inst *)inst->prev;
2475 scan_inst->prev != NULL;
2476 scan_inst = (fs_inst *)scan_inst->prev) {
2477 if (scan_inst->dst.file == GRF &&
2478 scan_inst->dst.reg == inst->src[0].reg) {
2479 /* Found the last thing to write our reg we want to turn
2480 * into a compute-to-MRF.
2481 */
2482
2483 /* If this one instruction didn't populate all the
2484 * channels, bail. We might be able to rewrite everything
2485 * that writes that reg, but it would require smarter
2486 * tracking to delay the rewriting until complete success.
2487 */
2488 if (scan_inst->is_partial_write())
2489 break;
2490
2491 /* Things returning more than one register would need us to
2492 * understand coalescing out more than one MOV at a time.
2493 */
2494 if (scan_inst->regs_written > 1)
2495 break;
2496
2497 /* SEND instructions can't have MRF as a destination. */
2498 if (scan_inst->mlen)
2499 break;
2500
2501 if (brw->gen == 6) {
2502 /* gen6 math instructions must have the destination be
2503 * GRF, so no compute-to-MRF for them.
2504 */
2505 if (scan_inst->is_math()) {
2506 break;
2507 }
2508 }
2509
2510 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2511 /* Found the creator of our MRF's source value. */
2512 scan_inst->dst.file = MRF;
2513 scan_inst->dst.reg = inst->dst.reg;
2514 scan_inst->saturate |= inst->saturate;
2515 inst->remove();
2516 progress = true;
2517 }
2518 break;
2519 }
2520
2521 /* We don't handle control flow here. Most computation of
2522 * values that end up in MRFs is shortly before the MRF
2523 * write anyway.
2524 */
2525 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2526 break;
2527
2528 /* You can't read from an MRF, so if someone else reads our
2529 * MRF's source GRF that we wanted to rewrite, that stops us.
2530 */
2531 bool interfered = false;
2532 for (int i = 0; i < 3; i++) {
2533 if (scan_inst->src[i].file == GRF &&
2534 scan_inst->src[i].reg == inst->src[0].reg &&
2535 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2536 interfered = true;
2537 }
2538 }
2539 if (interfered)
2540 break;
2541
2542 if (scan_inst->dst.file == MRF) {
2543 /* If somebody else writes our MRF here, we can't
2544 * compute-to-MRF before that.
2545 */
2546 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2547 int scan_mrf_high;
2548
2549 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2550 scan_mrf_high = scan_mrf_low + 4;
2551 } else if (dispatch_width == 16 &&
2552 (!scan_inst->force_uncompressed &&
2553 !scan_inst->force_sechalf)) {
2554 scan_mrf_high = scan_mrf_low + 1;
2555 } else {
2556 scan_mrf_high = scan_mrf_low;
2557 }
2558
2559 if (mrf_low == scan_mrf_low ||
2560 mrf_low == scan_mrf_high ||
2561 mrf_high == scan_mrf_low ||
2562 mrf_high == scan_mrf_high) {
2563 break;
2564 }
2565 }
2566
2567 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2568 /* Found a SEND instruction, which means that there are
2569 * live values in MRFs from base_mrf to base_mrf +
2570 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2571 * above it.
2572 */
2573 if (mrf_low >= scan_inst->base_mrf &&
2574 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2575 break;
2576 }
2577 if (mrf_high >= scan_inst->base_mrf &&
2578 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2579 break;
2580 }
2581 }
2582 }
2583 }
2584
2585 if (progress)
2586 invalidate_live_intervals();
2587
2588 return progress;
2589 }
2590
2591 /**
2592 * Walks through basic blocks, looking for repeated MRF writes and
2593 * removing the later ones.
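 *
 * For example (illustrative):
 *
 *    mov m2:F, vgrf5:F
 *    ...                  <- no control flow, no write to m2 or vgrf5
 *    mov m2:F, vgrf5:F    <- identical to the tracked write above, removed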
2594 */
2595 bool
2596 fs_visitor::remove_duplicate_mrf_writes()
2597 {
2598 fs_inst *last_mrf_move[16];
2599 bool progress = false;
2600
2601 /* We'd need to update the MRF tracking to handle compressed instructions. */
2602 if (dispatch_width == 16)
2603 return false;
2604
2605 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2606
2607 foreach_list_safe(node, &this->instructions) {
2608 fs_inst *inst = (fs_inst *)node;
2609
2610 if (inst->is_control_flow()) {
2611 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2612 }
2613
2614 if (inst->opcode == BRW_OPCODE_MOV &&
2615 inst->dst.file == MRF) {
2616 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2617 if (prev_inst && inst->equals(prev_inst)) {
2618 inst->remove();
2619 progress = true;
2620 continue;
2621 }
2622 }
2623
2624 /* Clear out the last-write records for MRFs that were overwritten. */
2625 if (inst->dst.file == MRF) {
2626 last_mrf_move[inst->dst.reg] = NULL;
2627 }
2628
2629 if (inst->mlen > 0 && inst->base_mrf != -1) {
2630 /* Found a SEND instruction, which will include two or fewer
2631 * implied MRF writes. We could do better here.
2632 */
2633 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2634 last_mrf_move[inst->base_mrf + i] = NULL;
2635 }
2636 }
2637
2638 /* Clear out any MRF move records whose sources got overwritten. */
2639 if (inst->dst.file == GRF) {
2640 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2641 if (last_mrf_move[i] &&
2642 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2643 last_mrf_move[i] = NULL;
2644 }
2645 }
2646 }
2647
2648 if (inst->opcode == BRW_OPCODE_MOV &&
2649 inst->dst.file == MRF &&
2650 inst->src[0].file == GRF &&
2651 !inst->is_partial_write()) {
2652 last_mrf_move[inst->dst.reg] = inst;
2653 }
2654 }
2655
2656 if (progress)
2657 invalidate_live_intervals();
2658
2659 return progress;
2660 }
2661
2662 static void
2663 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2664 int first_grf, int grf_len)
2665 {
2666 bool inst_simd16 = (dispatch_width > 8 &&
2667 !inst->force_uncompressed &&
2668 !inst->force_sechalf);
2669
2670 /* Clear the flag for registers that actually got read (as expected). */
2671 for (int i = 0; i < 3; i++) {
2672 int grf;
2673 if (inst->src[i].file == GRF) {
2674 grf = inst->src[i].reg;
2675 } else if (inst->src[i].file == HW_REG &&
2676 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2677 grf = inst->src[i].fixed_hw_reg.nr;
2678 } else {
2679 continue;
2680 }
2681
2682 if (grf >= first_grf &&
2683 grf < first_grf + grf_len) {
2684 deps[grf - first_grf] = false;
2685 if (inst_simd16)
2686 deps[grf - first_grf + 1] = false;
2687 }
2688 }
2689 }
2690
2691 /**
2692 * Implements this workaround for the original 965:
2693 *
2694 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2695 * check for post destination dependencies on this instruction, software
2696 * must ensure that there is no destination hazard for the case of ‘write
2697 * followed by a posted write’ shown in the following example.
2698 *
2699 * 1. mov r3 0
2700 * 2. send r3.xy <rest of send instruction>
2701 * 3. mov r2 r3
2702 *
2703 * Due to no post-destination dependency check on the ‘send’, the above
2704 * code sequence could have two instructions (1 and 2) in flight at the
2705 * same time that both consider ‘r3’ as the target of their final writes.
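 *
 * The pass below handles this by scanning backwards from the SEND-like
 * instruction and inserting dependency-resolving MOVs (DEP_RESOLVE_MOV)
 * immediately before it for any of its destination registers that still
 * have an outstanding, unread write.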
2706 */
2707 void
2708 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2709 {
2710 int reg_size = dispatch_width / 8;
2711 int write_len = inst->regs_written * reg_size;
2712 int first_write_grf = inst->dst.reg;
2713 bool needs_dep[BRW_MAX_MRF];
2714 assert(write_len < (int)sizeof(needs_dep) - 1);
2715
2716 memset(needs_dep, false, sizeof(needs_dep));
2717 memset(needs_dep, true, write_len);
2718
2719 clear_deps_for_inst_src(inst, dispatch_width,
2720 needs_dep, first_write_grf, write_len);
2721
2722 /* Walk backwards looking for writes to registers we're writing which
2723 * aren't read since being written. If we hit the start of the program,
2724 * we assume that there are no outstanding dependencies on entry to the
2725 * program.
2726 */
2727 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2728 scan_inst != NULL;
2729 scan_inst = (fs_inst *)scan_inst->prev) {
2730
2731 /* If we hit control flow, assume that there *are* outstanding
2732 * dependencies, and force their cleanup before our instruction.
2733 */
2734 if (scan_inst->is_control_flow()) {
2735 for (int i = 0; i < write_len; i++) {
2736 if (needs_dep[i]) {
2737 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2738 }
2739 }
2740 return;
2741 }
2742
2743 bool scan_inst_simd16 = (dispatch_width > 8 &&
2744 !scan_inst->force_uncompressed &&
2745 !scan_inst->force_sechalf);
2746
2747 /* We insert our reads as late as possible on the assumption that any
2748 * instruction but a MOV that might have left us an outstanding
2749 * dependency has more latency than a MOV.
2750 */
2751 if (scan_inst->dst.file == GRF) {
2752 for (int i = 0; i < scan_inst->regs_written; i++) {
2753 int reg = scan_inst->dst.reg + i * reg_size;
2754
2755 if (reg >= first_write_grf &&
2756 reg < first_write_grf + write_len &&
2757 needs_dep[reg - first_write_grf]) {
2758 inst->insert_before(DEP_RESOLVE_MOV(reg));
2759 needs_dep[reg - first_write_grf] = false;
2760 if (scan_inst_simd16)
2761 needs_dep[reg - first_write_grf + 1] = false;
2762 }
2763 }
2764 }
2765
2766 /* Clear the flag for registers that actually got read (as expected). */
2767 clear_deps_for_inst_src(scan_inst, dispatch_width,
2768 needs_dep, first_write_grf, write_len);
2769
2770 /* Continue the loop only if we haven't resolved all the dependencies */
2771 int i;
2772 for (i = 0; i < write_len; i++) {
2773 if (needs_dep[i])
2774 break;
2775 }
2776 if (i == write_len)
2777 return;
2778 }
2779 }
2780
2781 /**
2782 * Implements this workaround for the original 965:
2783 *
2784 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2785 * used as a destination register until after it has been sourced by an
2786 * instruction with a different destination register."
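 *
 * The pass below handles this by scanning forwards from the SEND-like
 * instruction and inserting a dependency-resolving MOV (DEP_RESOLVE_MOV)
 * before any instruction that would overwrite part of the SEND's destination
 * without that register having been read first.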
2787 */
2788 void
2789 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2790 {
2791 int write_len = inst->regs_written * dispatch_width / 8;
2792 int first_write_grf = inst->dst.reg;
2793 bool needs_dep[BRW_MAX_MRF];
2794 assert(write_len < (int)sizeof(needs_dep) - 1);
2795
2796 memset(needs_dep, false, sizeof(needs_dep));
2797 memset(needs_dep, true, write_len);
2798 /* Walk forwards looking for writes to registers we're writing which aren't
2799 * read before being written.
2800 */
2801 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2802 !scan_inst->is_tail_sentinel();
2803 scan_inst = (fs_inst *)scan_inst->next) {
2804 /* If we hit control flow, force resolve all remaining dependencies. */
2805 if (scan_inst->is_control_flow()) {
2806 for (int i = 0; i < write_len; i++) {
2807 if (needs_dep[i])
2808 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2809 }
2810 return;
2811 }
2812
2813 /* Clear the flag for registers that actually got read (as expected). */
2814 clear_deps_for_inst_src(scan_inst, dispatch_width,
2815 needs_dep, first_write_grf, write_len);
2816
2817 /* We insert our reads as late as possible since they're reading the
2818 * result of a SEND, which has massive latency.
2819 */
2820 if (scan_inst->dst.file == GRF &&
2821 scan_inst->dst.reg >= first_write_grf &&
2822 scan_inst->dst.reg < first_write_grf + write_len &&
2823 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2824 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2825 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2826 }
2827
2828 /* Continue the loop only if we haven't resolved all the dependencies */
2829 int i;
2830 for (i = 0; i < write_len; i++) {
2831 if (needs_dep[i])
2832 break;
2833 }
2834 if (i == write_len)
2835 return;
2836 }
2837
2838 /* If we hit the end of the program, resolve all remaining dependencies out
2839 * of paranoia.
2840 */
2841 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2842 assert(last_inst->eot);
2843 for (int i = 0; i < write_len; i++) {
2844 if (needs_dep[i])
2845 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2846 }
2847 }
2848
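/**
 * Applies both gen4 SEND dependency workarounds above, on original gen4
 * (non-G4x) hardware, to every instruction with a message length (mlen != 0)
 * and a GRF destination.
 */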
2849 void
2850 fs_visitor::insert_gen4_send_dependency_workarounds()
2851 {
2852 if (brw->gen != 4 || brw->is_g4x)
2853 return;
2854
2855 /* Note that we're done with register allocation, so GRF fs_regs always
2856 * have a .reg_offset of 0.
2857 */
2858
2859 foreach_list_safe(node, &this->instructions) {
2860 fs_inst *inst = (fs_inst *)node;
2861
2862 if (inst->mlen != 0 && inst->dst.file == GRF) {
2863 insert_gen4_pre_send_dependency_workarounds(inst);
2864 insert_gen4_post_send_dependency_workarounds(inst);
2865 }
2866 }
2867 }
2868
2869 /**
2870 * Turns the generic expression-style uniform pull constant load instruction
2871 * into a hardware-specific series of instructions for loading a pull
2872 * constant.
2873 *
2874 * The expression style allows the CSE pass before this to optimize out
2875 * repeated loads from the same offset, and gives the pre-register-allocation
2876 * scheduling full flexibility, while the conversion to native instructions
2877 * allows the post-register-allocation scheduler the best information
2878 * possible.
2879 *
2880 * Note that execution masking for setting up pull constant loads is special:
2881 * the channels that need to be written are unrelated to the current execution
2882 * mask, since a later instruction will use one of the result channels as a
2883 * source operand for all 8 or 16 of its channels.
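 *
 * On gen7, for example (offsets illustrative), a load with a vec4-aligned
 * byte offset of 48 becomes a SET_SIMD4X2_OFFSET writing the dword offset 12
 * into a payload register, followed by UNIFORM_PULL_CONSTANT_LOAD_GEN7
 * sourcing that payload.  On earlier gens the instruction just gets
 * base_mrf/mlen assigned so the generator can emit the message itself.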
2884 */
2885 void
2886 fs_visitor::lower_uniform_pull_constant_loads()
2887 {
2888 foreach_list(node, &this->instructions) {
2889 fs_inst *inst = (fs_inst *)node;
2890
2891 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2892 continue;
2893
2894 if (brw->gen >= 7) {
2895 /* The offset arg before was a vec4-aligned byte offset. We need to
2896 * turn it into a dword offset.
2897 */
2898 fs_reg const_offset_reg = inst->src[1];
2899 assert(const_offset_reg.file == IMM &&
2900 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2901 const_offset_reg.imm.u /= 4;
2902 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2903
2904 /* This is actually going to be a MOV, but since only the first dword
2905 * is accessed, we have a special opcode to do just that one. Note
2906 * that this needs to be an operation that will be considered a def
2907 * by live variable analysis, or register allocation will explode.
2908 */
2909 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2910 payload, const_offset_reg);
2911 setup->force_writemask_all = true;
2912
2913 setup->ir = inst->ir;
2914 setup->annotation = inst->annotation;
2915 inst->insert_before(setup);
2916
2917 /* Similarly, this will only populate the first 4 channels of the
2918 * result register (since we only use smear values from 0-3), but we
2919 * don't tell the optimizer.
2920 */
2921 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2922 inst->src[1] = payload;
2923
2924 invalidate_live_intervals();
2925 } else {
2926 /* Before register allocation, we didn't tell the scheduler about the
2927 * MRF we use. We know it's safe to use this MRF because nothing
2928 * else uses it except register spill/unspill, which generates and
2929 * uses its MRF within a single IR instruction.
2930 */
2931 inst->base_mrf = 14;
2932 inst->mlen = 1;
2933 }
2934 }
2935 }
2936
2937 void
2938 fs_visitor::dump_instructions()
2939 {
2940 calculate_register_pressure();
2941
2942 int ip = 0, max_pressure = 0;
2943 foreach_list(node, &this->instructions) {
2944 backend_instruction *inst = (backend_instruction *)node;
2945 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2946 fprintf(stderr, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2947 dump_instruction(inst);
2948 ++ip;
2949 }
2950 fprintf(stderr, "Maximum %3d registers live at once.\n", max_pressure);
2951 }
2952
2953 void
2954 fs_visitor::dump_instruction(backend_instruction *be_inst)
2955 {
2956 fs_inst *inst = (fs_inst *)be_inst;
2957
2958 if (inst->predicate) {
2959 fprintf(stderr, "(%cf0.%d) ",
2960 inst->predicate_inverse ? '-' : '+',
2961 inst->flag_subreg);
2962 }
2963
2964 fprintf(stderr, "%s", brw_instruction_name(inst->opcode));
2965 if (inst->saturate)
2966 fprintf(stderr, ".sat");
2967 if (inst->conditional_mod) {
2968 fprintf(stderr, "%s", conditional_modifier[inst->conditional_mod]);
2969 if (!inst->predicate &&
2970 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2971 inst->opcode != BRW_OPCODE_IF &&
2972 inst->opcode != BRW_OPCODE_WHILE))) {
2973 fprintf(stderr, ".f0.%d", inst->flag_subreg);
2974 }
2975 }
2976 fprintf(stderr, " ");
2977
2978
2979 switch (inst->dst.file) {
2980 case GRF:
2981 fprintf(stderr, "vgrf%d", inst->dst.reg);
2982 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
2983 inst->dst.subreg_offset)
2984 fprintf(stderr, "+%d.%d",
2985 inst->dst.reg_offset, inst->dst.subreg_offset);
2986 break;
2987 case MRF:
2988 fprintf(stderr, "m%d", inst->dst.reg);
2989 break;
2990 case BAD_FILE:
2991 fprintf(stderr, "(null)");
2992 break;
2993 case UNIFORM:
2994 fprintf(stderr, "***u%d***", inst->dst.reg);
2995 break;
2996 case HW_REG:
2997 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2998 switch (inst->dst.fixed_hw_reg.nr) {
2999 case BRW_ARF_NULL:
3000 fprintf(stderr, "null");
3001 break;
3002 case BRW_ARF_ADDRESS:
3003 fprintf(stderr, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3004 break;
3005 case BRW_ARF_ACCUMULATOR:
3006 fprintf(stderr, "acc%d", inst->dst.fixed_hw_reg.subnr);
3007 break;
3008 case BRW_ARF_FLAG:
3009 fprintf(stderr, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3010 inst->dst.fixed_hw_reg.subnr);
3011 break;
3012 default:
3013 fprintf(stderr, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3014 inst->dst.fixed_hw_reg.subnr);
3015 break;
3016 }
3017 } else {
3018 fprintf(stderr, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3019 }
3020 if (inst->dst.fixed_hw_reg.subnr)
3021 fprintf(stderr, "+%d", inst->dst.fixed_hw_reg.subnr);
3022 break;
3023 default:
3024 fprintf(stderr, "???");
3025 break;
3026 }
3027 fprintf(stderr, ":%s, ", brw_reg_type_letters(inst->dst.type));
3028
3029 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
3030 if (inst->src[i].negate)
3031 fprintf(stderr, "-");
3032 if (inst->src[i].abs)
3033 fprintf(stderr, "|");
3034 switch (inst->src[i].file) {
3035 case GRF:
3036 fprintf(stderr, "vgrf%d", inst->src[i].reg);
3037 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3038 inst->src[i].subreg_offset)
3039 fprintf(stderr, "+%d.%d", inst->src[i].reg_offset,
3040 inst->src[i].subreg_offset);
3041 break;
3042 case MRF:
3043 fprintf(stderr, "***m%d***", inst->src[i].reg);
3044 break;
3045 case UNIFORM:
3046 fprintf(stderr, "u%d", inst->src[i].reg);
3047 if (inst->src[i].reladdr) {
3048 fprintf(stderr, "+reladdr");
3049 } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3050 inst->src[i].subreg_offset) {
3051 fprintf(stderr, "+%d.%d", inst->src[i].reg_offset,
3052 inst->src[i].subreg_offset);
3053 }
3054 break;
3055 case BAD_FILE:
3056 fprintf(stderr, "(null)");
3057 break;
3058 case IMM:
3059 switch (inst->src[i].type) {
3060 case BRW_REGISTER_TYPE_F:
3061 fprintf(stderr, "%ff", inst->src[i].imm.f);
3062 break;
3063 case BRW_REGISTER_TYPE_D:
3064 fprintf(stderr, "%dd", inst->src[i].imm.i);
3065 break;
3066 case BRW_REGISTER_TYPE_UD:
3067 fprintf(stderr, "%uu", inst->src[i].imm.u);
3068 break;
3069 default:
3070 fprintf(stderr, "???");
3071 break;
3072 }
3073 break;
3074 case HW_REG:
3075 if (inst->src[i].fixed_hw_reg.negate)
3076 fprintf(stderr, "-");
3077 if (inst->src[i].fixed_hw_reg.abs)
3078 fprintf(stderr, "|");
3079 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3080 switch (inst->src[i].fixed_hw_reg.nr) {
3081 case BRW_ARF_NULL:
3082 fprintf(stderr, "null");
3083 break;
3084 case BRW_ARF_ADDRESS:
3085 fprintf(stderr, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3086 break;
3087 case BRW_ARF_ACCUMULATOR:
3088 fprintf(stderr, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3089 break;
3090 case BRW_ARF_FLAG:
3091 fprintf(stderr, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3092 inst->src[i].fixed_hw_reg.subnr);
3093 break;
3094 default:
3095 fprintf(stderr, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3096 inst->src[i].fixed_hw_reg.subnr);
3097 break;
3098 }
3099 } else {
3100 fprintf(stderr, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3101 }
3102 if (inst->src[i].fixed_hw_reg.subnr)
3103 fprintf(stderr, "+%d", inst->src[i].fixed_hw_reg.subnr);
3104 if (inst->src[i].fixed_hw_reg.abs)
3105 fprintf(stderr, "|");
3106 break;
3107 default:
3108 fprintf(stderr, "???");
3109 break;
3110 }
3111 if (inst->src[i].abs)
3112 fprintf(stderr, "|");
3113
3114 if (inst->src[i].file != IMM) {
3115 fprintf(stderr, ":%s", brw_reg_type_letters(inst->src[i].type));
3116 }
3117
3118 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
3119 fprintf(stderr, ", ");
3120 }
3121
3122 fprintf(stderr, " ");
3123
3124 if (inst->force_uncompressed)
3125 fprintf(stderr, "1sthalf ");
3126
3127 if (inst->force_sechalf)
3128 fprintf(stderr, "2ndhalf ");
3129
3130 fprintf(stderr, "\n");
3131 }
3132
3133 /**
3134 * Possibly returns an instruction that set up @param reg.
3135 *
3136 * Sometimes we want to take the result of some expression/variable
3137 * dereference tree and rewrite the instruction generating the result
3138 * of the tree. When processing the tree, we know that the
3139 * instructions generated are all writing temporaries that are dead
3140 * outside of this tree. So, if we have some instructions that write
3141 * a temporary, we're free to point that temp write somewhere else.
3142 *
3143 * Note that this doesn't guarantee that the returned instruction wrote
3144 * only reg -- it might be the size=4 destination of a texture instruction.
3145 */
3146 fs_inst *
3147 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3148 fs_inst *end,
3149 const fs_reg &reg)
3150 {
3151 if (end == start ||
3152 end->is_partial_write() ||
3153 reg.reladdr ||
3154 !reg.equals(end->dst)) {
3155 return NULL;
3156 } else {
3157 return end;
3158 }
3159 }
3160
3161 void
3162 fs_visitor::setup_payload_gen6()
3163 {
3164 bool uses_depth =
3165 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3166 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3167
3168 assert(brw->gen >= 6);
3169
3170 /* R0-1: masks, pixel X/Y coordinates. */
3171 c->nr_payload_regs = 2;
3172 /* R2: only for 32-pixel dispatch. */
3173
3174 /* R3-26: barycentric interpolation coordinates. These appear in the
3175 * same order that they appear in the brw_wm_barycentric_interp_mode
3176 * enum. Each set of coordinates occupies 2 registers if dispatch width
3177 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3178 * appear if they were enabled using the "Barycentric Interpolation
3179 * Mode" bits in WM_STATE.
3180 */
3181 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3182 if (barycentric_interp_modes & (1 << i)) {
3183 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3184 c->nr_payload_regs += 2;
3185 if (dispatch_width == 16) {
3186 c->nr_payload_regs += 2;
3187 }
3188 }
3189 }
3190
3191 /* R27: interpolated depth if uses source depth */
3192 if (uses_depth) {
3193 c->source_depth_reg = c->nr_payload_regs;
3194 c->nr_payload_regs++;
3195 if (dispatch_width == 16) {
3196 /* R28: interpolated depth if not SIMD8. */
3197 c->nr_payload_regs++;
3198 }
3199 }
3200 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3201 if (uses_depth) {
3202 c->source_w_reg = c->nr_payload_regs;
3203 c->nr_payload_regs++;
3204 if (dispatch_width == 16) {
3205 /* R30: interpolated W if not SIMD8. */
3206 c->nr_payload_regs++;
3207 }
3208 }
3209
3210 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3211 /* R31: MSAA position offsets. */
3212 if (c->prog_data.uses_pos_offset) {
3213 c->sample_pos_reg = c->nr_payload_regs;
3214 c->nr_payload_regs++;
3215 }
3216
3217 /* R32: MSAA input coverage mask */
3218 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3219 assert(brw->gen >= 7);
3220 c->sample_mask_reg = c->nr_payload_regs;
3221 c->nr_payload_regs++;
3222 if (dispatch_width == 16) {
3223 /* R33: input coverage mask if not SIMD8. */
3224 c->nr_payload_regs++;
3225 }
3226 }
3227
3228 /* R34-: bary for 32-pixel. */
3229 /* R58-59: interp W for 32-pixel. */
3230
3231 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3232 c->source_depth_to_render_target = true;
3233 }
3234 }
3235
3236 void
3237 fs_visitor::assign_binding_table_offsets()
3238 {
3239 uint32_t next_binding_table_offset = 0;
3240
3241 /* If there are no color regions, we still perform an FB write to a null
3242 * renderbuffer, which we place at surface index 0.
3243 */
3244 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3245 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3246
3247 assign_common_binding_table_offsets(next_binding_table_offset);
3248 }
3249
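/**
 * Estimates register pressure by summing, for each instruction IP, the sizes
 * of all virtual GRFs whose live interval covers that IP, storing the result
 * per-IP in regs_live_at_ip[].
 */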
3250 void
3251 fs_visitor::calculate_register_pressure()
3252 {
3253 calculate_live_intervals();
3254
3255 int num_instructions = 0;
3256 foreach_list(node, &this->instructions) {
3257 ++num_instructions;
3258 }
3259
3260 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3261
3262 for (int reg = 0; reg < virtual_grf_count; reg++) {
3263 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3264 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3265 }
3266 }
3267
3268 /**
3269 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3270 *
3271 * The needs_unlit_centroid_workaround ends up producing one of these per
3272 * channel of centroid input, so it's good to clean them up.
3273 *
3274 * An assumption here is that nothing ever modifies the dispatched pixels
3275 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3276 * dictates that anyway.
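 *
 * For example (illustrative): of two FS_OPCODE_MOV_DISPATCH_TO_FLAGS writing
 * the same flag subregister with no intervening flag write or control flow,
 * only the first is kept and the second is removed.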
3277 */
3278 void
3279 fs_visitor::opt_drop_redundant_mov_to_flags()
3280 {
3281 bool flag_mov_found[2] = {false};
3282
3283 foreach_list_safe(node, &this->instructions) {
3284 fs_inst *inst = (fs_inst *)node;
3285
3286 if (inst->is_control_flow()) {
3287 memset(flag_mov_found, 0, sizeof(flag_mov_found));
3288 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3289 if (!flag_mov_found[inst->flag_subreg])
3290 flag_mov_found[inst->flag_subreg] = true;
3291 else
3292 inst->remove();
3293 } else if (inst->writes_flag()) {
3294 flag_mov_found[inst->flag_subreg] = false;
3295 }
3296 }
3297 }
3298
3299 bool
3300 fs_visitor::run()
3301 {
3302 sanity_param_count = fp->Base.Parameters->NumParameters;
3303 bool allocated_without_spills;
3304
3305 assign_binding_table_offsets();
3306
3307 if (brw->gen >= 6)
3308 setup_payload_gen6();
3309 else
3310 setup_payload_gen4();
3311
3312 if (0) {
3313 emit_dummy_fs();
3314 } else {
3315 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3316 emit_shader_time_begin();
3317
3318 calculate_urb_setup();
3319 if (fp->Base.InputsRead > 0) {
3320 if (brw->gen < 6)
3321 emit_interpolation_setup_gen4();
3322 else
3323 emit_interpolation_setup_gen6();
3324 }
3325
3326 /* We handle discards by keeping track of the still-live pixels in f0.1.
3327 * Initialize it with the dispatched pixels.
3328 */
3329 if (fp->UsesKill || c->key.alpha_test_func) {
3330 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3331 discard_init->flag_subreg = 1;
3332 }
3333
3334 /* Generate FS IR for main(). (the visitor only descends into
3335 * functions called "main").
3336 */
3337 if (shader) {
3338 foreach_list(node, &*shader->base.ir) {
3339 ir_instruction *ir = (ir_instruction *)node;
3340 base_ir = ir;
3341 this->result = reg_undef;
3342 ir->accept(this);
3343 }
3344 } else {
3345 emit_fragment_program_code();
3346 }
3347 base_ir = NULL;
3348 if (failed)
3349 return false;
3350
3351 emit(FS_OPCODE_PLACEHOLDER_HALT);
3352
3353 if (c->key.alpha_test_func)
3354 emit_alpha_test();
3355
3356 emit_fb_writes();
3357
3358 split_virtual_grfs();
3359
3360 move_uniform_array_access_to_pull_constants();
3361 assign_constant_locations();
3362 demote_pull_constants();
3363
3364 opt_drop_redundant_mov_to_flags();
3365
3366 bool progress;
3367 do {
3368 progress = false;
3369
3370 compact_virtual_grfs();
3371
3372 progress = remove_duplicate_mrf_writes() || progress;
3373
3374 progress = opt_algebraic() || progress;
3375 progress = opt_cse() || progress;
3376 progress = opt_copy_propagate() || progress;
3377 progress = opt_peephole_predicated_break() || progress;
3378 progress = dead_code_eliminate() || progress;
3379 progress = dead_code_eliminate_local() || progress;
3380 progress = opt_peephole_sel() || progress;
3381 progress = dead_control_flow_eliminate(this) || progress;
3382 progress = opt_saturate_propagation() || progress;
3383 progress = register_coalesce() || progress;
3384 progress = compute_to_mrf() || progress;
3385 } while (progress);
3386
3387 lower_uniform_pull_constant_loads();
3388
3389 assign_curb_setup();
3390 assign_urb_setup();
3391
3392 static enum instruction_scheduler_mode pre_modes[] = {
3393 SCHEDULE_PRE,
3394 SCHEDULE_PRE_NON_LIFO,
3395 SCHEDULE_PRE_LIFO,
3396 };
3397
3398 /* Try each scheduling heuristic to see if it can successfully register
3399 * allocate without spilling. They should be ordered by decreasing
3400 * performance but increasing likelihood of allocating.
3401 */
3402 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3403 schedule_instructions(pre_modes[i]);
3404
3405 if (0) {
3406 assign_regs_trivial();
3407 allocated_without_spills = true;
3408 } else {
3409 allocated_without_spills = assign_regs(false);
3410 }
3411 if (allocated_without_spills)
3412 break;
3413 }
3414
3415 if (!allocated_without_spills) {
3416 /* We assume that any spilling is worse than just dropping back to
3417 * SIMD8. There's probably actually some intermediate point where
3418 * SIMD16 with a couple of spills is still better.
3419 */
3420 if (dispatch_width == 16) {
3421 fail("Failure to register allocate. Reduce number of "
3422 "live scalar values to avoid this.");
3423 }
3424
3425 /* Since we're out of heuristics, just go spill registers until we
3426 * get an allocation.
3427 */
3428 while (!assign_regs(true)) {
3429 if (failed)
3430 break;
3431 }
3432 }
3433 }
3434 assert(force_uncompressed_stack == 0);
3435
3436 /* This must come after all optimization and register allocation, since
3437 * it inserts dead code that happens to have side effects, and it does
3438 * so based on the actual physical registers in use.
3439 */
3440 insert_gen4_send_dependency_workarounds();
3441
3442 if (failed)
3443 return false;
3444
3445 if (!allocated_without_spills)
3446 schedule_instructions(SCHEDULE_POST);
3447
3448 if (dispatch_width == 8)
3449 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3450 else
3451 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3452
3453 /* If any state parameters were appended, then ParameterValues could have
3454 * been realloced, in which case the driver uniform storage set up by
3455 * _mesa_associate_uniform_storage() would point to freed memory. Make
3456 * sure that didn't happen.
3457 */
3458 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3459
3460 return !failed;
3461 }
3462
3463 const unsigned *
3464 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3465 struct gl_fragment_program *fp,
3466 struct gl_shader_program *prog,
3467 unsigned *final_assembly_size)
3468 {
3469 bool start_busy = false;
3470 double start_time = 0;
3471
3472 if (unlikely(brw->perf_debug)) {
3473 start_busy = (brw->batch.last_bo &&
3474 drm_intel_bo_busy(brw->batch.last_bo));
3475 start_time = get_time();
3476 }
3477
3478 struct brw_shader *shader = NULL;
3479 if (prog)
3480 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3481
3482 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3483 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3484
3485 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3486 */
3487 fs_visitor v(brw, c, prog, fp, 8);
3488 if (!v.run()) {
3489 if (prog) {
3490 prog->LinkStatus = false;
3491 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3492 }
3493
3494 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3495 v.fail_msg);
3496
3497 return NULL;
3498 }
3499
3500 exec_list *simd16_instructions = NULL;
3501 fs_visitor v2(brw, c, prog, fp, 16);
3502 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3503 if (c->prog_data.base.nr_pull_params == 0) {
3504 /* Try a SIMD16 compile */
3505 v2.import_uniforms(&v);
3506 if (!v2.run()) {
3507 perf_debug("SIMD16 shader failed to compile, falling back to "
3508 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3509 } else {
3510 simd16_instructions = &v2.instructions;
3511 }
3512 } else {
3513 perf_debug("Skipping SIMD16 due to pull parameters.\n");
3514 }
3515 }
3516
3517 const unsigned *assembly = NULL;
3518 if (brw->gen >= 8) {
3519 gen8_fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3520 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3521 final_assembly_size);
3522 } else {
3523 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3524 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3525 final_assembly_size);
3526 }
3527
3528 if (unlikely(brw->perf_debug) && shader) {
3529 if (shader->compiled_once)
3530 brw_wm_debug_recompile(brw, prog, &c->key);
3531 shader->compiled_once = true;
3532
3533 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3534 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3535 (get_time() - start_time) * 1000);
3536 }
3537 }
3538
3539 return assembly;
3540 }
3541
3542 bool
3543 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3544 {
3545 struct brw_context *brw = brw_context(ctx);
3546 struct brw_wm_prog_key key;
3547
3548 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3549 return true;
3550
3551 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3552 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3553 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3554 bool program_uses_dfdy = fp->UsesDFdy;
3555
3556 memset(&key, 0, sizeof(key));
3557
3558 if (brw->gen < 6) {
3559 if (fp->UsesKill)
3560 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3561
3562 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3563 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3564
3565 /* Just assume depth testing. */
3566 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3567 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3568 }
3569
3570 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3571 BRW_FS_VARYING_INPUT_MASK) > 16)
3572 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3573
3574 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3575
3576 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3577 for (unsigned i = 0; i < sampler_count; i++) {
3578 if (fp->Base.ShadowSamplers & (1 << i)) {
3579 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3580 key.tex.swizzles[i] =
3581 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3582 } else {
3583 /* Color sampler: assume no swizzling. */
3584 key.tex.swizzles[i] = SWIZZLE_XYZW;
3585 }
3586 }
3587
3588 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3589 key.drawable_height = ctx->DrawBuffer->Height;
3590 }
3591
3592 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3593 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3594 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3595
3596 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3597 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3598 key.nr_color_regions > 1;
3599 }
3600
3601 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3602 * quality of the derivatives is likely to be determined by the driconf
3603 * option.
3604 */
3605 key.high_quality_derivatives = brw->disable_derivative_optimization;
3606
3607 key.program_string_id = bfp->id;
3608
3609 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3610 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3611
3612 bool success = do_wm_prog(brw, prog, bfp, &key);
3613
3614 brw->wm.base.prog_offset = old_prog_offset;
3615 brw->wm.prog_data = old_prog_data;
3616
3617 return success;
3618 }