1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->conditional_mod = BRW_CONDITIONAL_NONE;
59
60 this->dst = reg_undef;
61 this->src[0] = reg_undef;
62 this->src[1] = reg_undef;
63 this->src[2] = reg_undef;
64
65 /* This will be the case for almost all instructions. */
66 this->regs_written = 1;
67 }
68
69 fs_inst::fs_inst()
70 {
71 init();
72 this->opcode = BRW_OPCODE_NOP;
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187 ALU2(SEL)
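/* As an illustrative sketch of what these macros generate (following the
 * ALU2 definition above), ALU2(ADD) expands to an emitter helper roughly
 * like:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */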
188
189 /** Gen4 predicated IF. */
190 fs_inst *
191 fs_visitor::IF(uint32_t predicate)
192 {
193 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195 return inst;
196 }
197
198 /** Gen6 IF with embedded comparison. */
199 fs_inst *
200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
201 {
202 assert(brw->gen == 6);
203 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
204 reg_null_d, src0, src1);
205 inst->conditional_mod = condition;
206 return inst;
207 }
208
209 /**
210 * CMP: Sets the low bit of each destination channel to the result
211 * of the comparison, leaves the upper bits undefined, and updates
212 * the flag register with the packed 16 bits of the result.
213 */
214 fs_inst *
215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
216 {
217 fs_inst *inst;
218
219 /* Take the instruction:
220 *
221 * CMP null<d> src0<f> src1<f>
222 *
223 * Original gen4 does type conversion to the destination type before
224 * comparison, producing garbage results for floating point comparisons.
225 * gen5 does the comparison on the execution type (resolved source types),
226 * so dst type doesn't matter. gen6 does comparison and then uses the
227 * result as if it was the dst type with no conversion, which happens to
228 * mostly work out for float-interpreted-as-int since our comparisons are
229 * for >0, =0, <0.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 exec_list
247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
248 const fs_reg &surf_index,
249 const fs_reg &varying_offset,
250 uint32_t const_offset)
251 {
252 exec_list instructions;
253 fs_inst *inst;
254
255 /* We have our constant surface use a pitch of 4 bytes, so our index can
256 * be any component of a vector, and then we load 4 contiguous
257 * components starting from that.
258 *
259 * We break down the const_offset to a portion added to the variable
260 * offset and a portion done using reg_offset, which means that if you
261 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
262 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
263 * CSE can later notice that those loads are all the same and eliminate
264 * the redundant ones.
265 */
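/* Worked example (a sketch of the code below): with const_offset == 6, the
 * ADD adds (6 & ~3) == 4 to the varying offset, the message loads the vec4
 * starting there, and the final MOV picks out component (6 & 3) == 2 of the
 * returned vec4.
 */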
266 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
267 instructions.push_tail(ADD(vec4_offset,
268 varying_offset, const_offset & ~3));
269
270 int scale = 1;
271 if (brw->gen == 4 && dispatch_width == 8) {
272 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
273 * u, v, r) as parameters, or we can just use the SIMD16 message
274 * consisting of (header, u). We choose the second, at the cost of a
275 * longer return length.
276 */
277 scale = 2;
278 }
279
280 enum opcode op;
281 if (brw->gen >= 7)
282 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
283 else
284 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
285 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
286 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
287 inst->regs_written = 4 * scale;
288 instructions.push_tail(inst);
289
290 if (brw->gen < 7) {
291 inst->base_mrf = 13;
292 inst->header_present = true;
293 if (brw->gen == 4)
294 inst->mlen = 3;
295 else
296 inst->mlen = 1 + dispatch_width / 8;
297 }
298
299 vec4_result.reg_offset += (const_offset & 3) * scale;
300 instructions.push_tail(MOV(dst, vec4_result));
301
302 return instructions;
303 }
304
305 /**
306 * A helper for MOV generation for fixing up broken hardware SEND dependency
307 * handling.
308 */
309 fs_inst *
310 fs_visitor::DEP_RESOLVE_MOV(int grf)
311 {
312 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
313
314 inst->ir = NULL;
315 inst->annotation = "send dependency resolve";
316
317 /* The caller always wants uncompressed to emit the minimal extra
318 * dependencies, and to avoid having to deal with aligning its regs to 2.
319 */
320 inst->force_uncompressed = true;
321
322 return inst;
323 }
324
325 bool
326 fs_inst::equals(fs_inst *inst) const
327 {
328 return (opcode == inst->opcode &&
329 dst.equals(inst->dst) &&
330 src[0].equals(inst->src[0]) &&
331 src[1].equals(inst->src[1]) &&
332 src[2].equals(inst->src[2]) &&
333 saturate == inst->saturate &&
334 predicate == inst->predicate &&
335 conditional_mod == inst->conditional_mod &&
336 mlen == inst->mlen &&
337 base_mrf == inst->base_mrf &&
338 sampler == inst->sampler &&
339 target == inst->target &&
340 eot == inst->eot &&
341 header_present == inst->header_present &&
342 shadow_compare == inst->shadow_compare &&
343 offset == inst->offset);
344 }
345
346 bool
347 fs_inst::overwrites_reg(const fs_reg &reg) const
348 {
349 return (reg.file == dst.file &&
350 reg.reg == dst.reg &&
351 reg.reg_offset >= dst.reg_offset &&
352 reg.reg_offset < dst.reg_offset + regs_written);
353 }
354
355 bool
356 fs_inst::is_send_from_grf() const
357 {
358 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
359 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
360 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
361 src[1].file == GRF) ||
362 (is_tex() && src[0].file == GRF));
363 }
364
365 bool
366 fs_visitor::can_do_source_mods(fs_inst *inst)
367 {
368 if (brw->gen == 6 && inst->is_math())
369 return false;
370
371 if (inst->is_send_from_grf())
372 return false;
373
374 if (!inst->can_do_source_mods())
375 return false;
376
377 return true;
378 }
379
380 void
381 fs_reg::init()
382 {
383 memset(this, 0, sizeof(*this));
384 stride = 1;
385 }
386
387 /** Generic unset register constructor. */
388 fs_reg::fs_reg()
389 {
390 init();
391 this->file = BAD_FILE;
392 }
393
394 /** Immediate value constructor. */
395 fs_reg::fs_reg(float f)
396 {
397 init();
398 this->file = IMM;
399 this->type = BRW_REGISTER_TYPE_F;
400 this->imm.f = f;
401 }
402
403 /** Immediate value constructor. */
404 fs_reg::fs_reg(int32_t i)
405 {
406 init();
407 this->file = IMM;
408 this->type = BRW_REGISTER_TYPE_D;
409 this->imm.i = i;
410 }
411
412 /** Immediate value constructor. */
413 fs_reg::fs_reg(uint32_t u)
414 {
415 init();
416 this->file = IMM;
417 this->type = BRW_REGISTER_TYPE_UD;
418 this->imm.u = u;
419 }
420
421 /** Fixed brw_reg. */
422 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
423 {
424 init();
425 this->file = HW_REG;
426 this->fixed_hw_reg = fixed_hw_reg;
427 this->type = fixed_hw_reg.type;
428 }
429
430 bool
431 fs_reg::equals(const fs_reg &r) const
432 {
433 return (file == r.file &&
434 reg == r.reg &&
435 reg_offset == r.reg_offset &&
436 subreg_offset == r.subreg_offset &&
437 type == r.type &&
438 negate == r.negate &&
439 abs == r.abs &&
440 !reladdr && !r.reladdr &&
441 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
442 sizeof(fixed_hw_reg)) == 0 &&
443 stride == r.stride &&
444 imm.u == r.imm.u);
445 }
446
447 fs_reg &
448 fs_reg::apply_stride(unsigned stride)
449 {
450 assert((this->stride * stride) <= 4 &&
451 (is_power_of_two(stride) || stride == 0) &&
452 file != HW_REG && file != IMM);
453 this->stride *= stride;
454 return *this;
455 }
456
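/* A minimal sketch of what the smear below does: set_smear(2) on a UD
 * register points subreg_offset at dword 2 and sets stride to 0, so every
 * channel reads that same dword.  get_timestamp() and
 * emit_shader_time_end() use this to pick out individual timestamp fields.
 */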
457 fs_reg &
458 fs_reg::set_smear(unsigned subreg)
459 {
460 assert(file != HW_REG && file != IMM);
461 subreg_offset = subreg * type_sz(type);
462 stride = 0;
463 return *this;
464 }
465
466 bool
467 fs_reg::is_contiguous() const
468 {
469 return stride == 1;
470 }
471
472 bool
473 fs_reg::is_zero() const
474 {
475 if (file != IMM)
476 return false;
477
478 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
479 }
480
481 bool
482 fs_reg::is_one() const
483 {
484 if (file != IMM)
485 return false;
486
487 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
488 }
489
490 bool
491 fs_reg::is_null() const
492 {
493 return file == HW_REG &&
494 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
495 fixed_hw_reg.nr == BRW_ARF_NULL;
496 }
497
498 bool
499 fs_reg::is_valid_3src() const
500 {
501 return file == GRF || file == UNIFORM;
502 }
503
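/* A minimal sketch of how the slot counting below works: type_size(vec4)
 * is 4, type_size(float[10]) is 10 * 1 == 10, and a struct { vec3 a;
 * float b; } sums its members to 3 + 1 == 4.  Samplers and atomics
 * contribute 0.
 */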
504 int
505 fs_visitor::type_size(const struct glsl_type *type)
506 {
507 unsigned int size, i;
508
509 switch (type->base_type) {
510 case GLSL_TYPE_UINT:
511 case GLSL_TYPE_INT:
512 case GLSL_TYPE_FLOAT:
513 case GLSL_TYPE_BOOL:
514 return type->components();
515 case GLSL_TYPE_ARRAY:
516 return type_size(type->fields.array) * type->length;
517 case GLSL_TYPE_STRUCT:
518 size = 0;
519 for (i = 0; i < type->length; i++) {
520 size += type_size(type->fields.structure[i].type);
521 }
522 return size;
523 case GLSL_TYPE_SAMPLER:
524 /* Samplers take up no register space, since they're baked in at
525 * link time.
526 */
527 return 0;
528 case GLSL_TYPE_ATOMIC_UINT:
529 return 0;
530 case GLSL_TYPE_IMAGE:
531 case GLSL_TYPE_VOID:
532 case GLSL_TYPE_ERROR:
533 case GLSL_TYPE_INTERFACE:
534 assert(!"not reached");
535 break;
536 }
537
538 return 0;
539 }
540
541 fs_reg
542 fs_visitor::get_timestamp()
543 {
544 assert(brw->gen >= 7);
545
546 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
547 BRW_ARF_TIMESTAMP,
548 0),
549 BRW_REGISTER_TYPE_UD));
550
551 fs_reg dst = fs_reg(this, glsl_type::uint_type);
552
553 fs_inst *mov = emit(MOV(dst, ts));
554 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
555 * even if it's not enabled in the dispatch.
556 */
557 mov->force_writemask_all = true;
558 mov->force_uncompressed = true;
559
560 /* The caller wants the low 32 bits of the timestamp. Since it's running
561 * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
562 * which is plenty of time for our purposes. It is identical across the
563 * EUs, but since it's tracking GPU core speed it will increment at a
564 * varying rate as render P-states change.
565 *
566 * The caller could also check if render P-states have changed (or anything
567 * else that might disrupt timing) by setting smear to 2 and checking if
568 * that field is != 0.
569 */
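/* Back-of-the-envelope, assuming the nominal rate quoted above: 2^32
 * cycles / ~1.2e9 cycles per second is roughly 3.6 seconds between
 * rollovers of the low 32 bits.
 */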
570 dst.set_smear(0);
571
572 return dst;
573 }
574
575 void
576 fs_visitor::emit_shader_time_begin()
577 {
578 current_annotation = "shader time start";
579 shader_start_time = get_timestamp();
580 }
581
582 void
583 fs_visitor::emit_shader_time_end()
584 {
585 current_annotation = "shader time end";
586
587 enum shader_time_shader_type type, written_type, reset_type;
588 if (dispatch_width == 8) {
589 type = ST_FS8;
590 written_type = ST_FS8_WRITTEN;
591 reset_type = ST_FS8_RESET;
592 } else {
593 assert(dispatch_width == 16);
594 type = ST_FS16;
595 written_type = ST_FS16_WRITTEN;
596 reset_type = ST_FS16_RESET;
597 }
598
599 fs_reg shader_end_time = get_timestamp();
600
601 /* Check that there weren't any timestamp reset events (assuming these
602 * were the only two timestamp reads that happened).
603 */
604 fs_reg reset = shader_end_time;
605 reset.set_smear(2);
606 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
607 test->conditional_mod = BRW_CONDITIONAL_Z;
608 emit(IF(BRW_PREDICATE_NORMAL));
609
610 push_force_uncompressed();
611 fs_reg start = shader_start_time;
612 start.negate = true;
613 fs_reg diff = fs_reg(this, glsl_type::uint_type);
614 emit(ADD(diff, start, shader_end_time));
615
616 /* If there were no instructions between the two timestamp gets, the diff
617 * is 2 cycles. Remove that overhead, so I can forget about that when
618 * trying to determine the time taken for single instructions.
619 */
620 emit(ADD(diff, diff, fs_reg(-2u)));
621
622 emit_shader_time_write(type, diff);
623 emit_shader_time_write(written_type, fs_reg(1u));
624 emit(BRW_OPCODE_ELSE);
625 emit_shader_time_write(reset_type, fs_reg(1u));
626 emit(BRW_OPCODE_ENDIF);
627
628 pop_force_uncompressed();
629 }
630
631 void
632 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
633 fs_reg value)
634 {
635 int shader_time_index =
636 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
637 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
638
639 fs_reg payload;
640 if (dispatch_width == 8)
641 payload = fs_reg(this, glsl_type::uvec2_type);
642 else
643 payload = fs_reg(this, glsl_type::uint_type);
644
645 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
646 fs_reg(), payload, offset, value));
647 }
648
649 void
650 fs_visitor::vfail(const char *format, va_list va)
651 {
652 char *msg;
653
654 if (failed)
655 return;
656
657 failed = true;
658
659 msg = ralloc_vasprintf(mem_ctx, format, va);
660 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
661
662 this->fail_msg = msg;
663
664 if (INTEL_DEBUG & DEBUG_WM) {
665 fprintf(stderr, "%s", msg);
666 }
667 }
668
669 void
670 fs_visitor::fail(const char *format, ...)
671 {
672 va_list va;
673
674 va_start(va, format);
675 vfail(format, va);
676 va_end(va);
677 }
678
679 /**
680 * Mark this program as impossible to compile in SIMD16 mode.
681 *
682 * During the SIMD8 compile (which happens first), we can detect and flag
683 * things that are unsupported in SIMD16 mode, so the compiler can skip
684 * the SIMD16 compile altogether.
685 *
686 * During a SIMD16 compile (if one happens anyway), this just calls fail().
687 */
688 void
689 fs_visitor::no16(const char *format, ...)
690 {
691 va_list va;
692
693 va_start(va, format);
694
695 if (dispatch_width == 16) {
696 vfail(format, va);
697 } else {
698 simd16_unsupported = true;
699
700 if (brw->perf_debug) {
701 if (no16_msg)
702 ralloc_vasprintf_append(&no16_msg, format, va);
703 else
704 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
705 }
706 }
707
708 va_end(va);
709 }
710
711 fs_inst *
712 fs_visitor::emit(enum opcode opcode)
713 {
714 return emit(new(mem_ctx) fs_inst(opcode));
715 }
716
717 fs_inst *
718 fs_visitor::emit(enum opcode opcode, fs_reg dst)
719 {
720 return emit(new(mem_ctx) fs_inst(opcode, dst));
721 }
722
723 fs_inst *
724 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
725 {
726 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
727 }
728
729 fs_inst *
730 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
731 {
732 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
733 }
734
735 fs_inst *
736 fs_visitor::emit(enum opcode opcode, fs_reg dst,
737 fs_reg src0, fs_reg src1, fs_reg src2)
738 {
739 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
740 }
741
742 void
743 fs_visitor::push_force_uncompressed()
744 {
745 force_uncompressed_stack++;
746 }
747
748 void
749 fs_visitor::pop_force_uncompressed()
750 {
751 force_uncompressed_stack--;
752 assert(force_uncompressed_stack >= 0);
753 }
754
755 /**
756 * Returns true if the instruction has a flag that means it won't
757 * update an entire destination register.
758 *
759 * For example, dead code elimination and live variable analysis want to know
760 * when a write to a variable screens off any preceding values that were in
761 * it.
762 */
763 bool
764 fs_inst::is_partial_write() const
765 {
766 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
767 this->force_uncompressed ||
768 this->force_sechalf || !this->dst.is_contiguous());
769 }
770
771 int
772 fs_inst::regs_read(fs_visitor *v, int arg) const
773 {
774 if (is_tex() && arg == 0 && src[0].file == GRF) {
775 if (v->dispatch_width == 16)
776 return (mlen + 1) / 2;
777 else
778 return mlen;
779 }
780 return 1;
781 }
782
783 bool
784 fs_inst::reads_flag() const
785 {
786 return predicate;
787 }
788
789 bool
790 fs_inst::writes_flag() const
791 {
792 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
793 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
794 }
795
796 /**
797 * Returns how many MRFs an FS opcode will write over.
798 *
799 * Note that this is not the 0 or 1 implied writes in an actual gen
800 * instruction -- the FS opcodes often generate MOVs in addition.
801 */
802 int
803 fs_visitor::implied_mrf_writes(fs_inst *inst)
804 {
805 if (inst->mlen == 0)
806 return 0;
807
808 if (inst->base_mrf == -1)
809 return 0;
810
811 switch (inst->opcode) {
812 case SHADER_OPCODE_RCP:
813 case SHADER_OPCODE_RSQ:
814 case SHADER_OPCODE_SQRT:
815 case SHADER_OPCODE_EXP2:
816 case SHADER_OPCODE_LOG2:
817 case SHADER_OPCODE_SIN:
818 case SHADER_OPCODE_COS:
819 return 1 * dispatch_width / 8;
820 case SHADER_OPCODE_POW:
821 case SHADER_OPCODE_INT_QUOTIENT:
822 case SHADER_OPCODE_INT_REMAINDER:
823 return 2 * dispatch_width / 8;
824 case SHADER_OPCODE_TEX:
825 case FS_OPCODE_TXB:
826 case SHADER_OPCODE_TXD:
827 case SHADER_OPCODE_TXF:
828 case SHADER_OPCODE_TXF_CMS:
829 case SHADER_OPCODE_TXF_MCS:
830 case SHADER_OPCODE_TG4:
831 case SHADER_OPCODE_TG4_OFFSET:
832 case SHADER_OPCODE_TXL:
833 case SHADER_OPCODE_TXS:
834 case SHADER_OPCODE_LOD:
835 return 1;
836 case FS_OPCODE_FB_WRITE:
837 return 2;
838 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
839 case SHADER_OPCODE_GEN4_SCRATCH_READ:
840 return 1;
841 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
842 return inst->mlen;
843 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
844 return 2;
845 case SHADER_OPCODE_UNTYPED_ATOMIC:
846 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
847 return 0;
848 default:
849 assert(!"not reached");
850 return inst->mlen;
851 }
852 }
853
854 int
855 fs_visitor::virtual_grf_alloc(int size)
856 {
857 if (virtual_grf_array_size <= virtual_grf_count) {
858 if (virtual_grf_array_size == 0)
859 virtual_grf_array_size = 16;
860 else
861 virtual_grf_array_size *= 2;
862 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
863 virtual_grf_array_size);
864 }
865 virtual_grf_sizes[virtual_grf_count] = size;
866 return virtual_grf_count++;
867 }
868
869 /** Register constructor for a given file and register number. */
870 fs_reg::fs_reg(enum register_file file, int reg)
871 {
872 init();
873 this->file = file;
874 this->reg = reg;
875 this->type = BRW_REGISTER_TYPE_F;
876 }
877
878 /** Register constructor for a given file, register number and type. */
879 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
880 {
881 init();
882 this->file = file;
883 this->reg = reg;
884 this->type = type;
885 }
886
887 /** Automatic reg constructor. */
888 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
889 {
890 init();
891
892 this->file = GRF;
893 this->reg = v->virtual_grf_alloc(v->type_size(type));
894 this->reg_offset = 0;
895 this->type = brw_type_for_base_type(type);
896 }
897
898 fs_reg *
899 fs_visitor::variable_storage(ir_variable *var)
900 {
901 return (fs_reg *)hash_table_find(this->variable_ht, var);
902 }
903
904 void
905 import_uniforms_callback(const void *key,
906 void *data,
907 void *closure)
908 {
909 struct hash_table *dst_ht = (struct hash_table *)closure;
910 const fs_reg *reg = (const fs_reg *)data;
911
912 if (reg->file != UNIFORM)
913 return;
914
915 hash_table_insert(dst_ht, data, key);
916 }
917
918 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
919 * This brings in those uniform definitions.
920 */
921 void
922 fs_visitor::import_uniforms(fs_visitor *v)
923 {
924 hash_table_call_foreach(v->variable_ht,
925 import_uniforms_callback,
926 variable_ht);
927 this->push_constant_loc = v->push_constant_loc;
928 this->pull_constant_loc = v->pull_constant_loc;
929 this->uniforms = v->uniforms;
930 this->param_size = v->param_size;
931 }
932
933 /* Our support for uniforms is piggy-backed on the struct
934 * gl_fragment_program, because that's where the values actually
935 * get stored, rather than in some global gl_shader_program uniform
936 * store.
937 */
938 void
939 fs_visitor::setup_uniform_values(ir_variable *ir)
940 {
941 int namelen = strlen(ir->name);
942
943 /* The data for our (non-builtin) uniforms is stored in a series of
944 * gl_uniform_driver_storage structs for each subcomponent that
945 * glGetUniformLocation() could name. We know it's been set up in the same
946 * order we'd walk the type, so walk the list of storage and find anything
947 * with our name, or the prefix of a component that starts with our name.
948 */
949 unsigned params_before = uniforms;
950 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
951 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
952
953 if (strncmp(ir->name, storage->name, namelen) != 0 ||
954 (storage->name[namelen] != 0 &&
955 storage->name[namelen] != '.' &&
956 storage->name[namelen] != '[')) {
957 continue;
958 }
959
960 unsigned slots = storage->type->component_slots();
961 if (storage->array_elements)
962 slots *= storage->array_elements;
963
964 for (unsigned i = 0; i < slots; i++) {
965 stage_prog_data->param[uniforms++] = &storage->storage[i].f;
966 }
967 }
968
969 /* Make sure we actually initialized the right amount of stuff here. */
970 assert(params_before + ir->type->component_slots() == uniforms);
971 (void)params_before;
972 }
973
974
975 /* Our support for builtin uniforms is even scarier than non-builtin.
976 * It sits on top of the PROG_STATE_VAR parameters that are
977 * automatically updated from GL context state.
978 */
979 void
980 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
981 {
982 const ir_state_slot *const slots = ir->state_slots;
983 assert(ir->state_slots != NULL);
984
985 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
986 /* This state reference has already been setup by ir_to_mesa, but we'll
987 * get the same index back here.
988 */
989 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
990 (gl_state_index *)slots[i].tokens);
991
992 /* Add each of the unique swizzles of the element as a parameter.
993 * This'll end up matching the expected layout of the
994 * array/matrix/structure we're trying to fill in.
995 */
996 int last_swiz = -1;
997 for (unsigned int j = 0; j < 4; j++) {
998 int swiz = GET_SWZ(slots[i].swizzle, j);
999 if (swiz == last_swiz)
1000 break;
1001 last_swiz = swiz;
1002
1003 stage_prog_data->param[uniforms++] =
1004 &fp->Base.Parameters->ParameterValues[index][swiz].f;
1005 }
1006 }
1007 }
1008
1009 fs_reg *
1010 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1011 {
1012 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1013 fs_reg wpos = *reg;
1014 bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
1015
1016 /* gl_FragCoord.x */
1017 if (ir->data.pixel_center_integer) {
1018 emit(MOV(wpos, this->pixel_x));
1019 } else {
1020 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1021 }
1022 wpos.reg_offset++;
1023
1024 /* gl_FragCoord.y */
1025 if (!flip && ir->data.pixel_center_integer) {
1026 emit(MOV(wpos, this->pixel_y));
1027 } else {
1028 fs_reg pixel_y = this->pixel_y;
1029 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1030
1031 if (flip) {
1032 pixel_y.negate = true;
1033 offset += c->key.drawable_height - 1.0;
1034 }
1035
1036 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1037 }
1038 wpos.reg_offset++;
1039
1040 /* gl_FragCoord.z */
1041 if (brw->gen >= 6) {
1042 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
1043 } else {
1044 emit(FS_OPCODE_LINTERP, wpos,
1045 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1046 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1047 interp_reg(VARYING_SLOT_POS, 2));
1048 }
1049 wpos.reg_offset++;
1050
1051 /* gl_FragCoord.w: Already set up in emit_interpolation */
1052 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1053
1054 return reg;
1055 }
1056
1057 fs_inst *
1058 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1059 glsl_interp_qualifier interpolation_mode,
1060 bool is_centroid, bool is_sample)
1061 {
1062 brw_wm_barycentric_interp_mode barycoord_mode;
1063 if (brw->gen >= 6) {
1064 if (is_centroid) {
1065 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1066 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1067 else
1068 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1069 } else if (is_sample) {
1070 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1071 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1072 else
1073 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1074 } else {
1075 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1076 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1077 else
1078 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1079 }
1080 } else {
1081 /* On Ironlake and below, there is only one interpolation mode.
1082 * Centroid interpolation doesn't mean anything on this hardware --
1083 * there is no multisampling.
1084 */
1085 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1086 }
1087 return emit(FS_OPCODE_LINTERP, attr,
1088 this->delta_x[barycoord_mode],
1089 this->delta_y[barycoord_mode], interp);
1090 }
1091
1092 fs_reg *
1093 fs_visitor::emit_general_interpolation(ir_variable *ir)
1094 {
1095 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1096 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1097 fs_reg attr = *reg;
1098
1099 unsigned int array_elements;
1100 const glsl_type *type;
1101
1102 if (ir->type->is_array()) {
1103 array_elements = ir->type->length;
1104 if (array_elements == 0) {
1105 fail("dereferenced array '%s' has length 0\n", ir->name);
1106 }
1107 type = ir->type->fields.array;
1108 } else {
1109 array_elements = 1;
1110 type = ir->type;
1111 }
1112
1113 glsl_interp_qualifier interpolation_mode =
1114 ir->determine_interpolation_mode(c->key.flat_shade);
1115
1116 int location = ir->data.location;
1117 for (unsigned int i = 0; i < array_elements; i++) {
1118 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1119 if (c->prog_data.urb_setup[location] == -1) {
1120 /* If there's no incoming setup data for this slot, don't
1121 * emit interpolation for it.
1122 */
1123 attr.reg_offset += type->vector_elements;
1124 location++;
1125 continue;
1126 }
1127
1128 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1129 /* Constant interpolation (flat shading) case. The SF has
1130 * handed us defined values in only the constant offset
1131 * field of the setup reg.
1132 */
1133 for (unsigned int k = 0; k < type->vector_elements; k++) {
1134 struct brw_reg interp = interp_reg(location, k);
1135 interp = suboffset(interp, 3);
1136 interp.type = reg->type;
1137 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1138 attr.reg_offset++;
1139 }
1140 } else {
1141 /* Smooth/noperspective interpolation case. */
1142 for (unsigned int k = 0; k < type->vector_elements; k++) {
1143 struct brw_reg interp = interp_reg(location, k);
1144 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1145 ir->data.centroid && !c->key.persample_shading,
1146 ir->data.sample || c->key.persample_shading);
1147 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1148 /* Get the pixel/sample mask into f0 so that we know
1149 * which pixels are lit. Then, for each channel that is
1150 * unlit, replace the centroid data with non-centroid
1151 * data.
1152 */
1153 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1154 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1155 interpolation_mode,
1156 false, false);
1157 inst->predicate = BRW_PREDICATE_NORMAL;
1158 inst->predicate_inverse = true;
1159 }
1160 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1161 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1162 }
1163 attr.reg_offset++;
1164 }
1165
1166 }
1167 location++;
1168 }
1169 }
1170
1171 return reg;
1172 }
1173
1174 fs_reg *
1175 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1176 {
1177 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1178
1179 /* The frontfacing comes in as a bit in the thread payload. */
1180 if (brw->gen >= 6) {
1181 emit(BRW_OPCODE_ASR, *reg,
1182 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1183 fs_reg(15));
1184 emit(BRW_OPCODE_NOT, *reg, *reg);
1185 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1186 } else {
1187 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1188 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1189 * us front face
1190 */
1191 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1192 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1193 }
1194
1195 return reg;
1196 }
1197
1198 void
1199 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1200 {
1201 assert(dst.type == BRW_REGISTER_TYPE_F);
1202
1203 if (c->key.compute_pos_offset) {
1204 /* Convert int_sample_pos to floating point */
1205 emit(MOV(dst, int_sample_pos));
1206 /* Scale to the range [0, 1] */
1207 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1208 }
1209 else {
1210 /* From ARB_sample_shading specification:
1211 * "When rendering to a non-multisample buffer, or if multisample
1212 * rasterization is disabled, gl_SamplePosition will always be
1213 * (0.5, 0.5)."
1214 */
1215 emit(MOV(dst, fs_reg(0.5f)));
1216 }
1217 }
1218
1219 fs_reg *
1220 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1221 {
1222 assert(brw->gen >= 6);
1223 assert(ir->type == glsl_type::vec2_type);
1224
1225 this->current_annotation = "compute sample position";
1226 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1227 fs_reg pos = *reg;
1228 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1229 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1230
1231 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1232 * mode will be enabled.
1233 *
1234 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1235 * R31.1:0 Position Offset X/Y for Slot[3:0]
1236 * R31.3:2 Position Offset X/Y for Slot[7:4]
1237 * .....
1238 *
1239 * The X, Y sample positions come in as bytes in thread payload. So, read
1240 * the positions using vstride=16, width=8, hstride=2.
1241 */
1242 struct brw_reg sample_pos_reg =
1243 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1244 BRW_REGISTER_TYPE_B), 16, 8, 2);
1245
1246 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1247 if (dispatch_width == 16) {
1248 fs_inst *inst = emit(MOV(half(int_sample_x, 1),
1249 fs_reg(suboffset(sample_pos_reg, 16))));
1250 inst->force_sechalf = true;
1251 }
1252 /* Compute gl_SamplePosition.x */
1253 compute_sample_position(pos, int_sample_x);
1254 pos.reg_offset++;
1255 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1256 if (dispatch_width == 16) {
1257 fs_inst *inst = emit(MOV(half(int_sample_y, 1),
1258 fs_reg(suboffset(sample_pos_reg, 17))));
1259 inst->force_sechalf = true;
1260 }
1261 /* Compute gl_SamplePosition.y */
1262 compute_sample_position(pos, int_sample_y);
1263 return reg;
1264 }
1265
1266 fs_reg *
1267 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1268 {
1269 assert(brw->gen >= 6);
1270
1271 this->current_annotation = "compute sample id";
1272 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1273
1274 if (c->key.compute_sample_id) {
1275 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1276 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1277 t2.type = BRW_REGISTER_TYPE_UW;
1278
1279 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1280 * 8x multisampling, subspan 0 will represent sample N (where N
1281 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1282 * 7. We can find the value of N by looking at R0.0 bits 7:6
1283 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1284 * (since samples are always delivered in pairs). That is, we
1285 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1286 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1287 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1288 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1289 * populating a temporary variable with the sequence (0, 1, 2, 3),
1290 * and then reading from it using vstride=1, width=4, hstride=0.
1291 * These computations hold good for 4x multisampling as well.
1292 */
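/* Quick arithmetic check of the identity above: shifting right by one bit
 * less doubles the value, e.g. for SSPI == 2 (bits 7:6 == 0b10),
 * (0x80 >> 5) == 4 == 2 * (0x80 >> 6).
 */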
1293 emit(BRW_OPCODE_AND, t1,
1294 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1295 fs_reg(brw_imm_d(0xc0)));
1296 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1297 /* This works for both SIMD8 and SIMD16 */
1298 emit(MOV(t2, brw_imm_v(0x3210)));
1299 /* This special instruction takes care of setting vstride=1,
1300 * width=4, hstride=0 of t2 during an ADD instruction.
1301 */
1302 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1303 } else {
1304 /* As per GL_ARB_sample_shading specification:
1305 * "When rendering to a non-multisample buffer, or if multisample
1306 * rasterization is disabled, gl_SampleID will always be zero."
1307 */
1308 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1309 }
1310
1311 return reg;
1312 }
1313
1314 fs_reg *
1315 fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
1316 {
1317 assert(brw->gen >= 7);
1318 this->current_annotation = "compute gl_SampleMaskIn";
1319 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1320 emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
1321 return reg;
1322 }
1323
1324 fs_reg
1325 fs_visitor::fix_math_operand(fs_reg src)
1326 {
1327 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1328 * might be able to do better by doing execsize = 1 math and then
1329 * expanding that result out, but we would need to be careful with
1330 * masking.
1331 *
1332 * The hardware ignores source modifiers (negate and abs) on math
1333 * instructions, so we also move to a temp to set those up.
1334 */
1335 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1336 !src.abs && !src.negate)
1337 return src;
1338
1339 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1340 * operands to math
1341 */
1342 if (brw->gen >= 7 && src.file != IMM)
1343 return src;
1344
1345 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1346 expanded.type = src.type;
1347 emit(BRW_OPCODE_MOV, expanded, src);
1348 return expanded;
1349 }
1350
1351 fs_inst *
1352 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1353 {
1354 switch (opcode) {
1355 case SHADER_OPCODE_RCP:
1356 case SHADER_OPCODE_RSQ:
1357 case SHADER_OPCODE_SQRT:
1358 case SHADER_OPCODE_EXP2:
1359 case SHADER_OPCODE_LOG2:
1360 case SHADER_OPCODE_SIN:
1361 case SHADER_OPCODE_COS:
1362 break;
1363 default:
1364 assert(!"not reached: bad math opcode");
1365 return NULL;
1366 }
1367
1368 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1369 * might be able to do better by doing execsize = 1 math and then
1370 * expanding that result out, but we would need to be careful with
1371 * masking.
1372 *
1373 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1374 * instructions, so we also move to a temp to set those up.
1375 */
1376 if (brw->gen >= 6)
1377 src = fix_math_operand(src);
1378
1379 fs_inst *inst = emit(opcode, dst, src);
1380
1381 if (brw->gen < 6) {
1382 inst->base_mrf = 2;
1383 inst->mlen = dispatch_width / 8;
1384 }
1385
1386 return inst;
1387 }
1388
1389 fs_inst *
1390 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1391 {
1392 int base_mrf = 2;
1393 fs_inst *inst;
1394
1395 switch (opcode) {
1396 case SHADER_OPCODE_INT_QUOTIENT:
1397 case SHADER_OPCODE_INT_REMAINDER:
1398 if (brw->gen >= 7)
1399 no16("SIMD16 INTDIV unsupported\n");
1400 break;
1401 case SHADER_OPCODE_POW:
1402 break;
1403 default:
1404 assert(!"not reached: unsupported binary math opcode.");
1405 return NULL;
1406 }
1407
1408 if (brw->gen >= 6) {
1409 src0 = fix_math_operand(src0);
1410 src1 = fix_math_operand(src1);
1411
1412 inst = emit(opcode, dst, src0, src1);
1413 } else {
1414 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1415 * "Message Payload":
1416 *
1417 * "Operand0[7]. For the INT DIV functions, this operand is the
1418 * denominator."
1419 * ...
1420 * "Operand1[7]. For the INT DIV functions, this operand is the
1421 * numerator."
1422 */
1423 bool is_int_div = opcode != SHADER_OPCODE_POW;
1424 fs_reg &op0 = is_int_div ? src1 : src0;
1425 fs_reg &op1 = is_int_div ? src0 : src1;
1426
1427 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1428 inst = emit(opcode, dst, op0, reg_null_f);
1429
1430 inst->base_mrf = base_mrf;
1431 inst->mlen = 2 * dispatch_width / 8;
1432 }
1433 return inst;
1434 }
1435
1436 void
1437 fs_visitor::assign_curb_setup()
1438 {
1439 if (dispatch_width == 8) {
1440 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1441 } else {
1442 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1443 }
1444
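/* Each push-constant register holds 8 dword components, so e.g. 20 uniform
 * params round up to ALIGN(20, 8) / 8 == 3 registers below.
 */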
1445 c->prog_data.curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1446
1447 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1448 foreach_list(node, &this->instructions) {
1449 fs_inst *inst = (fs_inst *)node;
1450
1451 for (unsigned int i = 0; i < 3; i++) {
1452 if (inst->src[i].file == UNIFORM) {
1453 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1454 int constant_nr;
1455 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1456 constant_nr = push_constant_loc[uniform_nr];
1457 } else {
1458 /* Section 5.11 of the OpenGL 4.1 spec says:
1459 * "Out-of-bounds reads return undefined values, which include
1460 * values from other variables of the active program or zero."
1461 * Just return the first push constant.
1462 */
1463 constant_nr = 0;
1464 }
1465
1466 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1467 constant_nr / 8,
1468 constant_nr % 8);
1469
1470 inst->src[i].file = HW_REG;
1471 inst->src[i].fixed_hw_reg = byte_offset(
1472 retype(brw_reg, inst->src[i].type),
1473 inst->src[i].subreg_offset);
1474 }
1475 }
1476 }
1477 }
1478
1479 void
1480 fs_visitor::calculate_urb_setup()
1481 {
1482 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1483 c->prog_data.urb_setup[i] = -1;
1484 }
1485
1486 int urb_next = 0;
1487 /* Figure out where each of the incoming setup attributes lands. */
1488 if (brw->gen >= 6) {
1489 if (_mesa_bitcount_64(fp->Base.InputsRead &
1490 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1491 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1492 * first 16 varying inputs, so we can put them wherever we want.
1493 * Just put them in order.
1494 *
1495 * This is useful because it means that (a) inputs not used by the
1496 * fragment shader won't take up valuable register space, and (b) we
1497 * won't have to recompile the fragment shader if it gets paired with
1498 * a different vertex (or geometry) shader.
1499 */
1500 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1501 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1502 BITFIELD64_BIT(i)) {
1503 c->prog_data.urb_setup[i] = urb_next++;
1504 }
1505 }
1506 } else {
1507 /* We have enough input varyings that the SF/SBE pipeline stage can't
1508 * arbitrarily rearrange them to suit our whim; we have to put them
1509 * in an order that matches the output of the previous pipeline stage
1510 * (geometry or vertex shader).
1511 */
1512 struct brw_vue_map prev_stage_vue_map;
1513 brw_compute_vue_map(brw, &prev_stage_vue_map,
1514 c->key.input_slots_valid);
1515 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1516 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1517 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1518 slot++) {
1519 int varying = prev_stage_vue_map.slot_to_varying[slot];
1520 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1521 * unused.
1522 */
1523 if (varying != BRW_VARYING_SLOT_COUNT &&
1524 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1525 BITFIELD64_BIT(varying))) {
1526 c->prog_data.urb_setup[varying] = slot - first_slot;
1527 }
1528 }
1529 urb_next = prev_stage_vue_map.num_slots - first_slot;
1530 }
1531 } else {
1532 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1533 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1534 /* Point size is packed into the header, not as a general attribute */
1535 if (i == VARYING_SLOT_PSIZ)
1536 continue;
1537
1538 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1539 /* The back color slot is skipped when the front color is
1540 * also written to. In addition, some slots can be
1541 * written in the vertex shader and not read in the
1542 * fragment shader. So the register number must always be
1543 * incremented, mapped or not.
1544 */
1545 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1546 c->prog_data.urb_setup[i] = urb_next;
1547 urb_next++;
1548 }
1549 }
1550
1551 /*
1552 * It's an FS-only attribute, and we did the interpolation for this
1553 * attribute in the SF thread. So count it here, too.
1554 *
1555 * See compile_sf_prog() for more info.
1556 */
1557 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1558 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1559 }
1560
1561 c->prog_data.num_varying_inputs = urb_next;
1562 }
1563
1564 void
1565 fs_visitor::assign_urb_setup()
1566 {
1567 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1568
1569 /* Offset all the urb_setup[] index by the actual position of the
1570 * setup regs, now that the location of the constants has been chosen.
1571 */
1572 foreach_list(node, &this->instructions) {
1573 fs_inst *inst = (fs_inst *)node;
1574
1575 if (inst->opcode == FS_OPCODE_LINTERP) {
1576 assert(inst->src[2].file == HW_REG);
1577 inst->src[2].fixed_hw_reg.nr += urb_start;
1578 }
1579
1580 if (inst->opcode == FS_OPCODE_CINTERP) {
1581 assert(inst->src[0].file == HW_REG);
1582 inst->src[0].fixed_hw_reg.nr += urb_start;
1583 }
1584 }
1585
1586 /* Each attribute is 4 setup channels, each of which is half a reg. */
1587 this->first_non_payload_grf =
1588 urb_start + c->prog_data.num_varying_inputs * 2;
1589 }
1590
1591 /**
1592 * Split large virtual GRFs into separate components if we can.
1593 *
1594 * This is mostly duplicated with what brw_fs_vector_splitting does,
1595 * but that's really conservative because it's afraid of doing
1596 * splitting that doesn't result in real progress after the rest of
1597 * the optimization phases, which would cause infinite looping in
1598 * optimization. We can do it once here, safely. This also has the
1599 * opportunity to split interpolated values, or maybe even uniforms,
1600 * which we don't have at the IR level.
1601 *
1602 * We want to split, because virtual GRFs are what we register
1603 * allocate and spill (due to contiguousness requirements for some
1604 * instructions), and they're what we naturally generate in the
1605 * codegen process, but most virtual GRFs don't actually need to be
1606 * contiguous sets of GRFs. If we split, we'll end up with reduced
1607 * live intervals and better dead code elimination and coalescing.
1608 */
1609 void
1610 fs_visitor::split_virtual_grfs()
1611 {
1612 int num_vars = this->virtual_grf_count;
1613 bool split_grf[num_vars];
1614 int new_virtual_grf[num_vars];
1615
1616 /* Try to split anything larger than one register. */
1617 for (int i = 0; i < num_vars; i++) {
1618 if (this->virtual_grf_sizes[i] != 1)
1619 split_grf[i] = true;
1620 else
1621 split_grf[i] = false;
1622 }
1623
1624 if (brw->has_pln &&
1625 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1626 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1627 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1628 * Gen6, that was the only supported interpolation mode, and since Gen6,
1629 * delta_x and delta_y are in fixed hardware registers.
1630 */
1631 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1632 false;
1633 }
1634
1635 foreach_list(node, &this->instructions) {
1636 fs_inst *inst = (fs_inst *)node;
1637
1638 /* If there's a SEND message that requires contiguous destination
1639 * registers, no splitting is allowed.
1640 */
1641 if (inst->regs_written > 1) {
1642 split_grf[inst->dst.reg] = false;
1643 }
1644
1645 /* If we're sending from a GRF, don't split it, on the assumption that
1646 * the send is reading the whole thing.
1647 */
1648 if (inst->is_send_from_grf()) {
1649 for (int i = 0; i < 3; i++) {
1650 if (inst->src[i].file == GRF) {
1651 split_grf[inst->src[i].reg] = false;
1652 }
1653 }
1654 }
1655 }
1656
1657 /* Allocate new space for split regs. Note that the virtual
1658 * numbers will be contiguous.
1659 */
1660 for (int i = 0; i < num_vars; i++) {
1661 if (split_grf[i]) {
1662 new_virtual_grf[i] = virtual_grf_alloc(1);
1663 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1664 int reg = virtual_grf_alloc(1);
1665 assert(reg == new_virtual_grf[i] + j - 1);
1666 (void) reg;
1667 }
1668 this->virtual_grf_sizes[i] = 1;
1669 }
1670 }
1671
1672 foreach_list(node, &this->instructions) {
1673 fs_inst *inst = (fs_inst *)node;
1674
1675 if (inst->dst.file == GRF &&
1676 split_grf[inst->dst.reg] &&
1677 inst->dst.reg_offset != 0) {
1678 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1679 inst->dst.reg_offset - 1);
1680 inst->dst.reg_offset = 0;
1681 }
1682 for (int i = 0; i < 3; i++) {
1683 if (inst->src[i].file == GRF &&
1684 split_grf[inst->src[i].reg] &&
1685 inst->src[i].reg_offset != 0) {
1686 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1687 inst->src[i].reg_offset - 1);
1688 inst->src[i].reg_offset = 0;
1689 }
1690 }
1691 }
1692 invalidate_live_intervals();
1693 }
1694
1695 /**
1696 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1697 *
1698 * During code generation, we create tons of temporary variables, many of
1699 * which get immediately killed and are never used again. Yet, in later
1700 * optimization and analysis passes, such as compute_live_intervals, we need
1701 * to loop over all the virtual GRFs. Compacting them can save a lot of
1702 * overhead.
1703 */
1704 void
1705 fs_visitor::compact_virtual_grfs()
1706 {
1707 /* Mark which virtual GRFs are used, and count how many. */
1708 int remap_table[this->virtual_grf_count];
1709 memset(remap_table, -1, sizeof(remap_table));
1710
1711 foreach_list(node, &this->instructions) {
1712 const fs_inst *inst = (const fs_inst *) node;
1713
1714 if (inst->dst.file == GRF)
1715 remap_table[inst->dst.reg] = 0;
1716
1717 for (int i = 0; i < 3; i++) {
1718 if (inst->src[i].file == GRF)
1719 remap_table[inst->src[i].reg] = 0;
1720 }
1721 }
1722
1723 /* In addition to registers used in instructions, fs_visitor keeps
1724 * direct references to certain special values which must be patched:
1725 */
1726 struct {
1727 fs_reg *reg;
1728 unsigned count;
1729 } special[] = {
1730 { &frag_depth, 1 },
1731 { &pixel_x, 1 },
1732 { &pixel_y, 1 },
1733 { &pixel_w, 1 },
1734 { &wpos_w, 1 },
1735 { &dual_src_output, 1 },
1736 { outputs, ARRAY_SIZE(outputs) },
1737 { delta_x, ARRAY_SIZE(delta_x) },
1738 { delta_y, ARRAY_SIZE(delta_y) },
1739 { &sample_mask, 1 },
1740 { &shader_start_time, 1 },
1741 };
1742
1743 /* Treat all special values as used, to be conservative */
1744 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1745 for (unsigned j = 0; j < special[i].count; j++) {
1746 if (special[i].reg[j].file == GRF)
1747 remap_table[special[i].reg[j].reg] = 0;
1748 }
1749 }
1750
1751 /* Compact the GRF arrays. */
1752 int new_index = 0;
1753 for (int i = 0; i < this->virtual_grf_count; i++) {
1754 if (remap_table[i] != -1) {
1755 remap_table[i] = new_index;
1756 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1757 invalidate_live_intervals();
1758 ++new_index;
1759 }
1760 }
1761
1762 this->virtual_grf_count = new_index;
1763
1764 /* Patch all the instructions to use the newly renumbered registers */
1765 foreach_list(node, &this->instructions) {
1766 fs_inst *inst = (fs_inst *) node;
1767
1768 if (inst->dst.file == GRF)
1769 inst->dst.reg = remap_table[inst->dst.reg];
1770
1771 for (int i = 0; i < 3; i++) {
1772 if (inst->src[i].file == GRF)
1773 inst->src[i].reg = remap_table[inst->src[i].reg];
1774 }
1775 }
1776
1777 /* Patch all the references to special values */
1778 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1779 for (unsigned j = 0; j < special[i].count; j++) {
1780 fs_reg *reg = &special[i].reg[j];
1781 if (reg->file == GRF && remap_table[reg->reg] != -1)
1782 reg->reg = remap_table[reg->reg];
1783 }
1784 }
1785 }
1786
1787 /*
1788 * Implements array access of uniforms by inserting a
1789 * PULL_CONSTANT_LOAD instruction.
1790 *
1791 * Unlike temporary GRF array access (where we don't support it due to
1792 * the difficulty of doing relative addressing on instruction
1793 * destinations), we could potentially do array access of uniforms
1794 * that were loaded in GRF space as push constants. In real-world
1795 * usage we've seen, though, the arrays being used are always larger
1796 * than we could load as push constants, so just always move all
1797 * uniform array access out to a pull constant buffer.
1798 */
1799 void
1800 fs_visitor::move_uniform_array_access_to_pull_constants()
1801 {
1802 if (dispatch_width != 8)
1803 return;
1804
1805 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1806
1807 for (unsigned int i = 0; i < uniforms; i++) {
1808 pull_constant_loc[i] = -1;
1809 }
1810
1811 /* Walk through and find array access of uniforms. Put a copy of that
1812 * uniform in the pull constant buffer.
1813 *
1814 * Note that we don't move constant-indexed accesses to arrays. No
1815 * testing has been done of the performance impact of this choice.
1816 */
1817 foreach_list_safe(node, &this->instructions) {
1818 fs_inst *inst = (fs_inst *)node;
1819
1820 for (int i = 0 ; i < 3; i++) {
1821 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1822 continue;
1823
1824 int uniform = inst->src[i].reg;
1825
1826 /* If this array isn't already present in the pull constant buffer,
1827 * add it.
1828 */
1829 if (pull_constant_loc[uniform] == -1) {
1830 const float **values = &stage_prog_data->param[uniform];
1831
1832 assert(param_size[uniform]);
1833
1834 for (int j = 0; j < param_size[uniform]; j++) {
1835 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1836
1837 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1838 values[j];
1839 }
1840 }
1841 }
1842 }
1843 }
1844
1845 /**
1846 * Assign UNIFORM file registers to either push constants or pull constants.
1847 *
1848 * We allow a fragment shader to have more than the GL-specified minimum
1849 * maximum number of fragment shader uniform components (64). If there
1850 * are too many of these, they'd fill up all of the register space, so
1851 * this pass pushes some of them out to the pull constant buffer and
1852 * updates the program to load them from there.
1853 */
1854 void
1855 fs_visitor::assign_constant_locations()
1856 {
1857 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1858 if (dispatch_width != 8)
1859 return;
1860
1861 /* Find which UNIFORM registers are still in use. */
1862 bool is_live[uniforms];
1863 for (unsigned int i = 0; i < uniforms; i++) {
1864 is_live[i] = false;
1865 }
1866
1867 foreach_list(node, &this->instructions) {
1868 fs_inst *inst = (fs_inst *) node;
1869
1870 for (int i = 0; i < 3; i++) {
1871 if (inst->src[i].file != UNIFORM)
1872 continue;
1873
1874 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1875 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1876 is_live[constant_nr] = true;
1877 }
1878 }
1879
1880 /* Only allow 16 registers (128 uniform components) as push constants.
1881 *
1882 * Just demote the end of the list. We could probably do better
1883 * here, demoting things that are rarely used in the program first.
1884 */
1885 unsigned int max_push_components = 16 * 8;
1886 unsigned int num_push_constants = 0;
1887
1888 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1889
1890 for (unsigned int i = 0; i < uniforms; i++) {
1891 if (!is_live[i] || pull_constant_loc[i] != -1) {
1892 /* This UNIFORM register is either dead, or has already been demoted
1893 * to a pull const. Mark it as no longer living in the param[] array.
1894 */
1895 push_constant_loc[i] = -1;
1896 continue;
1897 }
1898
1899 if (num_push_constants < max_push_components) {
1900 /* Retain as a push constant. Record the location in the params[]
1901 * array.
1902 */
1903 push_constant_loc[i] = num_push_constants++;
1904 } else {
1905 /* Demote to a pull constant. */
1906 push_constant_loc[i] = -1;
1907
1908 int pull_index = stage_prog_data->nr_pull_params++;
1909 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1910 pull_constant_loc[i] = pull_index;
1911 }
1912 }
1913
1914 stage_prog_data->nr_params = num_push_constants;
1915
1916 /* Up until now, the param[] array has been indexed by reg + reg_offset
1917 * of UNIFORM registers. Condense it to only contain the uniforms we
1918 * chose to upload as push constants.
1919 */
1920 for (unsigned int i = 0; i < uniforms; i++) {
1921 int remapped = push_constant_loc[i];
1922
1923 if (remapped == -1)
1924 continue;
1925
1926 assert(remapped <= (int)i);
1927 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1928 }
1929 }
1930
1931 /**
1932 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1933 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1934 */
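/*
 * As a rough sketch (register numbers and indices are illustrative), a
 * constant-offset access such as
 *
 *    add vgrf6, vgrf2, u7          (u7 demoted, pull_constant_loc[7] == 7)
 *
 * becomes
 *
 *    uniform_pull_constant_load vgrf9, surf_index, 16
 *    add vgrf6, vgrf2, vgrf9.<smear 3>
 *
 * while a reladdr access uses VARYING_PULL_CONSTANT_LOAD instead.
 */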
1935 void
1936 fs_visitor::demote_pull_constants()
1937 {
1938 foreach_list(node, &this->instructions) {
1939 fs_inst *inst = (fs_inst *)node;
1940
1941 for (int i = 0; i < 3; i++) {
1942 if (inst->src[i].file != UNIFORM)
1943 continue;
1944
1945 int pull_index = pull_constant_loc[inst->src[i].reg +
1946 inst->src[i].reg_offset];
1947 if (pull_index == -1)
1948 continue;
1949
1950          /* Set up the annotation tracking for newly generated instructions. */
1951 base_ir = inst->ir;
1952 current_annotation = inst->annotation;
1953
1954 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1955 fs_reg dst = fs_reg(this, glsl_type::float_type);
1956
1957 /* Generate a pull load into dst. */
1958 if (inst->src[i].reladdr) {
1959 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
1960 surf_index,
1961 *inst->src[i].reladdr,
1962 pull_index);
1963 inst->insert_before(&list);
1964 inst->src[i].reladdr = NULL;
1965 } else {
1966 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1967 fs_inst *pull =
1968 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1969 dst, surf_index, offset);
1970 inst->insert_before(pull);
1971 inst->src[i].set_smear(pull_index & 3);
1972 }
1973
1974 /* Rewrite the instruction to use the temporary VGRF. */
1975 inst->src[i].file = GRF;
1976 inst->src[i].reg = dst.reg;
1977 inst->src[i].reg_offset = 0;
1978 }
1979 }
1980 invalidate_live_intervals();
1981 }
1982
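/**
 * Performs simple algebraic simplifications on individual instructions, for
 * example (illustrative):
 *
 *    mul dst, a, 1.0f   -> mov dst, a
 *    mul dst, a, 0.0f   -> mov dst, 0.0f
 *    add dst, a, 0.0f   -> mov dst, a
 *    or  dst, a, a      -> mov dst, a
 *    lrp dst, a, y, y   -> mov dst, y
 */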
1983 bool
1984 fs_visitor::opt_algebraic()
1985 {
1986 bool progress = false;
1987
1988 foreach_list(node, &this->instructions) {
1989 fs_inst *inst = (fs_inst *)node;
1990
1991 switch (inst->opcode) {
1992 case BRW_OPCODE_MUL:
1993 if (inst->src[1].file != IMM)
1994 continue;
1995
1996 /* a * 1.0 = a */
1997 if (inst->src[1].is_one()) {
1998 inst->opcode = BRW_OPCODE_MOV;
1999 inst->src[1] = reg_undef;
2000 progress = true;
2001 break;
2002 }
2003
2004 /* a * 0.0 = 0.0 */
2005 if (inst->src[1].is_zero()) {
2006 inst->opcode = BRW_OPCODE_MOV;
2007 inst->src[0] = inst->src[1];
2008 inst->src[1] = reg_undef;
2009 progress = true;
2010 break;
2011 }
2012
2013 break;
2014 case BRW_OPCODE_ADD:
2015 if (inst->src[1].file != IMM)
2016 continue;
2017
2018 /* a + 0.0 = a */
2019 if (inst->src[1].is_zero()) {
2020 inst->opcode = BRW_OPCODE_MOV;
2021 inst->src[1] = reg_undef;
2022 progress = true;
2023 break;
2024 }
2025 break;
2026 case BRW_OPCODE_OR:
2027 if (inst->src[0].equals(inst->src[1])) {
2028 inst->opcode = BRW_OPCODE_MOV;
2029 inst->src[1] = reg_undef;
2030 progress = true;
2031 break;
2032 }
2033 break;
2034 case BRW_OPCODE_LRP:
2035 if (inst->src[1].equals(inst->src[2])) {
2036 inst->opcode = BRW_OPCODE_MOV;
2037 inst->src[0] = inst->src[1];
2038 inst->src[1] = reg_undef;
2039 inst->src[2] = reg_undef;
2040 progress = true;
2041 break;
2042 }
2043 break;
2044 case BRW_OPCODE_SEL:
2045 if (inst->saturate && inst->src[1].file == IMM) {
2046 switch (inst->conditional_mod) {
2047 case BRW_CONDITIONAL_LE:
2048 case BRW_CONDITIONAL_L:
2049 switch (inst->src[1].type) {
2050 case BRW_REGISTER_TYPE_F:
2051 if (inst->src[1].imm.f >= 1.0f) {
2052 inst->opcode = BRW_OPCODE_MOV;
2053 inst->src[1] = reg_undef;
2054 progress = true;
2055 }
2056 break;
2057 default:
2058 break;
2059 }
2060 break;
2061 case BRW_CONDITIONAL_GE:
2062 case BRW_CONDITIONAL_G:
2063 switch (inst->src[1].type) {
2064 case BRW_REGISTER_TYPE_F:
2065 if (inst->src[1].imm.f <= 0.0f) {
2066 inst->opcode = BRW_OPCODE_MOV;
2067 inst->src[1] = reg_undef;
2068 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2069 progress = true;
2070 }
2071 break;
2072 default:
2073 break;
2074 }
2075 default:
2076 break;
2077 }
2078 }
2079 break;
2080 default:
2081 break;
2082 }
2083 }
2084
2085 return progress;
2086 }
2087
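/**
 * Attempts to rewrite the computation of a GRF value so that it writes
 * directly into the MRF that a later MOV would have copied it to, e.g.
 * (illustrative):
 *
 *    add vgrf8, vgrf1, vgrf2
 *    mov m4, vgrf8
 *
 * becomes
 *
 *    add m4, vgrf1, vgrf2
 */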
2088 bool
2089 fs_visitor::compute_to_mrf()
2090 {
2091 bool progress = false;
2092 int next_ip = 0;
2093
2094 calculate_live_intervals();
2095
2096 foreach_list_safe(node, &this->instructions) {
2097 fs_inst *inst = (fs_inst *)node;
2098
2099 int ip = next_ip;
2100 next_ip++;
2101
2102 if (inst->opcode != BRW_OPCODE_MOV ||
2103 inst->is_partial_write() ||
2104 inst->dst.file != MRF || inst->src[0].file != GRF ||
2105 inst->dst.type != inst->src[0].type ||
2106 inst->src[0].abs || inst->src[0].negate ||
2107 !inst->src[0].is_contiguous() ||
2108 inst->src[0].subreg_offset)
2109 continue;
2110
2111 /* Work out which hardware MRF registers are written by this
2112 * instruction.
2113 */
2114 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2115 int mrf_high;
2116 if (inst->dst.reg & BRW_MRF_COMPR4) {
2117 mrf_high = mrf_low + 4;
2118 } else if (dispatch_width == 16 &&
2119 (!inst->force_uncompressed && !inst->force_sechalf)) {
2120 mrf_high = mrf_low + 1;
2121 } else {
2122 mrf_high = mrf_low;
2123 }
2124
2125 /* Can't compute-to-MRF this GRF if someone else was going to
2126 * read it later.
2127 */
2128 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2129 continue;
2130
2131       /* Found a move of a GRF to an MRF.  Let's see if we can rewrite
2132        * the instruction that produced this GRF to write into the MRF directly.
2133 */
2134 fs_inst *scan_inst;
2135 for (scan_inst = (fs_inst *)inst->prev;
2136 scan_inst->prev != NULL;
2137 scan_inst = (fs_inst *)scan_inst->prev) {
2138 if (scan_inst->dst.file == GRF &&
2139 scan_inst->dst.reg == inst->src[0].reg) {
2140             /* Found the last instruction to write the GRF we want to turn
2141              * into a compute-to-MRF.
2142 */
2143
2144 /* If this one instruction didn't populate all the
2145 * channels, bail. We might be able to rewrite everything
2146 * that writes that reg, but it would require smarter
2147 * tracking to delay the rewriting until complete success.
2148 */
2149 if (scan_inst->is_partial_write())
2150 break;
2151
2152             /* Instructions writing more than one register would require us to
2153              * understand coalescing out more than one MOV at a time.
2154 */
2155 if (scan_inst->regs_written > 1)
2156 break;
2157
2158 /* SEND instructions can't have MRF as a destination. */
2159 if (scan_inst->mlen)
2160 break;
2161
2162 if (brw->gen == 6) {
2163 /* gen6 math instructions must have the destination be
2164 * GRF, so no compute-to-MRF for them.
2165 */
2166 if (scan_inst->is_math()) {
2167 break;
2168 }
2169 }
2170
2171 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2172 /* Found the creator of our MRF's source value. */
2173 scan_inst->dst.file = MRF;
2174 scan_inst->dst.reg = inst->dst.reg;
2175 scan_inst->saturate |= inst->saturate;
2176 inst->remove();
2177 progress = true;
2178 }
2179 break;
2180 }
2181
2182          /* We don't handle control flow here.  Most computation of
2183           * values that end up in MRFs happens shortly before the MRF
2184           * write anyway.
2185 */
2186 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2187 break;
2188
2189          /* You can't read from an MRF, so if something else reads the source
2190           * GRF that we wanted to rewrite into the MRF, that stops us.
2191 */
2192 bool interfered = false;
2193 for (int i = 0; i < 3; i++) {
2194 if (scan_inst->src[i].file == GRF &&
2195 scan_inst->src[i].reg == inst->src[0].reg &&
2196 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2197 interfered = true;
2198 }
2199 }
2200 if (interfered)
2201 break;
2202
2203 if (scan_inst->dst.file == MRF) {
2204 /* If somebody else writes our MRF here, we can't
2205 * compute-to-MRF before that.
2206 */
2207 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2208 int scan_mrf_high;
2209
2210 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2211 scan_mrf_high = scan_mrf_low + 4;
2212 } else if (dispatch_width == 16 &&
2213 (!scan_inst->force_uncompressed &&
2214 !scan_inst->force_sechalf)) {
2215 scan_mrf_high = scan_mrf_low + 1;
2216 } else {
2217 scan_mrf_high = scan_mrf_low;
2218 }
2219
2220 if (mrf_low == scan_mrf_low ||
2221 mrf_low == scan_mrf_high ||
2222 mrf_high == scan_mrf_low ||
2223 mrf_high == scan_mrf_high) {
2224 break;
2225 }
2226 }
2227
2228 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2229 /* Found a SEND instruction, which means that there are
2230 * live values in MRFs from base_mrf to base_mrf +
2231 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2232 * above it.
2233 */
2234 if (mrf_low >= scan_inst->base_mrf &&
2235 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2236 break;
2237 }
2238 if (mrf_high >= scan_inst->base_mrf &&
2239 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2240 break;
2241 }
2242 }
2243 }
2244 }
2245
2246 if (progress)
2247 invalidate_live_intervals();
2248
2249 return progress;
2250 }
2251
2252 /**
2253 * Walks through basic blocks, looking for repeated MRF writes and
2254 * removing the later ones.
2255 */
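/* For example (illustrative): two identical "mov m2, vgrf5" instructions in
 * the same basic block, with no intervening write to m2 or vgrf5, cause the
 * second MOV to be removed.
 */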
2256 bool
2257 fs_visitor::remove_duplicate_mrf_writes()
2258 {
2259 fs_inst *last_mrf_move[16];
2260 bool progress = false;
2261
2262    /* We would need to update the MRF tracking for compressed (SIMD16) instructions; bail for now. */
2263 if (dispatch_width == 16)
2264 return false;
2265
2266 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2267
2268 foreach_list_safe(node, &this->instructions) {
2269 fs_inst *inst = (fs_inst *)node;
2270
2271 if (inst->is_control_flow()) {
2272 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2273 }
2274
2275 if (inst->opcode == BRW_OPCODE_MOV &&
2276 inst->dst.file == MRF) {
2277 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2278 if (prev_inst && inst->equals(prev_inst)) {
2279 inst->remove();
2280 progress = true;
2281 continue;
2282 }
2283 }
2284
2285 /* Clear out the last-write records for MRFs that were overwritten. */
2286 if (inst->dst.file == MRF) {
2287 last_mrf_move[inst->dst.reg] = NULL;
2288 }
2289
2290 if (inst->mlen > 0 && inst->base_mrf != -1) {
2291 /* Found a SEND instruction, which will include two or fewer
2292 * implied MRF writes. We could do better here.
2293 */
2294 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2295 last_mrf_move[inst->base_mrf + i] = NULL;
2296 }
2297 }
2298
2299 /* Clear out any MRF move records whose sources got overwritten. */
2300 if (inst->dst.file == GRF) {
2301 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2302 if (last_mrf_move[i] &&
2303 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2304 last_mrf_move[i] = NULL;
2305 }
2306 }
2307 }
2308
2309 if (inst->opcode == BRW_OPCODE_MOV &&
2310 inst->dst.file == MRF &&
2311 inst->src[0].file == GRF &&
2312 !inst->is_partial_write()) {
2313 last_mrf_move[inst->dst.reg] = inst;
2314 }
2315 }
2316
2317 if (progress)
2318 invalidate_live_intervals();
2319
2320 return progress;
2321 }
2322
2323 static void
2324 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2325 int first_grf, int grf_len)
2326 {
2327 bool inst_simd16 = (dispatch_width > 8 &&
2328 !inst->force_uncompressed &&
2329 !inst->force_sechalf);
2330
2331 /* Clear the flag for registers that actually got read (as expected). */
2332 for (int i = 0; i < 3; i++) {
2333 int grf;
2334 if (inst->src[i].file == GRF) {
2335 grf = inst->src[i].reg;
2336 } else if (inst->src[i].file == HW_REG &&
2337 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2338 grf = inst->src[i].fixed_hw_reg.nr;
2339 } else {
2340 continue;
2341 }
2342
2343 if (grf >= first_grf &&
2344 grf < first_grf + grf_len) {
2345 deps[grf - first_grf] = false;
2346 if (inst_simd16)
2347 deps[grf - first_grf + 1] = false;
2348 }
2349 }
2350 }
2351
2352 /**
2353 * Implements this workaround for the original 965:
2354 *
2355 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2356 * check for post destination dependencies on this instruction, software
2357 * must ensure that there is no destination hazard for the case of ‘write
2358 * followed by a posted write’ shown in the following example.
2359 *
2360 * 1. mov r3 0
2361 * 2. send r3.xy <rest of send instruction>
2362 * 3. mov r2 r3
2363 *
2364 * Due to no post-destination dependency check on the ‘send’, the above
2365 * code sequence could have two instructions (1 and 2) in flight at the
2366 * same time that both consider ‘r3’ as the target of their final writes.
2367 */
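/* The workaround implemented below, roughly: before such an instruction,
 * walk backwards over earlier writes to its destination registers and insert
 * DEP_RESOLVE_MOVs so the outstanding dependencies are resolved before this
 * write executes (a conservative sketch; see the loop below for the details).
 */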
2368 void
2369 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2370 {
2371 int reg_size = dispatch_width / 8;
2372 int write_len = inst->regs_written * reg_size;
2373 int first_write_grf = inst->dst.reg;
2374 bool needs_dep[BRW_MAX_MRF];
2375 assert(write_len < (int)sizeof(needs_dep) - 1);
2376
2377 memset(needs_dep, false, sizeof(needs_dep));
2378 memset(needs_dep, true, write_len);
2379
2380 clear_deps_for_inst_src(inst, dispatch_width,
2381 needs_dep, first_write_grf, write_len);
2382
2383 /* Walk backwards looking for writes to registers we're writing which
2384 * aren't read since being written. If we hit the start of the program,
2385 * we assume that there are no outstanding dependencies on entry to the
2386 * program.
2387 */
2388 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2389 scan_inst != NULL;
2390 scan_inst = (fs_inst *)scan_inst->prev) {
2391
2392 /* If we hit control flow, assume that there *are* outstanding
2393 * dependencies, and force their cleanup before our instruction.
2394 */
2395 if (scan_inst->is_control_flow()) {
2396 for (int i = 0; i < write_len; i++) {
2397 if (needs_dep[i]) {
2398 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2399 }
2400 }
2401 return;
2402 }
2403
2404 bool scan_inst_simd16 = (dispatch_width > 8 &&
2405 !scan_inst->force_uncompressed &&
2406 !scan_inst->force_sechalf);
2407
2408 /* We insert our reads as late as possible on the assumption that any
2409 * instruction but a MOV that might have left us an outstanding
2410 * dependency has more latency than a MOV.
2411 */
2412 if (scan_inst->dst.file == GRF) {
2413 for (int i = 0; i < scan_inst->regs_written; i++) {
2414 int reg = scan_inst->dst.reg + i * reg_size;
2415
2416 if (reg >= first_write_grf &&
2417 reg < first_write_grf + write_len &&
2418 needs_dep[reg - first_write_grf]) {
2419 inst->insert_before(DEP_RESOLVE_MOV(reg));
2420 needs_dep[reg - first_write_grf] = false;
2421 if (scan_inst_simd16)
2422 needs_dep[reg - first_write_grf + 1] = false;
2423 }
2424 }
2425 }
2426
2427 /* Clear the flag for registers that actually got read (as expected). */
2428 clear_deps_for_inst_src(scan_inst, dispatch_width,
2429 needs_dep, first_write_grf, write_len);
2430
2431 /* Continue the loop only if we haven't resolved all the dependencies */
2432 int i;
2433 for (i = 0; i < write_len; i++) {
2434 if (needs_dep[i])
2435 break;
2436 }
2437 if (i == write_len)
2438 return;
2439 }
2440 }
2441
2442 /**
2443 * Implements this workaround for the original 965:
2444 *
2445 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2446 * used as a destination register until after it has been sourced by an
2447 * instruction with a different destination register.
2448 */
2449 void
2450 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2451 {
2452 int write_len = inst->regs_written * dispatch_width / 8;
2453 int first_write_grf = inst->dst.reg;
2454 bool needs_dep[BRW_MAX_MRF];
2455 assert(write_len < (int)sizeof(needs_dep) - 1);
2456
2457 memset(needs_dep, false, sizeof(needs_dep));
2458 memset(needs_dep, true, write_len);
2459 /* Walk forwards looking for writes to registers we're writing which aren't
2460 * read before being written.
2461 */
2462 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2463 !scan_inst->is_tail_sentinel();
2464 scan_inst = (fs_inst *)scan_inst->next) {
2465 /* If we hit control flow, force resolve all remaining dependencies. */
2466 if (scan_inst->is_control_flow()) {
2467 for (int i = 0; i < write_len; i++) {
2468 if (needs_dep[i])
2469 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2470 }
2471 return;
2472 }
2473
2474 /* Clear the flag for registers that actually got read (as expected). */
2475 clear_deps_for_inst_src(scan_inst, dispatch_width,
2476 needs_dep, first_write_grf, write_len);
2477
2478 /* We insert our reads as late as possible since they're reading the
2479 * result of a SEND, which has massive latency.
2480 */
2481 if (scan_inst->dst.file == GRF &&
2482 scan_inst->dst.reg >= first_write_grf &&
2483 scan_inst->dst.reg < first_write_grf + write_len &&
2484 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2485 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2486 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2487 }
2488
2489 /* Continue the loop only if we haven't resolved all the dependencies */
2490 int i;
2491 for (i = 0; i < write_len; i++) {
2492 if (needs_dep[i])
2493 break;
2494 }
2495 if (i == write_len)
2496 return;
2497 }
2498
2499 /* If we hit the end of the program, resolve all remaining dependencies out
2500 * of paranoia.
2501 */
2502 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2503 assert(last_inst->eot);
2504 for (int i = 0; i < write_len; i++) {
2505 if (needs_dep[i])
2506 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2507 }
2508 }
2509
2510 void
2511 fs_visitor::insert_gen4_send_dependency_workarounds()
2512 {
2513 if (brw->gen != 4 || brw->is_g4x)
2514 return;
2515
2516 /* Note that we're done with register allocation, so GRF fs_regs always
2517 * have a .reg_offset of 0.
2518 */
2519
2520 foreach_list_safe(node, &this->instructions) {
2521 fs_inst *inst = (fs_inst *)node;
2522
2523 if (inst->mlen != 0 && inst->dst.file == GRF) {
2524 insert_gen4_pre_send_dependency_workarounds(inst);
2525 insert_gen4_post_send_dependency_workarounds(inst);
2526 }
2527 }
2528 }
2529
2530 /**
2531 * Turns the generic expression-style uniform pull constant load instruction
2532 * into a hardware-specific series of instructions for loading a pull
2533 * constant.
2534 *
2535 * The expression style allows the CSE pass before this to optimize out
2536 * repeated loads from the same offset, and gives the pre-register-allocation
2537 * scheduling full flexibility, while the conversion to native instructions
2538 * allows the post-register-allocation scheduler the best information
2539 * possible.
2540 *
2541 * Note that execution masking for setting up pull constant loads is special:
2542 * the channels that need to be written are unrelated to the current execution
2543 * mask, since a later instruction will use one of the result channels as a
2544 * source operand for all 8 or 16 of its channels.
2545 */
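/* On Gen7 the lowering is roughly (illustrative registers and offsets):
 *
 *    uniform_pull_constant_load vgrf9, surf_index, 16
 *
 * becomes
 *
 *    set_simd4x2_offset vgrf10, 4
 *    uniform_pull_constant_load_gen7 vgrf9, surf_index, vgrf10
 *
 * On earlier generations the instruction is instead given an MRF to use as
 * its message payload.
 */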
2546 void
2547 fs_visitor::lower_uniform_pull_constant_loads()
2548 {
2549 foreach_list(node, &this->instructions) {
2550 fs_inst *inst = (fs_inst *)node;
2551
2552 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2553 continue;
2554
2555 if (brw->gen >= 7) {
2556 /* The offset arg before was a vec4-aligned byte offset. We need to
2557 * turn it into a dword offset.
2558 */
2559 fs_reg const_offset_reg = inst->src[1];
2560 assert(const_offset_reg.file == IMM &&
2561 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2562 const_offset_reg.imm.u /= 4;
2563 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2564
2565          /* This is actually going to be a MOV, but since only the first dword
2566           * is accessed, we have a special opcode to do just that one dword.  Note
2567 * that this needs to be an operation that will be considered a def
2568 * by live variable analysis, or register allocation will explode.
2569 */
2570 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2571 payload, const_offset_reg);
2572 setup->force_writemask_all = true;
2573
2574 setup->ir = inst->ir;
2575 setup->annotation = inst->annotation;
2576 inst->insert_before(setup);
2577
2578 /* Similarly, this will only populate the first 4 channels of the
2579 * result register (since we only use smear values from 0-3), but we
2580 * don't tell the optimizer.
2581 */
2582 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2583 inst->src[1] = payload;
2584
2585 invalidate_live_intervals();
2586 } else {
2587 /* Before register allocation, we didn't tell the scheduler about the
2588 * MRF we use. We know it's safe to use this MRF because nothing
2589 * else does except for register spill/unspill, which generates and
2590 * uses its MRF within a single IR instruction.
2591 */
2592 inst->base_mrf = 14;
2593 inst->mlen = 1;
2594 }
2595 }
2596 }
2597
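/**
 * Dumps the instruction stream along with the register pressure at each IP;
 * a line looks roughly like (illustrative):
 *
 *    { 14}   23: add vgrf7:F, vgrf3:F, vgrf5:F
 */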
2598 void
2599 fs_visitor::dump_instructions()
2600 {
2601 calculate_register_pressure();
2602
2603 int ip = 0, max_pressure = 0;
2604 foreach_list(node, &this->instructions) {
2605 backend_instruction *inst = (backend_instruction *)node;
2606 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2607 fprintf(stderr, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2608 dump_instruction(inst);
2609 ++ip;
2610 }
2611 fprintf(stderr, "Maximum %3d registers live at once.\n", max_pressure);
2612 }
2613
2614 void
2615 fs_visitor::dump_instruction(backend_instruction *be_inst)
2616 {
2617 fs_inst *inst = (fs_inst *)be_inst;
2618
2619 if (inst->predicate) {
2620 fprintf(stderr, "(%cf0.%d) ",
2621 inst->predicate_inverse ? '-' : '+',
2622 inst->flag_subreg);
2623 }
2624
2625 fprintf(stderr, "%s", brw_instruction_name(inst->opcode));
2626 if (inst->saturate)
2627 fprintf(stderr, ".sat");
2628 if (inst->conditional_mod) {
2629 fprintf(stderr, "%s", conditional_modifier[inst->conditional_mod]);
2630 if (!inst->predicate &&
2631 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2632 inst->opcode != BRW_OPCODE_IF &&
2633 inst->opcode != BRW_OPCODE_WHILE))) {
2634 fprintf(stderr, ".f0.%d", inst->flag_subreg);
2635 }
2636 }
2637 fprintf(stderr, " ");
2638
2639
2640 switch (inst->dst.file) {
2641 case GRF:
2642 fprintf(stderr, "vgrf%d", inst->dst.reg);
2643 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
2644 inst->dst.subreg_offset)
2645 fprintf(stderr, "+%d.%d",
2646 inst->dst.reg_offset, inst->dst.subreg_offset);
2647 break;
2648 case MRF:
2649 fprintf(stderr, "m%d", inst->dst.reg);
2650 break;
2651 case BAD_FILE:
2652 fprintf(stderr, "(null)");
2653 break;
2654 case UNIFORM:
2655 fprintf(stderr, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
2656 break;
2657 case HW_REG:
2658 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2659 switch (inst->dst.fixed_hw_reg.nr) {
2660 case BRW_ARF_NULL:
2661 fprintf(stderr, "null");
2662 break;
2663 case BRW_ARF_ADDRESS:
2664 fprintf(stderr, "a0.%d", inst->dst.fixed_hw_reg.subnr);
2665 break;
2666 case BRW_ARF_ACCUMULATOR:
2667 fprintf(stderr, "acc%d", inst->dst.fixed_hw_reg.subnr);
2668 break;
2669 case BRW_ARF_FLAG:
2670 fprintf(stderr, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2671 inst->dst.fixed_hw_reg.subnr);
2672 break;
2673 default:
2674 fprintf(stderr, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2675 inst->dst.fixed_hw_reg.subnr);
2676 break;
2677 }
2678 } else {
2679 fprintf(stderr, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
2680 }
2681 if (inst->dst.fixed_hw_reg.subnr)
2682 fprintf(stderr, "+%d", inst->dst.fixed_hw_reg.subnr);
2683 break;
2684 default:
2685 fprintf(stderr, "???");
2686 break;
2687 }
2688 fprintf(stderr, ":%s, ", brw_reg_type_letters(inst->dst.type));
2689
2690 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
2691 if (inst->src[i].negate)
2692 fprintf(stderr, "-");
2693 if (inst->src[i].abs)
2694 fprintf(stderr, "|");
2695 switch (inst->src[i].file) {
2696 case GRF:
2697 fprintf(stderr, "vgrf%d", inst->src[i].reg);
2698 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2699 inst->src[i].subreg_offset)
2700 fprintf(stderr, "+%d.%d", inst->src[i].reg_offset,
2701 inst->src[i].subreg_offset);
2702 break;
2703 case MRF:
2704 fprintf(stderr, "***m%d***", inst->src[i].reg);
2705 break;
2706 case UNIFORM:
2707 fprintf(stderr, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
2708 if (inst->src[i].reladdr) {
2709 fprintf(stderr, "+reladdr");
2710 } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2711 inst->src[i].subreg_offset) {
2712 fprintf(stderr, "+%d.%d", inst->src[i].reg_offset,
2713 inst->src[i].subreg_offset);
2714 }
2715 break;
2716 case BAD_FILE:
2717 fprintf(stderr, "(null)");
2718 break;
2719 case IMM:
2720 switch (inst->src[i].type) {
2721 case BRW_REGISTER_TYPE_F:
2722 fprintf(stderr, "%ff", inst->src[i].imm.f);
2723 break;
2724 case BRW_REGISTER_TYPE_D:
2725 fprintf(stderr, "%dd", inst->src[i].imm.i);
2726 break;
2727 case BRW_REGISTER_TYPE_UD:
2728 fprintf(stderr, "%uu", inst->src[i].imm.u);
2729 break;
2730 default:
2731 fprintf(stderr, "???");
2732 break;
2733 }
2734 break;
2735 case HW_REG:
2736 if (inst->src[i].fixed_hw_reg.negate)
2737 fprintf(stderr, "-");
2738 if (inst->src[i].fixed_hw_reg.abs)
2739 fprintf(stderr, "|");
2740 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2741 switch (inst->src[i].fixed_hw_reg.nr) {
2742 case BRW_ARF_NULL:
2743 fprintf(stderr, "null");
2744 break;
2745 case BRW_ARF_ADDRESS:
2746 fprintf(stderr, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
2747 break;
2748 case BRW_ARF_ACCUMULATOR:
2749 fprintf(stderr, "acc%d", inst->src[i].fixed_hw_reg.subnr);
2750 break;
2751 case BRW_ARF_FLAG:
2752 fprintf(stderr, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2753 inst->src[i].fixed_hw_reg.subnr);
2754 break;
2755 default:
2756 fprintf(stderr, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2757 inst->src[i].fixed_hw_reg.subnr);
2758 break;
2759 }
2760 } else {
2761 fprintf(stderr, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2762 }
2763 if (inst->src[i].fixed_hw_reg.subnr)
2764 fprintf(stderr, "+%d", inst->src[i].fixed_hw_reg.subnr);
2765 if (inst->src[i].fixed_hw_reg.abs)
2766 fprintf(stderr, "|");
2767 break;
2768 default:
2769 fprintf(stderr, "???");
2770 break;
2771 }
2772 if (inst->src[i].abs)
2773 fprintf(stderr, "|");
2774
2775 if (inst->src[i].file != IMM) {
2776 fprintf(stderr, ":%s", brw_reg_type_letters(inst->src[i].type));
2777 }
2778
2779 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
2780 fprintf(stderr, ", ");
2781 }
2782
2783 fprintf(stderr, " ");
2784
2785 if (inst->force_uncompressed)
2786 fprintf(stderr, "1sthalf ");
2787
2788 if (inst->force_sechalf)
2789 fprintf(stderr, "2ndhalf ");
2790
2791 fprintf(stderr, "\n");
2792 }
2793
2794 /**
2795 * Possibly returns an instruction that set up @param reg.
2796 *
2797 * Sometimes we want to take the result of some expression/variable
2798 * dereference tree and rewrite the instruction generating the result
2799 * of the tree. When processing the tree, we know that the
2800 * instructions generated are all writing temporaries that are dead
2801 * outside of this tree. So, if we have some instructions that write
2802 * a temporary, we're free to point that temp write somewhere else.
2803 *
2804  * Note that this doesn't guarantee that the returned instruction generated
2805  * only @param reg -- it might be the size=4 destination of a texture instruction.
2806 */
2807 fs_inst *
2808 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2809 fs_inst *end,
2810 const fs_reg &reg)
2811 {
2812 if (end == start ||
2813 end->is_partial_write() ||
2814 reg.reladdr ||
2815 !reg.equals(end->dst)) {
2816 return NULL;
2817 } else {
2818 return end;
2819 }
2820 }
2821
2822 void
2823 fs_visitor::setup_payload_gen6()
2824 {
2825 bool uses_depth =
2826 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2827 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2828
2829 assert(brw->gen >= 6);
2830
2831 /* R0-1: masks, pixel X/Y coordinates. */
2832 c->nr_payload_regs = 2;
2833    /* R2: only for 32-pixel dispatch. */
2834
2835 /* R3-26: barycentric interpolation coordinates. These appear in the
2836 * same order that they appear in the brw_wm_barycentric_interp_mode
2837 * enum. Each set of coordinates occupies 2 registers if dispatch width
2838 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2839 * appear if they were enabled using the "Barycentric Interpolation
2840 * Mode" bits in WM_STATE.
2841 */
2842 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2843 if (barycentric_interp_modes & (1 << i)) {
2844 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2845 c->nr_payload_regs += 2;
2846 if (dispatch_width == 16) {
2847 c->nr_payload_regs += 2;
2848 }
2849 }
2850 }
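   /* For example (illustrative): a SIMD8 shader using only the perspective
    * pixel-center barycentric mode leaves nr_payload_regs at 4 here (R0-1
    * plus one pair of barycentric coordinate registers).
    */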
2851
2852    /* R27: interpolated depth if the shader uses source depth. */
2853 if (uses_depth) {
2854 c->source_depth_reg = c->nr_payload_regs;
2855 c->nr_payload_regs++;
2856 if (dispatch_width == 16) {
2857 /* R28: interpolated depth if not SIMD8. */
2858 c->nr_payload_regs++;
2859 }
2860 }
2861 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2862 if (uses_depth) {
2863 c->source_w_reg = c->nr_payload_regs;
2864 c->nr_payload_regs++;
2865 if (dispatch_width == 16) {
2866 /* R30: interpolated W if not SIMD8. */
2867 c->nr_payload_regs++;
2868 }
2869 }
2870
2871 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
2872 /* R31: MSAA position offsets. */
2873 if (c->prog_data.uses_pos_offset) {
2874 c->sample_pos_reg = c->nr_payload_regs;
2875 c->nr_payload_regs++;
2876 }
2877
2878 /* R32: MSAA input coverage mask */
2879 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
2880 assert(brw->gen >= 7);
2881 c->sample_mask_reg = c->nr_payload_regs;
2882 c->nr_payload_regs++;
2883 if (dispatch_width == 16) {
2884 /* R33: input coverage mask if not SIMD8. */
2885 c->nr_payload_regs++;
2886 }
2887 }
2888
2889 /* R34-: bary for 32-pixel. */
2890 /* R58-59: interp W for 32-pixel. */
2891
2892 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2893 c->source_depth_to_render_target = true;
2894 }
2895 }
2896
2897 void
2898 fs_visitor::assign_binding_table_offsets()
2899 {
2900 uint32_t next_binding_table_offset = 0;
2901
2902 /* If there are no color regions, we still perform an FB write to a null
2903 * renderbuffer, which we place at surface index 0.
2904 */
2905 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
2906 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
2907
2908 assign_common_binding_table_offsets(next_binding_table_offset);
2909 }
2910
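/**
 * Fills in regs_live_at_ip[], the number of virtual GRF registers live at
 * each instruction.  For example (illustrative), a size-2 vgrf that is live
 * from ip 3 through ip 7 contributes 2 to each of regs_live_at_ip[3] .. [7].
 */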
2911 void
2912 fs_visitor::calculate_register_pressure()
2913 {
2914 invalidate_live_intervals();
2915 calculate_live_intervals();
2916
2917 int num_instructions = 0;
2918 foreach_list(node, &this->instructions) {
2919 ++num_instructions;
2920 }
2921
2922 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
2923
2924 for (int reg = 0; reg < virtual_grf_count; reg++) {
2925 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
2926 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
2927 }
2928 }
2929
2930 /**
2931 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
2932 *
2933 * The needs_unlit_centroid_workaround ends up producing one of these per
2934 * channel of centroid input, so it's good to clean them up.
2935 *
2936 * An assumption here is that nothing ever modifies the dispatched pixels
2937 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
2938 * dictates that anyway.
2939 */
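/* For example (illustrative): two FS_OPCODE_MOV_DISPATCH_TO_FLAGS writing
 * f0.1 with no intervening flag write or control flow; the second one is
 * removed.
 */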
2940 void
2941 fs_visitor::opt_drop_redundant_mov_to_flags()
2942 {
2943 bool flag_mov_found[2] = {false};
2944
2945 foreach_list_safe(node, &this->instructions) {
2946 fs_inst *inst = (fs_inst *)node;
2947
2948 if (inst->is_control_flow()) {
2949 memset(flag_mov_found, 0, sizeof(flag_mov_found));
2950 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
2951 if (!flag_mov_found[inst->flag_subreg])
2952 flag_mov_found[inst->flag_subreg] = true;
2953 else
2954 inst->remove();
2955 } else if (inst->writes_flag()) {
2956 flag_mov_found[inst->flag_subreg] = false;
2957 }
2958 }
2959 }
2960
2961 bool
2962 fs_visitor::run()
2963 {
2964 sanity_param_count = fp->Base.Parameters->NumParameters;
2965 bool allocated_without_spills;
2966
2967 assign_binding_table_offsets();
2968
2969 if (brw->gen >= 6)
2970 setup_payload_gen6();
2971 else
2972 setup_payload_gen4();
2973
2974 if (0) {
2975 emit_dummy_fs();
2976 } else {
2977 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2978 emit_shader_time_begin();
2979
2980 calculate_urb_setup();
2981 if (fp->Base.InputsRead > 0) {
2982 if (brw->gen < 6)
2983 emit_interpolation_setup_gen4();
2984 else
2985 emit_interpolation_setup_gen6();
2986 }
2987
2988 /* We handle discards by keeping track of the still-live pixels in f0.1.
2989 * Initialize it with the dispatched pixels.
2990 */
2991 if (fp->UsesKill || c->key.alpha_test_func) {
2992 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2993 discard_init->flag_subreg = 1;
2994 }
2995
2996       /* Generate FS IR for main().  (The visitor only descends into
2997        * functions called "main".)
2998 */
2999 if (shader) {
3000 foreach_list(node, &*shader->base.ir) {
3001 ir_instruction *ir = (ir_instruction *)node;
3002 base_ir = ir;
3003 this->result = reg_undef;
3004 ir->accept(this);
3005 }
3006 } else {
3007 emit_fragment_program_code();
3008 }
3009 base_ir = NULL;
3010 if (failed)
3011 return false;
3012
3013 emit(FS_OPCODE_PLACEHOLDER_HALT);
3014
3015 if (c->key.alpha_test_func)
3016 emit_alpha_test();
3017
3018 emit_fb_writes();
3019
3020 split_virtual_grfs();
3021
3022 move_uniform_array_access_to_pull_constants();
3023 assign_constant_locations();
3024 demote_pull_constants();
3025
3026 opt_drop_redundant_mov_to_flags();
3027
3028 bool progress;
3029 do {
3030 progress = false;
3031
3032 compact_virtual_grfs();
3033
3034 progress = remove_duplicate_mrf_writes() || progress;
3035
3036 progress = opt_algebraic() || progress;
3037 progress = opt_cse() || progress;
3038 progress = opt_copy_propagate() || progress;
3039 progress = opt_peephole_predicated_break() || progress;
3040 progress = dead_code_eliminate() || progress;
3041 progress = opt_peephole_sel() || progress;
3042 progress = dead_control_flow_eliminate(this) || progress;
3043 progress = opt_saturate_propagation() || progress;
3044 progress = register_coalesce() || progress;
3045 progress = compute_to_mrf() || progress;
3046 } while (progress);
3047
3048 lower_uniform_pull_constant_loads();
3049
3050 assign_curb_setup();
3051 assign_urb_setup();
3052
3053 static enum instruction_scheduler_mode pre_modes[] = {
3054 SCHEDULE_PRE,
3055 SCHEDULE_PRE_NON_LIFO,
3056 SCHEDULE_PRE_LIFO,
3057 };
3058
3059 /* Try each scheduling heuristic to see if it can successfully register
3060 * allocate without spilling. They should be ordered by decreasing
3061 * performance but increasing likelihood of allocating.
3062 */
3063 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3064 schedule_instructions(pre_modes[i]);
3065
3066 if (0) {
3067 assign_regs_trivial();
3068 allocated_without_spills = true;
3069 } else {
3070 allocated_without_spills = assign_regs(false);
3071 }
3072 if (allocated_without_spills)
3073 break;
3074 }
3075
3076 if (!allocated_without_spills) {
3077 /* We assume that any spilling is worse than just dropping back to
3078 * SIMD8. There's probably actually some intermediate point where
3079 * SIMD16 with a couple of spills is still better.
3080 */
3081 if (dispatch_width == 16) {
3082 fail("Failure to register allocate. Reduce number of "
3083 "live scalar values to avoid this.");
3084 }
3085
3086 /* Since we're out of heuristics, just go spill registers until we
3087 * get an allocation.
3088 */
3089 while (!assign_regs(true)) {
3090 if (failed)
3091 break;
3092 }
3093 }
3094 }
3095 assert(force_uncompressed_stack == 0);
3096
3097 /* This must come after all optimization and register allocation, since
3098 * it inserts dead code that happens to have side effects, and it does
3099 * so based on the actual physical registers in use.
3100 */
3101 insert_gen4_send_dependency_workarounds();
3102
3103 if (failed)
3104 return false;
3105
3106 if (!allocated_without_spills)
3107 schedule_instructions(SCHEDULE_POST);
3108
3109 if (dispatch_width == 8)
3110 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3111 else
3112 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3113
3114 /* If any state parameters were appended, then ParameterValues could have
3115 * been realloced, in which case the driver uniform storage set up by
3116 * _mesa_associate_uniform_storage() would point to freed memory. Make
3117 * sure that didn't happen.
3118 */
3119 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3120
3121 return !failed;
3122 }
3123
3124 const unsigned *
3125 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3126 struct gl_fragment_program *fp,
3127 struct gl_shader_program *prog,
3128 unsigned *final_assembly_size)
3129 {
3130 bool start_busy = false;
3131 double start_time = 0;
3132
3133 if (unlikely(brw->perf_debug)) {
3134 start_busy = (brw->batch.last_bo &&
3135 drm_intel_bo_busy(brw->batch.last_bo));
3136 start_time = get_time();
3137 }
3138
3139 struct brw_shader *shader = NULL;
3140 if (prog)
3141 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3142
3143 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3144 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3145
3146 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3147 */
3148 fs_visitor v(brw, c, prog, fp, 8);
3149 if (!v.run()) {
3150 if (prog) {
3151 prog->LinkStatus = false;
3152 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3153 }
3154
3155 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3156 v.fail_msg);
3157
3158 return NULL;
3159 }
3160
3161 exec_list *simd16_instructions = NULL;
3162 fs_visitor v2(brw, c, prog, fp, 16);
3163 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3164 if (!v.simd16_unsupported) {
3165 /* Try a SIMD16 compile */
3166 v2.import_uniforms(&v);
3167 if (!v2.run()) {
3168 perf_debug("SIMD16 shader failed to compile, falling back to "
3169 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3170 } else {
3171 simd16_instructions = &v2.instructions;
3172 }
3173 } else {
3174 perf_debug("SIMD16 shader unsupported, falling back to "
3175 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3176 }
3177 }
3178
3179 const unsigned *assembly = NULL;
3180 if (brw->gen >= 8) {
3181 gen8_fs_generator g(brw, c, prog, fp, v.do_dual_src);
3182 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3183 final_assembly_size);
3184 } else {
3185 fs_generator g(brw, c, prog, fp, v.do_dual_src);
3186 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3187 final_assembly_size);
3188 }
3189
3190 if (unlikely(brw->perf_debug) && shader) {
3191 if (shader->compiled_once)
3192 brw_wm_debug_recompile(brw, prog, &c->key);
3193 shader->compiled_once = true;
3194
3195 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3196 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3197 (get_time() - start_time) * 1000);
3198 }
3199 }
3200
3201 return assembly;
3202 }
3203
3204 bool
3205 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3206 {
3207 struct brw_context *brw = brw_context(ctx);
3208 struct brw_wm_prog_key key;
3209
3210 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3211 return true;
3212
3213 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3214 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3215 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3216 bool program_uses_dfdy = fp->UsesDFdy;
3217
3218 memset(&key, 0, sizeof(key));
3219
3220 if (brw->gen < 6) {
3221 if (fp->UsesKill)
3222 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3223
3224 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3225 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3226
3227 /* Just assume depth testing. */
3228 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3229 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3230 }
3231
3232 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3233 BRW_FS_VARYING_INPUT_MASK) > 16)
3234 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3235
3236 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3237
3238 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3239 for (unsigned i = 0; i < sampler_count; i++) {
3240 if (fp->Base.ShadowSamplers & (1 << i)) {
3241 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3242 key.tex.swizzles[i] =
3243 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3244 } else {
3245 /* Color sampler: assume no swizzling. */
3246 key.tex.swizzles[i] = SWIZZLE_XYZW;
3247 }
3248 }
3249
3250 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3251 key.drawable_height = ctx->DrawBuffer->Height;
3252 }
3253
3254 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3255 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3256 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3257
3258 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3259 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3260 key.nr_color_regions > 1;
3261 }
3262
3263 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3264 * quality of the derivatives is likely to be determined by the driconf
3265 * option.
3266 */
3267 key.high_quality_derivatives = brw->disable_derivative_optimization;
3268
3269 key.program_string_id = bfp->id;
3270
3271 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3272 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3273
3274 bool success = do_wm_prog(brw, prog, bfp, &key);
3275
3276 brw->wm.base.prog_offset = old_prog_offset;
3277 brw->wm.prog_data = old_prog_data;
3278
3279 return success;
3280 }