i965/fs: Don't use instruction list after calculating the cfg.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_cfg.h"
50 #include "brw_dead_control_flow.h"
51 #include "main/uniforms.h"
52 #include "brw_fs_live_variables.h"
53 #include "glsl/glsl_types.h"
54
55 void
56 fs_inst::init(enum opcode opcode, const fs_reg &dst, fs_reg *src, int sources)
57 {
58 memset(this, 0, sizeof(*this));
59
60 this->opcode = opcode;
61 this->dst = dst;
62 this->src = src;
63 this->sources = sources;
64
65 this->conditional_mod = BRW_CONDITIONAL_NONE;
66
67 /* This will be the case for almost all instructions. */
68 this->regs_written = 1;
69
70 this->writes_accumulator = false;
71 }
72
73 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
74 {
75 fs_reg *src = ralloc_array(this, fs_reg, 3);
76 init(opcode, dst, src, 0);
77 }
78
79 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
80 {
81 fs_reg *src = ralloc_array(this, fs_reg, 3);
82 src[0] = src0;
83 init(opcode, dst, src, 1);
84 }
85
86 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
87 const fs_reg &src1)
88 {
89 fs_reg *src = ralloc_array(this, fs_reg, 3);
90 src[0] = src0;
91 src[1] = src1;
92 init(opcode, dst, src, 2);
93 }
94
95 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
96 const fs_reg &src1, const fs_reg &src2)
97 {
98 fs_reg *src = ralloc_array(this, fs_reg, 3);
99 src[0] = src0;
100 src[1] = src1;
101 src[2] = src2;
102 init(opcode, dst, src, 3);
103 }
104
105 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
106 {
107 init(opcode, dst, src, sources);
108 }
109
110 fs_inst::fs_inst(const fs_inst &that)
111 {
112 memcpy(this, &that, sizeof(that));
113
114 this->src = ralloc_array(this, fs_reg, that.sources);
115
116 for (int i = 0; i < that.sources; i++)
117 this->src[i] = that.src[i];
118 }
119
120 void
121 fs_inst::resize_sources(uint8_t num_sources)
122 {
123 if (this->sources != num_sources) {
124 this->src = reralloc(this, this->src, fs_reg, num_sources);
125 this->sources = num_sources;
126 }
127 }
128
129 #define ALU1(op) \
130 fs_inst * \
131 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
132 { \
133 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
134 }
135
136 #define ALU2(op) \
137 fs_inst * \
138 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
139 const fs_reg &src1) \
140 { \
141 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
142 }
143
144 #define ALU2_ACC(op) \
145 fs_inst * \
146 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
147 const fs_reg &src1) \
148 { \
149 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
150 inst->writes_accumulator = true; \
151 return inst; \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
157 const fs_reg &src1, const fs_reg &src2) \
158 { \
159 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
160 }
161
162 ALU1(NOT)
163 ALU1(MOV)
164 ALU1(FRC)
165 ALU1(RNDD)
166 ALU1(RNDE)
167 ALU1(RNDZ)
168 ALU2(ADD)
169 ALU2(MUL)
170 ALU2_ACC(MACH)
171 ALU2(AND)
172 ALU2(OR)
173 ALU2(XOR)
174 ALU2(SHL)
175 ALU2(SHR)
176 ALU2(ASR)
177 ALU3(LRP)
178 ALU1(BFREV)
179 ALU3(BFE)
180 ALU2(BFI1)
181 ALU3(BFI2)
182 ALU1(FBH)
183 ALU1(FBL)
184 ALU1(CBIT)
185 ALU3(MAD)
186 ALU2_ACC(ADDC)
187 ALU2_ACC(SUBB)
188 ALU2(SEL)
189 ALU2(MAC)
190
191 /** Gen4 predicated IF. */
192 fs_inst *
193 fs_visitor::IF(enum brw_predicate predicate)
194 {
195 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
196 inst->predicate = predicate;
197 return inst;
198 }
199
200 /** Gen6 IF with embedded comparison. */
201 fs_inst *
202 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
203 enum brw_conditional_mod condition)
204 {
205 assert(brw->gen == 6);
206 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
207 reg_null_d, src0, src1);
208 inst->conditional_mod = condition;
209 return inst;
210 }
211
212 /**
213 * CMP: Sets the low bit of the destination channels with the result
214 * of the comparison, while the upper bits are undefined, and updates
215 * the flag register with the packed 16 bits of the result.
216 */
217 fs_inst *
218 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
219 enum brw_conditional_mod condition)
220 {
221 fs_inst *inst;
222
223 /* Take the instruction:
224 *
225 * CMP null<d> src0<f> src1<f>
226 *
227 * Original gen4 does type conversion to the destination type before
228 * comparison, producing garbage results for floating point comparisons.
229 * gen5 does the comparison on the execution type (resolved source types),
230 * so dst type doesn't matter. gen6 does comparison and then uses the
231 * result as if it was the dst type with no conversion, which happens to
232 * mostly work out for float-interpreted-as-int since our comparisons are
233 * for >0, =0, <0.
234 */
235 if (brw->gen == 4) {
236 dst.type = src0.type;
237 if (dst.file == HW_REG)
238 dst.fixed_hw_reg.type = dst.type;
239 }
240
241 resolve_ud_negate(&src0);
242 resolve_ud_negate(&src1);
243
244 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
245 inst->conditional_mod = condition;
246
247 return inst;
248 }
249
250 fs_inst *
251 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
252 {
253 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst, src,
254 sources);
255 inst->regs_written = sources;
256
257 return inst;
258 }
259
260 exec_list
261 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
262 const fs_reg &surf_index,
263 const fs_reg &varying_offset,
264 uint32_t const_offset)
265 {
266 exec_list instructions;
267 fs_inst *inst;
268
269 /* We have our constant surface use a pitch of 4 bytes, so our index can
270 * be any component of a vector, and then we load 4 contiguous
271 * components starting from that.
272 *
273 * We break down the const_offset to a portion added to the variable
274 * offset and a portion done using reg_offset, which means that if you
275 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
276 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
277 * CSE can later notice that those loads are all the same and eliminate
278 * the redundant ones.
279 */
280 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
281 instructions.push_tail(ADD(vec4_offset,
282 varying_offset, const_offset & ~3));
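   /* A rough worked example of the split above (illustrative only): with
    * const_offset == 22 the code emits
    *
    *    vec4_offset = varying_offset + (22 & ~3)   ->  varying_offset + 20
    *    reg_offset  = (22 & 3) * scale             ->  component 2 of the vec4
    *
    * so accesses such as a[i].y and a[i].z share the same vec4 load and
    * differ only in reg_offset, which is what lets CSE drop the redundant
    * loads.
    */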
283
284 int scale = 1;
285 if (brw->gen == 4 && dispatch_width == 8) {
286 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
287 * u, v, r) as parameters, or we can just use the SIMD16 message
288 * consisting of (header, u). We choose the second, at the cost of a
289 * longer return length.
290 */
291 scale = 2;
292 }
293
294 enum opcode op;
295 if (brw->gen >= 7)
296 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
297 else
298 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
299 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
300 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
301 inst->regs_written = 4 * scale;
302 instructions.push_tail(inst);
303
304 if (brw->gen < 7) {
305 inst->base_mrf = 13;
306 inst->header_present = true;
307 if (brw->gen == 4)
308 inst->mlen = 3;
309 else
310 inst->mlen = 1 + dispatch_width / 8;
311 }
312
313 vec4_result.reg_offset += (const_offset & 3) * scale;
314 instructions.push_tail(MOV(dst, vec4_result));
315
316 return instructions;
317 }
318
319 /**
320 * A helper for MOV generation for fixing up broken hardware SEND dependency
321 * handling.
322 */
323 fs_inst *
324 fs_visitor::DEP_RESOLVE_MOV(int grf)
325 {
326 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
327
328 inst->ir = NULL;
329 inst->annotation = "send dependency resolve";
330
331 /* The caller always wants uncompressed to emit the minimal extra
332 * dependencies, and to avoid having to deal with aligning its regs to 2.
333 */
334 inst->force_uncompressed = true;
335
336 return inst;
337 }
338
339 bool
340 fs_inst::equals(fs_inst *inst) const
341 {
342 return (opcode == inst->opcode &&
343 dst.equals(inst->dst) &&
344 src[0].equals(inst->src[0]) &&
345 src[1].equals(inst->src[1]) &&
346 src[2].equals(inst->src[2]) &&
347 saturate == inst->saturate &&
348 predicate == inst->predicate &&
349 conditional_mod == inst->conditional_mod &&
350 mlen == inst->mlen &&
351 base_mrf == inst->base_mrf &&
352 target == inst->target &&
353 eot == inst->eot &&
354 header_present == inst->header_present &&
355 shadow_compare == inst->shadow_compare &&
356 offset == inst->offset);
357 }
358
359 bool
360 fs_inst::overwrites_reg(const fs_reg &reg) const
361 {
362 return (reg.file == dst.file &&
363 reg.reg == dst.reg &&
364 reg.reg_offset >= dst.reg_offset &&
365 reg.reg_offset < dst.reg_offset + regs_written);
366 }
367
368 bool
369 fs_inst::is_send_from_grf() const
370 {
371 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
372 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
373 opcode == FS_OPCODE_INTERPOLATE_AT_CENTROID ||
374 opcode == FS_OPCODE_INTERPOLATE_AT_SAMPLE ||
375 opcode == FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET ||
376 opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET ||
377 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
378 src[1].file == GRF) ||
379 (is_tex() && src[0].file == GRF));
380 }
381
382 bool
383 fs_inst::can_do_source_mods(struct brw_context *brw)
384 {
385 if (brw->gen == 6 && is_math())
386 return false;
387
388 if (is_send_from_grf())
389 return false;
390
391 if (!backend_instruction::can_do_source_mods())
392 return false;
393
394 return true;
395 }
396
397 void
398 fs_reg::init()
399 {
400 memset(this, 0, sizeof(*this));
401 stride = 1;
402 }
403
404 /** Generic unset register constructor. */
405 fs_reg::fs_reg()
406 {
407 init();
408 this->file = BAD_FILE;
409 }
410
411 /** Immediate value constructor. */
412 fs_reg::fs_reg(float f)
413 {
414 init();
415 this->file = IMM;
416 this->type = BRW_REGISTER_TYPE_F;
417 this->fixed_hw_reg.dw1.f = f;
418 }
419
420 /** Immediate value constructor. */
421 fs_reg::fs_reg(int32_t i)
422 {
423 init();
424 this->file = IMM;
425 this->type = BRW_REGISTER_TYPE_D;
426 this->fixed_hw_reg.dw1.d = i;
427 }
428
429 /** Immediate value constructor. */
430 fs_reg::fs_reg(uint32_t u)
431 {
432 init();
433 this->file = IMM;
434 this->type = BRW_REGISTER_TYPE_UD;
435 this->fixed_hw_reg.dw1.ud = u;
436 }
437
438 /** Fixed brw_reg. */
439 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
440 {
441 init();
442 this->file = HW_REG;
443 this->fixed_hw_reg = fixed_hw_reg;
444 this->type = fixed_hw_reg.type;
445 }
446
447 bool
448 fs_reg::equals(const fs_reg &r) const
449 {
450 return (file == r.file &&
451 reg == r.reg &&
452 reg_offset == r.reg_offset &&
453 subreg_offset == r.subreg_offset &&
454 type == r.type &&
455 negate == r.negate &&
456 abs == r.abs &&
457 !reladdr && !r.reladdr &&
458 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
459 sizeof(fixed_hw_reg)) == 0 &&
460 stride == r.stride);
461 }
462
463 fs_reg &
464 fs_reg::apply_stride(unsigned stride)
465 {
466 assert((this->stride * stride) <= 4 &&
467 (is_power_of_two(stride) || stride == 0) &&
468 file != HW_REG && file != IMM);
469 this->stride *= stride;
470 return *this;
471 }
472
473 fs_reg &
474 fs_reg::set_smear(unsigned subreg)
475 {
476 assert(file != HW_REG && file != IMM);
477 subreg_offset = subreg * type_sz(type);
478 stride = 0;
479 return *this;
480 }
481
482 bool
483 fs_reg::is_contiguous() const
484 {
485 return stride == 1;
486 }
487
488 bool
489 fs_reg::is_valid_3src() const
490 {
491 return file == GRF || file == UNIFORM;
492 }
493
494 int
495 fs_visitor::type_size(const struct glsl_type *type)
496 {
497 unsigned int size, i;
498
499 switch (type->base_type) {
500 case GLSL_TYPE_UINT:
501 case GLSL_TYPE_INT:
502 case GLSL_TYPE_FLOAT:
503 case GLSL_TYPE_BOOL:
504 return type->components();
505 case GLSL_TYPE_ARRAY:
506 return type_size(type->fields.array) * type->length;
507 case GLSL_TYPE_STRUCT:
508 size = 0;
509 for (i = 0; i < type->length; i++) {
510 size += type_size(type->fields.structure[i].type);
511 }
512 return size;
513 case GLSL_TYPE_SAMPLER:
514 /* Samplers take up no register space, since they're baked in at
515 * link time.
516 */
517 return 0;
518 case GLSL_TYPE_ATOMIC_UINT:
519 return 0;
520 case GLSL_TYPE_IMAGE:
521 case GLSL_TYPE_VOID:
522 case GLSL_TYPE_ERROR:
523 case GLSL_TYPE_INTERFACE:
524 unreachable("not reached");
525 }
526
527 return 0;
528 }
529
530 fs_reg
531 fs_visitor::get_timestamp()
532 {
533 assert(brw->gen >= 7);
534
535 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
536 BRW_ARF_TIMESTAMP,
537 0),
538 BRW_REGISTER_TYPE_UD));
539
540 fs_reg dst = fs_reg(this, glsl_type::uint_type);
541
542 fs_inst *mov = emit(MOV(dst, ts));
543 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
544 * even if it's not enabled in the dispatch.
545 */
546 mov->force_writemask_all = true;
547 mov->force_uncompressed = true;
548
549 /* The caller wants the low 32 bits of the timestamp. Since it's running
550     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
551 * which is plenty of time for our purposes. It is identical across the
552 * EUs, but since it's tracking GPU core speed it will increment at a
553 * varying rate as render P-states change.
554 *
555 * The caller could also check if render P-states have changed (or anything
556 * else that might disrupt timing) by setting smear to 2 and checking if
557 * that field is != 0.
558 */
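   /* As a rough sanity check on the numbers above: the field read here is
    * 32 bits wide, so at ~1.2e9 ticks per second it wraps after about
    *
    *    2^32 / 1.2e9  ~=  3.6 seconds
    *
    * which is where the "~3 seconds" figure comes from.
    */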
559 dst.set_smear(0);
560
561 return dst;
562 }
563
564 void
565 fs_visitor::emit_shader_time_begin()
566 {
567 current_annotation = "shader time start";
568 shader_start_time = get_timestamp();
569 }
570
571 void
572 fs_visitor::emit_shader_time_end()
573 {
574 current_annotation = "shader time end";
575
576 enum shader_time_shader_type type, written_type, reset_type;
577 if (dispatch_width == 8) {
578 type = ST_FS8;
579 written_type = ST_FS8_WRITTEN;
580 reset_type = ST_FS8_RESET;
581 } else {
582 assert(dispatch_width == 16);
583 type = ST_FS16;
584 written_type = ST_FS16_WRITTEN;
585 reset_type = ST_FS16_RESET;
586 }
587
588 fs_reg shader_end_time = get_timestamp();
589
590 /* Check that there weren't any timestamp reset events (assuming these
591 * were the only two timestamp reads that happened).
592 */
593 fs_reg reset = shader_end_time;
594 reset.set_smear(2);
595 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
596 test->conditional_mod = BRW_CONDITIONAL_Z;
597 emit(IF(BRW_PREDICATE_NORMAL));
598
599 push_force_uncompressed();
600 fs_reg start = shader_start_time;
601 start.negate = true;
602 fs_reg diff = fs_reg(this, glsl_type::uint_type);
603 emit(ADD(diff, start, shader_end_time));
604
605 /* If there were no instructions between the two timestamp gets, the diff
606 * is 2 cycles. Remove that overhead, so I can forget about that when
607 * trying to determine the time taken for single instructions.
608 */
609 emit(ADD(diff, diff, fs_reg(-2u)));
610
611 emit_shader_time_write(type, diff);
612 emit_shader_time_write(written_type, fs_reg(1u));
613 emit(BRW_OPCODE_ELSE);
614 emit_shader_time_write(reset_type, fs_reg(1u));
615 emit(BRW_OPCODE_ENDIF);
616
617 pop_force_uncompressed();
618 }
619
620 void
621 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
622 fs_reg value)
623 {
624 int shader_time_index =
625 brw_get_shader_time_index(brw, shader_prog, prog, type);
626 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
627
628 fs_reg payload;
629 if (dispatch_width == 8)
630 payload = fs_reg(this, glsl_type::uvec2_type);
631 else
632 payload = fs_reg(this, glsl_type::uint_type);
633
634 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
635 fs_reg(), payload, offset, value));
636 }
637
638 void
639 fs_visitor::vfail(const char *format, va_list va)
640 {
641 char *msg;
642
643 if (failed)
644 return;
645
646 failed = true;
647
648 msg = ralloc_vasprintf(mem_ctx, format, va);
649 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
650
651 this->fail_msg = msg;
652
653 if (INTEL_DEBUG & DEBUG_WM) {
654 fprintf(stderr, "%s", msg);
655 }
656 }
657
658 void
659 fs_visitor::fail(const char *format, ...)
660 {
661 va_list va;
662
663 va_start(va, format);
664 vfail(format, va);
665 va_end(va);
666 }
667
668 /**
669 * Mark this program as impossible to compile in SIMD16 mode.
670 *
671 * During the SIMD8 compile (which happens first), we can detect and flag
672 * things that are unsupported in SIMD16 mode, so the compiler can skip
673 * the SIMD16 compile altogether.
674 *
675 * During a SIMD16 compile (if one happens anyway), this just calls fail().
676 */
677 void
678 fs_visitor::no16(const char *format, ...)
679 {
680 va_list va;
681
682 va_start(va, format);
683
684 if (dispatch_width == 16) {
685 vfail(format, va);
686 } else {
687 simd16_unsupported = true;
688
689 if (brw->perf_debug) {
690 if (no16_msg)
691 ralloc_vasprintf_append(&no16_msg, format, va);
692 else
693 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
694 }
695 }
696
697 va_end(va);
698 }
699
700 fs_inst *
701 fs_visitor::emit(enum opcode opcode)
702 {
703 return emit(new(mem_ctx) fs_inst(opcode));
704 }
705
706 fs_inst *
707 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
708 {
709 return emit(new(mem_ctx) fs_inst(opcode, dst));
710 }
711
712 fs_inst *
713 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
714 {
715 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
716 }
717
718 fs_inst *
719 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
720 const fs_reg &src1)
721 {
722 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
723 }
724
725 fs_inst *
726 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
727 const fs_reg &src1, const fs_reg &src2)
728 {
729 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
730 }
731
732 fs_inst *
733 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
734 fs_reg src[], int sources)
735 {
736 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
737 }
738
739 void
740 fs_visitor::push_force_uncompressed()
741 {
742 force_uncompressed_stack++;
743 }
744
745 void
746 fs_visitor::pop_force_uncompressed()
747 {
748 force_uncompressed_stack--;
749 assert(force_uncompressed_stack >= 0);
750 }
751
752 /**
753 * Returns true if the instruction has a flag that means it won't
754 * update an entire destination register.
755 *
756 * For example, dead code elimination and live variable analysis want to know
757 * when a write to a variable screens off any preceding values that were in
758 * it.
759 */
760 bool
761 fs_inst::is_partial_write() const
762 {
763 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
764 this->force_uncompressed ||
765 this->force_sechalf || !this->dst.is_contiguous());
766 }
767
768 int
769 fs_inst::regs_read(fs_visitor *v, int arg) const
770 {
771 if (is_tex() && arg == 0 && src[0].file == GRF) {
772 if (v->dispatch_width == 16)
773 return (mlen + 1) / 2;
774 else
775 return mlen;
776 }
777 return 1;
778 }
779
780 bool
781 fs_inst::reads_flag() const
782 {
783 return predicate;
784 }
785
786 bool
787 fs_inst::writes_flag() const
788 {
789 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
790 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
791 }
792
793 /**
794 * Returns how many MRFs an FS opcode will write over.
795 *
796 * Note that this is not the 0 or 1 implied writes in an actual gen
797 * instruction -- the FS opcodes often generate MOVs in addition.
798 */
799 int
800 fs_visitor::implied_mrf_writes(fs_inst *inst)
801 {
802 if (inst->mlen == 0)
803 return 0;
804
805 if (inst->base_mrf == -1)
806 return 0;
807
808 switch (inst->opcode) {
809 case SHADER_OPCODE_RCP:
810 case SHADER_OPCODE_RSQ:
811 case SHADER_OPCODE_SQRT:
812 case SHADER_OPCODE_EXP2:
813 case SHADER_OPCODE_LOG2:
814 case SHADER_OPCODE_SIN:
815 case SHADER_OPCODE_COS:
816 return 1 * dispatch_width / 8;
817 case SHADER_OPCODE_POW:
818 case SHADER_OPCODE_INT_QUOTIENT:
819 case SHADER_OPCODE_INT_REMAINDER:
820 return 2 * dispatch_width / 8;
821 case SHADER_OPCODE_TEX:
822 case FS_OPCODE_TXB:
823 case SHADER_OPCODE_TXD:
824 case SHADER_OPCODE_TXF:
825 case SHADER_OPCODE_TXF_CMS:
826 case SHADER_OPCODE_TXF_MCS:
827 case SHADER_OPCODE_TG4:
828 case SHADER_OPCODE_TG4_OFFSET:
829 case SHADER_OPCODE_TXL:
830 case SHADER_OPCODE_TXS:
831 case SHADER_OPCODE_LOD:
832 return 1;
833 case FS_OPCODE_FB_WRITE:
834 return 2;
835 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
836 case SHADER_OPCODE_GEN4_SCRATCH_READ:
837 return 1;
838 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
839 return inst->mlen;
840 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
841 return 2;
842 case SHADER_OPCODE_UNTYPED_ATOMIC:
843 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
844 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
845 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
846 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
847 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
848 return 0;
849 default:
850 unreachable("not reached");
851 }
852 }
853
854 int
855 fs_visitor::virtual_grf_alloc(int size)
856 {
857 if (virtual_grf_array_size <= virtual_grf_count) {
858 if (virtual_grf_array_size == 0)
859 virtual_grf_array_size = 16;
860 else
861 virtual_grf_array_size *= 2;
862 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
863 virtual_grf_array_size);
864 }
865 virtual_grf_sizes[virtual_grf_count] = size;
866 return virtual_grf_count++;
867 }
868
869 /** Fixed HW reg constructor. */
870 fs_reg::fs_reg(enum register_file file, int reg)
871 {
872 init();
873 this->file = file;
874 this->reg = reg;
875 this->type = BRW_REGISTER_TYPE_F;
876 }
877
878 /** Fixed HW reg constructor. */
879 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
880 {
881 init();
882 this->file = file;
883 this->reg = reg;
884 this->type = type;
885 }
886
887 /** Automatic reg constructor. */
888 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
889 {
890 init();
891
892 this->file = GRF;
893 this->reg = v->virtual_grf_alloc(v->type_size(type));
894 this->reg_offset = 0;
895 this->type = brw_type_for_base_type(type);
896 }
897
898 fs_reg *
899 fs_visitor::variable_storage(ir_variable *var)
900 {
901 return (fs_reg *)hash_table_find(this->variable_ht, var);
902 }
903
904 void
905 import_uniforms_callback(const void *key,
906 void *data,
907 void *closure)
908 {
909 struct hash_table *dst_ht = (struct hash_table *)closure;
910 const fs_reg *reg = (const fs_reg *)data;
911
912 if (reg->file != UNIFORM)
913 return;
914
915 hash_table_insert(dst_ht, data, key);
916 }
917
918 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
919  * This brings in those uniform definitions.
920 */
921 void
922 fs_visitor::import_uniforms(fs_visitor *v)
923 {
924 hash_table_call_foreach(v->variable_ht,
925 import_uniforms_callback,
926 variable_ht);
927 this->push_constant_loc = v->push_constant_loc;
928 this->pull_constant_loc = v->pull_constant_loc;
929 this->uniforms = v->uniforms;
930 this->param_size = v->param_size;
931 }
932
933 /* Our support for uniforms is piggy-backed on the struct
934 * gl_fragment_program, because that's where the values actually
935 * get stored, rather than in some global gl_shader_program uniform
936 * store.
937 */
938 void
939 fs_visitor::setup_uniform_values(ir_variable *ir)
940 {
941 int namelen = strlen(ir->name);
942
943 /* The data for our (non-builtin) uniforms is stored in a series of
944 * gl_uniform_driver_storage structs for each subcomponent that
945 * glGetUniformLocation() could name. We know it's been set up in the same
946 * order we'd walk the type, so walk the list of storage and find anything
947 * with our name, or the prefix of a component that starts with our name.
948 */
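   /* For example (illustrative only): for ir->name "lights", storage entries
    * named "lights", "lights[2]" or "lights.position" pass the prefix test
    * below, while "lights2" is rejected because the character after the
    * prefix is neither '\0', '.' nor '['.
    */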
949 unsigned params_before = uniforms;
950 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
951 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
952
953 if (strncmp(ir->name, storage->name, namelen) != 0 ||
954 (storage->name[namelen] != 0 &&
955 storage->name[namelen] != '.' &&
956 storage->name[namelen] != '[')) {
957 continue;
958 }
959
960 unsigned slots = storage->type->component_slots();
961 if (storage->array_elements)
962 slots *= storage->array_elements;
963
964 for (unsigned i = 0; i < slots; i++) {
965 stage_prog_data->param[uniforms++] = &storage->storage[i];
966 }
967 }
968
969 /* Make sure we actually initialized the right amount of stuff here. */
970 assert(params_before + ir->type->component_slots() == uniforms);
971 (void)params_before;
972 }
973
974
975 /* Our support for builtin uniforms is even scarier than non-builtin.
976 * It sits on top of the PROG_STATE_VAR parameters that are
977 * automatically updated from GL context state.
978 */
979 void
980 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
981 {
982 const ir_state_slot *const slots = ir->state_slots;
983 assert(ir->state_slots != NULL);
984
985 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
986       /* This state reference has already been set up by ir_to_mesa, but we'll
987 * get the same index back here.
988 */
989 int index = _mesa_add_state_reference(this->prog->Parameters,
990 (gl_state_index *)slots[i].tokens);
991
992 /* Add each of the unique swizzles of the element as a parameter.
993 * This'll end up matching the expected layout of the
994 * array/matrix/structure we're trying to fill in.
995 */
996 int last_swiz = -1;
997 for (unsigned int j = 0; j < 4; j++) {
998 int swiz = GET_SWZ(slots[i].swizzle, j);
999 if (swiz == last_swiz)
1000 break;
1001 last_swiz = swiz;
1002
1003 stage_prog_data->param[uniforms++] =
1004 &prog->Parameters->ParameterValues[index][swiz];
1005 }
1006 }
1007 }
1008
1009 fs_reg *
1010 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1011 {
1012 assert(stage == MESA_SHADER_FRAGMENT);
1013 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1014 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1015 fs_reg wpos = *reg;
1016 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1017
1018 /* gl_FragCoord.x */
1019 if (ir->data.pixel_center_integer) {
1020 emit(MOV(wpos, this->pixel_x));
1021 } else {
1022 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1023 }
1024 wpos.reg_offset++;
1025
1026 /* gl_FragCoord.y */
1027 if (!flip && ir->data.pixel_center_integer) {
1028 emit(MOV(wpos, this->pixel_y));
1029 } else {
1030 fs_reg pixel_y = this->pixel_y;
1031 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1032
1033 if (flip) {
1034 pixel_y.negate = true;
1035 offset += key->drawable_height - 1.0;
1036 }
1037
1038 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1039 }
1040 wpos.reg_offset++;
1041
1042 /* gl_FragCoord.z */
1043 if (brw->gen >= 6) {
1044 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1045 } else {
1046 emit(FS_OPCODE_LINTERP, wpos,
1047 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1048 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1049 interp_reg(VARYING_SLOT_POS, 2));
1050 }
1051 wpos.reg_offset++;
1052
1053 /* gl_FragCoord.w: Already set up in emit_interpolation */
1054 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1055
1056 return reg;
1057 }
1058
1059 fs_inst *
1060 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1061 glsl_interp_qualifier interpolation_mode,
1062 bool is_centroid, bool is_sample)
1063 {
1064 brw_wm_barycentric_interp_mode barycoord_mode;
1065 if (brw->gen >= 6) {
1066 if (is_centroid) {
1067 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1068 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1069 else
1070 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1071 } else if (is_sample) {
1072 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1073 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1074 else
1075 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1076 } else {
1077 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1078 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1079 else
1080 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1081 }
1082 } else {
1083 /* On Ironlake and below, there is only one interpolation mode.
1084 * Centroid interpolation doesn't mean anything on this hardware --
1085 * there is no multisampling.
1086 */
1087 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1088 }
1089 return emit(FS_OPCODE_LINTERP, attr,
1090 this->delta_x[barycoord_mode],
1091 this->delta_y[barycoord_mode], interp);
1092 }
1093
1094 fs_reg *
1095 fs_visitor::emit_general_interpolation(ir_variable *ir)
1096 {
1097 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1098 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1099 fs_reg attr = *reg;
1100
1101 assert(stage == MESA_SHADER_FRAGMENT);
1102 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1103 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1104
1105 unsigned int array_elements;
1106 const glsl_type *type;
1107
1108 if (ir->type->is_array()) {
1109 array_elements = ir->type->length;
1110 if (array_elements == 0) {
1111 fail("dereferenced array '%s' has length 0\n", ir->name);
1112 }
1113 type = ir->type->fields.array;
1114 } else {
1115 array_elements = 1;
1116 type = ir->type;
1117 }
1118
1119 glsl_interp_qualifier interpolation_mode =
1120 ir->determine_interpolation_mode(key->flat_shade);
1121
1122 int location = ir->data.location;
1123 for (unsigned int i = 0; i < array_elements; i++) {
1124 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1125 if (prog_data->urb_setup[location] == -1) {
1126 /* If there's no incoming setup data for this slot, don't
1127 * emit interpolation for it.
1128 */
1129 attr.reg_offset += type->vector_elements;
1130 location++;
1131 continue;
1132 }
1133
1134 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1135 /* Constant interpolation (flat shading) case. The SF has
1136 * handed us defined values in only the constant offset
1137 * field of the setup reg.
1138 */
1139 for (unsigned int k = 0; k < type->vector_elements; k++) {
1140 struct brw_reg interp = interp_reg(location, k);
1141 interp = suboffset(interp, 3);
1142 interp.type = reg->type;
1143 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1144 attr.reg_offset++;
1145 }
1146 } else {
1147 /* Smooth/noperspective interpolation case. */
1148 for (unsigned int k = 0; k < type->vector_elements; k++) {
1149 struct brw_reg interp = interp_reg(location, k);
1150 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1151 /* Get the pixel/sample mask into f0 so that we know
1152 * which pixels are lit. Then, for each channel that is
1153 * unlit, replace the centroid data with non-centroid
1154 * data.
1155 */
1156 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1157
1158 fs_inst *inst;
1159 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1160 false, false);
1161 inst->predicate = BRW_PREDICATE_NORMAL;
1162 inst->predicate_inverse = true;
1163 if (brw->has_pln)
1164 inst->no_dd_clear = true;
1165
1166 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1167 ir->data.centroid && !key->persample_shading,
1168 ir->data.sample || key->persample_shading);
1169 inst->predicate = BRW_PREDICATE_NORMAL;
1170 inst->predicate_inverse = false;
1171 if (brw->has_pln)
1172 inst->no_dd_check = true;
1173
1174 } else {
1175 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1176 ir->data.centroid && !key->persample_shading,
1177 ir->data.sample || key->persample_shading);
1178 }
1179 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1180 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1181 }
1182 attr.reg_offset++;
1183 }
1184
1185 }
1186 location++;
1187 }
1188 }
1189
1190 return reg;
1191 }
1192
1193 fs_reg *
1194 fs_visitor::emit_frontfacing_interpolation()
1195 {
1196 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::bool_type);
1197
1198 if (brw->gen >= 6) {
1199 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1200 * a boolean result from this (~0/true or 0/false).
1201 *
1202 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1203 * this task in only one instruction:
1204 * - a negation source modifier will flip the bit; and
1205 * - a W -> D type conversion will sign extend the bit into the high
1206 * word of the destination.
1207 *
1208 * An ASR 15 fills the low word of the destination.
1209 */
1210 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1211 g0.negate = true;
1212
1213 emit(ASR(*reg, g0, fs_reg(15)));
1214 } else {
1215 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1216 * a boolean result from this (1/true or 0/false).
1217 *
1218 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1219 * the negation source modifier to flip it. Unfortunately the SHR
1220 * instruction only operates on UD (or D with an abs source modifier)
1221 * sources without negation.
1222 *
1223 * Instead, use ASR (which will give ~0/true or 0/false) followed by an
1224 * AND 1.
1225 */
1226 fs_reg asr = fs_reg(this, glsl_type::bool_type);
1227 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1228 g1_6.negate = true;
1229
1230 emit(ASR(asr, g1_6, fs_reg(31)));
1231 emit(AND(*reg, asr, fs_reg(1)));
1232 }
1233
1234 return reg;
1235 }
1236
1237 void
1238 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1239 {
1240 assert(stage == MESA_SHADER_FRAGMENT);
1241 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1242 assert(dst.type == BRW_REGISTER_TYPE_F);
1243
1244 if (key->compute_pos_offset) {
1245 /* Convert int_sample_pos to floating point */
1246 emit(MOV(dst, int_sample_pos));
1247 /* Scale to the range [0, 1] */
1248 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
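      /* Illustrative: the payload encodes sample positions in sixteenths of a
       * pixel, so an integer position of 8 maps to 8 / 16 = 0.5 (the pixel
       * center) and 12 maps to 0.75.
       */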
1249 }
1250 else {
1251 /* From ARB_sample_shading specification:
1252 * "When rendering to a non-multisample buffer, or if multisample
1253 * rasterization is disabled, gl_SamplePosition will always be
1254       *   (0.5, 0.5)."
1255 */
1256 emit(MOV(dst, fs_reg(0.5f)));
1257 }
1258 }
1259
1260 fs_reg *
1261 fs_visitor::emit_samplepos_setup()
1262 {
1263 assert(brw->gen >= 6);
1264
1265 this->current_annotation = "compute sample position";
1266 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::vec2_type);
1267 fs_reg pos = *reg;
1268 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1269 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1270
1271 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1272 * mode will be enabled.
1273 *
1274 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1275 * R31.1:0 Position Offset X/Y for Slot[3:0]
1276 * R31.3:2 Position Offset X/Y for Slot[7:4]
1277 * .....
1278 *
1279 * The X, Y sample positions come in as bytes in thread payload. So, read
1280 * the positions using vstride=16, width=8, hstride=2.
1281 */
1282 struct brw_reg sample_pos_reg =
1283 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1284 BRW_REGISTER_TYPE_B), 16, 8, 2);
1285
1286 fs_inst *inst = emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1287 if (dispatch_width == 16) {
1288 inst->force_uncompressed = true;
1289 inst = emit(MOV(half(int_sample_x, 1),
1290 fs_reg(suboffset(sample_pos_reg, 16))));
1291 inst->force_sechalf = true;
1292 }
1293 /* Compute gl_SamplePosition.x */
1294 compute_sample_position(pos, int_sample_x);
1295 pos.reg_offset++;
1296 inst = emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1297 if (dispatch_width == 16) {
1298 inst->force_uncompressed = true;
1299 inst = emit(MOV(half(int_sample_y, 1),
1300 fs_reg(suboffset(sample_pos_reg, 17))));
1301 inst->force_sechalf = true;
1302 }
1303 /* Compute gl_SamplePosition.y */
1304 compute_sample_position(pos, int_sample_y);
1305 return reg;
1306 }
1307
1308 fs_reg *
1309 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1310 {
1311 assert(stage == MESA_SHADER_FRAGMENT);
1312 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1313 assert(brw->gen >= 6);
1314
1315 this->current_annotation = "compute sample id";
1316 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1317
1318 if (key->compute_sample_id) {
1319 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1320 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1321 t2.type = BRW_REGISTER_TYPE_UW;
1322
1323 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1324 * 8x multisampling, subspan 0 will represent sample N (where N
1325 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1326 * 7. We can find the value of N by looking at R0.0 bits 7:6
1327 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1328 * (since samples are always delivered in pairs). That is, we
1329 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1330 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1331 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1332 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1333 * populating a temporary variable with the sequence (0, 1, 2, 3),
1334 * and then reading from it using vstride=1, width=4, hstride=0.
1335 * These computations hold good for 4x multisampling as well.
1336 *
1337 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1338 * the first four slots are sample 0 of subspan 0; the next four
1339 * are sample 1 of subspan 0; the third group is sample 0 of
1340 * subspan 1, and finally sample 1 of subspan 1.
1341 */
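      /* A rough worked example of the math above: if R0.0 bits 7:6 read
       * 0b10, then (R0.0 & 0xc0) >> 5 == 4, so subspan 0 holds sample 4.
       * Adding the SIMD8 sequence (0, 0, 0, 0, 1, 1, 1, 1) then yields
       * sample ID 4 for the first subspan and 5 for the second.
       */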
1342 fs_inst *inst;
1343 inst = emit(BRW_OPCODE_AND, t1,
1344 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1345 fs_reg(0xc0));
1346 inst->force_writemask_all = true;
1347 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1348 inst->force_writemask_all = true;
1349 /* This works for both SIMD8 and SIMD16 */
1350 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1351 inst->force_writemask_all = true;
1352 /* This special instruction takes care of setting vstride=1,
1353 * width=4, hstride=0 of t2 during an ADD instruction.
1354 */
1355 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1356 } else {
1357 /* As per GL_ARB_sample_shading specification:
1358 * "When rendering to a non-multisample buffer, or if multisample
1359 * rasterization is disabled, gl_SampleID will always be zero."
1360 */
1361 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1362 }
1363
1364 return reg;
1365 }
1366
1367 fs_reg
1368 fs_visitor::fix_math_operand(fs_reg src)
1369 {
1370 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1371 * might be able to do better by doing execsize = 1 math and then
1372 * expanding that result out, but we would need to be careful with
1373 * masking.
1374 *
1375 * The hardware ignores source modifiers (negate and abs) on math
1376 * instructions, so we also move to a temp to set those up.
1377 */
1378 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1379 !src.abs && !src.negate)
1380 return src;
1381
1382 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1383 * operands to math
1384 */
1385 if (brw->gen >= 7 && src.file != IMM)
1386 return src;
1387
1388 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1389 expanded.type = src.type;
1390 emit(BRW_OPCODE_MOV, expanded, src);
1391 return expanded;
1392 }
1393
1394 fs_inst *
1395 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1396 {
1397 switch (opcode) {
1398 case SHADER_OPCODE_RCP:
1399 case SHADER_OPCODE_RSQ:
1400 case SHADER_OPCODE_SQRT:
1401 case SHADER_OPCODE_EXP2:
1402 case SHADER_OPCODE_LOG2:
1403 case SHADER_OPCODE_SIN:
1404 case SHADER_OPCODE_COS:
1405 break;
1406 default:
1407 unreachable("not reached: bad math opcode");
1408 }
1409
1410 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1411 * might be able to do better by doing execsize = 1 math and then
1412 * expanding that result out, but we would need to be careful with
1413 * masking.
1414 *
1415 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1416 * instructions, so we also move to a temp to set those up.
1417 */
1418 if (brw->gen == 6 || brw->gen == 7)
1419 src = fix_math_operand(src);
1420
1421 fs_inst *inst = emit(opcode, dst, src);
1422
1423 if (brw->gen < 6) {
1424 inst->base_mrf = 2;
1425 inst->mlen = dispatch_width / 8;
1426 }
1427
1428 return inst;
1429 }
1430
1431 fs_inst *
1432 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1433 {
1434 int base_mrf = 2;
1435 fs_inst *inst;
1436
1437 if (brw->gen >= 8) {
1438 inst = emit(opcode, dst, src0, src1);
1439 } else if (brw->gen >= 6) {
1440 src0 = fix_math_operand(src0);
1441 src1 = fix_math_operand(src1);
1442
1443 inst = emit(opcode, dst, src0, src1);
1444 } else {
1445 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1446 * "Message Payload":
1447 *
1448 * "Operand0[7]. For the INT DIV functions, this operand is the
1449 * denominator."
1450 * ...
1451 * "Operand1[7]. For the INT DIV functions, this operand is the
1452 * numerator."
1453 */
1454 bool is_int_div = opcode != SHADER_OPCODE_POW;
1455 fs_reg &op0 = is_int_div ? src1 : src0;
1456 fs_reg &op1 = is_int_div ? src0 : src1;
1457
1458 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1459 inst = emit(opcode, dst, op0, reg_null_f);
1460
1461 inst->base_mrf = base_mrf;
1462 inst->mlen = 2 * dispatch_width / 8;
1463 }
1464 return inst;
1465 }
1466
1467 void
1468 fs_visitor::assign_curb_setup()
1469 {
1470 if (dispatch_width == 8) {
1471 prog_data->dispatch_grf_start_reg = payload.num_regs;
1472 } else {
1473 assert(stage == MESA_SHADER_FRAGMENT);
1474 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1475 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1476 }
1477
1478 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1479
1480 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1481 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1482 for (unsigned int i = 0; i < inst->sources; i++) {
1483 if (inst->src[i].file == UNIFORM) {
1484 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1485 int constant_nr;
1486 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1487 constant_nr = push_constant_loc[uniform_nr];
1488 } else {
1489 /* Section 5.11 of the OpenGL 4.1 spec says:
1490 * "Out-of-bounds reads return undefined values, which include
1491 * values from other variables of the active program or zero."
1492 * Just return the first push constant.
1493 */
1494 constant_nr = 0;
1495 }
1496
1497 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1498 constant_nr / 8,
1499 constant_nr % 8);
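            /* For example (illustrative only): with payload.num_regs == 2,
             * push constant 11 lands in g3.3 -- register 2 + 11 / 8 == 3,
             * channel 11 % 8 == 3.
             */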
1500
1501 inst->src[i].file = HW_REG;
1502 inst->src[i].fixed_hw_reg = byte_offset(
1503 retype(brw_reg, inst->src[i].type),
1504 inst->src[i].subreg_offset);
1505 }
1506 }
1507 }
1508 }
1509
1510 void
1511 fs_visitor::calculate_urb_setup()
1512 {
1513 assert(stage == MESA_SHADER_FRAGMENT);
1514 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1515 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1516
1517 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1518 prog_data->urb_setup[i] = -1;
1519 }
1520
1521 int urb_next = 0;
1522 /* Figure out where each of the incoming setup attributes lands. */
1523 if (brw->gen >= 6) {
1524 if (_mesa_bitcount_64(prog->InputsRead &
1525 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1526 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1527 * first 16 varying inputs, so we can put them wherever we want.
1528 * Just put them in order.
1529 *
1530 * This is useful because it means that (a) inputs not used by the
1531 * fragment shader won't take up valuable register space, and (b) we
1532 * won't have to recompile the fragment shader if it gets paired with
1533 * a different vertex (or geometry) shader.
1534 */
1535 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1536 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1537 BITFIELD64_BIT(i)) {
1538 prog_data->urb_setup[i] = urb_next++;
1539 }
1540 }
1541 } else {
1542 /* We have enough input varyings that the SF/SBE pipeline stage can't
1543 * arbitrarily rearrange them to suit our whim; we have to put them
1544 * in an order that matches the output of the previous pipeline stage
1545 * (geometry or vertex shader).
1546 */
1547 struct brw_vue_map prev_stage_vue_map;
1548 brw_compute_vue_map(brw, &prev_stage_vue_map,
1549 key->input_slots_valid);
1550 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1551 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1552 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1553 slot++) {
1554 int varying = prev_stage_vue_map.slot_to_varying[slot];
1555 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1556 * unused.
1557 */
1558 if (varying != BRW_VARYING_SLOT_COUNT &&
1559 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1560 BITFIELD64_BIT(varying))) {
1561 prog_data->urb_setup[varying] = slot - first_slot;
1562 }
1563 }
1564 urb_next = prev_stage_vue_map.num_slots - first_slot;
1565 }
1566 } else {
1567 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1568 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1569 /* Point size is packed into the header, not as a general attribute */
1570 if (i == VARYING_SLOT_PSIZ)
1571 continue;
1572
1573 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1574 /* The back color slot is skipped when the front color is
1575 * also written to. In addition, some slots can be
1576 * written in the vertex shader and not read in the
1577 * fragment shader. So the register number must always be
1578 * incremented, mapped or not.
1579 */
1580 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1581 prog_data->urb_setup[i] = urb_next;
1582 urb_next++;
1583 }
1584 }
1585
1586 /*
1587  * It's an FS-only attribute, and we did the interpolation for this attribute
1588  * in the SF thread. So, count it here, too.
1589 *
1590 * See compile_sf_prog() for more info.
1591 */
1592 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1593 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1594 }
1595
1596 prog_data->num_varying_inputs = urb_next;
1597 }
1598
1599 void
1600 fs_visitor::assign_urb_setup()
1601 {
1602 assert(stage == MESA_SHADER_FRAGMENT);
1603 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1604
1605 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1606
1607    /* Offset all the urb_setup[] indices by the actual position of the
1608 * setup regs, now that the location of the constants has been chosen.
1609 */
1610 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1611 if (inst->opcode == FS_OPCODE_LINTERP) {
1612 assert(inst->src[2].file == HW_REG);
1613 inst->src[2].fixed_hw_reg.nr += urb_start;
1614 }
1615
1616 if (inst->opcode == FS_OPCODE_CINTERP) {
1617 assert(inst->src[0].file == HW_REG);
1618 inst->src[0].fixed_hw_reg.nr += urb_start;
1619 }
1620 }
1621
1622 /* Each attribute is 4 setup channels, each of which is half a reg. */
1623 this->first_non_payload_grf =
1624 urb_start + prog_data->num_varying_inputs * 2;
1625 }
1626
1627 /**
1628 * Split large virtual GRFs into separate components if we can.
1629 *
1630  * This mostly duplicates what brw_fs_vector_splitting does,
1631 * but that's really conservative because it's afraid of doing
1632 * splitting that doesn't result in real progress after the rest of
1633 * the optimization phases, which would cause infinite looping in
1634 * optimization. We can do it once here, safely. This also has the
1635 * opportunity to split interpolated values, or maybe even uniforms,
1636 * which we don't have at the IR level.
1637 *
1638 * We want to split, because virtual GRFs are what we register
1639 * allocate and spill (due to contiguousness requirements for some
1640 * instructions), and they're what we naturally generate in the
1641 * codegen process, but most virtual GRFs don't actually need to be
1642 * contiguous sets of GRFs. If we split, we'll end up with reduced
1643 * live intervals and better dead code elimination and coalescing.
1644 */
1645 void
1646 fs_visitor::split_virtual_grfs()
1647 {
1648 int num_vars = this->virtual_grf_count;
1649 bool split_grf[num_vars];
1650 int new_virtual_grf[num_vars];
1651
1652 /* Try to split anything > 0 sized. */
1653 for (int i = 0; i < num_vars; i++) {
1654 if (this->virtual_grf_sizes[i] != 1)
1655 split_grf[i] = true;
1656 else
1657 split_grf[i] = false;
1658 }
1659
1660 if (brw->has_pln &&
1661 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1662 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1663 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1664 * Gen6, that was the only supported interpolation mode, and since Gen6,
1665 * delta_x and delta_y are in fixed hardware registers.
1666 */
1667 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1668 false;
1669 }
1670
1671 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1672 /* If there's a SEND message that requires contiguous destination
1673 * registers, no splitting is allowed.
1674 */
1675 if (inst->regs_written > 1) {
1676 split_grf[inst->dst.reg] = false;
1677 }
1678
1679 /* If we're sending from a GRF, don't split it, on the assumption that
1680 * the send is reading the whole thing.
1681 */
1682 if (inst->is_send_from_grf()) {
1683 for (int i = 0; i < inst->sources; i++) {
1684 if (inst->src[i].file == GRF) {
1685 split_grf[inst->src[i].reg] = false;
1686 }
1687 }
1688 }
1689 }
1690
1691 /* Allocate new space for split regs. Note that the virtual
1692 * numbers will be contiguous.
1693 */
1694 for (int i = 0; i < num_vars; i++) {
1695 if (split_grf[i]) {
1696 new_virtual_grf[i] = virtual_grf_alloc(1);
1697 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1698 int reg = virtual_grf_alloc(1);
1699 assert(reg == new_virtual_grf[i] + j - 1);
1700 (void) reg;
1701 }
1702 this->virtual_grf_sizes[i] = 1;
1703 }
1704 }
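   /* Illustrative mapping after the allocation above: a size-4 VGRF n that
    * got split keeps reg_offset 0 in n itself (now size 1), while offsets
    * 1, 2 and 3 are rewritten below to new_virtual_grf[n], new_virtual_grf[n] + 1
    * and new_virtual_grf[n] + 2.
    */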
1705
1706 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1707 if (inst->dst.file == GRF &&
1708 split_grf[inst->dst.reg] &&
1709 inst->dst.reg_offset != 0) {
1710 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1711 inst->dst.reg_offset - 1);
1712 inst->dst.reg_offset = 0;
1713 }
1714 for (int i = 0; i < inst->sources; i++) {
1715 if (inst->src[i].file == GRF &&
1716 split_grf[inst->src[i].reg] &&
1717 inst->src[i].reg_offset != 0) {
1718 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1719 inst->src[i].reg_offset - 1);
1720 inst->src[i].reg_offset = 0;
1721 }
1722 }
1723 }
1724 invalidate_live_intervals();
1725 }
1726
1727 /**
1728 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1729 *
1730 * During code generation, we create tons of temporary variables, many of
1731 * which get immediately killed and are never used again. Yet, in later
1732 * optimization and analysis passes, such as compute_live_intervals, we need
1733 * to loop over all the virtual GRFs. Compacting them can save a lot of
1734 * overhead.
1735 */
1736 void
1737 fs_visitor::compact_virtual_grfs()
1738 {
1739 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER))
1740 return;
1741
1742 /* Mark which virtual GRFs are used, and count how many. */
1743 int remap_table[this->virtual_grf_count];
1744 memset(remap_table, -1, sizeof(remap_table));
1745
1746 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1747 if (inst->dst.file == GRF)
1748 remap_table[inst->dst.reg] = 0;
1749
1750 for (int i = 0; i < inst->sources; i++) {
1751 if (inst->src[i].file == GRF)
1752 remap_table[inst->src[i].reg] = 0;
1753 }
1754 }
1755
1756 /* Compact the GRF arrays. */
1757 int new_index = 0;
1758 for (int i = 0; i < this->virtual_grf_count; i++) {
1759 if (remap_table[i] != -1) {
1760 remap_table[i] = new_index;
1761 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1762 invalidate_live_intervals();
1763 ++new_index;
1764 }
1765 }
1766
1767 this->virtual_grf_count = new_index;
1768
1769 /* Patch all the instructions to use the newly renumbered registers */
1770 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1771 if (inst->dst.file == GRF)
1772 inst->dst.reg = remap_table[inst->dst.reg];
1773
1774 for (int i = 0; i < inst->sources; i++) {
1775 if (inst->src[i].file == GRF)
1776 inst->src[i].reg = remap_table[inst->src[i].reg];
1777 }
1778 }
1779
1780 /* Patch all the references to delta_x/delta_y, since they're used in
1781 * register allocation. If they're unused, switch them to BAD_FILE so
1782 * we don't think some random VGRF is delta_x/delta_y.
1783 */
1784 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
1785 if (delta_x[i].file == GRF) {
1786 if (remap_table[delta_x[i].reg] != -1) {
1787 delta_x[i].reg = remap_table[delta_x[i].reg];
1788 } else {
1789 delta_x[i].file = BAD_FILE;
1790 }
1791 }
1792 }
1793 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
1794 if (delta_y[i].file == GRF) {
1795 if (remap_table[delta_y[i].reg] != -1) {
1796 delta_y[i].reg = remap_table[delta_y[i].reg];
1797 } else {
1798 delta_y[i].file = BAD_FILE;
1799 }
1800 }
1801 }
1802 }
1803
1804 /*
1805 * Implements array access of uniforms by inserting a
1806 * PULL_CONSTANT_LOAD instruction.
1807 *
1808 * Unlike temporary GRF array access (where we don't support it due to
1809 * the difficulty of doing relative addressing on instruction
1810 * destinations), we could potentially do array access of uniforms
1811 * that were loaded in GRF space as push constants. In real-world
1812 * usage we've seen, though, the arrays being used are always larger
1813 * than we could load as push constants, so just always move all
1814 * uniform array access out to a pull constant buffer.
1815 */
1816 void
1817 fs_visitor::move_uniform_array_access_to_pull_constants()
1818 {
1819 if (dispatch_width != 8)
1820 return;
1821
1822 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1823
1824 for (unsigned int i = 0; i < uniforms; i++) {
1825 pull_constant_loc[i] = -1;
1826 }
1827
1828 /* Walk through and find array access of uniforms. Put a copy of that
1829 * uniform in the pull constant buffer.
1830 *
1831 * Note that we don't move constant-indexed accesses to arrays. No
1832 * testing has been done of the performance impact of this choice.
1833 */
1834 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
1835 for (int i = 0 ; i < inst->sources; i++) {
1836 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1837 continue;
1838
1839 int uniform = inst->src[i].reg;
1840
1841 /* If this array isn't already present in the pull constant buffer,
1842 * add it.
1843 */
1844 if (pull_constant_loc[uniform] == -1) {
1845 const gl_constant_value **values = &stage_prog_data->param[uniform];
1846
1847 assert(param_size[uniform]);
1848
1849 for (int j = 0; j < param_size[uniform]; j++) {
1850 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1851
1852 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1853 values[j];
1854 }
1855 }
1856 }
1857 }
1858 }
1859
1860 /**
1861 * Assign UNIFORM file registers to either push constants or pull constants.
1862 *
1863 * We allow a fragment shader to have more than the specification's
1864 * minimum value for the maximum number of fragment shader uniform
1865 * components (64). If there are too many of these, they'd fill up all
1866 * of register space. So, this will push some of them out to the pull
1867 * constant buffer and update the program to load them from there.
1868 */
1869 void
1870 fs_visitor::assign_constant_locations()
1871 {
1872 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1873 if (dispatch_width != 8)
1874 return;
1875
1876 /* Find which UNIFORM registers are still in use. */
1877 bool is_live[uniforms];
1878 for (unsigned int i = 0; i < uniforms; i++) {
1879 is_live[i] = false;
1880 }
1881
1882 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1883 for (int i = 0; i < inst->sources; i++) {
1884 if (inst->src[i].file != UNIFORM)
1885 continue;
1886
1887 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1888 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1889 is_live[constant_nr] = true;
1890 }
1891 }
1892
1893 /* Only allow 16 registers (128 uniform components) as push constants.
1894 *
1895 * Just demote the end of the list. We could probably do better
1896 * here, demoting things that are rarely used in the program first.
1897 *
1898 * If changing this value, note the limitation about total_regs in
1899 * brw_curbe.c.
1900 */
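/* Worked example (illustrative): with 200 live, not-yet-pulled uniform
 * components and the 16 * 8 = 128 component budget below, the first 128
 * get ascending push_constant_loc slots and the remaining 72 are appended
 * to pull_param[]; param[] is then condensed to just the pushed entries.
 */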
1901 unsigned int max_push_components = 16 * 8;
1902 unsigned int num_push_constants = 0;
1903
1904 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1905
1906 for (unsigned int i = 0; i < uniforms; i++) {
1907 if (!is_live[i] || pull_constant_loc[i] != -1) {
1908 /* This UNIFORM register is either dead, or has already been demoted
1909 * to a pull const. Mark it as no longer living in the param[] array.
1910 */
1911 push_constant_loc[i] = -1;
1912 continue;
1913 }
1914
1915 if (num_push_constants < max_push_components) {
1916 /* Retain as a push constant. Record the location in the params[]
1917 * array.
1918 */
1919 push_constant_loc[i] = num_push_constants++;
1920 } else {
1921 /* Demote to a pull constant. */
1922 push_constant_loc[i] = -1;
1923
1924 int pull_index = stage_prog_data->nr_pull_params++;
1925 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1926 pull_constant_loc[i] = pull_index;
1927 }
1928 }
1929
1930 stage_prog_data->nr_params = num_push_constants;
1931
1932 /* Up until now, the param[] array has been indexed by reg + reg_offset
1933 * of UNIFORM registers. Condense it to only contain the uniforms we
1934 * chose to upload as push constants.
1935 */
1936 for (unsigned int i = 0; i < uniforms; i++) {
1937 int remapped = push_constant_loc[i];
1938
1939 if (remapped == -1)
1940 continue;
1941
1942 assert(remapped <= (int)i);
1943 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1944 }
1945 }
1946
1947 /**
1948 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1949 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1950 */
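/* Worked example (illustrative, not from the original source): a source
 * reading a uniform whose pull_constant_loc is 7 becomes a
 * UNIFORM_PULL_CONSTANT_LOAD from byte offset 16 (7 * 4 rounded down to a
 * 16-byte block) into a float temp, with set_smear(7 & 3) picking the
 * fourth dword of that block as the rewritten source.
 */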
1951 void
1952 fs_visitor::demote_pull_constants()
1953 {
1954 foreach_block_and_inst (block, fs_inst, inst, cfg) {
1955 for (int i = 0; i < inst->sources; i++) {
1956 if (inst->src[i].file != UNIFORM)
1957 continue;
1958
1959 int pull_index = pull_constant_loc[inst->src[i].reg +
1960 inst->src[i].reg_offset];
1961 if (pull_index == -1)
1962 continue;
1963
1964 /* Set up the annotation tracking for newly generated instructions. */
1965 base_ir = inst->ir;
1966 current_annotation = inst->annotation;
1967
1968 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1969 fs_reg dst = fs_reg(this, glsl_type::float_type);
1970
1971 /* Generate a pull load into dst. */
1972 if (inst->src[i].reladdr) {
1973 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
1974 surf_index,
1975 *inst->src[i].reladdr,
1976 pull_index);
1977 inst->insert_before(block, &list);
1978 inst->src[i].reladdr = NULL;
1979 } else {
1980 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1981 fs_inst *pull =
1982 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1983 dst, surf_index, offset);
1984 inst->insert_before(block, pull);
1985 inst->src[i].set_smear(pull_index & 3);
1986 }
1987
1988 /* Rewrite the instruction to use the temporary VGRF. */
1989 inst->src[i].file = GRF;
1990 inst->src[i].reg = dst.reg;
1991 inst->src[i].reg_offset = 0;
1992 }
1993 }
1994 invalidate_live_intervals();
1995 }
1996
1997 bool
1998 fs_visitor::opt_algebraic()
1999 {
2000 bool progress = false;
2001
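/* Illustrative examples of the rewrites below (not from the original
 * source): "mul vgrf2, vgrf1, 1.0f" becomes "mov vgrf2, vgrf1",
 * "mul vgrf2, vgrf1, 0.0f" becomes "mov vgrf2, 0.0f", and
 * "add vgrf2, vgrf1, 0.0f" becomes "mov vgrf2, vgrf1"; copy propagation
 * and dead code elimination then clean up the resulting MOVs.
 */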
2002 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2003 switch (inst->opcode) {
2004 case BRW_OPCODE_MUL:
2005 if (inst->src[1].file != IMM)
2006 continue;
2007
2008 /* a * 1.0 = a */
2009 if (inst->src[1].is_one()) {
2010 inst->opcode = BRW_OPCODE_MOV;
2011 inst->src[1] = reg_undef;
2012 progress = true;
2013 break;
2014 }
2015
2016 /* a * 0.0 = 0.0 */
2017 if (inst->src[1].is_zero()) {
2018 inst->opcode = BRW_OPCODE_MOV;
2019 inst->src[0] = inst->src[1];
2020 inst->src[1] = reg_undef;
2021 progress = true;
2022 break;
2023 }
2024
2025 break;
2026 case BRW_OPCODE_ADD:
2027 if (inst->src[1].file != IMM)
2028 continue;
2029
2030 /* a + 0.0 = a */
2031 if (inst->src[1].is_zero()) {
2032 inst->opcode = BRW_OPCODE_MOV;
2033 inst->src[1] = reg_undef;
2034 progress = true;
2035 break;
2036 }
2037 break;
2038 case BRW_OPCODE_OR:
2039 if (inst->src[0].equals(inst->src[1])) {
2040 inst->opcode = BRW_OPCODE_MOV;
2041 inst->src[1] = reg_undef;
2042 progress = true;
2043 break;
2044 }
2045 break;
2046 case BRW_OPCODE_LRP:
2047 if (inst->src[1].equals(inst->src[2])) {
2048 inst->opcode = BRW_OPCODE_MOV;
2049 inst->src[0] = inst->src[1];
2050 inst->src[1] = reg_undef;
2051 inst->src[2] = reg_undef;
2052 progress = true;
2053 break;
2054 }
2055 break;
2056 case BRW_OPCODE_SEL:
2057 if (inst->src[0].equals(inst->src[1])) {
2058 inst->opcode = BRW_OPCODE_MOV;
2059 inst->src[1] = reg_undef;
2060 inst->predicate = BRW_PREDICATE_NONE;
2061 inst->predicate_inverse = false;
2062 progress = true;
2063 } else if (inst->saturate && inst->src[1].file == IMM) {
2064 switch (inst->conditional_mod) {
2065 case BRW_CONDITIONAL_LE:
2066 case BRW_CONDITIONAL_L:
2067 switch (inst->src[1].type) {
2068 case BRW_REGISTER_TYPE_F:
2069 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2070 inst->opcode = BRW_OPCODE_MOV;
2071 inst->src[1] = reg_undef;
2072 progress = true;
2073 }
2074 break;
2075 default:
2076 break;
2077 }
2078 break;
2079 case BRW_CONDITIONAL_GE:
2080 case BRW_CONDITIONAL_G:
2081 switch (inst->src[1].type) {
2082 case BRW_REGISTER_TYPE_F:
2083 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2084 inst->opcode = BRW_OPCODE_MOV;
2085 inst->src[1] = reg_undef;
2086 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2087 progress = true;
2088 }
2089 break;
2090 default:
2091 break;
2092 }
2093 default:
2094 break;
2095 }
2096 }
2097 break;
2098 default:
2099 break;
2100 }
2101 }
2102
2103 return progress;
2104 }
2105
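/* Illustrative note (not from the original source): when a size-1 VGRF is
 * completely re-written outside control flow, the pass below gives the new
 * definition a fresh register number, e.g.
 *   mov vgrf3, u0 ... mov vgrf3, u1 ; add vgrf5, vgrf3, vgrf4
 * becomes
 *   mov vgrf3, u0 ... mov vgrf9, u1 ; add vgrf5, vgrf9, vgrf4
 * (vgrf9 standing in for whatever virtual_grf_alloc returns), breaking
 * false dependencies for the scheduler and register coalescing.
 */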
2106 bool
2107 fs_visitor::opt_register_renaming()
2108 {
2109 bool progress = false;
2110 int depth = 0;
2111
2112 int remap[virtual_grf_count];
2113 memset(remap, -1, sizeof(int) * virtual_grf_count);
2114
2115 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2116 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2117 depth++;
2118 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2119 inst->opcode == BRW_OPCODE_WHILE) {
2120 depth--;
2121 }
2122
2123 /* Rewrite instruction sources. */
2124 for (int i = 0; i < inst->sources; i++) {
2125 if (inst->src[i].file == GRF &&
2126 remap[inst->src[i].reg] != -1 &&
2127 remap[inst->src[i].reg] != inst->src[i].reg) {
2128 inst->src[i].reg = remap[inst->src[i].reg];
2129 progress = true;
2130 }
2131 }
2132
2133 const int dst = inst->dst.reg;
2134
2135 if (depth == 0 &&
2136 inst->dst.file == GRF &&
2137 virtual_grf_sizes[inst->dst.reg] == 1 &&
2138 !inst->is_partial_write()) {
2139 if (remap[dst] == -1) {
2140 remap[dst] = dst;
2141 } else {
2142 remap[dst] = virtual_grf_alloc(1);
2143 inst->dst.reg = remap[dst];
2144 progress = true;
2145 }
2146 } else if (inst->dst.file == GRF &&
2147 remap[dst] != -1 &&
2148 remap[dst] != dst) {
2149 inst->dst.reg = remap[dst];
2150 progress = true;
2151 }
2152 }
2153
2154 if (progress) {
2155 invalidate_live_intervals();
2156
2157 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2158 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2159 delta_x[i].reg = remap[delta_x[i].reg];
2160 }
2161 }
2162 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2163 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2164 delta_y[i].reg = remap[delta_y[i].reg];
2165 }
2166 }
2167 }
2168
2169 return progress;
2170 }
2171
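/* Illustrative example (not from the original source): compute-to-MRF turns
 *   add vgrf5, vgrf1, vgrf2
 *   mov m3, vgrf5
 * into
 *   add m3, vgrf1, vgrf2
 * when vgrf5 is not read again afterwards, eliminating the copy into the
 * message register.
 */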
2172 bool
2173 fs_visitor::compute_to_mrf()
2174 {
2175 bool progress = false;
2176 int next_ip = 0;
2177
2178 calculate_live_intervals();
2179
2180 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2181 int ip = next_ip;
2182 next_ip++;
2183
2184 if (inst->opcode != BRW_OPCODE_MOV ||
2185 inst->is_partial_write() ||
2186 inst->dst.file != MRF || inst->src[0].file != GRF ||
2187 inst->dst.type != inst->src[0].type ||
2188 inst->src[0].abs || inst->src[0].negate ||
2189 !inst->src[0].is_contiguous() ||
2190 inst->src[0].subreg_offset)
2191 continue;
2192
2193 /* Work out which hardware MRF registers are written by this
2194 * instruction.
2195 */
2196 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2197 int mrf_high;
2198 if (inst->dst.reg & BRW_MRF_COMPR4) {
2199 mrf_high = mrf_low + 4;
2200 } else if (dispatch_width == 16 &&
2201 (!inst->force_uncompressed && !inst->force_sechalf)) {
2202 mrf_high = mrf_low + 1;
2203 } else {
2204 mrf_high = mrf_low;
2205 }
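/* Reading of the code above (illustrative): a SIMD16 write to m2 that isn't
 * split into halves covers m2..m3 (mrf_high = mrf_low + 1), a COMPR4 write
 * to m2 covers m2 and m6 (mrf_high = mrf_low + 4), and any other write
 * touches a single MRF.
 */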
2206
2207 /* Can't compute-to-MRF this GRF if someone else was going to
2208 * read it later.
2209 */
2210 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2211 continue;
2212
2213 /* Found a move of a GRF to a MRF. Let's see if we can go
2214 * rewrite the thing that made this GRF to write into the MRF.
2215 */
2216 fs_inst *scan_inst;
2217 for (scan_inst = (fs_inst *)inst->prev;
2218 !scan_inst->is_head_sentinel();
2219 scan_inst = (fs_inst *)scan_inst->prev) {
2220 if (scan_inst->dst.file == GRF &&
2221 scan_inst->dst.reg == inst->src[0].reg) {
2222 /* Found the last thing to write our reg we want to turn
2223 * into a compute-to-MRF.
2224 */
2225
2226 /* If this one instruction didn't populate all the
2227 * channels, bail. We might be able to rewrite everything
2228 * that writes that reg, but it would require smarter
2229 * tracking to delay the rewriting until complete success.
2230 */
2231 if (scan_inst->is_partial_write())
2232 break;
2233
2234 /* Things returning more than one register would need us to
2235 * understand coalescing out more than one MOV at a time.
2236 */
2237 if (scan_inst->regs_written > 1)
2238 break;
2239
2240 /* SEND instructions can't have MRF as a destination. */
2241 if (scan_inst->mlen)
2242 break;
2243
2244 if (brw->gen == 6) {
2245 /* gen6 math instructions must have the destination be
2246 * GRF, so no compute-to-MRF for them.
2247 */
2248 if (scan_inst->is_math()) {
2249 break;
2250 }
2251 }
2252
2253 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2254 /* Found the creator of our MRF's source value. */
2255 scan_inst->dst.file = MRF;
2256 scan_inst->dst.reg = inst->dst.reg;
2257 scan_inst->saturate |= inst->saturate;
2258 inst->remove(block);
2259 progress = true;
2260 }
2261 break;
2262 }
2263
2264 /* We don't handle control flow here. Most computation of
2265 * values that end up in MRFs happens shortly before the MRF
2266 * write anyway.
2267 */
2268 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2269 break;
2270
2271 /* You can't read from an MRF, so if someone else reads our
2272 * MRF's source GRF that we wanted to rewrite, that stops us.
2273 */
2274 bool interfered = false;
2275 for (int i = 0; i < scan_inst->sources; i++) {
2276 if (scan_inst->src[i].file == GRF &&
2277 scan_inst->src[i].reg == inst->src[0].reg &&
2278 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2279 interfered = true;
2280 }
2281 }
2282 if (interfered)
2283 break;
2284
2285 if (scan_inst->dst.file == MRF) {
2286 /* If somebody else writes our MRF here, we can't
2287 * compute-to-MRF before that.
2288 */
2289 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2290 int scan_mrf_high;
2291
2292 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2293 scan_mrf_high = scan_mrf_low + 4;
2294 } else if (dispatch_width == 16 &&
2295 (!scan_inst->force_uncompressed &&
2296 !scan_inst->force_sechalf)) {
2297 scan_mrf_high = scan_mrf_low + 1;
2298 } else {
2299 scan_mrf_high = scan_mrf_low;
2300 }
2301
2302 if (mrf_low == scan_mrf_low ||
2303 mrf_low == scan_mrf_high ||
2304 mrf_high == scan_mrf_low ||
2305 mrf_high == scan_mrf_high) {
2306 break;
2307 }
2308 }
2309
2310 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2311 /* Found a SEND instruction, which means that there are
2312 * live values in MRFs from base_mrf to base_mrf +
2313 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2314 * above it.
2315 */
2316 if (mrf_low >= scan_inst->base_mrf &&
2317 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2318 break;
2319 }
2320 if (mrf_high >= scan_inst->base_mrf &&
2321 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2322 break;
2323 }
2324 }
2325 }
2326 }
2327
2328 if (progress)
2329 invalidate_live_intervals();
2330
2331 return progress;
2332 }
2333
2334 /**
2335 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2336 * instructions to FS_OPCODE_REP_FB_WRITE.
2337 */
2338 void
2339 fs_visitor::try_rep_send()
2340 {
2341 int i, count;
2342 fs_inst *start = NULL;
2343 bblock_t *mov_block;
2344
2345 /* From the Ivybridge PRM, Volume 4 Part 1, section 3.9.11.2
2346 * ("Message Descriptor - Render Target Write"):
2347 *
2348 * "SIMD16_REPDATA message must not be used in SIMD8 pixel-shaders."
2349 */
2350 if (dispatch_width != 16)
2351 return;
2352
2353 /* The constant color write message can't handle anything but the 4 color
2354 * values. We could do MRT, but the loops below would need to understand
2355 * handling the header being enabled or disabled on different messages. It
2356 * also requires that the render target be tiled, which might not be the
2357 * case for some EGLImage paths or if we some day do rendering to PBOs.
2358 */
2359 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH) ||
2360 payload.aa_dest_stencil_reg ||
2361 payload.dest_depth_reg ||
2362 dual_src_output.file != BAD_FILE)
2363 return;
2364
2365 /* The optimization is implemented as one pass through the instruction
2366 * list. We keep track of the most recent block of MOVs into sequential
2367 * MRFs from single, sequential float registers (i.e., uniforms). Then when
2368 * we find an FB_WRITE opcode, we see if the payload registers match the
2369 * destination registers in our block of MOVs.
2370 */
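/* Sketch of the rewrite (illustrative): four "mov m(N + 2k), gM+k" MOVs
 * followed by an FB_WRITE whose payload (after any header) starts at mN
 * collapse into a single vec4 MOV into mN plus a REP_FB_WRITE with the
 * message length reduced by 7.
 */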
2371 count = 0;
2372 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2373 if (count == 0) {
2374 start = inst;
2375 mov_block = block;
2376 }
2377 if (inst->opcode == BRW_OPCODE_MOV &&
2378 inst->dst.file == MRF &&
2379 inst->dst.reg == start->dst.reg + 2 * count &&
2380 inst->src[0].file == HW_REG &&
2381 inst->src[0].reg_offset == start->src[0].reg_offset + count) {
2382 if (count == 0) {
2383 start = inst;
2384 mov_block = block;
2385 }
2386 count++;
2387 }
2388
2389 if (inst->opcode == FS_OPCODE_FB_WRITE &&
2390 count == 4 &&
2391 (inst->base_mrf == start->dst.reg ||
2392 (inst->base_mrf + 2 == start->dst.reg && inst->header_present))) {
2393 fs_inst *mov = MOV(start->dst, start->src[0]);
2394
2395 /* Make a MOV that moves the four floats into the replicated write
2396 * payload. Since we're running at the very end of code generation
2397 * we can use hw registers and generate the stride and offsets we
2398 * need for this MOV. We use the first of the eight registers
2399 * allocated for the SIMD16 payload for the four floats.
2400 */
2401 mov->dst.fixed_hw_reg =
2402 brw_vec4_reg(BRW_MESSAGE_REGISTER_FILE,
2403 start->dst.reg, 0);
2404 mov->dst.file = HW_REG;
2405 mov->dst.type = mov->dst.fixed_hw_reg.type;
2406
2407 mov->src[0].fixed_hw_reg =
2408 brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2409 mov->src[0].file = HW_REG;
2410 mov->src[0].type = mov->src[0].fixed_hw_reg.type;
2411 mov->force_writemask_all = true;
2412 mov->dst.type = BRW_REGISTER_TYPE_F;
2413
2414 /* Replace the four MOVs with the new vec4 MOV. */
2415 start->insert_before(mov_block, mov);
2416 for (i = 0; i < 4; i++)
2417 ((fs_inst *) mov->next)->remove(mov_block);
2418
2419 /* Finally, adjust the message length and set the opcode to
2420 * REP_FB_WRITE for the send, so that the generator will use the
2421 * replicated data message type. Then reset count so we'll start
2422 * looking for a new block in case we're in an MRT shader.
2423 */
2424 inst->opcode = FS_OPCODE_REP_FB_WRITE;
2425 inst->mlen -= 7;
2426 count = 0;
2427 }
2428 }
2429
2430 return;
2431 }
2432
2433 /**
2434 * Walks through basic blocks, looking for repeated MRF writes and
2435 * removing the later ones.
2436 */
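/* e.g. (illustrative): two identical "mov m3, vgrf5" instructions in the
 * same block with no intervening write to m3 or vgrf5 and no intervening
 * control flow: the second MOV is removed.
 */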
2437 bool
2438 fs_visitor::remove_duplicate_mrf_writes()
2439 {
2440 fs_inst *last_mrf_move[16];
2441 bool progress = false;
2442
2443 /* We'd need to update the MRF tracking for compressed instructions, so skip SIMD16. */
2444 if (dispatch_width == 16)
2445 return false;
2446
2447 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2448
2449 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2450 if (inst->is_control_flow()) {
2451 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2452 }
2453
2454 if (inst->opcode == BRW_OPCODE_MOV &&
2455 inst->dst.file == MRF) {
2456 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2457 if (prev_inst && inst->equals(prev_inst)) {
2458 inst->remove(block);
2459 progress = true;
2460 continue;
2461 }
2462 }
2463
2464 /* Clear out the last-write records for MRFs that were overwritten. */
2465 if (inst->dst.file == MRF) {
2466 last_mrf_move[inst->dst.reg] = NULL;
2467 }
2468
2469 if (inst->mlen > 0 && inst->base_mrf != -1) {
2470 /* Found a SEND instruction, which will include two or fewer
2471 * implied MRF writes. We could do better here.
2472 */
2473 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2474 last_mrf_move[inst->base_mrf + i] = NULL;
2475 }
2476 }
2477
2478 /* Clear out any MRF move records whose sources got overwritten. */
2479 if (inst->dst.file == GRF) {
2480 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2481 if (last_mrf_move[i] &&
2482 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2483 last_mrf_move[i] = NULL;
2484 }
2485 }
2486 }
2487
2488 if (inst->opcode == BRW_OPCODE_MOV &&
2489 inst->dst.file == MRF &&
2490 inst->src[0].file == GRF &&
2491 !inst->is_partial_write()) {
2492 last_mrf_move[inst->dst.reg] = inst;
2493 }
2494 }
2495
2496 if (progress)
2497 invalidate_live_intervals();
2498
2499 return progress;
2500 }
2501
2502 static void
2503 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2504 int first_grf, int grf_len)
2505 {
2506 bool inst_simd16 = (dispatch_width > 8 &&
2507 !inst->force_uncompressed &&
2508 !inst->force_sechalf);
2509
2510 /* Clear the flag for registers that actually got read (as expected). */
2511 for (int i = 0; i < inst->sources; i++) {
2512 int grf;
2513 if (inst->src[i].file == GRF) {
2514 grf = inst->src[i].reg;
2515 } else if (inst->src[i].file == HW_REG &&
2516 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2517 grf = inst->src[i].fixed_hw_reg.nr;
2518 } else {
2519 continue;
2520 }
2521
2522 if (grf >= first_grf &&
2523 grf < first_grf + grf_len) {
2524 deps[grf - first_grf] = false;
2525 if (inst_simd16)
2526 deps[grf - first_grf + 1] = false;
2527 }
2528 }
2529 }
2530
2531 /**
2532 * Implements this workaround for the original 965:
2533 *
2534 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2535 * check for post destination dependencies on this instruction, software
2536 * must ensure that there is no destination hazard for the case of ‘write
2537 * followed by a posted write’ shown in the following example.
2538 *
2539 * 1. mov r3 0
2540 * 2. send r3.xy <rest of send instruction>
2541 * 3. mov r2 r3
2542 *
2543 * Due to no post-destination dependency check on the ‘send’, the above
2544 * code sequence could have two instructions (1 and 2) in flight at the
2545 * same time that both consider ‘r3’ as the target of their final writes.
2546 */
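/* Sketch of the mitigation below (illustrative): starting from the send,
 * walk backwards looking for earlier, still-unread writes to the send's
 * destination registers and insert a DEP_RESOLVE_MOV referencing each such
 * register just before the send, so the earlier write must retire before
 * the posted write can land on top of it.
 */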
2547 void
2548 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2549 fs_inst *inst)
2550 {
2551 int reg_size = dispatch_width / 8;
2552 int write_len = inst->regs_written * reg_size;
2553 int first_write_grf = inst->dst.reg;
2554 bool needs_dep[BRW_MAX_MRF];
2555 assert(write_len < (int)sizeof(needs_dep) - 1);
2556
2557 memset(needs_dep, false, sizeof(needs_dep));
2558 memset(needs_dep, true, write_len);
2559
2560 clear_deps_for_inst_src(inst, dispatch_width,
2561 needs_dep, first_write_grf, write_len);
2562
2563 /* Walk backwards looking for writes to registers we're writing which
2564 * aren't read since being written. If we hit the start of the program,
2565 * we assume that there are no outstanding dependencies on entry to the
2566 * program.
2567 */
2568 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2569 !scan_inst->is_head_sentinel();
2570 scan_inst = (fs_inst *)scan_inst->prev) {
2571
2572 /* If we hit control flow, assume that there *are* outstanding
2573 * dependencies, and force their cleanup before our instruction.
2574 */
2575 if (scan_inst->is_control_flow()) {
2576 for (int i = 0; i < write_len; i++) {
2577 if (needs_dep[i]) {
2578 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2579 }
2580 }
2581 return;
2582 }
2583
2584 bool scan_inst_simd16 = (dispatch_width > 8 &&
2585 !scan_inst->force_uncompressed &&
2586 !scan_inst->force_sechalf);
2587
2588 /* We insert our reads as late as possible on the assumption that any
2589 * instruction but a MOV that might have left us an outstanding
2590 * dependency has more latency than a MOV.
2591 */
2592 if (scan_inst->dst.file == GRF) {
2593 for (int i = 0; i < scan_inst->regs_written; i++) {
2594 int reg = scan_inst->dst.reg + i * reg_size;
2595
2596 if (reg >= first_write_grf &&
2597 reg < first_write_grf + write_len &&
2598 needs_dep[reg - first_write_grf]) {
2599 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2600 needs_dep[reg - first_write_grf] = false;
2601 if (scan_inst_simd16)
2602 needs_dep[reg - first_write_grf + 1] = false;
2603 }
2604 }
2605 }
2606
2607 /* Clear the flag for registers that actually got read (as expected). */
2608 clear_deps_for_inst_src(scan_inst, dispatch_width,
2609 needs_dep, first_write_grf, write_len);
2610
2611 /* Continue the loop only if we haven't resolved all the dependencies */
2612 int i;
2613 for (i = 0; i < write_len; i++) {
2614 if (needs_dep[i])
2615 break;
2616 }
2617 if (i == write_len)
2618 return;
2619 }
2620 }
2621
2622 /**
2623 * Implements this workaround for the original 965:
2624 *
2625 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2626 * used as a destination register until after it has been sourced by an
2627 * instruction with a different destination register.
2628 */
2629 void
2630 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2631 {
2632 int write_len = inst->regs_written * dispatch_width / 8;
2633 int first_write_grf = inst->dst.reg;
2634 bool needs_dep[BRW_MAX_MRF];
2635 assert(write_len < (int)sizeof(needs_dep) - 1);
2636
2637 memset(needs_dep, false, sizeof(needs_dep));
2638 memset(needs_dep, true, write_len);
2639 /* Walk forwards looking for writes to registers we're writing which aren't
2640 * read before being written.
2641 */
2642 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2643 !scan_inst->is_tail_sentinel();
2644 scan_inst = (fs_inst *)scan_inst->next) {
2645 /* If we hit control flow, force resolve all remaining dependencies. */
2646 if (scan_inst->is_control_flow()) {
2647 for (int i = 0; i < write_len; i++) {
2648 if (needs_dep[i])
2649 scan_inst->insert_before(block,
2650 DEP_RESOLVE_MOV(first_write_grf + i));
2651 }
2652 return;
2653 }
2654
2655 /* Clear the flag for registers that actually got read (as expected). */
2656 clear_deps_for_inst_src(scan_inst, dispatch_width,
2657 needs_dep, first_write_grf, write_len);
2658
2659 /* We insert our reads as late as possible since they're reading the
2660 * result of a SEND, which has massive latency.
2661 */
2662 if (scan_inst->dst.file == GRF &&
2663 scan_inst->dst.reg >= first_write_grf &&
2664 scan_inst->dst.reg < first_write_grf + write_len &&
2665 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2666 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2667 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2668 }
2669
2670 /* Continue the loop only if we haven't resolved all the dependencies */
2671 int i;
2672 for (i = 0; i < write_len; i++) {
2673 if (needs_dep[i])
2674 break;
2675 }
2676 if (i == write_len)
2677 return;
2678 }
2679
2680 /* If we hit the end of the program, resolve all remaining dependencies out
2681 * of paranoia.
2682 */
2683 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2684 assert(last_inst->eot);
2685 for (int i = 0; i < write_len; i++) {
2686 if (needs_dep[i])
2687 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2688 }
2689 }
2690
2691 void
2692 fs_visitor::insert_gen4_send_dependency_workarounds()
2693 {
2694 if (brw->gen != 4 || brw->is_g4x)
2695 return;
2696
2697 bool progress = false;
2698
2699 /* Note that we're done with register allocation, so GRF fs_regs always
2700 * have a .reg_offset of 0.
2701 */
2702
2703 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2704 if (inst->mlen != 0 && inst->dst.file == GRF) {
2705 insert_gen4_pre_send_dependency_workarounds(block, inst);
2706 insert_gen4_post_send_dependency_workarounds(block, inst);
2707 progress = true;
2708 }
2709 }
2710
2711 if (progress)
2712 invalidate_live_intervals();
2713 }
2714
2715 /**
2716 * Turns the generic expression-style uniform pull constant load instruction
2717 * into a hardware-specific series of instructions for loading a pull
2718 * constant.
2719 *
2720 * The expression style allows the CSE pass before this to optimize out
2721 * repeated loads from the same offset, and gives the pre-register-allocation
2722 * scheduling full flexibility, while the conversion to native instructions
2723 * allows the post-register-allocation scheduler the best information
2724 * possible.
2725 *
2726 * Note that execution masking for setting up pull constant loads is special:
2727 * the channels that need to be written are unrelated to the current execution
2728 * mask, since a later instruction will use one of the result channels as a
2729 * source operand for all 8 or 16 of its channels.
2730 */
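/* Illustrative lowering (not from the original source): on gen7+ a generic
 * "uniform_pull_const_load dst, surface, byte_offset" becomes a
 * force_writemask_all SET_SIMD4X2_OFFSET writing byte_offset / 4 into a
 * payload VGRF, with the load rewritten to the GEN7 opcode sourcing that
 * payload; on earlier gens the load simply gets m14 and mlen = 1 assigned.
 */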
2731 void
2732 fs_visitor::lower_uniform_pull_constant_loads()
2733 {
2734 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2735 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2736 continue;
2737
2738 if (brw->gen >= 7) {
2739 /* The offset arg before was a vec4-aligned byte offset. We need to
2740 * turn it into a dword offset.
2741 */
2742 fs_reg const_offset_reg = inst->src[1];
2743 assert(const_offset_reg.file == IMM &&
2744 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2745 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2746 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2747
2748 /* This is actually going to be a MOV, but since only the first dword
2749 * is accessed, we have a special opcode to do just that one. Note
2750 * that this needs to be an operation that will be considered a def
2751 * by live variable analysis, or register allocation will explode.
2752 */
2753 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2754 payload, const_offset_reg);
2755 setup->force_writemask_all = true;
2756
2757 setup->ir = inst->ir;
2758 setup->annotation = inst->annotation;
2759 inst->insert_before(block, setup);
2760
2761 /* Similarly, this will only populate the first 4 channels of the
2762 * result register (since we only use smear values from 0-3), but we
2763 * don't tell the optimizer.
2764 */
2765 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2766 inst->src[1] = payload;
2767
2768 invalidate_live_intervals();
2769 } else {
2770 /* Before register allocation, we didn't tell the scheduler about the
2771 * MRF we use. We know it's safe to use this MRF because nothing
2772 * else does except for register spill/unspill, which generates and
2773 * uses its MRF within a single IR instruction.
2774 */
2775 inst->base_mrf = 14;
2776 inst->mlen = 1;
2777 }
2778 }
2779 }
2780
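/* Illustrative lowering (not from the original source):
 *   load_payload vgrf8, {header, a, b}
 * becomes
 *   mov vgrf8+0, header
 *   mov vgrf8+1, a
 *   mov vgrf8+2, b
 * with the header MOV omitted (but the offset still advanced) when src[0]
 * is BAD_FILE.
 */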
2781 bool
2782 fs_visitor::lower_load_payload()
2783 {
2784 bool progress = false;
2785
2786 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2787 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
2788 fs_reg dst = inst->dst;
2789
2790 /* src[0] represents the (optional) message header. */
2791 if (inst->src[0].file != BAD_FILE) {
2792 inst->insert_before(block, MOV(dst, inst->src[0]));
2793 }
2794 dst.reg_offset++;
2795
2796 for (int i = 1; i < inst->sources; i++) {
2797 inst->insert_before(block, MOV(dst, inst->src[i]));
2798 dst.reg_offset++;
2799 }
2800
2801 inst->remove(block);
2802 progress = true;
2803 }
2804 }
2805
2806 if (progress)
2807 invalidate_live_intervals();
2808
2809 return progress;
2810 }
2811
2812 void
2813 fs_visitor::dump_instructions()
2814 {
2815 dump_instructions(NULL);
2816 }
2817
2818 void
2819 fs_visitor::dump_instructions(const char *name)
2820 {
2821 calculate_register_pressure();
2822 FILE *file = stderr;
2823 if (name && geteuid() != 0) {
2824 file = fopen(name, "w");
2825 if (!file)
2826 file = stderr;
2827 }
2828
2829 int ip = 0, max_pressure = 0;
2830 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
2831 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2832 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2833 dump_instruction(inst, file);
2834 ++ip;
2835 }
2836 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
2837
2838 if (file != stderr) {
2839 fclose(file);
2840 }
2841 }
2842
2843 void
2844 fs_visitor::dump_instruction(backend_instruction *be_inst)
2845 {
2846 dump_instruction(be_inst, stderr);
2847 }
2848
2849 void
2850 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
2851 {
2852 fs_inst *inst = (fs_inst *)be_inst;
2853
2854 if (inst->predicate) {
2855 fprintf(file, "(%cf0.%d) ",
2856 inst->predicate_inverse ? '-' : '+',
2857 inst->flag_subreg);
2858 }
2859
2860 fprintf(file, "%s", brw_instruction_name(inst->opcode));
2861 if (inst->saturate)
2862 fprintf(file, ".sat");
2863 if (inst->conditional_mod) {
2864 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
2865 if (!inst->predicate &&
2866 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2867 inst->opcode != BRW_OPCODE_IF &&
2868 inst->opcode != BRW_OPCODE_WHILE))) {
2869 fprintf(file, ".f0.%d", inst->flag_subreg);
2870 }
2871 }
2872 fprintf(file, " ");
2873
2874
2875 switch (inst->dst.file) {
2876 case GRF:
2877 fprintf(file, "vgrf%d", inst->dst.reg);
2878 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
2879 inst->dst.subreg_offset)
2880 fprintf(file, "+%d.%d",
2881 inst->dst.reg_offset, inst->dst.subreg_offset);
2882 break;
2883 case MRF:
2884 fprintf(file, "m%d", inst->dst.reg);
2885 break;
2886 case BAD_FILE:
2887 fprintf(file, "(null)");
2888 break;
2889 case UNIFORM:
2890 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
2891 break;
2892 case HW_REG:
2893 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2894 switch (inst->dst.fixed_hw_reg.nr) {
2895 case BRW_ARF_NULL:
2896 fprintf(file, "null");
2897 break;
2898 case BRW_ARF_ADDRESS:
2899 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
2900 break;
2901 case BRW_ARF_ACCUMULATOR:
2902 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
2903 break;
2904 case BRW_ARF_FLAG:
2905 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2906 inst->dst.fixed_hw_reg.subnr);
2907 break;
2908 default:
2909 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2910 inst->dst.fixed_hw_reg.subnr);
2911 break;
2912 }
2913 } else {
2914 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
2915 }
2916 if (inst->dst.fixed_hw_reg.subnr)
2917 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
2918 break;
2919 default:
2920 fprintf(file, "???");
2921 break;
2922 }
2923 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
2924
2925 for (int i = 0; i < inst->sources && inst->src[i].file != BAD_FILE; i++) {
2926 if (inst->src[i].negate)
2927 fprintf(file, "-");
2928 if (inst->src[i].abs)
2929 fprintf(file, "|");
2930 switch (inst->src[i].file) {
2931 case GRF:
2932 fprintf(file, "vgrf%d", inst->src[i].reg);
2933 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2934 inst->src[i].subreg_offset)
2935 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2936 inst->src[i].subreg_offset);
2937 break;
2938 case MRF:
2939 fprintf(file, "***m%d***", inst->src[i].reg);
2940 break;
2941 case UNIFORM:
2942 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
2943 if (inst->src[i].reladdr) {
2944 fprintf(file, "+reladdr");
2945 } else if (inst->src[i].subreg_offset) {
2946 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2947 inst->src[i].subreg_offset);
2948 }
2949 break;
2950 case BAD_FILE:
2951 fprintf(file, "(null)");
2952 break;
2953 case IMM:
2954 switch (inst->src[i].type) {
2955 case BRW_REGISTER_TYPE_F:
2956 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
2957 break;
2958 case BRW_REGISTER_TYPE_D:
2959 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
2960 break;
2961 case BRW_REGISTER_TYPE_UD:
2962 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
2963 break;
2964 default:
2965 fprintf(file, "???");
2966 break;
2967 }
2968 break;
2969 case HW_REG:
2970 if (inst->src[i].fixed_hw_reg.negate)
2971 fprintf(file, "-");
2972 if (inst->src[i].fixed_hw_reg.abs)
2973 fprintf(file, "|");
2974 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2975 switch (inst->src[i].fixed_hw_reg.nr) {
2976 case BRW_ARF_NULL:
2977 fprintf(file, "null");
2978 break;
2979 case BRW_ARF_ADDRESS:
2980 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
2981 break;
2982 case BRW_ARF_ACCUMULATOR:
2983 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
2984 break;
2985 case BRW_ARF_FLAG:
2986 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2987 inst->src[i].fixed_hw_reg.subnr);
2988 break;
2989 default:
2990 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2991 inst->src[i].fixed_hw_reg.subnr);
2992 break;
2993 }
2994 } else {
2995 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2996 }
2997 if (inst->src[i].fixed_hw_reg.subnr)
2998 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
2999 if (inst->src[i].fixed_hw_reg.abs)
3000 fprintf(file, "|");
3001 break;
3002 default:
3003 fprintf(file, "???");
3004 break;
3005 }
3006 if (inst->src[i].abs)
3007 fprintf(file, "|");
3008
3009 if (inst->src[i].file != IMM) {
3010 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3011 }
3012
3013 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3014 fprintf(file, ", ");
3015 }
3016
3017 fprintf(file, " ");
3018
3019 if (inst->force_uncompressed)
3020 fprintf(file, "1sthalf ");
3021
3022 if (inst->force_sechalf)
3023 fprintf(file, "2ndhalf ");
3024
3025 fprintf(file, "\n");
3026 }
3027
3028 /**
3029 * Possibly returns an instruction that set up @param reg.
3030 *
3031 * Sometimes we want to take the result of some expression/variable
3032 * dereference tree and rewrite the instruction generating the result
3033 * of the tree. When processing the tree, we know that the
3034 * instructions generated are all writing temporaries that are dead
3035 * outside of this tree. So, if we have some instructions that write
3036 * a temporary, we're free to point that temp write somewhere else.
3037 *
3038 * Note that this doesn't guarantee that the instruction only generated
3039 * reg -- it might be the size=4 destination of a texture instruction.
3040 */
3041 fs_inst *
3042 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3043 fs_inst *end,
3044 const fs_reg &reg)
3045 {
3046 if (end == start ||
3047 end->is_partial_write() ||
3048 reg.reladdr ||
3049 !reg.equals(end->dst)) {
3050 return NULL;
3051 } else {
3052 return end;
3053 }
3054 }
3055
3056 void
3057 fs_visitor::setup_payload_gen6()
3058 {
3059 bool uses_depth =
3060 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3061 unsigned barycentric_interp_modes =
3062 (stage == MESA_SHADER_FRAGMENT) ?
3063 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3064
3065 assert(brw->gen >= 6);
3066
3067 /* R0-1: masks, pixel X/Y coordinates. */
3068 payload.num_regs = 2;
3069 /* R2: only for 32-pixel dispatch. */
3070
3071 /* R3-26: barycentric interpolation coordinates. These appear in the
3072 * same order that they appear in the brw_wm_barycentric_interp_mode
3073 * enum. Each set of coordinates occupies 2 registers if dispatch width
3074 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3075 * appear if they were enabled using the "Barycentric Interpolation
3076 * Mode" bits in WM_STATE.
3077 */
3078 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3079 if (barycentric_interp_modes & (1 << i)) {
3080 payload.barycentric_coord_reg[i] = payload.num_regs;
3081 payload.num_regs += 2;
3082 if (dispatch_width == 16) {
3083 payload.num_regs += 2;
3084 }
3085 }
3086 }
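/* e.g. (illustrative): a SIMD8 shader with only perspective pixel
 * barycentrics enabled has payload.num_regs == 4 at this point: the two
 * fixed registers (masks, pixel X/Y) plus two for the single enabled
 * barycentric set.
 */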
3087
3088 /* R27: interpolated depth if uses source depth */
3089 if (uses_depth) {
3090 payload.source_depth_reg = payload.num_regs;
3091 payload.num_regs++;
3092 if (dispatch_width == 16) {
3093 /* R28: interpolated depth if not SIMD8. */
3094 payload.num_regs++;
3095 }
3096 }
3097 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3098 if (uses_depth) {
3099 payload.source_w_reg = payload.num_regs;
3100 payload.num_regs++;
3101 if (dispatch_width == 16) {
3102 /* R30: interpolated W if not SIMD8. */
3103 payload.num_regs++;
3104 }
3105 }
3106
3107 if (stage == MESA_SHADER_FRAGMENT) {
3108 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3109 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3110 prog_data->uses_pos_offset = key->compute_pos_offset;
3111 /* R31: MSAA position offsets. */
3112 if (prog_data->uses_pos_offset) {
3113 payload.sample_pos_reg = payload.num_regs;
3114 payload.num_regs++;
3115 }
3116 }
3117
3118 /* R32: MSAA input coverage mask */
3119 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3120 assert(brw->gen >= 7);
3121 payload.sample_mask_in_reg = payload.num_regs;
3122 payload.num_regs++;
3123 if (dispatch_width == 16) {
3124 /* R33: input coverage mask if not SIMD8. */
3125 payload.num_regs++;
3126 }
3127 }
3128
3129 /* R34-: bary for 32-pixel. */
3130 /* R58-59: interp W for 32-pixel. */
3131
3132 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3133 source_depth_to_render_target = true;
3134 }
3135 }
3136
3137 void
3138 fs_visitor::assign_binding_table_offsets()
3139 {
3140 assert(stage == MESA_SHADER_FRAGMENT);
3141 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3142 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3143 uint32_t next_binding_table_offset = 0;
3144
3145 /* If there are no color regions, we still perform an FB write to a null
3146 * renderbuffer, which we place at surface index 0.
3147 */
3148 prog_data->binding_table.render_target_start = next_binding_table_offset;
3149 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3150
3151 assign_common_binding_table_offsets(next_binding_table_offset);
3152 }
3153
3154 void
3155 fs_visitor::calculate_register_pressure()
3156 {
3157 invalidate_live_intervals();
3158 calculate_live_intervals();
3159
3160 unsigned num_instructions = instructions.length();
3161
3162 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3163
3164 for (int reg = 0; reg < virtual_grf_count; reg++) {
3165 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3166 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3167 }
3168 }
3169
3170 /**
3171 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3172 *
3173 * The needs_unlit_centroid_workaround ends up producing one of these per
3174 * channel of centroid input, so it's good to clean them up.
3175 *
3176 * An assumption here is that nothing ever modifies the dispatched pixels
3177 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3178 * dictates that anyway.
3179 */
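/* e.g. (illustrative): two FS_OPCODE_MOV_DISPATCH_TO_FLAGS writing f0.1 in
 * the same block with no intervening flag write or control flow: the later
 * one is removed.
 */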
3180 void
3181 fs_visitor::opt_drop_redundant_mov_to_flags()
3182 {
3183 bool flag_mov_found[2] = {false};
3184
3185 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3186 if (inst->is_control_flow()) {
3187 memset(flag_mov_found, 0, sizeof(flag_mov_found));
3188 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3189 if (!flag_mov_found[inst->flag_subreg])
3190 flag_mov_found[inst->flag_subreg] = true;
3191 else
3192 inst->remove(block);
3193 } else if (inst->writes_flag()) {
3194 flag_mov_found[inst->flag_subreg] = false;
3195 }
3196 }
3197 }
3198
3199 bool
3200 fs_visitor::run()
3201 {
3202 sanity_param_count = prog->Parameters->NumParameters;
3203 bool allocated_without_spills;
3204
3205 assign_binding_table_offsets();
3206
3207 if (brw->gen >= 6)
3208 setup_payload_gen6();
3209 else
3210 setup_payload_gen4();
3211
3212 if (0) {
3213 emit_dummy_fs();
3214 } else {
3215 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3216 emit_shader_time_begin();
3217
3218 calculate_urb_setup();
3219 if (prog->InputsRead > 0) {
3220 if (brw->gen < 6)
3221 emit_interpolation_setup_gen4();
3222 else
3223 emit_interpolation_setup_gen6();
3224 }
3225
3226 /* We handle discards by keeping track of the still-live pixels in f0.1.
3227 * Initialize it with the dispatched pixels.
3228 */
3229 bool uses_kill =
3230 (stage == MESA_SHADER_FRAGMENT) &&
3231 ((brw_wm_prog_data*) this->prog_data)->uses_kill;
3232 bool alpha_test_func =
3233 (stage == MESA_SHADER_FRAGMENT) &&
3234 ((brw_wm_prog_key*) this->key)->alpha_test_func;
3235 if (uses_kill || alpha_test_func) {
3236 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3237 discard_init->flag_subreg = 1;
3238 }
3239
3240 /* Generate FS IR for main(). (The visitor only descends into
3241 * functions called "main".)
3242 */
3243 if (shader) {
3244 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3245 base_ir = ir;
3246 this->result = reg_undef;
3247 ir->accept(this);
3248 }
3249 } else {
3250 emit_fragment_program_code();
3251 }
3252 base_ir = NULL;
3253 if (failed)
3254 return false;
3255
3256 emit(FS_OPCODE_PLACEHOLDER_HALT);
3257
3258 if (alpha_test_func)
3259 emit_alpha_test();
3260
3261 emit_fb_writes();
3262
3263 calculate_cfg();
3264
3265 split_virtual_grfs();
3266
3267 move_uniform_array_access_to_pull_constants();
3268 assign_constant_locations();
3269 demote_pull_constants();
3270
3271 opt_drop_redundant_mov_to_flags();
3272
3273 #define OPT(pass, args...) do { \
3274 pass_num++; \
3275 bool this_progress = pass(args); \
3276 \
3277 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3278 char filename[64]; \
3279 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3280 dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3281 \
3282 backend_visitor::dump_instructions(filename); \
3283 } \
3284 \
3285 progress = progress || this_progress; \
3286 } while (false)
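/* The macro above runs one pass, dumps the IR after it when INTEL_DEBUG's
 * optimizer flag is set and the pass made progress, and ORs the result into
 * the per-iteration progress flag that drives the loop below.
 */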
3287
3288 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3289 char filename[64];
3290 snprintf(filename, 64, "fs%d-%04d-00-start",
3291 dispatch_width, shader_prog ? shader_prog->Name : 0);
3292
3293 backend_visitor::dump_instructions(filename);
3294 }
3295
3296 bool progress;
3297 int iteration = 0;
3298 do {
3299 progress = false;
3300 iteration++;
3301 int pass_num = 0;
3302
3303 compact_virtual_grfs();
3304
3305 OPT(remove_duplicate_mrf_writes);
3306
3307 OPT(opt_algebraic);
3308 OPT(opt_cse);
3309 OPT(opt_copy_propagate);
3310 OPT(opt_peephole_predicated_break);
3311 OPT(dead_code_eliminate);
3312 OPT(opt_peephole_sel);
3313 OPT(dead_control_flow_eliminate, this);
3314 OPT(opt_register_renaming);
3315 OPT(opt_saturate_propagation);
3316 OPT(register_coalesce);
3317 OPT(compute_to_mrf);
3318 } while (progress);
3319
3320 if (lower_load_payload()) {
3321 register_coalesce();
3322 dead_code_eliminate();
3323 }
3324
3325 lower_uniform_pull_constant_loads();
3326
3327 assign_curb_setup();
3328 assign_urb_setup();
3329
3330 static enum instruction_scheduler_mode pre_modes[] = {
3331 SCHEDULE_PRE,
3332 SCHEDULE_PRE_NON_LIFO,
3333 SCHEDULE_PRE_LIFO,
3334 };
3335
3336 /* Try each scheduling heuristic to see if it can successfully register
3337 * allocate without spilling. They should be ordered by decreasing
3338 * performance but increasing likelihood of allocating.
3339 */
3340 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3341 schedule_instructions(pre_modes[i]);
3342
3343 if (0) {
3344 assign_regs_trivial();
3345 allocated_without_spills = true;
3346 } else {
3347 allocated_without_spills = assign_regs(false);
3348 }
3349 if (allocated_without_spills)
3350 break;
3351 }
3352
3353 if (!allocated_without_spills) {
3354 /* We assume that any spilling is worse than just dropping back to
3355 * SIMD8. There's probably actually some intermediate point where
3356 * SIMD16 with a couple of spills is still better.
3357 */
3358 if (dispatch_width == 16) {
3359 fail("Failure to register allocate. Reduce number of "
3360 "live scalar values to avoid this.");
3361 } else {
3362 perf_debug("Fragment shader triggered register spilling. "
3363 "Try reducing the number of live scalar values to "
3364 "improve performance.\n");
3365 }
3366
3367 /* Since we're out of heuristics, just go spill registers until we
3368 * get an allocation.
3369 */
3370 while (!assign_regs(true)) {
3371 if (failed)
3372 break;
3373 }
3374 }
3375 }
3376 assert(force_uncompressed_stack == 0);
3377
3378 /* This must come after all optimization and register allocation, since
3379 * it inserts dead code that happens to have side effects, and it does
3380 * so based on the actual physical registers in use.
3381 */
3382 insert_gen4_send_dependency_workarounds();
3383
3384 if (failed)
3385 return false;
3386
3387 if (!allocated_without_spills)
3388 schedule_instructions(SCHEDULE_POST);
3389
3390 if (last_scratch > 0) {
3391 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3392 }
3393
3394 if (brw->use_rep_send)
3395 try_rep_send();
3396
3397 if (stage == MESA_SHADER_FRAGMENT) {
3398 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3399 if (dispatch_width == 8)
3400 prog_data->reg_blocks = brw_register_blocks(grf_used);
3401 else
3402 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3403 }
3404
3405 /* If any state parameters were appended, then ParameterValues could have
3406 * been realloced, in which case the driver uniform storage set up by
3407 * _mesa_associate_uniform_storage() would point to freed memory. Make
3408 * sure that didn't happen.
3409 */
3410 assert(sanity_param_count == prog->Parameters->NumParameters);
3411
3412 return !failed;
3413 }
3414
3415 const unsigned *
3416 brw_wm_fs_emit(struct brw_context *brw,
3417 void *mem_ctx,
3418 const struct brw_wm_prog_key *key,
3419 struct brw_wm_prog_data *prog_data,
3420 struct gl_fragment_program *fp,
3421 struct gl_shader_program *prog,
3422 unsigned *final_assembly_size)
3423 {
3424 bool start_busy = false;
3425 double start_time = 0;
3426
3427 if (unlikely(brw->perf_debug)) {
3428 start_busy = (brw->batch.last_bo &&
3429 drm_intel_bo_busy(brw->batch.last_bo));
3430 start_time = get_time();
3431 }
3432
3433 struct brw_shader *shader = NULL;
3434 if (prog)
3435 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3436
3437 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3438 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3439
3440 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3441 */
3442 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3443 if (!v.run()) {
3444 if (prog) {
3445 prog->LinkStatus = false;
3446 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3447 }
3448
3449 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3450 v.fail_msg);
3451
3452 return NULL;
3453 }
3454
3455 cfg_t *simd16_cfg = NULL;
3456 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3457 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3458 brw->use_rep_send)) {
3459 if (!v.simd16_unsupported) {
3460 /* Try a SIMD16 compile */
3461 v2.import_uniforms(&v);
3462 if (!v2.run()) {
3463 perf_debug("SIMD16 shader failed to compile, falling back to "
3464 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3465 } else {
3466 simd16_cfg = v2.cfg;
3467 }
3468 } else {
3469 perf_debug("SIMD16 shader unsupported, falling back to "
3470 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3471 }
3472 }
3473
3474 cfg_t *simd8_cfg;
3475 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3476 if (no_simd8 && simd16_cfg) {
3477 simd8_cfg = NULL;
3478 prog_data->no_8 = true;
3479 } else {
3480 simd8_cfg = v.cfg;
3481 prog_data->no_8 = false;
3482 }
3483
3484 const unsigned *assembly = NULL;
3485 fs_generator g(brw, mem_ctx, key, prog_data, prog, fp,
3486 v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3487 assembly = g.generate_assembly(simd8_cfg, simd16_cfg,
3488 final_assembly_size);
3489
3490 if (unlikely(brw->perf_debug) && shader) {
3491 if (shader->compiled_once)
3492 brw_wm_debug_recompile(brw, prog, key);
3493 shader->compiled_once = true;
3494
3495 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3496 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3497 (get_time() - start_time) * 1000);
3498 }
3499 }
3500
3501 return assembly;
3502 }
3503
3504 bool
3505 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3506 {
3507 struct brw_context *brw = brw_context(ctx);
3508 struct brw_wm_prog_key key;
3509
3510 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3511 return true;
3512
3513 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3514 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3515 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3516 bool program_uses_dfdy = fp->UsesDFdy;
3517
3518 memset(&key, 0, sizeof(key));
3519
3520 if (brw->gen < 6) {
3521 if (fp->UsesKill)
3522 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3523
3524 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3525 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3526
3527 /* Just assume depth testing. */
3528 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3529 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3530 }
3531
3532 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3533 BRW_FS_VARYING_INPUT_MASK) > 16)
3534 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3535
3536 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3537 for (unsigned i = 0; i < sampler_count; i++) {
3538 if (fp->Base.ShadowSamplers & (1 << i)) {
3539 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3540 key.tex.swizzles[i] =
3541 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3542 } else {
3543 /* Color sampler: assume no swizzling. */
3544 key.tex.swizzles[i] = SWIZZLE_XYZW;
3545 }
3546 }
3547
3548 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3549 key.drawable_height = ctx->DrawBuffer->Height;
3550 }
3551
3552 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3553 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3554 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3555
3556 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3557 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3558 key.nr_color_regions > 1;
3559 }
3560
3561 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3562 * quality of the derivatives is likely to be determined by the driconf
3563 * option.
3564 */
3565 key.high_quality_derivatives = brw->disable_derivative_optimization;
3566
3567 key.program_string_id = bfp->id;
3568
3569 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3570 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3571
3572 bool success = do_wm_prog(brw, prog, bfp, &key);
3573
3574 brw->wm.base.prog_offset = old_prog_offset;
3575 brw->wm.prog_data = old_prog_data;
3576
3577 return success;
3578 }