i965/fs: Add a function to resize fs_inst's sources array.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init(enum opcode opcode, const fs_reg &dst, fs_reg *src, int sources)
56 {
57 memset(this, 0, sizeof(*this));
58
59 this->opcode = opcode;
60 this->dst = dst;
61 this->src = src;
62 this->sources = sources;
63
64 this->conditional_mod = BRW_CONDITIONAL_NONE;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68
69 this->writes_accumulator = false;
70 }
71
72 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
73 {
74 fs_reg *src = ralloc_array(this, fs_reg, 3);
75 init(opcode, dst, src, 0);
76 }
77
78 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
79 {
80 fs_reg *src = ralloc_array(this, fs_reg, 3);
81 src[0] = src0;
82 init(opcode, dst, src, 1);
83 }
84
85 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
86 const fs_reg &src1)
87 {
88 fs_reg *src = ralloc_array(this, fs_reg, 3);
89 src[0] = src0;
90 src[1] = src1;
91 init(opcode, dst, src, 2);
92 }
93
94 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
95 const fs_reg &src1, const fs_reg &src2)
96 {
97 fs_reg *src = ralloc_array(this, fs_reg, 3);
98 src[0] = src0;
99 src[1] = src1;
100 src[2] = src2;
101 init(opcode, dst, src, 3);
102 }
103
104 fs_inst::fs_inst(const fs_inst &that)
105 {
106 memcpy(this, &that, sizeof(that));
107
108 this->src = ralloc_array(this, fs_reg, that.sources);
109
110 for (int i = 0; i < that.sources; i++)
111 this->src[i] = that.src[i];
112 }
113
114 void
115 fs_inst::resize_sources(uint8_t num_sources)
116 {
117 if (this->sources != num_sources) {
118 this->src = reralloc(this, this->src, fs_reg, num_sources);
119 this->sources = num_sources;
120 }
121 }
122
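/* An illustrative usage sketch (assuming hypothetical operands a and b): a
 * pass that rewrites a three-source instruction into a two-source one might
 * shrink the array like this:
 *
 *    inst->opcode = BRW_OPCODE_ADD;
 *    inst->resize_sources(2);
 *    inst->src[0] = a;
 *    inst->src[1] = b;
 *
 * Since the sources live in a ralloc'd array owned by the instruction,
 * reralloc() preserves the surviving entries.
 */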
123 #define ALU1(op) \
124 fs_inst * \
125 fs_visitor::op(fs_reg dst, fs_reg src0) \
126 { \
127 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
128 }
129
130 #define ALU2(op) \
131 fs_inst * \
132 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
133 { \
134 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
135 }
136
137 #define ALU2_ACC(op) \
138 fs_inst * \
139 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
140 { \
141 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
142 inst->writes_accumulator = true; \
143 return inst; \
144 }
145
146 #define ALU3(op) \
147 fs_inst * \
148 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
149 { \
150 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
151 }
152
153 ALU1(NOT)
154 ALU1(MOV)
155 ALU1(FRC)
156 ALU1(RNDD)
157 ALU1(RNDE)
158 ALU1(RNDZ)
159 ALU2(ADD)
160 ALU2(MUL)
161 ALU2_ACC(MACH)
162 ALU2(AND)
163 ALU2(OR)
164 ALU2(XOR)
165 ALU2(SHL)
166 ALU2(SHR)
167 ALU2(ASR)
168 ALU3(LRP)
169 ALU1(BFREV)
170 ALU3(BFE)
171 ALU2(BFI1)
172 ALU3(BFI2)
173 ALU1(FBH)
174 ALU1(FBL)
175 ALU1(CBIT)
176 ALU3(MAD)
177 ALU2_ACC(ADDC)
178 ALU2_ACC(SUBB)
179 ALU2(SEL)
180 ALU2(MAC)
181
182 /** Gen4 predicated IF. */
183 fs_inst *
184 fs_visitor::IF(uint32_t predicate)
185 {
186 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
187 inst->predicate = predicate;
188 return inst;
189 }
190
191 /** Gen6 IF with embedded comparison. */
192 fs_inst *
193 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
194 {
195 assert(brw->gen == 6);
196 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
197 reg_null_d, src0, src1);
198 inst->conditional_mod = condition;
199 return inst;
200 }
201
202 /**
203 * CMP: Sets the low bit of the destination channels with the result
204 * of the comparison, while the upper bits are undefined, and updates
205 * the flag register with the packed 16 bits of the result.
206 */
207 fs_inst *
208 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
209 {
210 fs_inst *inst;
211
212 /* Take the instruction:
213 *
214 * CMP null<d> src0<f> src1<f>
215 *
216 * Original gen4 does type conversion to the destination type before
217 * comparison, producing garbage results for floating point comparisons.
218 * gen5 does the comparison on the execution type (resolved source types),
219 * so dst type doesn't matter. gen6 does comparison and then uses the
220 * result as if it was the dst type with no conversion, which happens to
221 * mostly work out for float-interpreted-as-int since our comparisons are
222 * for >0, =0, <0.
223 */
224 if (brw->gen == 4) {
225 dst.type = src0.type;
226 if (dst.file == HW_REG)
227 dst.fixed_hw_reg.type = dst.type;
228 }
229
230 resolve_ud_negate(&src0);
231 resolve_ud_negate(&src1);
232
233 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
234 inst->conditional_mod = condition;
235
236 return inst;
237 }
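/* An illustrative sketch (with hypothetical registers x, y and dst): CMP is
 * typically used to set the flag register and then predicate a later
 * instruction, e.g.
 *
 *    emit(CMP(reg_null_d, x, fs_reg(0.0f), BRW_CONDITIONAL_GE));
 *    fs_inst *sel = emit(SEL(dst, x, y));
 *    sel->predicate = BRW_PREDICATE_NORMAL;
 *
 * Only the flag result matters there, which is why the destination can be
 * the null register.
 */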
238
239 exec_list
240 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
241 const fs_reg &surf_index,
242 const fs_reg &varying_offset,
243 uint32_t const_offset)
244 {
245 exec_list instructions;
246 fs_inst *inst;
247
248 /* We have our constant surface use a pitch of 4 bytes, so our index can
249 * be any component of a vector, and then we load 4 contiguous
250 * components starting from that.
251 *
252 * We break down the const_offset to a portion added to the variable
253 * offset and a portion done using reg_offset, which means that if you
254 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
255 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
256 * CSE can later notice that those loads are all the same and eliminate
257 * the redundant ones.
258 */
259 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
260 instructions.push_tail(ADD(vec4_offset,
261 varying_offset, const_offset & ~3));
262
263 int scale = 1;
264 if (brw->gen == 4 && dispatch_width == 8) {
265 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
266 * u, v, r) as parameters, or we can just use the SIMD16 message
267 * consisting of (header, u). We choose the second, at the cost of a
268 * longer return length.
269 */
270 scale = 2;
271 }
272
273 enum opcode op;
274 if (brw->gen >= 7)
275 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
276 else
277 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
278 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
279 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
280 inst->regs_written = 4 * scale;
281 instructions.push_tail(inst);
282
283 if (brw->gen < 7) {
284 inst->base_mrf = 13;
285 inst->header_present = true;
286 if (brw->gen == 4)
287 inst->mlen = 3;
288 else
289 inst->mlen = 1 + dispatch_width / 8;
290 }
291
292 vec4_result.reg_offset += (const_offset & 3) * scale;
293 instructions.push_tail(MOV(dst, vec4_result));
294
295 return instructions;
296 }
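/* Worked example: for "uniform vec4 a[20]; gl_FragColor = a[i];" with
 * const_offset == 6, the code above computes
 *
 *    vec4_offset            = varying_offset + (6 & ~3) = varying_offset + 4
 *    vec4_result.reg_offset += (6 & 3) * scale          = 2 * scale
 *
 * so a single aligned vec4 load is emitted starting at component 4, and the
 * final MOV picks component 6 out of the returned registers.  Loads of
 * neighboring components share the same vec4_offset, which is what lets CSE
 * merge them.
 */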
297
298 /**
299 * A helper for MOV generation for fixing up broken hardware SEND dependency
300 * handling.
301 */
302 fs_inst *
303 fs_visitor::DEP_RESOLVE_MOV(int grf)
304 {
305 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
306
307 inst->ir = NULL;
308 inst->annotation = "send dependency resolve";
309
310 /* The caller always wants uncompressed to emit the minimal extra
311 * dependencies, and to avoid having to deal with aligning its regs to 2.
312 */
313 inst->force_uncompressed = true;
314
315 return inst;
316 }
317
318 bool
319 fs_inst::equals(fs_inst *inst) const
320 {
321 return (opcode == inst->opcode &&
322 dst.equals(inst->dst) &&
323 src[0].equals(inst->src[0]) &&
324 src[1].equals(inst->src[1]) &&
325 src[2].equals(inst->src[2]) &&
326 saturate == inst->saturate &&
327 predicate == inst->predicate &&
328 conditional_mod == inst->conditional_mod &&
329 mlen == inst->mlen &&
330 base_mrf == inst->base_mrf &&
331 sampler == inst->sampler &&
332 target == inst->target &&
333 eot == inst->eot &&
334 header_present == inst->header_present &&
335 shadow_compare == inst->shadow_compare &&
336 offset == inst->offset);
337 }
338
339 bool
340 fs_inst::overwrites_reg(const fs_reg &reg) const
341 {
342 return (reg.file == dst.file &&
343 reg.reg == dst.reg &&
344 reg.reg_offset >= dst.reg_offset &&
345 reg.reg_offset < dst.reg_offset + regs_written);
346 }
347
348 bool
349 fs_inst::is_send_from_grf() const
350 {
351 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
352 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
353 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
354 src[1].file == GRF) ||
355 (is_tex() && src[0].file == GRF));
356 }
357
358 bool
359 fs_visitor::can_do_source_mods(fs_inst *inst)
360 {
361 if (brw->gen == 6 && inst->is_math())
362 return false;
363
364 if (inst->is_send_from_grf())
365 return false;
366
367 if (!inst->can_do_source_mods())
368 return false;
369
370 return true;
371 }
372
373 void
374 fs_reg::init()
375 {
376 memset(this, 0, sizeof(*this));
377 stride = 1;
378 }
379
380 /** Generic unset register constructor. */
381 fs_reg::fs_reg()
382 {
383 init();
384 this->file = BAD_FILE;
385 }
386
387 /** Immediate value constructor. */
388 fs_reg::fs_reg(float f)
389 {
390 init();
391 this->file = IMM;
392 this->type = BRW_REGISTER_TYPE_F;
393 this->imm.f = f;
394 }
395
396 /** Immediate value constructor. */
397 fs_reg::fs_reg(int32_t i)
398 {
399 init();
400 this->file = IMM;
401 this->type = BRW_REGISTER_TYPE_D;
402 this->imm.i = i;
403 }
404
405 /** Immediate value constructor. */
406 fs_reg::fs_reg(uint32_t u)
407 {
408 init();
409 this->file = IMM;
410 this->type = BRW_REGISTER_TYPE_UD;
411 this->imm.u = u;
412 }
413
414 /** Fixed brw_reg. */
415 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
416 {
417 init();
418 this->file = HW_REG;
419 this->fixed_hw_reg = fixed_hw_reg;
420 this->type = fixed_hw_reg.type;
421 }
422
423 bool
424 fs_reg::equals(const fs_reg &r) const
425 {
426 return (file == r.file &&
427 reg == r.reg &&
428 reg_offset == r.reg_offset &&
429 subreg_offset == r.subreg_offset &&
430 type == r.type &&
431 negate == r.negate &&
432 abs == r.abs &&
433 !reladdr && !r.reladdr &&
434 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
435 sizeof(fixed_hw_reg)) == 0 &&
436 stride == r.stride &&
437 imm.u == r.imm.u);
438 }
439
440 fs_reg &
441 fs_reg::apply_stride(unsigned stride)
442 {
443 assert((this->stride * stride) <= 4 &&
444 (is_power_of_two(stride) || stride == 0) &&
445 file != HW_REG && file != IMM);
446 this->stride *= stride;
447 return *this;
448 }
449
450 fs_reg &
451 fs_reg::set_smear(unsigned subreg)
452 {
453 assert(file != HW_REG && file != IMM);
454 subreg_offset = subreg * type_sz(type);
455 stride = 0;
456 return *this;
457 }
458
459 bool
460 fs_reg::is_contiguous() const
461 {
462 return stride == 1;
463 }
464
465 bool
466 fs_reg::is_zero() const
467 {
468 if (file != IMM)
469 return false;
470
471 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
472 }
473
474 bool
475 fs_reg::is_one() const
476 {
477 if (file != IMM)
478 return false;
479
480 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
481 }
482
483 bool
484 fs_reg::is_null() const
485 {
486 return file == HW_REG &&
487 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
488 fixed_hw_reg.nr == BRW_ARF_NULL;
489 }
490
491 bool
492 fs_reg::is_valid_3src() const
493 {
494 return file == GRF || file == UNIFORM;
495 }
496
497 bool
498 fs_reg::is_accumulator() const
499 {
500 return file == HW_REG &&
501 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
502 fixed_hw_reg.nr == BRW_ARF_ACCUMULATOR;
503 }
504
505 int
506 fs_visitor::type_size(const struct glsl_type *type)
507 {
508 unsigned int size, i;
509
510 switch (type->base_type) {
511 case GLSL_TYPE_UINT:
512 case GLSL_TYPE_INT:
513 case GLSL_TYPE_FLOAT:
514 case GLSL_TYPE_BOOL:
515 return type->components();
516 case GLSL_TYPE_ARRAY:
517 return type_size(type->fields.array) * type->length;
518 case GLSL_TYPE_STRUCT:
519 size = 0;
520 for (i = 0; i < type->length; i++) {
521 size += type_size(type->fields.structure[i].type);
522 }
523 return size;
524 case GLSL_TYPE_SAMPLER:
525 /* Samplers take up no register space, since they're baked in at
526 * link time.
527 */
528 return 0;
529 case GLSL_TYPE_ATOMIC_UINT:
530 return 0;
531 case GLSL_TYPE_IMAGE:
532 case GLSL_TYPE_VOID:
533 case GLSL_TYPE_ERROR:
534 case GLSL_TYPE_INTERFACE:
535 assert(!"not reached");
536 break;
537 }
538
539 return 0;
540 }
541
542 fs_reg
543 fs_visitor::get_timestamp()
544 {
545 assert(brw->gen >= 7);
546
547 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
548 BRW_ARF_TIMESTAMP,
549 0),
550 BRW_REGISTER_TYPE_UD));
551
552 fs_reg dst = fs_reg(this, glsl_type::uint_type);
553
554 fs_inst *mov = emit(MOV(dst, ts));
555 /* We want to read the 3 fields we care about (mostly field 0, but also field 2)
556 * even if it's not enabled in the dispatch.
557 */
558 mov->force_writemask_all = true;
559 mov->force_uncompressed = true;
560
561 /* The caller wants the low 32 bits of the timestamp. Since it's running
562 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
563 * which is plenty of time for our purposes. It is identical across the
564 * EUs, but since it's tracking GPU core speed it will increment at a
565 * varying rate as render P-states change.
566 *
567 * The caller could also check if render P-states have changed (or anything
568 * else that might disrupt timing) by setting smear to 2 and checking if
569 * that field is != 0.
570 */
571 dst.set_smear(0);
572
573 return dst;
574 }
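/* The "~3 seconds" figure follows from the counter width and clock rate: a
 * 32-bit counter at roughly 1.2 GHz wraps after about 2^32 / 1.2e9, i.e.
 * approximately 3.6 seconds.
 */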
575
576 void
577 fs_visitor::emit_shader_time_begin()
578 {
579 current_annotation = "shader time start";
580 shader_start_time = get_timestamp();
581 }
582
583 void
584 fs_visitor::emit_shader_time_end()
585 {
586 current_annotation = "shader time end";
587
588 enum shader_time_shader_type type, written_type, reset_type;
589 if (dispatch_width == 8) {
590 type = ST_FS8;
591 written_type = ST_FS8_WRITTEN;
592 reset_type = ST_FS8_RESET;
593 } else {
594 assert(dispatch_width == 16);
595 type = ST_FS16;
596 written_type = ST_FS16_WRITTEN;
597 reset_type = ST_FS16_RESET;
598 }
599
600 fs_reg shader_end_time = get_timestamp();
601
602 /* Check that there weren't any timestamp reset events (assuming these
603 * were the only two timestamp reads that happened).
604 */
605 fs_reg reset = shader_end_time;
606 reset.set_smear(2);
607 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
608 test->conditional_mod = BRW_CONDITIONAL_Z;
609 emit(IF(BRW_PREDICATE_NORMAL));
610
611 push_force_uncompressed();
612 fs_reg start = shader_start_time;
613 start.negate = true;
614 fs_reg diff = fs_reg(this, glsl_type::uint_type);
615 emit(ADD(diff, start, shader_end_time));
616
617 /* If there were no instructions between the two timestamp gets, the diff
618 * is 2 cycles. Remove that overhead, so I can forget about that when
619 * trying to determine the time taken for single instructions.
620 */
621 emit(ADD(diff, diff, fs_reg(-2u)));
622
623 emit_shader_time_write(type, diff);
624 emit_shader_time_write(written_type, fs_reg(1u));
625 emit(BRW_OPCODE_ELSE);
626 emit_shader_time_write(reset_type, fs_reg(1u));
627 emit(BRW_OPCODE_ENDIF);
628
629 pop_force_uncompressed();
630 }
631
632 void
633 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
634 fs_reg value)
635 {
636 int shader_time_index =
637 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
638 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
639
640 fs_reg payload;
641 if (dispatch_width == 8)
642 payload = fs_reg(this, glsl_type::uvec2_type);
643 else
644 payload = fs_reg(this, glsl_type::uint_type);
645
646 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
647 fs_reg(), payload, offset, value));
648 }
649
650 void
651 fs_visitor::vfail(const char *format, va_list va)
652 {
653 char *msg;
654
655 if (failed)
656 return;
657
658 failed = true;
659
660 msg = ralloc_vasprintf(mem_ctx, format, va);
661 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
662
663 this->fail_msg = msg;
664
665 if (INTEL_DEBUG & DEBUG_WM) {
666 fprintf(stderr, "%s", msg);
667 }
668 }
669
670 void
671 fs_visitor::fail(const char *format, ...)
672 {
673 va_list va;
674
675 va_start(va, format);
676 vfail(format, va);
677 va_end(va);
678 }
679
680 /**
681 * Mark this program as impossible to compile in SIMD16 mode.
682 *
683 * During the SIMD8 compile (which happens first), we can detect and flag
684 * things that are unsupported in SIMD16 mode, so the compiler can skip
685 * the SIMD16 compile altogether.
686 *
687 * During a SIMD16 compile (if one happens anyway), this just calls fail().
688 */
689 void
690 fs_visitor::no16(const char *format, ...)
691 {
692 va_list va;
693
694 va_start(va, format);
695
696 if (dispatch_width == 16) {
697 vfail(format, va);
698 } else {
699 simd16_unsupported = true;
700
701 if (brw->perf_debug) {
702 if (no16_msg)
703 ralloc_vasprintf_append(&no16_msg, format, va);
704 else
705 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
706 }
707 }
708
709 va_end(va);
710 }
711
712 fs_inst *
713 fs_visitor::emit(enum opcode opcode)
714 {
715 return emit(new(mem_ctx) fs_inst(opcode));
716 }
717
718 fs_inst *
719 fs_visitor::emit(enum opcode opcode, fs_reg dst)
720 {
721 return emit(new(mem_ctx) fs_inst(opcode, dst));
722 }
723
724 fs_inst *
725 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
726 {
727 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
728 }
729
730 fs_inst *
731 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
732 {
733 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
734 }
735
736 fs_inst *
737 fs_visitor::emit(enum opcode opcode, fs_reg dst,
738 fs_reg src0, fs_reg src1, fs_reg src2)
739 {
740 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
741 }
742
743 void
744 fs_visitor::push_force_uncompressed()
745 {
746 force_uncompressed_stack++;
747 }
748
749 void
750 fs_visitor::pop_force_uncompressed()
751 {
752 force_uncompressed_stack--;
753 assert(force_uncompressed_stack >= 0);
754 }
755
756 /**
757 * Returns true if the instruction has a flag that means it won't
758 * update an entire destination register.
759 *
760 * For example, dead code elimination and live variable analysis want to know
761 * when a write to a variable screens off any preceding values that were in
762 * it.
763 */
764 bool
765 fs_inst::is_partial_write() const
766 {
767 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
768 this->force_uncompressed ||
769 this->force_sechalf || !this->dst.is_contiguous());
770 }
771
772 int
773 fs_inst::regs_read(fs_visitor *v, int arg) const
774 {
775 if (is_tex() && arg == 0 && src[0].file == GRF) {
776 if (v->dispatch_width == 16)
777 return (mlen + 1) / 2;
778 else
779 return mlen;
780 }
781 return 1;
782 }
783
784 bool
785 fs_inst::reads_flag() const
786 {
787 return predicate;
788 }
789
790 bool
791 fs_inst::writes_flag() const
792 {
793 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
794 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
795 }
796
797 /**
798 * Returns how many MRFs an FS opcode will write over.
799 *
800 * Note that this is not the 0 or 1 implied writes in an actual gen
801 * instruction -- the FS opcodes often generate MOVs in addition.
802 */
803 int
804 fs_visitor::implied_mrf_writes(fs_inst *inst)
805 {
806 if (inst->mlen == 0)
807 return 0;
808
809 if (inst->base_mrf == -1)
810 return 0;
811
812 switch (inst->opcode) {
813 case SHADER_OPCODE_RCP:
814 case SHADER_OPCODE_RSQ:
815 case SHADER_OPCODE_SQRT:
816 case SHADER_OPCODE_EXP2:
817 case SHADER_OPCODE_LOG2:
818 case SHADER_OPCODE_SIN:
819 case SHADER_OPCODE_COS:
820 return 1 * dispatch_width / 8;
821 case SHADER_OPCODE_POW:
822 case SHADER_OPCODE_INT_QUOTIENT:
823 case SHADER_OPCODE_INT_REMAINDER:
824 return 2 * dispatch_width / 8;
825 case SHADER_OPCODE_TEX:
826 case FS_OPCODE_TXB:
827 case SHADER_OPCODE_TXD:
828 case SHADER_OPCODE_TXF:
829 case SHADER_OPCODE_TXF_CMS:
830 case SHADER_OPCODE_TXF_MCS:
831 case SHADER_OPCODE_TG4:
832 case SHADER_OPCODE_TG4_OFFSET:
833 case SHADER_OPCODE_TXL:
834 case SHADER_OPCODE_TXS:
835 case SHADER_OPCODE_LOD:
836 return 1;
837 case FS_OPCODE_FB_WRITE:
838 return 2;
839 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
840 case SHADER_OPCODE_GEN4_SCRATCH_READ:
841 return 1;
842 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
843 return inst->mlen;
844 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
845 return 2;
846 case SHADER_OPCODE_UNTYPED_ATOMIC:
847 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
848 return 0;
849 default:
850 assert(!"not reached");
851 return inst->mlen;
852 }
853 }
854
855 int
856 fs_visitor::virtual_grf_alloc(int size)
857 {
858 if (virtual_grf_array_size <= virtual_grf_count) {
859 if (virtual_grf_array_size == 0)
860 virtual_grf_array_size = 16;
861 else
862 virtual_grf_array_size *= 2;
863 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
864 virtual_grf_array_size);
865 }
866 virtual_grf_sizes[virtual_grf_count] = size;
867 return virtual_grf_count++;
868 }
869
870 /** Fixed HW reg constructor. */
871 fs_reg::fs_reg(enum register_file file, int reg)
872 {
873 init();
874 this->file = file;
875 this->reg = reg;
876 this->type = BRW_REGISTER_TYPE_F;
877 }
878
879 /** Fixed HW reg constructor. */
880 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
881 {
882 init();
883 this->file = file;
884 this->reg = reg;
885 this->type = type;
886 }
887
888 /** Automatic reg constructor. */
889 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
890 {
891 init();
892
893 this->file = GRF;
894 this->reg = v->virtual_grf_alloc(v->type_size(type));
895 this->reg_offset = 0;
896 this->type = brw_type_for_base_type(type);
897 }
898
899 fs_reg *
900 fs_visitor::variable_storage(ir_variable *var)
901 {
902 return (fs_reg *)hash_table_find(this->variable_ht, var);
903 }
904
905 void
906 import_uniforms_callback(const void *key,
907 void *data,
908 void *closure)
909 {
910 struct hash_table *dst_ht = (struct hash_table *)closure;
911 const fs_reg *reg = (const fs_reg *)data;
912
913 if (reg->file != UNIFORM)
914 return;
915
916 hash_table_insert(dst_ht, data, key);
917 }
918
919 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
920 * This brings in those uniform definitions.
921 */
922 void
923 fs_visitor::import_uniforms(fs_visitor *v)
924 {
925 hash_table_call_foreach(v->variable_ht,
926 import_uniforms_callback,
927 variable_ht);
928 this->push_constant_loc = v->push_constant_loc;
929 this->pull_constant_loc = v->pull_constant_loc;
930 this->uniforms = v->uniforms;
931 this->param_size = v->param_size;
932 }
933
934 /* Our support for uniforms is piggy-backed on the struct
935 * gl_fragment_program, because that's where the values actually
936 * get stored, rather than in some global gl_shader_program uniform
937 * store.
938 */
939 void
940 fs_visitor::setup_uniform_values(ir_variable *ir)
941 {
942 int namelen = strlen(ir->name);
943
944 /* The data for our (non-builtin) uniforms is stored in a series of
945 * gl_uniform_driver_storage structs for each subcomponent that
946 * glGetUniformLocation() could name. We know it's been set up in the same
947 * order we'd walk the type, so walk the list of storage and find anything
948 * with our name, or the prefix of a component that starts with our name.
949 */
950 unsigned params_before = uniforms;
951 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
952 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
953
954 if (strncmp(ir->name, storage->name, namelen) != 0 ||
955 (storage->name[namelen] != 0 &&
956 storage->name[namelen] != '.' &&
957 storage->name[namelen] != '[')) {
958 continue;
959 }
960
961 unsigned slots = storage->type->component_slots();
962 if (storage->array_elements)
963 slots *= storage->array_elements;
964
965 for (unsigned i = 0; i < slots; i++) {
966 stage_prog_data->param[uniforms++] = &storage->storage[i].f;
967 }
968 }
969
970 /* Make sure we actually initialized the right amount of stuff here. */
971 assert(params_before + ir->type->component_slots() == uniforms);
972 (void)params_before;
973 }
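/* Illustration: with ir->name == "color", the strncmp() plus character check
 * above accepts storage names that are exactly "color" or that continue with
 * '.' or '[' (sub-elements of a struct or array), while rejecting an
 * unrelated uniform such as "color2", whose next character is '2'.
 */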
974
975
976 /* Our support for builtin uniforms is even scarier than non-builtin.
977 * It sits on top of the PROG_STATE_VAR parameters that are
978 * automatically updated from GL context state.
979 */
980 void
981 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
982 {
983 const ir_state_slot *const slots = ir->state_slots;
984 assert(ir->state_slots != NULL);
985
986 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
987 /* This state reference has already been setup by ir_to_mesa, but we'll
988 * get the same index back here.
989 */
990 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
991 (gl_state_index *)slots[i].tokens);
992
993 /* Add each of the unique swizzles of the element as a parameter.
994 * This'll end up matching the expected layout of the
995 * array/matrix/structure we're trying to fill in.
996 */
997 int last_swiz = -1;
998 for (unsigned int j = 0; j < 4; j++) {
999 int swiz = GET_SWZ(slots[i].swizzle, j);
1000 if (swiz == last_swiz)
1001 break;
1002 last_swiz = swiz;
1003
1004 stage_prog_data->param[uniforms++] =
1005 &fp->Base.Parameters->ParameterValues[index][swiz].f;
1006 }
1007 }
1008 }
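/* Illustration: for a scalar state value whose slot swizzle is effectively
 * .xxxx, GET_SWZ() returns the same component on the second iteration, so the
 * loop above breaks after adding a single parameter; a vec4 state value with
 * swizzle .xyzw adds all four.
 */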
1009
1010 fs_reg *
1011 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1012 {
1013 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1014 fs_reg wpos = *reg;
1015 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1016
1017 /* gl_FragCoord.x */
1018 if (ir->data.pixel_center_integer) {
1019 emit(MOV(wpos, this->pixel_x));
1020 } else {
1021 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1022 }
1023 wpos.reg_offset++;
1024
1025 /* gl_FragCoord.y */
1026 if (!flip && ir->data.pixel_center_integer) {
1027 emit(MOV(wpos, this->pixel_y));
1028 } else {
1029 fs_reg pixel_y = this->pixel_y;
1030 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1031
1032 if (flip) {
1033 pixel_y.negate = true;
1034 offset += key->drawable_height - 1.0;
1035 }
1036
1037 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1038 }
1039 wpos.reg_offset++;
1040
1041 /* gl_FragCoord.z */
1042 if (brw->gen >= 6) {
1043 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1044 } else {
1045 emit(FS_OPCODE_LINTERP, wpos,
1046 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1047 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1048 interp_reg(VARYING_SLOT_POS, 2));
1049 }
1050 wpos.reg_offset++;
1051
1052 /* gl_FragCoord.w: Already set up in emit_interpolation */
1053 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1054
1055 return reg;
1056 }
1057
1058 fs_inst *
1059 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1060 glsl_interp_qualifier interpolation_mode,
1061 bool is_centroid, bool is_sample)
1062 {
1063 brw_wm_barycentric_interp_mode barycoord_mode;
1064 if (brw->gen >= 6) {
1065 if (is_centroid) {
1066 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1067 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1068 else
1069 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1070 } else if (is_sample) {
1071 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1072 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1073 else
1074 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1075 } else {
1076 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1077 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1078 else
1079 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1080 }
1081 } else {
1082 /* On Ironlake and below, there is only one interpolation mode.
1083 * Centroid interpolation doesn't mean anything on this hardware --
1084 * there is no multisampling.
1085 */
1086 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1087 }
1088 return emit(FS_OPCODE_LINTERP, attr,
1089 this->delta_x[barycoord_mode],
1090 this->delta_y[barycoord_mode], interp);
1091 }
1092
1093 fs_reg *
1094 fs_visitor::emit_general_interpolation(ir_variable *ir)
1095 {
1096 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1097 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1098 fs_reg attr = *reg;
1099
1100 unsigned int array_elements;
1101 const glsl_type *type;
1102
1103 if (ir->type->is_array()) {
1104 array_elements = ir->type->length;
1105 if (array_elements == 0) {
1106 fail("dereferenced array '%s' has length 0\n", ir->name);
1107 }
1108 type = ir->type->fields.array;
1109 } else {
1110 array_elements = 1;
1111 type = ir->type;
1112 }
1113
1114 glsl_interp_qualifier interpolation_mode =
1115 ir->determine_interpolation_mode(key->flat_shade);
1116
1117 int location = ir->data.location;
1118 for (unsigned int i = 0; i < array_elements; i++) {
1119 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1120 if (prog_data->urb_setup[location] == -1) {
1121 /* If there's no incoming setup data for this slot, don't
1122 * emit interpolation for it.
1123 */
1124 attr.reg_offset += type->vector_elements;
1125 location++;
1126 continue;
1127 }
1128
1129 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1130 /* Constant interpolation (flat shading) case. The SF has
1131 * handed us defined values in only the constant offset
1132 * field of the setup reg.
1133 */
1134 for (unsigned int k = 0; k < type->vector_elements; k++) {
1135 struct brw_reg interp = interp_reg(location, k);
1136 interp = suboffset(interp, 3);
1137 interp.type = reg->type;
1138 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1139 attr.reg_offset++;
1140 }
1141 } else {
1142 /* Smooth/noperspective interpolation case. */
1143 for (unsigned int k = 0; k < type->vector_elements; k++) {
1144 struct brw_reg interp = interp_reg(location, k);
1145 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1146 ir->data.centroid && !key->persample_shading,
1147 ir->data.sample || key->persample_shading);
1148 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1149 /* Get the pixel/sample mask into f0 so that we know
1150 * which pixels are lit. Then, for each channel that is
1151 * unlit, replace the centroid data with non-centroid
1152 * data.
1153 */
1154 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1155 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1156 interpolation_mode,
1157 false, false);
1158 inst->predicate = BRW_PREDICATE_NORMAL;
1159 inst->predicate_inverse = true;
1160 }
1161 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1162 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1163 }
1164 attr.reg_offset++;
1165 }
1166
1167 }
1168 location++;
1169 }
1170 }
1171
1172 return reg;
1173 }
1174
1175 fs_reg *
1176 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1177 {
1178 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1179
1180 /* The frontfacing comes in as a bit in the thread payload. */
1181 if (brw->gen >= 6) {
1182 emit(BRW_OPCODE_ASR, *reg,
1183 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1184 fs_reg(15));
1185 emit(BRW_OPCODE_NOT, *reg, *reg);
1186 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1187 } else {
1188 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1189 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1190 * us front face
1191 */
1192 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1193 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1194 }
1195
1196 return reg;
1197 }
1198
1199 void
1200 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1201 {
1202 assert(dst.type == BRW_REGISTER_TYPE_F);
1203
1204 if (key->compute_pos_offset) {
1205 /* Convert int_sample_pos to floating point */
1206 emit(MOV(dst, int_sample_pos));
1207 /* Scale to the range [0, 1] */
1208 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1209 }
1210 else {
1211 /* From ARB_sample_shading specification:
1212 * "When rendering to a non-multisample buffer, or if multisample
1213 * rasterization is disabled, gl_SamplePosition will always be
1214 * (0.5, 0.5)."
1215 */
1216 emit(MOV(dst, fs_reg(0.5f)));
1217 }
1218 }
1219
1220 fs_reg *
1221 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1222 {
1223 assert(brw->gen >= 6);
1224 assert(ir->type == glsl_type::vec2_type);
1225
1226 this->current_annotation = "compute sample position";
1227 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1228 fs_reg pos = *reg;
1229 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1230 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1231
1232 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1233 * mode will be enabled.
1234 *
1235 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1236 * R31.1:0 Position Offset X/Y for Slot[3:0]
1237 * R31.3:2 Position Offset X/Y for Slot[7:4]
1238 * .....
1239 *
1240 * The X, Y sample positions come in as bytes in thread payload. So, read
1241 * the positions using vstride=16, width=8, hstride=2.
1242 */
1243 struct brw_reg sample_pos_reg =
1244 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1245 BRW_REGISTER_TYPE_B), 16, 8, 2);
1246
1247 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1248 if (dispatch_width == 16) {
1249 fs_inst *inst = emit(MOV(half(int_sample_x, 1),
1250 fs_reg(suboffset(sample_pos_reg, 16))));
1251 inst->force_sechalf = true;
1252 }
1253 /* Compute gl_SamplePosition.x */
1254 compute_sample_position(pos, int_sample_x);
1255 pos.reg_offset++;
1256 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1257 if (dispatch_width == 16) {
1258 fs_inst *inst = emit(MOV(half(int_sample_y, 1),
1259 fs_reg(suboffset(sample_pos_reg, 17))));
1260 inst->force_sechalf = true;
1261 }
1262 /* Compute gl_SamplePosition.y */
1263 compute_sample_position(pos, int_sample_y);
1264 return reg;
1265 }
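/* Illustration: the payload packs the sample positions as interleaved X/Y
 * bytes (X0 Y0 X1 Y1 ...).  A <16;8,2>:B region therefore reads bytes
 * 0, 2, 4, ..., 14 -- the X bytes of the first eight slots -- and
 * suboffset(reg, 1) shifts the same region onto the Y bytes.  The suboffsets
 * of 16 and 17 used for the SIMD16 second half start at slot 8 instead.
 */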
1266
1267 fs_reg *
1268 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1269 {
1270 assert(brw->gen >= 6);
1271
1272 this->current_annotation = "compute sample id";
1273 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1274
1275 if (key->compute_sample_id) {
1276 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1277 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1278 t2.type = BRW_REGISTER_TYPE_UW;
1279
1280 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1281 * 8x multisampling, subspan 0 will represent sample N (where N
1282 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1283 * 7. We can find the value of N by looking at R0.0 bits 7:6
1284 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1285 * (since samples are always delivered in pairs). That is, we
1286 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1287 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1288 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1289 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1290 * populating a temporary variable with the sequence (0, 1, 2, 3),
1291 * and then reading from it using vstride=1, width=4, hstride=0.
1292 * These computations hold good for 4x multisampling as well.
1293 */
1294 emit(BRW_OPCODE_AND, t1,
1295 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1296 fs_reg(0xc0));
1297 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1298 /* This works for both SIMD8 and SIMD16 */
1299 emit(MOV(t2, brw_imm_v(0x3210)));
1300 /* This special instruction takes care of setting vstride=1,
1301 * width=4, hstride=0 of t2 during an ADD instruction.
1302 */
1303 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1304 } else {
1305 /* As per GL_ARB_sample_shading specification:
1306 * "When rendering to a non-multisample buffer, or if multisample
1307 * rasterization is disabled, gl_SampleID will always be zero."
1308 */
1309 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1310 }
1311
1312 return reg;
1313 }
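/* Worked example: if R0.0 bits 7:6 hold SSPI == 2, then
 * (R0.0 & 0xc0) >> 5 == 0x80 >> 5 == 4, so t1 is 4 in every channel.  Adding
 * t2 -- which holds (0, 1, 2, 3) and is read back with <1;4,0> regioning as
 * (0, 0, 0, 0, 1, 1, 1, 1) -- yields sample IDs 4,4,4,4,5,5,5,5 in SIMD8,
 * continuing with 6s and 7s in SIMD16, matching the subspan-to-sample mapping
 * described in the comment above.
 */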
1314
1315 fs_reg
1316 fs_visitor::fix_math_operand(fs_reg src)
1317 {
1318 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1319 * might be able to do better by doing execsize = 1 math and then
1320 * expanding that result out, but we would need to be careful with
1321 * masking.
1322 *
1323 * The hardware ignores source modifiers (negate and abs) on math
1324 * instructions, so we also move to a temp to set those up.
1325 */
1326 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1327 !src.abs && !src.negate)
1328 return src;
1329
1330 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1331 * operands to math
1332 */
1333 if (brw->gen >= 7 && src.file != IMM)
1334 return src;
1335
1336 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1337 expanded.type = src.type;
1338 emit(BRW_OPCODE_MOV, expanded, src);
1339 return expanded;
1340 }
1341
1342 fs_inst *
1343 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1344 {
1345 switch (opcode) {
1346 case SHADER_OPCODE_RCP:
1347 case SHADER_OPCODE_RSQ:
1348 case SHADER_OPCODE_SQRT:
1349 case SHADER_OPCODE_EXP2:
1350 case SHADER_OPCODE_LOG2:
1351 case SHADER_OPCODE_SIN:
1352 case SHADER_OPCODE_COS:
1353 break;
1354 default:
1355 assert(!"not reached: bad math opcode");
1356 return NULL;
1357 }
1358
1359 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1360 * might be able to do better by doing execsize = 1 math and then
1361 * expanding that result out, but we would need to be careful with
1362 * masking.
1363 *
1364 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1365 * instructions, so we also move to a temp to set those up.
1366 */
1367 if (brw->gen >= 6)
1368 src = fix_math_operand(src);
1369
1370 fs_inst *inst = emit(opcode, dst, src);
1371
1372 if (brw->gen < 6) {
1373 inst->base_mrf = 2;
1374 inst->mlen = dispatch_width / 8;
1375 }
1376
1377 return inst;
1378 }
1379
1380 fs_inst *
1381 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1382 {
1383 int base_mrf = 2;
1384 fs_inst *inst;
1385
1386 switch (opcode) {
1387 case SHADER_OPCODE_INT_QUOTIENT:
1388 case SHADER_OPCODE_INT_REMAINDER:
1389 if (brw->gen >= 7)
1390 no16("SIMD16 INTDIV unsupported\n");
1391 break;
1392 case SHADER_OPCODE_POW:
1393 break;
1394 default:
1395 assert(!"not reached: unsupported binary math opcode.");
1396 return NULL;
1397 }
1398
1399 if (brw->gen >= 6) {
1400 src0 = fix_math_operand(src0);
1401 src1 = fix_math_operand(src1);
1402
1403 inst = emit(opcode, dst, src0, src1);
1404 } else {
1405 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1406 * "Message Payload":
1407 *
1408 * "Operand0[7]. For the INT DIV functions, this operand is the
1409 * denominator."
1410 * ...
1411 * "Operand1[7]. For the INT DIV functions, this operand is the
1412 * numerator."
1413 */
1414 bool is_int_div = opcode != SHADER_OPCODE_POW;
1415 fs_reg &op0 = is_int_div ? src1 : src0;
1416 fs_reg &op1 = is_int_div ? src0 : src1;
1417
1418 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1419 inst = emit(opcode, dst, op0, reg_null_f);
1420
1421 inst->base_mrf = base_mrf;
1422 inst->mlen = 2 * dispatch_width / 8;
1423 }
1424 return inst;
1425 }
1426
1427 void
1428 fs_visitor::assign_curb_setup()
1429 {
1430 if (dispatch_width == 8) {
1431 prog_data->first_curbe_grf = payload.num_regs;
1432 } else {
1433 prog_data->first_curbe_grf_16 = payload.num_regs;
1434 }
1435
1436 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1437
1438 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1439 foreach_list(node, &this->instructions) {
1440 fs_inst *inst = (fs_inst *)node;
1441
1442 for (unsigned int i = 0; i < inst->sources; i++) {
1443 if (inst->src[i].file == UNIFORM) {
1444 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1445 int constant_nr;
1446 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1447 constant_nr = push_constant_loc[uniform_nr];
1448 } else {
1449 /* Section 5.11 of the OpenGL 4.1 spec says:
1450 * "Out-of-bounds reads return undefined values, which include
1451 * values from other variables of the active program or zero."
1452 * Just return the first push constant.
1453 */
1454 constant_nr = 0;
1455 }
1456
1457 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1458 constant_nr / 8,
1459 constant_nr % 8);
1460
1461 inst->src[i].file = HW_REG;
1462 inst->src[i].fixed_hw_reg = byte_offset(
1463 retype(brw_reg, inst->src[i].type),
1464 inst->src[i].subreg_offset);
1465 }
1466 }
1467 }
1468 }
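/* Worked example: a UNIFORM source remapped to push constant slot 10 lands in
 * GRF payload.num_regs + 10 / 8 at subregister 10 % 8 == 2, i.e. the third
 * float of the second CURBE register after the payload.
 */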
1469
1470 void
1471 fs_visitor::calculate_urb_setup()
1472 {
1473 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1474 prog_data->urb_setup[i] = -1;
1475 }
1476
1477 int urb_next = 0;
1478 /* Figure out where each of the incoming setup attributes lands. */
1479 if (brw->gen >= 6) {
1480 if (_mesa_bitcount_64(fp->Base.InputsRead &
1481 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1482 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1483 * first 16 varying inputs, so we can put them wherever we want.
1484 * Just put them in order.
1485 *
1486 * This is useful because it means that (a) inputs not used by the
1487 * fragment shader won't take up valuable register space, and (b) we
1488 * won't have to recompile the fragment shader if it gets paired with
1489 * a different vertex (or geometry) shader.
1490 */
1491 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1492 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1493 BITFIELD64_BIT(i)) {
1494 prog_data->urb_setup[i] = urb_next++;
1495 }
1496 }
1497 } else {
1498 /* We have enough input varyings that the SF/SBE pipeline stage can't
1499 * arbitrarily rearrange them to suit our whim; we have to put them
1500 * in an order that matches the output of the previous pipeline stage
1501 * (geometry or vertex shader).
1502 */
1503 struct brw_vue_map prev_stage_vue_map;
1504 brw_compute_vue_map(brw, &prev_stage_vue_map,
1505 key->input_slots_valid);
1506 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1507 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1508 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1509 slot++) {
1510 int varying = prev_stage_vue_map.slot_to_varying[slot];
1511 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1512 * unused.
1513 */
1514 if (varying != BRW_VARYING_SLOT_COUNT &&
1515 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1516 BITFIELD64_BIT(varying))) {
1517 prog_data->urb_setup[varying] = slot - first_slot;
1518 }
1519 }
1520 urb_next = prev_stage_vue_map.num_slots - first_slot;
1521 }
1522 } else {
1523 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1524 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1525 /* Point size is packed into the header, not as a general attribute */
1526 if (i == VARYING_SLOT_PSIZ)
1527 continue;
1528
1529 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1530 /* The back color slot is skipped when the front color is
1531 * also written to. In addition, some slots can be
1532 * written in the vertex shader and not read in the
1533 * fragment shader. So the register number must always be
1534 * incremented, mapped or not.
1535 */
1536 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1537 prog_data->urb_setup[i] = urb_next;
1538 urb_next++;
1539 }
1540 }
1541
1542 /*
1543 * It's an FS-only attribute, and we did interpolation for this attribute
1544 * in the SF thread. So, count it here, too.
1545 *
1546 * See compile_sf_prog() for more info.
1547 */
1548 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1549 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1550 }
1551
1552 prog_data->num_varying_inputs = urb_next;
1553 }
1554
1555 void
1556 fs_visitor::assign_urb_setup()
1557 {
1558 int urb_start = payload.num_regs + prog_data->curb_read_length;
1559
1560 /* Offset all the urb_setup[] index by the actual position of the
1561 * setup regs, now that the location of the constants has been chosen.
1562 */
1563 foreach_list(node, &this->instructions) {
1564 fs_inst *inst = (fs_inst *)node;
1565
1566 if (inst->opcode == FS_OPCODE_LINTERP) {
1567 assert(inst->src[2].file == HW_REG);
1568 inst->src[2].fixed_hw_reg.nr += urb_start;
1569 }
1570
1571 if (inst->opcode == FS_OPCODE_CINTERP) {
1572 assert(inst->src[0].file == HW_REG);
1573 inst->src[0].fixed_hw_reg.nr += urb_start;
1574 }
1575 }
1576
1577 /* Each attribute is 4 setup channels, each of which is half a reg. */
1578 this->first_non_payload_grf =
1579 urb_start + prog_data->num_varying_inputs * 2;
1580 }
1581
1582 /**
1583 * Split large virtual GRFs into separate components if we can.
1584 *
1585 * This is mostly duplicated with what brw_fs_vector_splitting does,
1586 * but that's really conservative because it's afraid of doing
1587 * splitting that doesn't result in real progress after the rest of
1588 * the optimization phases, which would cause infinite looping in
1589 * optimization. We can do it once here, safely. This also has the
1590 * opportunity to split interpolated values, or maybe even uniforms,
1591 * which we don't have at the IR level.
1592 *
1593 * We want to split, because virtual GRFs are what we register
1594 * allocate and spill (due to contiguousness requirements for some
1595 * instructions), and they're what we naturally generate in the
1596 * codegen process, but most virtual GRFs don't actually need to be
1597 * contiguous sets of GRFs. If we split, we'll end up with reduced
1598 * live intervals and better dead code elimination and coalescing.
1599 */
1600 void
1601 fs_visitor::split_virtual_grfs()
1602 {
1603 int num_vars = this->virtual_grf_count;
1604 bool split_grf[num_vars];
1605 int new_virtual_grf[num_vars];
1606
1607 /* Try to split anything > 0 sized. */
1608 for (int i = 0; i < num_vars; i++) {
1609 if (this->virtual_grf_sizes[i] != 1)
1610 split_grf[i] = true;
1611 else
1612 split_grf[i] = false;
1613 }
1614
1615 if (brw->has_pln &&
1616 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1617 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1618 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1619 * Gen6, that was the only supported interpolation mode, and since Gen6,
1620 * delta_x and delta_y are in fixed hardware registers.
1621 */
1622 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1623 false;
1624 }
1625
1626 foreach_list(node, &this->instructions) {
1627 fs_inst *inst = (fs_inst *)node;
1628
1629 /* If there's a SEND message that requires contiguous destination
1630 * registers, no splitting is allowed.
1631 */
1632 if (inst->regs_written > 1) {
1633 split_grf[inst->dst.reg] = false;
1634 }
1635
1636 /* If we're sending from a GRF, don't split it, on the assumption that
1637 * the send is reading the whole thing.
1638 */
1639 if (inst->is_send_from_grf()) {
1640 for (int i = 0; i < inst->sources; i++) {
1641 if (inst->src[i].file == GRF) {
1642 split_grf[inst->src[i].reg] = false;
1643 }
1644 }
1645 }
1646 }
1647
1648 /* Allocate new space for split regs. Note that the virtual
1649 * numbers will be contiguous.
1650 */
1651 for (int i = 0; i < num_vars; i++) {
1652 if (split_grf[i]) {
1653 new_virtual_grf[i] = virtual_grf_alloc(1);
1654 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1655 int reg = virtual_grf_alloc(1);
1656 assert(reg == new_virtual_grf[i] + j - 1);
1657 (void) reg;
1658 }
1659 this->virtual_grf_sizes[i] = 1;
1660 }
1661 }
1662
1663 foreach_list(node, &this->instructions) {
1664 fs_inst *inst = (fs_inst *)node;
1665
1666 if (inst->dst.file == GRF &&
1667 split_grf[inst->dst.reg] &&
1668 inst->dst.reg_offset != 0) {
1669 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1670 inst->dst.reg_offset - 1);
1671 inst->dst.reg_offset = 0;
1672 }
1673 for (int i = 0; i < inst->sources; i++) {
1674 if (inst->src[i].file == GRF &&
1675 split_grf[inst->src[i].reg] &&
1676 inst->src[i].reg_offset != 0) {
1677 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1678 inst->src[i].reg_offset - 1);
1679 inst->src[i].reg_offset = 0;
1680 }
1681 }
1682 }
1683 invalidate_live_intervals();
1684 }
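/* Illustration: suppose vgrf 5 has size 4 and is split.  Three fresh size-1
 * vgrfs are allocated, say 20, 21 and 22, with new_virtual_grf[5] == 20.
 * reg_offset 0 stays in vgrf 5 (now size 1), and accesses at reg_offset 1, 2
 * and 3 are rewritten to vgrfs 20, 21 and 22 at offset 0, matching the
 * "new + offset - 1" remapping above.
 */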
1685
1686 /**
1687 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1688 *
1689 * During code generation, we create tons of temporary variables, many of
1690 * which get immediately killed and are never used again. Yet, in later
1691 * optimization and analysis passes, such as compute_live_intervals, we need
1692 * to loop over all the virtual GRFs. Compacting them can save a lot of
1693 * overhead.
1694 */
1695 void
1696 fs_visitor::compact_virtual_grfs()
1697 {
1698 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER))
1699 return;
1700
1701 /* Mark which virtual GRFs are used, and count how many. */
1702 int remap_table[this->virtual_grf_count];
1703 memset(remap_table, -1, sizeof(remap_table));
1704
1705 foreach_list(node, &this->instructions) {
1706 const fs_inst *inst = (const fs_inst *) node;
1707
1708 if (inst->dst.file == GRF)
1709 remap_table[inst->dst.reg] = 0;
1710
1711 for (int i = 0; i < inst->sources; i++) {
1712 if (inst->src[i].file == GRF)
1713 remap_table[inst->src[i].reg] = 0;
1714 }
1715 }
1716
1717 /* Compact the GRF arrays. */
1718 int new_index = 0;
1719 for (int i = 0; i < this->virtual_grf_count; i++) {
1720 if (remap_table[i] != -1) {
1721 remap_table[i] = new_index;
1722 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1723 invalidate_live_intervals();
1724 ++new_index;
1725 }
1726 }
1727
1728 this->virtual_grf_count = new_index;
1729
1730 /* Patch all the instructions to use the newly renumbered registers */
1731 foreach_list(node, &this->instructions) {
1732 fs_inst *inst = (fs_inst *) node;
1733
1734 if (inst->dst.file == GRF)
1735 inst->dst.reg = remap_table[inst->dst.reg];
1736
1737 for (int i = 0; i < inst->sources; i++) {
1738 if (inst->src[i].file == GRF)
1739 inst->src[i].reg = remap_table[inst->src[i].reg];
1740 }
1741 }
1742 }
1743
1744 /*
1745 * Implements array access of uniforms by inserting a
1746 * PULL_CONSTANT_LOAD instruction.
1747 *
1748 * Unlike temporary GRF array access (which we don't support, due to the
1749 * difficulty of doing relative addressing on instruction
1750 * destinations), we could potentially do array access of uniforms
1751 * that were loaded in GRF space as push constants. In real-world
1752 * usage we've seen, though, the arrays being used are always larger
1753 * than we could load as push constants, so just always move all
1754 * uniform array access out to a pull constant buffer.
1755 */
1756 void
1757 fs_visitor::move_uniform_array_access_to_pull_constants()
1758 {
1759 if (dispatch_width != 8)
1760 return;
1761
1762 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1763
1764 for (unsigned int i = 0; i < uniforms; i++) {
1765 pull_constant_loc[i] = -1;
1766 }
1767
1768 /* Walk through and find array access of uniforms. Put a copy of that
1769 * uniform in the pull constant buffer.
1770 *
1771 * Note that we don't move constant-indexed accesses to arrays. No
1772 * testing has been done of the performance impact of this choice.
1773 */
1774 foreach_list_safe(node, &this->instructions) {
1775 fs_inst *inst = (fs_inst *)node;
1776
1777 for (int i = 0 ; i < inst->sources; i++) {
1778 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1779 continue;
1780
1781 int uniform = inst->src[i].reg;
1782
1783 /* If this array isn't already present in the pull constant buffer,
1784 * add it.
1785 */
1786 if (pull_constant_loc[uniform] == -1) {
1787 const float **values = &stage_prog_data->param[uniform];
1788
1789 assert(param_size[uniform]);
1790
1791 for (int j = 0; j < param_size[uniform]; j++) {
1792 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1793
1794 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1795 values[j];
1796 }
1797 }
1798 }
1799 }
1800 }
1801
1802 /**
1803 * Assign UNIFORM file registers to either push constants or pull constants.
1804 *
1805 * We allow a fragment shader to have more than the specified minimum
1806 * maximum number of fragment shader uniform components (64). If
1807 * there are too many of these, they'd fill up all of register space.
1808 * So, this will push some of them out to the pull constant buffer and
1809 * update the program to load them.
1810 */
1811 void
1812 fs_visitor::assign_constant_locations()
1813 {
1814 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1815 if (dispatch_width != 8)
1816 return;
1817
1818 /* Find which UNIFORM registers are still in use. */
1819 bool is_live[uniforms];
1820 for (unsigned int i = 0; i < uniforms; i++) {
1821 is_live[i] = false;
1822 }
1823
1824 foreach_list(node, &this->instructions) {
1825 fs_inst *inst = (fs_inst *) node;
1826
1827 for (int i = 0; i < inst->sources; i++) {
1828 if (inst->src[i].file != UNIFORM)
1829 continue;
1830
1831 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1832 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1833 is_live[constant_nr] = true;
1834 }
1835 }
1836
1837 /* Only allow 16 registers (128 uniform components) as push constants.
1838 *
1839 * Just demote the end of the list. We could probably do better
1840 * here, demoting things that are rarely used in the program first.
1841 */
1842 unsigned int max_push_components = 16 * 8;
1843 unsigned int num_push_constants = 0;
1844
1845 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1846
1847 for (unsigned int i = 0; i < uniforms; i++) {
1848 if (!is_live[i] || pull_constant_loc[i] != -1) {
1849 /* This UNIFORM register is either dead, or has already been demoted
1850 * to a pull const. Mark it as no longer living in the param[] array.
1851 */
1852 push_constant_loc[i] = -1;
1853 continue;
1854 }
1855
1856 if (num_push_constants < max_push_components) {
1857 /* Retain as a push constant. Record the location in the params[]
1858 * array.
1859 */
1860 push_constant_loc[i] = num_push_constants++;
1861 } else {
1862 /* Demote to a pull constant. */
1863 push_constant_loc[i] = -1;
1864
1865 int pull_index = stage_prog_data->nr_pull_params++;
1866 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1867 pull_constant_loc[i] = pull_index;
1868 }
1869 }
1870
1871 stage_prog_data->nr_params = num_push_constants;
1872
1873 /* Up until now, the param[] array has been indexed by reg + reg_offset
1874 * of UNIFORM registers. Condense it to only contain the uniforms we
1875 * chose to upload as push constants.
1876 */
1877 for (unsigned int i = 0; i < uniforms; i++) {
1878 int remapped = push_constant_loc[i];
1879
1880 if (remapped == -1)
1881 continue;
1882
1883 assert(remapped <= (int)i);
1884 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1885 }
1886 }
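/* Worked example: max_push_components is 16 * 8 == 128.  If 200 uniform
 * components are live and none were already demoted, the first 128 receive
 * push_constant_loc slots 0..127 and the remaining 72 are appended to
 * pull_param[] and addressed through pull_constant_loc instead.
 */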
1887
1888 /**
1889 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1890 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1891 */
1892 void
1893 fs_visitor::demote_pull_constants()
1894 {
1895 foreach_list(node, &this->instructions) {
1896 fs_inst *inst = (fs_inst *)node;
1897
1898 for (int i = 0; i < inst->sources; i++) {
1899 if (inst->src[i].file != UNIFORM)
1900 continue;
1901
1902 int pull_index = pull_constant_loc[inst->src[i].reg +
1903 inst->src[i].reg_offset];
1904 if (pull_index == -1)
1905 continue;
1906
1907          /* Set up the annotation tracking for newly generated instructions. */
1908 base_ir = inst->ir;
1909 current_annotation = inst->annotation;
1910
1911 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1912 fs_reg dst = fs_reg(this, glsl_type::float_type);
1913
1914 /* Generate a pull load into dst. */
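         /* A reladdr source is an indirect (run-time) index, which needs the
          * varying pull load; a constant index can use the simpler uniform
          * pull load with a compile-time offset.
          */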
1915 if (inst->src[i].reladdr) {
1916 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
1917 surf_index,
1918 *inst->src[i].reladdr,
1919 pull_index);
1920 inst->insert_before(&list);
1921 inst->src[i].reladdr = NULL;
1922 } else {
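            /* Constant offset: the load fetches the vec4 containing the
             * value, so round the byte offset down to a 16-byte boundary and
             * use set_smear() to pick the dword within it.  For example,
             * pull_index 6 is byte offset 24, which rounds down to 16 with
             * smear 6 & 3 = 2.
             */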
1923 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1924 fs_inst *pull =
1925 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1926 dst, surf_index, offset);
1927 inst->insert_before(pull);
1928 inst->src[i].set_smear(pull_index & 3);
1929 }
1930
1931 /* Rewrite the instruction to use the temporary VGRF. */
1932 inst->src[i].file = GRF;
1933 inst->src[i].reg = dst.reg;
1934 inst->src[i].reg_offset = 0;
1935 }
1936 }
1937 invalidate_live_intervals();
1938 }
1939
1940 bool
1941 fs_visitor::opt_algebraic()
1942 {
1943 bool progress = false;
1944
1945 foreach_list(node, &this->instructions) {
1946 fs_inst *inst = (fs_inst *)node;
1947
1948 switch (inst->opcode) {
1949 case BRW_OPCODE_MUL:
1950 if (inst->src[1].file != IMM)
1951 continue;
1952
1953 /* a * 1.0 = a */
1954 if (inst->src[1].is_one()) {
1955 inst->opcode = BRW_OPCODE_MOV;
1956 inst->src[1] = reg_undef;
1957 progress = true;
1958 break;
1959 }
1960
1961 /* a * 0.0 = 0.0 */
1962 if (inst->src[1].is_zero()) {
1963 inst->opcode = BRW_OPCODE_MOV;
1964 inst->src[0] = inst->src[1];
1965 inst->src[1] = reg_undef;
1966 progress = true;
1967 break;
1968 }
1969
1970 break;
1971 case BRW_OPCODE_ADD:
1972 if (inst->src[1].file != IMM)
1973 continue;
1974
1975 /* a + 0.0 = a */
1976 if (inst->src[1].is_zero()) {
1977 inst->opcode = BRW_OPCODE_MOV;
1978 inst->src[1] = reg_undef;
1979 progress = true;
1980 break;
1981 }
1982 break;
1983 case BRW_OPCODE_OR:
1984 if (inst->src[0].equals(inst->src[1])) {
1985 inst->opcode = BRW_OPCODE_MOV;
1986 inst->src[1] = reg_undef;
1987 progress = true;
1988 break;
1989 }
1990 break;
1991 case BRW_OPCODE_LRP:
1992 if (inst->src[1].equals(inst->src[2])) {
1993 inst->opcode = BRW_OPCODE_MOV;
1994 inst->src[0] = inst->src[1];
1995 inst->src[1] = reg_undef;
1996 inst->src[2] = reg_undef;
1997 progress = true;
1998 break;
1999 }
2000 break;
2001 case BRW_OPCODE_SEL:
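         /* With saturate, sel.l/sel.le against an immediate >= 1.0 (or
          * sel.g/sel.ge against one <= 0.0) can't affect the clamped result,
          * so the SEL reduces to a saturating MOV of src0.
          */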
2002 if (inst->saturate && inst->src[1].file == IMM) {
2003 switch (inst->conditional_mod) {
2004 case BRW_CONDITIONAL_LE:
2005 case BRW_CONDITIONAL_L:
2006 switch (inst->src[1].type) {
2007 case BRW_REGISTER_TYPE_F:
2008 if (inst->src[1].imm.f >= 1.0f) {
2009 inst->opcode = BRW_OPCODE_MOV;
2010 inst->src[1] = reg_undef;
2011 progress = true;
2012 }
2013 break;
2014 default:
2015 break;
2016 }
2017 break;
2018 case BRW_CONDITIONAL_GE:
2019 case BRW_CONDITIONAL_G:
2020 switch (inst->src[1].type) {
2021 case BRW_REGISTER_TYPE_F:
2022 if (inst->src[1].imm.f <= 0.0f) {
2023 inst->opcode = BRW_OPCODE_MOV;
2024 inst->src[1] = reg_undef;
2025 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2026 progress = true;
2027 }
2028 break;
2029 default:
2030 break;
2031 }
2032 default:
2033 break;
2034 }
2035 }
2036 break;
2037 default:
2038 break;
2039 }
2040 }
2041
2042 return progress;
2043 }
2044
2045 bool
2046 fs_visitor::compute_to_mrf()
2047 {
2048 bool progress = false;
2049 int next_ip = 0;
2050
2051 calculate_live_intervals();
2052
2053 foreach_list_safe(node, &this->instructions) {
2054 fs_inst *inst = (fs_inst *)node;
2055
2056 int ip = next_ip;
2057 next_ip++;
2058
2059 if (inst->opcode != BRW_OPCODE_MOV ||
2060 inst->is_partial_write() ||
2061 inst->dst.file != MRF || inst->src[0].file != GRF ||
2062 inst->dst.type != inst->src[0].type ||
2063 inst->src[0].abs || inst->src[0].negate ||
2064 !inst->src[0].is_contiguous() ||
2065 inst->src[0].subreg_offset)
2066 continue;
2067
2068 /* Work out which hardware MRF registers are written by this
2069 * instruction.
2070 */
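      /* A COMPR4 destination writes MRFs m and m+4; an uncompressed SIMD16
       * write covers two adjacent MRFs; anything else touches a single MRF.
       */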
2071 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2072 int mrf_high;
2073 if (inst->dst.reg & BRW_MRF_COMPR4) {
2074 mrf_high = mrf_low + 4;
2075 } else if (dispatch_width == 16 &&
2076 (!inst->force_uncompressed && !inst->force_sechalf)) {
2077 mrf_high = mrf_low + 1;
2078 } else {
2079 mrf_high = mrf_low;
2080 }
2081
2082 /* Can't compute-to-MRF this GRF if someone else was going to
2083 * read it later.
2084 */
2085 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2086 continue;
2087
2088 /* Found a move of a GRF to a MRF. Let's see if we can go
2089 * rewrite the thing that made this GRF to write into the MRF.
2090 */
2091 fs_inst *scan_inst;
2092 for (scan_inst = (fs_inst *)inst->prev;
2093 scan_inst->prev != NULL;
2094 scan_inst = (fs_inst *)scan_inst->prev) {
2095 if (scan_inst->dst.file == GRF &&
2096 scan_inst->dst.reg == inst->src[0].reg) {
2097             /* Found the last instruction to write the register we want
2098              * to turn into a compute-to-MRF.
2099              */
2100
2101 /* If this one instruction didn't populate all the
2102 * channels, bail. We might be able to rewrite everything
2103 * that writes that reg, but it would require smarter
2104 * tracking to delay the rewriting until complete success.
2105 */
2106 if (scan_inst->is_partial_write())
2107 break;
2108
2109 /* Things returning more than one register would need us to
2110 * understand coalescing out more than one MOV at a time.
2111 */
2112 if (scan_inst->regs_written > 1)
2113 break;
2114
2115 /* SEND instructions can't have MRF as a destination. */
2116 if (scan_inst->mlen)
2117 break;
2118
2119 if (brw->gen == 6) {
2120 /* gen6 math instructions must have the destination be
2121 * GRF, so no compute-to-MRF for them.
2122 */
2123 if (scan_inst->is_math()) {
2124 break;
2125 }
2126 }
2127
2128 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2129 /* Found the creator of our MRF's source value. */
2130 scan_inst->dst.file = MRF;
2131 scan_inst->dst.reg = inst->dst.reg;
2132 scan_inst->saturate |= inst->saturate;
2133 inst->remove();
2134 progress = true;
2135 }
2136 break;
2137 }
2138
2139 /* We don't handle control flow here. Most computation of
2140           * values that end up in MRFs happens shortly before the MRF
2141 * write anyway.
2142 */
2143 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2144 break;
2145
2146 /* You can't read from an MRF, so if someone else reads our
2147 * MRF's source GRF that we wanted to rewrite, that stops us.
2148 */
2149 bool interfered = false;
2150 for (int i = 0; i < scan_inst->sources; i++) {
2151 if (scan_inst->src[i].file == GRF &&
2152 scan_inst->src[i].reg == inst->src[0].reg &&
2153 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2154 interfered = true;
2155 }
2156 }
2157 if (interfered)
2158 break;
2159
2160 if (scan_inst->dst.file == MRF) {
2161 /* If somebody else writes our MRF here, we can't
2162 * compute-to-MRF before that.
2163 */
2164 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2165 int scan_mrf_high;
2166
2167 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2168 scan_mrf_high = scan_mrf_low + 4;
2169 } else if (dispatch_width == 16 &&
2170 (!scan_inst->force_uncompressed &&
2171 !scan_inst->force_sechalf)) {
2172 scan_mrf_high = scan_mrf_low + 1;
2173 } else {
2174 scan_mrf_high = scan_mrf_low;
2175 }
2176
2177 if (mrf_low == scan_mrf_low ||
2178 mrf_low == scan_mrf_high ||
2179 mrf_high == scan_mrf_low ||
2180 mrf_high == scan_mrf_high) {
2181 break;
2182 }
2183 }
2184
2185 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2186 /* Found a SEND instruction, which means that there are
2187 * live values in MRFs from base_mrf to base_mrf +
2188 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2189 * above it.
2190 */
2191 if (mrf_low >= scan_inst->base_mrf &&
2192 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2193 break;
2194 }
2195 if (mrf_high >= scan_inst->base_mrf &&
2196 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2197 break;
2198 }
2199 }
2200 }
2201 }
2202
2203 if (progress)
2204 invalidate_live_intervals();
2205
2206 return progress;
2207 }
2208
2209 /**
2210 * Walks through basic blocks, looking for repeated MRF writes and
2211 * removing the later ones.
2212 */
2213 bool
2214 fs_visitor::remove_duplicate_mrf_writes()
2215 {
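   /* Tracks, per MRF, the most recent full GRF-to-MRF MOV in the current
    * basic block, or NULL if that MRF's contents are unknown.
    */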
2216 fs_inst *last_mrf_move[16];
2217 bool progress = false;
2218
2219    /* The MRF tracking below doesn't handle compressed instructions, so skip SIMD16. */
2220 if (dispatch_width == 16)
2221 return false;
2222
2223 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2224
2225 foreach_list_safe(node, &this->instructions) {
2226 fs_inst *inst = (fs_inst *)node;
2227
2228 if (inst->is_control_flow()) {
2229 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2230 }
2231
2232 if (inst->opcode == BRW_OPCODE_MOV &&
2233 inst->dst.file == MRF) {
2234 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2235 if (prev_inst && inst->equals(prev_inst)) {
2236 inst->remove();
2237 progress = true;
2238 continue;
2239 }
2240 }
2241
2242 /* Clear out the last-write records for MRFs that were overwritten. */
2243 if (inst->dst.file == MRF) {
2244 last_mrf_move[inst->dst.reg] = NULL;
2245 }
2246
2247 if (inst->mlen > 0 && inst->base_mrf != -1) {
2248 /* Found a SEND instruction, which will include two or fewer
2249 * implied MRF writes. We could do better here.
2250 */
2251 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2252 last_mrf_move[inst->base_mrf + i] = NULL;
2253 }
2254 }
2255
2256 /* Clear out any MRF move records whose sources got overwritten. */
2257 if (inst->dst.file == GRF) {
2258 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2259 if (last_mrf_move[i] &&
2260 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2261 last_mrf_move[i] = NULL;
2262 }
2263 }
2264 }
2265
2266 if (inst->opcode == BRW_OPCODE_MOV &&
2267 inst->dst.file == MRF &&
2268 inst->src[0].file == GRF &&
2269 !inst->is_partial_write()) {
2270 last_mrf_move[inst->dst.reg] = inst;
2271 }
2272 }
2273
2274 if (progress)
2275 invalidate_live_intervals();
2276
2277 return progress;
2278 }
2279
2280 static void
2281 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2282 int first_grf, int grf_len)
2283 {
2284 bool inst_simd16 = (dispatch_width > 8 &&
2285 !inst->force_uncompressed &&
2286 !inst->force_sechalf);
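   /* A SIMD16 instruction that isn't split into halves reads two adjacent
    * GRFs per source, so clear both registers of the pair below.
    */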
2287
2288 /* Clear the flag for registers that actually got read (as expected). */
2289 for (int i = 0; i < inst->sources; i++) {
2290 int grf;
2291 if (inst->src[i].file == GRF) {
2292 grf = inst->src[i].reg;
2293 } else if (inst->src[i].file == HW_REG &&
2294 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2295 grf = inst->src[i].fixed_hw_reg.nr;
2296 } else {
2297 continue;
2298 }
2299
2300 if (grf >= first_grf &&
2301 grf < first_grf + grf_len) {
2302 deps[grf - first_grf] = false;
2303 if (inst_simd16)
2304 deps[grf - first_grf + 1] = false;
2305 }
2306 }
2307 }
2308
2309 /**
2310 * Implements this workaround for the original 965:
2311 *
2312 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2313 * check for post destination dependencies on this instruction, software
2314 * must ensure that there is no destination hazard for the case of ‘write
2315 * followed by a posted write’ shown in the following example.
2316 *
2317 * 1. mov r3 0
2318 * 2. send r3.xy <rest of send instruction>
2319 * 3. mov r2 r3
2320 *
2321 * Due to no post-destination dependency check on the ‘send’, the above
2322 * code sequence could have two instructions (1 and 2) in flight at the
2323  * same time that both consider ‘r3’ as the target of their final writes."
2324 */
2325 void
2326 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2327 {
2328 int reg_size = dispatch_width / 8;
2329 int write_len = inst->regs_written * reg_size;
2330 int first_write_grf = inst->dst.reg;
2331 bool needs_dep[BRW_MAX_MRF];
2332 assert(write_len < (int)sizeof(needs_dep) - 1);
2333
2334 memset(needs_dep, false, sizeof(needs_dep));
2335 memset(needs_dep, true, write_len);
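   /* needs_dep[i] is set while GRF (first_write_grf + i) still needs a
    * dependency-resolving read inserted before inst.
    */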
2336
2337 clear_deps_for_inst_src(inst, dispatch_width,
2338 needs_dep, first_write_grf, write_len);
2339
2340 /* Walk backwards looking for writes to registers we're writing which
2341 * aren't read since being written. If we hit the start of the program,
2342 * we assume that there are no outstanding dependencies on entry to the
2343 * program.
2344 */
2345 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2346 scan_inst != NULL;
2347 scan_inst = (fs_inst *)scan_inst->prev) {
2348
2349 /* If we hit control flow, assume that there *are* outstanding
2350 * dependencies, and force their cleanup before our instruction.
2351 */
2352 if (scan_inst->is_control_flow()) {
2353 for (int i = 0; i < write_len; i++) {
2354 if (needs_dep[i]) {
2355 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2356 }
2357 }
2358 return;
2359 }
2360
2361 bool scan_inst_simd16 = (dispatch_width > 8 &&
2362 !scan_inst->force_uncompressed &&
2363 !scan_inst->force_sechalf);
2364
2365 /* We insert our reads as late as possible on the assumption that any
2366 * instruction but a MOV that might have left us an outstanding
2367 * dependency has more latency than a MOV.
2368 */
2369 if (scan_inst->dst.file == GRF) {
2370 for (int i = 0; i < scan_inst->regs_written; i++) {
2371 int reg = scan_inst->dst.reg + i * reg_size;
2372
2373 if (reg >= first_write_grf &&
2374 reg < first_write_grf + write_len &&
2375 needs_dep[reg - first_write_grf]) {
2376 inst->insert_before(DEP_RESOLVE_MOV(reg));
2377 needs_dep[reg - first_write_grf] = false;
2378 if (scan_inst_simd16)
2379 needs_dep[reg - first_write_grf + 1] = false;
2380 }
2381 }
2382 }
2383
2384 /* Clear the flag for registers that actually got read (as expected). */
2385 clear_deps_for_inst_src(scan_inst, dispatch_width,
2386 needs_dep, first_write_grf, write_len);
2387
2388 /* Continue the loop only if we haven't resolved all the dependencies */
2389 int i;
2390 for (i = 0; i < write_len; i++) {
2391 if (needs_dep[i])
2392 break;
2393 }
2394 if (i == write_len)
2395 return;
2396 }
2397 }
2398
2399 /**
2400 * Implements this workaround for the original 965:
2401 *
2402 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2403 * used as a destination register until after it has been sourced by an
2404  *    instruction with a different destination register."
2405 */
2406 void
2407 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2408 {
2409 int write_len = inst->regs_written * dispatch_width / 8;
2410 int first_write_grf = inst->dst.reg;
2411 bool needs_dep[BRW_MAX_MRF];
2412 assert(write_len < (int)sizeof(needs_dep) - 1);
2413
2414 memset(needs_dep, false, sizeof(needs_dep));
2415 memset(needs_dep, true, write_len);
2416 /* Walk forwards looking for writes to registers we're writing which aren't
2417 * read before being written.
2418 */
2419 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2420 !scan_inst->is_tail_sentinel();
2421 scan_inst = (fs_inst *)scan_inst->next) {
2422 /* If we hit control flow, force resolve all remaining dependencies. */
2423 if (scan_inst->is_control_flow()) {
2424 for (int i = 0; i < write_len; i++) {
2425 if (needs_dep[i])
2426 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2427 }
2428 return;
2429 }
2430
2431 /* Clear the flag for registers that actually got read (as expected). */
2432 clear_deps_for_inst_src(scan_inst, dispatch_width,
2433 needs_dep, first_write_grf, write_len);
2434
2435 /* We insert our reads as late as possible since they're reading the
2436 * result of a SEND, which has massive latency.
2437 */
2438 if (scan_inst->dst.file == GRF &&
2439 scan_inst->dst.reg >= first_write_grf &&
2440 scan_inst->dst.reg < first_write_grf + write_len &&
2441 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2442 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2443 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2444 }
2445
2446 /* Continue the loop only if we haven't resolved all the dependencies */
2447 int i;
2448 for (i = 0; i < write_len; i++) {
2449 if (needs_dep[i])
2450 break;
2451 }
2452 if (i == write_len)
2453 return;
2454 }
2455
2456 /* If we hit the end of the program, resolve all remaining dependencies out
2457 * of paranoia.
2458 */
2459 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2460 assert(last_inst->eot);
2461 for (int i = 0; i < write_len; i++) {
2462 if (needs_dep[i])
2463 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2464 }
2465 }
2466
2467 void
2468 fs_visitor::insert_gen4_send_dependency_workarounds()
2469 {
2470 if (brw->gen != 4 || brw->is_g4x)
2471 return;
2472
2473 /* Note that we're done with register allocation, so GRF fs_regs always
2474 * have a .reg_offset of 0.
2475 */
2476
2477 foreach_list_safe(node, &this->instructions) {
2478 fs_inst *inst = (fs_inst *)node;
2479
2480 if (inst->mlen != 0 && inst->dst.file == GRF) {
2481 insert_gen4_pre_send_dependency_workarounds(inst);
2482 insert_gen4_post_send_dependency_workarounds(inst);
2483 }
2484 }
2485 }
2486
2487 /**
2488 * Turns the generic expression-style uniform pull constant load instruction
2489 * into a hardware-specific series of instructions for loading a pull
2490 * constant.
2491 *
2492 * The expression style allows the CSE pass before this to optimize out
2493 * repeated loads from the same offset, and gives the pre-register-allocation
2494 * scheduling full flexibility, while the conversion to native instructions
2495 * allows the post-register-allocation scheduler the best information
2496 * possible.
2497 *
2498 * Note that execution masking for setting up pull constant loads is special:
2499 * the channels that need to be written are unrelated to the current execution
2500 * mask, since a later instruction will use one of the result channels as a
2501 * source operand for all 8 or 16 of its channels.
2502 */
2503 void
2504 fs_visitor::lower_uniform_pull_constant_loads()
2505 {
2506 foreach_list(node, &this->instructions) {
2507 fs_inst *inst = (fs_inst *)node;
2508
2509 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2510 continue;
2511
2512 if (brw->gen >= 7) {
2513 /* The offset arg before was a vec4-aligned byte offset. We need to
2514 * turn it into a dword offset.
2515 */
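         /* e.g. a vec4-aligned byte offset of 16 becomes dword offset 4. */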
2516 fs_reg const_offset_reg = inst->src[1];
2517 assert(const_offset_reg.file == IMM &&
2518 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2519 const_offset_reg.imm.u /= 4;
2520 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2521
2522 /* This is actually going to be a MOV, but since only the first dword
2523 * is accessed, we have a special opcode to do just that one. Note
2524 * that this needs to be an operation that will be considered a def
2525 * by live variable analysis, or register allocation will explode.
2526 */
2527 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2528 payload, const_offset_reg);
2529 setup->force_writemask_all = true;
2530
2531 setup->ir = inst->ir;
2532 setup->annotation = inst->annotation;
2533 inst->insert_before(setup);
2534
2535 /* Similarly, this will only populate the first 4 channels of the
2536 * result register (since we only use smear values from 0-3), but we
2537 * don't tell the optimizer.
2538 */
2539 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2540 inst->src[1] = payload;
2541
2542 invalidate_live_intervals();
2543 } else {
2544 /* Before register allocation, we didn't tell the scheduler about the
2545 * MRF we use. We know it's safe to use this MRF because nothing
2546 * else does except for register spill/unspill, which generates and
2547 * uses its MRF within a single IR instruction.
2548 */
2549 inst->base_mrf = 14;
2550 inst->mlen = 1;
2551 }
2552 }
2553 }
2554
2555 void
2556 fs_visitor::dump_instructions()
2557 {
2558 dump_instructions(NULL);
2559 }
2560
2561 void
2562 fs_visitor::dump_instructions(const char *name)
2563 {
2564 calculate_register_pressure();
2565 FILE *file = stderr;
2566 if (name && geteuid() != 0) {
2567 file = fopen(name, "w");
2568 if (!file)
2569 file = stderr;
2570 }
2571
2572 int ip = 0, max_pressure = 0;
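   /* Each line shows the register pressure at that instruction in braces,
    * followed by the instruction index (ip) and the instruction itself.
    */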
2573 foreach_list(node, &this->instructions) {
2574 backend_instruction *inst = (backend_instruction *)node;
2575 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2576 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2577 dump_instruction(inst, file);
2578 ++ip;
2579 }
2580 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
2581
2582 if (file != stderr) {
2583 fclose(file);
2584 }
2585 }
2586
2587 void
2588 fs_visitor::dump_instruction(backend_instruction *be_inst)
2589 {
2590 dump_instruction(be_inst, stderr);
2591 }
2592
2593 void
2594 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
2595 {
2596 fs_inst *inst = (fs_inst *)be_inst;
2597
2598 if (inst->predicate) {
2599 fprintf(file, "(%cf0.%d) ",
2600 inst->predicate_inverse ? '-' : '+',
2601 inst->flag_subreg);
2602 }
2603
2604 fprintf(file, "%s", brw_instruction_name(inst->opcode));
2605 if (inst->saturate)
2606 fprintf(file, ".sat");
2607 if (inst->conditional_mod) {
2608 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
2609 if (!inst->predicate &&
2610 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2611 inst->opcode != BRW_OPCODE_IF &&
2612 inst->opcode != BRW_OPCODE_WHILE))) {
2613 fprintf(file, ".f0.%d", inst->flag_subreg);
2614 }
2615 }
2616 fprintf(file, " ");
2617
2618
2619 switch (inst->dst.file) {
2620 case GRF:
2621 fprintf(file, "vgrf%d", inst->dst.reg);
2622 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
2623 inst->dst.subreg_offset)
2624 fprintf(file, "+%d.%d",
2625 inst->dst.reg_offset, inst->dst.subreg_offset);
2626 break;
2627 case MRF:
2628 fprintf(file, "m%d", inst->dst.reg);
2629 break;
2630 case BAD_FILE:
2631 fprintf(file, "(null)");
2632 break;
2633 case UNIFORM:
2634 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
2635 break;
2636 case HW_REG:
2637 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2638 switch (inst->dst.fixed_hw_reg.nr) {
2639 case BRW_ARF_NULL:
2640 fprintf(file, "null");
2641 break;
2642 case BRW_ARF_ADDRESS:
2643 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
2644 break;
2645 case BRW_ARF_ACCUMULATOR:
2646 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
2647 break;
2648 case BRW_ARF_FLAG:
2649 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2650 inst->dst.fixed_hw_reg.subnr);
2651 break;
2652 default:
2653 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2654 inst->dst.fixed_hw_reg.subnr);
2655 break;
2656 }
2657 } else {
2658 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
2659 }
2660 if (inst->dst.fixed_hw_reg.subnr)
2661 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
2662 break;
2663 default:
2664 fprintf(file, "???");
2665 break;
2666 }
2667 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
2668
2669 for (int i = 0; i < inst->sources && inst->src[i].file != BAD_FILE; i++) {
2670 if (inst->src[i].negate)
2671 fprintf(file, "-");
2672 if (inst->src[i].abs)
2673 fprintf(file, "|");
2674 switch (inst->src[i].file) {
2675 case GRF:
2676 fprintf(file, "vgrf%d", inst->src[i].reg);
2677 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2678 inst->src[i].subreg_offset)
2679 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2680 inst->src[i].subreg_offset);
2681 break;
2682 case MRF:
2683 fprintf(file, "***m%d***", inst->src[i].reg);
2684 break;
2685 case UNIFORM:
2686 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
2687 if (inst->src[i].reladdr) {
2688 fprintf(file, "+reladdr");
2689 } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2690 inst->src[i].subreg_offset) {
2691 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2692 inst->src[i].subreg_offset);
2693 }
2694 break;
2695 case BAD_FILE:
2696 fprintf(file, "(null)");
2697 break;
2698 case IMM:
2699 switch (inst->src[i].type) {
2700 case BRW_REGISTER_TYPE_F:
2701 fprintf(file, "%ff", inst->src[i].imm.f);
2702 break;
2703 case BRW_REGISTER_TYPE_D:
2704 fprintf(file, "%dd", inst->src[i].imm.i);
2705 break;
2706 case BRW_REGISTER_TYPE_UD:
2707 fprintf(file, "%uu", inst->src[i].imm.u);
2708 break;
2709 default:
2710 fprintf(file, "???");
2711 break;
2712 }
2713 break;
2714 case HW_REG:
2715 if (inst->src[i].fixed_hw_reg.negate)
2716 fprintf(file, "-");
2717 if (inst->src[i].fixed_hw_reg.abs)
2718 fprintf(file, "|");
2719 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2720 switch (inst->src[i].fixed_hw_reg.nr) {
2721 case BRW_ARF_NULL:
2722 fprintf(file, "null");
2723 break;
2724 case BRW_ARF_ADDRESS:
2725 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
2726 break;
2727 case BRW_ARF_ACCUMULATOR:
2728 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
2729 break;
2730 case BRW_ARF_FLAG:
2731 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2732 inst->src[i].fixed_hw_reg.subnr);
2733 break;
2734 default:
2735 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2736 inst->src[i].fixed_hw_reg.subnr);
2737 break;
2738 }
2739 } else {
2740 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2741 }
2742 if (inst->src[i].fixed_hw_reg.subnr)
2743 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
2744 if (inst->src[i].fixed_hw_reg.abs)
2745 fprintf(file, "|");
2746 break;
2747 default:
2748 fprintf(file, "???");
2749 break;
2750 }
2751 if (inst->src[i].abs)
2752 fprintf(file, "|");
2753
2754 if (inst->src[i].file != IMM) {
2755 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
2756 }
2757
2758 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
2759 fprintf(file, ", ");
2760 }
2761
2762 fprintf(file, " ");
2763
2764 if (inst->force_uncompressed)
2765 fprintf(file, "1sthalf ");
2766
2767 if (inst->force_sechalf)
2768 fprintf(file, "2ndhalf ");
2769
2770 fprintf(file, "\n");
2771 }
2772
2773 /**
2774 * Possibly returns an instruction that set up @param reg.
2775 *
2776 * Sometimes we want to take the result of some expression/variable
2777 * dereference tree and rewrite the instruction generating the result
2778 * of the tree. When processing the tree, we know that the
2779 * instructions generated are all writing temporaries that are dead
2780 * outside of this tree. So, if we have some instructions that write
2781 * a temporary, we're free to point that temp write somewhere else.
2782 *
2783  * Note that this doesn't guarantee that the returned instruction wrote
2784  * only reg -- reg might be the size=4 destination of a texture instruction.
2785 */
2786 fs_inst *
2787 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2788 fs_inst *end,
2789 const fs_reg &reg)
2790 {
2791 if (end == start ||
2792 end->is_partial_write() ||
2793 reg.reladdr ||
2794 !reg.equals(end->dst)) {
2795 return NULL;
2796 } else {
2797 return end;
2798 }
2799 }
2800
2801 void
2802 fs_visitor::setup_payload_gen6()
2803 {
2804 bool uses_depth =
2805 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2806 unsigned barycentric_interp_modes = prog_data->barycentric_interp_modes;
2807
2808 assert(brw->gen >= 6);
2809
2810 /* R0-1: masks, pixel X/Y coordinates. */
2811 payload.num_regs = 2;
2812    /* R2: only for 32-pixel dispatch. */
2813
2814 /* R3-26: barycentric interpolation coordinates. These appear in the
2815 * same order that they appear in the brw_wm_barycentric_interp_mode
2816 * enum. Each set of coordinates occupies 2 registers if dispatch width
2817 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2818 * appear if they were enabled using the "Barycentric Interpolation
2819 * Mode" bits in WM_STATE.
2820 */
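   /* For example, a SIMD16 shader with two barycentric modes enabled
    * reserves 8 payload registers for the interpolation coordinates.
    */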
2821 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2822 if (barycentric_interp_modes & (1 << i)) {
2823 payload.barycentric_coord_reg[i] = payload.num_regs;
2824 payload.num_regs += 2;
2825 if (dispatch_width == 16) {
2826 payload.num_regs += 2;
2827 }
2828 }
2829 }
2830
2831 /* R27: interpolated depth if uses source depth */
2832 if (uses_depth) {
2833 payload.source_depth_reg = payload.num_regs;
2834 payload.num_regs++;
2835 if (dispatch_width == 16) {
2836 /* R28: interpolated depth if not SIMD8. */
2837 payload.num_regs++;
2838 }
2839 }
2840 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2841 if (uses_depth) {
2842 payload.source_w_reg = payload.num_regs;
2843 payload.num_regs++;
2844 if (dispatch_width == 16) {
2845 /* R30: interpolated W if not SIMD8. */
2846 payload.num_regs++;
2847 }
2848 }
2849
2850 prog_data->uses_pos_offset = key->compute_pos_offset;
2851 /* R31: MSAA position offsets. */
2852 if (prog_data->uses_pos_offset) {
2853 payload.sample_pos_reg = payload.num_regs;
2854 payload.num_regs++;
2855 }
2856
2857 /* R32: MSAA input coverage mask */
2858 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
2859 assert(brw->gen >= 7);
2860 payload.sample_mask_in_reg = payload.num_regs;
2861 payload.num_regs++;
2862 if (dispatch_width == 16) {
2863 /* R33: input coverage mask if not SIMD8. */
2864 payload.num_regs++;
2865 }
2866 }
2867
2868 /* R34-: bary for 32-pixel. */
2869 /* R58-59: interp W for 32-pixel. */
2870
2871 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2872 source_depth_to_render_target = true;
2873 }
2874 }
2875
2876 void
2877 fs_visitor::assign_binding_table_offsets()
2878 {
2879 uint32_t next_binding_table_offset = 0;
2880
2881 /* If there are no color regions, we still perform an FB write to a null
2882 * renderbuffer, which we place at surface index 0.
2883 */
2884 prog_data->binding_table.render_target_start = next_binding_table_offset;
2885 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
2886
2887 assign_common_binding_table_offsets(next_binding_table_offset);
2888 }
2889
2890 void
2891 fs_visitor::calculate_register_pressure()
2892 {
2893 invalidate_live_intervals();
2894 calculate_live_intervals();
2895
2896 int num_instructions = 0;
2897 foreach_list(node, &this->instructions) {
2898 ++num_instructions;
2899 }
2900
2901 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
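   /* regs_live_at_ip[ip] accumulates the sizes of all virtual GRFs whose
    * live interval covers instruction ip.
    */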
2902
2903 for (int reg = 0; reg < virtual_grf_count; reg++) {
2904 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
2905 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
2906 }
2907 }
2908
2909 /**
2910 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
2911 *
2912 * The needs_unlit_centroid_workaround ends up producing one of these per
2913 * channel of centroid input, so it's good to clean them up.
2914 *
2915 * An assumption here is that nothing ever modifies the dispatched pixels
2916 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
2917 * dictates that anyway.
2918 */
2919 void
2920 fs_visitor::opt_drop_redundant_mov_to_flags()
2921 {
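   /* Indexed by flag subregister: true while an earlier
    * FS_OPCODE_MOV_DISPATCH_TO_FLAGS writing that subregister is still
    * valid (no intervening flag write or control flow).
    */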
2922 bool flag_mov_found[2] = {false};
2923
2924 foreach_list_safe(node, &this->instructions) {
2925 fs_inst *inst = (fs_inst *)node;
2926
2927 if (inst->is_control_flow()) {
2928 memset(flag_mov_found, 0, sizeof(flag_mov_found));
2929 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
2930 if (!flag_mov_found[inst->flag_subreg])
2931 flag_mov_found[inst->flag_subreg] = true;
2932 else
2933 inst->remove();
2934 } else if (inst->writes_flag()) {
2935 flag_mov_found[inst->flag_subreg] = false;
2936 }
2937 }
2938 }
2939
2940 bool
2941 fs_visitor::run()
2942 {
2943 sanity_param_count = fp->Base.Parameters->NumParameters;
2944 bool allocated_without_spills;
2945
2946 assign_binding_table_offsets();
2947
2948 if (brw->gen >= 6)
2949 setup_payload_gen6();
2950 else
2951 setup_payload_gen4();
2952
2953 if (0) {
2954 emit_dummy_fs();
2955 } else {
2956 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2957 emit_shader_time_begin();
2958
2959 calculate_urb_setup();
2960 if (fp->Base.InputsRead > 0) {
2961 if (brw->gen < 6)
2962 emit_interpolation_setup_gen4();
2963 else
2964 emit_interpolation_setup_gen6();
2965 }
2966
2967 /* We handle discards by keeping track of the still-live pixels in f0.1.
2968 * Initialize it with the dispatched pixels.
2969 */
2970 if (fp->UsesKill || key->alpha_test_func) {
2971 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2972 discard_init->flag_subreg = 1;
2973 }
2974
2975 /* Generate FS IR for main(). (the visitor only descends into
2976 * functions called "main").
2977 */
2978 if (shader) {
2979 foreach_list(node, &*shader->base.ir) {
2980 ir_instruction *ir = (ir_instruction *)node;
2981 base_ir = ir;
2982 this->result = reg_undef;
2983 ir->accept(this);
2984 }
2985 } else {
2986 emit_fragment_program_code();
2987 }
2988 base_ir = NULL;
2989 if (failed)
2990 return false;
2991
2992 emit(FS_OPCODE_PLACEHOLDER_HALT);
2993
2994 if (key->alpha_test_func)
2995 emit_alpha_test();
2996
2997 emit_fb_writes();
2998
2999 split_virtual_grfs();
3000
3001 move_uniform_array_access_to_pull_constants();
3002 assign_constant_locations();
3003 demote_pull_constants();
3004
3005 opt_drop_redundant_mov_to_flags();
3006
3007 #define OPT(pass, args...) do { \
3008 pass_num++; \
3009 bool this_progress = pass(args); \
3010 \
3011 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3012 char filename[64]; \
3013 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3014 dispatch_width, shader_prog->Name, iteration, pass_num); \
3015 \
3016 backend_visitor::dump_instructions(filename); \
3017 } \
3018 \
3019 progress = progress || this_progress; \
3020 } while (false)
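      /* OPT() runs a pass and, when DEBUG_OPTIMIZER is enabled and the pass
       * made progress, dumps the IR to a file named
       * fs<width>-<prog>-<iteration>-<pass_num>-<pass> so the effect of each
       * pass can be inspected.
       */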
3021
3022 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3023 char filename[64];
3024 snprintf(filename, 64, "fs%d-%04d-00-start",
3025 dispatch_width, shader_prog->Name);
3026
3027 backend_visitor::dump_instructions(filename);
3028 }
3029
3030 bool progress;
3031 int iteration = 0;
3032 do {
3033 progress = false;
3034 iteration++;
3035 int pass_num = 0;
3036
3037 compact_virtual_grfs();
3038
3039 OPT(remove_duplicate_mrf_writes);
3040
3041 OPT(opt_algebraic);
3042 OPT(opt_cse);
3043 OPT(opt_copy_propagate);
3044 OPT(opt_peephole_predicated_break);
3045 OPT(dead_code_eliminate);
3046 OPT(opt_peephole_sel);
3047 OPT(dead_control_flow_eliminate, this);
3048 OPT(opt_saturate_propagation);
3049 OPT(register_coalesce);
3050 OPT(compute_to_mrf);
3051 } while (progress);
3052
3053 lower_uniform_pull_constant_loads();
3054
3055 assign_curb_setup();
3056 assign_urb_setup();
3057
3058 static enum instruction_scheduler_mode pre_modes[] = {
3059 SCHEDULE_PRE,
3060 SCHEDULE_PRE_NON_LIFO,
3061 SCHEDULE_PRE_LIFO,
3062 };
3063
3064 /* Try each scheduling heuristic to see if it can successfully register
3065 * allocate without spilling. They should be ordered by decreasing
3066 * performance but increasing likelihood of allocating.
3067 */
3068 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3069 schedule_instructions(pre_modes[i]);
3070
3071 if (0) {
3072 assign_regs_trivial();
3073 allocated_without_spills = true;
3074 } else {
3075 allocated_without_spills = assign_regs(false);
3076 }
3077 if (allocated_without_spills)
3078 break;
3079 }
3080
3081 if (!allocated_without_spills) {
3082 /* We assume that any spilling is worse than just dropping back to
3083 * SIMD8. There's probably actually some intermediate point where
3084 * SIMD16 with a couple of spills is still better.
3085 */
3086 if (dispatch_width == 16) {
3087 fail("Failure to register allocate. Reduce number of "
3088 "live scalar values to avoid this.");
3089 } else {
3090 perf_debug("Fragment shader triggered register spilling. "
3091 "Try reducing the number of live scalar values to "
3092 "improve performance.\n");
3093 }
3094
3095 /* Since we're out of heuristics, just go spill registers until we
3096 * get an allocation.
3097 */
3098 while (!assign_regs(true)) {
3099 if (failed)
3100 break;
3101 }
3102 }
3103 }
3104 assert(force_uncompressed_stack == 0);
3105
3106 /* This must come after all optimization and register allocation, since
3107 * it inserts dead code that happens to have side effects, and it does
3108 * so based on the actual physical registers in use.
3109 */
3110 insert_gen4_send_dependency_workarounds();
3111
3112 if (failed)
3113 return false;
3114
3115 if (!allocated_without_spills)
3116 schedule_instructions(SCHEDULE_POST);
3117
3118 if (last_scratch > 0) {
3119 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3120 }
3121
3122 if (dispatch_width == 8)
3123 prog_data->reg_blocks = brw_register_blocks(grf_used);
3124 else
3125 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3126
3127 /* If any state parameters were appended, then ParameterValues could have
3128 * been realloced, in which case the driver uniform storage set up by
3129 * _mesa_associate_uniform_storage() would point to freed memory. Make
3130 * sure that didn't happen.
3131 */
3132 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3133
3134 return !failed;
3135 }
3136
3137 const unsigned *
3138 brw_wm_fs_emit(struct brw_context *brw,
3139 void *mem_ctx,
3140 const struct brw_wm_prog_key *key,
3141 struct brw_wm_prog_data *prog_data,
3142 struct gl_fragment_program *fp,
3143 struct gl_shader_program *prog,
3144 unsigned *final_assembly_size)
3145 {
3146 bool start_busy = false;
3147 double start_time = 0;
3148
3149 if (unlikely(brw->perf_debug)) {
3150 start_busy = (brw->batch.last_bo &&
3151 drm_intel_bo_busy(brw->batch.last_bo));
3152 start_time = get_time();
3153 }
3154
3155 struct brw_shader *shader = NULL;
3156 if (prog)
3157 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3158
3159 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3160 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3161
3162 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3163 */
3164 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3165 if (!v.run()) {
3166 if (prog) {
3167 prog->LinkStatus = false;
3168 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3169 }
3170
3171 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3172 v.fail_msg);
3173
3174 return NULL;
3175 }
3176
3177 exec_list *simd16_instructions = NULL;
3178 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3179 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3180 if (!v.simd16_unsupported) {
3181 /* Try a SIMD16 compile */
3182 v2.import_uniforms(&v);
3183 if (!v2.run()) {
3184 perf_debug("SIMD16 shader failed to compile, falling back to "
3185 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3186 } else {
3187 simd16_instructions = &v2.instructions;
3188 }
3189 } else {
3190 perf_debug("SIMD16 shader unsupported, falling back to "
3191 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3192 }
3193 }
3194
3195 const unsigned *assembly = NULL;
3196 if (brw->gen >= 8) {
3197 gen8_fs_generator g(brw, mem_ctx, key, prog_data, prog, fp, v.do_dual_src);
3198 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3199 final_assembly_size);
3200 } else {
3201 fs_generator g(brw, mem_ctx, key, prog_data, prog, fp, v.do_dual_src,
3202 INTEL_DEBUG & DEBUG_WM);
3203 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3204 final_assembly_size);
3205 }
3206
3207 if (unlikely(brw->perf_debug) && shader) {
3208 if (shader->compiled_once)
3209 brw_wm_debug_recompile(brw, prog, key);
3210 shader->compiled_once = true;
3211
3212 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3213 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3214 (get_time() - start_time) * 1000);
3215 }
3216 }
3217
3218 return assembly;
3219 }
3220
3221 bool
3222 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3223 {
3224 struct brw_context *brw = brw_context(ctx);
3225 struct brw_wm_prog_key key;
3226
3227 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3228 return true;
3229
3230 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3231 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3232 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3233 bool program_uses_dfdy = fp->UsesDFdy;
3234
3235 memset(&key, 0, sizeof(key));
3236
3237 if (brw->gen < 6) {
3238 if (fp->UsesKill)
3239 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3240
3241 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3242 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3243
3244 /* Just assume depth testing. */
3245 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3246 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3247 }
3248
3249 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3250 BRW_FS_VARYING_INPUT_MASK) > 16)
3251 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3252
3253 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3254 for (unsigned i = 0; i < sampler_count; i++) {
3255 if (fp->Base.ShadowSamplers & (1 << i)) {
3256 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3257 key.tex.swizzles[i] =
3258 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3259 } else {
3260 /* Color sampler: assume no swizzling. */
3261 key.tex.swizzles[i] = SWIZZLE_XYZW;
3262 }
3263 }
3264
3265 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3266 key.drawable_height = ctx->DrawBuffer->Height;
3267 }
3268
3269 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3270 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3271 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3272
3273 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3274 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3275 key.nr_color_regions > 1;
3276 }
3277
3278 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3279 * quality of the derivatives is likely to be determined by the driconf
3280 * option.
3281 */
3282 key.high_quality_derivatives = brw->disable_derivative_optimization;
3283
3284 key.program_string_id = bfp->id;
3285
3286 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3287 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3288
3289 bool success = do_wm_prog(brw, prog, bfp, &key);
3290
3291 brw->wm.base.prog_offset = old_prog_offset;
3292 brw->wm.prog_data = old_prog_data;
3293
3294 return success;
3295 }