i965: Add SHADER_OPCODE_TG4_OFFSET for gather with nonconstant offsets.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "main/uniforms.h"
50 #include "brw_fs_live_variables.h"
51 #include "glsl/glsl_types.h"
52
53 void
54 fs_inst::init()
55 {
56 memset(this, 0, sizeof(*this));
57 this->opcode = BRW_OPCODE_NOP;
58 this->conditional_mod = BRW_CONDITIONAL_NONE;
59
60 this->dst = reg_undef;
61 this->src[0] = reg_undef;
62 this->src[1] = reg_undef;
63 this->src[2] = reg_undef;
64
65 /* This will be the case for almost all instructions. */
66 this->regs_written = 1;
67 }
68
69 fs_inst::fs_inst()
70 {
71 init();
72 }
73
74 fs_inst::fs_inst(enum opcode opcode)
75 {
76 init();
77 this->opcode = opcode;
78 }
79
80 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
81 {
82 init();
83 this->opcode = opcode;
84 this->dst = dst;
85
86 if (dst.file == GRF)
87 assert(dst.reg_offset >= 0);
88 }
89
90 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
91 {
92 init();
93 this->opcode = opcode;
94 this->dst = dst;
95 this->src[0] = src0;
96
97 if (dst.file == GRF)
98 assert(dst.reg_offset >= 0);
99 if (src[0].file == GRF)
100 assert(src[0].reg_offset >= 0);
101 }
102
103 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
104 {
105 init();
106 this->opcode = opcode;
107 this->dst = dst;
108 this->src[0] = src0;
109 this->src[1] = src1;
110
111 if (dst.file == GRF)
112 assert(dst.reg_offset >= 0);
113 if (src[0].file == GRF)
114 assert(src[0].reg_offset >= 0);
115 if (src[1].file == GRF)
116 assert(src[1].reg_offset >= 0);
117 }
118
119 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
120 fs_reg src0, fs_reg src1, fs_reg src2)
121 {
122 init();
123 this->opcode = opcode;
124 this->dst = dst;
125 this->src[0] = src0;
126 this->src[1] = src1;
127 this->src[2] = src2;
128
129 if (dst.file == GRF)
130 assert(dst.reg_offset >= 0);
131 if (src[0].file == GRF)
132 assert(src[0].reg_offset >= 0);
133 if (src[1].file == GRF)
134 assert(src[1].reg_offset >= 0);
135 if (src[2].file == GRF)
136 assert(src[2].reg_offset >= 0);
137 }
138
139 #define ALU1(op) \
140 fs_inst * \
141 fs_visitor::op(fs_reg dst, fs_reg src0) \
142 { \
143 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
144 }
145
146 #define ALU2(op) \
147 fs_inst * \
148 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
149 { \
150 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
151 }
152
153 #define ALU3(op) \
154 fs_inst * \
155 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
156 { \
157 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
158 }
159
160 ALU1(NOT)
161 ALU1(MOV)
162 ALU1(FRC)
163 ALU1(RNDD)
164 ALU1(RNDE)
165 ALU1(RNDZ)
166 ALU2(ADD)
167 ALU2(MUL)
168 ALU2(MACH)
169 ALU2(AND)
170 ALU2(OR)
171 ALU2(XOR)
172 ALU2(SHL)
173 ALU2(SHR)
174 ALU2(ASR)
175 ALU3(LRP)
176 ALU1(BFREV)
177 ALU3(BFE)
178 ALU2(BFI1)
179 ALU3(BFI2)
180 ALU1(FBH)
181 ALU1(FBL)
182 ALU1(CBIT)
183 ALU3(MAD)
184 ALU2(ADDC)
185 ALU2(SUBB)
186
187 /** Gen4 predicated IF. */
188 fs_inst *
189 fs_visitor::IF(uint32_t predicate)
190 {
191 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
192 inst->predicate = predicate;
193 return inst;
194 }
195
196 /** Gen6+ IF with embedded comparison. */
197 fs_inst *
198 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
199 {
200 assert(brw->gen >= 6);
201 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
202 reg_null_d, src0, src1);
203 inst->conditional_mod = condition;
204 return inst;
205 }
206
207 /**
208 * CMP: Sets the low bit of the destination channels with the result
209 * of the comparison, while the upper bits are undefined, and updates
210 * the flag register with the packed 16 bits of the result.
211 */
212 fs_inst *
213 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
214 {
215 fs_inst *inst;
216
217 /* Take the instruction:
218 *
219 * CMP null<d> src0<f> src1<f>
220 *
221 * Original gen4 does type conversion to the destination type before
222 * comparison, producing garbage results for floating point comparisons.
223 * gen5 does the comparison on the execution type (resolved source types),
224 * so dst type doesn't matter. gen6 does comparison and then uses the
225 * result as if it was the dst type with no conversion, which happens to
226 * mostly work out for float-interpreted-as-int since our comparisons are
227 * for >0, =0, <0.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
244 exec_list
245 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
246 fs_reg varying_offset,
247 uint32_t const_offset)
248 {
249 exec_list instructions;
250 fs_inst *inst;
251
252 /* We have our constant surface use a pitch of 4 bytes, so our index can
253 * be any component of a vector, and then we load 4 contiguous
254 * components starting from that.
255 *
256 * We break down the const_offset to a portion added to the variable
257 * offset and a portion done using reg_offset, which means that if you
258 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
259 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
260 * CSE can later notice that those loads are all the same and eliminate
261 * the redundant ones.
262 */
263 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
264 instructions.push_tail(ADD(vec4_offset,
265 varying_offset, const_offset & ~3));
266
267 int scale = 1;
268 if (brw->gen == 4 && dispatch_width == 8) {
269 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
270 * u, v, r) as parameters, or we can just use the SIMD16 message
271 * consisting of (header, u). We choose the second, at the cost of a
272 * longer return length.
273 */
274 scale = 2;
275 }
276
277 enum opcode op;
278 if (brw->gen >= 7)
279 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
280 else
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
282 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
283 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
284 inst->regs_written = 4 * scale;
285 instructions.push_tail(inst);
286
287 if (brw->gen < 7) {
288 inst->base_mrf = 13;
289 inst->header_present = true;
290 if (brw->gen == 4)
291 inst->mlen = 3;
292 else
293 inst->mlen = 1 + dispatch_width / 8;
294 }
295
296 vec4_result.reg_offset += (const_offset & 3) * scale;
297 instructions.push_tail(MOV(dst, vec4_result));
298
299 return instructions;
300 }
301
302 /**
303 * A helper for MOV generation for fixing up broken hardware SEND dependency
304 * handling.
305 */
306 fs_inst *
307 fs_visitor::DEP_RESOLVE_MOV(int grf)
308 {
309 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
310
311 inst->ir = NULL;
312 inst->annotation = "send dependency resolve";
313
314 /* The caller always wants uncompressed to emit the minimal extra
315 * dependencies, and to avoid having to deal with aligning its regs to 2.
316 */
317 inst->force_uncompressed = true;
318
319 return inst;
320 }
321
322 bool
323 fs_inst::equals(fs_inst *inst)
324 {
325 return (opcode == inst->opcode &&
326 dst.equals(inst->dst) &&
327 src[0].equals(inst->src[0]) &&
328 src[1].equals(inst->src[1]) &&
329 src[2].equals(inst->src[2]) &&
330 saturate == inst->saturate &&
331 predicate == inst->predicate &&
332 conditional_mod == inst->conditional_mod &&
333 mlen == inst->mlen &&
334 base_mrf == inst->base_mrf &&
335 sampler == inst->sampler &&
336 target == inst->target &&
337 eot == inst->eot &&
338 header_present == inst->header_present &&
339 shadow_compare == inst->shadow_compare &&
340 offset == inst->offset);
341 }
342
343 bool
344 fs_inst::overwrites_reg(const fs_reg &reg)
345 {
346 return (reg.file == dst.file &&
347 reg.reg == dst.reg &&
348 reg.reg_offset >= dst.reg_offset &&
349 reg.reg_offset < dst.reg_offset + regs_written);
350 }
351
352 bool
353 fs_inst::is_send_from_grf()
354 {
355 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
356 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
357 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
358 src[1].file == GRF) ||
359 (is_tex() && src[0].file == GRF));
360 }
361
362 bool
363 fs_visitor::can_do_source_mods(fs_inst *inst)
364 {
365 if (brw->gen == 6 && inst->is_math())
366 return false;
367
368 if (inst->is_send_from_grf())
369 return false;
370
371 if (!inst->can_do_source_mods())
372 return false;
373
374 return true;
375 }
376
377 void
378 fs_reg::init()
379 {
380 memset(this, 0, sizeof(*this));
381 this->smear = -1;
382 }
383
384 /** Generic unset register constructor. */
385 fs_reg::fs_reg()
386 {
387 init();
388 this->file = BAD_FILE;
389 }
390
391 /** Immediate value constructor. */
392 fs_reg::fs_reg(float f)
393 {
394 init();
395 this->file = IMM;
396 this->type = BRW_REGISTER_TYPE_F;
397 this->imm.f = f;
398 }
399
400 /** Immediate value constructor. */
401 fs_reg::fs_reg(int32_t i)
402 {
403 init();
404 this->file = IMM;
405 this->type = BRW_REGISTER_TYPE_D;
406 this->imm.i = i;
407 }
408
409 /** Immediate value constructor. */
410 fs_reg::fs_reg(uint32_t u)
411 {
412 init();
413 this->file = IMM;
414 this->type = BRW_REGISTER_TYPE_UD;
415 this->imm.u = u;
416 }
417
418 /** Fixed brw_reg Immediate value constructor. */
419 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
420 {
421 init();
422 this->file = HW_REG;
423 this->fixed_hw_reg = fixed_hw_reg;
424 this->type = fixed_hw_reg.type;
425 }
426
427 bool
428 fs_reg::equals(const fs_reg &r) const
429 {
430 return (file == r.file &&
431 reg == r.reg &&
432 reg_offset == r.reg_offset &&
433 type == r.type &&
434 negate == r.negate &&
435 abs == r.abs &&
436 !reladdr && !r.reladdr &&
437 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
438 sizeof(fixed_hw_reg)) == 0 &&
439 smear == r.smear &&
440 imm.u == r.imm.u);
441 }
442
443 fs_reg
444 fs_reg::retype(uint32_t type)
445 {
446 fs_reg result = *this;
447 result.type = type;
448 return result;
449 }
450
451 bool
452 fs_reg::is_zero() const
453 {
454 if (file != IMM)
455 return false;
456
457 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
458 }
459
460 bool
461 fs_reg::is_one() const
462 {
463 if (file != IMM)
464 return false;
465
466 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
467 }
468
469 bool
470 fs_reg::is_valid_3src() const
471 {
472 return file == GRF || file == UNIFORM;
473 }
474
475 int
476 fs_visitor::type_size(const struct glsl_type *type)
477 {
478 unsigned int size, i;
479
480 switch (type->base_type) {
481 case GLSL_TYPE_UINT:
482 case GLSL_TYPE_INT:
483 case GLSL_TYPE_FLOAT:
484 case GLSL_TYPE_BOOL:
485 return type->components();
486 case GLSL_TYPE_ARRAY:
487 return type_size(type->fields.array) * type->length;
488 case GLSL_TYPE_STRUCT:
489 size = 0;
490 for (i = 0; i < type->length; i++) {
491 size += type_size(type->fields.structure[i].type);
492 }
493 return size;
494 case GLSL_TYPE_SAMPLER:
495 /* Samplers take up no register space, since they're baked in at
496 * link time.
497 */
498 return 0;
499 case GLSL_TYPE_VOID:
500 case GLSL_TYPE_ERROR:
501 case GLSL_TYPE_INTERFACE:
502 assert(!"not reached");
503 break;
504 }
505
506 return 0;
507 }
508
509 fs_reg
510 fs_visitor::get_timestamp()
511 {
512 assert(brw->gen >= 7);
513
514 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
515 BRW_ARF_TIMESTAMP,
516 0),
517 BRW_REGISTER_TYPE_UD));
518
519 fs_reg dst = fs_reg(this, glsl_type::uint_type);
520
521 fs_inst *mov = emit(MOV(dst, ts));
522 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
523 * even if it's not enabled in the dispatch.
524 */
525 mov->force_writemask_all = true;
526 mov->force_uncompressed = true;
527
528 /* The caller wants the low 32 bits of the timestamp. Since it's running
529 * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
530 * which is plenty of time for our purposes. It is identical across the
531 * EUs, but since it's tracking GPU core speed it will increment at a
532 * varying rate as render P-states change.
533 *
534 * The caller could also check if render P-states have changed (or anything
535 * else that might disrupt timing) by setting smear to 2 and checking if
536 * that field is != 0.
537 */
538 dst.smear = 0;
539
540 return dst;
541 }
542
543 void
544 fs_visitor::emit_shader_time_begin()
545 {
546 current_annotation = "shader time start";
547 shader_start_time = get_timestamp();
548 }
549
550 void
551 fs_visitor::emit_shader_time_end()
552 {
553 current_annotation = "shader time end";
554
555 enum shader_time_shader_type type, written_type, reset_type;
556 if (dispatch_width == 8) {
557 type = ST_FS8;
558 written_type = ST_FS8_WRITTEN;
559 reset_type = ST_FS8_RESET;
560 } else {
561 assert(dispatch_width == 16);
562 type = ST_FS16;
563 written_type = ST_FS16_WRITTEN;
564 reset_type = ST_FS16_RESET;
565 }
566
567 fs_reg shader_end_time = get_timestamp();
568
569 /* Check that there weren't any timestamp reset events (assuming these
570 * were the only two timestamp reads that happened).
571 */
572 fs_reg reset = shader_end_time;
573 reset.smear = 2;
574 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
575 test->conditional_mod = BRW_CONDITIONAL_Z;
576 emit(IF(BRW_PREDICATE_NORMAL));
577
578 push_force_uncompressed();
579 fs_reg start = shader_start_time;
580 start.negate = true;
581 fs_reg diff = fs_reg(this, glsl_type::uint_type);
582 emit(ADD(diff, start, shader_end_time));
583
584 /* If there were no instructions between the two timestamp gets, the diff
585 * is 2 cycles. Remove that overhead, so I can forget about that when
586 * trying to determine the time taken for single instructions.
587 */
588 emit(ADD(diff, diff, fs_reg(-2u)));
589
590 emit_shader_time_write(type, diff);
591 emit_shader_time_write(written_type, fs_reg(1u));
592 emit(BRW_OPCODE_ELSE);
593 emit_shader_time_write(reset_type, fs_reg(1u));
594 emit(BRW_OPCODE_ENDIF);
595
596 pop_force_uncompressed();
597 }
598
599 void
600 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
601 fs_reg value)
602 {
603 int shader_time_index =
604 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
605 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
606
607 fs_reg payload;
608 if (dispatch_width == 8)
609 payload = fs_reg(this, glsl_type::uvec2_type);
610 else
611 payload = fs_reg(this, glsl_type::uint_type);
612
613 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
614 fs_reg(), payload, offset, value));
615 }
616
617 void
618 fs_visitor::fail(const char *format, ...)
619 {
620 va_list va;
621 char *msg;
622
623 if (failed)
624 return;
625
626 failed = true;
627
628 va_start(va, format);
629 msg = ralloc_vasprintf(mem_ctx, format, va);
630 va_end(va);
631 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
632
633 this->fail_msg = msg;
634
635 if (INTEL_DEBUG & DEBUG_WM) {
636 fprintf(stderr, "%s", msg);
637 }
638 }
639
640 fs_inst *
641 fs_visitor::emit(enum opcode opcode)
642 {
643 return emit(fs_inst(opcode));
644 }
645
646 fs_inst *
647 fs_visitor::emit(enum opcode opcode, fs_reg dst)
648 {
649 return emit(fs_inst(opcode, dst));
650 }
651
652 fs_inst *
653 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
654 {
655 return emit(fs_inst(opcode, dst, src0));
656 }
657
658 fs_inst *
659 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
660 {
661 return emit(fs_inst(opcode, dst, src0, src1));
662 }
663
664 fs_inst *
665 fs_visitor::emit(enum opcode opcode, fs_reg dst,
666 fs_reg src0, fs_reg src1, fs_reg src2)
667 {
668 return emit(fs_inst(opcode, dst, src0, src1, src2));
669 }
670
671 void
672 fs_visitor::push_force_uncompressed()
673 {
674 force_uncompressed_stack++;
675 }
676
677 void
678 fs_visitor::pop_force_uncompressed()
679 {
680 force_uncompressed_stack--;
681 assert(force_uncompressed_stack >= 0);
682 }
683
684 void
685 fs_visitor::push_force_sechalf()
686 {
687 force_sechalf_stack++;
688 }
689
690 void
691 fs_visitor::pop_force_sechalf()
692 {
693 force_sechalf_stack--;
694 assert(force_sechalf_stack >= 0);
695 }
696
697 /**
698 * Returns true if the instruction has a flag that means it won't
699 * update an entire destination register.
700 *
701 * For example, dead code elimination and live variable analysis want to know
702 * when a write to a variable screens off any preceding values that were in
703 * it.
704 */
705 bool
706 fs_inst::is_partial_write()
707 {
708 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
709 this->force_uncompressed ||
710 this->force_sechalf);
711 }
712
713 int
714 fs_inst::regs_read(fs_visitor *v, int arg)
715 {
716 if (is_tex() && arg == 0 && src[0].file == GRF) {
717 if (v->dispatch_width == 16)
718 return (mlen + 1) / 2;
719 else
720 return mlen;
721 }
722 return 1;
723 }
724
725 /**
726 * Returns how many MRFs an FS opcode will write over.
727 *
728 * Note that this is not the 0 or 1 implied writes in an actual gen
729 * instruction -- the FS opcodes often generate MOVs in addition.
730 */
731 int
732 fs_visitor::implied_mrf_writes(fs_inst *inst)
733 {
734 if (inst->mlen == 0)
735 return 0;
736
737 if (inst->base_mrf == -1)
738 return 0;
739
740 switch (inst->opcode) {
741 case SHADER_OPCODE_RCP:
742 case SHADER_OPCODE_RSQ:
743 case SHADER_OPCODE_SQRT:
744 case SHADER_OPCODE_EXP2:
745 case SHADER_OPCODE_LOG2:
746 case SHADER_OPCODE_SIN:
747 case SHADER_OPCODE_COS:
748 return 1 * dispatch_width / 8;
749 case SHADER_OPCODE_POW:
750 case SHADER_OPCODE_INT_QUOTIENT:
751 case SHADER_OPCODE_INT_REMAINDER:
752 return 2 * dispatch_width / 8;
753 case SHADER_OPCODE_TEX:
754 case FS_OPCODE_TXB:
755 case SHADER_OPCODE_TXD:
756 case SHADER_OPCODE_TXF:
757 case SHADER_OPCODE_TXF_MS:
758 case SHADER_OPCODE_TG4:
759 case SHADER_OPCODE_TG4_OFFSET:
760 case SHADER_OPCODE_TXL:
761 case SHADER_OPCODE_TXS:
762 case SHADER_OPCODE_LOD:
763 return 1;
764 case FS_OPCODE_FB_WRITE:
765 return 2;
766 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
767 case FS_OPCODE_UNSPILL:
768 return 1;
769 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
770 return inst->mlen;
771 case FS_OPCODE_SPILL:
772 return 2;
773 default:
774 assert(!"not reached");
775 return inst->mlen;
776 }
777 }
778
779 int
780 fs_visitor::virtual_grf_alloc(int size)
781 {
782 if (virtual_grf_array_size <= virtual_grf_count) {
783 if (virtual_grf_array_size == 0)
784 virtual_grf_array_size = 16;
785 else
786 virtual_grf_array_size *= 2;
787 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
788 virtual_grf_array_size);
789 }
790 virtual_grf_sizes[virtual_grf_count] = size;
791 return virtual_grf_count++;
792 }
793
794 /** Fixed HW reg constructor. */
795 fs_reg::fs_reg(enum register_file file, int reg)
796 {
797 init();
798 this->file = file;
799 this->reg = reg;
800 this->type = BRW_REGISTER_TYPE_F;
801 }
802
803 /** Fixed HW reg constructor. */
804 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
805 {
806 init();
807 this->file = file;
808 this->reg = reg;
809 this->type = type;
810 }
811
812 /** Automatic reg constructor. */
813 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
814 {
815 init();
816
817 this->file = GRF;
818 this->reg = v->virtual_grf_alloc(v->type_size(type));
819 this->reg_offset = 0;
820 this->type = brw_type_for_base_type(type);
821 }
822
823 fs_reg *
824 fs_visitor::variable_storage(ir_variable *var)
825 {
826 return (fs_reg *)hash_table_find(this->variable_ht, var);
827 }
828
829 void
830 import_uniforms_callback(const void *key,
831 void *data,
832 void *closure)
833 {
834 struct hash_table *dst_ht = (struct hash_table *)closure;
835 const fs_reg *reg = (const fs_reg *)data;
836
837 if (reg->file != UNIFORM)
838 return;
839
840 hash_table_insert(dst_ht, data, key);
841 }
842
843 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
844 * This brings in those uniform definitions
845 */
846 void
847 fs_visitor::import_uniforms(fs_visitor *v)
848 {
849 hash_table_call_foreach(v->variable_ht,
850 import_uniforms_callback,
851 variable_ht);
852 this->params_remap = v->params_remap;
853 this->nr_params_remap = v->nr_params_remap;
854 }
855
856 /* Our support for uniforms is piggy-backed on the struct
857 * gl_fragment_program, because that's where the values actually
858 * get stored, rather than in some global gl_shader_program uniform
859 * store.
860 */
861 void
862 fs_visitor::setup_uniform_values(ir_variable *ir)
863 {
864 int namelen = strlen(ir->name);
865
866 /* The data for our (non-builtin) uniforms is stored in a series of
867 * gl_uniform_driver_storage structs for each subcomponent that
868 * glGetUniformLocation() could name. We know it's been set up in the same
869 * order we'd walk the type, so walk the list of storage and find anything
870 * with our name, or the prefix of a component that starts with our name.
871 */
872 unsigned params_before = c->prog_data.nr_params;
873 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
874 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
875
876 if (strncmp(ir->name, storage->name, namelen) != 0 ||
877 (storage->name[namelen] != 0 &&
878 storage->name[namelen] != '.' &&
879 storage->name[namelen] != '[')) {
880 continue;
881 }
882
883 unsigned slots = storage->type->component_slots();
884 if (storage->array_elements)
885 slots *= storage->array_elements;
886
887 for (unsigned i = 0; i < slots; i++) {
888 c->prog_data.param[c->prog_data.nr_params++] =
889 &storage->storage[i].f;
890 }
891 }
892
893 /* Make sure we actually initialized the right amount of stuff here. */
894 assert(params_before + ir->type->component_slots() ==
895 c->prog_data.nr_params);
896 (void)params_before;
897 }
898
899
900 /* Our support for builtin uniforms is even scarier than non-builtin.
901 * It sits on top of the PROG_STATE_VAR parameters that are
902 * automatically updated from GL context state.
903 */
904 void
905 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
906 {
907 const ir_state_slot *const slots = ir->state_slots;
908 assert(ir->state_slots != NULL);
909
910 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
911 /* This state reference has already been setup by ir_to_mesa, but we'll
912 * get the same index back here.
913 */
914 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
915 (gl_state_index *)slots[i].tokens);
916
917 /* Add each of the unique swizzles of the element as a parameter.
918 * This'll end up matching the expected layout of the
919 * array/matrix/structure we're trying to fill in.
920 */
921 int last_swiz = -1;
922 for (unsigned int j = 0; j < 4; j++) {
923 int swiz = GET_SWZ(slots[i].swizzle, j);
924 if (swiz == last_swiz)
925 break;
926 last_swiz = swiz;
927
928 c->prog_data.param[c->prog_data.nr_params++] =
929 &fp->Base.Parameters->ParameterValues[index][swiz].f;
930 }
931 }
932 }
933
934 fs_reg *
935 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
936 {
937 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
938 fs_reg wpos = *reg;
939 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
940
941 /* gl_FragCoord.x */
942 if (ir->pixel_center_integer) {
943 emit(MOV(wpos, this->pixel_x));
944 } else {
945 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
946 }
947 wpos.reg_offset++;
948
949 /* gl_FragCoord.y */
950 if (!flip && ir->pixel_center_integer) {
951 emit(MOV(wpos, this->pixel_y));
952 } else {
953 fs_reg pixel_y = this->pixel_y;
954 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
955
956 if (flip) {
957 pixel_y.negate = true;
958 offset += c->key.drawable_height - 1.0;
959 }
960
961 emit(ADD(wpos, pixel_y, fs_reg(offset)));
962 }
963 wpos.reg_offset++;
964
965 /* gl_FragCoord.z */
966 if (brw->gen >= 6) {
967 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
968 } else {
969 emit(FS_OPCODE_LINTERP, wpos,
970 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
971 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
972 interp_reg(VARYING_SLOT_POS, 2));
973 }
974 wpos.reg_offset++;
975
976 /* gl_FragCoord.w: Already set up in emit_interpolation */
977 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
978
979 return reg;
980 }
981
982 fs_inst *
983 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
984 glsl_interp_qualifier interpolation_mode,
985 bool is_centroid)
986 {
987 brw_wm_barycentric_interp_mode barycoord_mode;
988 if (brw->gen >= 6) {
989 if (is_centroid) {
990 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
991 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
992 else
993 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
994 } else {
995 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
996 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
997 else
998 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
999 }
1000 } else {
1001 /* On Ironlake and below, there is only one interpolation mode.
1002 * Centroid interpolation doesn't mean anything on this hardware --
1003 * there is no multisampling.
1004 */
1005 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1006 }
1007 return emit(FS_OPCODE_LINTERP, attr,
1008 this->delta_x[barycoord_mode],
1009 this->delta_y[barycoord_mode], interp);
1010 }
1011
1012 fs_reg *
1013 fs_visitor::emit_general_interpolation(ir_variable *ir)
1014 {
1015 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1016 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1017 fs_reg attr = *reg;
1018
1019 unsigned int array_elements;
1020 const glsl_type *type;
1021
1022 if (ir->type->is_array()) {
1023 array_elements = ir->type->length;
1024 if (array_elements == 0) {
1025 fail("dereferenced array '%s' has length 0\n", ir->name);
1026 }
1027 type = ir->type->fields.array;
1028 } else {
1029 array_elements = 1;
1030 type = ir->type;
1031 }
1032
1033 glsl_interp_qualifier interpolation_mode =
1034 ir->determine_interpolation_mode(c->key.flat_shade);
1035
1036 int location = ir->location;
1037 for (unsigned int i = 0; i < array_elements; i++) {
1038 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1039 if (c->prog_data.urb_setup[location] == -1) {
1040 /* If there's no incoming setup data for this slot, don't
1041 * emit interpolation for it.
1042 */
1043 attr.reg_offset += type->vector_elements;
1044 location++;
1045 continue;
1046 }
1047
1048 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1049 /* Constant interpolation (flat shading) case. The SF has
1050 * handed us defined values in only the constant offset
1051 * field of the setup reg.
1052 */
1053 for (unsigned int k = 0; k < type->vector_elements; k++) {
1054 struct brw_reg interp = interp_reg(location, k);
1055 interp = suboffset(interp, 3);
1056 interp.type = reg->type;
1057 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1058 attr.reg_offset++;
1059 }
1060 } else {
1061 /* Smooth/noperspective interpolation case. */
1062 for (unsigned int k = 0; k < type->vector_elements; k++) {
1063 /* FINISHME: At some point we probably want to push
1064 * this farther by giving similar treatment to the
1065 * other potentially constant components of the
1066 * attribute, as well as making brw_vs_constval.c
1067 * handle varyings other than gl_TexCoord.
1068 */
1069 struct brw_reg interp = interp_reg(location, k);
1070 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1071 ir->centroid);
1072 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1073 /* Get the pixel/sample mask into f0 so that we know
1074 * which pixels are lit. Then, for each channel that is
1075 * unlit, replace the centroid data with non-centroid
1076 * data.
1077 */
1078 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1079 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1080 interpolation_mode, false);
1081 inst->predicate = BRW_PREDICATE_NORMAL;
1082 inst->predicate_inverse = true;
1083 }
1084 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1085 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1086 }
1087 attr.reg_offset++;
1088 }
1089
1090 }
1091 location++;
1092 }
1093 }
1094
1095 return reg;
1096 }
1097
1098 fs_reg *
1099 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1100 {
1101 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1102
1103 /* The frontfacing comes in as a bit in the thread payload. */
1104 if (brw->gen >= 6) {
1105 emit(BRW_OPCODE_ASR, *reg,
1106 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1107 fs_reg(15));
1108 emit(BRW_OPCODE_NOT, *reg, *reg);
1109 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1110 } else {
1111 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1112 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1113 * us front face
1114 */
1115 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1116 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1117 }
1118
1119 return reg;
1120 }
1121
1122 fs_reg
1123 fs_visitor::fix_math_operand(fs_reg src)
1124 {
1125 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1126 * might be able to do better by doing execsize = 1 math and then
1127 * expanding that result out, but we would need to be careful with
1128 * masking.
1129 *
1130 * The hardware ignores source modifiers (negate and abs) on math
1131 * instructions, so we also move to a temp to set those up.
1132 */
1133 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1134 !src.abs && !src.negate)
1135 return src;
1136
1137 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1138 * operands to math
1139 */
1140 if (brw->gen >= 7 && src.file != IMM)
1141 return src;
1142
1143 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1144 expanded.type = src.type;
1145 emit(BRW_OPCODE_MOV, expanded, src);
1146 return expanded;
1147 }
1148
1149 fs_inst *
1150 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1151 {
1152 switch (opcode) {
1153 case SHADER_OPCODE_RCP:
1154 case SHADER_OPCODE_RSQ:
1155 case SHADER_OPCODE_SQRT:
1156 case SHADER_OPCODE_EXP2:
1157 case SHADER_OPCODE_LOG2:
1158 case SHADER_OPCODE_SIN:
1159 case SHADER_OPCODE_COS:
1160 break;
1161 default:
1162 assert(!"not reached: bad math opcode");
1163 return NULL;
1164 }
1165
1166 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1167 * might be able to do better by doing execsize = 1 math and then
1168 * expanding that result out, but we would need to be careful with
1169 * masking.
1170 *
1171 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1172 * instructions, so we also move to a temp to set those up.
1173 */
1174 if (brw->gen >= 6)
1175 src = fix_math_operand(src);
1176
1177 fs_inst *inst = emit(opcode, dst, src);
1178
1179 if (brw->gen < 6) {
1180 inst->base_mrf = 2;
1181 inst->mlen = dispatch_width / 8;
1182 }
1183
1184 return inst;
1185 }
1186
1187 fs_inst *
1188 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1189 {
1190 int base_mrf = 2;
1191 fs_inst *inst;
1192
1193 switch (opcode) {
1194 case SHADER_OPCODE_INT_QUOTIENT:
1195 case SHADER_OPCODE_INT_REMAINDER:
1196 if (brw->gen >= 7 && dispatch_width == 16)
1197 fail("16-wide INTDIV unsupported\n");
1198 break;
1199 case SHADER_OPCODE_POW:
1200 break;
1201 default:
1202 assert(!"not reached: unsupported binary math opcode.");
1203 return NULL;
1204 }
1205
1206 if (brw->gen >= 6) {
1207 src0 = fix_math_operand(src0);
1208 src1 = fix_math_operand(src1);
1209
1210 inst = emit(opcode, dst, src0, src1);
1211 } else {
1212 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1213 * "Message Payload":
1214 *
1215 * "Operand0[7]. For the INT DIV functions, this operand is the
1216 * denominator."
1217 * ...
1218 * "Operand1[7]. For the INT DIV functions, this operand is the
1219 * numerator."
1220 */
1221 bool is_int_div = opcode != SHADER_OPCODE_POW;
1222 fs_reg &op0 = is_int_div ? src1 : src0;
1223 fs_reg &op1 = is_int_div ? src0 : src1;
1224
1225 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1226 inst = emit(opcode, dst, op0, reg_null_f);
1227
1228 inst->base_mrf = base_mrf;
1229 inst->mlen = 2 * dispatch_width / 8;
1230 }
1231 return inst;
1232 }
1233
1234 void
1235 fs_visitor::assign_curb_setup()
1236 {
1237 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1238 if (dispatch_width == 8) {
1239 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1240 } else {
1241 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1242 }
1243
1244 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1245 foreach_list(node, &this->instructions) {
1246 fs_inst *inst = (fs_inst *)node;
1247
1248 for (unsigned int i = 0; i < 3; i++) {
1249 if (inst->src[i].file == UNIFORM) {
1250 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1251 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1252 constant_nr / 8,
1253 constant_nr % 8);
1254
1255 inst->src[i].file = HW_REG;
1256 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1257 }
1258 }
1259 }
1260 }
1261
1262 void
1263 fs_visitor::calculate_urb_setup()
1264 {
1265 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1266 c->prog_data.urb_setup[i] = -1;
1267 }
1268
1269 int urb_next = 0;
1270 /* Figure out where each of the incoming setup attributes lands. */
1271 if (brw->gen >= 6) {
1272 if (_mesa_bitcount_64(fp->Base.InputsRead &
1273 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1274 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1275 * first 16 varying inputs, so we can put them wherever we want.
1276 * Just put them in order.
1277 *
1278 * This is useful because it means that (a) inputs not used by the
1279 * fragment shader won't take up valuable register space, and (b) we
1280 * won't have to recompile the fragment shader if it gets paired with
1281 * a different vertex (or geometry) shader.
1282 */
1283 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1284 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1285 BITFIELD64_BIT(i)) {
1286 c->prog_data.urb_setup[i] = urb_next++;
1287 }
1288 }
1289 } else {
1290 /* We have enough input varyings that the SF/SBE pipeline stage can't
1291 * arbitrarily rearrange them to suit our whim; we have to put them
1292 * in an order that matches the output of the previous pipeline stage
1293 * (geometry or vertex shader).
1294 */
1295 struct brw_vue_map prev_stage_vue_map;
1296 brw_compute_vue_map(brw, &prev_stage_vue_map,
1297 c->key.input_slots_valid);
1298 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1299 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1300 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1301 slot++) {
1302 int varying = prev_stage_vue_map.slot_to_varying[slot];
1303 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1304 * unused.
1305 */
1306 if (varying != BRW_VARYING_SLOT_COUNT &&
1307 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1308 BITFIELD64_BIT(varying))) {
1309 c->prog_data.urb_setup[varying] = slot - first_slot;
1310 }
1311 }
1312 urb_next = prev_stage_vue_map.num_slots - first_slot;
1313 }
1314 } else {
1315 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1316 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1317 /* Point size is packed into the header, not as a general attribute */
1318 if (i == VARYING_SLOT_PSIZ)
1319 continue;
1320
1321 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1322 /* The back color slot is skipped when the front color is
1323 * also written to. In addition, some slots can be
1324 * written in the vertex shader and not read in the
1325 * fragment shader. So the register number must always be
1326 * incremented, mapped or not.
1327 */
1328 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1329 c->prog_data.urb_setup[i] = urb_next;
1330 urb_next++;
1331 }
1332 }
1333
1334 /*
1335 * It's a FS only attribute, and we did interpolation for this attribute
1336 * in SF thread. So, count it here, too.
1337 *
1338 * See compile_sf_prog() for more info.
1339 */
1340 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1341 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1342 }
1343
1344 c->prog_data.num_varying_inputs = urb_next;
1345 }
1346
1347 void
1348 fs_visitor::assign_urb_setup()
1349 {
1350 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1351
1352 /* Offset all the urb_setup[] index by the actual position of the
1353 * setup regs, now that the location of the constants has been chosen.
1354 */
1355 foreach_list(node, &this->instructions) {
1356 fs_inst *inst = (fs_inst *)node;
1357
1358 if (inst->opcode == FS_OPCODE_LINTERP) {
1359 assert(inst->src[2].file == HW_REG);
1360 inst->src[2].fixed_hw_reg.nr += urb_start;
1361 }
1362
1363 if (inst->opcode == FS_OPCODE_CINTERP) {
1364 assert(inst->src[0].file == HW_REG);
1365 inst->src[0].fixed_hw_reg.nr += urb_start;
1366 }
1367 }
1368
1369 /* Each attribute is 4 setup channels, each of which is half a reg. */
1370 this->first_non_payload_grf =
1371 urb_start + c->prog_data.num_varying_inputs * 2;
1372 }
1373
1374 /**
1375 * Split large virtual GRFs into separate components if we can.
1376 *
1377 * This is mostly duplicated with what brw_fs_vector_splitting does,
1378 * but that's really conservative because it's afraid of doing
1379 * splitting that doesn't result in real progress after the rest of
1380 * the optimization phases, which would cause infinite looping in
1381 * optimization. We can do it once here, safely. This also has the
1382 * opportunity to split interpolated values, or maybe even uniforms,
1383 * which we don't have at the IR level.
1384 *
1385 * We want to split, because virtual GRFs are what we register
1386 * allocate and spill (due to contiguousness requirements for some
1387 * instructions), and they're what we naturally generate in the
1388 * codegen process, but most virtual GRFs don't actually need to be
1389 * contiguous sets of GRFs. If we split, we'll end up with reduced
1390 * live intervals and better dead code elimination and coalescing.
1391 */
1392 void
1393 fs_visitor::split_virtual_grfs()
1394 {
1395 int num_vars = this->virtual_grf_count;
1396 bool split_grf[num_vars];
1397 int new_virtual_grf[num_vars];
1398
1399 /* Try to split anything > 0 sized. */
1400 for (int i = 0; i < num_vars; i++) {
1401 if (this->virtual_grf_sizes[i] != 1)
1402 split_grf[i] = true;
1403 else
1404 split_grf[i] = false;
1405 }
1406
1407 if (brw->has_pln &&
1408 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1409 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1410 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1411 * Gen6, that was the only supported interpolation mode, and since Gen6,
1412 * delta_x and delta_y are in fixed hardware registers.
1413 */
1414 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1415 false;
1416 }
1417
1418 foreach_list(node, &this->instructions) {
1419 fs_inst *inst = (fs_inst *)node;
1420
1421 /* If there's a SEND message that requires contiguous destination
1422 * registers, no splitting is allowed.
1423 */
1424 if (inst->regs_written > 1) {
1425 split_grf[inst->dst.reg] = false;
1426 }
1427
1428 /* If we're sending from a GRF, don't split it, on the assumption that
1429 * the send is reading the whole thing.
1430 */
1431 if (inst->is_send_from_grf()) {
1432 for (int i = 0; i < 3; i++) {
1433 if (inst->src[i].file == GRF) {
1434 split_grf[inst->src[i].reg] = false;
1435 }
1436 }
1437 }
1438 }
1439
1440 /* Allocate new space for split regs. Note that the virtual
1441 * numbers will be contiguous.
1442 */
1443 for (int i = 0; i < num_vars; i++) {
1444 if (split_grf[i]) {
1445 new_virtual_grf[i] = virtual_grf_alloc(1);
1446 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1447 int reg = virtual_grf_alloc(1);
1448 assert(reg == new_virtual_grf[i] + j - 1);
1449 (void) reg;
1450 }
1451 this->virtual_grf_sizes[i] = 1;
1452 }
1453 }
1454
1455 foreach_list(node, &this->instructions) {
1456 fs_inst *inst = (fs_inst *)node;
1457
1458 if (inst->dst.file == GRF &&
1459 split_grf[inst->dst.reg] &&
1460 inst->dst.reg_offset != 0) {
1461 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1462 inst->dst.reg_offset - 1);
1463 inst->dst.reg_offset = 0;
1464 }
1465 for (int i = 0; i < 3; i++) {
1466 if (inst->src[i].file == GRF &&
1467 split_grf[inst->src[i].reg] &&
1468 inst->src[i].reg_offset != 0) {
1469 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1470 inst->src[i].reg_offset - 1);
1471 inst->src[i].reg_offset = 0;
1472 }
1473 }
1474 }
1475 invalidate_live_intervals();
1476 }
1477
1478 /**
1479 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1480 *
1481 * During code generation, we create tons of temporary variables, many of
1482 * which get immediately killed and are never used again. Yet, in later
1483 * optimization and analysis passes, such as compute_live_intervals, we need
1484 * to loop over all the virtual GRFs. Compacting them can save a lot of
1485 * overhead.
1486 */
1487 void
1488 fs_visitor::compact_virtual_grfs()
1489 {
1490 /* Mark which virtual GRFs are used, and count how many. */
1491 int remap_table[this->virtual_grf_count];
1492 memset(remap_table, -1, sizeof(remap_table));
1493
1494 foreach_list(node, &this->instructions) {
1495 const fs_inst *inst = (const fs_inst *) node;
1496
1497 if (inst->dst.file == GRF)
1498 remap_table[inst->dst.reg] = 0;
1499
1500 for (int i = 0; i < 3; i++) {
1501 if (inst->src[i].file == GRF)
1502 remap_table[inst->src[i].reg] = 0;
1503 }
1504 }
1505
1506 /* In addition to registers used in instructions, fs_visitor keeps
1507 * direct references to certain special values which must be patched:
1508 */
1509 fs_reg *special[] = {
1510 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1511 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1512 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1513 &delta_x[0], &delta_x[1], &delta_x[2],
1514 &delta_x[3], &delta_x[4], &delta_x[5],
1515 &delta_y[0], &delta_y[1], &delta_y[2],
1516 &delta_y[3], &delta_y[4], &delta_y[5],
1517 };
1518 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1519 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1520
1521 /* Treat all special values as used, to be conservative */
1522 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1523 if (special[i]->file == GRF)
1524 remap_table[special[i]->reg] = 0;
1525 }
1526
1527 /* Compact the GRF arrays. */
1528 int new_index = 0;
1529 for (int i = 0; i < this->virtual_grf_count; i++) {
1530 if (remap_table[i] != -1) {
1531 remap_table[i] = new_index;
1532 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1533 invalidate_live_intervals();
1534 ++new_index;
1535 }
1536 }
1537
1538 this->virtual_grf_count = new_index;
1539
1540 /* Patch all the instructions to use the newly renumbered registers */
1541 foreach_list(node, &this->instructions) {
1542 fs_inst *inst = (fs_inst *) node;
1543
1544 if (inst->dst.file == GRF)
1545 inst->dst.reg = remap_table[inst->dst.reg];
1546
1547 for (int i = 0; i < 3; i++) {
1548 if (inst->src[i].file == GRF)
1549 inst->src[i].reg = remap_table[inst->src[i].reg];
1550 }
1551 }
1552
1553 /* Patch all the references to special values */
1554 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1555 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1556 special[i]->reg = remap_table[special[i]->reg];
1557 }
1558 }
1559
1560 bool
1561 fs_visitor::remove_dead_constants()
1562 {
1563 if (dispatch_width == 8) {
1564 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1565 this->nr_params_remap = c->prog_data.nr_params;
1566
1567 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1568 this->params_remap[i] = -1;
1569
1570 /* Find which params are still in use. */
1571 foreach_list(node, &this->instructions) {
1572 fs_inst *inst = (fs_inst *)node;
1573
1574 for (int i = 0; i < 3; i++) {
1575 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1576
1577 if (inst->src[i].file != UNIFORM)
1578 continue;
1579
1580 /* Section 5.11 of the OpenGL 4.3 spec says:
1581 *
1582 * "Out-of-bounds reads return undefined values, which include
1583 * values from other variables of the active program or zero."
1584 */
1585 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1586 constant_nr = 0;
1587 }
1588
1589 /* For now, set this to non-negative. We'll give it the
1590 * actual new number in a moment, in order to keep the
1591 * register numbers nicely ordered.
1592 */
1593 this->params_remap[constant_nr] = 0;
1594 }
1595 }
1596
1597 /* Figure out what the new numbers for the params will be. At some
1598 * point when we're doing uniform array access, we're going to want
1599 * to keep the distinction between .reg and .reg_offset, but for
1600 * now we don't care.
1601 */
1602 unsigned int new_nr_params = 0;
1603 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1604 if (this->params_remap[i] != -1) {
1605 this->params_remap[i] = new_nr_params++;
1606 }
1607 }
1608
1609 /* Update the list of params to be uploaded to match our new numbering. */
1610 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1611 int remapped = this->params_remap[i];
1612
1613 if (remapped == -1)
1614 continue;
1615
1616 c->prog_data.param[remapped] = c->prog_data.param[i];
1617 }
1618
1619 c->prog_data.nr_params = new_nr_params;
1620 } else {
1621 /* This should have been generated in the 8-wide pass already. */
1622 assert(this->params_remap);
1623 }
1624
1625 /* Now do the renumbering of the shader to remove unused params. */
1626 foreach_list(node, &this->instructions) {
1627 fs_inst *inst = (fs_inst *)node;
1628
1629 for (int i = 0; i < 3; i++) {
1630 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1631
1632 if (inst->src[i].file != UNIFORM)
1633 continue;
1634
1635 /* as above alias to 0 */
1636 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1637 constant_nr = 0;
1638 }
1639 assert(this->params_remap[constant_nr] != -1);
1640 inst->src[i].reg = this->params_remap[constant_nr];
1641 inst->src[i].reg_offset = 0;
1642 }
1643 }
1644
1645 return true;
1646 }
1647
1648 /*
1649 * Implements array access of uniforms by inserting a
1650 * PULL_CONSTANT_LOAD instruction.
1651 *
1652 * Unlike temporary GRF array access (where we don't support it due to
1653 * the difficulty of doing relative addressing on instruction
1654 * destinations), we could potentially do array access of uniforms
1655 * that were loaded in GRF space as push constants. In real-world
1656 * usage we've seen, though, the arrays being used are always larger
1657 * than we could load as push constants, so just always move all
1658 * uniform array access out to a pull constant buffer.
1659 */
1660 void
1661 fs_visitor::move_uniform_array_access_to_pull_constants()
1662 {
1663 int pull_constant_loc[c->prog_data.nr_params];
1664
1665 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1666 pull_constant_loc[i] = -1;
1667 }
1668
1669 /* Walk through and find array access of uniforms. Put a copy of that
1670 * uniform in the pull constant buffer.
1671 *
1672 * Note that we don't move constant-indexed accesses to arrays. No
1673 * testing has been done of the performance impact of this choice.
1674 */
1675 foreach_list_safe(node, &this->instructions) {
1676 fs_inst *inst = (fs_inst *)node;
1677
1678 for (int i = 0 ; i < 3; i++) {
1679 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1680 continue;
1681
1682 int uniform = inst->src[i].reg;
1683
1684 /* If this array isn't already present in the pull constant buffer,
1685 * add it.
1686 */
1687 if (pull_constant_loc[uniform] == -1) {
1688 const float **values = &c->prog_data.param[uniform];
1689
1690 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1691
1692 assert(param_size[uniform]);
1693
1694 for (int j = 0; j < param_size[uniform]; j++) {
1695 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1696 values[j];
1697 }
1698 }
1699
1700 /* Set up the annotation tracking for new generated instructions. */
1701 base_ir = inst->ir;
1702 current_annotation = inst->annotation;
1703
1704 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1705 fs_reg temp = fs_reg(this, glsl_type::float_type);
1706 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1707 surf_index,
1708 *inst->src[i].reladdr,
1709 pull_constant_loc[uniform] +
1710 inst->src[i].reg_offset);
1711 inst->insert_before(&list);
1712
1713 inst->src[i].file = temp.file;
1714 inst->src[i].reg = temp.reg;
1715 inst->src[i].reg_offset = temp.reg_offset;
1716 inst->src[i].reladdr = NULL;
1717 }
1718 }
1719 }
1720
1721 /**
1722 * Choose accesses from the UNIFORM file to demote to using the pull
1723 * constant buffer.
1724 *
1725 * We allow a fragment shader to have more than the specified minimum
1726 * maximum number of fragment shader uniform components (64). If
1727 * there are too many of these, they'd fill up all of register space.
1728 * So, this will push some of them out to the pull constant buffer and
1729 * update the program to load them.
1730 */
1731 void
1732 fs_visitor::setup_pull_constants()
1733 {
1734 /* Only allow 16 registers (128 uniform components) as push constants. */
1735 unsigned int max_uniform_components = 16 * 8;
1736 if (c->prog_data.nr_params <= max_uniform_components)
1737 return;
1738
1739 if (dispatch_width == 16) {
1740 fail("Pull constants not supported in 16-wide\n");
1741 return;
1742 }
1743
1744 /* Just demote the end of the list. We could probably do better
1745 * here, demoting things that are rarely used in the program first.
1746 */
1747 unsigned int pull_uniform_base = max_uniform_components;
1748
1749 int pull_constant_loc[c->prog_data.nr_params];
1750 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1751 if (i < pull_uniform_base) {
1752 pull_constant_loc[i] = -1;
1753 } else {
1754 pull_constant_loc[i] = -1;
1755 /* If our constant is already being uploaded for reladdr purposes,
1756 * reuse it.
1757 */
1758 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1759 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1760 pull_constant_loc[i] = j;
1761 break;
1762 }
1763 }
1764 if (pull_constant_loc[i] == -1) {
1765 int pull_index = c->prog_data.nr_pull_params++;
1766 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1767 pull_constant_loc[i] = pull_index;;
1768 }
1769 }
1770 }
1771 c->prog_data.nr_params = pull_uniform_base;
1772
1773 foreach_list(node, &this->instructions) {
1774 fs_inst *inst = (fs_inst *)node;
1775
1776 for (int i = 0; i < 3; i++) {
1777 if (inst->src[i].file != UNIFORM)
1778 continue;
1779
1780 int pull_index = pull_constant_loc[inst->src[i].reg +
1781 inst->src[i].reg_offset];
1782 if (pull_index == -1)
1783 continue;
1784
1785 assert(!inst->src[i].reladdr);
1786
1787 fs_reg dst = fs_reg(this, glsl_type::float_type);
1788 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1789 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1790 fs_inst *pull =
1791 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1792 dst, index, offset);
1793 pull->ir = inst->ir;
1794 pull->annotation = inst->annotation;
1795
1796 inst->insert_before(pull);
1797
1798 inst->src[i].file = GRF;
1799 inst->src[i].reg = dst.reg;
1800 inst->src[i].reg_offset = 0;
1801 inst->src[i].smear = pull_index & 3;
1802 }
1803 }
1804 }
1805
1806 bool
1807 fs_visitor::opt_algebraic()
1808 {
1809 bool progress = false;
1810
1811 foreach_list(node, &this->instructions) {
1812 fs_inst *inst = (fs_inst *)node;
1813
1814 switch (inst->opcode) {
1815 case BRW_OPCODE_MUL:
1816 if (inst->src[1].file != IMM)
1817 continue;
1818
1819 /* a * 1.0 = a */
1820 if (inst->src[1].is_one()) {
1821 inst->opcode = BRW_OPCODE_MOV;
1822 inst->src[1] = reg_undef;
1823 progress = true;
1824 break;
1825 }
1826
1827 /* a * 0.0 = 0.0 */
1828 if (inst->src[1].is_zero()) {
1829 inst->opcode = BRW_OPCODE_MOV;
1830 inst->src[0] = inst->src[1];
1831 inst->src[1] = reg_undef;
1832 progress = true;
1833 break;
1834 }
1835
1836 break;
1837 case BRW_OPCODE_ADD:
1838 if (inst->src[1].file != IMM)
1839 continue;
1840
1841 /* a + 0.0 = a */
1842 if (inst->src[1].is_zero()) {
1843 inst->opcode = BRW_OPCODE_MOV;
1844 inst->src[1] = reg_undef;
1845 progress = true;
1846 break;
1847 }
1848 break;
1849 default:
1850 break;
1851 }
1852 }
1853
1854 return progress;
1855 }
1856
1857 /**
1858 * Removes any instructions writing a VGRF where that VGRF is not used by any
1859 * later instruction.
1860 */
1861 bool
1862 fs_visitor::dead_code_eliminate()
1863 {
1864 bool progress = false;
1865 int pc = 0;
1866
1867 calculate_live_intervals();
1868
1869 foreach_list_safe(node, &this->instructions) {
1870 fs_inst *inst = (fs_inst *)node;
1871
1872 if (inst->dst.file == GRF) {
1873 bool dead = true;
1874
1875 for (int i = 0; i < inst->regs_written; i++) {
1876 int var = live_intervals->var_from_vgrf[inst->dst.reg];
1877 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
1878 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
1879 dead = false;
1880 break;
1881 }
1882 }
1883
1884 if (dead) {
1885 /* Don't dead code eliminate instructions that write to the
1886 * accumulator as a side-effect. Instead just set the destination
1887 * to the null register to free it.
1888 */
1889 switch (inst->opcode) {
1890 case BRW_OPCODE_ADDC:
1891 case BRW_OPCODE_SUBB:
1892 case BRW_OPCODE_MACH:
1893 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
1894 break;
1895 default:
1896 inst->remove();
1897 progress = true;
1898 break;
1899 }
1900 }
1901 }
1902
1903 pc++;
1904 }
1905
1906 if (progress)
1907 invalidate_live_intervals();
1908
1909 return progress;
1910 }
1911
1912 struct dead_code_hash_key
1913 {
1914 int vgrf;
1915 int reg_offset;
1916 };
1917
1918 static bool
1919 dead_code_hash_compare(const void *a, const void *b)
1920 {
1921 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1922 }
1923
1924 static void
1925 clear_dead_code_hash(struct hash_table *ht)
1926 {
1927 struct hash_entry *entry;
1928
1929 hash_table_foreach(ht, entry) {
1930 _mesa_hash_table_remove(ht, entry);
1931 }
1932 }
1933
1934 static void
1935 insert_dead_code_hash(struct hash_table *ht,
1936 int vgrf, int reg_offset, fs_inst *inst)
1937 {
1938 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1939 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1940
1941 key->vgrf = vgrf;
1942 key->reg_offset = reg_offset;
1943
1944 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1945 }
1946
1947 static struct hash_entry *
1948 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1949 {
1950 struct dead_code_hash_key key;
1951
1952 key.vgrf = vgrf;
1953 key.reg_offset = reg_offset;
1954
1955 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1956 }
1957
1958 static void
1959 remove_dead_code_hash(struct hash_table *ht,
1960 int vgrf, int reg_offset)
1961 {
1962 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1963 if (!entry)
1964 return;
1965
1966 _mesa_hash_table_remove(ht, entry);
1967 }
1968
1969 /**
1970 * Walks basic blocks, removing any regs that are written but not read before
1971 * being redefined.
1972 *
1973 * The dead_code_eliminate() function implements a global dead code
1974 * elimination, but it only handles the removing the last write to a register
1975 * if it's never read. This one can handle intermediate writes, but only
1976 * within a basic block.
1977 */
1978 bool
1979 fs_visitor::dead_code_eliminate_local()
1980 {
1981 struct hash_table *ht;
1982 bool progress = false;
1983
1984 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1985
1986 foreach_list_safe(node, &this->instructions) {
1987 fs_inst *inst = (fs_inst *)node;
1988
1989 /* At a basic block, empty the HT since we don't understand dataflow
1990 * here.
1991 */
1992 if (inst->is_control_flow()) {
1993 clear_dead_code_hash(ht);
1994 continue;
1995 }
1996
1997 /* Clear the HT of any instructions that got read. */
1998 for (int i = 0; i < 3; i++) {
1999 fs_reg src = inst->src[i];
2000 if (src.file != GRF)
2001 continue;
2002
2003 int read = 1;
2004 if (inst->is_send_from_grf())
2005 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2006
2007 for (int reg_offset = src.reg_offset;
2008 reg_offset < src.reg_offset + read;
2009 reg_offset++) {
2010 remove_dead_code_hash(ht, src.reg, reg_offset);
2011 }
2012 }
2013
2014 /* Add any update of a GRF to the HT, removing a previous write if it
2015 * wasn't read.
2016 */
2017 if (inst->dst.file == GRF) {
2018 if (inst->regs_written > 1) {
2019 /* We don't know how to trim channels from an instruction's
2020 * writes, so we can't incrementally remove unread channels from
2021 * it. Just remove whatever it overwrites from the table
2022 */
2023 for (int i = 0; i < inst->regs_written; i++) {
2024 remove_dead_code_hash(ht,
2025 inst->dst.reg,
2026 inst->dst.reg_offset + i);
2027 }
2028 } else {
2029 struct hash_entry *entry =
2030 get_dead_code_hash_entry(ht, inst->dst.reg,
2031 inst->dst.reg_offset);
2032
2033 if (inst->is_partial_write()) {
2034 /* For a partial write, we can't remove any previous dead code
2035 * candidate, since we're just modifying their result, but we can
2036 * be dead code eliminiated ourselves.
2037 */
2038 if (entry) {
2039 entry->data = inst;
2040 } else {
2041 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2042 inst);
2043 }
2044 } else {
2045 if (entry) {
2046 /* We're completely updating a channel, and there was a
2047 * previous write to the channel that wasn't read. Kill it!
2048 */
2049 fs_inst *inst = (fs_inst *)entry->data;
2050 inst->remove();
2051 progress = true;
2052 _mesa_hash_table_remove(ht, entry);
2053 }
2054
2055 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2056 inst);
2057 }
2058 }
2059 }
2060 }
2061
2062 _mesa_hash_table_destroy(ht, NULL);
2063
2064 if (progress)
2065 invalidate_live_intervals();
2066
2067 return progress;
2068 }
2069
2070 /**
2071 * Implements a second type of register coalescing: This one checks if
2072 * the two regs involved in a raw move don't interfere, in which case
2073 * they can both by stored in the same place and the MOV removed.
2074 */
2075 bool
2076 fs_visitor::register_coalesce_2()
2077 {
2078 bool progress = false;
2079
2080 calculate_live_intervals();
2081
2082 foreach_list_safe(node, &this->instructions) {
2083 fs_inst *inst = (fs_inst *)node;
2084
2085 if (inst->opcode != BRW_OPCODE_MOV ||
2086 inst->is_partial_write() ||
2087 inst->saturate ||
2088 inst->src[0].file != GRF ||
2089 inst->src[0].negate ||
2090 inst->src[0].abs ||
2091 inst->src[0].smear != -1 ||
2092 inst->dst.file != GRF ||
2093 inst->dst.type != inst->src[0].type ||
2094 virtual_grf_sizes[inst->src[0].reg] != 1) {
2095 continue;
2096 }
2097
2098 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2099 int var_to = live_intervals->var_from_reg(&inst->dst);
2100
2101 if (live_intervals->vars_interfere(var_from, var_to))
2102 continue;
2103
2104 int reg_from = inst->src[0].reg;
2105 assert(inst->src[0].reg_offset == 0);
2106 int reg_to = inst->dst.reg;
2107 int reg_to_offset = inst->dst.reg_offset;
2108
2109 foreach_list(node, &this->instructions) {
2110 fs_inst *scan_inst = (fs_inst *)node;
2111
2112 if (scan_inst->dst.file == GRF &&
2113 scan_inst->dst.reg == reg_from) {
2114 scan_inst->dst.reg = reg_to;
2115 scan_inst->dst.reg_offset = reg_to_offset;
2116 }
2117 for (int i = 0; i < 3; i++) {
2118 if (scan_inst->src[i].file == GRF &&
2119 scan_inst->src[i].reg == reg_from) {
2120 scan_inst->src[i].reg = reg_to;
2121 scan_inst->src[i].reg_offset = reg_to_offset;
2122 }
2123 }
2124 }
2125
2126 inst->remove();
2127 progress = true;
2128 continue;
2129 }
2130
2131 if (progress)
2132 invalidate_live_intervals();
2133
2134 return progress;
2135 }
2136
2137 bool
2138 fs_visitor::register_coalesce()
2139 {
2140 bool progress = false;
2141 int if_depth = 0;
2142 int loop_depth = 0;
2143
2144 foreach_list_safe(node, &this->instructions) {
2145 fs_inst *inst = (fs_inst *)node;
2146
2147 /* Make sure that we dominate the instructions we're going to
2148 * scan for interfering with our coalescing, or we won't have
2149 * scanned enough to see if anything interferes with our
2150 * coalescing. We don't dominate the following instructions if
2151 * we're in a loop or an if block.
2152 */
2153 switch (inst->opcode) {
2154 case BRW_OPCODE_DO:
2155 loop_depth++;
2156 break;
2157 case BRW_OPCODE_WHILE:
2158 loop_depth--;
2159 break;
2160 case BRW_OPCODE_IF:
2161 if_depth++;
2162 break;
2163 case BRW_OPCODE_ENDIF:
2164 if_depth--;
2165 break;
2166 default:
2167 break;
2168 }
2169 if (loop_depth || if_depth)
2170 continue;
2171
2172 if (inst->opcode != BRW_OPCODE_MOV ||
2173 inst->is_partial_write() ||
2174 inst->saturate ||
2175 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2176 inst->src[0].file != UNIFORM)||
2177 inst->dst.type != inst->src[0].type)
2178 continue;
2179
2180 bool has_source_modifiers = (inst->src[0].abs ||
2181 inst->src[0].negate ||
2182 inst->src[0].smear != -1 ||
2183 inst->src[0].file == UNIFORM);
2184
2185 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2186 * them: check for no writes to either one until the exit of the
2187 * program.
2188 */
2189 bool interfered = false;
2190
2191 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2192 !scan_inst->is_tail_sentinel();
2193 scan_inst = (fs_inst *)scan_inst->next) {
2194 if (scan_inst->dst.file == GRF) {
2195 if (scan_inst->overwrites_reg(inst->dst) ||
2196 scan_inst->overwrites_reg(inst->src[0])) {
2197 interfered = true;
2198 break;
2199 }
2200 }
2201
2202 if (has_source_modifiers) {
2203 for (int i = 0; i < 3; i++) {
2204 if (scan_inst->src[i].file == GRF &&
2205 scan_inst->src[i].reg == inst->dst.reg &&
2206 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2207 inst->dst.type != scan_inst->src[i].type)
2208 {
2209 interfered = true;
2210 break;
2211 }
2212 }
2213 }
2214
2215
2216 /* The gen6 MATH instruction can't handle source modifiers or
2217 * unusual register regions, so avoid coalescing those for
2218 * now. We should do something more specific.
2219 */
2220 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2221 interfered = true;
2222 break;
2223 }
2224
2225 if (scan_inst->mlen > 0 && scan_inst->base_mrf == -1 &&
2226 scan_inst->src[0].file == GRF &&
2227 scan_inst->src[0].reg == inst->dst.reg) {
2228 interfered = true;
2229 break;
2230 }
2231
2232 /* The accumulator result appears to get used for the
2233 * conditional modifier generation. When negating a UD
2234 * value, there is a 33rd bit generated for the sign in the
2235 * accumulator value, so now you can't check, for example,
2236 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2237 */
2238 if (scan_inst->conditional_mod &&
2239 inst->src[0].negate &&
2240 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2241 interfered = true;
2242 break;
2243 }
2244 }
2245 if (interfered) {
2246 continue;
2247 }
2248
2249 /* Rewrite the later usage to point at the source of the move to
2250 * be removed.
2251 */
2252 for (fs_inst *scan_inst = inst;
2253 !scan_inst->is_tail_sentinel();
2254 scan_inst = (fs_inst *)scan_inst->next) {
2255 for (int i = 0; i < 3; i++) {
2256 if (scan_inst->src[i].file == GRF &&
2257 scan_inst->src[i].reg == inst->dst.reg &&
2258 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2259 fs_reg new_src = inst->src[0];
2260 if (scan_inst->src[i].abs) {
2261 new_src.negate = 0;
2262 new_src.abs = 1;
2263 }
2264 new_src.negate ^= scan_inst->src[i].negate;
2265 new_src.sechalf = scan_inst->src[i].sechalf;
2266 scan_inst->src[i] = new_src;
2267 }
2268 }
2269 }
2270
2271 inst->remove();
2272 progress = true;
2273 }
2274
2275 if (progress)
2276 invalidate_live_intervals();
2277
2278 return progress;
2279 }
2280
2281
2282 bool
2283 fs_visitor::compute_to_mrf()
2284 {
2285 bool progress = false;
2286 int next_ip = 0;
2287
2288 calculate_live_intervals();
2289
2290 foreach_list_safe(node, &this->instructions) {
2291 fs_inst *inst = (fs_inst *)node;
2292
2293 int ip = next_ip;
2294 next_ip++;
2295
2296 if (inst->opcode != BRW_OPCODE_MOV ||
2297 inst->is_partial_write() ||
2298 inst->dst.file != MRF || inst->src[0].file != GRF ||
2299 inst->dst.type != inst->src[0].type ||
2300 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2301 continue;
2302
2303 /* Work out which hardware MRF registers are written by this
2304 * instruction.
2305 */
2306 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2307 int mrf_high;
2308 if (inst->dst.reg & BRW_MRF_COMPR4) {
2309 mrf_high = mrf_low + 4;
2310 } else if (dispatch_width == 16 &&
2311 (!inst->force_uncompressed && !inst->force_sechalf)) {
2312 mrf_high = mrf_low + 1;
2313 } else {
2314 mrf_high = mrf_low;
2315 }
2316
2317 /* Can't compute-to-MRF this GRF if someone else was going to
2318 * read it later.
2319 */
2320 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2321 continue;
2322
2323 /* Found a move of a GRF to a MRF. Let's see if we can go
2324 * rewrite the thing that made this GRF to write into the MRF.
2325 */
2326 fs_inst *scan_inst;
2327 for (scan_inst = (fs_inst *)inst->prev;
2328 scan_inst->prev != NULL;
2329 scan_inst = (fs_inst *)scan_inst->prev) {
2330 if (scan_inst->dst.file == GRF &&
2331 scan_inst->dst.reg == inst->src[0].reg) {
2332 /* Found the last thing to write our reg we want to turn
2333 * into a compute-to-MRF.
2334 */
2335
2336 /* If this one instruction didn't populate all the
2337 * channels, bail. We might be able to rewrite everything
2338 * that writes that reg, but it would require smarter
2339 * tracking to delay the rewriting until complete success.
2340 */
2341 if (scan_inst->is_partial_write())
2342 break;
2343
2344 /* Things returning more than one register would need us to
2345 * understand coalescing out more than one MOV at a time.
2346 */
2347 if (scan_inst->regs_written > 1)
2348 break;
2349
2350 /* SEND instructions can't have MRF as a destination. */
2351 if (scan_inst->mlen)
2352 break;
2353
2354 if (brw->gen == 6) {
2355 /* gen6 math instructions must have the destination be
2356 * GRF, so no compute-to-MRF for them.
2357 */
2358 if (scan_inst->is_math()) {
2359 break;
2360 }
2361 }
2362
2363 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2364 /* Found the creator of our MRF's source value. */
2365 scan_inst->dst.file = MRF;
2366 scan_inst->dst.reg = inst->dst.reg;
2367 scan_inst->saturate |= inst->saturate;
2368 inst->remove();
2369 progress = true;
2370 }
2371 break;
2372 }
2373
2374 /* We don't handle control flow here. Most computation of
2375 * values that end up in MRFs are shortly before the MRF
2376 * write anyway.
2377 */
2378 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2379 break;
2380
2381 /* You can't read from an MRF, so if someone else reads our
2382 * MRF's source GRF that we wanted to rewrite, that stops us.
2383 */
2384 bool interfered = false;
2385 for (int i = 0; i < 3; i++) {
2386 if (scan_inst->src[i].file == GRF &&
2387 scan_inst->src[i].reg == inst->src[0].reg &&
2388 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2389 interfered = true;
2390 }
2391 }
2392 if (interfered)
2393 break;
2394
2395 if (scan_inst->dst.file == MRF) {
2396 /* If somebody else writes our MRF here, we can't
2397 * compute-to-MRF before that.
2398 */
2399 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2400 int scan_mrf_high;
2401
2402 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2403 scan_mrf_high = scan_mrf_low + 4;
2404 } else if (dispatch_width == 16 &&
2405 (!scan_inst->force_uncompressed &&
2406 !scan_inst->force_sechalf)) {
2407 scan_mrf_high = scan_mrf_low + 1;
2408 } else {
2409 scan_mrf_high = scan_mrf_low;
2410 }
2411
2412 if (mrf_low == scan_mrf_low ||
2413 mrf_low == scan_mrf_high ||
2414 mrf_high == scan_mrf_low ||
2415 mrf_high == scan_mrf_high) {
2416 break;
2417 }
2418 }
2419
2420 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2421 /* Found a SEND instruction, which means that there are
2422 * live values in MRFs from base_mrf to base_mrf +
2423 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2424 * above it.
2425 */
2426 if (mrf_low >= scan_inst->base_mrf &&
2427 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2428 break;
2429 }
2430 if (mrf_high >= scan_inst->base_mrf &&
2431 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2432 break;
2433 }
2434 }
2435 }
2436 }
2437
2438 if (progress)
2439 invalidate_live_intervals();
2440
2441 return progress;
2442 }
2443
2444 /**
2445 * Walks through basic blocks, looking for repeated MRF writes and
2446 * removing the later ones.
2447 */
2448 bool
2449 fs_visitor::remove_duplicate_mrf_writes()
2450 {
2451 fs_inst *last_mrf_move[16];
2452 bool progress = false;
2453
2454 /* Need to update the MRF tracking for compressed instructions. */
2455 if (dispatch_width == 16)
2456 return false;
2457
2458 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2459
2460 foreach_list_safe(node, &this->instructions) {
2461 fs_inst *inst = (fs_inst *)node;
2462
2463 if (inst->is_control_flow()) {
2464 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2465 }
2466
2467 if (inst->opcode == BRW_OPCODE_MOV &&
2468 inst->dst.file == MRF) {
2469 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2470 if (prev_inst && inst->equals(prev_inst)) {
2471 inst->remove();
2472 progress = true;
2473 continue;
2474 }
2475 }
2476
2477 /* Clear out the last-write records for MRFs that were overwritten. */
2478 if (inst->dst.file == MRF) {
2479 last_mrf_move[inst->dst.reg] = NULL;
2480 }
2481
2482 if (inst->mlen > 0 && inst->base_mrf != -1) {
2483 /* Found a SEND instruction, which will include two or fewer
2484 * implied MRF writes. We could do better here.
2485 */
2486 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2487 last_mrf_move[inst->base_mrf + i] = NULL;
2488 }
2489 }
2490
2491 /* Clear out any MRF move records whose sources got overwritten. */
2492 if (inst->dst.file == GRF) {
2493 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2494 if (last_mrf_move[i] &&
2495 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2496 last_mrf_move[i] = NULL;
2497 }
2498 }
2499 }
2500
2501 if (inst->opcode == BRW_OPCODE_MOV &&
2502 inst->dst.file == MRF &&
2503 inst->src[0].file == GRF &&
2504 !inst->is_partial_write()) {
2505 last_mrf_move[inst->dst.reg] = inst;
2506 }
2507 }
2508
2509 if (progress)
2510 invalidate_live_intervals();
2511
2512 return progress;
2513 }
2514
2515 static void
2516 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2517 int first_grf, int grf_len)
2518 {
2519 bool inst_16wide = (dispatch_width > 8 &&
2520 !inst->force_uncompressed &&
2521 !inst->force_sechalf);
2522
2523 /* Clear the flag for registers that actually got read (as expected). */
2524 for (int i = 0; i < 3; i++) {
2525 int grf;
2526 if (inst->src[i].file == GRF) {
2527 grf = inst->src[i].reg;
2528 } else if (inst->src[i].file == HW_REG &&
2529 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2530 grf = inst->src[i].fixed_hw_reg.nr;
2531 } else {
2532 continue;
2533 }
2534
2535 if (grf >= first_grf &&
2536 grf < first_grf + grf_len) {
2537 deps[grf - first_grf] = false;
2538 if (inst_16wide)
2539 deps[grf - first_grf + 1] = false;
2540 }
2541 }
2542 }
2543
2544 /**
2545 * Implements this workaround for the original 965:
2546 *
2547 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2548 * check for post destination dependencies on this instruction, software
2549 * must ensure that there is no destination hazard for the case of ‘write
2550 * followed by a posted write’ shown in the following example.
2551 *
2552 * 1. mov r3 0
2553 * 2. send r3.xy <rest of send instruction>
2554 * 3. mov r2 r3
2555 *
2556 * Due to no post-destination dependency check on the ‘send’, the above
2557 * code sequence could have two instructions (1 and 2) in flight at the
2558 * same time that both consider ‘r3’ as the target of their final writes.
2559 */
2560 void
2561 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2562 {
2563 int reg_size = dispatch_width / 8;
2564 int write_len = inst->regs_written * reg_size;
2565 int first_write_grf = inst->dst.reg;
2566 bool needs_dep[BRW_MAX_MRF];
2567 assert(write_len < (int)sizeof(needs_dep) - 1);
2568
2569 memset(needs_dep, false, sizeof(needs_dep));
2570 memset(needs_dep, true, write_len);
2571
2572 clear_deps_for_inst_src(inst, dispatch_width,
2573 needs_dep, first_write_grf, write_len);
2574
2575 /* Walk backwards looking for writes to registers we're writing which
2576 * aren't read since being written. If we hit the start of the program,
2577 * we assume that there are no outstanding dependencies on entry to the
2578 * program.
2579 */
2580 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2581 scan_inst != NULL;
2582 scan_inst = (fs_inst *)scan_inst->prev) {
2583
2584 /* If we hit control flow, assume that there *are* outstanding
2585 * dependencies, and force their cleanup before our instruction.
2586 */
2587 if (scan_inst->is_control_flow()) {
2588 for (int i = 0; i < write_len; i++) {
2589 if (needs_dep[i]) {
2590 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2591 }
2592 }
2593 return;
2594 }
2595
2596 bool scan_inst_16wide = (dispatch_width > 8 &&
2597 !scan_inst->force_uncompressed &&
2598 !scan_inst->force_sechalf);
2599
2600 /* We insert our reads as late as possible on the assumption that any
2601 * instruction but a MOV that might have left us an outstanding
2602 * dependency has more latency than a MOV.
2603 */
2604 if (scan_inst->dst.file == GRF) {
2605 for (int i = 0; i < scan_inst->regs_written; i++) {
2606 int reg = scan_inst->dst.reg + i * reg_size;
2607
2608 if (reg >= first_write_grf &&
2609 reg < first_write_grf + write_len &&
2610 needs_dep[reg - first_write_grf]) {
2611 inst->insert_before(DEP_RESOLVE_MOV(reg));
2612 needs_dep[reg - first_write_grf] = false;
2613 if (scan_inst_16wide)
2614 needs_dep[reg - first_write_grf + 1] = false;
2615 }
2616 }
2617 }
2618
2619 /* Clear the flag for registers that actually got read (as expected). */
2620 clear_deps_for_inst_src(scan_inst, dispatch_width,
2621 needs_dep, first_write_grf, write_len);
2622
2623 /* Continue the loop only if we haven't resolved all the dependencies */
2624 int i;
2625 for (i = 0; i < write_len; i++) {
2626 if (needs_dep[i])
2627 break;
2628 }
2629 if (i == write_len)
2630 return;
2631 }
2632 }
2633
2634 /**
2635 * Implements this workaround for the original 965:
2636 *
2637 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2638 * used as a destination register until after it has been sourced by an
2639 * instruction with a different destination register.
2640 */
2641 void
2642 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2643 {
2644 int write_len = inst->regs_written * dispatch_width / 8;
2645 int first_write_grf = inst->dst.reg;
2646 bool needs_dep[BRW_MAX_MRF];
2647 assert(write_len < (int)sizeof(needs_dep) - 1);
2648
2649 memset(needs_dep, false, sizeof(needs_dep));
2650 memset(needs_dep, true, write_len);
2651 /* Walk forwards looking for writes to registers we're writing which aren't
2652 * read before being written.
2653 */
2654 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2655 !scan_inst->is_tail_sentinel();
2656 scan_inst = (fs_inst *)scan_inst->next) {
2657 /* If we hit control flow, force resolve all remaining dependencies. */
2658 if (scan_inst->is_control_flow()) {
2659 for (int i = 0; i < write_len; i++) {
2660 if (needs_dep[i])
2661 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2662 }
2663 return;
2664 }
2665
2666 /* Clear the flag for registers that actually got read (as expected). */
2667 clear_deps_for_inst_src(scan_inst, dispatch_width,
2668 needs_dep, first_write_grf, write_len);
2669
2670 /* We insert our reads as late as possible since they're reading the
2671 * result of a SEND, which has massive latency.
2672 */
2673 if (scan_inst->dst.file == GRF &&
2674 scan_inst->dst.reg >= first_write_grf &&
2675 scan_inst->dst.reg < first_write_grf + write_len &&
2676 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2677 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2678 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2679 }
2680
2681 /* Continue the loop only if we haven't resolved all the dependencies */
2682 int i;
2683 for (i = 0; i < write_len; i++) {
2684 if (needs_dep[i])
2685 break;
2686 }
2687 if (i == write_len)
2688 return;
2689 }
2690
2691 /* If we hit the end of the program, resolve all remaining dependencies out
2692 * of paranoia.
2693 */
2694 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2695 assert(last_inst->eot);
2696 for (int i = 0; i < write_len; i++) {
2697 if (needs_dep[i])
2698 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2699 }
2700 }
2701
2702 void
2703 fs_visitor::insert_gen4_send_dependency_workarounds()
2704 {
2705 if (brw->gen != 4 || brw->is_g4x)
2706 return;
2707
2708 /* Note that we're done with register allocation, so GRF fs_regs always
2709 * have a .reg_offset of 0.
2710 */
2711
2712 foreach_list_safe(node, &this->instructions) {
2713 fs_inst *inst = (fs_inst *)node;
2714
2715 if (inst->mlen != 0 && inst->dst.file == GRF) {
2716 insert_gen4_pre_send_dependency_workarounds(inst);
2717 insert_gen4_post_send_dependency_workarounds(inst);
2718 }
2719 }
2720 }
2721
2722 /**
2723 * Turns the generic expression-style uniform pull constant load instruction
2724 * into a hardware-specific series of instructions for loading a pull
2725 * constant.
2726 *
2727 * The expression style allows the CSE pass before this to optimize out
2728 * repeated loads from the same offset, and gives the pre-register-allocation
2729 * scheduling full flexibility, while the conversion to native instructions
2730 * allows the post-register-allocation scheduler the best information
2731 * possible.
2732 *
2733 * Note that execution masking for setting up pull constant loads is special:
2734 * the channels that need to be written are unrelated to the current execution
2735 * mask, since a later instruction will use one of the result channels as a
2736 * source operand for all 8 or 16 of its channels.
2737 */
2738 void
2739 fs_visitor::lower_uniform_pull_constant_loads()
2740 {
2741 foreach_list(node, &this->instructions) {
2742 fs_inst *inst = (fs_inst *)node;
2743
2744 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2745 continue;
2746
2747 if (brw->gen >= 7) {
2748 /* The offset arg before was a vec4-aligned byte offset. We need to
2749 * turn it into a dword offset.
2750 */
2751 fs_reg const_offset_reg = inst->src[1];
2752 assert(const_offset_reg.file == IMM &&
2753 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2754 const_offset_reg.imm.u /= 4;
2755 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2756
2757 /* This is actually going to be a MOV, but since only the first dword
2758 * is accessed, we have a special opcode to do just that one. Note
2759 * that this needs to be an operation that will be considered a def
2760 * by live variable analysis, or register allocation will explode.
2761 */
2762 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2763 payload, const_offset_reg);
2764 setup->force_writemask_all = true;
2765
2766 setup->ir = inst->ir;
2767 setup->annotation = inst->annotation;
2768 inst->insert_before(setup);
2769
2770 /* Similarly, this will only populate the first 4 channels of the
2771 * result register (since we only use smear values from 0-3), but we
2772 * don't tell the optimizer.
2773 */
2774 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2775 inst->src[1] = payload;
2776
2777 invalidate_live_intervals();
2778 } else {
2779 /* Before register allocation, we didn't tell the scheduler about the
2780 * MRF we use. We know it's safe to use this MRF because nothing
2781 * else does except for register spill/unspill, which generates and
2782 * uses its MRF within a single IR instruction.
2783 */
2784 inst->base_mrf = 14;
2785 inst->mlen = 1;
2786 }
2787 }
2788 }
2789
2790 void
2791 fs_visitor::dump_instruction(backend_instruction *be_inst)
2792 {
2793 fs_inst *inst = (fs_inst *)be_inst;
2794
2795 if (inst->predicate) {
2796 printf("(%cf0.%d) ",
2797 inst->predicate_inverse ? '-' : '+',
2798 inst->flag_subreg);
2799 }
2800
2801 printf("%s", brw_instruction_name(inst->opcode));
2802 if (inst->saturate)
2803 printf(".sat");
2804 if (inst->conditional_mod) {
2805 printf(".cmod");
2806 if (!inst->predicate &&
2807 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2808 inst->opcode != BRW_OPCODE_IF &&
2809 inst->opcode != BRW_OPCODE_WHILE))) {
2810 printf(".f0.%d", inst->flag_subreg);
2811 }
2812 }
2813 printf(" ");
2814
2815
2816 switch (inst->dst.file) {
2817 case GRF:
2818 printf("vgrf%d", inst->dst.reg);
2819 if (inst->dst.reg_offset)
2820 printf("+%d", inst->dst.reg_offset);
2821 break;
2822 case MRF:
2823 printf("m%d", inst->dst.reg);
2824 break;
2825 case BAD_FILE:
2826 printf("(null)");
2827 break;
2828 case UNIFORM:
2829 printf("***u%d***", inst->dst.reg);
2830 break;
2831 case HW_REG:
2832 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
2833 if (inst->dst.fixed_hw_reg.subnr)
2834 printf("+%d", inst->dst.fixed_hw_reg.subnr);
2835 break;
2836 default:
2837 printf("???");
2838 break;
2839 }
2840 printf(", ");
2841
2842 for (int i = 0; i < 3; i++) {
2843 if (inst->src[i].negate)
2844 printf("-");
2845 if (inst->src[i].abs)
2846 printf("|");
2847 switch (inst->src[i].file) {
2848 case GRF:
2849 printf("vgrf%d", inst->src[i].reg);
2850 if (inst->src[i].reg_offset)
2851 printf("+%d", inst->src[i].reg_offset);
2852 break;
2853 case MRF:
2854 printf("***m%d***", inst->src[i].reg);
2855 break;
2856 case UNIFORM:
2857 printf("u%d", inst->src[i].reg);
2858 if (inst->src[i].reg_offset)
2859 printf(".%d", inst->src[i].reg_offset);
2860 break;
2861 case BAD_FILE:
2862 printf("(null)");
2863 break;
2864 case IMM:
2865 switch (inst->src[i].type) {
2866 case BRW_REGISTER_TYPE_F:
2867 printf("%ff", inst->src[i].imm.f);
2868 break;
2869 case BRW_REGISTER_TYPE_D:
2870 printf("%dd", inst->src[i].imm.i);
2871 break;
2872 case BRW_REGISTER_TYPE_UD:
2873 printf("%uu", inst->src[i].imm.u);
2874 break;
2875 default:
2876 printf("???");
2877 break;
2878 }
2879 break;
2880 case HW_REG:
2881 if (inst->src[i].fixed_hw_reg.negate)
2882 printf("-");
2883 if (inst->src[i].fixed_hw_reg.abs)
2884 printf("|");
2885 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2886 if (inst->src[i].fixed_hw_reg.subnr)
2887 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
2888 if (inst->src[i].fixed_hw_reg.abs)
2889 printf("|");
2890 break;
2891 default:
2892 printf("???");
2893 break;
2894 }
2895 if (inst->src[i].abs)
2896 printf("|");
2897
2898 if (i < 3)
2899 printf(", ");
2900 }
2901
2902 printf(" ");
2903
2904 if (inst->force_uncompressed)
2905 printf("1sthalf ");
2906
2907 if (inst->force_sechalf)
2908 printf("2ndhalf ");
2909
2910 printf("\n");
2911 }
2912
2913 /**
2914 * Possibly returns an instruction that set up @param reg.
2915 *
2916 * Sometimes we want to take the result of some expression/variable
2917 * dereference tree and rewrite the instruction generating the result
2918 * of the tree. When processing the tree, we know that the
2919 * instructions generated are all writing temporaries that are dead
2920 * outside of this tree. So, if we have some instructions that write
2921 * a temporary, we're free to point that temp write somewhere else.
2922 *
2923 * Note that this doesn't guarantee that the instruction generated
2924 * only reg -- it might be the size=4 destination of a texture instruction.
2925 */
2926 fs_inst *
2927 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2928 fs_inst *end,
2929 fs_reg reg)
2930 {
2931 if (end == start ||
2932 end->is_partial_write() ||
2933 reg.reladdr ||
2934 !reg.equals(end->dst)) {
2935 return NULL;
2936 } else {
2937 return end;
2938 }
2939 }
2940
2941 void
2942 fs_visitor::setup_payload_gen6()
2943 {
2944 bool uses_depth =
2945 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2946 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2947
2948 assert(brw->gen >= 6);
2949
2950 /* R0-1: masks, pixel X/Y coordinates. */
2951 c->nr_payload_regs = 2;
2952 /* R2: only for 32-pixel dispatch.*/
2953
2954 /* R3-26: barycentric interpolation coordinates. These appear in the
2955 * same order that they appear in the brw_wm_barycentric_interp_mode
2956 * enum. Each set of coordinates occupies 2 registers if dispatch width
2957 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2958 * appear if they were enabled using the "Barycentric Interpolation
2959 * Mode" bits in WM_STATE.
2960 */
2961 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2962 if (barycentric_interp_modes & (1 << i)) {
2963 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2964 c->nr_payload_regs += 2;
2965 if (dispatch_width == 16) {
2966 c->nr_payload_regs += 2;
2967 }
2968 }
2969 }
2970
2971 /* R27: interpolated depth if uses source depth */
2972 if (uses_depth) {
2973 c->source_depth_reg = c->nr_payload_regs;
2974 c->nr_payload_regs++;
2975 if (dispatch_width == 16) {
2976 /* R28: interpolated depth if not 8-wide. */
2977 c->nr_payload_regs++;
2978 }
2979 }
2980 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2981 if (uses_depth) {
2982 c->source_w_reg = c->nr_payload_regs;
2983 c->nr_payload_regs++;
2984 if (dispatch_width == 16) {
2985 /* R30: interpolated W if not 8-wide. */
2986 c->nr_payload_regs++;
2987 }
2988 }
2989 /* R31: MSAA position offsets. */
2990 /* R32-: bary for 32-pixel. */
2991 /* R58-59: interp W for 32-pixel. */
2992
2993 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2994 c->source_depth_to_render_target = true;
2995 }
2996 }
2997
2998 void
2999 fs_visitor::assign_binding_table_offsets()
3000 {
3001 uint32_t next_binding_table_offset = 0;
3002
3003 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3004 next_binding_table_offset += c->key.nr_color_regions;
3005
3006 assign_common_binding_table_offsets(next_binding_table_offset);
3007 }
3008
3009 bool
3010 fs_visitor::run()
3011 {
3012 sanity_param_count = fp->Base.Parameters->NumParameters;
3013 uint32_t orig_nr_params = c->prog_data.nr_params;
3014
3015 assign_binding_table_offsets();
3016
3017 if (brw->gen >= 6)
3018 setup_payload_gen6();
3019 else
3020 setup_payload_gen4();
3021
3022 if (0) {
3023 emit_dummy_fs();
3024 } else {
3025 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3026 emit_shader_time_begin();
3027
3028 calculate_urb_setup();
3029 if (fp->Base.InputsRead > 0) {
3030 if (brw->gen < 6)
3031 emit_interpolation_setup_gen4();
3032 else
3033 emit_interpolation_setup_gen6();
3034 }
3035
3036 /* We handle discards by keeping track of the still-live pixels in f0.1.
3037 * Initialize it with the dispatched pixels.
3038 */
3039 if (fp->UsesKill) {
3040 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3041 discard_init->flag_subreg = 1;
3042 }
3043
3044 /* Generate FS IR for main(). (the visitor only descends into
3045 * functions called "main").
3046 */
3047 if (shader) {
3048 foreach_list(node, &*shader->ir) {
3049 ir_instruction *ir = (ir_instruction *)node;
3050 base_ir = ir;
3051 this->result = reg_undef;
3052 ir->accept(this);
3053 }
3054 } else {
3055 emit_fragment_program_code();
3056 }
3057 base_ir = NULL;
3058 if (failed)
3059 return false;
3060
3061 emit(FS_OPCODE_PLACEHOLDER_HALT);
3062
3063 emit_fb_writes();
3064
3065 split_virtual_grfs();
3066
3067 move_uniform_array_access_to_pull_constants();
3068 setup_pull_constants();
3069
3070 bool progress;
3071 do {
3072 progress = false;
3073
3074 compact_virtual_grfs();
3075
3076 progress = remove_duplicate_mrf_writes() || progress;
3077
3078 progress = opt_algebraic() || progress;
3079 progress = opt_cse() || progress;
3080 progress = opt_copy_propagate() || progress;
3081 progress = dead_code_eliminate() || progress;
3082 progress = dead_code_eliminate_local() || progress;
3083 progress = register_coalesce() || progress;
3084 progress = register_coalesce_2() || progress;
3085 progress = compute_to_mrf() || progress;
3086 } while (progress);
3087
3088 remove_dead_constants();
3089
3090 schedule_instructions(false);
3091
3092 lower_uniform_pull_constant_loads();
3093
3094 assign_curb_setup();
3095 assign_urb_setup();
3096
3097 if (0) {
3098 /* Debug of register spilling: Go spill everything. */
3099 for (int i = 0; i < virtual_grf_count; i++) {
3100 spill_reg(i);
3101 }
3102 }
3103
3104 if (0)
3105 assign_regs_trivial();
3106 else {
3107 while (!assign_regs()) {
3108 if (failed)
3109 break;
3110 }
3111 }
3112 }
3113 assert(force_uncompressed_stack == 0);
3114 assert(force_sechalf_stack == 0);
3115
3116 /* This must come after all optimization and register allocation, since
3117 * it inserts dead code that happens to have side effects, and it does
3118 * so based on the actual physical registers in use.
3119 */
3120 insert_gen4_send_dependency_workarounds();
3121
3122 if (failed)
3123 return false;
3124
3125 schedule_instructions(true);
3126
3127 if (dispatch_width == 8) {
3128 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3129 } else {
3130 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3131
3132 /* Make sure we didn't try to sneak in an extra uniform */
3133 assert(orig_nr_params == c->prog_data.nr_params);
3134 (void) orig_nr_params;
3135 }
3136
3137 /* If any state parameters were appended, then ParameterValues could have
3138 * been realloced, in which case the driver uniform storage set up by
3139 * _mesa_associate_uniform_storage() would point to freed memory. Make
3140 * sure that didn't happen.
3141 */
3142 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3143
3144 return !failed;
3145 }
3146
3147 const unsigned *
3148 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3149 struct gl_fragment_program *fp,
3150 struct gl_shader_program *prog,
3151 unsigned *final_assembly_size)
3152 {
3153 bool start_busy = false;
3154 float start_time = 0;
3155
3156 if (unlikely(brw->perf_debug)) {
3157 start_busy = (brw->batch.last_bo &&
3158 drm_intel_bo_busy(brw->batch.last_bo));
3159 start_time = get_time();
3160 }
3161
3162 struct brw_shader *shader = NULL;
3163 if (prog)
3164 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3165
3166 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3167 if (prog) {
3168 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3169 _mesa_print_ir(shader->ir, NULL);
3170 printf("\n\n");
3171 } else {
3172 printf("ARB_fragment_program %d ir for native fragment shader\n",
3173 fp->Base.Id);
3174 _mesa_print_program(&fp->Base);
3175 }
3176 }
3177
3178 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3179 */
3180 fs_visitor v(brw, c, prog, fp, 8);
3181 if (!v.run()) {
3182 if (prog) {
3183 prog->LinkStatus = false;
3184 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3185 }
3186
3187 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3188 v.fail_msg);
3189
3190 return NULL;
3191 }
3192
3193 exec_list *simd16_instructions = NULL;
3194 fs_visitor v2(brw, c, prog, fp, 16);
3195 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3196 if (c->prog_data.nr_pull_params == 0) {
3197 /* Try a 16-wide compile */
3198 v2.import_uniforms(&v);
3199 if (!v2.run()) {
3200 perf_debug("16-wide shader failed to compile, falling back to "
3201 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3202 } else {
3203 simd16_instructions = &v2.instructions;
3204 }
3205 } else {
3206 perf_debug("Skipping 16-wide due to pull parameters.\n");
3207 }
3208 }
3209
3210 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3211 const unsigned *generated = g.generate_assembly(&v.instructions,
3212 simd16_instructions,
3213 final_assembly_size);
3214
3215 if (unlikely(brw->perf_debug) && shader) {
3216 if (shader->compiled_once)
3217 brw_wm_debug_recompile(brw, prog, &c->key);
3218 shader->compiled_once = true;
3219
3220 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3221 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3222 (get_time() - start_time) * 1000);
3223 }
3224 }
3225
3226 return generated;
3227 }
3228
3229 bool
3230 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3231 {
3232 struct brw_context *brw = brw_context(ctx);
3233 struct brw_wm_prog_key key;
3234
3235 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3236 return true;
3237
3238 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3239 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3240 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3241 bool program_uses_dfdy = fp->UsesDFdy;
3242
3243 memset(&key, 0, sizeof(key));
3244
3245 if (brw->gen < 6) {
3246 if (fp->UsesKill)
3247 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3248
3249 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3250 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3251
3252 /* Just assume depth testing. */
3253 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3254 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3255 }
3256
3257 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3258 BRW_FS_VARYING_INPUT_MASK) > 16)
3259 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3260
3261 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3262
3263 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3264 for (unsigned i = 0; i < sampler_count; i++) {
3265 if (fp->Base.ShadowSamplers & (1 << i)) {
3266 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3267 key.tex.swizzles[i] =
3268 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3269 } else {
3270 /* Color sampler: assume no swizzling. */
3271 key.tex.swizzles[i] = SWIZZLE_XYZW;
3272 }
3273 }
3274
3275 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3276 key.drawable_height = ctx->DrawBuffer->Height;
3277 }
3278
3279 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3280 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3281 }
3282
3283 key.nr_color_regions = 1;
3284
3285 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3286 * quality of the derivatives is likely to be determined by the driconf
3287 * option.
3288 */
3289 key.high_quality_derivatives = brw->disable_derivative_optimization;
3290
3291 key.program_string_id = bfp->id;
3292
3293 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3294 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3295
3296 bool success = do_wm_prog(brw, prog, bfp, &key);
3297
3298 brw->wm.base.prog_offset = old_prog_offset;
3299 brw->wm.prog_data = old_prog_data;
3300
3301 return success;
3302 }