i965: Merge together opcodes for SHADER_OPCODE_GEN4_SCRATCH_READ/WRITE
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "main/uniforms.h"
50 #include "brw_fs_live_variables.h"
51 #include "glsl/glsl_types.h"
52
53 void
54 fs_inst::init()
55 {
56 memset(this, 0, sizeof(*this));
57 this->opcode = BRW_OPCODE_NOP;
58 this->conditional_mod = BRW_CONDITIONAL_NONE;
59
60 this->dst = reg_undef;
61 this->src[0] = reg_undef;
62 this->src[1] = reg_undef;
63 this->src[2] = reg_undef;
64
65 /* This will be the case for almost all instructions. */
66 this->regs_written = 1;
67 }
68
69 fs_inst::fs_inst()
70 {
71 init();
72 }
73
74 fs_inst::fs_inst(enum opcode opcode)
75 {
76 init();
77 this->opcode = opcode;
78 }
79
80 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
81 {
82 init();
83 this->opcode = opcode;
84 this->dst = dst;
85
86 if (dst.file == GRF)
87 assert(dst.reg_offset >= 0);
88 }
89
90 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
91 {
92 init();
93 this->opcode = opcode;
94 this->dst = dst;
95 this->src[0] = src0;
96
97 if (dst.file == GRF)
98 assert(dst.reg_offset >= 0);
99 if (src[0].file == GRF)
100 assert(src[0].reg_offset >= 0);
101 }
102
103 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
104 {
105 init();
106 this->opcode = opcode;
107 this->dst = dst;
108 this->src[0] = src0;
109 this->src[1] = src1;
110
111 if (dst.file == GRF)
112 assert(dst.reg_offset >= 0);
113 if (src[0].file == GRF)
114 assert(src[0].reg_offset >= 0);
115 if (src[1].file == GRF)
116 assert(src[1].reg_offset >= 0);
117 }
118
119 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
120 fs_reg src0, fs_reg src1, fs_reg src2)
121 {
122 init();
123 this->opcode = opcode;
124 this->dst = dst;
125 this->src[0] = src0;
126 this->src[1] = src1;
127 this->src[2] = src2;
128
129 if (dst.file == GRF)
130 assert(dst.reg_offset >= 0);
131 if (src[0].file == GRF)
132 assert(src[0].reg_offset >= 0);
133 if (src[1].file == GRF)
134 assert(src[1].reg_offset >= 0);
135 if (src[2].file == GRF)
136 assert(src[2].reg_offset >= 0);
137 }
138
139 #define ALU1(op) \
140 fs_inst * \
141 fs_visitor::op(fs_reg dst, fs_reg src0) \
142 { \
143 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
144 }
145
146 #define ALU2(op) \
147 fs_inst * \
148 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
149 { \
150 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
151 }
152
153 #define ALU3(op) \
154 fs_inst * \
155 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
156 { \
157 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
158 }
159
160 ALU1(NOT)
161 ALU1(MOV)
162 ALU1(FRC)
163 ALU1(RNDD)
164 ALU1(RNDE)
165 ALU1(RNDZ)
166 ALU2(ADD)
167 ALU2(MUL)
168 ALU2(MACH)
169 ALU2(AND)
170 ALU2(OR)
171 ALU2(XOR)
172 ALU2(SHL)
173 ALU2(SHR)
174 ALU2(ASR)
175 ALU3(LRP)
176 ALU1(BFREV)
177 ALU3(BFE)
178 ALU2(BFI1)
179 ALU3(BFI2)
180 ALU1(FBH)
181 ALU1(FBL)
182 ALU1(CBIT)
183 ALU3(MAD)
184 ALU2(ADDC)
185 ALU2(SUBB)
186
187 /** Gen4 predicated IF. */
188 fs_inst *
189 fs_visitor::IF(uint32_t predicate)
190 {
191 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
192 inst->predicate = predicate;
193 return inst;
194 }
195
196 /** Gen6+ IF with embedded comparison. */
197 fs_inst *
198 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
199 {
200 assert(brw->gen >= 6);
201 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
202 reg_null_d, src0, src1);
203 inst->conditional_mod = condition;
204 return inst;
205 }
206
207 /**
208 * CMP: Sets the low bit of the destination channels with the result
209 * of the comparison, while the upper bits are undefined, and updates
210 * the flag register with the packed 16 bits of the result.
211 */
212 fs_inst *
213 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
214 {
215 fs_inst *inst;
216
217 /* Take the instruction:
218 *
219 * CMP null<d> src0<f> src1<f>
220 *
221 * Original gen4 does type conversion to the destination type before
222 * comparison, producing garbage results for floating point comparisons.
223 * gen5 does the comparison on the execution type (resolved source types),
224 * so dst type doesn't matter. gen6 does comparison and then uses the
225 * result as if it was the dst type with no conversion, which happens to
226 * mostly work out for float-interpreted-as-int since our comparisons are
227 * for >0, =0, <0.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
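/* Illustration (added comment, not from the original source): a caller might
 * use this helper as
 *
 *    emit(CMP(reg_null_d, x, fs_reg(0.0f), BRW_CONDITIONAL_NZ));
 *
 * where x is a hypothetical fs_reg operand and only the flag register result
 * is consumed.
 */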
243
244 exec_list
245 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
246 fs_reg varying_offset,
247 uint32_t const_offset)
248 {
249 exec_list instructions;
250 fs_inst *inst;
251
252 /* We have our constant surface use a pitch of 4 bytes, so our index can
253 * be any component of a vector, and then we load 4 contiguous
254 * components starting from that.
255 *
256 * We break down the const_offset to a portion added to the variable
257 * offset and a portion done using reg_offset, which means that if you
258 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
259 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
260 * CSE can later notice that those loads are all the same and eliminate
261 * the redundant ones.
262 */
263 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
264 instructions.push_tail(ADD(vec4_offset,
265 varying_offset, const_offset & ~3));
266
267 int scale = 1;
268 if (brw->gen == 4 && dispatch_width == 8) {
269 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
270 * u, v, r) as parameters, or we can just use the SIMD16 message
271 * consisting of (header, u). We choose the second, at the cost of a
272 * longer return length.
273 */
274 scale = 2;
275 }
276
277 enum opcode op;
278 if (brw->gen >= 7)
279 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
280 else
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
282 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
283 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
284 inst->regs_written = 4 * scale;
285 instructions.push_tail(inst);
286
287 if (brw->gen < 7) {
288 inst->base_mrf = 13;
289 inst->header_present = true;
290 if (brw->gen == 4)
291 inst->mlen = 3;
292 else
293 inst->mlen = 1 + dispatch_width / 8;
294 }
295
296 vec4_result.reg_offset += (const_offset & 3) * scale;
297 instructions.push_tail(MOV(dst, vec4_result));
298
299 return instructions;
300 }
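/* Worked example (added comment): with const_offset == 13 and scale == 1,
 * the code above emits vec4_offset = varying_offset + 12 (13 & ~3), loads 4
 * contiguous components into vec4_result, and the final MOV reads component
 * (13 & 3) == 1 of that vector via reg_offset.
 */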
301
302 /**
303 * A helper for MOV generation for fixing up broken hardware SEND dependency
304 * handling.
305 */
306 fs_inst *
307 fs_visitor::DEP_RESOLVE_MOV(int grf)
308 {
309 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
310
311 inst->ir = NULL;
312 inst->annotation = "send dependency resolve";
313
314 /* The caller always wants uncompressed to emit the minimal extra
315 * dependencies, and to avoid having to deal with aligning its regs to 2.
316 */
317 inst->force_uncompressed = true;
318
319 return inst;
320 }
321
322 bool
323 fs_inst::equals(fs_inst *inst)
324 {
325 return (opcode == inst->opcode &&
326 dst.equals(inst->dst) &&
327 src[0].equals(inst->src[0]) &&
328 src[1].equals(inst->src[1]) &&
329 src[2].equals(inst->src[2]) &&
330 saturate == inst->saturate &&
331 predicate == inst->predicate &&
332 conditional_mod == inst->conditional_mod &&
333 mlen == inst->mlen &&
334 base_mrf == inst->base_mrf &&
335 sampler == inst->sampler &&
336 target == inst->target &&
337 eot == inst->eot &&
338 header_present == inst->header_present &&
339 shadow_compare == inst->shadow_compare &&
340 offset == inst->offset);
341 }
342
343 bool
344 fs_inst::overwrites_reg(const fs_reg &reg)
345 {
346 return (reg.file == dst.file &&
347 reg.reg == dst.reg &&
348 reg.reg_offset >= dst.reg_offset &&
349 reg.reg_offset < dst.reg_offset + regs_written);
350 }
351
352 bool
353 fs_inst::is_send_from_grf()
354 {
355 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
356 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
357 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
358 src[1].file == GRF) ||
359 (is_tex() && src[0].file == GRF));
360 }
361
362 bool
363 fs_visitor::can_do_source_mods(fs_inst *inst)
364 {
365 if (brw->gen == 6 && inst->is_math())
366 return false;
367
368 if (inst->is_send_from_grf())
369 return false;
370
371 if (!inst->can_do_source_mods())
372 return false;
373
374 return true;
375 }
376
377 void
378 fs_reg::init()
379 {
380 memset(this, 0, sizeof(*this));
381 this->smear = -1;
382 }
383
384 /** Generic unset register constructor. */
385 fs_reg::fs_reg()
386 {
387 init();
388 this->file = BAD_FILE;
389 }
390
391 /** Immediate value constructor. */
392 fs_reg::fs_reg(float f)
393 {
394 init();
395 this->file = IMM;
396 this->type = BRW_REGISTER_TYPE_F;
397 this->imm.f = f;
398 }
399
400 /** Immediate value constructor. */
401 fs_reg::fs_reg(int32_t i)
402 {
403 init();
404 this->file = IMM;
405 this->type = BRW_REGISTER_TYPE_D;
406 this->imm.i = i;
407 }
408
409 /** Immediate value constructor. */
410 fs_reg::fs_reg(uint32_t u)
411 {
412 init();
413 this->file = IMM;
414 this->type = BRW_REGISTER_TYPE_UD;
415 this->imm.u = u;
416 }
417
418 /** Fixed brw_reg Immediate value constructor. */
419 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
420 {
421 init();
422 this->file = HW_REG;
423 this->fixed_hw_reg = fixed_hw_reg;
424 this->type = fixed_hw_reg.type;
425 }
426
427 bool
428 fs_reg::equals(const fs_reg &r) const
429 {
430 return (file == r.file &&
431 reg == r.reg &&
432 reg_offset == r.reg_offset &&
433 type == r.type &&
434 negate == r.negate &&
435 abs == r.abs &&
436 !reladdr && !r.reladdr &&
437 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
438 sizeof(fixed_hw_reg)) == 0 &&
439 smear == r.smear &&
440 imm.u == r.imm.u);
441 }
442
443 fs_reg
444 fs_reg::retype(uint32_t type)
445 {
446 fs_reg result = *this;
447 result.type = type;
448 return result;
449 }
450
451 bool
452 fs_reg::is_zero() const
453 {
454 if (file != IMM)
455 return false;
456
457 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
458 }
459
460 bool
461 fs_reg::is_one() const
462 {
463 if (file != IMM)
464 return false;
465
466 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
467 }
468
469 bool
470 fs_reg::is_valid_3src() const
471 {
472 return file == GRF || file == UNIFORM;
473 }
474
475 int
476 fs_visitor::type_size(const struct glsl_type *type)
477 {
478 unsigned int size, i;
479
480 switch (type->base_type) {
481 case GLSL_TYPE_UINT:
482 case GLSL_TYPE_INT:
483 case GLSL_TYPE_FLOAT:
484 case GLSL_TYPE_BOOL:
485 return type->components();
486 case GLSL_TYPE_ARRAY:
487 return type_size(type->fields.array) * type->length;
488 case GLSL_TYPE_STRUCT:
489 size = 0;
490 for (i = 0; i < type->length; i++) {
491 size += type_size(type->fields.structure[i].type);
492 }
493 return size;
494 case GLSL_TYPE_SAMPLER:
495 /* Samplers take up no register space, since they're baked in at
496 * link time.
497 */
498 return 0;
499 case GLSL_TYPE_ATOMIC_UINT:
500 return 0;
501 case GLSL_TYPE_VOID:
502 case GLSL_TYPE_ERROR:
503 case GLSL_TYPE_INTERFACE:
504 assert(!"not reached");
505 break;
506 }
507
508 return 0;
509 }
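/* Example (added comment): type_size() counts scalar components, so a vec4
 * is 4, a float[10] array is 10, a struct { vec3 a; float b; } is 4, and
 * samplers and atomic counters contribute 0.
 */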
510
511 fs_reg
512 fs_visitor::get_timestamp()
513 {
514 assert(brw->gen >= 7);
515
516 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
517 BRW_ARF_TIMESTAMP,
518 0),
519 BRW_REGISTER_TYPE_UD));
520
521 fs_reg dst = fs_reg(this, glsl_type::uint_type);
522
523 fs_inst *mov = emit(MOV(dst, ts));
524 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
525 * even if it's not enabled in the dispatch.
526 */
527 mov->force_writemask_all = true;
528 mov->force_uncompressed = true;
529
530 /* The caller wants the low 32 bits of the timestamp. Since it's running
531 * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
532 * which is plenty of time for our purposes. It is identical across the
533 * EUs, but since it's tracking GPU core speed it will increment at a
534 * varying rate as render P-states change.
535 *
536 * The caller could also check if render P-states have changed (or anything
537 * else that might disrupt timing) by setting smear to 2 and checking if
538 * that field is != 0.
539 */
540 dst.smear = 0;
541
542 return dst;
543 }
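/* Back-of-the-envelope check (added comment): a 32-bit counter ticking at
 * roughly 1.2 GHz wraps after 2^32 / 1.2e9 ~= 3.6 seconds, which is where
 * the "~3 seconds" figure above comes from.
 */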
544
545 void
546 fs_visitor::emit_shader_time_begin()
547 {
548 current_annotation = "shader time start";
549 shader_start_time = get_timestamp();
550 }
551
552 void
553 fs_visitor::emit_shader_time_end()
554 {
555 current_annotation = "shader time end";
556
557 enum shader_time_shader_type type, written_type, reset_type;
558 if (dispatch_width == 8) {
559 type = ST_FS8;
560 written_type = ST_FS8_WRITTEN;
561 reset_type = ST_FS8_RESET;
562 } else {
563 assert(dispatch_width == 16);
564 type = ST_FS16;
565 written_type = ST_FS16_WRITTEN;
566 reset_type = ST_FS16_RESET;
567 }
568
569 fs_reg shader_end_time = get_timestamp();
570
571 /* Check that there weren't any timestamp reset events (assuming these
572 * were the only two timestamp reads that happened).
573 */
574 fs_reg reset = shader_end_time;
575 reset.smear = 2;
576 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
577 test->conditional_mod = BRW_CONDITIONAL_Z;
578 emit(IF(BRW_PREDICATE_NORMAL));
579
580 push_force_uncompressed();
581 fs_reg start = shader_start_time;
582 start.negate = true;
583 fs_reg diff = fs_reg(this, glsl_type::uint_type);
584 emit(ADD(diff, start, shader_end_time));
585
586 /* If there were no instructions between the two timestamp gets, the diff
587 * is 2 cycles. Remove that overhead, so I can forget about that when
588 * trying to determine the time taken for single instructions.
589 */
590 emit(ADD(diff, diff, fs_reg(-2u)));
591
592 emit_shader_time_write(type, diff);
593 emit_shader_time_write(written_type, fs_reg(1u));
594 emit(BRW_OPCODE_ELSE);
595 emit_shader_time_write(reset_type, fs_reg(1u));
596 emit(BRW_OPCODE_ENDIF);
597
598 pop_force_uncompressed();
599 }
600
601 void
602 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
603 fs_reg value)
604 {
605 int shader_time_index =
606 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
607 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
608
609 fs_reg payload;
610 if (dispatch_width == 8)
611 payload = fs_reg(this, glsl_type::uvec2_type);
612 else
613 payload = fs_reg(this, glsl_type::uint_type);
614
615 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
616 fs_reg(), payload, offset, value));
617 }
618
619 void
620 fs_visitor::fail(const char *format, ...)
621 {
622 va_list va;
623 char *msg;
624
625 if (failed)
626 return;
627
628 failed = true;
629
630 va_start(va, format);
631 msg = ralloc_vasprintf(mem_ctx, format, va);
632 va_end(va);
633 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
634
635 this->fail_msg = msg;
636
637 if (INTEL_DEBUG & DEBUG_WM) {
638 fprintf(stderr, "%s", msg);
639 }
640 }
641
642 fs_inst *
643 fs_visitor::emit(enum opcode opcode)
644 {
645 return emit(fs_inst(opcode));
646 }
647
648 fs_inst *
649 fs_visitor::emit(enum opcode opcode, fs_reg dst)
650 {
651 return emit(fs_inst(opcode, dst));
652 }
653
654 fs_inst *
655 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
656 {
657 return emit(fs_inst(opcode, dst, src0));
658 }
659
660 fs_inst *
661 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
662 {
663 return emit(fs_inst(opcode, dst, src0, src1));
664 }
665
666 fs_inst *
667 fs_visitor::emit(enum opcode opcode, fs_reg dst,
668 fs_reg src0, fs_reg src1, fs_reg src2)
669 {
670 return emit(fs_inst(opcode, dst, src0, src1, src2));
671 }
672
673 void
674 fs_visitor::push_force_uncompressed()
675 {
676 force_uncompressed_stack++;
677 }
678
679 void
680 fs_visitor::pop_force_uncompressed()
681 {
682 force_uncompressed_stack--;
683 assert(force_uncompressed_stack >= 0);
684 }
685
686 void
687 fs_visitor::push_force_sechalf()
688 {
689 force_sechalf_stack++;
690 }
691
692 void
693 fs_visitor::pop_force_sechalf()
694 {
695 force_sechalf_stack--;
696 assert(force_sechalf_stack >= 0);
697 }
698
699 /**
700 * Returns true if the instruction has a flag that means it won't
701 * update an entire destination register.
702 *
703 * For example, dead code elimination and live variable analysis want to know
704 * when a write to a variable screens off any preceding values that were in
705 * it.
706 */
707 bool
708 fs_inst::is_partial_write()
709 {
710 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
711 this->force_uncompressed ||
712 this->force_sechalf);
713 }
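/* Illustration (added comment, not from the original source): a predicated
 * MOV such as
 *
 *    (+f0) mov dst, src
 *
 * only writes the enabled channels of dst, so it must not be treated as
 * killing the previous value of dst; SEL is the exception because it writes
 * every channel regardless of the predicate.
 */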
714
715 int
716 fs_inst::regs_read(fs_visitor *v, int arg)
717 {
718 if (is_tex() && arg == 0 && src[0].file == GRF) {
719 if (v->dispatch_width == 16)
720 return (mlen + 1) / 2;
721 else
722 return mlen;
723 }
724 return 1;
725 }
726
727 /**
728 * Returns how many MRFs an FS opcode will write over.
729 *
730 * Note that this is not the 0 or 1 implied writes in an actual gen
731 * instruction -- the FS opcodes often generate MOVs in addition.
732 */
733 int
734 fs_visitor::implied_mrf_writes(fs_inst *inst)
735 {
736 if (inst->mlen == 0)
737 return 0;
738
739 if (inst->base_mrf == -1)
740 return 0;
741
742 switch (inst->opcode) {
743 case SHADER_OPCODE_RCP:
744 case SHADER_OPCODE_RSQ:
745 case SHADER_OPCODE_SQRT:
746 case SHADER_OPCODE_EXP2:
747 case SHADER_OPCODE_LOG2:
748 case SHADER_OPCODE_SIN:
749 case SHADER_OPCODE_COS:
750 return 1 * dispatch_width / 8;
751 case SHADER_OPCODE_POW:
752 case SHADER_OPCODE_INT_QUOTIENT:
753 case SHADER_OPCODE_INT_REMAINDER:
754 return 2 * dispatch_width / 8;
755 case SHADER_OPCODE_TEX:
756 case FS_OPCODE_TXB:
757 case SHADER_OPCODE_TXD:
758 case SHADER_OPCODE_TXF:
759 case SHADER_OPCODE_TXF_MS:
760 case SHADER_OPCODE_TG4:
761 case SHADER_OPCODE_TG4_OFFSET:
762 case SHADER_OPCODE_TXL:
763 case SHADER_OPCODE_TXS:
764 case SHADER_OPCODE_LOD:
765 return 1;
766 case FS_OPCODE_FB_WRITE:
767 return 2;
768 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
769 case SHADER_OPCODE_GEN4_SCRATCH_READ:
770 return 1;
771 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
772 return inst->mlen;
773 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
774 return 2;
775 case SHADER_OPCODE_UNTYPED_ATOMIC:
776 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
777 return 0;
778 default:
779 assert(!"not reached");
780 return inst->mlen;
781 }
782 }
783
784 int
785 fs_visitor::virtual_grf_alloc(int size)
786 {
787 if (virtual_grf_array_size <= virtual_grf_count) {
788 if (virtual_grf_array_size == 0)
789 virtual_grf_array_size = 16;
790 else
791 virtual_grf_array_size *= 2;
792 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
793 virtual_grf_array_size);
794 }
795 virtual_grf_sizes[virtual_grf_count] = size;
796 return virtual_grf_count++;
797 }
798
799 /** Fixed HW reg constructor. */
800 fs_reg::fs_reg(enum register_file file, int reg)
801 {
802 init();
803 this->file = file;
804 this->reg = reg;
805 this->type = BRW_REGISTER_TYPE_F;
806 }
807
808 /** Fixed HW reg constructor. */
809 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
810 {
811 init();
812 this->file = file;
813 this->reg = reg;
814 this->type = type;
815 }
816
817 /** Automatic reg constructor. */
818 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
819 {
820 init();
821
822 this->file = GRF;
823 this->reg = v->virtual_grf_alloc(v->type_size(type));
824 this->reg_offset = 0;
825 this->type = brw_type_for_base_type(type);
826 }
827
828 fs_reg *
829 fs_visitor::variable_storage(ir_variable *var)
830 {
831 return (fs_reg *)hash_table_find(this->variable_ht, var);
832 }
833
834 void
835 import_uniforms_callback(const void *key,
836 void *data,
837 void *closure)
838 {
839 struct hash_table *dst_ht = (struct hash_table *)closure;
840 const fs_reg *reg = (const fs_reg *)data;
841
842 if (reg->file != UNIFORM)
843 return;
844
845 hash_table_insert(dst_ht, data, key);
846 }
847
848 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
849 * This brings in those uniform definitions.
850 */
851 void
852 fs_visitor::import_uniforms(fs_visitor *v)
853 {
854 hash_table_call_foreach(v->variable_ht,
855 import_uniforms_callback,
856 variable_ht);
857 this->params_remap = v->params_remap;
858 this->nr_params_remap = v->nr_params_remap;
859 }
860
861 /* Our support for uniforms is piggy-backed on the struct
862 * gl_fragment_program, because that's where the values actually
863 * get stored, rather than in some global gl_shader_program uniform
864 * store.
865 */
866 void
867 fs_visitor::setup_uniform_values(ir_variable *ir)
868 {
869 int namelen = strlen(ir->name);
870
871 /* The data for our (non-builtin) uniforms is stored in a series of
872 * gl_uniform_driver_storage structs for each subcomponent that
873 * glGetUniformLocation() could name. We know it's been set up in the same
874 * order we'd walk the type, so walk the list of storage and find anything
875 * with our name, or the prefix of a component that starts with our name.
876 */
877 unsigned params_before = c->prog_data.nr_params;
878 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
879 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
880
881 if (strncmp(ir->name, storage->name, namelen) != 0 ||
882 (storage->name[namelen] != 0 &&
883 storage->name[namelen] != '.' &&
884 storage->name[namelen] != '[')) {
885 continue;
886 }
887
888 unsigned slots = storage->type->component_slots();
889 if (storage->array_elements)
890 slots *= storage->array_elements;
891
892 for (unsigned i = 0; i < slots; i++) {
893 c->prog_data.param[c->prog_data.nr_params++] =
894 &storage->storage[i].f;
895 }
896 }
897
898 /* Make sure we actually initialized the right amount of stuff here. */
899 assert(params_before + ir->type->component_slots() ==
900 c->prog_data.nr_params);
901 (void)params_before;
902 }
903
904
905 /* Our support for builtin uniforms is even scarier than non-builtin.
906 * It sits on top of the PROG_STATE_VAR parameters that are
907 * automatically updated from GL context state.
908 */
909 void
910 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
911 {
912 const ir_state_slot *const slots = ir->state_slots;
913 assert(ir->state_slots != NULL);
914
915 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
916 /* This state reference has already been set up by ir_to_mesa, but we'll
917 * get the same index back here.
918 */
919 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
920 (gl_state_index *)slots[i].tokens);
921
922 /* Add each of the unique swizzles of the element as a parameter.
923 * This'll end up matching the expected layout of the
924 * array/matrix/structure we're trying to fill in.
925 */
926 int last_swiz = -1;
927 for (unsigned int j = 0; j < 4; j++) {
928 int swiz = GET_SWZ(slots[i].swizzle, j);
929 if (swiz == last_swiz)
930 break;
931 last_swiz = swiz;
932
933 c->prog_data.param[c->prog_data.nr_params++] =
934 &fp->Base.Parameters->ParameterValues[index][swiz].f;
935 }
936 }
937 }
938
939 fs_reg *
940 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
941 {
942 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
943 fs_reg wpos = *reg;
944 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
945
946 /* gl_FragCoord.x */
947 if (ir->pixel_center_integer) {
948 emit(MOV(wpos, this->pixel_x));
949 } else {
950 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
951 }
952 wpos.reg_offset++;
953
954 /* gl_FragCoord.y */
955 if (!flip && ir->pixel_center_integer) {
956 emit(MOV(wpos, this->pixel_y));
957 } else {
958 fs_reg pixel_y = this->pixel_y;
959 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
960
961 if (flip) {
962 pixel_y.negate = true;
963 offset += c->key.drawable_height - 1.0;
964 }
965
966 emit(ADD(wpos, pixel_y, fs_reg(offset)));
967 }
968 wpos.reg_offset++;
969
970 /* gl_FragCoord.z */
971 if (brw->gen >= 6) {
972 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
973 } else {
974 emit(FS_OPCODE_LINTERP, wpos,
975 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
976 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
977 interp_reg(VARYING_SLOT_POS, 2));
978 }
979 wpos.reg_offset++;
980
981 /* gl_FragCoord.w: Already set up in emit_interpolation */
982 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
983
984 return reg;
985 }
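/* Worked example (added comment): for a window-system framebuffer with the
 * default lower-left gl_FragCoord origin, flip is true, so the .y component
 * above is computed as (drawable_height - 1 + 0.5) - pixel_y (the 0.5 is
 * dropped when pixel_center_integer is set), giving GL's bottom-up Y.
 */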
986
987 fs_inst *
988 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
989 glsl_interp_qualifier interpolation_mode,
990 bool is_centroid)
991 {
992 brw_wm_barycentric_interp_mode barycoord_mode;
993 if (brw->gen >= 6) {
994 if (is_centroid) {
995 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
996 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
997 else
998 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
999 } else {
1000 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1001 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1002 else
1003 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1004 }
1005 } else {
1006 /* On Ironlake and below, there is only one interpolation mode.
1007 * Centroid interpolation doesn't mean anything on this hardware --
1008 * there is no multisampling.
1009 */
1010 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1011 }
1012 return emit(FS_OPCODE_LINTERP, attr,
1013 this->delta_x[barycoord_mode],
1014 this->delta_y[barycoord_mode], interp);
1015 }
1016
1017 fs_reg *
1018 fs_visitor::emit_general_interpolation(ir_variable *ir)
1019 {
1020 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1021 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1022 fs_reg attr = *reg;
1023
1024 unsigned int array_elements;
1025 const glsl_type *type;
1026
1027 if (ir->type->is_array()) {
1028 array_elements = ir->type->length;
1029 if (array_elements == 0) {
1030 fail("dereferenced array '%s' has length 0\n", ir->name);
1031 }
1032 type = ir->type->fields.array;
1033 } else {
1034 array_elements = 1;
1035 type = ir->type;
1036 }
1037
1038 glsl_interp_qualifier interpolation_mode =
1039 ir->determine_interpolation_mode(c->key.flat_shade);
1040
1041 int location = ir->location;
1042 for (unsigned int i = 0; i < array_elements; i++) {
1043 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1044 if (c->prog_data.urb_setup[location] == -1) {
1045 /* If there's no incoming setup data for this slot, don't
1046 * emit interpolation for it.
1047 */
1048 attr.reg_offset += type->vector_elements;
1049 location++;
1050 continue;
1051 }
1052
1053 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1054 /* Constant interpolation (flat shading) case. The SF has
1055 * handed us defined values in only the constant offset
1056 * field of the setup reg.
1057 */
1058 for (unsigned int k = 0; k < type->vector_elements; k++) {
1059 struct brw_reg interp = interp_reg(location, k);
1060 interp = suboffset(interp, 3);
1061 interp.type = reg->type;
1062 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1063 attr.reg_offset++;
1064 }
1065 } else {
1066 /* Smooth/noperspective interpolation case. */
1067 for (unsigned int k = 0; k < type->vector_elements; k++) {
1068 /* FINISHME: At some point we probably want to push
1069 * this farther by giving similar treatment to the
1070 * other potentially constant components of the
1071 * attribute, as well as making brw_vs_constval.c
1072 * handle varyings other than gl_TexCoord.
1073 */
1074 struct brw_reg interp = interp_reg(location, k);
1075 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1076 ir->centroid);
1077 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1078 /* Get the pixel/sample mask into f0 so that we know
1079 * which pixels are lit. Then, for each channel that is
1080 * unlit, replace the centroid data with non-centroid
1081 * data.
1082 */
1083 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1084 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1085 interpolation_mode, false);
1086 inst->predicate = BRW_PREDICATE_NORMAL;
1087 inst->predicate_inverse = true;
1088 }
1089 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1090 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1091 }
1092 attr.reg_offset++;
1093 }
1094
1095 }
1096 location++;
1097 }
1098 }
1099
1100 return reg;
1101 }
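/* Example (added comment): a mat3 input has one array element and three
 * matrix columns of three components each, so the loops above emit
 * interpolation for three consecutive locations, advancing attr.reg_offset
 * by one register per component.
 */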
1102
1103 fs_reg *
1104 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1105 {
1106 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1107
1108 /* The frontfacing comes in as a bit in the thread payload. */
1109 if (brw->gen >= 6) {
1110 emit(BRW_OPCODE_ASR, *reg,
1111 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1112 fs_reg(15));
1113 emit(BRW_OPCODE_NOT, *reg, *reg);
1114 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1115 } else {
1116 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1117 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1118 * us front face
1119 */
1120 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1121 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1122 }
1123
1124 return reg;
1125 }
1126
1127 fs_reg
1128 fs_visitor::fix_math_operand(fs_reg src)
1129 {
1130 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1131 * might be able to do better by doing execsize = 1 math and then
1132 * expanding that result out, but we would need to be careful with
1133 * masking.
1134 *
1135 * The hardware ignores source modifiers (negate and abs) on math
1136 * instructions, so we also move to a temp to set those up.
1137 */
1138 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1139 !src.abs && !src.negate)
1140 return src;
1141
1142 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1143 * operands to math
1144 */
1145 if (brw->gen >= 7 && src.file != IMM)
1146 return src;
1147
1148 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1149 expanded.type = src.type;
1150 emit(BRW_OPCODE_MOV, expanded, src);
1151 return expanded;
1152 }
1153
1154 fs_inst *
1155 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1156 {
1157 switch (opcode) {
1158 case SHADER_OPCODE_RCP:
1159 case SHADER_OPCODE_RSQ:
1160 case SHADER_OPCODE_SQRT:
1161 case SHADER_OPCODE_EXP2:
1162 case SHADER_OPCODE_LOG2:
1163 case SHADER_OPCODE_SIN:
1164 case SHADER_OPCODE_COS:
1165 break;
1166 default:
1167 assert(!"not reached: bad math opcode");
1168 return NULL;
1169 }
1170
1171 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1172 * might be able to do better by doing execsize = 1 math and then
1173 * expanding that result out, but we would need to be careful with
1174 * masking.
1175 *
1176 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1177 * instructions, so we also move to a temp to set those up.
1178 */
1179 if (brw->gen >= 6)
1180 src = fix_math_operand(src);
1181
1182 fs_inst *inst = emit(opcode, dst, src);
1183
1184 if (brw->gen < 6) {
1185 inst->base_mrf = 2;
1186 inst->mlen = dispatch_width / 8;
1187 }
1188
1189 return inst;
1190 }
1191
1192 fs_inst *
1193 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1194 {
1195 int base_mrf = 2;
1196 fs_inst *inst;
1197
1198 switch (opcode) {
1199 case SHADER_OPCODE_INT_QUOTIENT:
1200 case SHADER_OPCODE_INT_REMAINDER:
1201 if (brw->gen >= 7 && dispatch_width == 16)
1202 fail("16-wide INTDIV unsupported\n");
1203 break;
1204 case SHADER_OPCODE_POW:
1205 break;
1206 default:
1207 assert(!"not reached: unsupported binary math opcode.");
1208 return NULL;
1209 }
1210
1211 if (brw->gen >= 6) {
1212 src0 = fix_math_operand(src0);
1213 src1 = fix_math_operand(src1);
1214
1215 inst = emit(opcode, dst, src0, src1);
1216 } else {
1217 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1218 * "Message Payload":
1219 *
1220 * "Operand0[7]. For the INT DIV functions, this operand is the
1221 * denominator."
1222 * ...
1223 * "Operand1[7]. For the INT DIV functions, this operand is the
1224 * numerator."
1225 */
1226 bool is_int_div = opcode != SHADER_OPCODE_POW;
1227 fs_reg &op0 = is_int_div ? src1 : src0;
1228 fs_reg &op1 = is_int_div ? src0 : src1;
1229
1230 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1231 inst = emit(opcode, dst, op0, reg_null_f);
1232
1233 inst->base_mrf = base_mrf;
1234 inst->mlen = 2 * dispatch_width / 8;
1235 }
1236 return inst;
1237 }
1238
1239 void
1240 fs_visitor::assign_curb_setup()
1241 {
1242 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1243 if (dispatch_width == 8) {
1244 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1245 } else {
1246 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1247 }
1248
1249 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1250 foreach_list(node, &this->instructions) {
1251 fs_inst *inst = (fs_inst *)node;
1252
1253 for (unsigned int i = 0; i < 3; i++) {
1254 if (inst->src[i].file == UNIFORM) {
1255 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1256 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1257 constant_nr / 8,
1258 constant_nr % 8);
1259
1260 inst->src[i].file = HW_REG;
1261 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1262 }
1263 }
1264 }
1265 }
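/* Worked example (added comment): with nr_payload_regs == 2, UNIFORM slot 11
 * maps to g3.3 above (11 / 8 == 1 selects the second push-constant register,
 * 11 % 8 == 3 the channel within it).
 */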
1266
1267 void
1268 fs_visitor::calculate_urb_setup()
1269 {
1270 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1271 c->prog_data.urb_setup[i] = -1;
1272 }
1273
1274 int urb_next = 0;
1275 /* Figure out where each of the incoming setup attributes lands. */
1276 if (brw->gen >= 6) {
1277 if (_mesa_bitcount_64(fp->Base.InputsRead &
1278 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1279 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1280 * first 16 varying inputs, so we can put them wherever we want.
1281 * Just put them in order.
1282 *
1283 * This is useful because it means that (a) inputs not used by the
1284 * fragment shader won't take up valuable register space, and (b) we
1285 * won't have to recompile the fragment shader if it gets paired with
1286 * a different vertex (or geometry) shader.
1287 */
1288 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1289 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1290 BITFIELD64_BIT(i)) {
1291 c->prog_data.urb_setup[i] = urb_next++;
1292 }
1293 }
1294 } else {
1295 /* We have enough input varyings that the SF/SBE pipeline stage can't
1296 * arbitrarily rearrange them to suit our whim; we have to put them
1297 * in an order that matches the output of the previous pipeline stage
1298 * (geometry or vertex shader).
1299 */
1300 struct brw_vue_map prev_stage_vue_map;
1301 brw_compute_vue_map(brw, &prev_stage_vue_map,
1302 c->key.input_slots_valid);
1303 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1304 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1305 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1306 slot++) {
1307 int varying = prev_stage_vue_map.slot_to_varying[slot];
1308 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1309 * unused.
1310 */
1311 if (varying != BRW_VARYING_SLOT_COUNT &&
1312 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1313 BITFIELD64_BIT(varying))) {
1314 c->prog_data.urb_setup[varying] = slot - first_slot;
1315 }
1316 }
1317 urb_next = prev_stage_vue_map.num_slots - first_slot;
1318 }
1319 } else {
1320 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1321 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1322 /* Point size is packed into the header, not as a general attribute */
1323 if (i == VARYING_SLOT_PSIZ)
1324 continue;
1325
1326 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1327 /* The back color slot is skipped when the front color is
1328 * also written to. In addition, some slots can be
1329 * written in the vertex shader and not read in the
1330 * fragment shader. So the register number must always be
1331 * incremented, mapped or not.
1332 */
1333 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1334 c->prog_data.urb_setup[i] = urb_next;
1335 urb_next++;
1336 }
1337 }
1338
1339 /*
1340 * It's an FS-only attribute, and we did interpolation for this attribute
1341 * in the SF thread. So count it here, too.
1342 *
1343 * See compile_sf_prog() for more info.
1344 */
1345 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1346 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1347 }
1348
1349 c->prog_data.num_varying_inputs = urb_next;
1350 }
1351
1352 void
1353 fs_visitor::assign_urb_setup()
1354 {
1355 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1356
1357 /* Offset all the urb_setup[] indices by the actual position of the
1358 * setup regs, now that the location of the constants has been chosen.
1359 */
1360 foreach_list(node, &this->instructions) {
1361 fs_inst *inst = (fs_inst *)node;
1362
1363 if (inst->opcode == FS_OPCODE_LINTERP) {
1364 assert(inst->src[2].file == HW_REG);
1365 inst->src[2].fixed_hw_reg.nr += urb_start;
1366 }
1367
1368 if (inst->opcode == FS_OPCODE_CINTERP) {
1369 assert(inst->src[0].file == HW_REG);
1370 inst->src[0].fixed_hw_reg.nr += urb_start;
1371 }
1372 }
1373
1374 /* Each attribute is 4 setup channels, each of which is half a reg. */
1375 this->first_non_payload_grf =
1376 urb_start + c->prog_data.num_varying_inputs * 2;
1377 }
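/* Example (added comment): with urb_start == 10 and num_varying_inputs == 3,
 * the setup data occupies 3 * 2 == 6 registers and first_non_payload_grf
 * becomes 16.
 */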
1378
1379 /**
1380 * Split large virtual GRFs into separate components if we can.
1381 *
1382 * This is mostly duplicated with what brw_fs_vector_splitting does,
1383 * but that's really conservative because it's afraid of doing
1384 * splitting that doesn't result in real progress after the rest of
1385 * the optimization phases, which would cause infinite looping in
1386 * optimization. We can do it once here, safely. This also has the
1387 * opportunity to split interpolated values, or maybe even uniforms,
1388 * which we don't have at the IR level.
1389 *
1390 * We want to split, because virtual GRFs are what we register
1391 * allocate and spill (due to contiguousness requirements for some
1392 * instructions), and they're what we naturally generate in the
1393 * codegen process, but most virtual GRFs don't actually need to be
1394 * contiguous sets of GRFs. If we split, we'll end up with reduced
1395 * live intervals and better dead code elimination and coalescing.
1396 */
1397 void
1398 fs_visitor::split_virtual_grfs()
1399 {
1400 int num_vars = this->virtual_grf_count;
1401 bool split_grf[num_vars];
1402 int new_virtual_grf[num_vars];
1403
1404 /* Try to split anything larger than one register. */
1405 for (int i = 0; i < num_vars; i++) {
1406 if (this->virtual_grf_sizes[i] != 1)
1407 split_grf[i] = true;
1408 else
1409 split_grf[i] = false;
1410 }
1411
1412 if (brw->has_pln &&
1413 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1414 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1415 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1416 * Gen6, that was the only supported interpolation mode, and since Gen6,
1417 * delta_x and delta_y are in fixed hardware registers.
1418 */
1419 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1420 false;
1421 }
1422
1423 foreach_list(node, &this->instructions) {
1424 fs_inst *inst = (fs_inst *)node;
1425
1426 /* If there's a SEND message that requires contiguous destination
1427 * registers, no splitting is allowed.
1428 */
1429 if (inst->regs_written > 1) {
1430 split_grf[inst->dst.reg] = false;
1431 }
1432
1433 /* If we're sending from a GRF, don't split it, on the assumption that
1434 * the send is reading the whole thing.
1435 */
1436 if (inst->is_send_from_grf()) {
1437 for (int i = 0; i < 3; i++) {
1438 if (inst->src[i].file == GRF) {
1439 split_grf[inst->src[i].reg] = false;
1440 }
1441 }
1442 }
1443 }
1444
1445 /* Allocate new space for split regs. Note that the virtual
1446 * numbers will be contiguous.
1447 */
1448 for (int i = 0; i < num_vars; i++) {
1449 if (split_grf[i]) {
1450 new_virtual_grf[i] = virtual_grf_alloc(1);
1451 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1452 int reg = virtual_grf_alloc(1);
1453 assert(reg == new_virtual_grf[i] + j - 1);
1454 (void) reg;
1455 }
1456 this->virtual_grf_sizes[i] = 1;
1457 }
1458 }
1459
1460 foreach_list(node, &this->instructions) {
1461 fs_inst *inst = (fs_inst *)node;
1462
1463 if (inst->dst.file == GRF &&
1464 split_grf[inst->dst.reg] &&
1465 inst->dst.reg_offset != 0) {
1466 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1467 inst->dst.reg_offset - 1);
1468 inst->dst.reg_offset = 0;
1469 }
1470 for (int i = 0; i < 3; i++) {
1471 if (inst->src[i].file == GRF &&
1472 split_grf[inst->src[i].reg] &&
1473 inst->src[i].reg_offset != 0) {
1474 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1475 inst->src[i].reg_offset - 1);
1476 inst->src[i].reg_offset = 0;
1477 }
1478 }
1479 }
1480 invalidate_live_intervals();
1481 }
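/* Worked example (added comment): a split virtual GRF of size 4 keeps its
 * original number for reg_offset 0 and gets three freshly allocated size-1
 * GRFs for offsets 1..3; a source that used (reg == r, reg_offset == 2) is
 * rewritten above to (new_virtual_grf[r] + 1, reg_offset == 0).
 */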
1482
1483 /**
1484 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1485 *
1486 * During code generation, we create tons of temporary variables, many of
1487 * which get immediately killed and are never used again. Yet, in later
1488 * optimization and analysis passes, such as compute_live_intervals, we need
1489 * to loop over all the virtual GRFs. Compacting them can save a lot of
1490 * overhead.
1491 */
1492 void
1493 fs_visitor::compact_virtual_grfs()
1494 {
1495 /* Mark which virtual GRFs are used, and count how many. */
1496 int remap_table[this->virtual_grf_count];
1497 memset(remap_table, -1, sizeof(remap_table));
1498
1499 foreach_list(node, &this->instructions) {
1500 const fs_inst *inst = (const fs_inst *) node;
1501
1502 if (inst->dst.file == GRF)
1503 remap_table[inst->dst.reg] = 0;
1504
1505 for (int i = 0; i < 3; i++) {
1506 if (inst->src[i].file == GRF)
1507 remap_table[inst->src[i].reg] = 0;
1508 }
1509 }
1510
1511 /* In addition to registers used in instructions, fs_visitor keeps
1512 * direct references to certain special values which must be patched:
1513 */
1514 fs_reg *special[] = {
1515 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1516 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1517 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1518 &delta_x[0], &delta_x[1], &delta_x[2],
1519 &delta_x[3], &delta_x[4], &delta_x[5],
1520 &delta_y[0], &delta_y[1], &delta_y[2],
1521 &delta_y[3], &delta_y[4], &delta_y[5],
1522 };
1523 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1524 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1525
1526 /* Treat all special values as used, to be conservative */
1527 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1528 if (special[i]->file == GRF)
1529 remap_table[special[i]->reg] = 0;
1530 }
1531
1532 /* Compact the GRF arrays. */
1533 int new_index = 0;
1534 for (int i = 0; i < this->virtual_grf_count; i++) {
1535 if (remap_table[i] != -1) {
1536 remap_table[i] = new_index;
1537 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1538 invalidate_live_intervals();
1539 ++new_index;
1540 }
1541 }
1542
1543 this->virtual_grf_count = new_index;
1544
1545 /* Patch all the instructions to use the newly renumbered registers */
1546 foreach_list(node, &this->instructions) {
1547 fs_inst *inst = (fs_inst *) node;
1548
1549 if (inst->dst.file == GRF)
1550 inst->dst.reg = remap_table[inst->dst.reg];
1551
1552 for (int i = 0; i < 3; i++) {
1553 if (inst->src[i].file == GRF)
1554 inst->src[i].reg = remap_table[inst->src[i].reg];
1555 }
1556 }
1557
1558 /* Patch all the references to special values */
1559 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1560 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1561 special[i]->reg = remap_table[special[i]->reg];
1562 }
1563 }
1564
1565 bool
1566 fs_visitor::remove_dead_constants()
1567 {
1568 if (dispatch_width == 8) {
1569 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1570 this->nr_params_remap = c->prog_data.nr_params;
1571
1572 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1573 this->params_remap[i] = -1;
1574
1575 /* Find which params are still in use. */
1576 foreach_list(node, &this->instructions) {
1577 fs_inst *inst = (fs_inst *)node;
1578
1579 for (int i = 0; i < 3; i++) {
1580 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1581
1582 if (inst->src[i].file != UNIFORM)
1583 continue;
1584
1585 /* Section 5.11 of the OpenGL 4.3 spec says:
1586 *
1587 * "Out-of-bounds reads return undefined values, which include
1588 * values from other variables of the active program or zero."
1589 */
1590 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1591 constant_nr = 0;
1592 }
1593
1594 /* For now, set this to non-negative. We'll give it the
1595 * actual new number in a moment, in order to keep the
1596 * register numbers nicely ordered.
1597 */
1598 this->params_remap[constant_nr] = 0;
1599 }
1600 }
1601
1602 /* Figure out what the new numbers for the params will be. At some
1603 * point when we're doing uniform array access, we're going to want
1604 * to keep the distinction between .reg and .reg_offset, but for
1605 * now we don't care.
1606 */
1607 unsigned int new_nr_params = 0;
1608 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1609 if (this->params_remap[i] != -1) {
1610 this->params_remap[i] = new_nr_params++;
1611 }
1612 }
1613
1614 /* Update the list of params to be uploaded to match our new numbering. */
1615 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1616 int remapped = this->params_remap[i];
1617
1618 if (remapped == -1)
1619 continue;
1620
1621 c->prog_data.param[remapped] = c->prog_data.param[i];
1622 }
1623
1624 c->prog_data.nr_params = new_nr_params;
1625 } else {
1626 /* This should have been generated in the 8-wide pass already. */
1627 assert(this->params_remap);
1628 }
1629
1630 /* Now do the renumbering of the shader to remove unused params. */
1631 foreach_list(node, &this->instructions) {
1632 fs_inst *inst = (fs_inst *)node;
1633
1634 for (int i = 0; i < 3; i++) {
1635 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1636
1637 if (inst->src[i].file != UNIFORM)
1638 continue;
1639
1640 /* as above alias to 0 */
1641 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1642 constant_nr = 0;
1643 }
1644 assert(this->params_remap[constant_nr] != -1);
1645 inst->src[i].reg = this->params_remap[constant_nr];
1646 inst->src[i].reg_offset = 0;
1647 }
1648 }
1649
1650 return true;
1651 }
1652
1653 /*
1654 * Implements array access of uniforms by inserting a
1655 * PULL_CONSTANT_LOAD instruction.
1656 *
1657 * Unlike temporary GRF array access (where we don't support it due to
1658 * the difficulty of doing relative addressing on instruction
1659 * destinations), we could potentially do array access of uniforms
1660 * that were loaded in GRF space as push constants. In real-world
1661 * usage we've seen, though, the arrays being used are always larger
1662 * than we could load as push constants, so just always move all
1663 * uniform array access out to a pull constant buffer.
1664 */
1665 void
1666 fs_visitor::move_uniform_array_access_to_pull_constants()
1667 {
1668 int pull_constant_loc[c->prog_data.nr_params];
1669
1670 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1671 pull_constant_loc[i] = -1;
1672 }
1673
1674 /* Walk through and find array access of uniforms. Put a copy of that
1675 * uniform in the pull constant buffer.
1676 *
1677 * Note that we don't move constant-indexed accesses to arrays. No
1678 * testing has been done of the performance impact of this choice.
1679 */
1680 foreach_list_safe(node, &this->instructions) {
1681 fs_inst *inst = (fs_inst *)node;
1682
1683 for (int i = 0 ; i < 3; i++) {
1684 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1685 continue;
1686
1687 int uniform = inst->src[i].reg;
1688
1689 /* If this array isn't already present in the pull constant buffer,
1690 * add it.
1691 */
1692 if (pull_constant_loc[uniform] == -1) {
1693 const float **values = &c->prog_data.param[uniform];
1694
1695 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1696
1697 assert(param_size[uniform]);
1698
1699 for (int j = 0; j < param_size[uniform]; j++) {
1700 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1701 values[j];
1702 }
1703 }
1704
1705 /* Set up the annotation tracking for new generated instructions. */
1706 base_ir = inst->ir;
1707 current_annotation = inst->annotation;
1708
1709 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1710 fs_reg temp = fs_reg(this, glsl_type::float_type);
1711 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1712 surf_index,
1713 *inst->src[i].reladdr,
1714 pull_constant_loc[uniform] +
1715 inst->src[i].reg_offset);
1716 inst->insert_before(&list);
1717
1718 inst->src[i].file = temp.file;
1719 inst->src[i].reg = temp.reg;
1720 inst->src[i].reg_offset = temp.reg_offset;
1721 inst->src[i].reladdr = NULL;
1722 }
1723 }
1724 }
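/* Example (added comment): for "uniform vec4 a[20];" accessed as a[i], the
 * first reladdr source referencing it appends the param pointers for the
 * whole array (param_size[uniform] of them) to pull_param, and each such
 * source is then replaced by a temp GRF filled by VARYING_PULL_CONSTANT_LOAD.
 */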
1725
1726 /**
1727 * Choose accesses from the UNIFORM file to demote to using the pull
1728 * constant buffer.
1729 *
1730 * We allow a fragment shader to have more than the specified minimum
1731 * maximum number of fragment shader uniform components (64). If
1732 * there are too many of these, they'd fill up all of register space.
1733 * So, this will push some of them out to the pull constant buffer and
1734 * update the program to load them.
1735 */
1736 void
1737 fs_visitor::setup_pull_constants()
1738 {
1739 /* Only allow 16 registers (128 uniform components) as push constants. */
1740 unsigned int max_uniform_components = 16 * 8;
1741 if (c->prog_data.nr_params <= max_uniform_components)
1742 return;
1743
1744 if (dispatch_width == 16) {
1745 fail("Pull constants not supported in 16-wide\n");
1746 return;
1747 }
1748
1749 /* Just demote the end of the list. We could probably do better
1750 * here, demoting things that are rarely used in the program first.
1751 */
1752 unsigned int pull_uniform_base = max_uniform_components;
1753
1754 int pull_constant_loc[c->prog_data.nr_params];
1755 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1756 if (i < pull_uniform_base) {
1757 pull_constant_loc[i] = -1;
1758 } else {
1759 pull_constant_loc[i] = -1;
1760 /* If our constant is already being uploaded for reladdr purposes,
1761 * reuse it.
1762 */
1763 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1764 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1765 pull_constant_loc[i] = j;
1766 break;
1767 }
1768 }
1769 if (pull_constant_loc[i] == -1) {
1770 int pull_index = c->prog_data.nr_pull_params++;
1771 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1772 pull_constant_loc[i] = pull_index;
1773 }
1774 }
1775 }
1776 c->prog_data.nr_params = pull_uniform_base;
1777
1778 foreach_list(node, &this->instructions) {
1779 fs_inst *inst = (fs_inst *)node;
1780
1781 for (int i = 0; i < 3; i++) {
1782 if (inst->src[i].file != UNIFORM)
1783 continue;
1784
1785 int pull_index = pull_constant_loc[inst->src[i].reg +
1786 inst->src[i].reg_offset];
1787 if (pull_index == -1)
1788 continue;
1789
1790 assert(!inst->src[i].reladdr);
1791
1792 fs_reg dst = fs_reg(this, glsl_type::float_type);
1793 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1794 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1795 fs_inst *pull =
1796 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1797 dst, index, offset);
1798 pull->ir = inst->ir;
1799 pull->annotation = inst->annotation;
1800
1801 inst->insert_before(pull);
1802
1803 inst->src[i].file = GRF;
1804 inst->src[i].reg = dst.reg;
1805 inst->src[i].reg_offset = 0;
1806 inst->src[i].smear = pull_index & 3;
1807 }
1808 }
1809 }
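/* Worked example (added comment): a uniform demoted to pull_index 13 is
 * fetched with offset (13 * 4) & ~15 == 48 bytes and then read with
 * smear == 13 & 3 == 1, i.e. component 1 of the 16-byte block loaded by
 * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD.
 */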
1810
1811 bool
1812 fs_visitor::opt_algebraic()
1813 {
1814 bool progress = false;
1815
1816 foreach_list(node, &this->instructions) {
1817 fs_inst *inst = (fs_inst *)node;
1818
1819 switch (inst->opcode) {
1820 case BRW_OPCODE_MUL:
1821 if (inst->src[1].file != IMM)
1822 continue;
1823
1824 /* a * 1.0 = a */
1825 if (inst->src[1].is_one()) {
1826 inst->opcode = BRW_OPCODE_MOV;
1827 inst->src[1] = reg_undef;
1828 progress = true;
1829 break;
1830 }
1831
1832 /* a * 0.0 = 0.0 */
1833 if (inst->src[1].is_zero()) {
1834 inst->opcode = BRW_OPCODE_MOV;
1835 inst->src[0] = inst->src[1];
1836 inst->src[1] = reg_undef;
1837 progress = true;
1838 break;
1839 }
1840
1841 break;
1842 case BRW_OPCODE_ADD:
1843 if (inst->src[1].file != IMM)
1844 continue;
1845
1846 /* a + 0.0 = a */
1847 if (inst->src[1].is_zero()) {
1848 inst->opcode = BRW_OPCODE_MOV;
1849 inst->src[1] = reg_undef;
1850 progress = true;
1851 break;
1852 }
1853 break;
1854 default:
1855 break;
1856 }
1857 }
1858
1859 return progress;
1860 }
1861
1862 /**
1863 * Removes any instructions writing a VGRF where that VGRF is not used by any
1864 * later instruction.
1865 */
1866 bool
1867 fs_visitor::dead_code_eliminate()
1868 {
1869 bool progress = false;
1870 int pc = 0;
1871
1872 calculate_live_intervals();
1873
1874 foreach_list_safe(node, &this->instructions) {
1875 fs_inst *inst = (fs_inst *)node;
1876
1877 if (inst->dst.file == GRF) {
1878 bool dead = true;
1879
1880 for (int i = 0; i < inst->regs_written; i++) {
1881 int var = live_intervals->var_from_vgrf[inst->dst.reg];
1882 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
1883 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
1884 dead = false;
1885 break;
1886 }
1887 }
1888
1889 if (dead) {
1890 /* Don't dead code eliminate instructions that write to the
1891 * accumulator as a side-effect. Instead just set the destination
1892 * to the null register to free it.
1893 */
1894 switch (inst->opcode) {
1895 case BRW_OPCODE_ADDC:
1896 case BRW_OPCODE_SUBB:
1897 case BRW_OPCODE_MACH:
1898 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
1899 break;
1900 default:
1901 inst->remove();
1902 progress = true;
1903 break;
1904 }
1905 }
1906 }
1907
1908 pc++;
1909 }
1910
1911 if (progress)
1912 invalidate_live_intervals();
1913
1914 return progress;
1915 }
1916
1917 struct dead_code_hash_key
1918 {
1919 int vgrf;
1920 int reg_offset;
1921 };
1922
1923 static bool
1924 dead_code_hash_compare(const void *a, const void *b)
1925 {
1926 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1927 }
1928
1929 static void
1930 clear_dead_code_hash(struct hash_table *ht)
1931 {
1932 struct hash_entry *entry;
1933
1934 hash_table_foreach(ht, entry) {
1935 _mesa_hash_table_remove(ht, entry);
1936 }
1937 }
1938
1939 static void
1940 insert_dead_code_hash(struct hash_table *ht,
1941 int vgrf, int reg_offset, fs_inst *inst)
1942 {
1943 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1944 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1945
1946 key->vgrf = vgrf;
1947 key->reg_offset = reg_offset;
1948
1949 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1950 }
1951
1952 static struct hash_entry *
1953 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1954 {
1955 struct dead_code_hash_key key;
1956
1957 key.vgrf = vgrf;
1958 key.reg_offset = reg_offset;
1959
1960 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1961 }
1962
1963 static void
1964 remove_dead_code_hash(struct hash_table *ht,
1965 int vgrf, int reg_offset)
1966 {
1967 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1968 if (!entry)
1969 return;
1970
1971 _mesa_hash_table_remove(ht, entry);
1972 }
1973
1974 /**
1975 * Walks basic blocks, removing any regs that are written but not read before
1976 * being redefined.
1977 *
1978 * The dead_code_eliminate() function implements a global dead code
1979 * elimination, but it only handles removing the last write to a register
1980 * if it's never read. This one can handle intermediate writes, but only
1981 * within a basic block.
1982 */
1983 bool
1984 fs_visitor::dead_code_eliminate_local()
1985 {
1986 struct hash_table *ht;
1987 bool progress = false;
1988
1989 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1990
1991 foreach_list_safe(node, &this->instructions) {
1992 fs_inst *inst = (fs_inst *)node;
1993
1994 /* At a basic block boundary, empty the HT since we don't understand
1995 * dataflow across blocks.
1996 */
1997 if (inst->is_control_flow()) {
1998 clear_dead_code_hash(ht);
1999 continue;
2000 }
2001
2002 /* Remove from the HT any previously recorded writes whose registers got read. */
2003 for (int i = 0; i < 3; i++) {
2004 fs_reg src = inst->src[i];
2005 if (src.file != GRF)
2006 continue;
2007
2008 int read = 1;
2009 if (inst->is_send_from_grf())
2010 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2011
2012 for (int reg_offset = src.reg_offset;
2013 reg_offset < src.reg_offset + read;
2014 reg_offset++) {
2015 remove_dead_code_hash(ht, src.reg, reg_offset);
2016 }
2017 }
2018
2019 /* Add any update of a GRF to the HT, removing a previous write if it
2020 * wasn't read.
2021 */
2022 if (inst->dst.file == GRF) {
2023 if (inst->regs_written > 1) {
2024 /* We don't know how to trim channels from an instruction's
2025 * writes, so we can't incrementally remove unread channels from
2026 * it. Just remove whatever it overwrites from the table.
2027 */
2028 for (int i = 0; i < inst->regs_written; i++) {
2029 remove_dead_code_hash(ht,
2030 inst->dst.reg,
2031 inst->dst.reg_offset + i);
2032 }
2033 } else {
2034 struct hash_entry *entry =
2035 get_dead_code_hash_entry(ht, inst->dst.reg,
2036 inst->dst.reg_offset);
2037
2038 if (inst->is_partial_write()) {
2039 /* For a partial write, we can't remove any previous dead code
2040 * candidate, since we're just modifying its result, but this write can
2041 * itself be dead code eliminated.
2042 */
2043 if (entry) {
2044 entry->data = inst;
2045 } else {
2046 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2047 inst);
2048 }
2049 } else {
2050 if (entry) {
2051 /* We're completely updating a channel, and there was a
2052 * previous write to the channel that wasn't read. Kill it!
2053 */
2054 fs_inst *inst = (fs_inst *)entry->data;
2055 inst->remove();
2056 progress = true;
2057 _mesa_hash_table_remove(ht, entry);
2058 }
2059
2060 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2061 inst);
2062 }
2063 }
2064 }
2065 }
2066
2067 _mesa_hash_table_destroy(ht, NULL);
2068
2069 if (progress)
2070 invalidate_live_intervals();
2071
2072 return progress;
2073 }
2074
2075 /**
2076 * Implements a second type of register coalescing: this one checks whether
2077 * the two regs involved in a raw move interfere; if they don't, they can
2078 * both be stored in the same place and the MOV removed.
2079 */
2080 bool
2081 fs_visitor::register_coalesce_2()
2082 {
2083 bool progress = false;
2084
2085 calculate_live_intervals();
2086
2087 foreach_list_safe(node, &this->instructions) {
2088 fs_inst *inst = (fs_inst *)node;
2089
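/* Only consider raw, full-width MOVs with no saturate or source modifiers,
 * no smear, matching types, and a single-register source VGRF; anything
 * else can't be trivially coalesced by this pass.
 */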
2090 if (inst->opcode != BRW_OPCODE_MOV ||
2091 inst->is_partial_write() ||
2092 inst->saturate ||
2093 inst->src[0].file != GRF ||
2094 inst->src[0].negate ||
2095 inst->src[0].abs ||
2096 inst->src[0].smear != -1 ||
2097 inst->dst.file != GRF ||
2098 inst->dst.type != inst->src[0].type ||
2099 virtual_grf_sizes[inst->src[0].reg] != 1) {
2100 continue;
2101 }
2102
2103 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2104 int var_to = live_intervals->var_from_reg(&inst->dst);
2105
2106 if (live_intervals->vars_interfere(var_from, var_to))
2107 continue;
2108
2109 int reg_from = inst->src[0].reg;
2110 assert(inst->src[0].reg_offset == 0);
2111 int reg_to = inst->dst.reg;
2112 int reg_to_offset = inst->dst.reg_offset;
2113
2114 foreach_list(node, &this->instructions) {
2115 fs_inst *scan_inst = (fs_inst *)node;
2116
2117 if (scan_inst->dst.file == GRF &&
2118 scan_inst->dst.reg == reg_from) {
2119 scan_inst->dst.reg = reg_to;
2120 scan_inst->dst.reg_offset = reg_to_offset;
2121 }
2122 for (int i = 0; i < 3; i++) {
2123 if (scan_inst->src[i].file == GRF &&
2124 scan_inst->src[i].reg == reg_from) {
2125 scan_inst->src[i].reg = reg_to;
2126 scan_inst->src[i].reg_offset = reg_to_offset;
2127 }
2128 }
2129 }
2130
2131 inst->remove();
2132 progress = true;
2133 continue;
2134 }
2135
2136 if (progress)
2137 invalidate_live_intervals();
2138
2139 return progress;
2140 }
2141
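/**
 * Coalesces raw MOVs from a GRF or uniform into a GRF by rewriting later
 * reads of the MOV's destination to read its source instead, provided the
 * scan below finds nothing that would interfere (such as later writes to
 * either register).
 */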
2142 bool
2143 fs_visitor::register_coalesce()
2144 {
2145 bool progress = false;
2146 int if_depth = 0;
2147 int loop_depth = 0;
2148
2149 foreach_list_safe(node, &this->instructions) {
2150 fs_inst *inst = (fs_inst *)node;
2151
2152 /* Make sure that we dominate the instructions we're going to
2153 * scan for interference with our coalescing, or we won't have
2154 * scanned enough to know whether anything actually interferes
2155 * with it. We don't dominate the following instructions if
2156 * we're inside a loop or an if block.
2157 */
2158 switch (inst->opcode) {
2159 case BRW_OPCODE_DO:
2160 loop_depth++;
2161 break;
2162 case BRW_OPCODE_WHILE:
2163 loop_depth--;
2164 break;
2165 case BRW_OPCODE_IF:
2166 if_depth++;
2167 break;
2168 case BRW_OPCODE_ENDIF:
2169 if_depth--;
2170 break;
2171 default:
2172 break;
2173 }
2174 if (loop_depth || if_depth)
2175 continue;
2176
2177 if (inst->opcode != BRW_OPCODE_MOV ||
2178 inst->is_partial_write() ||
2179 inst->saturate ||
2180 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2181 inst->src[0].file != UNIFORM) ||
2182 inst->dst.type != inst->src[0].type)
2183 continue;
2184
2185 bool has_source_modifiers = (inst->src[0].abs ||
2186 inst->src[0].negate ||
2187 inst->src[0].smear != -1 ||
2188 inst->src[0].file == UNIFORM);
2189
2190 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2191 * them: check for no writes to either one until the exit of the
2192 * program.
2193 */
2194 bool interfered = false;
2195
2196 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2197 !scan_inst->is_tail_sentinel();
2198 scan_inst = (fs_inst *)scan_inst->next) {
2199 if (scan_inst->dst.file == GRF) {
2200 if (scan_inst->overwrites_reg(inst->dst) ||
2201 scan_inst->overwrites_reg(inst->src[0])) {
2202 interfered = true;
2203 break;
2204 }
2205 }
2206
2207 if (has_source_modifiers) {
2208 for (int i = 0; i < 3; i++) {
2209 if (scan_inst->src[i].file == GRF &&
2210 scan_inst->src[i].reg == inst->dst.reg &&
2211 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2212 inst->dst.type != scan_inst->src[i].type)
2213 {
2214 interfered = true;
2215 break;
2216 }
2217 }
2218 }
2219
2221 /* The gen6 MATH instruction can't handle source modifiers or
2222 * unusual register regions, so avoid coalescing those for
2223 * now. We should do something more specific.
2224 */
2225 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2226 interfered = true;
2227 break;
2228 }
2229
2230 if (scan_inst->mlen > 0 && scan_inst->base_mrf == -1 &&
2231 scan_inst->src[0].file == GRF &&
2232 scan_inst->src[0].reg == inst->dst.reg) {
2233 interfered = true;
2234 break;
2235 }
2236
2237 /* The accumulator result appears to get used for the
2238 * conditional modifier generation. When negating a UD
2239 * value, there is a 33rd bit generated for the sign in the
2240 * accumulator value, so now you can't check, for example,
2241 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2242 */
2243 if (scan_inst->conditional_mod &&
2244 inst->src[0].negate &&
2245 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2246 interfered = true;
2247 break;
2248 }
2249 }
2250 if (interfered) {
2251 continue;
2252 }
2253
2254 /* Rewrite the later usage to point at the source of the move to
2255 * be removed.
2256 */
2257 for (fs_inst *scan_inst = inst;
2258 !scan_inst->is_tail_sentinel();
2259 scan_inst = (fs_inst *)scan_inst->next) {
2260 for (int i = 0; i < 3; i++) {
2261 if (scan_inst->src[i].file == GRF &&
2262 scan_inst->src[i].reg == inst->dst.reg &&
2263 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2264 fs_reg new_src = inst->src[0];
2265 if (scan_inst->src[i].abs) {
2266 new_src.negate = 0;
2267 new_src.abs = 1;
2268 }
2269 new_src.negate ^= scan_inst->src[i].negate;
2270 new_src.sechalf = scan_inst->src[i].sechalf;
2271 scan_inst->src[i] = new_src;
2272 }
2273 }
2274 }
2275
2276 inst->remove();
2277 progress = true;
2278 }
2279
2280 if (progress)
2281 invalidate_live_intervals();
2282
2283 return progress;
2284 }
2285
2286
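/**
 * Looks for MOVs of a GRF into an MRF and tries to make the instruction that
 * computed the GRF value write directly into the MRF instead, removing the
 * MOV.
 */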
2287 bool
2288 fs_visitor::compute_to_mrf()
2289 {
2290 bool progress = false;
2291 int next_ip = 0;
2292
2293 calculate_live_intervals();
2294
2295 foreach_list_safe(node, &this->instructions) {
2296 fs_inst *inst = (fs_inst *)node;
2297
2298 int ip = next_ip;
2299 next_ip++;
2300
2301 if (inst->opcode != BRW_OPCODE_MOV ||
2302 inst->is_partial_write() ||
2303 inst->dst.file != MRF || inst->src[0].file != GRF ||
2304 inst->dst.type != inst->src[0].type ||
2305 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2306 continue;
2307
2308 /* Work out which hardware MRF registers are written by this
2309 * instruction.
2310 */
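/* A compressed (16-wide) instruction writes two adjacent MRFs, while a
 * COMPR4 write pairs the base MRF with the one four above it.
 */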
2311 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2312 int mrf_high;
2313 if (inst->dst.reg & BRW_MRF_COMPR4) {
2314 mrf_high = mrf_low + 4;
2315 } else if (dispatch_width == 16 &&
2316 (!inst->force_uncompressed && !inst->force_sechalf)) {
2317 mrf_high = mrf_low + 1;
2318 } else {
2319 mrf_high = mrf_low;
2320 }
2321
2322 /* Can't compute-to-MRF this GRF if someone else was going to
2323 * read it later.
2324 */
2325 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2326 continue;
2327
2328 /* Found a move of a GRF to an MRF. Let's see if we can rewrite
2329 * the instruction that generated this GRF value to write into the MRF.
2330 */
2331 fs_inst *scan_inst;
2332 for (scan_inst = (fs_inst *)inst->prev;
2333 scan_inst->prev != NULL;
2334 scan_inst = (fs_inst *)scan_inst->prev) {
2335 if (scan_inst->dst.file == GRF &&
2336 scan_inst->dst.reg == inst->src[0].reg) {
2337 /* Found the last instruction to write the reg we want to turn
2338 * into a compute-to-MRF.
2339 */
2340
2341 /* If this one instruction didn't populate all the
2342 * channels, bail. We might be able to rewrite everything
2343 * that writes that reg, but it would require smarter
2344 * tracking to delay the rewriting until complete success.
2345 */
2346 if (scan_inst->is_partial_write())
2347 break;
2348
2349 /* Things returning more than one register would need us to
2350 * understand coalescing out more than one MOV at a time.
2351 */
2352 if (scan_inst->regs_written > 1)
2353 break;
2354
2355 /* SEND instructions can't have MRF as a destination. */
2356 if (scan_inst->mlen)
2357 break;
2358
2359 if (brw->gen == 6) {
2360 /* gen6 math instructions must have the destination be
2361 * GRF, so no compute-to-MRF for them.
2362 */
2363 if (scan_inst->is_math()) {
2364 break;
2365 }
2366 }
2367
2368 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2369 /* Found the creator of our MRF's source value. */
2370 scan_inst->dst.file = MRF;
2371 scan_inst->dst.reg = inst->dst.reg;
2372 scan_inst->saturate |= inst->saturate;
2373 inst->remove();
2374 progress = true;
2375 }
2376 break;
2377 }
2378
2379 /* We don't handle control flow here. Most computations of
2380 * values that end up in MRFs happen shortly before the MRF
2381 * write anyway.
2382 */
2383 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2384 break;
2385
2386 /* You can't read from an MRF, so if someone else reads our
2387 * MRF's source GRF that we wanted to rewrite, that stops us.
2388 */
2389 bool interfered = false;
2390 for (int i = 0; i < 3; i++) {
2391 if (scan_inst->src[i].file == GRF &&
2392 scan_inst->src[i].reg == inst->src[0].reg &&
2393 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2394 interfered = true;
2395 }
2396 }
2397 if (interfered)
2398 break;
2399
2400 if (scan_inst->dst.file == MRF) {
2401 /* If somebody else writes our MRF here, we can't
2402 * compute-to-MRF before that.
2403 */
2404 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2405 int scan_mrf_high;
2406
2407 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2408 scan_mrf_high = scan_mrf_low + 4;
2409 } else if (dispatch_width == 16 &&
2410 (!scan_inst->force_uncompressed &&
2411 !scan_inst->force_sechalf)) {
2412 scan_mrf_high = scan_mrf_low + 1;
2413 } else {
2414 scan_mrf_high = scan_mrf_low;
2415 }
2416
2417 if (mrf_low == scan_mrf_low ||
2418 mrf_low == scan_mrf_high ||
2419 mrf_high == scan_mrf_low ||
2420 mrf_high == scan_mrf_high) {
2421 break;
2422 }
2423 }
2424
2425 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2426 /* Found a SEND instruction, which means that there are
2427 * live values in MRFs from base_mrf to base_mrf +
2428 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2429 * above it.
2430 */
2431 if (mrf_low >= scan_inst->base_mrf &&
2432 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2433 break;
2434 }
2435 if (mrf_high >= scan_inst->base_mrf &&
2436 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2437 break;
2438 }
2439 }
2440 }
2441 }
2442
2443 if (progress)
2444 invalidate_live_intervals();
2445
2446 return progress;
2447 }
2448
2449 /**
2450 * Walks through basic blocks, looking for repeated MRF writes and
2451 * removing the later ones.
2452 */
2453 bool
2454 fs_visitor::remove_duplicate_mrf_writes()
2455 {
2456 fs_inst *last_mrf_move[16];
2457 bool progress = false;
2458
2459 /* The MRF tracking below would need updating to handle compressed instructions, so skip this pass for 16-wide. */
2460 if (dispatch_width == 16)
2461 return false;
2462
2463 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2464
2465 foreach_list_safe(node, &this->instructions) {
2466 fs_inst *inst = (fs_inst *)node;
2467
2468 if (inst->is_control_flow()) {
2469 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2470 }
2471
2472 if (inst->opcode == BRW_OPCODE_MOV &&
2473 inst->dst.file == MRF) {
2474 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2475 if (prev_inst && inst->equals(prev_inst)) {
2476 inst->remove();
2477 progress = true;
2478 continue;
2479 }
2480 }
2481
2482 /* Clear out the last-write records for MRFs that were overwritten. */
2483 if (inst->dst.file == MRF) {
2484 last_mrf_move[inst->dst.reg] = NULL;
2485 }
2486
2487 if (inst->mlen > 0 && inst->base_mrf != -1) {
2488 /* Found a SEND instruction, which will include two or fewer
2489 * implied MRF writes. We could do better here.
2490 */
2491 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2492 last_mrf_move[inst->base_mrf + i] = NULL;
2493 }
2494 }
2495
2496 /* Clear out any MRF move records whose sources got overwritten. */
2497 if (inst->dst.file == GRF) {
2498 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2499 if (last_mrf_move[i] &&
2500 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2501 last_mrf_move[i] = NULL;
2502 }
2503 }
2504 }
2505
2506 if (inst->opcode == BRW_OPCODE_MOV &&
2507 inst->dst.file == MRF &&
2508 inst->src[0].file == GRF &&
2509 !inst->is_partial_write()) {
2510 last_mrf_move[inst->dst.reg] = inst;
2511 }
2512 }
2513
2514 if (progress)
2515 invalidate_live_intervals();
2516
2517 return progress;
2518 }
2519
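/**
 * Helper for the gen4 SEND dependency workarounds: clears the dependency
 * flags for any GRFs in [first_grf, first_grf + grf_len) that this
 * instruction reads, since a read resolves the outstanding dependency.
 */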
2520 static void
2521 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2522 int first_grf, int grf_len)
2523 {
2524 bool inst_16wide = (dispatch_width > 8 &&
2525 !inst->force_uncompressed &&
2526 !inst->force_sechalf);
2527
2528 /* Clear the flag for registers that actually got read (as expected). */
2529 for (int i = 0; i < 3; i++) {
2530 int grf;
2531 if (inst->src[i].file == GRF) {
2532 grf = inst->src[i].reg;
2533 } else if (inst->src[i].file == HW_REG &&
2534 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2535 grf = inst->src[i].fixed_hw_reg.nr;
2536 } else {
2537 continue;
2538 }
2539
2540 if (grf >= first_grf &&
2541 grf < first_grf + grf_len) {
2542 deps[grf - first_grf] = false;
2543 if (inst_16wide)
2544 deps[grf - first_grf + 1] = false;
2545 }
2546 }
2547 }
2548
2549 /**
2550 * Implements this workaround for the original 965:
2551 *
2552 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2553 * check for post destination dependencies on this instruction, software
2554 * must ensure that there is no destination hazard for the case of ‘write
2555 * followed by a posted write’ shown in the following example.
2556 *
2557 * 1. mov r3 0
2558 * 2. send r3.xy <rest of send instruction>
2559 * 3. mov r2 r3
2560 *
2561 * Due to no post-destination dependency check on the ‘send’, the above
2562 * code sequence could have two instructions (1 and 2) in flight at the
2563 * same time that both consider ‘r3’ as the target of their final writes.
2564 */
2565 void
2566 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2567 {
2568 int reg_size = dispatch_width / 8;
2569 int write_len = inst->regs_written * reg_size;
2570 int first_write_grf = inst->dst.reg;
2571 bool needs_dep[BRW_MAX_MRF];
2572 assert(write_len < (int)sizeof(needs_dep) - 1);
2573
2574 memset(needs_dep, false, sizeof(needs_dep));
2575 memset(needs_dep, true, write_len);
2576
2577 clear_deps_for_inst_src(inst, dispatch_width,
2578 needs_dep, first_write_grf, write_len);
2579
2580 /* Walk backwards looking for writes to registers we're writing which
2581 * aren't read since being written. If we hit the start of the program,
2582 * we assume that there are no outstanding dependencies on entry to the
2583 * program.
2584 */
2585 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2586 scan_inst != NULL;
2587 scan_inst = (fs_inst *)scan_inst->prev) {
2588
2589 /* If we hit control flow, assume that there *are* outstanding
2590 * dependencies, and force their cleanup before our instruction.
2591 */
2592 if (scan_inst->is_control_flow()) {
2593 for (int i = 0; i < write_len; i++) {
2594 if (needs_dep[i]) {
2595 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2596 }
2597 }
2598 return;
2599 }
2600
2601 bool scan_inst_16wide = (dispatch_width > 8 &&
2602 !scan_inst->force_uncompressed &&
2603 !scan_inst->force_sechalf);
2604
2605 /* We insert our reads as late as possible, on the assumption that any
2606 * non-MOV instruction that might have left us an outstanding
2607 * dependency has more latency than a MOV.
2608 */
2609 if (scan_inst->dst.file == GRF) {
2610 for (int i = 0; i < scan_inst->regs_written; i++) {
2611 int reg = scan_inst->dst.reg + i * reg_size;
2612
2613 if (reg >= first_write_grf &&
2614 reg < first_write_grf + write_len &&
2615 needs_dep[reg - first_write_grf]) {
2616 inst->insert_before(DEP_RESOLVE_MOV(reg));
2617 needs_dep[reg - first_write_grf] = false;
2618 if (scan_inst_16wide)
2619 needs_dep[reg - first_write_grf + 1] = false;
2620 }
2621 }
2622 }
2623
2624 /* Clear the flag for registers that actually got read (as expected). */
2625 clear_deps_for_inst_src(scan_inst, dispatch_width,
2626 needs_dep, first_write_grf, write_len);
2627
2628 /* Continue the loop only if we haven't resolved all the dependencies */
2629 int i;
2630 for (i = 0; i < write_len; i++) {
2631 if (needs_dep[i])
2632 break;
2633 }
2634 if (i == write_len)
2635 return;
2636 }
2637 }
2638
2639 /**
2640 * Implements this workaround for the original 965:
2641 *
2642 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2643 * used as a destination register until after it has been sourced by an
2644 * instruction with a different destination register.
2645 */
2646 void
2647 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2648 {
2649 int write_len = inst->regs_written * dispatch_width / 8;
2650 int first_write_grf = inst->dst.reg;
2651 bool needs_dep[BRW_MAX_MRF];
2652 assert(write_len < (int)sizeof(needs_dep) - 1);
2653
2654 memset(needs_dep, false, sizeof(needs_dep));
2655 memset(needs_dep, true, write_len);
2656 /* Walk forwards looking for writes to registers we're writing which aren't
2657 * read before being written.
2658 */
2659 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2660 !scan_inst->is_tail_sentinel();
2661 scan_inst = (fs_inst *)scan_inst->next) {
2662 /* If we hit control flow, force resolve all remaining dependencies. */
2663 if (scan_inst->is_control_flow()) {
2664 for (int i = 0; i < write_len; i++) {
2665 if (needs_dep[i])
2666 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2667 }
2668 return;
2669 }
2670
2671 /* Clear the flag for registers that actually got read (as expected). */
2672 clear_deps_for_inst_src(scan_inst, dispatch_width,
2673 needs_dep, first_write_grf, write_len);
2674
2675 /* We insert our reads as late as possible since they're reading the
2676 * result of a SEND, which has massive latency.
2677 */
2678 if (scan_inst->dst.file == GRF &&
2679 scan_inst->dst.reg >= first_write_grf &&
2680 scan_inst->dst.reg < first_write_grf + write_len &&
2681 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2682 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2683 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2684 }
2685
2686 /* Continue the loop only if we haven't resolved all the dependencies */
2687 int i;
2688 for (i = 0; i < write_len; i++) {
2689 if (needs_dep[i])
2690 break;
2691 }
2692 if (i == write_len)
2693 return;
2694 }
2695
2696 /* If we hit the end of the program, resolve all remaining dependencies out
2697 * of paranoia.
2698 */
2699 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2700 assert(last_inst->eot);
2701 for (int i = 0; i < write_len; i++) {
2702 if (needs_dep[i])
2703 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2704 }
2705 }
2706
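/**
 * Applies both gen4 SEND dependency workarounds, before and after every
 * message-sending instruction that writes a GRF. Only needed on the
 * original gen4 (not G4X).
 */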
2707 void
2708 fs_visitor::insert_gen4_send_dependency_workarounds()
2709 {
2710 if (brw->gen != 4 || brw->is_g4x)
2711 return;
2712
2713 /* Note that we're done with register allocation, so GRF fs_regs always
2714 * have a .reg_offset of 0.
2715 */
2716
2717 foreach_list_safe(node, &this->instructions) {
2718 fs_inst *inst = (fs_inst *)node;
2719
2720 if (inst->mlen != 0 && inst->dst.file == GRF) {
2721 insert_gen4_pre_send_dependency_workarounds(inst);
2722 insert_gen4_post_send_dependency_workarounds(inst);
2723 }
2724 }
2725 }
2726
2727 /**
2728 * Turns the generic expression-style uniform pull constant load instruction
2729 * into a hardware-specific series of instructions for loading a pull
2730 * constant.
2731 *
2732 * The expression style allows the CSE pass before this to optimize out
2733 * repeated loads from the same offset, and gives the pre-register-allocation
2734 * scheduling full flexibility, while the conversion to native instructions
2735 * allows the post-register-allocation scheduler the best information
2736 * possible.
2737 *
2738 * Note that execution masking for setting up pull constant loads is special:
2739 * the channels that need to be written are unrelated to the current execution
2740 * mask, since a later instruction will use one of the result channels as a
2741 * source operand for all 8 or 16 of its channels.
2742 */
2743 void
2744 fs_visitor::lower_uniform_pull_constant_loads()
2745 {
2746 foreach_list(node, &this->instructions) {
2747 fs_inst *inst = (fs_inst *)node;
2748
2749 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2750 continue;
2751
2752 if (brw->gen >= 7) {
2753 /* The offset arg before was a vec4-aligned byte offset. We need to
2754 * turn it into a dword offset.
2755 */
2756 fs_reg const_offset_reg = inst->src[1];
2757 assert(const_offset_reg.file == IMM &&
2758 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2759 const_offset_reg.imm.u /= 4;
2760 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2761
2762 /* This is actually going to be a MOV, but since only the first dword
2763 * is accessed, we have a special opcode to do just that one. Note
2764 * that this needs to be an operation that will be considered a def
2765 * by live variable analysis, or register allocation will explode.
2766 */
2767 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2768 payload, const_offset_reg);
2769 setup->force_writemask_all = true;
2770
2771 setup->ir = inst->ir;
2772 setup->annotation = inst->annotation;
2773 inst->insert_before(setup);
2774
2775 /* Similarly, this will only populate the first 4 channels of the
2776 * result register (since we only use smear values from 0-3), but we
2777 * don't tell the optimizer.
2778 */
2779 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2780 inst->src[1] = payload;
2781
2782 invalidate_live_intervals();
2783 } else {
2784 /* Before register allocation, we didn't tell the scheduler about the
2785 * MRF we use. We know it's safe to use this MRF because nothing
2786 * else does except for register spill/unspill, which generates and
2787 * uses its MRF within a single IR instruction.
2788 */
2789 inst->base_mrf = 14;
2790 inst->mlen = 1;
2791 }
2792 }
2793 }
2794
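/**
 * Prints one FS IR instruction in a human-readable form for debugging.
 */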
2795 void
2796 fs_visitor::dump_instruction(backend_instruction *be_inst)
2797 {
2798 fs_inst *inst = (fs_inst *)be_inst;
2799
2800 if (inst->predicate) {
2801 printf("(%cf0.%d) ",
2802 inst->predicate_inverse ? '-' : '+',
2803 inst->flag_subreg);
2804 }
2805
2806 printf("%s", brw_instruction_name(inst->opcode));
2807 if (inst->saturate)
2808 printf(".sat");
2809 if (inst->conditional_mod) {
2810 printf(".cmod");
2811 if (!inst->predicate &&
2812 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2813 inst->opcode != BRW_OPCODE_IF &&
2814 inst->opcode != BRW_OPCODE_WHILE))) {
2815 printf(".f0.%d", inst->flag_subreg);
2816 }
2817 }
2818 printf(" ");
2819
2820
2821 switch (inst->dst.file) {
2822 case GRF:
2823 printf("vgrf%d", inst->dst.reg);
2824 if (inst->dst.reg_offset)
2825 printf("+%d", inst->dst.reg_offset);
2826 break;
2827 case MRF:
2828 printf("m%d", inst->dst.reg);
2829 break;
2830 case BAD_FILE:
2831 printf("(null)");
2832 break;
2833 case UNIFORM:
2834 printf("***u%d***", inst->dst.reg);
2835 break;
2836 case HW_REG:
2837 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
2838 if (inst->dst.fixed_hw_reg.subnr)
2839 printf("+%d", inst->dst.fixed_hw_reg.subnr);
2840 break;
2841 default:
2842 printf("???");
2843 break;
2844 }
2845 printf(", ");
2846
2847 for (int i = 0; i < 3; i++) {
2848 if (inst->src[i].negate)
2849 printf("-");
2850 if (inst->src[i].abs)
2851 printf("|");
2852 switch (inst->src[i].file) {
2853 case GRF:
2854 printf("vgrf%d", inst->src[i].reg);
2855 if (inst->src[i].reg_offset)
2856 printf("+%d", inst->src[i].reg_offset);
2857 break;
2858 case MRF:
2859 printf("***m%d***", inst->src[i].reg);
2860 break;
2861 case UNIFORM:
2862 printf("u%d", inst->src[i].reg);
2863 if (inst->src[i].reg_offset)
2864 printf(".%d", inst->src[i].reg_offset);
2865 break;
2866 case BAD_FILE:
2867 printf("(null)");
2868 break;
2869 case IMM:
2870 switch (inst->src[i].type) {
2871 case BRW_REGISTER_TYPE_F:
2872 printf("%ff", inst->src[i].imm.f);
2873 break;
2874 case BRW_REGISTER_TYPE_D:
2875 printf("%dd", inst->src[i].imm.i);
2876 break;
2877 case BRW_REGISTER_TYPE_UD:
2878 printf("%uu", inst->src[i].imm.u);
2879 break;
2880 default:
2881 printf("???");
2882 break;
2883 }
2884 break;
2885 case HW_REG:
2886 if (inst->src[i].fixed_hw_reg.negate)
2887 printf("-");
2888 if (inst->src[i].fixed_hw_reg.abs)
2889 printf("|");
2890 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2891 if (inst->src[i].fixed_hw_reg.subnr)
2892 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
2893 if (inst->src[i].fixed_hw_reg.abs)
2894 printf("|");
2895 break;
2896 default:
2897 printf("???");
2898 break;
2899 }
2900 if (inst->src[i].abs)
2901 printf("|");
2902
2903 if (i < 2)
2904 printf(", ");
2905 }
2906
2907 printf(" ");
2908
2909 if (inst->force_uncompressed)
2910 printf("1sthalf ");
2911
2912 if (inst->force_sechalf)
2913 printf("2ndhalf ");
2914
2915 printf("\n");
2916 }
2917
2918 /**
2919 * Possibly returns an instruction that set up @param reg.
2920 *
2921 * Sometimes we want to take the result of some expression/variable
2922 * dereference tree and rewrite the instruction generating the result
2923 * of the tree. When processing the tree, we know that the
2924 * instructions generated are all writing temporaries that are dead
2925 * outside of this tree. So, if we have some instructions that write
2926 * a temporary, we're free to point that temp write somewhere else.
2927 *
2928 * Note that this doesn't guarantee that reg is the only thing the instruction
2929 * wrote -- it might be the size=4 destination of a texture instruction.
2930 */
2931 fs_inst *
2932 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2933 fs_inst *end,
2934 fs_reg reg)
2935 {
2936 if (end == start ||
2937 end->is_partial_write() ||
2938 reg.reladdr ||
2939 !reg.equals(end->dst)) {
2940 return NULL;
2941 } else {
2942 return end;
2943 }
2944 }
2945
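/**
 * Records how the gen6+ thread payload is laid out (dispatch masks and
 * pixel X/Y, barycentric coordinates, source depth and W), updating
 * c->nr_payload_regs as each section is accounted for.
 */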
2946 void
2947 fs_visitor::setup_payload_gen6()
2948 {
2949 bool uses_depth =
2950 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2951 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2952
2953 assert(brw->gen >= 6);
2954
2955 /* R0-1: masks, pixel X/Y coordinates. */
2956 c->nr_payload_regs = 2;
2957 /* R2: only for 32-pixel dispatch. */
2958
2959 /* R3-26: barycentric interpolation coordinates. These appear in the
2960 * same order that they appear in the brw_wm_barycentric_interp_mode
2961 * enum. Each set of coordinates occupies 2 registers if dispatch width
2962 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2963 * appear if they were enabled using the "Barycentric Interpolation
2964 * Mode" bits in WM_STATE.
2965 */
2966 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2967 if (barycentric_interp_modes & (1 << i)) {
2968 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2969 c->nr_payload_regs += 2;
2970 if (dispatch_width == 16) {
2971 c->nr_payload_regs += 2;
2972 }
2973 }
2974 }
2975
2976 /* R27: interpolated depth if uses source depth */
2977 if (uses_depth) {
2978 c->source_depth_reg = c->nr_payload_regs;
2979 c->nr_payload_regs++;
2980 if (dispatch_width == 16) {
2981 /* R28: interpolated depth if not 8-wide. */
2982 c->nr_payload_regs++;
2983 }
2984 }
2985 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2986 if (uses_depth) {
2987 c->source_w_reg = c->nr_payload_regs;
2988 c->nr_payload_regs++;
2989 if (dispatch_width == 16) {
2990 /* R30: interpolated W if not 8-wide. */
2991 c->nr_payload_regs++;
2992 }
2993 }
2994 /* R31: MSAA position offsets. */
2995 /* R32-: bary for 32-pixel. */
2996 /* R58-59: interp W for 32-pixel. */
2997
2998 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2999 c->source_depth_to_render_target = true;
3000 }
3001 }
3002
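/**
 * Lays out the binding table for the fragment shader, with the render target
 * surfaces first, followed by the common entries shared with other stages.
 */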
3003 void
3004 fs_visitor::assign_binding_table_offsets()
3005 {
3006 uint32_t next_binding_table_offset = 0;
3007
3008 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3009 next_binding_table_offset += c->key.nr_color_regions;
3010
3011 assign_common_binding_table_offsets(next_binding_table_offset);
3012 }
3013
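/**
 * Generates the FS IR for the program, runs the optimization passes, and
 * performs register allocation and scheduling; returns false on failure.
 */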
3014 bool
3015 fs_visitor::run()
3016 {
3017 sanity_param_count = fp->Base.Parameters->NumParameters;
3018 uint32_t orig_nr_params = c->prog_data.nr_params;
3019
3020 assign_binding_table_offsets();
3021
3022 if (brw->gen >= 6)
3023 setup_payload_gen6();
3024 else
3025 setup_payload_gen4();
3026
3027 if (0) {
3028 emit_dummy_fs();
3029 } else {
3030 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3031 emit_shader_time_begin();
3032
3033 calculate_urb_setup();
3034 if (fp->Base.InputsRead > 0) {
3035 if (brw->gen < 6)
3036 emit_interpolation_setup_gen4();
3037 else
3038 emit_interpolation_setup_gen6();
3039 }
3040
3041 /* We handle discards by keeping track of the still-live pixels in f0.1.
3042 * Initialize it with the dispatched pixels.
3043 */
3044 if (fp->UsesKill) {
3045 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3046 discard_init->flag_subreg = 1;
3047 }
3048
3049 /* Generate FS IR for main(). (The visitor only descends into
3050 * functions called "main".)
3051 */
3052 if (shader) {
3053 foreach_list(node, &*shader->ir) {
3054 ir_instruction *ir = (ir_instruction *)node;
3055 base_ir = ir;
3056 this->result = reg_undef;
3057 ir->accept(this);
3058 }
3059 } else {
3060 emit_fragment_program_code();
3061 }
3062 base_ir = NULL;
3063 if (failed)
3064 return false;
3065
3066 emit(FS_OPCODE_PLACEHOLDER_HALT);
3067
3068 emit_fb_writes();
3069
3070 split_virtual_grfs();
3071
3072 move_uniform_array_access_to_pull_constants();
3073 remove_dead_constants();
3074 setup_pull_constants();
3075
3076 bool progress;
3077 do {
3078 progress = false;
3079
3080 compact_virtual_grfs();
3081
3082 progress = remove_duplicate_mrf_writes() || progress;
3083
3084 progress = opt_algebraic() || progress;
3085 progress = opt_cse() || progress;
3086 progress = opt_copy_propagate() || progress;
3087 progress = dead_code_eliminate() || progress;
3088 progress = dead_code_eliminate_local() || progress;
3089 progress = register_coalesce() || progress;
3090 progress = register_coalesce_2() || progress;
3091 progress = compute_to_mrf() || progress;
3092 } while (progress);
3093
3094 schedule_instructions(false);
3095
3096 lower_uniform_pull_constant_loads();
3097
3098 assign_curb_setup();
3099 assign_urb_setup();
3100
3101 if (0)
3102 assign_regs_trivial();
3103 else {
3104 while (!assign_regs()) {
3105 if (failed)
3106 break;
3107 }
3108 }
3109 }
3110 assert(force_uncompressed_stack == 0);
3111 assert(force_sechalf_stack == 0);
3112
3113 /* This must come after all optimization and register allocation, since
3114 * it inserts dead code that happens to have side effects, and it does
3115 * so based on the actual physical registers in use.
3116 */
3117 insert_gen4_send_dependency_workarounds();
3118
3119 if (failed)
3120 return false;
3121
3122 schedule_instructions(true);
3123
3124 if (dispatch_width == 8) {
3125 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3126 } else {
3127 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3128
3129 /* Make sure we didn't try to sneak in an extra uniform */
3130 assert(orig_nr_params == c->prog_data.nr_params);
3131 (void) orig_nr_params;
3132 }
3133
3134 /* If any state parameters were appended, then ParameterValues could have
3135 * been realloced, in which case the driver uniform storage set up by
3136 * _mesa_associate_uniform_storage() would point to freed memory. Make
3137 * sure that didn't happen.
3138 */
3139 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3140
3141 return !failed;
3142 }
3143
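/**
 * Compiles a fragment program to native code, attempting an 8-wide compile
 * and, when possible, a 16-wide compile as well.
 */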
3144 const unsigned *
3145 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3146 struct gl_fragment_program *fp,
3147 struct gl_shader_program *prog,
3148 unsigned *final_assembly_size)
3149 {
3150 bool start_busy = false;
3151 float start_time = 0;
3152
3153 if (unlikely(brw->perf_debug)) {
3154 start_busy = (brw->batch.last_bo &&
3155 drm_intel_bo_busy(brw->batch.last_bo));
3156 start_time = get_time();
3157 }
3158
3159 struct brw_shader *shader = NULL;
3160 if (prog)
3161 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3162
3163 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3164 if (prog) {
3165 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3166 _mesa_print_ir(shader->ir, NULL);
3167 printf("\n\n");
3168 } else {
3169 printf("ARB_fragment_program %d ir for native fragment shader\n",
3170 fp->Base.Id);
3171 _mesa_print_program(&fp->Base);
3172 }
3173 }
3174
3175 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3176 */
3177 fs_visitor v(brw, c, prog, fp, 8);
3178 if (!v.run()) {
3179 if (prog) {
3180 prog->LinkStatus = false;
3181 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3182 }
3183
3184 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3185 v.fail_msg);
3186
3187 return NULL;
3188 }
3189
3190 exec_list *simd16_instructions = NULL;
3191 fs_visitor v2(brw, c, prog, fp, 16);
3192 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3193 if (c->prog_data.nr_pull_params == 0) {
3194 /* Try a 16-wide compile */
3195 v2.import_uniforms(&v);
3196 if (!v2.run()) {
3197 perf_debug("16-wide shader failed to compile, falling back to "
3198 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3199 } else {
3200 simd16_instructions = &v2.instructions;
3201 }
3202 } else {
3203 perf_debug("Skipping 16-wide due to pull parameters.\n");
3204 }
3205 }
3206
3207 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3208 const unsigned *generated = g.generate_assembly(&v.instructions,
3209 simd16_instructions,
3210 final_assembly_size);
3211
3212 if (unlikely(brw->perf_debug) && shader) {
3213 if (shader->compiled_once)
3214 brw_wm_debug_recompile(brw, prog, &c->key);
3215 shader->compiled_once = true;
3216
3217 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3218 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3219 (get_time() - start_time) * 1000);
3220 }
3221 }
3222
3223 return generated;
3224 }
3225
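/**
 * Precompiles the fragment shader at link time with a guessed default key,
 * so the compile that happens at draw time is more likely to be a cache hit.
 */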
3226 bool
3227 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3228 {
3229 struct brw_context *brw = brw_context(ctx);
3230 struct brw_wm_prog_key key;
3231
3232 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3233 return true;
3234
3235 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3236 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3237 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3238 bool program_uses_dfdy = fp->UsesDFdy;
3239
3240 memset(&key, 0, sizeof(key));
3241
3242 if (brw->gen < 6) {
3243 if (fp->UsesKill)
3244 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3245
3246 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3247 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3248
3249 /* Just assume depth testing. */
3250 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3251 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3252 }
3253
3254 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3255 BRW_FS_VARYING_INPUT_MASK) > 16)
3256 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3257
3258 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3259
3260 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3261 for (unsigned i = 0; i < sampler_count; i++) {
3262 if (fp->Base.ShadowSamplers & (1 << i)) {
3263 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3264 key.tex.swizzles[i] =
3265 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3266 } else {
3267 /* Color sampler: assume no swizzling. */
3268 key.tex.swizzles[i] = SWIZZLE_XYZW;
3269 }
3270 }
3271
3272 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3273 key.drawable_height = ctx->DrawBuffer->Height;
3274 }
3275
3276 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3277 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3278 }
3279
3280 key.nr_color_regions = 1;
3281
3282 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3283 * quality of the derivatives is likely to be determined by the driconf
3284 * option.
3285 */
3286 key.high_quality_derivatives = brw->disable_derivative_optimization;
3287
3288 key.program_string_id = bfp->id;
3289
3290 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3291 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3292
3293 bool success = do_wm_prog(brw, prog, bfp, &key);
3294
3295 brw->wm.base.prog_offset = old_prog_offset;
3296 brw->wm.prog_data = old_prog_data;
3297
3298 return success;
3299 }