i965: Make a brw_stage_prog_data for storing the SURF_INDEX information.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "main/uniforms.h"
50 #include "brw_fs_live_variables.h"
51 #include "glsl/glsl_types.h"
52
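/** Shared initializer for the fs_inst constructors below: zero the whole
 * instruction, then set safe defaults (NOP opcode, no conditional mod,
 * undefined operands, one register written).
 */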
53 void
54 fs_inst::init()
55 {
56 memset(this, 0, sizeof(*this));
57 this->opcode = BRW_OPCODE_NOP;
58 this->conditional_mod = BRW_CONDITIONAL_NONE;
59
60 this->dst = reg_undef;
61 this->src[0] = reg_undef;
62 this->src[1] = reg_undef;
63 this->src[2] = reg_undef;
64
65 /* This will be the case for almost all instructions. */
66 this->regs_written = 1;
67 }
68
69 fs_inst::fs_inst()
70 {
71 init();
72 }
73
74 fs_inst::fs_inst(enum opcode opcode)
75 {
76 init();
77 this->opcode = opcode;
78 }
79
80 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
81 {
82 init();
83 this->opcode = opcode;
84 this->dst = dst;
85
86 if (dst.file == GRF)
87 assert(dst.reg_offset >= 0);
88 }
89
90 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
91 {
92 init();
93 this->opcode = opcode;
94 this->dst = dst;
95 this->src[0] = src0;
96
97 if (dst.file == GRF)
98 assert(dst.reg_offset >= 0);
99 if (src[0].file == GRF)
100 assert(src[0].reg_offset >= 0);
101 }
102
103 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
104 {
105 init();
106 this->opcode = opcode;
107 this->dst = dst;
108 this->src[0] = src0;
109 this->src[1] = src1;
110
111 if (dst.file == GRF)
112 assert(dst.reg_offset >= 0);
113 if (src[0].file == GRF)
114 assert(src[0].reg_offset >= 0);
115 if (src[1].file == GRF)
116 assert(src[1].reg_offset >= 0);
117 }
118
119 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
120 fs_reg src0, fs_reg src1, fs_reg src2)
121 {
122 init();
123 this->opcode = opcode;
124 this->dst = dst;
125 this->src[0] = src0;
126 this->src[1] = src1;
127 this->src[2] = src2;
128
129 if (dst.file == GRF)
130 assert(dst.reg_offset >= 0);
131 if (src[0].file == GRF)
132 assert(src[0].reg_offset >= 0);
133 if (src[1].file == GRF)
134 assert(src[1].reg_offset >= 0);
135 if (src[2].file == GRF)
136 assert(src[2].reg_offset >= 0);
137 }
138
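/* Convenience wrappers: fs_visitor::OP(dst, srcs...) allocates and returns a
 * new fs_inst with the corresponding BRW_OPCODE out of mem_ctx; the caller is
 * still responsible for emitting it.
 */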
139 #define ALU1(op) \
140 fs_inst * \
141 fs_visitor::op(fs_reg dst, fs_reg src0) \
142 { \
143 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
144 }
145
146 #define ALU2(op) \
147 fs_inst * \
148 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
149 { \
150 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
151 }
152
153 #define ALU3(op) \
154 fs_inst * \
155 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
156 { \
157 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
158 }
159
160 ALU1(NOT)
161 ALU1(MOV)
162 ALU1(FRC)
163 ALU1(RNDD)
164 ALU1(RNDE)
165 ALU1(RNDZ)
166 ALU2(ADD)
167 ALU2(MUL)
168 ALU2(MACH)
169 ALU2(AND)
170 ALU2(OR)
171 ALU2(XOR)
172 ALU2(SHL)
173 ALU2(SHR)
174 ALU2(ASR)
175 ALU3(LRP)
176 ALU1(BFREV)
177 ALU3(BFE)
178 ALU2(BFI1)
179 ALU3(BFI2)
180 ALU1(FBH)
181 ALU1(FBL)
182 ALU1(CBIT)
183 ALU3(MAD)
184 ALU2(ADDC)
185 ALU2(SUBB)
186
187 /** Gen4 predicated IF. */
188 fs_inst *
189 fs_visitor::IF(uint32_t predicate)
190 {
191 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
192 inst->predicate = predicate;
193 return inst;
194 }
195
196 /** Gen6+ IF with embedded comparison. */
197 fs_inst *
198 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
199 {
200 assert(brw->gen >= 6);
201 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
202 reg_null_d, src0, src1);
203 inst->conditional_mod = condition;
204 return inst;
205 }
206
207 /**
208 * CMP: Sets the low bit of the destination channels with the result
209 * of the comparison, while the upper bits are undefined, and updates
210 * the flag register with the packed 16 bits of the result.
211 */
212 fs_inst *
213 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
214 {
215 fs_inst *inst;
216
217 /* Take the instruction:
218 *
219 * CMP null<d> src0<f> src1<f>
220 *
221 * Original gen4 does type conversion to the destination type before
222 * comparison, producing garbage results for floating point comparisons.
223 * gen5 does the comparison on the execution type (resolved source types),
224 * so dst type doesn't matter. gen6 does comparison and then uses the
225 * result as if it was the dst type with no conversion, which happens to
226 * mostly work out for float-interpreted-as-int since our comparisons are
227 * for >0, =0, <0.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
244 exec_list
245 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
246 fs_reg varying_offset,
247 uint32_t const_offset)
248 {
249 exec_list instructions;
250 fs_inst *inst;
251
252 /* We have our constant surface use a pitch of 4 bytes, so our index can
253 * be any component of a vector, and then we load 4 contiguous
254 * components starting from that.
255 *
256 * We break down the const_offset to a portion added to the variable
257 * offset and a portion done using reg_offset, which means that if you
258 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
259 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
260 * CSE can later notice that those loads are all the same and eliminate
261 * the redundant ones.
262 */
263 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
264 instructions.push_tail(ADD(vec4_offset,
265 varying_offset, const_offset & ~3));
266
267 int scale = 1;
268 if (brw->gen == 4 && dispatch_width == 8) {
269 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
270 * u, v, r) as parameters, or we can just use the SIMD16 message
271 * consisting of (header, u). We choose the second, at the cost of a
272 * longer return length.
273 */
274 scale = 2;
275 }
276
277 enum opcode op;
278 if (brw->gen >= 7)
279 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
280 else
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
282 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
283 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
284 inst->regs_written = 4 * scale;
285 instructions.push_tail(inst);
286
287 if (brw->gen < 7) {
288 inst->base_mrf = 13;
289 inst->header_present = true;
290 if (brw->gen == 4)
291 inst->mlen = 3;
292 else
293 inst->mlen = 1 + dispatch_width / 8;
294 }
295
296 vec4_result.reg_offset += (const_offset & 3) * scale;
297 instructions.push_tail(MOV(dst, vec4_result));
298
299 return instructions;
300 }
301
302 /**
303 * A helper for MOV generation for fixing up broken hardware SEND dependency
304 * handling.
305 */
306 fs_inst *
307 fs_visitor::DEP_RESOLVE_MOV(int grf)
308 {
309 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
310
311 inst->ir = NULL;
312 inst->annotation = "send dependency resolve";
313
314 /* The caller always wants uncompressed to emit the minimal extra
315 * dependencies, and to avoid having to deal with aligning its regs to 2.
316 */
317 inst->force_uncompressed = true;
318
319 return inst;
320 }
321
322 bool
323 fs_inst::equals(fs_inst *inst)
324 {
325 return (opcode == inst->opcode &&
326 dst.equals(inst->dst) &&
327 src[0].equals(inst->src[0]) &&
328 src[1].equals(inst->src[1]) &&
329 src[2].equals(inst->src[2]) &&
330 saturate == inst->saturate &&
331 predicate == inst->predicate &&
332 conditional_mod == inst->conditional_mod &&
333 mlen == inst->mlen &&
334 base_mrf == inst->base_mrf &&
335 sampler == inst->sampler &&
336 target == inst->target &&
337 eot == inst->eot &&
338 header_present == inst->header_present &&
339 shadow_compare == inst->shadow_compare &&
340 offset == inst->offset);
341 }
342
343 bool
344 fs_inst::overwrites_reg(const fs_reg &reg)
345 {
346 return (reg.file == dst.file &&
347 reg.reg == dst.reg &&
348 reg.reg_offset >= dst.reg_offset &&
349 reg.reg_offset < dst.reg_offset + regs_written);
350 }
351
352 bool
353 fs_inst::is_send_from_grf()
354 {
355 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
356 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
357 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
358 src[1].file == GRF) ||
359 (is_tex() && src[0].file == GRF));
360 }
361
362 bool
363 fs_visitor::can_do_source_mods(fs_inst *inst)
364 {
365 if (brw->gen == 6 && inst->is_math())
366 return false;
367
368 if (inst->is_send_from_grf())
369 return false;
370
371 if (!inst->can_do_source_mods())
372 return false;
373
374 return true;
375 }
376
377 void
378 fs_reg::init()
379 {
380 memset(this, 0, sizeof(*this));
381 this->smear = -1;
382 }
383
384 /** Generic unset register constructor. */
385 fs_reg::fs_reg()
386 {
387 init();
388 this->file = BAD_FILE;
389 }
390
391 /** Immediate value constructor. */
392 fs_reg::fs_reg(float f)
393 {
394 init();
395 this->file = IMM;
396 this->type = BRW_REGISTER_TYPE_F;
397 this->imm.f = f;
398 }
399
400 /** Immediate value constructor. */
401 fs_reg::fs_reg(int32_t i)
402 {
403 init();
404 this->file = IMM;
405 this->type = BRW_REGISTER_TYPE_D;
406 this->imm.i = i;
407 }
408
409 /** Immediate value constructor. */
410 fs_reg::fs_reg(uint32_t u)
411 {
412 init();
413 this->file = IMM;
414 this->type = BRW_REGISTER_TYPE_UD;
415 this->imm.u = u;
416 }
417
418 /** Fixed brw_reg Immediate value constructor. */
419 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
420 {
421 init();
422 this->file = HW_REG;
423 this->fixed_hw_reg = fixed_hw_reg;
424 this->type = fixed_hw_reg.type;
425 }
426
427 bool
428 fs_reg::equals(const fs_reg &r) const
429 {
430 return (file == r.file &&
431 reg == r.reg &&
432 reg_offset == r.reg_offset &&
433 type == r.type &&
434 negate == r.negate &&
435 abs == r.abs &&
436 !reladdr && !r.reladdr &&
437 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
438 sizeof(fixed_hw_reg)) == 0 &&
439 smear == r.smear &&
440 imm.u == r.imm.u);
441 }
442
443 fs_reg
444 fs_reg::retype(uint32_t type)
445 {
446 fs_reg result = *this;
447 result.type = type;
448 return result;
449 }
450
451 bool
452 fs_reg::is_zero() const
453 {
454 if (file != IMM)
455 return false;
456
457 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
458 }
459
460 bool
461 fs_reg::is_one() const
462 {
463 if (file != IMM)
464 return false;
465
466 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
467 }
468
469 bool
470 fs_reg::is_valid_3src() const
471 {
472 return file == GRF || file == UNIFORM;
473 }
474
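/** Returns the number of scalar slots a GLSL type occupies when allocated as
 * a virtual GRF or uniform (one slot per component; samplers take none).
 */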
475 int
476 fs_visitor::type_size(const struct glsl_type *type)
477 {
478 unsigned int size, i;
479
480 switch (type->base_type) {
481 case GLSL_TYPE_UINT:
482 case GLSL_TYPE_INT:
483 case GLSL_TYPE_FLOAT:
484 case GLSL_TYPE_BOOL:
485 return type->components();
486 case GLSL_TYPE_ARRAY:
487 return type_size(type->fields.array) * type->length;
488 case GLSL_TYPE_STRUCT:
489 size = 0;
490 for (i = 0; i < type->length; i++) {
491 size += type_size(type->fields.structure[i].type);
492 }
493 return size;
494 case GLSL_TYPE_SAMPLER:
495 /* Samplers take up no register space, since they're baked in at
496 * link time.
497 */
498 return 0;
499 case GLSL_TYPE_VOID:
500 case GLSL_TYPE_ERROR:
501 case GLSL_TYPE_INTERFACE:
502 assert(!"not reached");
503 break;
504 }
505
506 return 0;
507 }
508
509 fs_reg
510 fs_visitor::get_timestamp()
511 {
512 assert(brw->gen >= 7);
513
514 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
515 BRW_ARF_TIMESTAMP,
516 0),
517 BRW_REGISTER_TYPE_UD));
518
519 fs_reg dst = fs_reg(this, glsl_type::uint_type);
520
521 fs_inst *mov = emit(MOV(dst, ts));
522 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
523 * even if it's not enabled in the dispatch.
524 */
525 mov->force_writemask_all = true;
526 mov->force_uncompressed = true;
527
528 /* The caller wants the low 32 bits of the timestamp. Since it's running
529 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
530 * which is plenty of time for our purposes. It is identical across the
531 * EUs, but since it's tracking GPU core speed it will increment at a
532 * varying rate as render P-states change.
533 *
534 * The caller could also check if render P-states have changed (or anything
535 * else that might disrupt timing) by setting smear to 2 and checking if
536 * that field is != 0.
537 */
538 dst.smear = 0;
539
540 return dst;
541 }
542
543 void
544 fs_visitor::emit_shader_time_begin()
545 {
546 current_annotation = "shader time start";
547 shader_start_time = get_timestamp();
548 }
549
550 void
551 fs_visitor::emit_shader_time_end()
552 {
553 current_annotation = "shader time end";
554
555 enum shader_time_shader_type type, written_type, reset_type;
556 if (dispatch_width == 8) {
557 type = ST_FS8;
558 written_type = ST_FS8_WRITTEN;
559 reset_type = ST_FS8_RESET;
560 } else {
561 assert(dispatch_width == 16);
562 type = ST_FS16;
563 written_type = ST_FS16_WRITTEN;
564 reset_type = ST_FS16_RESET;
565 }
566
567 fs_reg shader_end_time = get_timestamp();
568
569 /* Check that there weren't any timestamp reset events (assuming these
570 * were the only two timestamp reads that happened).
571 */
572 fs_reg reset = shader_end_time;
573 reset.smear = 2;
574 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
575 test->conditional_mod = BRW_CONDITIONAL_Z;
576 emit(IF(BRW_PREDICATE_NORMAL));
577
578 push_force_uncompressed();
579 fs_reg start = shader_start_time;
580 start.negate = true;
581 fs_reg diff = fs_reg(this, glsl_type::uint_type);
582 emit(ADD(diff, start, shader_end_time));
583
584 /* If there were no instructions between the two timestamp gets, the diff
585 * is 2 cycles. Remove that overhead, so I can forget about that when
586 * trying to determine the time taken for single instructions.
587 */
588 emit(ADD(diff, diff, fs_reg(-2u)));
589
590 emit_shader_time_write(type, diff);
591 emit_shader_time_write(written_type, fs_reg(1u));
592 emit(BRW_OPCODE_ELSE);
593 emit_shader_time_write(reset_type, fs_reg(1u));
594 emit(BRW_OPCODE_ENDIF);
595
596 pop_force_uncompressed();
597 }
598
599 void
600 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
601 fs_reg value)
602 {
603 int shader_time_index =
604 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
605 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
606
607 fs_reg payload;
608 if (dispatch_width == 8)
609 payload = fs_reg(this, glsl_type::uvec2_type);
610 else
611 payload = fs_reg(this, glsl_type::uint_type);
612
613 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
614 fs_reg(), payload, offset, value));
615 }
616
617 void
618 fs_visitor::fail(const char *format, ...)
619 {
620 va_list va;
621 char *msg;
622
623 if (failed)
624 return;
625
626 failed = true;
627
628 va_start(va, format);
629 msg = ralloc_vasprintf(mem_ctx, format, va);
630 va_end(va);
631 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
632
633 this->fail_msg = msg;
634
635 if (INTEL_DEBUG & DEBUG_WM) {
636 fprintf(stderr, "%s", msg);
637 }
638 }
639
640 fs_inst *
641 fs_visitor::emit(enum opcode opcode)
642 {
643 return emit(fs_inst(opcode));
644 }
645
646 fs_inst *
647 fs_visitor::emit(enum opcode opcode, fs_reg dst)
648 {
649 return emit(fs_inst(opcode, dst));
650 }
651
652 fs_inst *
653 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
654 {
655 return emit(fs_inst(opcode, dst, src0));
656 }
657
658 fs_inst *
659 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
660 {
661 return emit(fs_inst(opcode, dst, src0, src1));
662 }
663
664 fs_inst *
665 fs_visitor::emit(enum opcode opcode, fs_reg dst,
666 fs_reg src0, fs_reg src1, fs_reg src2)
667 {
668 return emit(fs_inst(opcode, dst, src0, src1, src2));
669 }
670
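/* Nesting counters that scope force_uncompressed / force_sechalf over a range
 * of emitted instructions; see emit_shader_time_end() for a usage example.
 */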
671 void
672 fs_visitor::push_force_uncompressed()
673 {
674 force_uncompressed_stack++;
675 }
676
677 void
678 fs_visitor::pop_force_uncompressed()
679 {
680 force_uncompressed_stack--;
681 assert(force_uncompressed_stack >= 0);
682 }
683
684 void
685 fs_visitor::push_force_sechalf()
686 {
687 force_sechalf_stack++;
688 }
689
690 void
691 fs_visitor::pop_force_sechalf()
692 {
693 force_sechalf_stack--;
694 assert(force_sechalf_stack >= 0);
695 }
696
697 /**
698 * Returns true if the instruction has a flag that means it won't
699 * update an entire destination register.
700 *
701 * For example, dead code elimination and live variable analysis want to know
702 * when a write to a variable screens off any preceding values that were in
703 * it.
704 */
705 bool
706 fs_inst::is_partial_write()
707 {
708 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
709 this->force_uncompressed ||
710 this->force_sechalf);
711 }
712
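/** Returns how many GRF registers source argument \p arg reads: texturing
 * sends from a GRF consume their whole message payload, everything else
 * reads a single register.
 */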
713 int
714 fs_inst::regs_read(fs_visitor *v, int arg)
715 {
716 if (is_tex() && arg == 0 && src[0].file == GRF) {
717 if (v->dispatch_width == 16)
718 return (mlen + 1) / 2;
719 else
720 return mlen;
721 }
722 return 1;
723 }
724
725 /**
726 * Returns how many MRFs an FS opcode will write over.
727 *
728 * Note that this is not the 0 or 1 implied writes in an actual gen
729 * instruction -- the FS opcodes often generate MOVs in addition.
730 */
731 int
732 fs_visitor::implied_mrf_writes(fs_inst *inst)
733 {
734 if (inst->mlen == 0)
735 return 0;
736
737 if (inst->base_mrf == -1)
738 return 0;
739
740 switch (inst->opcode) {
741 case SHADER_OPCODE_RCP:
742 case SHADER_OPCODE_RSQ:
743 case SHADER_OPCODE_SQRT:
744 case SHADER_OPCODE_EXP2:
745 case SHADER_OPCODE_LOG2:
746 case SHADER_OPCODE_SIN:
747 case SHADER_OPCODE_COS:
748 return 1 * dispatch_width / 8;
749 case SHADER_OPCODE_POW:
750 case SHADER_OPCODE_INT_QUOTIENT:
751 case SHADER_OPCODE_INT_REMAINDER:
752 return 2 * dispatch_width / 8;
753 case SHADER_OPCODE_TEX:
754 case FS_OPCODE_TXB:
755 case SHADER_OPCODE_TXD:
756 case SHADER_OPCODE_TXF:
757 case SHADER_OPCODE_TXF_MS:
758 case SHADER_OPCODE_TG4:
759 case SHADER_OPCODE_TXL:
760 case SHADER_OPCODE_TXS:
761 case SHADER_OPCODE_LOD:
762 return 1;
763 case FS_OPCODE_FB_WRITE:
764 return 2;
765 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
766 case FS_OPCODE_UNSPILL:
767 return 1;
768 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
769 return inst->mlen;
770 case FS_OPCODE_SPILL:
771 return 2;
772 default:
773 assert(!"not reached");
774 return inst->mlen;
775 }
776 }
777
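/** Allocates a new virtual GRF of \p size registers, growing the size
 * tracking array as needed, and returns its index.
 */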
778 int
779 fs_visitor::virtual_grf_alloc(int size)
780 {
781 if (virtual_grf_array_size <= virtual_grf_count) {
782 if (virtual_grf_array_size == 0)
783 virtual_grf_array_size = 16;
784 else
785 virtual_grf_array_size *= 2;
786 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
787 virtual_grf_array_size);
788 }
789 virtual_grf_sizes[virtual_grf_count] = size;
790 return virtual_grf_count++;
791 }
792
793 /** Fixed HW reg constructor. */
794 fs_reg::fs_reg(enum register_file file, int reg)
795 {
796 init();
797 this->file = file;
798 this->reg = reg;
799 this->type = BRW_REGISTER_TYPE_F;
800 }
801
802 /** Fixed HW reg constructor. */
803 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
804 {
805 init();
806 this->file = file;
807 this->reg = reg;
808 this->type = type;
809 }
810
811 /** Automatic reg constructor. */
812 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
813 {
814 init();
815
816 this->file = GRF;
817 this->reg = v->virtual_grf_alloc(v->type_size(type));
818 this->reg_offset = 0;
819 this->type = brw_type_for_base_type(type);
820 }
821
822 fs_reg *
823 fs_visitor::variable_storage(ir_variable *var)
824 {
825 return (fs_reg *)hash_table_find(this->variable_ht, var);
826 }
827
828 void
829 import_uniforms_callback(const void *key,
830 void *data,
831 void *closure)
832 {
833 struct hash_table *dst_ht = (struct hash_table *)closure;
834 const fs_reg *reg = (const fs_reg *)data;
835
836 if (reg->file != UNIFORM)
837 return;
838
839 hash_table_insert(dst_ht, data, key);
840 }
841
842 /* For 16-wide, we reuse the uniform setup done by the 8-wide dispatch.
843 * This imports those uniform definitions.
844 */
845 void
846 fs_visitor::import_uniforms(fs_visitor *v)
847 {
848 hash_table_call_foreach(v->variable_ht,
849 import_uniforms_callback,
850 variable_ht);
851 this->params_remap = v->params_remap;
852 this->nr_params_remap = v->nr_params_remap;
853 }
854
855 /* Our support for uniforms is piggy-backed on the struct
856 * gl_fragment_program, because that's where the values actually
857 * get stored, rather than in some global gl_shader_program uniform
858 * store.
859 */
860 void
861 fs_visitor::setup_uniform_values(ir_variable *ir)
862 {
863 int namelen = strlen(ir->name);
864
865 /* The data for our (non-builtin) uniforms is stored in a series of
866 * gl_uniform_driver_storage structs for each subcomponent that
867 * glGetUniformLocation() could name. We know it's been set up in the same
868 * order we'd walk the type, so walk the list of storage and find anything
869 * with our name, or the prefix of a component that starts with our name.
870 */
871 unsigned params_before = c->prog_data.nr_params;
872 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
873 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
874
875 if (strncmp(ir->name, storage->name, namelen) != 0 ||
876 (storage->name[namelen] != 0 &&
877 storage->name[namelen] != '.' &&
878 storage->name[namelen] != '[')) {
879 continue;
880 }
881
882 unsigned slots = storage->type->component_slots();
883 if (storage->array_elements)
884 slots *= storage->array_elements;
885
886 for (unsigned i = 0; i < slots; i++) {
887 c->prog_data.param[c->prog_data.nr_params++] =
888 &storage->storage[i].f;
889 }
890 }
891
892 /* Make sure we actually initialized the right amount of stuff here. */
893 assert(params_before + ir->type->component_slots() ==
894 c->prog_data.nr_params);
895 (void)params_before;
896 }
897
898
899 /* Our support for builtin uniforms is even scarier than non-builtin.
900 * It sits on top of the PROG_STATE_VAR parameters that are
901 * automatically updated from GL context state.
902 */
903 void
904 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
905 {
906 const ir_state_slot *const slots = ir->state_slots;
907 assert(ir->state_slots != NULL);
908
909 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
910 /* This state reference has already been setup by ir_to_mesa, but we'll
911 * get the same index back here.
912 */
913 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
914 (gl_state_index *)slots[i].tokens);
915
916 /* Add each of the unique swizzles of the element as a parameter.
917 * This'll end up matching the expected layout of the
918 * array/matrix/structure we're trying to fill in.
919 */
920 int last_swiz = -1;
921 for (unsigned int j = 0; j < 4; j++) {
922 int swiz = GET_SWZ(slots[i].swizzle, j);
923 if (swiz == last_swiz)
924 break;
925 last_swiz = swiz;
926
927 c->prog_data.param[c->prog_data.nr_params++] =
928 &fp->Base.Parameters->ParameterValues[index][swiz].f;
929 }
930 }
931 }
932
933 fs_reg *
934 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
935 {
936 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
937 fs_reg wpos = *reg;
938 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
939
940 /* gl_FragCoord.x */
941 if (ir->pixel_center_integer) {
942 emit(MOV(wpos, this->pixel_x));
943 } else {
944 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
945 }
946 wpos.reg_offset++;
947
948 /* gl_FragCoord.y */
949 if (!flip && ir->pixel_center_integer) {
950 emit(MOV(wpos, this->pixel_y));
951 } else {
952 fs_reg pixel_y = this->pixel_y;
953 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
954
955 if (flip) {
956 pixel_y.negate = true;
957 offset += c->key.drawable_height - 1.0;
958 }
959
960 emit(ADD(wpos, pixel_y, fs_reg(offset)));
961 }
962 wpos.reg_offset++;
963
964 /* gl_FragCoord.z */
965 if (brw->gen >= 6) {
966 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
967 } else {
968 emit(FS_OPCODE_LINTERP, wpos,
969 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
970 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
971 interp_reg(VARYING_SLOT_POS, 2));
972 }
973 wpos.reg_offset++;
974
975 /* gl_FragCoord.w: Already set up in emit_interpolation */
976 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
977
978 return reg;
979 }
980
981 fs_inst *
982 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
983 glsl_interp_qualifier interpolation_mode,
984 bool is_centroid)
985 {
986 brw_wm_barycentric_interp_mode barycoord_mode;
987 if (brw->gen >= 6) {
988 if (is_centroid) {
989 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
990 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
991 else
992 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
993 } else {
994 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
995 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
996 else
997 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
998 }
999 } else {
1000 /* On Ironlake and below, there is only one interpolation mode.
1001 * Centroid interpolation doesn't mean anything on this hardware --
1002 * there is no multisampling.
1003 */
1004 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1005 }
1006 return emit(FS_OPCODE_LINTERP, attr,
1007 this->delta_x[barycoord_mode],
1008 this->delta_y[barycoord_mode], interp);
1009 }
1010
1011 fs_reg *
1012 fs_visitor::emit_general_interpolation(ir_variable *ir)
1013 {
1014 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1015 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1016 fs_reg attr = *reg;
1017
1018 unsigned int array_elements;
1019 const glsl_type *type;
1020
1021 if (ir->type->is_array()) {
1022 array_elements = ir->type->length;
1023 if (array_elements == 0) {
1024 fail("dereferenced array '%s' has length 0\n", ir->name);
1025 }
1026 type = ir->type->fields.array;
1027 } else {
1028 array_elements = 1;
1029 type = ir->type;
1030 }
1031
1032 glsl_interp_qualifier interpolation_mode =
1033 ir->determine_interpolation_mode(c->key.flat_shade);
1034
1035 int location = ir->location;
1036 for (unsigned int i = 0; i < array_elements; i++) {
1037 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1038 if (c->prog_data.urb_setup[location] == -1) {
1039 /* If there's no incoming setup data for this slot, don't
1040 * emit interpolation for it.
1041 */
1042 attr.reg_offset += type->vector_elements;
1043 location++;
1044 continue;
1045 }
1046
1047 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1048 /* Constant interpolation (flat shading) case. The SF has
1049 * handed us defined values in only the constant offset
1050 * field of the setup reg.
1051 */
1052 for (unsigned int k = 0; k < type->vector_elements; k++) {
1053 struct brw_reg interp = interp_reg(location, k);
1054 interp = suboffset(interp, 3);
1055 interp.type = reg->type;
1056 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1057 attr.reg_offset++;
1058 }
1059 } else {
1060 /* Smooth/noperspective interpolation case. */
1061 for (unsigned int k = 0; k < type->vector_elements; k++) {
1062 /* FINISHME: At some point we probably want to push
1063 * this farther by giving similar treatment to the
1064 * other potentially constant components of the
1065 * attribute, as well as making brw_vs_constval.c
1066 * handle varyings other than gl_TexCoord.
1067 */
1068 struct brw_reg interp = interp_reg(location, k);
1069 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1070 ir->centroid);
1071 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1072 /* Get the pixel/sample mask into f0 so that we know
1073 * which pixels are lit. Then, for each channel that is
1074 * unlit, replace the centroid data with non-centroid
1075 * data.
1076 */
1077 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1078 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1079 interpolation_mode, false);
1080 inst->predicate = BRW_PREDICATE_NORMAL;
1081 inst->predicate_inverse = true;
1082 }
1083 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1084 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1085 }
1086 attr.reg_offset++;
1087 }
1088
1089 }
1090 location++;
1091 }
1092 }
1093
1094 return reg;
1095 }
1096
1097 fs_reg *
1098 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1099 {
1100 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1101
1102 /* The frontfacing comes in as a bit in the thread payload. */
1103 if (brw->gen >= 6) {
1104 emit(BRW_OPCODE_ASR, *reg,
1105 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1106 fs_reg(15));
1107 emit(BRW_OPCODE_NOT, *reg, *reg);
1108 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1109 } else {
1110 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1111 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1112 * us front face
1113 */
1114 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1115 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1116 }
1117
1118 return reg;
1119 }
1120
1121 fs_reg
1122 fs_visitor::fix_math_operand(fs_reg src)
1123 {
1124 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1125 * might be able to do better by doing execsize = 1 math and then
1126 * expanding that result out, but we would need to be careful with
1127 * masking.
1128 *
1129 * The hardware ignores source modifiers (negate and abs) on math
1130 * instructions, so we also move to a temp to set those up.
1131 */
1132 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1133 !src.abs && !src.negate)
1134 return src;
1135
1136 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1137 * operands to math instructions.
1138 */
1139 if (brw->gen >= 7 && src.file != IMM)
1140 return src;
1141
1142 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1143 expanded.type = src.type;
1144 emit(BRW_OPCODE_MOV, expanded, src);
1145 return expanded;
1146 }
1147
1148 fs_inst *
1149 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1150 {
1151 switch (opcode) {
1152 case SHADER_OPCODE_RCP:
1153 case SHADER_OPCODE_RSQ:
1154 case SHADER_OPCODE_SQRT:
1155 case SHADER_OPCODE_EXP2:
1156 case SHADER_OPCODE_LOG2:
1157 case SHADER_OPCODE_SIN:
1158 case SHADER_OPCODE_COS:
1159 break;
1160 default:
1161 assert(!"not reached: bad math opcode");
1162 return NULL;
1163 }
1164
1165 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1166 * might be able to do better by doing execsize = 1 math and then
1167 * expanding that result out, but we would need to be careful with
1168 * masking.
1169 *
1170 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1171 * instructions, so we also move to a temp to set those up.
1172 */
1173 if (brw->gen >= 6)
1174 src = fix_math_operand(src);
1175
1176 fs_inst *inst = emit(opcode, dst, src);
1177
1178 if (brw->gen < 6) {
1179 inst->base_mrf = 2;
1180 inst->mlen = dispatch_width / 8;
1181 }
1182
1183 return inst;
1184 }
1185
1186 fs_inst *
1187 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1188 {
1189 int base_mrf = 2;
1190 fs_inst *inst;
1191
1192 switch (opcode) {
1193 case SHADER_OPCODE_INT_QUOTIENT:
1194 case SHADER_OPCODE_INT_REMAINDER:
1195 if (brw->gen >= 7 && dispatch_width == 16)
1196 fail("16-wide INTDIV unsupported\n");
1197 break;
1198 case SHADER_OPCODE_POW:
1199 break;
1200 default:
1201 assert(!"not reached: unsupported binary math opcode.");
1202 return NULL;
1203 }
1204
1205 if (brw->gen >= 6) {
1206 src0 = fix_math_operand(src0);
1207 src1 = fix_math_operand(src1);
1208
1209 inst = emit(opcode, dst, src0, src1);
1210 } else {
1211 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1212 * "Message Payload":
1213 *
1214 * "Operand0[7]. For the INT DIV functions, this operand is the
1215 * denominator."
1216 * ...
1217 * "Operand1[7]. For the INT DIV functions, this operand is the
1218 * numerator."
1219 */
1220 bool is_int_div = opcode != SHADER_OPCODE_POW;
1221 fs_reg &op0 = is_int_div ? src1 : src0;
1222 fs_reg &op1 = is_int_div ? src0 : src1;
1223
1224 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1225 inst = emit(opcode, dst, op0, reg_null_f);
1226
1227 inst->base_mrf = base_mrf;
1228 inst->mlen = 2 * dispatch_width / 8;
1229 }
1230 return inst;
1231 }
1232
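/** Lays out the push constants (CURB): records how many registers of
 * constants are read and rewrites UNIFORM-file sources as fixed payload
 * registers.
 */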
1233 void
1234 fs_visitor::assign_curb_setup()
1235 {
1236 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1237 if (dispatch_width == 8) {
1238 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1239 } else {
1240 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1241 }
1242
1243 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1244 foreach_list(node, &this->instructions) {
1245 fs_inst *inst = (fs_inst *)node;
1246
1247 for (unsigned int i = 0; i < 3; i++) {
1248 if (inst->src[i].file == UNIFORM) {
1249 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1250 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1251 constant_nr / 8,
1252 constant_nr % 8);
1253
1254 inst->src[i].file = HW_REG;
1255 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1256 }
1257 }
1258 }
1259 }
1260
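/** Decides which incoming URB slot each FS input varying is read from,
 * filling in c->prog_data.urb_setup[] and num_varying_inputs.
 */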
1261 void
1262 fs_visitor::calculate_urb_setup()
1263 {
1264 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1265 c->prog_data.urb_setup[i] = -1;
1266 }
1267
1268 int urb_next = 0;
1269 /* Figure out where each of the incoming setup attributes lands. */
1270 if (brw->gen >= 6) {
1271 if (_mesa_bitcount_64(fp->Base.InputsRead &
1272 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1273 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1274 * first 16 varying inputs, so we can put them wherever we want.
1275 * Just put them in order.
1276 *
1277 * This is useful because it means that (a) inputs not used by the
1278 * fragment shader won't take up valuable register space, and (b) we
1279 * won't have to recompile the fragment shader if it gets paired with
1280 * a different vertex (or geometry) shader.
1281 */
1282 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1283 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1284 BITFIELD64_BIT(i)) {
1285 c->prog_data.urb_setup[i] = urb_next++;
1286 }
1287 }
1288 } else {
1289 /* We have enough input varyings that the SF/SBE pipeline stage can't
1290 * arbitrarily rearrange them to suit our whim; we have to put them
1291 * in an order that matches the output of the previous pipeline stage
1292 * (geometry or vertex shader).
1293 */
1294 struct brw_vue_map prev_stage_vue_map;
1295 brw_compute_vue_map(brw, &prev_stage_vue_map,
1296 c->key.input_slots_valid);
1297 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1298 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1299 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1300 slot++) {
1301 int varying = prev_stage_vue_map.slot_to_varying[slot];
1302 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1303 * unused.
1304 */
1305 if (varying != BRW_VARYING_SLOT_COUNT &&
1306 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1307 BITFIELD64_BIT(varying))) {
1308 c->prog_data.urb_setup[varying] = slot - first_slot;
1309 }
1310 }
1311 urb_next = prev_stage_vue_map.num_slots - first_slot;
1312 }
1313 } else {
1314 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1315 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1316 /* Point size is packed into the header, not as a general attribute */
1317 if (i == VARYING_SLOT_PSIZ)
1318 continue;
1319
1320 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1321 /* The back color slot is skipped when the front color is
1322 * also written to. In addition, some slots can be
1323 * written in the vertex shader and not read in the
1324 * fragment shader. So the register number must always be
1325 * incremented, mapped or not.
1326 */
1327 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1328 c->prog_data.urb_setup[i] = urb_next;
1329 urb_next++;
1330 }
1331 }
1332
1333 /*
1334 * It's an FS-only attribute, and we did interpolation for this attribute
1335 * in the SF thread, so count it here, too.
1336 *
1337 * See compile_sf_prog() for more info.
1338 */
1339 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1340 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1341 }
1342
1343 c->prog_data.num_varying_inputs = urb_next;
1344 }
1345
1346 void
1347 fs_visitor::assign_urb_setup()
1348 {
1349 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1350
1351 /* Offset all the urb_setup[] indices by the actual position of the
1352 * setup regs, now that the location of the constants has been chosen.
1353 */
1354 foreach_list(node, &this->instructions) {
1355 fs_inst *inst = (fs_inst *)node;
1356
1357 if (inst->opcode == FS_OPCODE_LINTERP) {
1358 assert(inst->src[2].file == HW_REG);
1359 inst->src[2].fixed_hw_reg.nr += urb_start;
1360 }
1361
1362 if (inst->opcode == FS_OPCODE_CINTERP) {
1363 assert(inst->src[0].file == HW_REG);
1364 inst->src[0].fixed_hw_reg.nr += urb_start;
1365 }
1366 }
1367
1368 /* Each attribute is 4 setup channels, each of which is half a reg. */
1369 this->first_non_payload_grf =
1370 urb_start + c->prog_data.num_varying_inputs * 2;
1371 }
1372
1373 /**
1374 * Split large virtual GRFs into separate components if we can.
1375 *
1376 * This is mostly duplicated with what brw_fs_vector_splitting does,
1377 * but that's really conservative because it's afraid of doing
1378 * splitting that doesn't result in real progress after the rest of
1379 * the optimization phases, which would cause infinite looping in
1380 * optimization. We can do it once here, safely. This also has the
1381 * opportunity to split interpolated values, or maybe even uniforms,
1382 * which we don't have at the IR level.
1383 *
1384 * We want to split, because virtual GRFs are what we register
1385 * allocate and spill (due to contiguousness requirements for some
1386 * instructions), and they're what we naturally generate in the
1387 * codegen process, but most virtual GRFs don't actually need to be
1388 * contiguous sets of GRFs. If we split, we'll end up with reduced
1389 * live intervals and better dead code elimination and coalescing.
1390 */
1391 void
1392 fs_visitor::split_virtual_grfs()
1393 {
1394 int num_vars = this->virtual_grf_count;
1395 bool split_grf[num_vars];
1396 int new_virtual_grf[num_vars];
1397
1398 /* Try to split anything larger than one register. */
1399 for (int i = 0; i < num_vars; i++) {
1400 if (this->virtual_grf_sizes[i] != 1)
1401 split_grf[i] = true;
1402 else
1403 split_grf[i] = false;
1404 }
1405
1406 if (brw->has_pln &&
1407 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1408 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1409 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1410 * Gen6, that was the only supported interpolation mode, and since Gen6,
1411 * delta_x and delta_y are in fixed hardware registers.
1412 */
1413 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1414 false;
1415 }
1416
1417 foreach_list(node, &this->instructions) {
1418 fs_inst *inst = (fs_inst *)node;
1419
1420 /* If there's a SEND message that requires contiguous destination
1421 * registers, no splitting is allowed.
1422 */
1423 if (inst->regs_written > 1) {
1424 split_grf[inst->dst.reg] = false;
1425 }
1426
1427 /* If we're sending from a GRF, don't split it, on the assumption that
1428 * the send is reading the whole thing.
1429 */
1430 if (inst->is_send_from_grf()) {
1431 for (int i = 0; i < 3; i++) {
1432 if (inst->src[i].file == GRF) {
1433 split_grf[inst->src[i].reg] = false;
1434 }
1435 }
1436 }
1437 }
1438
1439 /* Allocate new space for split regs. Note that the virtual
1440 * numbers will be contiguous.
1441 */
1442 for (int i = 0; i < num_vars; i++) {
1443 if (split_grf[i]) {
1444 new_virtual_grf[i] = virtual_grf_alloc(1);
1445 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1446 int reg = virtual_grf_alloc(1);
1447 assert(reg == new_virtual_grf[i] + j - 1);
1448 (void) reg;
1449 }
1450 this->virtual_grf_sizes[i] = 1;
1451 }
1452 }
1453
1454 foreach_list(node, &this->instructions) {
1455 fs_inst *inst = (fs_inst *)node;
1456
1457 if (inst->dst.file == GRF &&
1458 split_grf[inst->dst.reg] &&
1459 inst->dst.reg_offset != 0) {
1460 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1461 inst->dst.reg_offset - 1);
1462 inst->dst.reg_offset = 0;
1463 }
1464 for (int i = 0; i < 3; i++) {
1465 if (inst->src[i].file == GRF &&
1466 split_grf[inst->src[i].reg] &&
1467 inst->src[i].reg_offset != 0) {
1468 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1469 inst->src[i].reg_offset - 1);
1470 inst->src[i].reg_offset = 0;
1471 }
1472 }
1473 }
1474 invalidate_live_intervals();
1475 }
1476
1477 /**
1478 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1479 *
1480 * During code generation, we create tons of temporary variables, many of
1481 * which get immediately killed and are never used again. Yet, in later
1482 * optimization and analysis passes, such as compute_live_intervals, we need
1483 * to loop over all the virtual GRFs. Compacting them can save a lot of
1484 * overhead.
1485 */
1486 void
1487 fs_visitor::compact_virtual_grfs()
1488 {
1489 /* Mark which virtual GRFs are used, and count how many. */
1490 int remap_table[this->virtual_grf_count];
1491 memset(remap_table, -1, sizeof(remap_table));
1492
1493 foreach_list(node, &this->instructions) {
1494 const fs_inst *inst = (const fs_inst *) node;
1495
1496 if (inst->dst.file == GRF)
1497 remap_table[inst->dst.reg] = 0;
1498
1499 for (int i = 0; i < 3; i++) {
1500 if (inst->src[i].file == GRF)
1501 remap_table[inst->src[i].reg] = 0;
1502 }
1503 }
1504
1505 /* In addition to registers used in instructions, fs_visitor keeps
1506 * direct references to certain special values which must be patched:
1507 */
1508 fs_reg *special[] = {
1509 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1510 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1511 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1512 &delta_x[0], &delta_x[1], &delta_x[2],
1513 &delta_x[3], &delta_x[4], &delta_x[5],
1514 &delta_y[0], &delta_y[1], &delta_y[2],
1515 &delta_y[3], &delta_y[4], &delta_y[5],
1516 };
1517 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1518 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1519
1520 /* Treat all special values as used, to be conservative */
1521 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1522 if (special[i]->file == GRF)
1523 remap_table[special[i]->reg] = 0;
1524 }
1525
1526 /* Compact the GRF arrays. */
1527 int new_index = 0;
1528 for (int i = 0; i < this->virtual_grf_count; i++) {
1529 if (remap_table[i] != -1) {
1530 remap_table[i] = new_index;
1531 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1532 invalidate_live_intervals();
1533 ++new_index;
1534 }
1535 }
1536
1537 this->virtual_grf_count = new_index;
1538
1539 /* Patch all the instructions to use the newly renumbered registers */
1540 foreach_list(node, &this->instructions) {
1541 fs_inst *inst = (fs_inst *) node;
1542
1543 if (inst->dst.file == GRF)
1544 inst->dst.reg = remap_table[inst->dst.reg];
1545
1546 for (int i = 0; i < 3; i++) {
1547 if (inst->src[i].file == GRF)
1548 inst->src[i].reg = remap_table[inst->src[i].reg];
1549 }
1550 }
1551
1552 /* Patch all the references to special values */
1553 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1554 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1555 special[i]->reg = remap_table[special[i]->reg];
1556 }
1557 }
1558
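/** Drops uniform params that no instruction references and renumbers the
 * survivors. The remap table is built during the 8-wide compile and reused
 * by the 16-wide compile.
 */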
1559 bool
1560 fs_visitor::remove_dead_constants()
1561 {
1562 if (dispatch_width == 8) {
1563 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1564 this->nr_params_remap = c->prog_data.nr_params;
1565
1566 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1567 this->params_remap[i] = -1;
1568
1569 /* Find which params are still in use. */
1570 foreach_list(node, &this->instructions) {
1571 fs_inst *inst = (fs_inst *)node;
1572
1573 for (int i = 0; i < 3; i++) {
1574 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1575
1576 if (inst->src[i].file != UNIFORM)
1577 continue;
1578
1579 /* Section 5.11 of the OpenGL 4.3 spec says:
1580 *
1581 * "Out-of-bounds reads return undefined values, which include
1582 * values from other variables of the active program or zero."
1583 */
1584 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1585 constant_nr = 0;
1586 }
1587
1588 /* For now, set this to non-negative. We'll give it the
1589 * actual new number in a moment, in order to keep the
1590 * register numbers nicely ordered.
1591 */
1592 this->params_remap[constant_nr] = 0;
1593 }
1594 }
1595
1596 /* Figure out what the new numbers for the params will be. At some
1597 * point when we're doing uniform array access, we're going to want
1598 * to keep the distinction between .reg and .reg_offset, but for
1599 * now we don't care.
1600 */
1601 unsigned int new_nr_params = 0;
1602 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1603 if (this->params_remap[i] != -1) {
1604 this->params_remap[i] = new_nr_params++;
1605 }
1606 }
1607
1608 /* Update the list of params to be uploaded to match our new numbering. */
1609 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1610 int remapped = this->params_remap[i];
1611
1612 if (remapped == -1)
1613 continue;
1614
1615 c->prog_data.param[remapped] = c->prog_data.param[i];
1616 }
1617
1618 c->prog_data.nr_params = new_nr_params;
1619 } else {
1620 /* This should have been generated in the 8-wide pass already. */
1621 assert(this->params_remap);
1622 }
1623
1624 /* Now do the renumbering of the shader to remove unused params. */
1625 foreach_list(node, &this->instructions) {
1626 fs_inst *inst = (fs_inst *)node;
1627
1628 for (int i = 0; i < 3; i++) {
1629 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1630
1631 if (inst->src[i].file != UNIFORM)
1632 continue;
1633
1634 /* As above, alias out-of-bounds accesses to constant 0. */
1635 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1636 constant_nr = 0;
1637 }
1638 assert(this->params_remap[constant_nr] != -1);
1639 inst->src[i].reg = this->params_remap[constant_nr];
1640 inst->src[i].reg_offset = 0;
1641 }
1642 }
1643
1644 return true;
1645 }
1646
1647 /*
1648 * Implements array access of uniforms by inserting a
1649 * PULL_CONSTANT_LOAD instruction.
1650 *
1651 * Unlike temporary GRF array access (where we don't support it due to
1652 * the difficulty of doing relative addressing on instruction
1653 * destinations), we could potentially do array access of uniforms
1654 * that were loaded in GRF space as push constants. In real-world
1655 * usage we've seen, though, the arrays being used are always larger
1656 * than we could load as push constants, so just always move all
1657 * uniform array access out to a pull constant buffer.
1658 */
1659 void
1660 fs_visitor::move_uniform_array_access_to_pull_constants()
1661 {
1662 int pull_constant_loc[c->prog_data.nr_params];
1663
1664 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1665 pull_constant_loc[i] = -1;
1666 }
1667
1668 /* Walk through and find array access of uniforms. Put a copy of that
1669 * uniform in the pull constant buffer.
1670 *
1671 * Note that we don't move constant-indexed accesses to arrays. No
1672 * testing has been done of the performance impact of this choice.
1673 */
1674 foreach_list_safe(node, &this->instructions) {
1675 fs_inst *inst = (fs_inst *)node;
1676
1677 for (int i = 0 ; i < 3; i++) {
1678 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1679 continue;
1680
1681 int uniform = inst->src[i].reg;
1682
1683 /* If this array isn't already present in the pull constant buffer,
1684 * add it.
1685 */
1686 if (pull_constant_loc[uniform] == -1) {
1687 const float **values = &c->prog_data.param[uniform];
1688
1689 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1690
1691 assert(param_size[uniform]);
1692
1693 for (int j = 0; j < param_size[uniform]; j++) {
1694 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1695 values[j];
1696 }
1697 }
1698
1699 /* Set up the annotation tracking for new generated instructions. */
1700 base_ir = inst->ir;
1701 current_annotation = inst->annotation;
1702
1703 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1704 fs_reg temp = fs_reg(this, glsl_type::float_type);
1705 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1706 surf_index,
1707 *inst->src[i].reladdr,
1708 pull_constant_loc[uniform] +
1709 inst->src[i].reg_offset);
1710 inst->insert_before(&list);
1711
1712 inst->src[i].file = temp.file;
1713 inst->src[i].reg = temp.reg;
1714 inst->src[i].reg_offset = temp.reg_offset;
1715 inst->src[i].reladdr = NULL;
1716 }
1717 }
1718 }
1719
1720 /**
1721 * Choose accesses from the UNIFORM file to demote to using the pull
1722 * constant buffer.
1723 *
1724 * We allow a fragment shader to have more than the specified minimum
1725 * maximum number of fragment shader uniform components (64). If
1726 * there are too many of these, they'd fill up all of register space.
1727 * So, this will push some of them out to the pull constant buffer and
1728 * update the program to load them.
1729 */
1730 void
1731 fs_visitor::setup_pull_constants()
1732 {
1733 /* Only allow 16 registers (128 uniform components) as push constants. */
1734 unsigned int max_uniform_components = 16 * 8;
1735 if (c->prog_data.nr_params <= max_uniform_components)
1736 return;
1737
1738 if (dispatch_width == 16) {
1739 fail("Pull constants not supported in 16-wide\n");
1740 return;
1741 }
1742
1743 /* Just demote the end of the list. We could probably do better
1744 * here, demoting things that are rarely used in the program first.
1745 */
1746 unsigned int pull_uniform_base = max_uniform_components;
1747
1748 int pull_constant_loc[c->prog_data.nr_params];
1749 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1750 if (i < pull_uniform_base) {
1751 pull_constant_loc[i] = -1;
1752 } else {
1753 pull_constant_loc[i] = -1;
1754 /* If our constant is already being uploaded for reladdr purposes,
1755 * reuse it.
1756 */
1757 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1758 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1759 pull_constant_loc[i] = j;
1760 break;
1761 }
1762 }
1763 if (pull_constant_loc[i] == -1) {
1764 int pull_index = c->prog_data.nr_pull_params++;
1765 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1766 pull_constant_loc[i] = pull_index;
1767 }
1768 }
1769 }
1770 c->prog_data.nr_params = pull_uniform_base;
1771
1772 foreach_list(node, &this->instructions) {
1773 fs_inst *inst = (fs_inst *)node;
1774
1775 for (int i = 0; i < 3; i++) {
1776 if (inst->src[i].file != UNIFORM)
1777 continue;
1778
1779 int pull_index = pull_constant_loc[inst->src[i].reg +
1780 inst->src[i].reg_offset];
1781 if (pull_index == -1)
1782 continue;
1783
1784 assert(!inst->src[i].reladdr);
1785
1786 fs_reg dst = fs_reg(this, glsl_type::float_type);
1787 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1788 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1789 fs_inst *pull =
1790 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1791 dst, index, offset);
1792 pull->ir = inst->ir;
1793 pull->annotation = inst->annotation;
1794
1795 inst->insert_before(pull);
1796
1797 inst->src[i].file = GRF;
1798 inst->src[i].reg = dst.reg;
1799 inst->src[i].reg_offset = 0;
1800 inst->src[i].smear = pull_index & 3;
1801 }
1802 }
1803 }
1804
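/** Applies simple algebraic simplifications (a * 1.0 -> a, a * 0.0 -> 0.0,
 * a + 0.0 -> a), rewriting the matching instructions as MOVs.
 */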
1805 bool
1806 fs_visitor::opt_algebraic()
1807 {
1808 bool progress = false;
1809
1810 foreach_list(node, &this->instructions) {
1811 fs_inst *inst = (fs_inst *)node;
1812
1813 switch (inst->opcode) {
1814 case BRW_OPCODE_MUL:
1815 if (inst->src[1].file != IMM)
1816 continue;
1817
1818 /* a * 1.0 = a */
1819 if (inst->src[1].is_one()) {
1820 inst->opcode = BRW_OPCODE_MOV;
1821 inst->src[1] = reg_undef;
1822 progress = true;
1823 break;
1824 }
1825
1826 /* a * 0.0 = 0.0 */
1827 if (inst->src[1].is_zero()) {
1828 inst->opcode = BRW_OPCODE_MOV;
1829 inst->src[0] = inst->src[1];
1830 inst->src[1] = reg_undef;
1831 progress = true;
1832 break;
1833 }
1834
1835 break;
1836 case BRW_OPCODE_ADD:
1837 if (inst->src[1].file != IMM)
1838 continue;
1839
1840 /* a + 0.0 = a */
1841 if (inst->src[1].is_zero()) {
1842 inst->opcode = BRW_OPCODE_MOV;
1843 inst->src[1] = reg_undef;
1844 progress = true;
1845 break;
1846 }
1847 break;
1848 default:
1849 break;
1850 }
1851 }
1852
1853 return progress;
1854 }
1855
1856 /**
1857 * Removes any instructions writing a VGRF where that VGRF is not used by any
1858 * later instruction.
1859 */
1860 bool
1861 fs_visitor::dead_code_eliminate()
1862 {
1863 bool progress = false;
1864 int pc = 0;
1865
1866 calculate_live_intervals();
1867
1868 foreach_list_safe(node, &this->instructions) {
1869 fs_inst *inst = (fs_inst *)node;
1870
1871 if (inst->dst.file == GRF) {
1872 bool dead = true;
1873
1874 for (int i = 0; i < inst->regs_written; i++) {
1875 int var = live_intervals->var_from_vgrf[inst->dst.reg];
1876 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
1877 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
1878 dead = false;
1879 break;
1880 }
1881 }
1882
1883 if (dead) {
1884 /* Don't dead code eliminate instructions that write to the
1885 * accumulator as a side-effect. Instead just set the destination
1886 * to the null register to free it.
1887 */
1888 switch (inst->opcode) {
1889 case BRW_OPCODE_ADDC:
1890 case BRW_OPCODE_SUBB:
1891 case BRW_OPCODE_MACH:
1892 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
1893 break;
1894 default:
1895 inst->remove();
1896 progress = true;
1897 break;
1898 }
1899 }
1900 }
1901
1902 pc++;
1903 }
1904
1905 if (progress)
1906 invalidate_live_intervals();
1907
1908 return progress;
1909 }
1910
1911 struct dead_code_hash_key
1912 {
1913 int vgrf;
1914 int reg_offset;
1915 };
1916
1917 static bool
1918 dead_code_hash_compare(const void *a, const void *b)
1919 {
1920 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1921 }
1922
1923 static void
1924 clear_dead_code_hash(struct hash_table *ht)
1925 {
1926 struct hash_entry *entry;
1927
1928 hash_table_foreach(ht, entry) {
1929 _mesa_hash_table_remove(ht, entry);
1930 }
1931 }
1932
1933 static void
1934 insert_dead_code_hash(struct hash_table *ht,
1935 int vgrf, int reg_offset, fs_inst *inst)
1936 {
1937 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1938 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1939
1940 key->vgrf = vgrf;
1941 key->reg_offset = reg_offset;
1942
1943 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1944 }
1945
1946 static struct hash_entry *
1947 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1948 {
1949 struct dead_code_hash_key key;
1950
1951 key.vgrf = vgrf;
1952 key.reg_offset = reg_offset;
1953
1954 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1955 }
1956
1957 static void
1958 remove_dead_code_hash(struct hash_table *ht,
1959 int vgrf, int reg_offset)
1960 {
1961 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1962 if (!entry)
1963 return;
1964
1965 _mesa_hash_table_remove(ht, entry);
1966 }
1967
1968 /**
1969 * Walks basic blocks, removing any regs that are written but not read before
1970 * being redefined.
1971 *
1972 * The dead_code_eliminate() function implements a global dead code
1973  * elimination, but it only handles removing the last write to a register
1974 * if it's never read. This one can handle intermediate writes, but only
1975 * within a basic block.
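 *
 * For example (hypothetical IR within one block):
 *
 *    mov vgrf3, vgrf1    <- removed: overwritten below before being read
 *    mov vgrf3, vgrf2
 *    add vgrf4, vgrf3, vgrf2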
1976 */
1977 bool
1978 fs_visitor::dead_code_eliminate_local()
1979 {
1980 struct hash_table *ht;
1981 bool progress = false;
1982
1983 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1984
1985 foreach_list_safe(node, &this->instructions) {
1986 fs_inst *inst = (fs_inst *)node;
1987
1988       /* At a basic block boundary (control flow), empty the HT since we
1989        * don't track dataflow across blocks.
1990 */
1991 if (inst->is_control_flow()) {
1992 clear_dead_code_hash(ht);
1993 continue;
1994 }
1995
1996 /* Clear the HT of any instructions that got read. */
1997 for (int i = 0; i < 3; i++) {
1998 fs_reg src = inst->src[i];
1999 if (src.file != GRF)
2000 continue;
2001
2002 int read = 1;
2003 if (inst->is_send_from_grf())
2004 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2005
2006 for (int reg_offset = src.reg_offset;
2007 reg_offset < src.reg_offset + read;
2008 reg_offset++) {
2009 remove_dead_code_hash(ht, src.reg, reg_offset);
2010 }
2011 }
2012
2013 /* Add any update of a GRF to the HT, removing a previous write if it
2014 * wasn't read.
2015 */
2016 if (inst->dst.file == GRF) {
2017 if (inst->regs_written > 1) {
2018 /* We don't know how to trim channels from an instruction's
2019 * writes, so we can't incrementally remove unread channels from
2020           * it.  Just remove whatever it overwrites from the table.
2021 */
2022 for (int i = 0; i < inst->regs_written; i++) {
2023 remove_dead_code_hash(ht,
2024 inst->dst.reg,
2025 inst->dst.reg_offset + i);
2026 }
2027 } else {
2028 struct hash_entry *entry =
2029 get_dead_code_hash_entry(ht, inst->dst.reg,
2030 inst->dst.reg_offset);
2031
2032 if (inst->is_partial_write()) {
2033 /* For a partial write, we can't remove any previous dead code
2034              * candidate, since we're just modifying its result, but we can
2035              * be dead code eliminated ourselves.
2036 */
2037 if (entry) {
2038 entry->data = inst;
2039 } else {
2040 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2041 inst);
2042 }
2043 } else {
2044 if (entry) {
2045 /* We're completely updating a channel, and there was a
2046 * previous write to the channel that wasn't read. Kill it!
2047 */
2048 fs_inst *inst = (fs_inst *)entry->data;
2049 inst->remove();
2050 progress = true;
2051 _mesa_hash_table_remove(ht, entry);
2052 }
2053
2054 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2055 inst);
2056 }
2057 }
2058 }
2059 }
2060
2061 _mesa_hash_table_destroy(ht, NULL);
2062
2063 if (progress)
2064 invalidate_live_intervals();
2065
2066 return progress;
2067 }
2068
2069 /**
2070 * Implements a second type of register coalescing: This one checks if
2071 * the two regs involved in a raw move don't interfere, in which case
2072  * they can both be stored in the same place and the MOV removed.
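 *
 * For example (hypothetical IR), given "mov vgrf5, vgrf2" where the live
 * ranges of vgrf2 and vgrf5 don't interfere, every other reference to vgrf2
 * is rewritten to vgrf5 and the MOV is removed.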
2073 */
2074 bool
2075 fs_visitor::register_coalesce_2()
2076 {
2077 bool progress = false;
2078
2079 calculate_live_intervals();
2080
2081 foreach_list_safe(node, &this->instructions) {
2082 fs_inst *inst = (fs_inst *)node;
2083
2084 if (inst->opcode != BRW_OPCODE_MOV ||
2085 inst->is_partial_write() ||
2086 inst->saturate ||
2087 inst->src[0].file != GRF ||
2088 inst->src[0].negate ||
2089 inst->src[0].abs ||
2090 inst->src[0].smear != -1 ||
2091 inst->dst.file != GRF ||
2092 inst->dst.type != inst->src[0].type ||
2093 virtual_grf_sizes[inst->src[0].reg] != 1) {
2094 continue;
2095 }
2096
2097 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2098 int var_to = live_intervals->var_from_reg(&inst->dst);
2099
2100 if (live_intervals->vars_interfere(var_from, var_to))
2101 continue;
2102
2103 int reg_from = inst->src[0].reg;
2104 assert(inst->src[0].reg_offset == 0);
2105 int reg_to = inst->dst.reg;
2106 int reg_to_offset = inst->dst.reg_offset;
2107
2108 foreach_list(node, &this->instructions) {
2109 fs_inst *scan_inst = (fs_inst *)node;
2110
2111 if (scan_inst->dst.file == GRF &&
2112 scan_inst->dst.reg == reg_from) {
2113 scan_inst->dst.reg = reg_to;
2114 scan_inst->dst.reg_offset = reg_to_offset;
2115 }
2116 for (int i = 0; i < 3; i++) {
2117 if (scan_inst->src[i].file == GRF &&
2118 scan_inst->src[i].reg == reg_from) {
2119 scan_inst->src[i].reg = reg_to;
2120 scan_inst->src[i].reg_offset = reg_to_offset;
2121 }
2122 }
2123 }
2124
2125 inst->remove();
2126 progress = true;
2127 continue;
2128 }
2129
2130 if (progress)
2131 invalidate_live_intervals();
2132
2133 return progress;
2134 }
2135
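/**
 * Coalesces away raw MOVs from a GRF or uniform: when neither the MOV's
 * source nor its destination is written again afterwards, later reads of the
 * destination are rewritten to read the source instead and the MOV is
 * removed (a rough summary of the checks in the pass below).
 */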
2136 bool
2137 fs_visitor::register_coalesce()
2138 {
2139 bool progress = false;
2140 int if_depth = 0;
2141 int loop_depth = 0;
2142
2143 foreach_list_safe(node, &this->instructions) {
2144 fs_inst *inst = (fs_inst *)node;
2145
2146       /* Make sure that we dominate the instructions we're going to scan
2147        * for interference with our coalescing; otherwise we won't have
2148        * scanned enough to know whether anything interferes.  We don't
2149        * dominate the following instructions if we're inside a loop or an
2150        * if block.
2151 */
2152 switch (inst->opcode) {
2153 case BRW_OPCODE_DO:
2154 loop_depth++;
2155 break;
2156 case BRW_OPCODE_WHILE:
2157 loop_depth--;
2158 break;
2159 case BRW_OPCODE_IF:
2160 if_depth++;
2161 break;
2162 case BRW_OPCODE_ENDIF:
2163 if_depth--;
2164 break;
2165 default:
2166 break;
2167 }
2168 if (loop_depth || if_depth)
2169 continue;
2170
2171 if (inst->opcode != BRW_OPCODE_MOV ||
2172 inst->is_partial_write() ||
2173 inst->saturate ||
2174 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2175                                       inst->src[0].file != UNIFORM) ||
2176 inst->dst.type != inst->src[0].type)
2177 continue;
2178
2179 bool has_source_modifiers = (inst->src[0].abs ||
2180 inst->src[0].negate ||
2181 inst->src[0].smear != -1 ||
2182 inst->src[0].file == UNIFORM);
2183
2184 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2185 * them: check for no writes to either one until the exit of the
2186 * program.
2187 */
2188 bool interfered = false;
2189
2190 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2191 !scan_inst->is_tail_sentinel();
2192 scan_inst = (fs_inst *)scan_inst->next) {
2193 if (scan_inst->dst.file == GRF) {
2194 if (scan_inst->overwrites_reg(inst->dst) ||
2195 scan_inst->overwrites_reg(inst->src[0])) {
2196 interfered = true;
2197 break;
2198 }
2199 }
2200
2201 if (has_source_modifiers) {
2202 for (int i = 0; i < 3; i++) {
2203 if (scan_inst->src[i].file == GRF &&
2204 scan_inst->src[i].reg == inst->dst.reg &&
2205 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2206 inst->dst.type != scan_inst->src[i].type)
2207 {
2208 interfered = true;
2209 break;
2210 }
2211 }
2212 }
2213
2214
2215 /* The gen6 MATH instruction can't handle source modifiers or
2216 * unusual register regions, so avoid coalescing those for
2217 * now. We should do something more specific.
2218 */
2219 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2220 interfered = true;
2221 break;
2222 }
2223
2224 if (scan_inst->mlen > 0 && scan_inst->base_mrf == -1 &&
2225 scan_inst->src[0].file == GRF &&
2226 scan_inst->src[0].reg == inst->dst.reg) {
2227 interfered = true;
2228 break;
2229 }
2230
2231 /* The accumulator result appears to get used for the
2232 * conditional modifier generation. When negating a UD
2233 * value, there is a 33rd bit generated for the sign in the
2234 * accumulator value, so now you can't check, for example,
2235 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2236 */
2237 if (scan_inst->conditional_mod &&
2238 inst->src[0].negate &&
2239 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2240 interfered = true;
2241 break;
2242 }
2243 }
2244 if (interfered) {
2245 continue;
2246 }
2247
2248 /* Rewrite the later usage to point at the source of the move to
2249 * be removed.
2250 */
2251 for (fs_inst *scan_inst = inst;
2252 !scan_inst->is_tail_sentinel();
2253 scan_inst = (fs_inst *)scan_inst->next) {
2254 for (int i = 0; i < 3; i++) {
2255 if (scan_inst->src[i].file == GRF &&
2256 scan_inst->src[i].reg == inst->dst.reg &&
2257 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2258 fs_reg new_src = inst->src[0];
2259 if (scan_inst->src[i].abs) {
2260 new_src.negate = 0;
2261 new_src.abs = 1;
2262 }
2263 new_src.negate ^= scan_inst->src[i].negate;
2264 new_src.sechalf = scan_inst->src[i].sechalf;
2265 scan_inst->src[i] = new_src;
2266 }
2267 }
2268 }
2269
2270 inst->remove();
2271 progress = true;
2272 }
2273
2274 if (progress)
2275 invalidate_live_intervals();
2276
2277 return progress;
2278 }
2279
2280
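/**
 * Looks for a MOV of a GRF into an MRF and tries to make the instruction
 * that computed that GRF write into the MRF directly, removing the MOV.
 * A rough, hypothetical example:
 *
 *    add vgrf7, vgrf2, vgrf3
 *    mov m4, vgrf7           <- becomes "add m4, vgrf2, vgrf3"
 */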
2281 bool
2282 fs_visitor::compute_to_mrf()
2283 {
2284 bool progress = false;
2285 int next_ip = 0;
2286
2287 calculate_live_intervals();
2288
2289 foreach_list_safe(node, &this->instructions) {
2290 fs_inst *inst = (fs_inst *)node;
2291
2292 int ip = next_ip;
2293 next_ip++;
2294
2295 if (inst->opcode != BRW_OPCODE_MOV ||
2296 inst->is_partial_write() ||
2297 inst->dst.file != MRF || inst->src[0].file != GRF ||
2298 inst->dst.type != inst->src[0].type ||
2299 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2300 continue;
2301
2302 /* Work out which hardware MRF registers are written by this
2303 * instruction.
2304 */
2305 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2306 int mrf_high;
2307 if (inst->dst.reg & BRW_MRF_COMPR4) {
2308 mrf_high = mrf_low + 4;
2309 } else if (dispatch_width == 16 &&
2310 (!inst->force_uncompressed && !inst->force_sechalf)) {
2311 mrf_high = mrf_low + 1;
2312 } else {
2313 mrf_high = mrf_low;
2314 }
2315
2316 /* Can't compute-to-MRF this GRF if someone else was going to
2317 * read it later.
2318 */
2319 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2320 continue;
2321
2322       /* Found a move of a GRF to an MRF.  Let's see if we can rewrite
2323        * the instruction that generated this GRF to write into the MRF directly.
2324 */
2325 fs_inst *scan_inst;
2326 for (scan_inst = (fs_inst *)inst->prev;
2327 scan_inst->prev != NULL;
2328 scan_inst = (fs_inst *)scan_inst->prev) {
2329 if (scan_inst->dst.file == GRF &&
2330 scan_inst->dst.reg == inst->src[0].reg) {
2331          /* Found the last instruction to write the reg we want to turn
2332           * into a compute-to-MRF.
2333 */
2334
2335 /* If this one instruction didn't populate all the
2336 * channels, bail. We might be able to rewrite everything
2337 * that writes that reg, but it would require smarter
2338 * tracking to delay the rewriting until complete success.
2339 */
2340 if (scan_inst->is_partial_write())
2341 break;
2342
2343 /* Things returning more than one register would need us to
2344 * understand coalescing out more than one MOV at a time.
2345 */
2346 if (scan_inst->regs_written > 1)
2347 break;
2348
2349 /* SEND instructions can't have MRF as a destination. */
2350 if (scan_inst->mlen)
2351 break;
2352
2353 if (brw->gen == 6) {
2354 /* gen6 math instructions must have the destination be
2355 * GRF, so no compute-to-MRF for them.
2356 */
2357 if (scan_inst->is_math()) {
2358 break;
2359 }
2360 }
2361
2362 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2363 /* Found the creator of our MRF's source value. */
2364 scan_inst->dst.file = MRF;
2365 scan_inst->dst.reg = inst->dst.reg;
2366 scan_inst->saturate |= inst->saturate;
2367 inst->remove();
2368 progress = true;
2369 }
2370 break;
2371 }
2372
2373 /* We don't handle control flow here. Most computation of
2374        * values that end up in MRFs happens shortly before the MRF
2375 * write anyway.
2376 */
2377 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2378 break;
2379
2380 /* You can't read from an MRF, so if someone else reads our
2381 * MRF's source GRF that we wanted to rewrite, that stops us.
2382 */
2383 bool interfered = false;
2384 for (int i = 0; i < 3; i++) {
2385 if (scan_inst->src[i].file == GRF &&
2386 scan_inst->src[i].reg == inst->src[0].reg &&
2387 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2388 interfered = true;
2389 }
2390 }
2391 if (interfered)
2392 break;
2393
2394 if (scan_inst->dst.file == MRF) {
2395 /* If somebody else writes our MRF here, we can't
2396 * compute-to-MRF before that.
2397 */
2398 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2399 int scan_mrf_high;
2400
2401 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2402 scan_mrf_high = scan_mrf_low + 4;
2403 } else if (dispatch_width == 16 &&
2404 (!scan_inst->force_uncompressed &&
2405 !scan_inst->force_sechalf)) {
2406 scan_mrf_high = scan_mrf_low + 1;
2407 } else {
2408 scan_mrf_high = scan_mrf_low;
2409 }
2410
2411 if (mrf_low == scan_mrf_low ||
2412 mrf_low == scan_mrf_high ||
2413 mrf_high == scan_mrf_low ||
2414 mrf_high == scan_mrf_high) {
2415 break;
2416 }
2417 }
2418
2419 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2420 /* Found a SEND instruction, which means that there are
2421 * live values in MRFs from base_mrf to base_mrf +
2422 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2423 * above it.
2424 */
2425 if (mrf_low >= scan_inst->base_mrf &&
2426 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2427 break;
2428 }
2429 if (mrf_high >= scan_inst->base_mrf &&
2430 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2431 break;
2432 }
2433 }
2434 }
2435 }
2436
2437 if (progress)
2438 invalidate_live_intervals();
2439
2440 return progress;
2441 }
2442
2443 /**
2444 * Walks through basic blocks, looking for repeated MRF writes and
2445 * removing the later ones.
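 *
 * For example (hypothetical), if "mov m3, vgrf2" appears twice in a block
 * with no intervening write to m3 or vgrf2, the second copy is removed.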
2446 */
2447 bool
2448 fs_visitor::remove_duplicate_mrf_writes()
2449 {
2450 fs_inst *last_mrf_move[16];
2451 bool progress = false;
2452
2453 /* Need to update the MRF tracking for compressed instructions. */
2454 if (dispatch_width == 16)
2455 return false;
2456
2457 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2458
2459 foreach_list_safe(node, &this->instructions) {
2460 fs_inst *inst = (fs_inst *)node;
2461
2462 if (inst->is_control_flow()) {
2463 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2464 }
2465
2466 if (inst->opcode == BRW_OPCODE_MOV &&
2467 inst->dst.file == MRF) {
2468 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2469 if (prev_inst && inst->equals(prev_inst)) {
2470 inst->remove();
2471 progress = true;
2472 continue;
2473 }
2474 }
2475
2476 /* Clear out the last-write records for MRFs that were overwritten. */
2477 if (inst->dst.file == MRF) {
2478 last_mrf_move[inst->dst.reg] = NULL;
2479 }
2480
2481 if (inst->mlen > 0 && inst->base_mrf != -1) {
2482 /* Found a SEND instruction, which will include two or fewer
2483 * implied MRF writes. We could do better here.
2484 */
2485 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2486 last_mrf_move[inst->base_mrf + i] = NULL;
2487 }
2488 }
2489
2490 /* Clear out any MRF move records whose sources got overwritten. */
2491 if (inst->dst.file == GRF) {
2492 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2493 if (last_mrf_move[i] &&
2494 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2495 last_mrf_move[i] = NULL;
2496 }
2497 }
2498 }
2499
2500 if (inst->opcode == BRW_OPCODE_MOV &&
2501 inst->dst.file == MRF &&
2502 inst->src[0].file == GRF &&
2503 !inst->is_partial_write()) {
2504 last_mrf_move[inst->dst.reg] = inst;
2505 }
2506 }
2507
2508 if (progress)
2509 invalidate_live_intervals();
2510
2511 return progress;
2512 }
2513
2514 static void
2515 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2516 int first_grf, int grf_len)
2517 {
2518 bool inst_16wide = (dispatch_width > 8 &&
2519 !inst->force_uncompressed &&
2520 !inst->force_sechalf);
2521
2522 /* Clear the flag for registers that actually got read (as expected). */
2523 for (int i = 0; i < 3; i++) {
2524 int grf;
2525 if (inst->src[i].file == GRF) {
2526 grf = inst->src[i].reg;
2527 } else if (inst->src[i].file == HW_REG &&
2528 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2529 grf = inst->src[i].fixed_hw_reg.nr;
2530 } else {
2531 continue;
2532 }
2533
2534 if (grf >= first_grf &&
2535 grf < first_grf + grf_len) {
2536 deps[grf - first_grf] = false;
2537 if (inst_16wide)
2538 deps[grf - first_grf + 1] = false;
2539 }
2540 }
2541 }
2542
2543 /**
2544 * Implements this workaround for the original 965:
2545 *
2546 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2547 * check for post destination dependencies on this instruction, software
2548 * must ensure that there is no destination hazard for the case of ‘write
2549 * followed by a posted write’ shown in the following example.
2550 *
2551 * 1. mov r3 0
2552 * 2. send r3.xy <rest of send instruction>
2553 * 3. mov r2 r3
2554 *
2555 * Due to no post-destination dependency check on the ‘send’, the above
2556 * code sequence could have two instructions (1 and 2) in flight at the
2557 * same time that both consider ‘r3’ as the target of their final writes.
2558 */
2559 void
2560 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2561 {
2562 int reg_size = dispatch_width / 8;
2563 int write_len = inst->regs_written * reg_size;
2564 int first_write_grf = inst->dst.reg;
2565 bool needs_dep[BRW_MAX_MRF];
2566 assert(write_len < (int)sizeof(needs_dep) - 1);
2567
2568 memset(needs_dep, false, sizeof(needs_dep));
2569 memset(needs_dep, true, write_len);
2570
2571 clear_deps_for_inst_src(inst, dispatch_width,
2572 needs_dep, first_write_grf, write_len);
2573
2574 /* Walk backwards looking for writes to registers we're writing which
2575 * aren't read since being written. If we hit the start of the program,
2576 * we assume that there are no outstanding dependencies on entry to the
2577 * program.
2578 */
2579 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2580 scan_inst != NULL;
2581 scan_inst = (fs_inst *)scan_inst->prev) {
2582
2583 /* If we hit control flow, assume that there *are* outstanding
2584 * dependencies, and force their cleanup before our instruction.
2585 */
2586 if (scan_inst->is_control_flow()) {
2587 for (int i = 0; i < write_len; i++) {
2588 if (needs_dep[i]) {
2589 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2590 }
2591 }
2592 return;
2593 }
2594
2595 bool scan_inst_16wide = (dispatch_width > 8 &&
2596 !scan_inst->force_uncompressed &&
2597 !scan_inst->force_sechalf);
2598
2599       /* We insert our dependency-resolving reads as late as possible, on the
2600        * assumption that any non-MOV instruction that might have left us an
2601        * outstanding dependency has more latency than a MOV.
2602 */
2603 if (scan_inst->dst.file == GRF) {
2604 for (int i = 0; i < scan_inst->regs_written; i++) {
2605 int reg = scan_inst->dst.reg + i * reg_size;
2606
2607 if (reg >= first_write_grf &&
2608 reg < first_write_grf + write_len &&
2609 needs_dep[reg - first_write_grf]) {
2610 inst->insert_before(DEP_RESOLVE_MOV(reg));
2611 needs_dep[reg - first_write_grf] = false;
2612 if (scan_inst_16wide)
2613 needs_dep[reg - first_write_grf + 1] = false;
2614 }
2615 }
2616 }
2617
2618 /* Clear the flag for registers that actually got read (as expected). */
2619 clear_deps_for_inst_src(scan_inst, dispatch_width,
2620 needs_dep, first_write_grf, write_len);
2621
2622 /* Continue the loop only if we haven't resolved all the dependencies */
2623 int i;
2624 for (i = 0; i < write_len; i++) {
2625 if (needs_dep[i])
2626 break;
2627 }
2628 if (i == write_len)
2629 return;
2630 }
2631 }
2632
2633 /**
2634 * Implements this workaround for the original 965:
2635 *
2636 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2637 * used as a destination register until after it has been sourced by an
2638 * instruction with a different destination register.
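 *
 * Roughly, given
 *
 *    1. send r3 <rest of send instruction>
 *    2. mov r3 <something>
 *
 * we insert a MOV that sources r3 before instruction 2 (unless some other
 * instruction read r3 first), so r3 isn't reused as a destination while the
 * send's write is still outstanding.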
2639 */
2640 void
2641 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2642 {
2643 int write_len = inst->regs_written * dispatch_width / 8;
2644 int first_write_grf = inst->dst.reg;
2645 bool needs_dep[BRW_MAX_MRF];
2646 assert(write_len < (int)sizeof(needs_dep) - 1);
2647
2648 memset(needs_dep, false, sizeof(needs_dep));
2649 memset(needs_dep, true, write_len);
2650 /* Walk forwards looking for writes to registers we're writing which aren't
2651 * read before being written.
2652 */
2653 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2654 !scan_inst->is_tail_sentinel();
2655 scan_inst = (fs_inst *)scan_inst->next) {
2656 /* If we hit control flow, force resolve all remaining dependencies. */
2657 if (scan_inst->is_control_flow()) {
2658 for (int i = 0; i < write_len; i++) {
2659 if (needs_dep[i])
2660 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2661 }
2662 return;
2663 }
2664
2665 /* Clear the flag for registers that actually got read (as expected). */
2666 clear_deps_for_inst_src(scan_inst, dispatch_width,
2667 needs_dep, first_write_grf, write_len);
2668
2669 /* We insert our reads as late as possible since they're reading the
2670 * result of a SEND, which has massive latency.
2671 */
2672 if (scan_inst->dst.file == GRF &&
2673 scan_inst->dst.reg >= first_write_grf &&
2674 scan_inst->dst.reg < first_write_grf + write_len &&
2675 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2676 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2677 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2678 }
2679
2680 /* Continue the loop only if we haven't resolved all the dependencies */
2681 int i;
2682 for (i = 0; i < write_len; i++) {
2683 if (needs_dep[i])
2684 break;
2685 }
2686 if (i == write_len)
2687 return;
2688 }
2689
2690 /* If we hit the end of the program, resolve all remaining dependencies out
2691 * of paranoia.
2692 */
2693 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2694 assert(last_inst->eot);
2695 for (int i = 0; i < write_len; i++) {
2696 if (needs_dep[i])
2697 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2698 }
2699 }
2700
2701 void
2702 fs_visitor::insert_gen4_send_dependency_workarounds()
2703 {
2704 if (brw->gen != 4 || brw->is_g4x)
2705 return;
2706
2707 /* Note that we're done with register allocation, so GRF fs_regs always
2708 * have a .reg_offset of 0.
2709 */
2710
2711 foreach_list_safe(node, &this->instructions) {
2712 fs_inst *inst = (fs_inst *)node;
2713
2714 if (inst->mlen != 0 && inst->dst.file == GRF) {
2715 insert_gen4_pre_send_dependency_workarounds(inst);
2716 insert_gen4_post_send_dependency_workarounds(inst);
2717 }
2718 }
2719 }
2720
2721 /**
2722 * Turns the generic expression-style uniform pull constant load instruction
2723 * into a hardware-specific series of instructions for loading a pull
2724 * constant.
2725 *
2726 * The expression style allows the CSE pass before this to optimize out
2727 * repeated loads from the same offset, and gives the pre-register-allocation
2728 * scheduling full flexibility, while the conversion to native instructions
2729 * allows the post-register-allocation scheduler the best information
2730 * possible.
2731 *
2732 * Note that execution masking for setting up pull constant loads is special:
2733 * the channels that need to be written are unrelated to the current execution
2734 * mask, since a later instruction will use one of the result channels as a
2735 * source operand for all 8 or 16 of its channels.
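 *
 * On gen7, for example, a load at a (vec4-aligned) byte offset of 32 becomes
 * roughly:
 *
 *    FS_OPCODE_SET_SIMD4X2_OFFSET               payload, 8u
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7  dst, surf_index, payload
 *
 * (illustrative only; the exact operand setup is in the code below).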
2736 */
2737 void
2738 fs_visitor::lower_uniform_pull_constant_loads()
2739 {
2740 foreach_list(node, &this->instructions) {
2741 fs_inst *inst = (fs_inst *)node;
2742
2743 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2744 continue;
2745
2746 if (brw->gen >= 7) {
2747          /* Up to this point, the offset arg has been a vec4-aligned byte offset.
2748           * We need to turn it into a dword offset.
2749 */
2750 fs_reg const_offset_reg = inst->src[1];
2751 assert(const_offset_reg.file == IMM &&
2752 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2753 const_offset_reg.imm.u /= 4;
2754 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2755
2756 /* This is actually going to be a MOV, but since only the first dword
2757 * is accessed, we have a special opcode to do just that one. Note
2758 * that this needs to be an operation that will be considered a def
2759 * by live variable analysis, or register allocation will explode.
2760 */
2761 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2762 payload, const_offset_reg);
2763 setup->force_writemask_all = true;
2764
2765 setup->ir = inst->ir;
2766 setup->annotation = inst->annotation;
2767 inst->insert_before(setup);
2768
2769 /* Similarly, this will only populate the first 4 channels of the
2770 * result register (since we only use smear values from 0-3), but we
2771 * don't tell the optimizer.
2772 */
2773 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2774 inst->src[1] = payload;
2775
2776 invalidate_live_intervals();
2777 } else {
2778 /* Before register allocation, we didn't tell the scheduler about the
2779 * MRF we use. We know it's safe to use this MRF because nothing
2780 * else does except for register spill/unspill, which generates and
2781 * uses its MRF within a single IR instruction.
2782 */
2783 inst->base_mrf = 14;
2784 inst->mlen = 1;
2785 }
2786 }
2787 }
2788
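/**
 * Prints a single IR instruction in a compact form, e.g. (hypothetically)
 * "(+f0.0) add.sat vgrf7+1, vgrf2, u3.1, (null)".
 */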
2789 void
2790 fs_visitor::dump_instruction(backend_instruction *be_inst)
2791 {
2792 fs_inst *inst = (fs_inst *)be_inst;
2793
2794 if (inst->predicate) {
2795 printf("(%cf0.%d) ",
2796 inst->predicate_inverse ? '-' : '+',
2797 inst->flag_subreg);
2798 }
2799
2800 printf("%s", brw_instruction_name(inst->opcode));
2801 if (inst->saturate)
2802 printf(".sat");
2803 if (inst->conditional_mod) {
2804 printf(".cmod");
2805 if (!inst->predicate &&
2806 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2807 inst->opcode != BRW_OPCODE_IF &&
2808 inst->opcode != BRW_OPCODE_WHILE))) {
2809 printf(".f0.%d", inst->flag_subreg);
2810 }
2811 }
2812 printf(" ");
2813
2814
2815 switch (inst->dst.file) {
2816 case GRF:
2817 printf("vgrf%d", inst->dst.reg);
2818 if (inst->dst.reg_offset)
2819 printf("+%d", inst->dst.reg_offset);
2820 break;
2821 case MRF:
2822 printf("m%d", inst->dst.reg);
2823 break;
2824 case BAD_FILE:
2825 printf("(null)");
2826 break;
2827 case UNIFORM:
2828 printf("***u%d***", inst->dst.reg);
2829 break;
2830 default:
2831 printf("???");
2832 break;
2833 }
2834 printf(", ");
2835
2836 for (int i = 0; i < 3; i++) {
2837 if (inst->src[i].negate)
2838 printf("-");
2839 if (inst->src[i].abs)
2840 printf("|");
2841 switch (inst->src[i].file) {
2842 case GRF:
2843 printf("vgrf%d", inst->src[i].reg);
2844 if (inst->src[i].reg_offset)
2845 printf("+%d", inst->src[i].reg_offset);
2846 break;
2847 case MRF:
2848 printf("***m%d***", inst->src[i].reg);
2849 break;
2850 case UNIFORM:
2851 printf("u%d", inst->src[i].reg);
2852 if (inst->src[i].reg_offset)
2853 printf(".%d", inst->src[i].reg_offset);
2854 break;
2855 case BAD_FILE:
2856 printf("(null)");
2857 break;
2858 case IMM:
2859 switch (inst->src[i].type) {
2860 case BRW_REGISTER_TYPE_F:
2861 printf("%ff", inst->src[i].imm.f);
2862 break;
2863 case BRW_REGISTER_TYPE_D:
2864 printf("%dd", inst->src[i].imm.i);
2865 break;
2866 case BRW_REGISTER_TYPE_UD:
2867 printf("%uu", inst->src[i].imm.u);
2868 break;
2869 default:
2870 printf("???");
2871 break;
2872 }
2873 break;
2874 default:
2875 printf("???");
2876 break;
2877 }
2878 if (inst->src[i].abs)
2879 printf("|");
2880
2881 if (i < 3)
2882 printf(", ");
2883 }
2884
2885 printf(" ");
2886
2887 if (inst->force_uncompressed)
2888 printf("1sthalf ");
2889
2890 if (inst->force_sechalf)
2891 printf("2ndhalf ");
2892
2893 printf("\n");
2894 }
2895
2896 /**
2897 * Possibly returns an instruction that set up @param reg.
2898 *
2899 * Sometimes we want to take the result of some expression/variable
2900 * dereference tree and rewrite the instruction generating the result
2901 * of the tree. When processing the tree, we know that the
2902 * instructions generated are all writing temporaries that are dead
2903 * outside of this tree. So, if we have some instructions that write
2904 * a temporary, we're free to point that temp write somewhere else.
2905 *
2906  * Note that this doesn't guarantee that the returned instruction wrote
2907  * only reg -- it might be the size=4 destination of a texture instruction.
2908 */
2909 fs_inst *
2910 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2911 fs_inst *end,
2912 fs_reg reg)
2913 {
2914 if (end == start ||
2915 end->is_partial_write() ||
2916 reg.reladdr ||
2917 !reg.equals(end->dst)) {
2918 return NULL;
2919 } else {
2920 return end;
2921 }
2922 }
2923
2924 void
2925 fs_visitor::setup_payload_gen6()
2926 {
2927 bool uses_depth =
2928 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2929 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2930
2931 assert(brw->gen >= 6);
2932
2933 /* R0-1: masks, pixel X/Y coordinates. */
2934 c->nr_payload_regs = 2;
2935    /* R2: only for 32-pixel dispatch. */
2936
2937 /* R3-26: barycentric interpolation coordinates. These appear in the
2938 * same order that they appear in the brw_wm_barycentric_interp_mode
2939 * enum. Each set of coordinates occupies 2 registers if dispatch width
2940 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2941 * appear if they were enabled using the "Barycentric Interpolation
2942 * Mode" bits in WM_STATE.
2943 */
2944 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2945 if (barycentric_interp_modes & (1 << i)) {
2946 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2947 c->nr_payload_regs += 2;
2948 if (dispatch_width == 16) {
2949 c->nr_payload_regs += 2;
2950 }
2951 }
2952 }
2953
2954    /* R27: interpolated depth, if the shader uses source depth */
2955 if (uses_depth) {
2956 c->source_depth_reg = c->nr_payload_regs;
2957 c->nr_payload_regs++;
2958 if (dispatch_width == 16) {
2959 /* R28: interpolated depth if not 8-wide. */
2960 c->nr_payload_regs++;
2961 }
2962 }
2963 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2964 if (uses_depth) {
2965 c->source_w_reg = c->nr_payload_regs;
2966 c->nr_payload_regs++;
2967 if (dispatch_width == 16) {
2968 /* R30: interpolated W if not 8-wide. */
2969 c->nr_payload_regs++;
2970 }
2971 }
2972 /* R31: MSAA position offsets. */
2973 /* R32-: bary for 32-pixel. */
2974 /* R58-59: interp W for 32-pixel. */
2975
2976 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2977 c->source_depth_to_render_target = true;
2978 }
2979 }
2980
2981 void
2982 fs_visitor::assign_binding_table_offsets()
2983 {
2984 c->prog_data.binding_table.render_target_start = SURF_INDEX_DRAW(0);
2985 c->prog_data.base.binding_table.texture_start = SURF_INDEX_TEXTURE(0);
2986 c->prog_data.base.binding_table.ubo_start = SURF_INDEX_WM_UBO(0);
2987 c->prog_data.base.binding_table.shader_time_start = SURF_INDEX_WM_SHADER_TIME;
2988 c->prog_data.base.binding_table.gather_texture_start = SURF_INDEX_GATHER_TEXTURE(0);
2989 c->prog_data.base.binding_table.pull_constants_start = SURF_INDEX_FRAG_CONST_BUFFER;
2990
2991 /* c->prog_data.base.binding_table.size will be set by mark_surface_used. */
2992 }
2993
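/**
 * Drives the compile of one fragment shader variant: payload setup, IR
 * generation, the optimization loop, scheduling, and register allocation
 * (an overview of the steps below).
 */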
2994 bool
2995 fs_visitor::run()
2996 {
2997 sanity_param_count = fp->Base.Parameters->NumParameters;
2998 uint32_t orig_nr_params = c->prog_data.nr_params;
2999
3000 assign_binding_table_offsets();
3001
3002 if (brw->gen >= 6)
3003 setup_payload_gen6();
3004 else
3005 setup_payload_gen4();
3006
3007 if (0) {
3008 emit_dummy_fs();
3009 } else {
3010 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3011 emit_shader_time_begin();
3012
3013 calculate_urb_setup();
3014 if (brw->gen < 6)
3015 emit_interpolation_setup_gen4();
3016 else
3017 emit_interpolation_setup_gen6();
3018
3019 /* We handle discards by keeping track of the still-live pixels in f0.1.
3020 * Initialize it with the dispatched pixels.
3021 */
3022 if (fp->UsesKill) {
3023 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3024 discard_init->flag_subreg = 1;
3025 }
3026
3027       /* Generate FS IR for main().  (The visitor only descends into
3028        * functions called "main".)
3029 */
3030 if (shader) {
3031 foreach_list(node, &*shader->ir) {
3032 ir_instruction *ir = (ir_instruction *)node;
3033 base_ir = ir;
3034 this->result = reg_undef;
3035 ir->accept(this);
3036 }
3037 } else {
3038 emit_fragment_program_code();
3039 }
3040 base_ir = NULL;
3041 if (failed)
3042 return false;
3043
3044 emit(FS_OPCODE_PLACEHOLDER_HALT);
3045
3046 emit_fb_writes();
3047
3048 split_virtual_grfs();
3049
3050 move_uniform_array_access_to_pull_constants();
3051 setup_pull_constants();
3052
3053 bool progress;
3054 do {
3055 progress = false;
3056
3057 compact_virtual_grfs();
3058
3059 progress = remove_duplicate_mrf_writes() || progress;
3060
3061 progress = opt_algebraic() || progress;
3062 progress = opt_cse() || progress;
3063 progress = opt_copy_propagate() || progress;
3064 progress = dead_code_eliminate() || progress;
3065 progress = dead_code_eliminate_local() || progress;
3066 progress = register_coalesce() || progress;
3067 progress = register_coalesce_2() || progress;
3068 progress = compute_to_mrf() || progress;
3069 } while (progress);
3070
3071 remove_dead_constants();
3072
3073 schedule_instructions(false);
3074
3075 lower_uniform_pull_constant_loads();
3076
3077 assign_curb_setup();
3078 assign_urb_setup();
3079
3080 if (0) {
3081 /* Debug of register spilling: Go spill everything. */
3082 for (int i = 0; i < virtual_grf_count; i++) {
3083 spill_reg(i);
3084 }
3085 }
3086
3087 if (0)
3088 assign_regs_trivial();
3089 else {
3090 while (!assign_regs()) {
3091 if (failed)
3092 break;
3093 }
3094 }
3095 }
3096 assert(force_uncompressed_stack == 0);
3097 assert(force_sechalf_stack == 0);
3098
3099 /* This must come after all optimization and register allocation, since
3100 * it inserts dead code that happens to have side effects, and it does
3101 * so based on the actual physical registers in use.
3102 */
3103 insert_gen4_send_dependency_workarounds();
3104
3105 if (failed)
3106 return false;
3107
3108 schedule_instructions(true);
3109
3110 if (dispatch_width == 8) {
3111 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3112 } else {
3113 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3114
3115 /* Make sure we didn't try to sneak in an extra uniform */
3116 assert(orig_nr_params == c->prog_data.nr_params);
3117 (void) orig_nr_params;
3118 }
3119
3120 /* If any state parameters were appended, then ParameterValues could have
3121 * been realloced, in which case the driver uniform storage set up by
3122 * _mesa_associate_uniform_storage() would point to freed memory. Make
3123 * sure that didn't happen.
3124 */
3125 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3126
3127 return !failed;
3128 }
3129
3130 const unsigned *
3131 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3132 struct gl_fragment_program *fp,
3133 struct gl_shader_program *prog,
3134 unsigned *final_assembly_size)
3135 {
3136 bool start_busy = false;
3137 float start_time = 0;
3138
3139 if (unlikely(brw->perf_debug)) {
3140 start_busy = (brw->batch.last_bo &&
3141 drm_intel_bo_busy(brw->batch.last_bo));
3142 start_time = get_time();
3143 }
3144
3145 struct brw_shader *shader = NULL;
3146 if (prog)
3147 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3148
3149 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3150 if (prog) {
3151 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3152 _mesa_print_ir(shader->ir, NULL);
3153 printf("\n\n");
3154 } else {
3155 printf("ARB_fragment_program %d ir for native fragment shader\n",
3156 fp->Base.Id);
3157 _mesa_print_program(&fp->Base);
3158 }
3159 }
3160
3161 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3162 */
3163 fs_visitor v(brw, c, prog, fp, 8);
3164 if (!v.run()) {
3165 if (prog) {
3166 prog->LinkStatus = false;
3167 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3168 }
3169
3170 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3171 v.fail_msg);
3172
3173 return NULL;
3174 }
3175
3176 exec_list *simd16_instructions = NULL;
3177 fs_visitor v2(brw, c, prog, fp, 16);
3178 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3179 if (c->prog_data.nr_pull_params == 0) {
3180 /* Try a 16-wide compile */
3181 v2.import_uniforms(&v);
3182 if (!v2.run()) {
3183 perf_debug("16-wide shader failed to compile, falling back to "
3184 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3185 } else {
3186 simd16_instructions = &v2.instructions;
3187 }
3188 } else {
3189 perf_debug("Skipping 16-wide due to pull parameters.\n");
3190 }
3191 }
3192
3193 c->prog_data.dispatch_width = 8;
3194
3195 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3196 const unsigned *generated = g.generate_assembly(&v.instructions,
3197 simd16_instructions,
3198 final_assembly_size);
3199
3200 if (unlikely(brw->perf_debug) && shader) {
3201 if (shader->compiled_once)
3202 brw_wm_debug_recompile(brw, prog, &c->key);
3203 shader->compiled_once = true;
3204
3205 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3206 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3207 (get_time() - start_time) * 1000);
3208 }
3209 }
3210
3211 return generated;
3212 }
3213
3214 bool
3215 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3216 {
3217 struct brw_context *brw = brw_context(ctx);
3218 struct brw_wm_prog_key key;
3219
3220 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3221 return true;
3222
3223 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3224 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3225 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3226 bool program_uses_dfdy = fp->UsesDFdy;
3227
3228 memset(&key, 0, sizeof(key));
3229
3230 if (brw->gen < 6) {
3231 if (fp->UsesKill)
3232 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3233
3234 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3235 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3236
3237 /* Just assume depth testing. */
3238 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3239 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3240 }
3241
3242 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3243 BRW_FS_VARYING_INPUT_MASK) > 16)
3244 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3245
3246 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3247
3248 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3249 for (unsigned i = 0; i < sampler_count; i++) {
3250 if (fp->Base.ShadowSamplers & (1 << i)) {
3251 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3252 key.tex.swizzles[i] =
3253 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3254 } else {
3255 /* Color sampler: assume no swizzling. */
3256 key.tex.swizzles[i] = SWIZZLE_XYZW;
3257 }
3258 }
3259
3260 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3261 key.drawable_height = ctx->DrawBuffer->Height;
3262 }
3263
3264 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3265 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3266 }
3267
3268 key.nr_color_regions = 1;
3269
3270 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3271 * quality of the derivatives is likely to be determined by the driconf
3272 * option.
3273 */
3274 key.high_quality_derivatives = brw->disable_derivative_optimization;
3275
3276 key.program_string_id = bfp->id;
3277
3278 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3279 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3280
3281 bool success = do_wm_prog(brw, prog, bfp, &key);
3282
3283 brw->wm.base.prog_offset = old_prog_offset;
3284 brw->wm.prog_data = old_prog_data;
3285
3286 return success;
3287 }