i965/fs: Simplify computation of key.input_slots_valid during precompile.
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/uniforms.h"
39 #include "main/fbobject.h"
40 #include "program/prog_parameter.h"
41 #include "program/prog_print.h"
42 #include "program/register_allocate.h"
43 #include "program/sampler.h"
44 #include "program/hash_table.h"
45 #include "brw_context.h"
46 #include "brw_eu.h"
47 #include "brw_wm.h"
48 }
49 #include "brw_fs.h"
50 #include "glsl/glsl_types.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63
64 /* This will be the case for almost all instructions. */
65 this->regs_written = 1;
66 }
67
68 fs_inst::fs_inst()
69 {
70 init();
71 }
72
73 fs_inst::fs_inst(enum opcode opcode)
74 {
75 init();
76 this->opcode = opcode;
77 }
78
79 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
80 {
81 init();
82 this->opcode = opcode;
83 this->dst = dst;
84
85 if (dst.file == GRF)
86 assert(dst.reg_offset >= 0);
87 }
88
89 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
90 {
91 init();
92 this->opcode = opcode;
93 this->dst = dst;
94 this->src[0] = src0;
95
96 if (dst.file == GRF)
97 assert(dst.reg_offset >= 0);
98 if (src[0].file == GRF)
99 assert(src[0].reg_offset >= 0);
100 }
101
102 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
103 {
104 init();
105 this->opcode = opcode;
106 this->dst = dst;
107 this->src[0] = src0;
108 this->src[1] = src1;
109
110 if (dst.file == GRF)
111 assert(dst.reg_offset >= 0);
112 if (src[0].file == GRF)
113 assert(src[0].reg_offset >= 0);
114 if (src[1].file == GRF)
115 assert(src[1].reg_offset >= 0);
116 }
117
118 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
119 fs_reg src0, fs_reg src1, fs_reg src2)
120 {
121 init();
122 this->opcode = opcode;
123 this->dst = dst;
124 this->src[0] = src0;
125 this->src[1] = src1;
126 this->src[2] = src2;
127
128 if (dst.file == GRF)
129 assert(dst.reg_offset >= 0);
130 if (src[0].file == GRF)
131 assert(src[0].reg_offset >= 0);
132 if (src[1].file == GRF)
133 assert(src[1].reg_offset >= 0);
134 if (src[2].file == GRF)
135 assert(src[2].reg_offset >= 0);
136 }
137
138 #define ALU1(op) \
139 fs_inst * \
140 fs_visitor::op(fs_reg dst, fs_reg src0) \
141 { \
142 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
143 }
144
145 #define ALU2(op) \
146 fs_inst * \
147 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
148 { \
149 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
150 }
151
152 #define ALU3(op) \
153 fs_inst * \
154 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
155 { \
156 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
157 }
158
159 ALU1(NOT)
160 ALU1(MOV)
161 ALU1(FRC)
162 ALU1(RNDD)
163 ALU1(RNDE)
164 ALU1(RNDZ)
165 ALU2(ADD)
166 ALU2(MUL)
167 ALU2(MACH)
168 ALU2(AND)
169 ALU2(OR)
170 ALU2(XOR)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
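
/* Illustrative expansion (not part of the upstream file): ALU2(ADD)
 * above generates
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * so a call like emit(ADD(dst, a, b)) allocates one ADD fs_inst out of
 * mem_ctx and appends it to the instruction stream.
 */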
183
184 /** Gen4 predicated IF. */
185 fs_inst *
186 fs_visitor::IF(uint32_t predicate)
187 {
188 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
189 inst->predicate = predicate;
190 return inst;
191 }
192
193 /** Gen6+ IF with embedded comparison. */
194 fs_inst *
195 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
196 {
197 assert(brw->gen >= 6);
198 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
199 reg_null_d, src0, src1);
200 inst->conditional_mod = condition;
201 return inst;
202 }
203
204 /**
205 * CMP: Sets the low bit of the destination channels with the result
206 * of the comparison, while the upper bits are undefined, and updates
207 * the flag register with the packed 16 bits of the result.
208 */
209 fs_inst *
210 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
211 {
212 fs_inst *inst;
213
214 /* Take the instruction:
215 *
216 * CMP null<d> src0<f> src1<f>
217 *
218 * Original gen4 does type conversion to the destination type before
219 * comparison, producing garbage results for floating point comparisons.
220 * gen5 does the comparison on the execution type (resolved source types),
221 * so dst type doesn't matter. gen6 does comparison and then uses the
222 * result as if it was the dst type with no conversion, which happens to
223 * mostly work out for float-interpreted-as-int since our comparisons are
224 * for >0, =0, <0.
225 */
226 if (brw->gen == 4) {
227 dst.type = src0.type;
228 if (dst.file == HW_REG)
229 dst.fixed_hw_reg.type = dst.type;
230 }
231
232 resolve_ud_negate(&src0);
233 resolve_ud_negate(&src1);
234
235 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
236 inst->conditional_mod = condition;
237
238 return inst;
239 }
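
/* Sketch of typical usage (assumed caller pattern, not code from this
 * file): pre-gen6 conditionals are built from a CMP that writes the
 * flag register followed by a predicated IF, while gen6+ folds the
 * comparison into the IF itself:
 *
 *    if (brw->gen >= 6) {
 *       emit(IF(x, fs_reg(0.0f), BRW_CONDITIONAL_G));
 *    } else {
 *       emit(CMP(reg_null_d, x, fs_reg(0.0f), BRW_CONDITIONAL_G));
 *       emit(IF(BRW_PREDICATE_NORMAL));
 *    }
 */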
240
241 exec_list
242 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
243 fs_reg varying_offset,
244 uint32_t const_offset)
245 {
246 exec_list instructions;
247 fs_inst *inst;
248
249 /* We have our constant surface use a pitch of 4 bytes, so our index can
250 * be any component of a vector, and then we load 4 contiguous
251 * components starting from that.
252 *
253 * We break down the const_offset to a portion added to the variable
254 * offset and a portion done using reg_offset, which means that if you
255 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
256 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
257 * CSE can later notice that those loads are all the same and eliminate
258 * the redundant ones.
259 */
260 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
261 instructions.push_tail(ADD(vec4_offset,
262 varying_offset, const_offset & ~3));
263
264 int scale = 1;
265 if (brw->gen == 4 && dispatch_width == 8) {
266 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
267 * u, v, r) as parameters, or we can just use the SIMD16 message
268 * consisting of (header, u). We choose the second, at the cost of a
269 * longer return length.
270 */
271 scale = 2;
272 }
273
274 enum opcode op;
275 if (brw->gen >= 7)
276 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
277 else
278 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
279 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
280 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
281 inst->regs_written = 4 * scale;
282 instructions.push_tail(inst);
283
284 if (brw->gen < 7) {
285 inst->base_mrf = 13;
286 inst->header_present = true;
287 if (brw->gen == 4)
288 inst->mlen = 3;
289 else
290 inst->mlen = 1 + dispatch_width / 8;
291 }
292
293 vec4_result.reg_offset += (const_offset & 3) * scale;
294 instructions.push_tail(MOV(dst, vec4_result));
295
296 return instructions;
297 }
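
/* Worked example (illustrative): with const_offset == 6 and scale == 1,
 * the ADD above computes vec4_offset = varying_offset + 4 (6 & ~3), the
 * pull load fills vec4_result with four contiguous components starting
 * there, and the trailing MOV reads reg_offset 2 (6 & 3), i.e. the third
 * component of the loaded vec4.
 */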
298
299 /**
300 * A helper for MOV generation for fixing up broken hardware SEND dependency
301 * handling.
302 */
303 fs_inst *
304 fs_visitor::DEP_RESOLVE_MOV(int grf)
305 {
306 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
307
308 inst->ir = NULL;
309 inst->annotation = "send dependency resolve";
310
311 /* The caller always wants uncompressed to emit the minimal extra
312 * dependencies, and to avoid having to deal with aligning its regs to 2.
313 */
314 inst->force_uncompressed = true;
315
316 return inst;
317 }
318
319 bool
320 fs_inst::equals(fs_inst *inst)
321 {
322 return (opcode == inst->opcode &&
323 dst.equals(inst->dst) &&
324 src[0].equals(inst->src[0]) &&
325 src[1].equals(inst->src[1]) &&
326 src[2].equals(inst->src[2]) &&
327 saturate == inst->saturate &&
328 predicate == inst->predicate &&
329 conditional_mod == inst->conditional_mod &&
330 mlen == inst->mlen &&
331 base_mrf == inst->base_mrf &&
332 sampler == inst->sampler &&
333 target == inst->target &&
334 eot == inst->eot &&
335 header_present == inst->header_present &&
336 shadow_compare == inst->shadow_compare &&
337 offset == inst->offset);
338 }
339
340 bool
341 fs_inst::overwrites_reg(const fs_reg &reg)
342 {
343 return (reg.file == dst.file &&
344 reg.reg == dst.reg &&
345 reg.reg_offset >= dst.reg_offset &&
346 reg.reg_offset < dst.reg_offset + regs_written);
347 }
348
349 bool
350 fs_inst::is_send_from_grf()
351 {
352 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
353 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
354 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
355 src[1].file == GRF));
356 }
357
358 bool
359 fs_visitor::can_do_source_mods(fs_inst *inst)
360 {
361 if (brw->gen == 6 && inst->is_math())
362 return false;
363
364 if (inst->is_send_from_grf())
365 return false;
366
367 return true;
368 }
369
370 void
371 fs_reg::init()
372 {
373 memset(this, 0, sizeof(*this));
374 this->smear = -1;
375 }
376
377 /** Generic unset register constructor. */
378 fs_reg::fs_reg()
379 {
380 init();
381 this->file = BAD_FILE;
382 }
383
384 /** Immediate value constructor. */
385 fs_reg::fs_reg(float f)
386 {
387 init();
388 this->file = IMM;
389 this->type = BRW_REGISTER_TYPE_F;
390 this->imm.f = f;
391 }
392
393 /** Immediate value constructor. */
394 fs_reg::fs_reg(int32_t i)
395 {
396 init();
397 this->file = IMM;
398 this->type = BRW_REGISTER_TYPE_D;
399 this->imm.i = i;
400 }
401
402 /** Immediate value constructor. */
403 fs_reg::fs_reg(uint32_t u)
404 {
405 init();
406 this->file = IMM;
407 this->type = BRW_REGISTER_TYPE_UD;
408 this->imm.u = u;
409 }
410
411 /** Fixed brw_reg Immediate value constructor. */
412 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
413 {
414 init();
415 this->file = HW_REG;
416 this->fixed_hw_reg = fixed_hw_reg;
417 this->type = fixed_hw_reg.type;
418 }
419
420 bool
421 fs_reg::equals(const fs_reg &r) const
422 {
423 return (file == r.file &&
424 reg == r.reg &&
425 reg_offset == r.reg_offset &&
426 type == r.type &&
427 negate == r.negate &&
428 abs == r.abs &&
429 !reladdr && !r.reladdr &&
430 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
431 sizeof(fixed_hw_reg)) == 0 &&
432 smear == r.smear &&
433 imm.u == r.imm.u);
434 }
435
436 bool
437 fs_reg::is_zero() const
438 {
439 if (file != IMM)
440 return false;
441
442 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
443 }
444
445 bool
446 fs_reg::is_one() const
447 {
448 if (file != IMM)
449 return false;
450
451 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
452 }
453
454 bool
455 fs_reg::is_valid_3src() const
456 {
457 return file == GRF || file == UNIFORM;
458 }
459
460 int
461 fs_visitor::type_size(const struct glsl_type *type)
462 {
463 unsigned int size, i;
464
465 switch (type->base_type) {
466 case GLSL_TYPE_UINT:
467 case GLSL_TYPE_INT:
468 case GLSL_TYPE_FLOAT:
469 case GLSL_TYPE_BOOL:
470 return type->components();
471 case GLSL_TYPE_ARRAY:
472 return type_size(type->fields.array) * type->length;
473 case GLSL_TYPE_STRUCT:
474 size = 0;
475 for (i = 0; i < type->length; i++) {
476 size += type_size(type->fields.structure[i].type);
477 }
478 return size;
479 case GLSL_TYPE_SAMPLER:
480 /* Samplers take up no register space, since they're baked in at
481 * link time.
482 */
483 return 0;
484 case GLSL_TYPE_VOID:
485 case GLSL_TYPE_ERROR:
486 case GLSL_TYPE_INTERFACE:
487 assert(!"not reached");
488 break;
489 }
490
491 return 0;
492 }
493
494 fs_reg
495 fs_visitor::get_timestamp()
496 {
497 assert(brw->gen >= 7);
498
499 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
500 BRW_ARF_TIMESTAMP,
501 0),
502 BRW_REGISTER_TYPE_UD));
503
504 fs_reg dst = fs_reg(this, glsl_type::uint_type);
505
506 fs_inst *mov = emit(MOV(dst, ts));
507 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
508 * even if it's not enabled in the dispatch.
509 */
510 mov->force_writemask_all = true;
511 mov->force_uncompressed = true;
512
513 /* The caller wants the low 32 bits of the timestamp. Since it's running
514 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
515 * which is plenty of time for our purposes. It is identical across the
516 * EUs, but since it's tracking GPU core speed it will increment at a
517 * varying rate as render P-states change.
518 *
519 * The caller could also check if render P-states have changed (or anything
520 * else that might disrupt timing) by setting smear to 2 and checking if
521 * that field is != 0.
522 */
523 dst.smear = 0;
524
525 return dst;
526 }
527
528 void
529 fs_visitor::emit_shader_time_begin()
530 {
531 current_annotation = "shader time start";
532 shader_start_time = get_timestamp();
533 }
534
535 void
536 fs_visitor::emit_shader_time_end()
537 {
538 current_annotation = "shader time end";
539
540 enum shader_time_shader_type type, written_type, reset_type;
541 if (dispatch_width == 8) {
542 type = ST_FS8;
543 written_type = ST_FS8_WRITTEN;
544 reset_type = ST_FS8_RESET;
545 } else {
546 assert(dispatch_width == 16);
547 type = ST_FS16;
548 written_type = ST_FS16_WRITTEN;
549 reset_type = ST_FS16_RESET;
550 }
551
552 fs_reg shader_end_time = get_timestamp();
553
554 /* Check that there weren't any timestamp reset events (assuming these
555 * were the only two timestamp reads that happened).
556 */
557 fs_reg reset = shader_end_time;
558 reset.smear = 2;
559 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
560 test->conditional_mod = BRW_CONDITIONAL_Z;
561 emit(IF(BRW_PREDICATE_NORMAL));
562
563 push_force_uncompressed();
564 fs_reg start = shader_start_time;
565 start.negate = true;
566 fs_reg diff = fs_reg(this, glsl_type::uint_type);
567 emit(ADD(diff, start, shader_end_time));
568
569 /* If there were no instructions between the two timestamp gets, the diff
570 * is 2 cycles. Remove that overhead, so I can forget about that when
571 * trying to determine the time taken for single instructions.
572 */
573 emit(ADD(diff, diff, fs_reg(-2u)));
574
575 emit_shader_time_write(type, diff);
576 emit_shader_time_write(written_type, fs_reg(1u));
577 emit(BRW_OPCODE_ELSE);
578 emit_shader_time_write(reset_type, fs_reg(1u));
579 emit(BRW_OPCODE_ENDIF);
580
581 pop_force_uncompressed();
582 }
583
584 void
585 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
586 fs_reg value)
587 {
588 int shader_time_index =
589 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
590 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
591
592 fs_reg payload;
593 if (dispatch_width == 8)
594 payload = fs_reg(this, glsl_type::uvec2_type);
595 else
596 payload = fs_reg(this, glsl_type::uint_type);
597
598 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
599 fs_reg(), payload, offset, value));
600 }
601
602 void
603 fs_visitor::fail(const char *format, ...)
604 {
605 va_list va;
606 char *msg;
607
608 if (failed)
609 return;
610
611 failed = true;
612
613 va_start(va, format);
614 msg = ralloc_vasprintf(mem_ctx, format, va);
615 va_end(va);
616 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
617
618 this->fail_msg = msg;
619
620 if (INTEL_DEBUG & DEBUG_WM) {
621 fprintf(stderr, "%s", msg);
622 }
623 }
624
625 fs_inst *
626 fs_visitor::emit(enum opcode opcode)
627 {
628 return emit(fs_inst(opcode));
629 }
630
631 fs_inst *
632 fs_visitor::emit(enum opcode opcode, fs_reg dst)
633 {
634 return emit(fs_inst(opcode, dst));
635 }
636
637 fs_inst *
638 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
639 {
640 return emit(fs_inst(opcode, dst, src0));
641 }
642
643 fs_inst *
644 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
645 {
646 return emit(fs_inst(opcode, dst, src0, src1));
647 }
648
649 fs_inst *
650 fs_visitor::emit(enum opcode opcode, fs_reg dst,
651 fs_reg src0, fs_reg src1, fs_reg src2)
652 {
653 return emit(fs_inst(opcode, dst, src0, src1, src2));
654 }
655
656 void
657 fs_visitor::push_force_uncompressed()
658 {
659 force_uncompressed_stack++;
660 }
661
662 void
663 fs_visitor::pop_force_uncompressed()
664 {
665 force_uncompressed_stack--;
666 assert(force_uncompressed_stack >= 0);
667 }
668
669 void
670 fs_visitor::push_force_sechalf()
671 {
672 force_sechalf_stack++;
673 }
674
675 void
676 fs_visitor::pop_force_sechalf()
677 {
678 force_sechalf_stack--;
679 assert(force_sechalf_stack >= 0);
680 }
681
682 /**
683 * Returns true if the instruction has a flag that means it won't
684 * update an entire destination register.
685 *
686 * For example, dead code elimination and live variable analysis want to know
687 * when a write to a variable screens off any preceding values that were in
688 * it.
689 */
690 bool
691 fs_inst::is_partial_write()
692 {
693 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
694 this->force_uncompressed ||
695 this->force_sechalf);
696 }
697
698 /**
699 * Returns how many MRFs an FS opcode will write over.
700 *
701 * Note that this is not the 0 or 1 implied writes in an actual gen
702 * instruction -- the FS opcodes often generate MOVs in addition.
703 */
704 int
705 fs_visitor::implied_mrf_writes(fs_inst *inst)
706 {
707 if (inst->mlen == 0)
708 return 0;
709
710 switch (inst->opcode) {
711 case SHADER_OPCODE_RCP:
712 case SHADER_OPCODE_RSQ:
713 case SHADER_OPCODE_SQRT:
714 case SHADER_OPCODE_EXP2:
715 case SHADER_OPCODE_LOG2:
716 case SHADER_OPCODE_SIN:
717 case SHADER_OPCODE_COS:
718 return 1 * dispatch_width / 8;
719 case SHADER_OPCODE_POW:
720 case SHADER_OPCODE_INT_QUOTIENT:
721 case SHADER_OPCODE_INT_REMAINDER:
722 return 2 * dispatch_width / 8;
723 case SHADER_OPCODE_TEX:
724 case FS_OPCODE_TXB:
725 case SHADER_OPCODE_TXD:
726 case SHADER_OPCODE_TXF:
727 case SHADER_OPCODE_TXF_MS:
728 case SHADER_OPCODE_TXL:
729 case SHADER_OPCODE_TXS:
730 case SHADER_OPCODE_LOD:
731 return 1;
732 case FS_OPCODE_FB_WRITE:
733 return 2;
734 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
735 case FS_OPCODE_UNSPILL:
736 return 1;
737 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
738 return inst->mlen;
739 case FS_OPCODE_SPILL:
740 return 2;
741 default:
742 assert(!"not reached");
743 return inst->mlen;
744 }
745 }
746
747 int
748 fs_visitor::virtual_grf_alloc(int size)
749 {
750 if (virtual_grf_array_size <= virtual_grf_count) {
751 if (virtual_grf_array_size == 0)
752 virtual_grf_array_size = 16;
753 else
754 virtual_grf_array_size *= 2;
755 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
756 virtual_grf_array_size);
757 }
758 virtual_grf_sizes[virtual_grf_count] = size;
759 return virtual_grf_count++;
760 }
761
762 /** Fixed HW reg constructor. */
763 fs_reg::fs_reg(enum register_file file, int reg)
764 {
765 init();
766 this->file = file;
767 this->reg = reg;
768 this->type = BRW_REGISTER_TYPE_F;
769 }
770
771 /** Fixed HW reg constructor. */
772 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
773 {
774 init();
775 this->file = file;
776 this->reg = reg;
777 this->type = type;
778 }
779
780 /** Automatic reg constructor. */
781 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
782 {
783 init();
784
785 this->file = GRF;
786 this->reg = v->virtual_grf_alloc(v->type_size(type));
787 this->reg_offset = 0;
788 this->type = brw_type_for_base_type(type);
789 }
790
791 fs_reg *
792 fs_visitor::variable_storage(ir_variable *var)
793 {
794 return (fs_reg *)hash_table_find(this->variable_ht, var);
795 }
796
797 void
798 import_uniforms_callback(const void *key,
799 void *data,
800 void *closure)
801 {
802 struct hash_table *dst_ht = (struct hash_table *)closure;
803 const fs_reg *reg = (const fs_reg *)data;
804
805 if (reg->file != UNIFORM)
806 return;
807
808 hash_table_insert(dst_ht, data, key);
809 }
810
811 /* For 16-wide, we need to start from the uniform setup of the 8-wide
812 * dispatch. This brings in those uniform definitions.
813 */
814 void
815 fs_visitor::import_uniforms(fs_visitor *v)
816 {
817 hash_table_call_foreach(v->variable_ht,
818 import_uniforms_callback,
819 variable_ht);
820 this->params_remap = v->params_remap;
821 this->nr_params_remap = v->nr_params_remap;
822 }
823
824 /* Our support for uniforms is piggy-backed on the struct
825 * gl_fragment_program, because that's where the values actually
826 * get stored, rather than in some global gl_shader_program uniform
827 * store.
828 */
829 void
830 fs_visitor::setup_uniform_values(ir_variable *ir)
831 {
832 int namelen = strlen(ir->name);
833
834 /* The data for our (non-builtin) uniforms is stored in a series of
835 * gl_uniform_driver_storage structs for each subcomponent that
836 * glGetUniformLocation() could name. We know it's been set up in the same
837 * order we'd walk the type, so walk the list of storage and find anything
838 * with our name, or the prefix of a component that starts with our name.
839 */
840 unsigned params_before = c->prog_data.nr_params;
841 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
842 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
843
844 if (strncmp(ir->name, storage->name, namelen) != 0 ||
845 (storage->name[namelen] != 0 &&
846 storage->name[namelen] != '.' &&
847 storage->name[namelen] != '[')) {
848 continue;
849 }
850
851 unsigned slots = storage->type->component_slots();
852 if (storage->array_elements)
853 slots *= storage->array_elements;
854
855 for (unsigned i = 0; i < slots; i++) {
856 c->prog_data.param[c->prog_data.nr_params++] =
857 &storage->storage[i].f;
858 }
859 }
860
861 /* Make sure we actually initialized the right amount of stuff here. */
862 assert(params_before + ir->type->component_slots() ==
863 c->prog_data.nr_params);
864 (void)params_before;
865 }
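
/* Example of the matching rule above (illustrative): for
 * "uniform vec4 color[2]", ir->name is "color", so storage entries named
 * "color" or "color[0]" match, since the character after the prefix is
 * '\0', '.' or '['. An unrelated uniform "color2" is skipped, because
 * '2' is none of those delimiters.
 */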
866
867
868 /* Our support for builtin uniforms is even scarier than non-builtin.
869 * It sits on top of the PROG_STATE_VAR parameters that are
870 * automatically updated from GL context state.
871 */
872 void
873 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
874 {
875 const ir_state_slot *const slots = ir->state_slots;
876 assert(ir->state_slots != NULL);
877
878 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
879 /* This state reference has already been setup by ir_to_mesa, but we'll
880 * get the same index back here.
881 */
882 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
883 (gl_state_index *)slots[i].tokens);
884
885 /* Add each of the unique swizzles of the element as a parameter.
886 * This'll end up matching the expected layout of the
887 * array/matrix/structure we're trying to fill in.
888 */
889 int last_swiz = -1;
890 for (unsigned int j = 0; j < 4; j++) {
891 int swiz = GET_SWZ(slots[i].swizzle, j);
892 if (swiz == last_swiz)
893 break;
894 last_swiz = swiz;
895
896 c->prog_data.param[c->prog_data.nr_params++] =
897 &fp->Base.Parameters->ParameterValues[index][swiz].f;
898 }
899 }
900 }
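
/* Example (illustrative): a state slot whose swizzle works out to
 * (x, y, y, y) adds only two params, [index][0] and [index][1], because
 * the loop stops at the first repeated swizzle component; a full
 * (x, y, z, w) slot adds all four.
 */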
901
902 fs_reg *
903 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
904 {
905 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
906 fs_reg wpos = *reg;
907 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
908
909 /* gl_FragCoord.x */
910 if (ir->pixel_center_integer) {
911 emit(MOV(wpos, this->pixel_x));
912 } else {
913 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
914 }
915 wpos.reg_offset++;
916
917 /* gl_FragCoord.y */
918 if (!flip && ir->pixel_center_integer) {
919 emit(MOV(wpos, this->pixel_y));
920 } else {
921 fs_reg pixel_y = this->pixel_y;
922 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
923
924 if (flip) {
925 pixel_y.negate = true;
926 offset += c->key.drawable_height - 1.0;
927 }
928
929 emit(ADD(wpos, pixel_y, fs_reg(offset)));
930 }
931 wpos.reg_offset++;
932
933 /* gl_FragCoord.z */
934 if (brw->gen >= 6) {
935 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
936 } else {
937 emit(FS_OPCODE_LINTERP, wpos,
938 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
939 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
940 interp_reg(VARYING_SLOT_POS, 2));
941 }
942 wpos.reg_offset++;
943
944 /* gl_FragCoord.w: Already set up in emit_interpolation */
945 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
946
947 return reg;
948 }
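
/* Worked example (illustrative): rendering to a 100-pixel-tall window
 * system drawable with the default lower-left origin and half-integer
 * pixel centers, flip is true, so the code above emits
 * wpos.y = -pixel_y + (100 - 1.0 + 0.5); a hardware Y of 0 at the top
 * row becomes gl_FragCoord.y == 99.5 at the bottom.
 */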
949
950 fs_inst *
951 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
952 glsl_interp_qualifier interpolation_mode,
953 bool is_centroid)
954 {
955 brw_wm_barycentric_interp_mode barycoord_mode;
956 if (brw->gen >= 6) {
957 if (is_centroid) {
958 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
959 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
960 else
961 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
962 } else {
963 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
964 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
965 else
966 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
967 }
968 } else {
969 /* On Ironlake and below, there is only one interpolation mode.
970 * Centroid interpolation doesn't mean anything on this hardware --
971 * there is no multisampling.
972 */
973 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
974 }
975 return emit(FS_OPCODE_LINTERP, attr,
976 this->delta_x[barycoord_mode],
977 this->delta_y[barycoord_mode], interp);
978 }
979
980 fs_reg *
981 fs_visitor::emit_general_interpolation(ir_variable *ir)
982 {
983 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
984 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
985 fs_reg attr = *reg;
986
987 unsigned int array_elements;
988 const glsl_type *type;
989
990 if (ir->type->is_array()) {
991 array_elements = ir->type->length;
992 if (array_elements == 0) {
993 fail("dereferenced array '%s' has length 0\n", ir->name);
994 }
995 type = ir->type->fields.array;
996 } else {
997 array_elements = 1;
998 type = ir->type;
999 }
1000
1001 glsl_interp_qualifier interpolation_mode =
1002 ir->determine_interpolation_mode(c->key.flat_shade);
1003
1004 int location = ir->location;
1005 for (unsigned int i = 0; i < array_elements; i++) {
1006 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1007 if (c->prog_data.urb_setup[location] == -1) {
1008 /* If there's no incoming setup data for this slot, don't
1009 * emit interpolation for it.
1010 */
1011 attr.reg_offset += type->vector_elements;
1012 location++;
1013 continue;
1014 }
1015
1016 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1017 /* Constant interpolation (flat shading) case. The SF has
1018 * handed us defined values in only the constant offset
1019 * field of the setup reg.
1020 */
1021 for (unsigned int k = 0; k < type->vector_elements; k++) {
1022 struct brw_reg interp = interp_reg(location, k);
1023 interp = suboffset(interp, 3);
1024 interp.type = reg->type;
1025 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1026 attr.reg_offset++;
1027 }
1028 } else {
1029 /* Smooth/noperspective interpolation case. */
1030 for (unsigned int k = 0; k < type->vector_elements; k++) {
1031 /* FINISHME: At some point we probably want to push
1032 * this farther by giving similar treatment to the
1033 * other potentially constant components of the
1034 * attribute, as well as making brw_vs_constval.c
1035 * handle varyings other than gl_TexCoord.
1036 */
1037 struct brw_reg interp = interp_reg(location, k);
1038 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1039 ir->centroid);
1040 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1041 /* Get the pixel/sample mask into f0 so that we know
1042 * which pixels are lit. Then, for each channel that is
1043 * unlit, replace the centroid data with non-centroid
1044 * data.
1045 */
1046 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1047 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1048 interpolation_mode, false);
1049 inst->predicate = BRW_PREDICATE_NORMAL;
1050 inst->predicate_inverse = true;
1051 }
1052 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1053 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1054 }
1055 attr.reg_offset++;
1056 }
1057
1058 }
1059 location++;
1060 }
1061 }
1062
1063 return reg;
1064 }
1065
1066 fs_reg *
1067 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1068 {
1069 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1070
1071 /* The frontfacing comes in as a bit in the thread payload. */
1072 if (brw->gen >= 6) {
1073 emit(BRW_OPCODE_ASR, *reg,
1074 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1075 fs_reg(15));
1076 emit(BRW_OPCODE_NOT, *reg, *reg);
1077 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1078 } else {
1079 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1080 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1081 * us front face
1082 */
1083 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1084 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1085 }
1086
1087 return reg;
1088 }
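
/* Worked example of the gen6+ path (illustrative): bit 15 of g0.0:D
 * holds "primitive is back-facing". ASR by 15 moves that bit down into
 * bit 0, NOT inverts it, and AND with 1 masks off the rest, leaving
 * *reg == 1 for front-facing and 0 for back-facing primitives.
 */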
1089
1090 fs_reg
1091 fs_visitor::fix_math_operand(fs_reg src)
1092 {
1093 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1094 * might be able to do better by doing execsize = 1 math and then
1095 * expanding that result out, but we would need to be careful with
1096 * masking.
1097 *
1098 * The hardware ignores source modifiers (negate and abs) on math
1099 * instructions, so we also move to a temp to set those up.
1100 */
1101 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1102 !src.abs && !src.negate)
1103 return src;
1104
1105 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1106 * operands to math
1107 */
1108 if (brw->gen >= 7 && src.file != IMM)
1109 return src;
1110
1111 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1112 expanded.type = src.type;
1113 emit(BRW_OPCODE_MOV, expanded, src);
1114 return expanded;
1115 }
1116
1117 fs_inst *
1118 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1119 {
1120 switch (opcode) {
1121 case SHADER_OPCODE_RCP:
1122 case SHADER_OPCODE_RSQ:
1123 case SHADER_OPCODE_SQRT:
1124 case SHADER_OPCODE_EXP2:
1125 case SHADER_OPCODE_LOG2:
1126 case SHADER_OPCODE_SIN:
1127 case SHADER_OPCODE_COS:
1128 break;
1129 default:
1130 assert(!"not reached: bad math opcode");
1131 return NULL;
1132 }
1133
1134 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1135 * might be able to do better by doing execsize = 1 math and then
1136 * expanding that result out, but we would need to be careful with
1137 * masking.
1138 *
1139 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1140 * instructions, so we also move to a temp to set those up.
1141 */
1142 if (brw->gen >= 6)
1143 src = fix_math_operand(src);
1144
1145 fs_inst *inst = emit(opcode, dst, src);
1146
1147 if (brw->gen < 6) {
1148 inst->base_mrf = 2;
1149 inst->mlen = dispatch_width / 8;
1150 }
1151
1152 return inst;
1153 }
1154
1155 fs_inst *
1156 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1157 {
1158 int base_mrf = 2;
1159 fs_inst *inst;
1160
1161 switch (opcode) {
1162 case SHADER_OPCODE_INT_QUOTIENT:
1163 case SHADER_OPCODE_INT_REMAINDER:
1164 if (brw->gen >= 7 && dispatch_width == 16)
1165 fail("16-wide INTDIV unsupported\n");
1166 break;
1167 case SHADER_OPCODE_POW:
1168 break;
1169 default:
1170 assert(!"not reached: unsupported binary math opcode.");
1171 return NULL;
1172 }
1173
1174 if (brw->gen >= 6) {
1175 src0 = fix_math_operand(src0);
1176 src1 = fix_math_operand(src1);
1177
1178 inst = emit(opcode, dst, src0, src1);
1179 } else {
1180 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1181 * "Message Payload":
1182 *
1183 * "Operand0[7]. For the INT DIV functions, this operand is the
1184 * denominator."
1185 * ...
1186 * "Operand1[7]. For the INT DIV functions, this operand is the
1187 * numerator."
1188 */
1189 bool is_int_div = opcode != SHADER_OPCODE_POW;
1190 fs_reg &op0 = is_int_div ? src1 : src0;
1191 fs_reg &op1 = is_int_div ? src0 : src1;
1192
1193 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1194 inst = emit(opcode, dst, op0, reg_null_f);
1195
1196 inst->base_mrf = base_mrf;
1197 inst->mlen = 2 * dispatch_width / 8;
1198 }
1199 return inst;
1200 }
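
/* Illustrative consequence of the operand swap above (assuming the
 * default base_mrf of 2): on gen4/5,
 * emit_math(SHADER_OPCODE_INT_QUOTIENT, dst, num, denom) MOVs the
 * numerator into m3 as Operand1 and issues the send with the
 * denominator as Operand0 in m2, matching the PRM's assignment for the
 * INT DIV functions.
 */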
1201
1202 void
1203 fs_visitor::assign_curb_setup()
1204 {
1205 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1206 if (dispatch_width == 8) {
1207 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1208 } else {
1209 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1210 }
1211
1212 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1213 foreach_list(node, &this->instructions) {
1214 fs_inst *inst = (fs_inst *)node;
1215
1216 for (unsigned int i = 0; i < 3; i++) {
1217 if (inst->src[i].file == UNIFORM) {
1218 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1219 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1220 constant_nr / 8,
1221 constant_nr % 8);
1222
1223 inst->src[i].file = HW_REG;
1224 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1225 }
1226 }
1227 }
1228 }
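
/* Worked example (illustrative): with c->nr_payload_regs == 2, a
 * UNIFORM source with constant_nr == 10 becomes the fixed HW register
 * brw_vec1_grf(2 + 10 / 8, 10 % 8), i.e. the third float of g3, since
 * each CURBE register holds eight packed floats.
 */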
1229
1230 void
1231 fs_visitor::calculate_urb_setup()
1232 {
1233 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1234 c->prog_data.urb_setup[i] = -1;
1235 }
1236
1237 int urb_next = 0;
1238 /* Figure out where each of the incoming setup attributes lands. */
1239 if (brw->gen >= 6) {
1240 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1241 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1242 BITFIELD64_BIT(i)) {
1243 c->prog_data.urb_setup[i] = urb_next++;
1244 }
1245 }
1246 } else {
1247 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1248 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1249 /* Point size is packed into the header, not as a general attribute */
1250 if (i == VARYING_SLOT_PSIZ)
1251 continue;
1252
1253 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1254 /* The back color slot is skipped when the front color is
1255 * also written to. In addition, some slots can be
1256 * written in the vertex shader and not read in the
1257 * fragment shader. So the register number must always be
1258 * incremented, mapped or not.
1259 */
1260 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1261 c->prog_data.urb_setup[i] = urb_next;
1262 urb_next++;
1263 }
1264 }
1265
1266 /*
1267 * It's an FS-only attribute, and we did the interpolation for this
1268 * attribute in the SF thread. So count it here, too.
1269 *
1270 * See compile_sf_prog() for more info.
1271 */
1272 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1273 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1274 }
1275
1276 c->prog_data.num_varying_inputs = urb_next;
1277 }
1278
1279 void
1280 fs_visitor::assign_urb_setup()
1281 {
1282 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1283
1284 /* Offset all the urb_setup[] indices by the actual position of the
1285 * setup regs, now that the location of the constants has been chosen.
1286 */
1287 foreach_list(node, &this->instructions) {
1288 fs_inst *inst = (fs_inst *)node;
1289
1290 if (inst->opcode == FS_OPCODE_LINTERP) {
1291 assert(inst->src[2].file == HW_REG);
1292 inst->src[2].fixed_hw_reg.nr += urb_start;
1293 }
1294
1295 if (inst->opcode == FS_OPCODE_CINTERP) {
1296 assert(inst->src[0].file == HW_REG);
1297 inst->src[0].fixed_hw_reg.nr += urb_start;
1298 }
1299 }
1300
1301 /* Each attribute is 4 setup channels, each of which is half a reg. */
1302 this->first_non_payload_grf =
1303 urb_start + c->prog_data.num_varying_inputs * 2;
1304 }
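
/* Worked example (illustrative): with 2 payload regs, a
 * curb_read_length of 2 and 4 varying inputs, urb_start is 4 and
 * first_non_payload_grf is 4 + 4 * 2 == 12, since each attribute
 * occupies four setup channels at half a register each.
 */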
1305
1306 /**
1307 * Split large virtual GRFs into separate components if we can.
1308 *
1309 * This is mostly duplicated with what brw_fs_vector_splitting does,
1310 * but that's really conservative because it's afraid of doing
1311 * splitting that doesn't result in real progress after the rest of
1312 * the optimization phases, which would cause infinite looping in
1313 * optimization. We can do it once here, safely. This also has the
1314 * opportunity to split interpolated values, or maybe even uniforms,
1315 * which we don't have at the IR level.
1316 *
1317 * We want to split, because virtual GRFs are what we register
1318 * allocate and spill (due to contiguousness requirements for some
1319 * instructions), and they're what we naturally generate in the
1320 * codegen process, but most virtual GRFs don't actually need to be
1321 * contiguous sets of GRFs. If we split, we'll end up with reduced
1322 * live intervals and better dead code elimination and coalescing.
1323 */
1324 void
1325 fs_visitor::split_virtual_grfs()
1326 {
1327 int num_vars = this->virtual_grf_count;
1328 bool split_grf[num_vars];
1329 int new_virtual_grf[num_vars];
1330
1331 /* Try to split anything > 0 sized. */
1332 for (int i = 0; i < num_vars; i++) {
1333 if (this->virtual_grf_sizes[i] != 1)
1334 split_grf[i] = true;
1335 else
1336 split_grf[i] = false;
1337 }
1338
1339 if (brw->has_pln &&
1340 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1341 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1342 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1343 * Gen6, that was the only supported interpolation mode, and since Gen6,
1344 * delta_x and delta_y are in fixed hardware registers.
1345 */
1346 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1347 false;
1348 }
1349
1350 foreach_list(node, &this->instructions) {
1351 fs_inst *inst = (fs_inst *)node;
1352
1353 /* If there's a SEND message that requires contiguous destination
1354 * registers, no splitting is allowed.
1355 */
1356 if (inst->regs_written > 1) {
1357 split_grf[inst->dst.reg] = false;
1358 }
1359
1360 /* If we're sending from a GRF, don't split it, on the assumption that
1361 * the send is reading the whole thing.
1362 */
1363 if (inst->is_send_from_grf()) {
1364 for (int i = 0; i < 3; i++) {
1365 if (inst->src[i].file == GRF) {
1366 split_grf[inst->src[i].reg] = false;
1367 }
1368 }
1369 }
1370 }
1371
1372 /* Allocate new space for split regs. Note that the virtual
1373 * numbers will be contiguous.
1374 */
1375 for (int i = 0; i < num_vars; i++) {
1376 if (split_grf[i]) {
1377 new_virtual_grf[i] = virtual_grf_alloc(1);
1378 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1379 int reg = virtual_grf_alloc(1);
1380 assert(reg == new_virtual_grf[i] + j - 1);
1381 (void) reg;
1382 }
1383 this->virtual_grf_sizes[i] = 1;
1384 }
1385 }
1386
1387 foreach_list(node, &this->instructions) {
1388 fs_inst *inst = (fs_inst *)node;
1389
1390 if (inst->dst.file == GRF &&
1391 split_grf[inst->dst.reg] &&
1392 inst->dst.reg_offset != 0) {
1393 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1394 inst->dst.reg_offset - 1);
1395 inst->dst.reg_offset = 0;
1396 }
1397 for (int i = 0; i < 3; i++) {
1398 if (inst->src[i].file == GRF &&
1399 split_grf[inst->src[i].reg] &&
1400 inst->src[i].reg_offset != 0) {
1401 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1402 inst->src[i].reg_offset - 1);
1403 inst->src[i].reg_offset = 0;
1404 }
1405 }
1406 }
1407 this->live_intervals_valid = false;
1408 }
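
/* Sketch of the effect (illustrative): a size-4 vgrf v5 that qualifies
 * for splitting becomes four size-1 vgrfs. An access at
 * (GRF, reg = 5, reg_offset = 2) is rewritten to
 * (GRF, reg = new_virtual_grf[5] + 1, reg_offset = 0), while accesses
 * at reg_offset == 0 keep using v5, whose size is now 1.
 */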
1409
1410 /**
1411 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1412 *
1413 * During code generation, we create tons of temporary variables, many of
1414 * which get immediately killed and are never used again. Yet, in later
1415 * optimization and analysis passes, such as compute_live_intervals, we need
1416 * to loop over all the virtual GRFs. Compacting them can save a lot of
1417 * overhead.
1418 */
1419 void
1420 fs_visitor::compact_virtual_grfs()
1421 {
1422 /* Mark which virtual GRFs are used, and count how many. */
1423 int remap_table[this->virtual_grf_count];
1424 memset(remap_table, -1, sizeof(remap_table));
1425
1426 foreach_list(node, &this->instructions) {
1427 const fs_inst *inst = (const fs_inst *) node;
1428
1429 if (inst->dst.file == GRF)
1430 remap_table[inst->dst.reg] = 0;
1431
1432 for (int i = 0; i < 3; i++) {
1433 if (inst->src[i].file == GRF)
1434 remap_table[inst->src[i].reg] = 0;
1435 }
1436 }
1437
1438 /* In addition to registers used in instructions, fs_visitor keeps
1439 * direct references to certain special values which must be patched:
1440 */
1441 fs_reg *special[] = {
1442 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1443 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1444 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1445 &delta_x[0], &delta_x[1], &delta_x[2],
1446 &delta_x[3], &delta_x[4], &delta_x[5],
1447 &delta_y[0], &delta_y[1], &delta_y[2],
1448 &delta_y[3], &delta_y[4], &delta_y[5],
1449 };
1450 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1451 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1452
1453 /* Treat all special values as used, to be conservative */
1454 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1455 if (special[i]->file == GRF)
1456 remap_table[special[i]->reg] = 0;
1457 }
1458
1459 /* Compact the GRF arrays. */
1460 int new_index = 0;
1461 for (int i = 0; i < this->virtual_grf_count; i++) {
1462 if (remap_table[i] != -1) {
1463 remap_table[i] = new_index;
1464 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1465 if (live_intervals_valid) {
1466 virtual_grf_start[new_index] = virtual_grf_start[i];
1467 virtual_grf_end[new_index] = virtual_grf_end[i];
1468 }
1469 ++new_index;
1470 }
1471 }
1472
1473 this->virtual_grf_count = new_index;
1474
1475 /* Patch all the instructions to use the newly renumbered registers */
1476 foreach_list(node, &this->instructions) {
1477 fs_inst *inst = (fs_inst *) node;
1478
1479 if (inst->dst.file == GRF)
1480 inst->dst.reg = remap_table[inst->dst.reg];
1481
1482 for (int i = 0; i < 3; i++) {
1483 if (inst->src[i].file == GRF)
1484 inst->src[i].reg = remap_table[inst->src[i].reg];
1485 }
1486 }
1487
1488 /* Patch all the references to special values */
1489 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1490 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1491 special[i]->reg = remap_table[special[i]->reg];
1492 }
1493 }
1494
1495 bool
1496 fs_visitor::remove_dead_constants()
1497 {
1498 if (dispatch_width == 8) {
1499 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1500 this->nr_params_remap = c->prog_data.nr_params;
1501
1502 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1503 this->params_remap[i] = -1;
1504
1505 /* Find which params are still in use. */
1506 foreach_list(node, &this->instructions) {
1507 fs_inst *inst = (fs_inst *)node;
1508
1509 for (int i = 0; i < 3; i++) {
1510 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1511
1512 if (inst->src[i].file != UNIFORM)
1513 continue;
1514
1515 /* Section 5.11 of the OpenGL 4.3 spec says:
1516 *
1517 * "Out-of-bounds reads return undefined values, which include
1518 * values from other variables of the active program or zero."
1519 */
1520 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1521 constant_nr = 0;
1522 }
1523
1524 /* For now, set this to non-negative. We'll give it the
1525 * actual new number in a moment, in order to keep the
1526 * register numbers nicely ordered.
1527 */
1528 this->params_remap[constant_nr] = 0;
1529 }
1530 }
1531
1532 /* Figure out what the new numbers for the params will be. At some
1533 * point when we're doing uniform array access, we're going to want
1534 * to keep the distinction between .reg and .reg_offset, but for
1535 * now we don't care.
1536 */
1537 unsigned int new_nr_params = 0;
1538 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1539 if (this->params_remap[i] != -1) {
1540 this->params_remap[i] = new_nr_params++;
1541 }
1542 }
1543
1544 /* Update the list of params to be uploaded to match our new numbering. */
1545 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1546 int remapped = this->params_remap[i];
1547
1548 if (remapped == -1)
1549 continue;
1550
1551 c->prog_data.param[remapped] = c->prog_data.param[i];
1552 }
1553
1554 c->prog_data.nr_params = new_nr_params;
1555 } else {
1556 /* This should have been generated in the 8-wide pass already. */
1557 assert(this->params_remap);
1558 }
1559
1560 /* Now do the renumbering of the shader to remove unused params. */
1561 foreach_list(node, &this->instructions) {
1562 fs_inst *inst = (fs_inst *)node;
1563
1564 for (int i = 0; i < 3; i++) {
1565 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1566
1567 if (inst->src[i].file != UNIFORM)
1568 continue;
1569
1570 /* As above, alias out-of-bounds accesses to param 0. */
1571 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1572 constant_nr = 0;
1573 }
1574 assert(this->params_remap[constant_nr] != -1);
1575 inst->src[i].reg = this->params_remap[constant_nr];
1576 inst->src[i].reg_offset = 0;
1577 }
1578 }
1579
1580 return true;
1581 }
1582
1583 /*
1584 * Implements array access of uniforms by inserting a
1585 * PULL_CONSTANT_LOAD instruction.
1586 *
1587 * Unlike temporary GRF array access (where we don't support it due to
1588 * the difficulty of doing relative addressing on instruction
1589 * destinations), we could potentially do array access of uniforms
1590 * that were loaded in GRF space as push constants. In real-world
1591 * usage we've seen, though, the arrays being used are always larger
1592 * than we could load as push constants, so just always move all
1593 * uniform array access out to a pull constant buffer.
1594 */
1595 void
1596 fs_visitor::move_uniform_array_access_to_pull_constants()
1597 {
1598 int pull_constant_loc[c->prog_data.nr_params];
1599
1600 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1601 pull_constant_loc[i] = -1;
1602 }
1603
1604 /* Walk through and find array access of uniforms. Put a copy of that
1605 * uniform in the pull constant buffer.
1606 *
1607 * Note that we don't move constant-indexed accesses to arrays. No
1608 * testing has been done of the performance impact of this choice.
1609 */
1610 foreach_list_safe(node, &this->instructions) {
1611 fs_inst *inst = (fs_inst *)node;
1612
1613 for (int i = 0 ; i < 3; i++) {
1614 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1615 continue;
1616
1617 int uniform = inst->src[i].reg;
1618
1619 /* If this array isn't already present in the pull constant buffer,
1620 * add it.
1621 */
1622 if (pull_constant_loc[uniform] == -1) {
1623 const float **values = &c->prog_data.param[uniform];
1624
1625 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1626
1627 assert(param_size[uniform]);
1628
1629 for (int j = 0; j < param_size[uniform]; j++) {
1630 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1631 values[j];
1632 }
1633 }
1634
1635 /* Set up the annotation tracking for new generated instructions. */
1636 base_ir = inst->ir;
1637 current_annotation = inst->annotation;
1638
1639 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1640 fs_reg temp = fs_reg(this, glsl_type::float_type);
1641 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1642 surf_index,
1643 *inst->src[i].reladdr,
1644 pull_constant_loc[uniform] +
1645 inst->src[i].reg_offset);
1646 inst->insert_before(&list);
1647
1648 inst->src[i].file = temp.file;
1649 inst->src[i].reg = temp.reg;
1650 inst->src[i].reg_offset = temp.reg_offset;
1651 inst->src[i].reladdr = NULL;
1652 }
1653 }
1654 }
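
/* Sketch of the rewrite (illustrative): for "uniform vec4 a[20]" read
 * as a[i], a source (UNIFORM, reg = base_of_a, reladdr = &i_reg) gets
 * all 80 floats of "a" appended to pull_param the first time it is
 * seen, a VARYING_PULL_CONSTANT_LOAD into a temp inserted before the
 * instruction, and the source repointed at the temp with reladdr
 * cleared.
 */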
1655
1656 /**
1657 * Choose accesses from the UNIFORM file to demote to using the pull
1658 * constant buffer.
1659 *
1660 * We allow a fragment shader to have more than the specified minimum
1661 * maximum number of fragment shader uniform components (64). If
1662 * there are too many of these, they'd fill up all of the register space.
1663 * So, this will push some of them out to the pull constant buffer and
1664 * update the program to load them.
1665 */
1666 void
1667 fs_visitor::setup_pull_constants()
1668 {
1669 /* Only allow 16 registers (128 uniform components) as push constants. */
1670 unsigned int max_uniform_components = 16 * 8;
1671 if (c->prog_data.nr_params <= max_uniform_components)
1672 return;
1673
1674 if (dispatch_width == 16) {
1675 fail("Pull constants not supported in 16-wide\n");
1676 return;
1677 }
1678
1679 /* Just demote the end of the list. We could probably do better
1680 * here, demoting things that are rarely used in the program first.
1681 */
1682 unsigned int pull_uniform_base = max_uniform_components;
1683
1684 int pull_constant_loc[c->prog_data.nr_params];
1685 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1686 if (i < pull_uniform_base) {
1687 pull_constant_loc[i] = -1;
1688 } else {
1689 pull_constant_loc[i] = -1;
1690 /* If our constant is already being uploaded for reladdr purposes,
1691 * reuse it.
1692 */
1693 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1694 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1695 pull_constant_loc[i] = j;
1696 break;
1697 }
1698 }
1699 if (pull_constant_loc[i] == -1) {
1700 int pull_index = c->prog_data.nr_pull_params++;
1701 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1702 pull_constant_loc[i] = pull_index;
1703 }
1704 }
1705 }
1706 c->prog_data.nr_params = pull_uniform_base;
1707
1708 foreach_list(node, &this->instructions) {
1709 fs_inst *inst = (fs_inst *)node;
1710
1711 for (int i = 0; i < 3; i++) {
1712 if (inst->src[i].file != UNIFORM)
1713 continue;
1714
1715 int pull_index = pull_constant_loc[inst->src[i].reg +
1716 inst->src[i].reg_offset];
1717 if (pull_index == -1)
1718 continue;
1719
1720 assert(!inst->src[i].reladdr);
1721
1722 fs_reg dst = fs_reg(this, glsl_type::float_type);
1723 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1724 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1725 fs_inst *pull =
1726 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1727 dst, index, offset);
1728 pull->ir = inst->ir;
1729 pull->annotation = inst->annotation;
1730
1731 inst->insert_before(pull);
1732
1733 inst->src[i].file = GRF;
1734 inst->src[i].reg = dst.reg;
1735 inst->src[i].reg_offset = 0;
1736 inst->src[i].smear = pull_index & 3;
1737 }
1738 }
1739 }
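
/* Worked example (illustrative): a demoted uniform with pull_index == 5
 * loads from byte offset (5 * 4) & ~15 == 16, the second 16-byte vec4
 * of the constant buffer, and the consuming source then reads component
 * 5 & 3 == 1 of the loaded register via smear.
 */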
1740
1741 bool
1742 fs_visitor::opt_algebraic()
1743 {
1744 bool progress = false;
1745
1746 foreach_list(node, &this->instructions) {
1747 fs_inst *inst = (fs_inst *)node;
1748
1749 switch (inst->opcode) {
1750 case BRW_OPCODE_MUL:
1751 if (inst->src[1].file != IMM)
1752 continue;
1753
1754 /* a * 1.0 = a */
1755 if (inst->src[1].is_one()) {
1756 inst->opcode = BRW_OPCODE_MOV;
1757 inst->src[1] = reg_undef;
1758 progress = true;
1759 break;
1760 }
1761
1762 /* a * 0.0 = 0.0 */
1763 if (inst->src[1].is_zero()) {
1764 inst->opcode = BRW_OPCODE_MOV;
1765 inst->src[0] = inst->src[1];
1766 inst->src[1] = reg_undef;
1767 progress = true;
1768 break;
1769 }
1770
1771 break;
1772 case BRW_OPCODE_ADD:
1773 if (inst->src[1].file != IMM)
1774 continue;
1775
1776 /* a + 0.0 = a */
1777 if (inst->src[1].is_zero()) {
1778 inst->opcode = BRW_OPCODE_MOV;
1779 inst->src[1] = reg_undef;
1780 progress = true;
1781 break;
1782 }
1783 break;
1784 default:
1785 break;
1786 }
1787 }
1788
1789 return progress;
1790 }
1791
1792 /**
1793 * Removes any instructions writing a VGRF where that VGRF is not used by any
1794 * later instruction.
1795 */
1796 bool
1797 fs_visitor::dead_code_eliminate()
1798 {
1799 bool progress = false;
1800 int pc = 0;
1801
1802 calculate_live_intervals();
1803
1804 foreach_list_safe(node, &this->instructions) {
1805 fs_inst *inst = (fs_inst *)node;
1806
1807 if (inst->dst.file == GRF) {
1808 assert(this->virtual_grf_end[inst->dst.reg] >= pc);
1809 if (this->virtual_grf_end[inst->dst.reg] == pc) {
1810 inst->remove();
1811 progress = true;
1812 }
1813 }
1814
1815 pc++;
1816 }
1817
1818 if (progress)
1819 live_intervals_valid = false;
1820
1821 return progress;
1822 }
1823
1824 struct dead_code_hash_key
1825 {
1826 int vgrf;
1827 int reg_offset;
1828 };
1829
1830 static bool
1831 dead_code_hash_compare(const void *a, const void *b)
1832 {
1833 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1834 }
1835
1836 static void
1837 clear_dead_code_hash(struct hash_table *ht)
1838 {
1839 struct hash_entry *entry;
1840
1841 hash_table_foreach(ht, entry) {
1842 _mesa_hash_table_remove(ht, entry);
1843 }
1844 }
1845
1846 static void
1847 insert_dead_code_hash(struct hash_table *ht,
1848 int vgrf, int reg_offset, fs_inst *inst)
1849 {
1850 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1851 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1852
1853 key->vgrf = vgrf;
1854 key->reg_offset = reg_offset;
1855
1856 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1857 }
1858
1859 static struct hash_entry *
1860 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1861 {
1862 struct dead_code_hash_key key;
1863
1864 key.vgrf = vgrf;
1865 key.reg_offset = reg_offset;
1866
1867 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1868 }
1869
1870 static void
1871 remove_dead_code_hash(struct hash_table *ht,
1872 int vgrf, int reg_offset)
1873 {
1874 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1875 if (!entry)
1876 return;
1877
1878 _mesa_hash_table_remove(ht, entry);
1879 }
1880
1881 /**
1882 * Walks basic blocks, removing any regs that are written but not read before
1883 * being redefined.
1884 *
1885 * The dead_code_eliminate() function implements a global dead code
1886 * elimination, but it only handles the removing the last write to a register
1887 * if it's never read. This one can handle intermediate writes, but only
1888 * within a basic block.
1889 */
1890 bool
1891 fs_visitor::dead_code_eliminate_local()
1892 {
1893 struct hash_table *ht;
1894 bool progress = false;
1895
1896 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1897
1898 foreach_list_safe(node, &this->instructions) {
1899 fs_inst *inst = (fs_inst *)node;
1900
1901 /* At a basic block, empty the HT since we don't understand dataflow
1902 * here.
1903 */
1904 if (inst->is_control_flow()) {
1905 clear_dead_code_hash(ht);
1906 continue;
1907 }
1908
1909 /* Clear the HT of any instructions that got read. */
1910 for (int i = 0; i < 3; i++) {
1911 fs_reg src = inst->src[i];
1912 if (src.file != GRF)
1913 continue;
1914
1915 int read = 1;
1916 if (inst->is_send_from_grf())
1917 read = virtual_grf_sizes[src.reg] - src.reg_offset;
1918
1919 for (int reg_offset = src.reg_offset;
1920 reg_offset < src.reg_offset + read;
1921 reg_offset++) {
1922 remove_dead_code_hash(ht, src.reg, reg_offset);
1923 }
1924 }
1925
1926 /* Add any update of a GRF to the HT, removing a previous write if it
1927 * wasn't read.
1928 */
1929 if (inst->dst.file == GRF) {
1930 if (inst->regs_written > 1) {
1931 /* We don't know how to trim channels from an instruction's
1932 * writes, so we can't incrementally remove unread channels from
1933 * it. Just remove whatever it overwrites from the table
1934 */
1935 for (int i = 0; i < inst->regs_written; i++) {
1936 remove_dead_code_hash(ht,
1937 inst->dst.reg,
1938 inst->dst.reg_offset + i);
1939 }
1940 } else {
1941 struct hash_entry *entry =
1942 get_dead_code_hash_entry(ht, inst->dst.reg,
1943 inst->dst.reg_offset);
1944
1945 if (inst->is_partial_write()) {
1946 /* For a partial write, we can't remove any previous dead code
1947 * candidate, since we're just modifying their result, but we can
1948 * be dead code eliminiated ourselves.
1949 */
1950 if (entry) {
1951 entry->data = inst;
1952 } else {
1953 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1954 inst);
1955 }
1956 } else {
1957 if (entry) {
1958 /* We're completely updating a channel, and there was a
1959 * previous write to the channel that wasn't read. Kill it!
1960 */
1961                fs_inst *dead = (fs_inst *)entry->data;
1962                dead->remove();
1963 progress = true;
1964 _mesa_hash_table_remove(ht, entry);
1965 }
1966
1967 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1968 inst);
1969 }
1970 }
1971 }
1972 }
1973
1974 _mesa_hash_table_destroy(ht, NULL);
1975
1976 if (progress)
1977 live_intervals_valid = false;
1978
1979 return progress;
1980 }
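
/* Illustrative IR (not actual dump output): within one basic block,
 *
 *    mov vgrf0, vgrf1    <- removed: fully overwritten before being read
 *    mov vgrf0, vgrf2
 *    mov vgrf3, vgrf0
 *
 * the first MOV's result is never read before the second MOV redefines
 * the channel, so this pass deletes it.
 */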
1981
1982 /**
1983 * Implements a second type of register coalescing: This one checks if
1984 * the two regs involved in a raw move don't interfere, in which case
1985  * they can both be stored in the same place and the MOV removed.
1986 */
1987 bool
1988 fs_visitor::register_coalesce_2()
1989 {
1990 bool progress = false;
1991
1992 calculate_live_intervals();
1993
1994 foreach_list_safe(node, &this->instructions) {
1995 fs_inst *inst = (fs_inst *)node;
1996
1997 if (inst->opcode != BRW_OPCODE_MOV ||
1998 inst->is_partial_write() ||
1999 inst->saturate ||
2000 inst->src[0].file != GRF ||
2001 inst->src[0].negate ||
2002 inst->src[0].abs ||
2003 inst->src[0].smear != -1 ||
2004 inst->dst.file != GRF ||
2005 inst->dst.type != inst->src[0].type ||
2006 virtual_grf_sizes[inst->src[0].reg] != 1 ||
2007 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
2008 continue;
2009 }
2010
2011 int reg_from = inst->src[0].reg;
2012 assert(inst->src[0].reg_offset == 0);
2013 int reg_to = inst->dst.reg;
2014 int reg_to_offset = inst->dst.reg_offset;
2015
2016 foreach_list(node, &this->instructions) {
2017 fs_inst *scan_inst = (fs_inst *)node;
2018
2019 if (scan_inst->dst.file == GRF &&
2020 scan_inst->dst.reg == reg_from) {
2021 scan_inst->dst.reg = reg_to;
2022 scan_inst->dst.reg_offset = reg_to_offset;
2023 }
2024 for (int i = 0; i < 3; i++) {
2025 if (scan_inst->src[i].file == GRF &&
2026 scan_inst->src[i].reg == reg_from) {
2027 scan_inst->src[i].reg = reg_to;
2028 scan_inst->src[i].reg_offset = reg_to_offset;
2029 }
2030 }
2031 }
2032
2033 inst->remove();
2034
2035 /* We don't need to recalculate live intervals inside the loop despite
2036 * flagging live_intervals_valid because we only use live intervals for
2037 * the interferes test, and we must have had a situation where the
2038 * intervals were:
2039 *
2040 * from to
2041 * ^
2042 * |
2043 * v
2044 * ^
2045 * |
2046 * v
2047 *
2048 * Some register R that might get coalesced with one of these two could
2049 * only be referencing "to", otherwise "from"'s range would have been
2050 * longer. R's range could also only start at the end of "to" or later,
2051        * otherwise it would conflict with "to" when we try to coalesce "to"
2052        * into R anyway.
2053 */
2054 live_intervals_valid = false;
2055
2056 progress = true;
2057 continue;
2058 }
2059
2060 return progress;
2061 }
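
/* An illustrative example (made-up vgrf numbers): given that vgrf1 and
 * vgrf2 have non-interfering live ranges,
 *
 *    add vgrf1, vgrf4, vgrf5
 *    mov vgrf2, vgrf1
 *    mul vgrf3, vgrf2, vgrf6
 *
 * every def and use of vgrf1 is renamed to vgrf2 and the MOV is removed:
 *
 *    add vgrf2, vgrf4, vgrf5
 *    mul vgrf3, vgrf2, vgrf6
 */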
2062
2063 bool
2064 fs_visitor::register_coalesce()
2065 {
2066 bool progress = false;
2067 int if_depth = 0;
2068 int loop_depth = 0;
2069
2070 foreach_list_safe(node, &this->instructions) {
2071 fs_inst *inst = (fs_inst *)node;
2072
2073 /* Make sure that we dominate the instructions we're going to
2074 * scan for interfering with our coalescing, or we won't have
2075 * scanned enough to see if anything interferes with our
2076 * coalescing. We don't dominate the following instructions if
2077 * we're in a loop or an if block.
2078 */
2079 switch (inst->opcode) {
2080 case BRW_OPCODE_DO:
2081 loop_depth++;
2082 break;
2083 case BRW_OPCODE_WHILE:
2084 loop_depth--;
2085 break;
2086 case BRW_OPCODE_IF:
2087 if_depth++;
2088 break;
2089 case BRW_OPCODE_ENDIF:
2090 if_depth--;
2091 break;
2092 default:
2093 break;
2094 }
2095 if (loop_depth || if_depth)
2096 continue;
2097
2098 if (inst->opcode != BRW_OPCODE_MOV ||
2099 inst->is_partial_write() ||
2100 inst->saturate ||
2101 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2102           inst->src[0].file != UNIFORM) ||
2103 inst->dst.type != inst->src[0].type)
2104 continue;
2105
2106 bool has_source_modifiers = (inst->src[0].abs ||
2107 inst->src[0].negate ||
2108 inst->src[0].smear != -1 ||
2109 inst->src[0].file == UNIFORM);
2110
2111 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2112 * them: check for no writes to either one until the exit of the
2113 * program.
2114 */
2115 bool interfered = false;
2116
2117 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2118 !scan_inst->is_tail_sentinel();
2119 scan_inst = (fs_inst *)scan_inst->next) {
2120 if (scan_inst->dst.file == GRF) {
2121 if (scan_inst->overwrites_reg(inst->dst) ||
2122 scan_inst->overwrites_reg(inst->src[0])) {
2123 interfered = true;
2124 break;
2125 }
2126 }
2127
2128 if (has_source_modifiers) {
2129 for (int i = 0; i < 3; i++) {
2130 if (scan_inst->src[i].file == GRF &&
2131 scan_inst->src[i].reg == inst->dst.reg &&
2132 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2133 inst->dst.type != scan_inst->src[i].type)
2134 {
2135 interfered = true;
2136 break;
2137 }
2138 }
2139 }
2140
2141
2142 /* The gen6 MATH instruction can't handle source modifiers or
2143 * unusual register regions, so avoid coalescing those for
2144 * now. We should do something more specific.
2145 */
2146 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2147 interfered = true;
2148 break;
2149 }
2150
2151 /* The accumulator result appears to get used for the
2152 * conditional modifier generation. When negating a UD
2153 * value, there is a 33rd bit generated for the sign in the
2154 * accumulator value, so now you can't check, for example,
2155 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2156 */
2157 if (scan_inst->conditional_mod &&
2158 inst->src[0].negate &&
2159 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2160 interfered = true;
2161 break;
2162 }
2163 }
2164 if (interfered) {
2165 continue;
2166 }
2167
2168 /* Rewrite the later usage to point at the source of the move to
2169 * be removed.
2170 */
2171 for (fs_inst *scan_inst = inst;
2172 !scan_inst->is_tail_sentinel();
2173 scan_inst = (fs_inst *)scan_inst->next) {
2174 for (int i = 0; i < 3; i++) {
2175 if (scan_inst->src[i].file == GRF &&
2176 scan_inst->src[i].reg == inst->dst.reg &&
2177 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2178 fs_reg new_src = inst->src[0];
2179 if (scan_inst->src[i].abs) {
2180 new_src.negate = 0;
2181 new_src.abs = 1;
2182 }
2183 new_src.negate ^= scan_inst->src[i].negate;
2184 scan_inst->src[i] = new_src;
2185 }
2186 }
2187 }
2188
2189 inst->remove();
2190 progress = true;
2191 }
2192
2193 if (progress)
2194 live_intervals_valid = false;
2195
2196 return progress;
2197 }
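
/* Source-modifier propagation example for the pass above (hypothetical
 * registers): when coalescing
 *
 *    mov vgrf2, -vgrf1
 *    add vgrf3, -vgrf2, vgrf4
 *
 * the two negations are XORed while rewriting the ADD's source, yielding
 *
 *    add vgrf3, vgrf1, vgrf4
 */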
2198
2199
2200 bool
2201 fs_visitor::compute_to_mrf()
2202 {
2203 bool progress = false;
2204 int next_ip = 0;
2205
2206 calculate_live_intervals();
2207
2208 foreach_list_safe(node, &this->instructions) {
2209 fs_inst *inst = (fs_inst *)node;
2210
2211 int ip = next_ip;
2212 next_ip++;
2213
2214 if (inst->opcode != BRW_OPCODE_MOV ||
2215 inst->is_partial_write() ||
2216 inst->dst.file != MRF || inst->src[0].file != GRF ||
2217 inst->dst.type != inst->src[0].type ||
2218 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2219 continue;
2220
2221 /* Work out which hardware MRF registers are written by this
2222 * instruction.
2223 */
2224 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2225 int mrf_high;
2226 if (inst->dst.reg & BRW_MRF_COMPR4) {
2227 mrf_high = mrf_low + 4;
2228 } else if (dispatch_width == 16 &&
2229 (!inst->force_uncompressed && !inst->force_sechalf)) {
2230 mrf_high = mrf_low + 1;
2231 } else {
2232 mrf_high = mrf_low;
2233 }
2234
2235 /* Can't compute-to-MRF this GRF if someone else was going to
2236 * read it later.
2237 */
2238 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2239 continue;
2240
2241 /* Found a move of a GRF to a MRF. Let's see if we can go
2242 * rewrite the thing that made this GRF to write into the MRF.
2243 */
2244 fs_inst *scan_inst;
2245 for (scan_inst = (fs_inst *)inst->prev;
2246 scan_inst->prev != NULL;
2247 scan_inst = (fs_inst *)scan_inst->prev) {
2248 if (scan_inst->dst.file == GRF &&
2249 scan_inst->dst.reg == inst->src[0].reg) {
2250 /* Found the last thing to write our reg we want to turn
2251 * into a compute-to-MRF.
2252 */
2253
2254 /* If this one instruction didn't populate all the
2255 * channels, bail. We might be able to rewrite everything
2256 * that writes that reg, but it would require smarter
2257 * tracking to delay the rewriting until complete success.
2258 */
2259 if (scan_inst->is_partial_write())
2260 break;
2261
2262          /* Instructions writing more than one register would require
2263           * coalescing more than one MOV at a time.
2264 */
2265 if (scan_inst->regs_written > 1)
2266 break;
2267
2268 /* SEND instructions can't have MRF as a destination. */
2269 if (scan_inst->mlen)
2270 break;
2271
2272 if (brw->gen == 6) {
2273 /* gen6 math instructions must have the destination be
2274 * GRF, so no compute-to-MRF for them.
2275 */
2276 if (scan_inst->is_math()) {
2277 break;
2278 }
2279 }
2280
2281 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2282 /* Found the creator of our MRF's source value. */
2283 scan_inst->dst.file = MRF;
2284 scan_inst->dst.reg = inst->dst.reg;
2285 scan_inst->saturate |= inst->saturate;
2286 inst->remove();
2287 progress = true;
2288 }
2289 break;
2290 }
2291
2292 /* We don't handle control flow here. Most computation of
2293        * values that end up in MRFs happens shortly before the MRF
2294        * write anyway.
2295 */
2296 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2297 break;
2298
2299 /* You can't read from an MRF, so if someone else reads our
2300 * MRF's source GRF that we wanted to rewrite, that stops us.
2301 */
2302 bool interfered = false;
2303 for (int i = 0; i < 3; i++) {
2304 if (scan_inst->src[i].file == GRF &&
2305 scan_inst->src[i].reg == inst->src[0].reg &&
2306 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2307 interfered = true;
2308 }
2309 }
2310 if (interfered)
2311 break;
2312
2313 if (scan_inst->dst.file == MRF) {
2314 /* If somebody else writes our MRF here, we can't
2315 * compute-to-MRF before that.
2316 */
2317 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2318 int scan_mrf_high;
2319
2320 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2321 scan_mrf_high = scan_mrf_low + 4;
2322 } else if (dispatch_width == 16 &&
2323 (!scan_inst->force_uncompressed &&
2324 !scan_inst->force_sechalf)) {
2325 scan_mrf_high = scan_mrf_low + 1;
2326 } else {
2327 scan_mrf_high = scan_mrf_low;
2328 }
2329
2330 if (mrf_low == scan_mrf_low ||
2331 mrf_low == scan_mrf_high ||
2332 mrf_high == scan_mrf_low ||
2333 mrf_high == scan_mrf_high) {
2334 break;
2335 }
2336 }
2337
2338 if (scan_inst->mlen > 0) {
2339 /* Found a SEND instruction, which means that there are
2340 * live values in MRFs from base_mrf to base_mrf +
2341 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2342 * above it.
2343 */
2344 if (mrf_low >= scan_inst->base_mrf &&
2345 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2346 break;
2347 }
2348 if (mrf_high >= scan_inst->base_mrf &&
2349 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2350 break;
2351 }
2352 }
2353 }
2354 }
2355
2356 if (progress)
2357 live_intervals_valid = false;
2358
2359 return progress;
2360 }
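
/* A standalone sketch of the MRF-range computation used twice above
 * (assuming only the BRW_MRF_COMPR4 flag bit, as in the code itself):
 *
 *    int lo = mrf_reg & ~BRW_MRF_COMPR4;
 *    int hi;
 *    if (mrf_reg & BRW_MRF_COMPR4)
 *       hi = lo + 4;      // COMPR4 writes m and m+4
 *    else if (dispatch_width == 16 && !uncompressed && !sechalf)
 *       hi = lo + 1;      // compressed: two adjacent MRFs
 *    else
 *       hi = lo;          // a single MRF
 */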
2361
2362 /**
2363 * Walks through basic blocks, looking for repeated MRF writes and
2364 * removing the later ones.
2365 */
2366 bool
2367 fs_visitor::remove_duplicate_mrf_writes()
2368 {
2369 fs_inst *last_mrf_move[16];
2370 bool progress = false;
2371
2372    /* The MRF tracking below doesn't handle compressed instructions yet. */
2373 if (dispatch_width == 16)
2374 return false;
2375
2376 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2377
2378 foreach_list_safe(node, &this->instructions) {
2379 fs_inst *inst = (fs_inst *)node;
2380
2381 if (inst->is_control_flow()) {
2382 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2383 }
2384
2385 if (inst->opcode == BRW_OPCODE_MOV &&
2386 inst->dst.file == MRF) {
2387 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2388 if (prev_inst && inst->equals(prev_inst)) {
2389 inst->remove();
2390 progress = true;
2391 continue;
2392 }
2393 }
2394
2395 /* Clear out the last-write records for MRFs that were overwritten. */
2396 if (inst->dst.file == MRF) {
2397 last_mrf_move[inst->dst.reg] = NULL;
2398 }
2399
2400 if (inst->mlen > 0) {
2401 /* Found a SEND instruction, which will include two or fewer
2402 * implied MRF writes. We could do better here.
2403 */
2404 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2405 last_mrf_move[inst->base_mrf + i] = NULL;
2406 }
2407 }
2408
2409 /* Clear out any MRF move records whose sources got overwritten. */
2410 if (inst->dst.file == GRF) {
2411 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2412 if (last_mrf_move[i] &&
2413 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2414 last_mrf_move[i] = NULL;
2415 }
2416 }
2417 }
2418
2419 if (inst->opcode == BRW_OPCODE_MOV &&
2420 inst->dst.file == MRF &&
2421 inst->src[0].file == GRF &&
2422 !inst->is_partial_write()) {
2423 last_mrf_move[inst->dst.reg] = inst;
2424 }
2425 }
2426
2427 if (progress)
2428 live_intervals_valid = false;
2429
2430 return progress;
2431 }
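
/* Illustrative example (made-up registers): within a basic block,
 *
 *    mov m4, vgrf7
 *    mov m4, vgrf7    <- removed: exact duplicate of the live
 *                        last-write record for m4
 *
 * Control flow, an overwrite of m4, an implied MRF write from a SEND, or
 * a write to vgrf7 all invalidate the record and keep the second MOV.
 */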
2432
2433 static void
2434 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2435 int first_grf, int grf_len)
2436 {
2437 bool inst_16wide = (dispatch_width > 8 &&
2438 !inst->force_uncompressed &&
2439 !inst->force_sechalf);
2440
2441 /* Clear the flag for registers that actually got read (as expected). */
2442 for (int i = 0; i < 3; i++) {
2443 int grf;
2444 if (inst->src[i].file == GRF) {
2445 grf = inst->src[i].reg;
2446 } else if (inst->src[i].file == HW_REG &&
2447 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2448 grf = inst->src[i].fixed_hw_reg.nr;
2449 } else {
2450 continue;
2451 }
2452
2453 if (grf >= first_grf &&
2454 grf < first_grf + grf_len) {
2455 deps[grf - first_grf] = false;
2456 if (inst_16wide)
2457 deps[grf - first_grf + 1] = false;
2458 }
2459 }
2460 }
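
/* Illustrative, with made-up numbers: an instruction executing 16-wide
 * reads two registers per GRF source (e.g. g10 spans g10..g11), which is
 * why the loop above also clears deps[grf - first_grf + 1] in that case.
 */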
2461
2462 /**
2463 * Implements this workaround for the original 965:
2464 *
2465 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2466 * check for post destination dependencies on this instruction, software
2467 * must ensure that there is no destination hazard for the case of ‘write
2468 * followed by a posted write’ shown in the following example.
2469 *
2470 * 1. mov r3 0
2471 * 2. send r3.xy <rest of send instruction>
2472 * 3. mov r2 r3
2473 *
2474 * Due to no post-destination dependency check on the ‘send’, the above
2475 * code sequence could have two instructions (1 and 2) in flight at the
2476  *    same time that both consider ‘r3’ as the target of their final writes."
2477 */
2478 void
2479 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2480 {
2481 int reg_size = dispatch_width / 8;
2482 int write_len = inst->regs_written * reg_size;
2483 int first_write_grf = inst->dst.reg;
2484 bool needs_dep[BRW_MAX_MRF];
2485 assert(write_len < (int)sizeof(needs_dep) - 1);
2486
2487 memset(needs_dep, false, sizeof(needs_dep));
2488 memset(needs_dep, true, write_len);
2489
2490 clear_deps_for_inst_src(inst, dispatch_width,
2491 needs_dep, first_write_grf, write_len);
2492
2493 /* Walk backwards looking for writes to registers we're writing which
2494 * aren't read since being written. If we hit the start of the program,
2495 * we assume that there are no outstanding dependencies on entry to the
2496 * program.
2497 */
2498 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2499 scan_inst != NULL;
2500 scan_inst = (fs_inst *)scan_inst->prev) {
2501
2502 /* If we hit control flow, assume that there *are* outstanding
2503 * dependencies, and force their cleanup before our instruction.
2504 */
2505 if (scan_inst->is_control_flow()) {
2506 for (int i = 0; i < write_len; i++) {
2507 if (needs_dep[i]) {
2508 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2509 }
2510 }
2511 return;
2512 }
2513
2514 bool scan_inst_16wide = (dispatch_width > 8 &&
2515 !scan_inst->force_uncompressed &&
2516 !scan_inst->force_sechalf);
2517
2518 /* We insert our reads as late as possible on the assumption that any
2519 * instruction but a MOV that might have left us an outstanding
2520 * dependency has more latency than a MOV.
2521 */
2522 if (scan_inst->dst.file == GRF) {
2523 for (int i = 0; i < scan_inst->regs_written; i++) {
2524 int reg = scan_inst->dst.reg + i * reg_size;
2525
2526 if (reg >= first_write_grf &&
2527 reg < first_write_grf + write_len &&
2528 needs_dep[reg - first_write_grf]) {
2529 inst->insert_before(DEP_RESOLVE_MOV(reg));
2530 needs_dep[reg - first_write_grf] = false;
2531 if (scan_inst_16wide)
2532 needs_dep[reg - first_write_grf + 1] = false;
2533 }
2534 }
2535 }
2536
2537 /* Clear the flag for registers that actually got read (as expected). */
2538 clear_deps_for_inst_src(scan_inst, dispatch_width,
2539 needs_dep, first_write_grf, write_len);
2540
2541 /* Continue the loop only if we haven't resolved all the dependencies */
2542 int i;
2543 for (i = 0; i < write_len; i++) {
2544 if (needs_dep[i])
2545 break;
2546 }
2547 if (i == write_len)
2548 return;
2549 }
2550 }
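
/* Illustrative result (hypothetical register numbers): if an earlier
 * write to g14 was never read and the SEND is about to write g14, the
 * pass inserts a dependency-resolving self-move before the SEND:
 *
 *    mov g14, ...
 *    mov g14, g14         <- DEP_RESOLVE_MOV inserted here
 *    send g14.xy <...>
 *
 * so the scoreboard waits on the pending write.
 */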
2551
2552 /**
2553 * Implements this workaround for the original 965:
2554 *
2555 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2556 * used as a destination register until after it has been sourced by an
2557  *    instruction with a different destination register."
2558 */
2559 void
2560 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2561 {
2562 int write_len = inst->regs_written * dispatch_width / 8;
2563 int first_write_grf = inst->dst.reg;
2564 bool needs_dep[BRW_MAX_MRF];
2565 assert(write_len < (int)sizeof(needs_dep) - 1);
2566
2567 memset(needs_dep, false, sizeof(needs_dep));
2568 memset(needs_dep, true, write_len);
2569 /* Walk forwards looking for writes to registers we're writing which aren't
2570 * read before being written.
2571 */
2572 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2573 !scan_inst->is_tail_sentinel();
2574 scan_inst = (fs_inst *)scan_inst->next) {
2575 /* If we hit control flow, force resolve all remaining dependencies. */
2576 if (scan_inst->is_control_flow()) {
2577 for (int i = 0; i < write_len; i++) {
2578 if (needs_dep[i])
2579 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2580 }
2581 return;
2582 }
2583
2584 /* Clear the flag for registers that actually got read (as expected). */
2585 clear_deps_for_inst_src(scan_inst, dispatch_width,
2586 needs_dep, first_write_grf, write_len);
2587
2588 /* We insert our reads as late as possible since they're reading the
2589 * result of a SEND, which has massive latency.
2590 */
2591 if (scan_inst->dst.file == GRF &&
2592 scan_inst->dst.reg >= first_write_grf &&
2593 scan_inst->dst.reg < first_write_grf + write_len &&
2594 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2595 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2596 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2597 }
2598
2599 /* Continue the loop only if we haven't resolved all the dependencies */
2600 int i;
2601 for (i = 0; i < write_len; i++) {
2602 if (needs_dep[i])
2603 break;
2604 }
2605 if (i == write_len)
2606 return;
2607 }
2608
2609 /* If we hit the end of the program, resolve all remaining dependencies out
2610 * of paranoia.
2611 */
2612 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2613 assert(last_inst->eot);
2614 for (int i = 0; i < write_len; i++) {
2615 if (needs_dep[i])
2616 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2617 }
2618 }
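
/* Illustrative (hypothetical registers): per the errata above, after
 * "send g12 <...>" a later instruction may not write g12 until g12 has
 * been sourced, so the pass inserts a read first:
 *
 *    send g12 <...>
 *    mov g12, g12         <- DEP_RESOLVE_MOV inserted here
 *    mov g12, g13
 */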
2619
2620 void
2621 fs_visitor::insert_gen4_send_dependency_workarounds()
2622 {
2623 if (brw->gen != 4 || brw->is_g4x)
2624 return;
2625
2626 /* Note that we're done with register allocation, so GRF fs_regs always
2627 * have a .reg_offset of 0.
2628 */
2629
2630 foreach_list_safe(node, &this->instructions) {
2631 fs_inst *inst = (fs_inst *)node;
2632
2633 if (inst->mlen != 0 && inst->dst.file == GRF) {
2634 insert_gen4_pre_send_dependency_workarounds(inst);
2635 insert_gen4_post_send_dependency_workarounds(inst);
2636 }
2637 }
2638 }
2639
2640 /**
2641 * Turns the generic expression-style uniform pull constant load instruction
2642 * into a hardware-specific series of instructions for loading a pull
2643 * constant.
2644 *
2645 * The expression style allows the CSE pass before this to optimize out
2646 * repeated loads from the same offset, and gives the pre-register-allocation
2647 * scheduling full flexibility, while the conversion to native instructions
2648 * allows the post-register-allocation scheduler the best information
2649 * possible.
2650 *
2651 * Note that execution masking for setting up pull constant loads is special:
2652 * the channels that need to be written are unrelated to the current execution
2653 * mask, since a later instruction will use one of the result channels as a
2654 * source operand for all 8 or 16 of its channels.
2655 */
2656 void
2657 fs_visitor::lower_uniform_pull_constant_loads()
2658 {
2659 foreach_list(node, &this->instructions) {
2660 fs_inst *inst = (fs_inst *)node;
2661
2662 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2663 continue;
2664
2665 if (brw->gen >= 7) {
2666 /* The offset arg before was a vec4-aligned byte offset. We need to
2667 * turn it into a dword offset.
2668 */
2669 fs_reg const_offset_reg = inst->src[1];
2670 assert(const_offset_reg.file == IMM &&
2671 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2672 const_offset_reg.imm.u /= 4;
2673 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2674
2675 /* This is actually going to be a MOV, but since only the first dword
2676 * is accessed, we have a special opcode to do just that one. Note
2677 * that this needs to be an operation that will be considered a def
2678 * by live variable analysis, or register allocation will explode.
2679 */
2680 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2681 payload, const_offset_reg);
2682 setup->force_writemask_all = true;
2683
2684 setup->ir = inst->ir;
2685 setup->annotation = inst->annotation;
2686 inst->insert_before(setup);
2687
2688 /* Similarly, this will only populate the first 4 channels of the
2689 * result register (since we only use smear values from 0-3), but we
2690 * don't tell the optimizer.
2691 */
2692 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2693 inst->src[1] = payload;
2694
2695 this->live_intervals_valid = false;
2696 } else {
2697 /* Before register allocation, we didn't tell the scheduler about the
2698 * MRF we use. We know it's safe to use this MRF because nothing
2699 * else does except for register spill/unspill, which generates and
2700 * uses its MRF within a single IR instruction.
2701 */
2702 inst->base_mrf = 14;
2703 inst->mlen = 1;
2704 }
2705 }
2706 }
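
/* Schematic before/after for the gen7 path above (hypothetical operands):
 *
 *    uniform_pull_const_load vgrf4, surf, 32u
 *
 * becomes
 *
 *    set_simd4x2_offset           vgrf5, 8u   // 32 bytes -> 8 dwords
 *    uniform_pull_const_load_gen7 vgrf4, surf, vgrf5
 */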
2707
2708 void
2709 fs_visitor::dump_instruction(backend_instruction *be_inst)
2710 {
2711 fs_inst *inst = (fs_inst *)be_inst;
2712
2713 if (inst->predicate) {
2714 printf("(%cf0.%d) ",
2715 inst->predicate_inverse ? '-' : '+',
2716 inst->flag_subreg);
2717 }
2718
2719 printf("%s", brw_instruction_name(inst->opcode));
2720 if (inst->saturate)
2721 printf(".sat");
2722 if (inst->conditional_mod) {
2723 printf(".cmod");
2724 if (!inst->predicate &&
2725 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2726 inst->opcode != BRW_OPCODE_IF &&
2727 inst->opcode != BRW_OPCODE_WHILE))) {
2728 printf(".f0.%d", inst->flag_subreg);
2729 }
2730 }
2731 printf(" ");
2732
2733
2734 switch (inst->dst.file) {
2735 case GRF:
2736 printf("vgrf%d", inst->dst.reg);
2737 if (inst->dst.reg_offset)
2738 printf("+%d", inst->dst.reg_offset);
2739 break;
2740 case MRF:
2741 printf("m%d", inst->dst.reg);
2742 break;
2743 case BAD_FILE:
2744 printf("(null)");
2745 break;
2746 case UNIFORM:
2747 printf("***u%d***", inst->dst.reg);
2748 break;
2749 case ARF:
2750 if (inst->dst.reg == BRW_ARF_NULL)
2751 printf("(null)");
2752 else
2753 printf("arf%d", inst->dst.reg);
2754 break;
2755 default:
2756 printf("???");
2757 break;
2758 }
2759 printf(", ");
2760
2761 for (int i = 0; i < 3; i++) {
2762 if (inst->src[i].negate)
2763 printf("-");
2764 if (inst->src[i].abs)
2765 printf("|");
2766 switch (inst->src[i].file) {
2767 case GRF:
2768 printf("vgrf%d", inst->src[i].reg);
2769 if (inst->src[i].reg_offset)
2770 printf("+%d", inst->src[i].reg_offset);
2771 break;
2772 case MRF:
2773 printf("***m%d***", inst->src[i].reg);
2774 break;
2775 case UNIFORM:
2776 printf("u%d", inst->src[i].reg);
2777 if (inst->src[i].reg_offset)
2778 printf(".%d", inst->src[i].reg_offset);
2779 break;
2780 case BAD_FILE:
2781 printf("(null)");
2782 break;
2783 case IMM:
2784 switch (inst->src[i].type) {
2785 case BRW_REGISTER_TYPE_F:
2786 printf("%ff", inst->src[i].imm.f);
2787 break;
2788 case BRW_REGISTER_TYPE_D:
2789 printf("%dd", inst->src[i].imm.i);
2790 break;
2791 case BRW_REGISTER_TYPE_UD:
2792 printf("%uu", inst->src[i].imm.u);
2793 break;
2794 default:
2795 printf("???");
2796 break;
2797 }
2798 break;
2799 default:
2800 printf("???");
2801 break;
2802 }
2803 if (inst->src[i].abs)
2804 printf("|");
2805
2806       if (i < 2)
2807 printf(", ");
2808 }
2809
2810 printf(" ");
2811
2812 if (inst->force_uncompressed)
2813 printf("1sthalf ");
2814
2815 if (inst->force_sechalf)
2816 printf("2ndhalf ");
2817
2818 printf("\n");
2819 }
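
/* Roughly what the dump above produces for a hypothetical instruction
 * (predicate, opcode with modifiers, destination, then up to three
 * sources):
 *
 *    (+f0.1) add.sat vgrf3+1, vgrf7, -|vgrf9|, (null) 2ndhalf
 */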
2820
2821 /**
2822 * Possibly returns an instruction that set up @param reg.
2823 *
2824 * Sometimes we want to take the result of some expression/variable
2825 * dereference tree and rewrite the instruction generating the result
2826 * of the tree. When processing the tree, we know that the
2827 * instructions generated are all writing temporaries that are dead
2828 * outside of this tree. So, if we have some instructions that write
2829 * a temporary, we're free to point that temp write somewhere else.
2830 *
2831  * Note that this doesn't guarantee that the returned instruction wrote
2832  * only reg -- it might be the size=4 destination of a texture instruction.
2833 */
2834 fs_inst *
2835 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2836 fs_inst *end,
2837 fs_reg reg)
2838 {
2839 if (end == start ||
2840 end->is_partial_write() ||
2841 reg.reladdr ||
2842 !reg.equals(end->dst)) {
2843 return NULL;
2844 } else {
2845 return end;
2846 }
2847 }
2848
2849 void
2850 fs_visitor::setup_payload_gen6()
2851 {
2852 bool uses_depth =
2853 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2854 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2855
2856 assert(brw->gen >= 6);
2857
2858 /* R0-1: masks, pixel X/Y coordinates. */
2859 c->nr_payload_regs = 2;
2860    /* R2: only for 32-pixel dispatch. */
2861
2862 /* R3-26: barycentric interpolation coordinates. These appear in the
2863 * same order that they appear in the brw_wm_barycentric_interp_mode
2864 * enum. Each set of coordinates occupies 2 registers if dispatch width
2865 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2866 * appear if they were enabled using the "Barycentric Interpolation
2867 * Mode" bits in WM_STATE.
2868 */
2869 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2870 if (barycentric_interp_modes & (1 << i)) {
2871 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2872 c->nr_payload_regs += 2;
2873 if (dispatch_width == 16) {
2874 c->nr_payload_regs += 2;
2875 }
2876 }
2877 }
2878
2879 /* R27: interpolated depth if uses source depth */
2880 if (uses_depth) {
2881 c->source_depth_reg = c->nr_payload_regs;
2882 c->nr_payload_regs++;
2883 if (dispatch_width == 16) {
2884 /* R28: interpolated depth if not 8-wide. */
2885 c->nr_payload_regs++;
2886 }
2887 }
2888 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2889 if (uses_depth) {
2890 c->source_w_reg = c->nr_payload_regs;
2891 c->nr_payload_regs++;
2892 if (dispatch_width == 16) {
2893 /* R30: interpolated W if not 8-wide. */
2894 c->nr_payload_regs++;
2895 }
2896 }
2897 /* R31: MSAA position offsets. */
2898 /* R32-: bary for 32-pixel. */
2899 /* R58-59: interp W for 32-pixel. */
2900
2901 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2902 c->source_depth_to_render_target = true;
2903 }
2904 }
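
/* Worked example (hypothetical state): a 16-wide shader with two
 * barycentric modes enabled plus source depth and W lays out as
 *
 *    r0-r1    masks, pixel X/Y        nr_payload_regs = 2
 *    r2-r5    first barycentric set   += 4 (16-wide)
 *    r6-r9    second barycentric set  += 4
 *    r10-r11  source depth            += 2
 *    r12-r13  source W                += 2  -> 14 payload registers
 */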
2905
2906 bool
2907 fs_visitor::run()
2908 {
2909 sanity_param_count = fp->Base.Parameters->NumParameters;
2910 uint32_t orig_nr_params = c->prog_data.nr_params;
2911
2912 if (brw->gen >= 6)
2913 setup_payload_gen6();
2914 else
2915 setup_payload_gen4();
2916
2917 if (0) {
2918 emit_dummy_fs();
2919 } else {
2920 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2921 emit_shader_time_begin();
2922
2923 calculate_urb_setup();
2924 if (brw->gen < 6)
2925 emit_interpolation_setup_gen4();
2926 else
2927 emit_interpolation_setup_gen6();
2928
2929 /* We handle discards by keeping track of the still-live pixels in f0.1.
2930 * Initialize it with the dispatched pixels.
2931 */
2932 if (fp->UsesKill) {
2933 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2934 discard_init->flag_subreg = 1;
2935 }
2936
2937 /* Generate FS IR for main(). (the visitor only descends into
2938 * functions called "main").
2939 */
2940 if (shader) {
2941 foreach_list(node, &*shader->ir) {
2942 ir_instruction *ir = (ir_instruction *)node;
2943 base_ir = ir;
2944 this->result = reg_undef;
2945 ir->accept(this);
2946 }
2947 } else {
2948 emit_fragment_program_code();
2949 }
2950 base_ir = NULL;
2951 if (failed)
2952 return false;
2953
2954 emit(FS_OPCODE_PLACEHOLDER_HALT);
2955
2956 emit_fb_writes();
2957
2958 split_virtual_grfs();
2959
2960 move_uniform_array_access_to_pull_constants();
2961 setup_pull_constants();
2962
2963 bool progress;
2964 do {
2965 progress = false;
2966
2967 compact_virtual_grfs();
2968
2969 progress = remove_duplicate_mrf_writes() || progress;
2970
2971 progress = opt_algebraic() || progress;
2972 progress = opt_cse() || progress;
2973 progress = opt_copy_propagate() || progress;
2974 progress = dead_code_eliminate() || progress;
2975 progress = dead_code_eliminate_local() || progress;
2976 progress = register_coalesce() || progress;
2977 progress = register_coalesce_2() || progress;
2978 progress = compute_to_mrf() || progress;
2979 } while (progress);
2980
2981 remove_dead_constants();
2982
2983 schedule_instructions(false);
2984
2985 lower_uniform_pull_constant_loads();
2986
2987 assign_curb_setup();
2988 assign_urb_setup();
2989
2990 if (0) {
2991 /* Debug of register spilling: Go spill everything. */
2992 for (int i = 0; i < virtual_grf_count; i++) {
2993 spill_reg(i);
2994 }
2995 }
2996
2997 if (0)
2998 assign_regs_trivial();
2999 else {
3000 while (!assign_regs()) {
3001 if (failed)
3002 break;
3003 }
3004 }
3005 }
3006 assert(force_uncompressed_stack == 0);
3007 assert(force_sechalf_stack == 0);
3008
3009 /* This must come after all optimization and register allocation, since
3010 * it inserts dead code that happens to have side effects, and it does
3011 * so based on the actual physical registers in use.
3012 */
3013 insert_gen4_send_dependency_workarounds();
3014
3015 if (failed)
3016 return false;
3017
3018 schedule_instructions(true);
3019
3020 if (dispatch_width == 8) {
3021 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3022 } else {
3023 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3024
3025 /* Make sure we didn't try to sneak in an extra uniform */
3026 assert(orig_nr_params == c->prog_data.nr_params);
3027 (void) orig_nr_params;
3028 }
3029
3030 /* If any state parameters were appended, then ParameterValues could have
3031 * been realloced, in which case the driver uniform storage set up by
3032 * _mesa_associate_uniform_storage() would point to freed memory. Make
3033 * sure that didn't happen.
3034 */
3035 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3036
3037 return !failed;
3038 }
3039
3040 const unsigned *
3041 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3042 struct gl_fragment_program *fp,
3043 struct gl_shader_program *prog,
3044 unsigned *final_assembly_size)
3045 {
3046 bool start_busy = false;
3047 float start_time = 0;
3048
3049 if (unlikely(brw->perf_debug)) {
3050 start_busy = (brw->batch.last_bo &&
3051 drm_intel_bo_busy(brw->batch.last_bo));
3052 start_time = get_time();
3053 }
3054
3055 struct brw_shader *shader = NULL;
3056 if (prog)
3057 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3058
3059 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3060 if (prog) {
3061 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3062 _mesa_print_ir(shader->ir, NULL);
3063 printf("\n\n");
3064 } else {
3065 printf("ARB_fragment_program %d ir for native fragment shader\n",
3066 fp->Base.Id);
3067 _mesa_print_program(&fp->Base);
3068 }
3069 }
3070
3071 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3072 */
3073 fs_visitor v(brw, c, prog, fp, 8);
3074 if (!v.run()) {
3075 if (prog) {
3076 prog->LinkStatus = false;
3077 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3078 }
3079
3080 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3081 v.fail_msg);
3082
3083 return NULL;
3084 }
3085
3086 exec_list *simd16_instructions = NULL;
3087 fs_visitor v2(brw, c, prog, fp, 16);
3088 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3089 if (c->prog_data.nr_pull_params == 0) {
3090 /* Try a 16-wide compile */
3091 v2.import_uniforms(&v);
3092 if (!v2.run()) {
3093 perf_debug("16-wide shader failed to compile, falling back to "
3094 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3095 } else {
3096 simd16_instructions = &v2.instructions;
3097 }
3098 } else {
3099 perf_debug("Skipping 16-wide due to pull parameters.\n");
3100 }
3101 }
3102
3103 c->prog_data.dispatch_width = 8;
3104
3105 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3106 const unsigned *generated = g.generate_assembly(&v.instructions,
3107 simd16_instructions,
3108 final_assembly_size);
3109
3110 if (unlikely(brw->perf_debug) && shader) {
3111 if (shader->compiled_once)
3112 brw_wm_debug_recompile(brw, prog, &c->key);
3113 shader->compiled_once = true;
3114
3115 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3116 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3117 (get_time() - start_time) * 1000);
3118 }
3119 }
3120
3121 return generated;
3122 }
3123
3124 bool
3125 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3126 {
3127 struct brw_context *brw = brw_context(ctx);
3128 struct brw_wm_prog_key key;
3129
3130 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3131 return true;
3132
3133 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3134 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3135 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3136 bool program_uses_dfdy = fp->UsesDFdy;
3137
3138 memset(&key, 0, sizeof(key));
3139
3140 if (brw->gen < 6) {
3141 if (fp->UsesKill)
3142 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3143
3144 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3145 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3146
3147 /* Just assume depth testing. */
3148 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3149 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3150 }
3151
3152 if (brw->gen < 6)
3153 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3154
3155 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3156
3157 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3158 for (unsigned i = 0; i < sampler_count; i++) {
3159 if (fp->Base.ShadowSamplers & (1 << i)) {
3160 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3161 key.tex.swizzles[i] =
3162 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3163 } else {
3164 /* Color sampler: assume no swizzling. */
3165 key.tex.swizzles[i] = SWIZZLE_XYZW;
3166 }
3167 }
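
   /* e.g. (illustrative) a shadow sampler in unit 0 gets the (x,x,x,1)
    * swizzle above, while a color sampler keeps the identity XYZW
    * swizzle. */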
3168
3169 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3170 key.drawable_height = ctx->DrawBuffer->Height;
3171 }
3172
3173 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3174 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3175 }
3176
3177 key.nr_color_regions = 1;
3178
3179 key.program_string_id = bfp->id;
3180
3181 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3182 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3183
3184 bool success = do_wm_prog(brw, prog, bfp, &key);
3185
3186 brw->wm.base.prog_offset = old_prog_offset;
3187 brw->wm.prog_data = old_prog_data;
3188
3189 return success;
3190 }