i965: Remove the "ARF" register file.
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "main/uniforms.h"
50 #include "glsl/glsl_types.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63
64 /* This will be the case for almost all instructions. */
65 this->regs_written = 1;
66 }
67
68 fs_inst::fs_inst()
69 {
70 init();
71 }
72
73 fs_inst::fs_inst(enum opcode opcode)
74 {
75 init();
76 this->opcode = opcode;
77 }
78
79 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
80 {
81 init();
82 this->opcode = opcode;
83 this->dst = dst;
84
85 if (dst.file == GRF)
86 assert(dst.reg_offset >= 0);
87 }
88
89 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
90 {
91 init();
92 this->opcode = opcode;
93 this->dst = dst;
94 this->src[0] = src0;
95
96 if (dst.file == GRF)
97 assert(dst.reg_offset >= 0);
98 if (src[0].file == GRF)
99 assert(src[0].reg_offset >= 0);
100 }
101
102 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
103 {
104 init();
105 this->opcode = opcode;
106 this->dst = dst;
107 this->src[0] = src0;
108 this->src[1] = src1;
109
110 if (dst.file == GRF)
111 assert(dst.reg_offset >= 0);
112 if (src[0].file == GRF)
113 assert(src[0].reg_offset >= 0);
114 if (src[1].file == GRF)
115 assert(src[1].reg_offset >= 0);
116 }
117
118 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
119 fs_reg src0, fs_reg src1, fs_reg src2)
120 {
121 init();
122 this->opcode = opcode;
123 this->dst = dst;
124 this->src[0] = src0;
125 this->src[1] = src1;
126 this->src[2] = src2;
127
128 if (dst.file == GRF)
129 assert(dst.reg_offset >= 0);
130 if (src[0].file == GRF)
131 assert(src[0].reg_offset >= 0);
132 if (src[1].file == GRF)
133 assert(src[1].reg_offset >= 0);
134 if (src[2].file == GRF)
135 assert(src[2].reg_offset >= 0);
136 }
137
138 #define ALU1(op) \
139 fs_inst * \
140 fs_visitor::op(fs_reg dst, fs_reg src0) \
141 { \
142 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
143 }
144
145 #define ALU2(op) \
146 fs_inst * \
147 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
148 { \
149 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
150 }
151
152 #define ALU3(op) \
153 fs_inst * \
154 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
155 { \
156 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
157 }
158
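/* Each invocation below expands to a small emitter method; for example,
 * ALU2(ADD) defines fs_visitor::ADD(dst, src0, src1), which simply
 * allocates a new BRW_OPCODE_ADD fs_inst out of mem_ctx.
 */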
159 ALU1(NOT)
160 ALU1(MOV)
161 ALU1(FRC)
162 ALU1(RNDD)
163 ALU1(RNDE)
164 ALU1(RNDZ)
165 ALU2(ADD)
166 ALU2(MUL)
167 ALU2(MACH)
168 ALU2(AND)
169 ALU2(OR)
170 ALU2(XOR)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2(ADDC)
184 ALU2(SUBB)
185
186 /** Gen4 predicated IF. */
187 fs_inst *
188 fs_visitor::IF(uint32_t predicate)
189 {
190 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
191 inst->predicate = predicate;
192 return inst;
193 }
194
195 /** Gen6+ IF with embedded comparison. */
196 fs_inst *
197 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
198 {
199 assert(brw->gen >= 6);
200 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
201 reg_null_d, src0, src1);
202 inst->conditional_mod = condition;
203 return inst;
204 }
205
206 /**
207 * CMP: Sets the low bit of the destination channels with the result
208 * of the comparison, while the upper bits are undefined, and updates
209 * the flag register with the packed 16 bits of the result.
210 */
211 fs_inst *
212 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
213 {
214 fs_inst *inst;
215
216 /* Take the instruction:
217 *
218 * CMP null<d> src0<f> src1<f>
219 *
220 * Original gen4 does type conversion to the destination type before
221 * comparison, producing garbage results for floating point comparisons.
222 * gen5 does the comparison on the execution type (resolved source types),
223 * so dst type doesn't matter. gen6 does comparison and then uses the
224 * result as if it was the dst type with no conversion, which happens to
225 * mostly work out for float-interpreted-as-int since our comparisons are
226 * for >0, =0, <0.
227 */
228 if (brw->gen == 4) {
229 dst.type = src0.type;
230 if (dst.file == HW_REG)
231 dst.fixed_hw_reg.type = dst.type;
232 }
233
234 resolve_ud_negate(&src0);
235 resolve_ud_negate(&src1);
236
237 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
238 inst->conditional_mod = condition;
239
240 return inst;
241 }
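/* A typical use is something like
 *    emit(CMP(reg_null_d, x, fs_reg(0.0f), BRW_CONDITIONAL_GE));
 * which discards the per-channel results and just leaves the flag
 * register set for later predication.
 */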
242
243 exec_list
244 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
245 fs_reg varying_offset,
246 uint32_t const_offset)
247 {
248 exec_list instructions;
249 fs_inst *inst;
250
251 /* We have our constant surface use a pitch of 4 bytes, so our index can
252 * be any component of a vector, and then we load 4 contiguous
253 * components starting from that.
254 *
255 * We break down the const_offset to a portion added to the variable
256 * offset and a portion done using reg_offset, which means that if you
257 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
258 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
259 * CSE can later notice that those loads are all the same and eliminate
260 * the redundant ones.
261 */
262 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
263 instructions.push_tail(ADD(vec4_offset,
264 varying_offset, const_offset & ~3));
265
266 int scale = 1;
267 if (brw->gen == 4 && dispatch_width == 8) {
268 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
269 * u, v, r) as parameters, or we can just use the SIMD16 message
270 * consisting of (header, u). We choose the second, at the cost of a
271 * longer return length.
272 */
273 scale = 2;
274 }
275
276 enum opcode op;
277 if (brw->gen >= 7)
278 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
279 else
280 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
281 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
282 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
283 inst->regs_written = 4 * scale;
284 instructions.push_tail(inst);
285
286 if (brw->gen < 7) {
287 inst->base_mrf = 13;
288 inst->header_present = true;
289 if (brw->gen == 4)
290 inst->mlen = 3;
291 else
292 inst->mlen = 1 + dispatch_width / 8;
293 }
294
295 vec4_result.reg_offset += (const_offset & 3) * scale;
296 instructions.push_tail(MOV(dst, vec4_result));
297
298 return instructions;
299 }
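/* For example, with const_offset == 6 the vec4 load above reads from
 * varying_offset + 4 (the aligned part), and the final MOV picks
 * component 2 of the returned vec4 (reg_offset += 2 * scale).
 */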
300
301 /**
302 * A helper for MOV generation for fixing up broken hardware SEND dependency
303 * handling.
304 */
305 fs_inst *
306 fs_visitor::DEP_RESOLVE_MOV(int grf)
307 {
308 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
309
310 inst->ir = NULL;
311 inst->annotation = "send dependency resolve";
312
313 /* The caller always wants uncompressed to emit the minimal extra
314 * dependencies, and to avoid having to deal with aligning its regs to 2.
315 */
316 inst->force_uncompressed = true;
317
318 return inst;
319 }
320
321 bool
322 fs_inst::equals(fs_inst *inst)
323 {
324 return (opcode == inst->opcode &&
325 dst.equals(inst->dst) &&
326 src[0].equals(inst->src[0]) &&
327 src[1].equals(inst->src[1]) &&
328 src[2].equals(inst->src[2]) &&
329 saturate == inst->saturate &&
330 predicate == inst->predicate &&
331 conditional_mod == inst->conditional_mod &&
332 mlen == inst->mlen &&
333 base_mrf == inst->base_mrf &&
334 sampler == inst->sampler &&
335 target == inst->target &&
336 eot == inst->eot &&
337 header_present == inst->header_present &&
338 shadow_compare == inst->shadow_compare &&
339 offset == inst->offset);
340 }
341
342 bool
343 fs_inst::overwrites_reg(const fs_reg &reg)
344 {
345 return (reg.file == dst.file &&
346 reg.reg == dst.reg &&
347 reg.reg_offset >= dst.reg_offset &&
348 reg.reg_offset < dst.reg_offset + regs_written);
349 }
350
351 bool
352 fs_inst::is_send_from_grf()
353 {
354 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
355 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
356 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
357 src[1].file == GRF));
358 }
359
360 bool
361 fs_visitor::can_do_source_mods(fs_inst *inst)
362 {
363 if (brw->gen == 6 && inst->is_math())
364 return false;
365
366 if (inst->is_send_from_grf())
367 return false;
368
369 return true;
370 }
371
372 void
373 fs_reg::init()
374 {
375 memset(this, 0, sizeof(*this));
376 this->smear = -1;
377 }
378
379 /** Generic unset register constructor. */
380 fs_reg::fs_reg()
381 {
382 init();
383 this->file = BAD_FILE;
384 }
385
386 /** Immediate value constructor. */
387 fs_reg::fs_reg(float f)
388 {
389 init();
390 this->file = IMM;
391 this->type = BRW_REGISTER_TYPE_F;
392 this->imm.f = f;
393 }
394
395 /** Immediate value constructor. */
396 fs_reg::fs_reg(int32_t i)
397 {
398 init();
399 this->file = IMM;
400 this->type = BRW_REGISTER_TYPE_D;
401 this->imm.i = i;
402 }
403
404 /** Immediate value constructor. */
405 fs_reg::fs_reg(uint32_t u)
406 {
407 init();
408 this->file = IMM;
409 this->type = BRW_REGISTER_TYPE_UD;
410 this->imm.u = u;
411 }
412
413 /** Fixed brw_reg Immediate value constructor. */
414 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
415 {
416 init();
417 this->file = HW_REG;
418 this->fixed_hw_reg = fixed_hw_reg;
419 this->type = fixed_hw_reg.type;
420 }
421
422 bool
423 fs_reg::equals(const fs_reg &r) const
424 {
425 return (file == r.file &&
426 reg == r.reg &&
427 reg_offset == r.reg_offset &&
428 type == r.type &&
429 negate == r.negate &&
430 abs == r.abs &&
431 !reladdr && !r.reladdr &&
432 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
433 sizeof(fixed_hw_reg)) == 0 &&
434 smear == r.smear &&
435 imm.u == r.imm.u);
436 }
437
438 bool
439 fs_reg::is_zero() const
440 {
441 if (file != IMM)
442 return false;
443
444 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
445 }
446
447 bool
448 fs_reg::is_one() const
449 {
450 if (file != IMM)
451 return false;
452
453 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
454 }
455
456 bool
457 fs_reg::is_valid_3src() const
458 {
459 return file == GRF || file == UNIFORM;
460 }
461
462 int
463 fs_visitor::type_size(const struct glsl_type *type)
464 {
465 unsigned int size, i;
466
467 switch (type->base_type) {
468 case GLSL_TYPE_UINT:
469 case GLSL_TYPE_INT:
470 case GLSL_TYPE_FLOAT:
471 case GLSL_TYPE_BOOL:
472 return type->components();
473 case GLSL_TYPE_ARRAY:
474 return type_size(type->fields.array) * type->length;
475 case GLSL_TYPE_STRUCT:
476 size = 0;
477 for (i = 0; i < type->length; i++) {
478 size += type_size(type->fields.structure[i].type);
479 }
480 return size;
481 case GLSL_TYPE_SAMPLER:
482 /* Samplers take up no register space, since they're baked in at
483 * link time.
484 */
485 return 0;
486 case GLSL_TYPE_VOID:
487 case GLSL_TYPE_ERROR:
488 case GLSL_TYPE_INTERFACE:
489 assert(!"not reached");
490 break;
491 }
492
493 return 0;
494 }
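/* e.g. type_size() of a float is 1, a vec4 is 4, a mat3 is 9, and a
 * vec4[20] array is 80 components.
 */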
495
496 fs_reg
497 fs_visitor::get_timestamp()
498 {
499 assert(brw->gen >= 7);
500
501 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
502 BRW_ARF_TIMESTAMP,
503 0),
504 BRW_REGISTER_TYPE_UD));
505
506 fs_reg dst = fs_reg(this, glsl_type::uint_type);
507
508 fs_inst *mov = emit(MOV(dst, ts));
509 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
510 * even if it's not enabled in the dispatch.
511 */
512 mov->force_writemask_all = true;
513 mov->force_uncompressed = true;
514
515 /* The caller wants the low 32 bits of the timestamp. Since it's running
516 * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
517 * which is plenty of time for our purposes. It is identical across the
518 * EUs, but since it's tracking GPU core speed it will increment at a
519 * varying rate as render P-states change.
520 *
521 * The caller could also check if render P-states have changed (or anything
522 * else that might disrupt timing) by setting smear to 2 and checking if
523 * that field is != 0.
524 */
525 dst.smear = 0;
526
527 return dst;
528 }
529
530 void
531 fs_visitor::emit_shader_time_begin()
532 {
533 current_annotation = "shader time start";
534 shader_start_time = get_timestamp();
535 }
536
537 void
538 fs_visitor::emit_shader_time_end()
539 {
540 current_annotation = "shader time end";
541
542 enum shader_time_shader_type type, written_type, reset_type;
543 if (dispatch_width == 8) {
544 type = ST_FS8;
545 written_type = ST_FS8_WRITTEN;
546 reset_type = ST_FS8_RESET;
547 } else {
548 assert(dispatch_width == 16);
549 type = ST_FS16;
550 written_type = ST_FS16_WRITTEN;
551 reset_type = ST_FS16_RESET;
552 }
553
554 fs_reg shader_end_time = get_timestamp();
555
556 /* Check that there weren't any timestamp reset events (assuming these
557 * were the only two timestamp reads that happened).
558 */
559 fs_reg reset = shader_end_time;
560 reset.smear = 2;
561 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
562 test->conditional_mod = BRW_CONDITIONAL_Z;
563 emit(IF(BRW_PREDICATE_NORMAL));
564
565 push_force_uncompressed();
566 fs_reg start = shader_start_time;
567 start.negate = true;
568 fs_reg diff = fs_reg(this, glsl_type::uint_type);
569 emit(ADD(diff, start, shader_end_time));
570
571 /* If there were no instructions between the two timestamp gets, the diff
572 * is 2 cycles. Remove that overhead, so I can forget about that when
573 * trying to determine the time taken for single instructions.
574 */
575 emit(ADD(diff, diff, fs_reg(-2u)));
576
577 emit_shader_time_write(type, diff);
578 emit_shader_time_write(written_type, fs_reg(1u));
579 emit(BRW_OPCODE_ELSE);
580 emit_shader_time_write(reset_type, fs_reg(1u));
581 emit(BRW_OPCODE_ENDIF);
582
583 pop_force_uncompressed();
584 }
585
586 void
587 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
588 fs_reg value)
589 {
590 int shader_time_index =
591 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
592 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
593
594 fs_reg payload;
595 if (dispatch_width == 8)
596 payload = fs_reg(this, glsl_type::uvec2_type);
597 else
598 payload = fs_reg(this, glsl_type::uint_type);
599
600 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
601 fs_reg(), payload, offset, value));
602 }
603
604 void
605 fs_visitor::fail(const char *format, ...)
606 {
607 va_list va;
608 char *msg;
609
610 if (failed)
611 return;
612
613 failed = true;
614
615 va_start(va, format);
616 msg = ralloc_vasprintf(mem_ctx, format, va);
617 va_end(va);
618 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
619
620 this->fail_msg = msg;
621
622 if (INTEL_DEBUG & DEBUG_WM) {
623 fprintf(stderr, "%s", msg);
624 }
625 }
626
627 fs_inst *
628 fs_visitor::emit(enum opcode opcode)
629 {
630 return emit(fs_inst(opcode));
631 }
632
633 fs_inst *
634 fs_visitor::emit(enum opcode opcode, fs_reg dst)
635 {
636 return emit(fs_inst(opcode, dst));
637 }
638
639 fs_inst *
640 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
641 {
642 return emit(fs_inst(opcode, dst, src0));
643 }
644
645 fs_inst *
646 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
647 {
648 return emit(fs_inst(opcode, dst, src0, src1));
649 }
650
651 fs_inst *
652 fs_visitor::emit(enum opcode opcode, fs_reg dst,
653 fs_reg src0, fs_reg src1, fs_reg src2)
654 {
655 return emit(fs_inst(opcode, dst, src0, src1, src2));
656 }
657
658 void
659 fs_visitor::push_force_uncompressed()
660 {
661 force_uncompressed_stack++;
662 }
663
664 void
665 fs_visitor::pop_force_uncompressed()
666 {
667 force_uncompressed_stack--;
668 assert(force_uncompressed_stack >= 0);
669 }
670
671 void
672 fs_visitor::push_force_sechalf()
673 {
674 force_sechalf_stack++;
675 }
676
677 void
678 fs_visitor::pop_force_sechalf()
679 {
680 force_sechalf_stack--;
681 assert(force_sechalf_stack >= 0);
682 }
683
684 /**
685 * Returns true if the instruction has a flag that means it won't
686 * update an entire destination register.
687 *
688 * For example, dead code elimination and live variable analysis want to know
689 * when a write to a variable screens off any preceding values that were in
690 * it.
691 */
692 bool
693 fs_inst::is_partial_write()
694 {
695 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
696 this->force_uncompressed ||
697 this->force_sechalf);
698 }
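/* e.g. a MOV with force_sechalf set only defines the second half of a
 * SIMD16 destination, and a predicated (non-SEL) write leaves unselected
 * channels untouched, so older values there stay live.
 */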
699
700 /**
701 * Returns how many MRFs an FS opcode will write over.
702 *
703 * Note that this is not the 0 or 1 implied writes in an actual gen
704 * instruction -- the FS opcodes often generate MOVs in addition.
705 */
706 int
707 fs_visitor::implied_mrf_writes(fs_inst *inst)
708 {
709 if (inst->mlen == 0)
710 return 0;
711
712 switch (inst->opcode) {
713 case SHADER_OPCODE_RCP:
714 case SHADER_OPCODE_RSQ:
715 case SHADER_OPCODE_SQRT:
716 case SHADER_OPCODE_EXP2:
717 case SHADER_OPCODE_LOG2:
718 case SHADER_OPCODE_SIN:
719 case SHADER_OPCODE_COS:
720 return 1 * dispatch_width / 8;
721 case SHADER_OPCODE_POW:
722 case SHADER_OPCODE_INT_QUOTIENT:
723 case SHADER_OPCODE_INT_REMAINDER:
724 return 2 * dispatch_width / 8;
725 case SHADER_OPCODE_TEX:
726 case FS_OPCODE_TXB:
727 case SHADER_OPCODE_TXD:
728 case SHADER_OPCODE_TXF:
729 case SHADER_OPCODE_TXF_MS:
730 case SHADER_OPCODE_TG4:
731 case SHADER_OPCODE_TXL:
732 case SHADER_OPCODE_TXS:
733 case SHADER_OPCODE_LOD:
734 return 1;
735 case FS_OPCODE_FB_WRITE:
736 return 2;
737 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
738 case FS_OPCODE_UNSPILL:
739 return 1;
740 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
741 return inst->mlen;
742 case FS_OPCODE_SPILL:
743 return 2;
744 default:
745 assert(!"not reached");
746 return inst->mlen;
747 }
748 }
749
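/** Allocate a new virtual GRF of the given size (in registers) and return
 * its index, growing the virtual_grf_sizes array as needed.
 */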
750 int
751 fs_visitor::virtual_grf_alloc(int size)
752 {
753 if (virtual_grf_array_size <= virtual_grf_count) {
754 if (virtual_grf_array_size == 0)
755 virtual_grf_array_size = 16;
756 else
757 virtual_grf_array_size *= 2;
758 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
759 virtual_grf_array_size);
760 }
761 virtual_grf_sizes[virtual_grf_count] = size;
762 return virtual_grf_count++;
763 }
764
765 /** Register file/number constructor (GRF, MRF, etc.), defaulting to float type. */
766 fs_reg::fs_reg(enum register_file file, int reg)
767 {
768 init();
769 this->file = file;
770 this->reg = reg;
771 this->type = BRW_REGISTER_TYPE_F;
772 }
773
774 /** Register file/number constructor with an explicit register type. */
775 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
776 {
777 init();
778 this->file = file;
779 this->reg = reg;
780 this->type = type;
781 }
782
783 /** Automatic reg constructor. */
784 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
785 {
786 init();
787
788 this->file = GRF;
789 this->reg = v->virtual_grf_alloc(v->type_size(type));
790 this->reg_offset = 0;
791 this->type = brw_type_for_base_type(type);
792 }
793
794 fs_reg *
795 fs_visitor::variable_storage(ir_variable *var)
796 {
797 return (fs_reg *)hash_table_find(this->variable_ht, var);
798 }
799
800 void
801 import_uniforms_callback(const void *key,
802 void *data,
803 void *closure)
804 {
805 struct hash_table *dst_ht = (struct hash_table *)closure;
806 const fs_reg *reg = (const fs_reg *)data;
807
808 if (reg->file != UNIFORM)
809 return;
810
811 hash_table_insert(dst_ht, data, key);
812 }
813
814 /* For 16-wide, we need to follow the uniform setup from the 8-wide dispatch.
815 * This brings in those uniform definitions.
816 */
817 void
818 fs_visitor::import_uniforms(fs_visitor *v)
819 {
820 hash_table_call_foreach(v->variable_ht,
821 import_uniforms_callback,
822 variable_ht);
823 this->params_remap = v->params_remap;
824 this->nr_params_remap = v->nr_params_remap;
825 }
826
827 /* Our support for uniforms is piggy-backed on the struct
828 * gl_fragment_program, because that's where the values actually
829 * get stored, rather than in some global gl_shader_program uniform
830 * store.
831 */
832 void
833 fs_visitor::setup_uniform_values(ir_variable *ir)
834 {
835 int namelen = strlen(ir->name);
836
837 /* The data for our (non-builtin) uniforms is stored in a series of
838 * gl_uniform_driver_storage structs for each subcomponent that
839 * glGetUniformLocation() could name. We know it's been set up in the same
840 * order we'd walk the type, so walk the list of storage and find anything
841 * with our name, or the prefix of a component that starts with our name.
842 */
843 unsigned params_before = c->prog_data.nr_params;
844 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
845 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
846
847 if (strncmp(ir->name, storage->name, namelen) != 0 ||
848 (storage->name[namelen] != 0 &&
849 storage->name[namelen] != '.' &&
850 storage->name[namelen] != '[')) {
851 continue;
852 }
853
854 unsigned slots = storage->type->component_slots();
855 if (storage->array_elements)
856 slots *= storage->array_elements;
857
858 for (unsigned i = 0; i < slots; i++) {
859 c->prog_data.param[c->prog_data.nr_params++] =
860 &storage->storage[i].f;
861 }
862 }
863
864 /* Make sure we actually initialized the right amount of stuff here. */
865 assert(params_before + ir->type->component_slots() ==
866 c->prog_data.nr_params);
867 (void)params_before;
868 }
869
870
871 /* Our support for builtin uniforms is even scarier than non-builtin.
872 * It sits on top of the PROG_STATE_VAR parameters that are
873 * automatically updated from GL context state.
874 */
875 void
876 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
877 {
878 const ir_state_slot *const slots = ir->state_slots;
879 assert(ir->state_slots != NULL);
880
881 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
882 /* This state reference has already been set up by ir_to_mesa, but we'll
883 * get the same index back here.
884 */
885 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
886 (gl_state_index *)slots[i].tokens);
887
888 /* Add each of the unique swizzles of the element as a parameter.
889 * This'll end up matching the expected layout of the
890 * array/matrix/structure we're trying to fill in.
891 */
892 int last_swiz = -1;
893 for (unsigned int j = 0; j < 4; j++) {
894 int swiz = GET_SWZ(slots[i].swizzle, j);
895 if (swiz == last_swiz)
896 break;
897 last_swiz = swiz;
898
899 c->prog_data.param[c->prog_data.nr_params++] =
900 &fp->Base.Parameters->ParameterValues[index][swiz].f;
901 }
902 }
903 }
904
905 fs_reg *
906 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
907 {
908 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
909 fs_reg wpos = *reg;
910 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
911
912 /* gl_FragCoord.x */
913 if (ir->pixel_center_integer) {
914 emit(MOV(wpos, this->pixel_x));
915 } else {
916 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
917 }
918 wpos.reg_offset++;
919
920 /* gl_FragCoord.y */
921 if (!flip && ir->pixel_center_integer) {
922 emit(MOV(wpos, this->pixel_y));
923 } else {
924 fs_reg pixel_y = this->pixel_y;
925 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
926
927 if (flip) {
928 pixel_y.negate = true;
929 offset += c->key.drawable_height - 1.0;
930 }
931
932 emit(ADD(wpos, pixel_y, fs_reg(offset)));
933 }
934 wpos.reg_offset++;
935
936 /* gl_FragCoord.z */
937 if (brw->gen >= 6) {
938 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
939 } else {
940 emit(FS_OPCODE_LINTERP, wpos,
941 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
942 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
943 interp_reg(VARYING_SLOT_POS, 2));
944 }
945 wpos.reg_offset++;
946
947 /* gl_FragCoord.w: Already set up in emit_interpolation */
948 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
949
950 return reg;
951 }
952
953 fs_inst *
954 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
955 glsl_interp_qualifier interpolation_mode,
956 bool is_centroid)
957 {
958 brw_wm_barycentric_interp_mode barycoord_mode;
959 if (brw->gen >= 6) {
960 if (is_centroid) {
961 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
962 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
963 else
964 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
965 } else {
966 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
967 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
968 else
969 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
970 }
971 } else {
972 /* On Ironlake and below, there is only one interpolation mode.
973 * Centroid interpolation doesn't mean anything on this hardware --
974 * there is no multisampling.
975 */
976 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
977 }
978 return emit(FS_OPCODE_LINTERP, attr,
979 this->delta_x[barycoord_mode],
980 this->delta_y[barycoord_mode], interp);
981 }
982
983 fs_reg *
984 fs_visitor::emit_general_interpolation(ir_variable *ir)
985 {
986 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
987 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
988 fs_reg attr = *reg;
989
990 unsigned int array_elements;
991 const glsl_type *type;
992
993 if (ir->type->is_array()) {
994 array_elements = ir->type->length;
995 if (array_elements == 0) {
996 fail("dereferenced array '%s' has length 0\n", ir->name);
997 }
998 type = ir->type->fields.array;
999 } else {
1000 array_elements = 1;
1001 type = ir->type;
1002 }
1003
1004 glsl_interp_qualifier interpolation_mode =
1005 ir->determine_interpolation_mode(c->key.flat_shade);
1006
1007 int location = ir->location;
1008 for (unsigned int i = 0; i < array_elements; i++) {
1009 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1010 if (c->prog_data.urb_setup[location] == -1) {
1011 /* If there's no incoming setup data for this slot, don't
1012 * emit interpolation for it.
1013 */
1014 attr.reg_offset += type->vector_elements;
1015 location++;
1016 continue;
1017 }
1018
1019 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1020 /* Constant interpolation (flat shading) case. The SF has
1021 * handed us defined values in only the constant offset
1022 * field of the setup reg.
1023 */
1024 for (unsigned int k = 0; k < type->vector_elements; k++) {
1025 struct brw_reg interp = interp_reg(location, k);
1026 interp = suboffset(interp, 3);
1027 interp.type = reg->type;
1028 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1029 attr.reg_offset++;
1030 }
1031 } else {
1032 /* Smooth/noperspective interpolation case. */
1033 for (unsigned int k = 0; k < type->vector_elements; k++) {
1034 /* FINISHME: At some point we probably want to push
1035 * this farther by giving similar treatment to the
1036 * other potentially constant components of the
1037 * attribute, as well as making brw_vs_constval.c
1038 * handle varyings other than gl_TexCoord.
1039 */
1040 struct brw_reg interp = interp_reg(location, k);
1041 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1042 ir->centroid);
1043 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1044 /* Get the pixel/sample mask into f0 so that we know
1045 * which pixels are lit. Then, for each channel that is
1046 * unlit, replace the centroid data with non-centroid
1047 * data.
1048 */
1049 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1050 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1051 interpolation_mode, false);
1052 inst->predicate = BRW_PREDICATE_NORMAL;
1053 inst->predicate_inverse = true;
1054 }
1055 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1056 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1057 }
1058 attr.reg_offset++;
1059 }
1060
1061 }
1062 location++;
1063 }
1064 }
1065
1066 return reg;
1067 }
1068
1069 fs_reg *
1070 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1071 {
1072 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1073
1074 /* The frontfacing comes in as a bit in the thread payload. */
1075 if (brw->gen >= 6) {
1076 emit(BRW_OPCODE_ASR, *reg,
1077 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1078 fs_reg(15));
1079 emit(BRW_OPCODE_NOT, *reg, *reg);
1080 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1081 } else {
1082 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1083 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1084 * us front face
1085 */
1086 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1087 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1088 }
1089
1090 return reg;
1091 }
1092
1093 fs_reg
1094 fs_visitor::fix_math_operand(fs_reg src)
1095 {
1096 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1097 * might be able to do better by doing execsize = 1 math and then
1098 * expanding that result out, but we would need to be careful with
1099 * masking.
1100 *
1101 * The hardware ignores source modifiers (negate and abs) on math
1102 * instructions, so we also move to a temp to set those up.
1103 */
1104 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1105 !src.abs && !src.negate)
1106 return src;
1107
1108 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1109 * operands to math
1110 */
1111 if (brw->gen >= 7 && src.file != IMM)
1112 return src;
1113
1114 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1115 expanded.type = src.type;
1116 emit(BRW_OPCODE_MOV, expanded, src);
1117 return expanded;
1118 }
1119
1120 fs_inst *
1121 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1122 {
1123 switch (opcode) {
1124 case SHADER_OPCODE_RCP:
1125 case SHADER_OPCODE_RSQ:
1126 case SHADER_OPCODE_SQRT:
1127 case SHADER_OPCODE_EXP2:
1128 case SHADER_OPCODE_LOG2:
1129 case SHADER_OPCODE_SIN:
1130 case SHADER_OPCODE_COS:
1131 break;
1132 default:
1133 assert(!"not reached: bad math opcode");
1134 return NULL;
1135 }
1136
1137 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1138 * might be able to do better by doing execsize = 1 math and then
1139 * expanding that result out, but we would need to be careful with
1140 * masking.
1141 *
1142 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1143 * instructions, so we also move to a temp to set those up.
1144 */
1145 if (brw->gen >= 6)
1146 src = fix_math_operand(src);
1147
1148 fs_inst *inst = emit(opcode, dst, src);
1149
1150 if (brw->gen < 6) {
1151 inst->base_mrf = 2;
1152 inst->mlen = dispatch_width / 8;
1153 }
1154
1155 return inst;
1156 }
1157
1158 fs_inst *
1159 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1160 {
1161 int base_mrf = 2;
1162 fs_inst *inst;
1163
1164 switch (opcode) {
1165 case SHADER_OPCODE_INT_QUOTIENT:
1166 case SHADER_OPCODE_INT_REMAINDER:
1167 if (brw->gen >= 7 && dispatch_width == 16)
1168 fail("16-wide INTDIV unsupported\n");
1169 break;
1170 case SHADER_OPCODE_POW:
1171 break;
1172 default:
1173 assert(!"not reached: unsupported binary math opcode.");
1174 return NULL;
1175 }
1176
1177 if (brw->gen >= 6) {
1178 src0 = fix_math_operand(src0);
1179 src1 = fix_math_operand(src1);
1180
1181 inst = emit(opcode, dst, src0, src1);
1182 } else {
1183 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1184 * "Message Payload":
1185 *
1186 * "Operand0[7]. For the INT DIV functions, this operand is the
1187 * denominator."
1188 * ...
1189 * "Operand1[7]. For the INT DIV functions, this operand is the
1190 * numerator."
1191 */
1192 bool is_int_div = opcode != SHADER_OPCODE_POW;
1193 fs_reg &op0 = is_int_div ? src1 : src0;
1194 fs_reg &op1 = is_int_div ? src0 : src1;
1195
1196 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1197 inst = emit(opcode, dst, op0, reg_null_f);
1198
1199 inst->base_mrf = base_mrf;
1200 inst->mlen = 2 * dispatch_width / 8;
1201 }
1202 return inst;
1203 }
1204
1205 void
1206 fs_visitor::assign_curb_setup()
1207 {
1208 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1209 if (dispatch_width == 8) {
1210 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1211 } else {
1212 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1213 }
1214
1215 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1216 foreach_list(node, &this->instructions) {
1217 fs_inst *inst = (fs_inst *)node;
1218
1219 for (unsigned int i = 0; i < 3; i++) {
1220 if (inst->src[i].file == UNIFORM) {
1221 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1222 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1223 constant_nr / 8,
1224 constant_nr % 8);
1225
1226 inst->src[i].file = HW_REG;
1227 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1228 }
1229 }
1230 }
1231 }
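/* For example, with nr_payload_regs == 2, UNIFORM constant_nr 10 ends up
 * as the fixed register g3.2 above (2 + 10 / 8, subregister 10 % 8).
 */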
1232
1233 void
1234 fs_visitor::calculate_urb_setup()
1235 {
1236 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1237 c->prog_data.urb_setup[i] = -1;
1238 }
1239
1240 int urb_next = 0;
1241 /* Figure out where each of the incoming setup attributes lands. */
1242 if (brw->gen >= 6) {
1243 if (_mesa_bitcount_64(fp->Base.InputsRead &
1244 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1245 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1246 * first 16 varying inputs, so we can put them wherever we want.
1247 * Just put them in order.
1248 *
1249 * This is useful because it means that (a) inputs not used by the
1250 * fragment shader won't take up valuable register space, and (b) we
1251 * won't have to recompile the fragment shader if it gets paired with
1252 * a different vertex (or geometry) shader.
1253 */
1254 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1255 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1256 BITFIELD64_BIT(i)) {
1257 c->prog_data.urb_setup[i] = urb_next++;
1258 }
1259 }
1260 } else {
1261 /* We have enough input varyings that the SF/SBE pipeline stage can't
1262 * arbitrarily rearrange them to suit our whim; we have to put them
1263 * in an order that matches the output of the previous pipeline stage
1264 * (geometry or vertex shader).
1265 */
1266 struct brw_vue_map prev_stage_vue_map;
1267 brw_compute_vue_map(brw, &prev_stage_vue_map,
1268 c->key.input_slots_valid);
1269 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1270 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1271 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1272 slot++) {
1273 int varying = prev_stage_vue_map.slot_to_varying[slot];
1274 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1275 * unused.
1276 */
1277 if (varying != BRW_VARYING_SLOT_COUNT &&
1278 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1279 BITFIELD64_BIT(varying))) {
1280 c->prog_data.urb_setup[varying] = slot - first_slot;
1281 }
1282 }
1283 urb_next = prev_stage_vue_map.num_slots - first_slot;
1284 }
1285 } else {
1286 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1287 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1288 /* Point size is packed into the header, not as a general attribute */
1289 if (i == VARYING_SLOT_PSIZ)
1290 continue;
1291
1292 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1293 /* The back color slot is skipped when the front color is
1294 * also written to. In addition, some slots can be
1295 * written in the vertex shader and not read in the
1296 * fragment shader. So the register number must always be
1297 * incremented, mapped or not.
1298 */
1299 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1300 c->prog_data.urb_setup[i] = urb_next;
1301 urb_next++;
1302 }
1303 }
1304
1305 /*
1306 * The point coordinate (PNTC) is an FS-only attribute, and the SF thread
1307 * did the interpolation for it, so count it here, too.
1308 *
1309 * See compile_sf_prog() for more info.
1310 */
1311 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1312 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1313 }
1314
1315 c->prog_data.num_varying_inputs = urb_next;
1316 }
1317
1318 void
1319 fs_visitor::assign_urb_setup()
1320 {
1321 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1322
1323 /* Offset all the urb_setup[] index by the actual position of the
1324 * setup regs, now that the location of the constants has been chosen.
1325 */
1326 foreach_list(node, &this->instructions) {
1327 fs_inst *inst = (fs_inst *)node;
1328
1329 if (inst->opcode == FS_OPCODE_LINTERP) {
1330 assert(inst->src[2].file == HW_REG);
1331 inst->src[2].fixed_hw_reg.nr += urb_start;
1332 }
1333
1334 if (inst->opcode == FS_OPCODE_CINTERP) {
1335 assert(inst->src[0].file == HW_REG);
1336 inst->src[0].fixed_hw_reg.nr += urb_start;
1337 }
1338 }
1339
1340 /* Each attribute is 4 setup channels, each of which is half a reg. */
1341 this->first_non_payload_grf =
1342 urb_start + c->prog_data.num_varying_inputs * 2;
1343 }
1344
1345 /**
1346 * Split large virtual GRFs into separate components if we can.
1347 *
1348 * This is mostly duplicated with what brw_fs_vector_splitting does,
1349 * but that's really conservative because it's afraid of doing
1350 * splitting that doesn't result in real progress after the rest of
1351 * the optimization phases, which would cause infinite looping in
1352 * optimization. We can do it once here, safely. This also has the
1353 * opportunity to split interpolated values, or maybe even uniforms,
1354 * which we don't have at the IR level.
1355 *
1356 * We want to split, because virtual GRFs are what we register
1357 * allocate and spill (due to contiguousness requirements for some
1358 * instructions), and they're what we naturally generate in the
1359 * codegen process, but most virtual GRFs don't actually need to be
1360 * contiguous sets of GRFs. If we split, we'll end up with reduced
1361 * live intervals and better dead code elimination and coalescing.
1362 */
1363 void
1364 fs_visitor::split_virtual_grfs()
1365 {
1366 int num_vars = this->virtual_grf_count;
1367 bool split_grf[num_vars];
1368 int new_virtual_grf[num_vars];
1369
1370 /* Try to split anything > 0 sized. */
1371 for (int i = 0; i < num_vars; i++) {
1372 if (this->virtual_grf_sizes[i] != 1)
1373 split_grf[i] = true;
1374 else
1375 split_grf[i] = false;
1376 }
1377
1378 if (brw->has_pln &&
1379 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1380 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1381 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1382 * Gen6, that was the only supported interpolation mode, and since Gen6,
1383 * delta_x and delta_y are in fixed hardware registers.
1384 */
1385 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1386 false;
1387 }
1388
1389 foreach_list(node, &this->instructions) {
1390 fs_inst *inst = (fs_inst *)node;
1391
1392 /* If there's a SEND message that requires contiguous destination
1393 * registers, no splitting is allowed.
1394 */
1395 if (inst->regs_written > 1) {
1396 split_grf[inst->dst.reg] = false;
1397 }
1398
1399 /* If we're sending from a GRF, don't split it, on the assumption that
1400 * the send is reading the whole thing.
1401 */
1402 if (inst->is_send_from_grf()) {
1403 for (int i = 0; i < 3; i++) {
1404 if (inst->src[i].file == GRF) {
1405 split_grf[inst->src[i].reg] = false;
1406 }
1407 }
1408 }
1409 }
1410
1411 /* Allocate new space for split regs. Note that the virtual
1412 * numbers will be contiguous.
1413 */
1414 for (int i = 0; i < num_vars; i++) {
1415 if (split_grf[i]) {
1416 new_virtual_grf[i] = virtual_grf_alloc(1);
1417 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1418 int reg = virtual_grf_alloc(1);
1419 assert(reg == new_virtual_grf[i] + j - 1);
1420 (void) reg;
1421 }
1422 this->virtual_grf_sizes[i] = 1;
1423 }
1424 }
1425
1426 foreach_list(node, &this->instructions) {
1427 fs_inst *inst = (fs_inst *)node;
1428
1429 if (inst->dst.file == GRF &&
1430 split_grf[inst->dst.reg] &&
1431 inst->dst.reg_offset != 0) {
1432 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1433 inst->dst.reg_offset - 1);
1434 inst->dst.reg_offset = 0;
1435 }
1436 for (int i = 0; i < 3; i++) {
1437 if (inst->src[i].file == GRF &&
1438 split_grf[inst->src[i].reg] &&
1439 inst->src[i].reg_offset != 0) {
1440 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1441 inst->src[i].reg_offset - 1);
1442 inst->src[i].reg_offset = 0;
1443 }
1444 }
1445 }
1446 this->live_intervals_valid = false;
1447 }
1448
1449 /**
1450 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1451 *
1452 * During code generation, we create tons of temporary variables, many of
1453 * which get immediately killed and are never used again. Yet, in later
1454 * optimization and analysis passes, such as compute_live_intervals, we need
1455 * to loop over all the virtual GRFs. Compacting them can save a lot of
1456 * overhead.
1457 */
1458 void
1459 fs_visitor::compact_virtual_grfs()
1460 {
1461 /* Mark which virtual GRFs are used, and count how many. */
1462 int remap_table[this->virtual_grf_count];
1463 memset(remap_table, -1, sizeof(remap_table));
1464
1465 foreach_list(node, &this->instructions) {
1466 const fs_inst *inst = (const fs_inst *) node;
1467
1468 if (inst->dst.file == GRF)
1469 remap_table[inst->dst.reg] = 0;
1470
1471 for (int i = 0; i < 3; i++) {
1472 if (inst->src[i].file == GRF)
1473 remap_table[inst->src[i].reg] = 0;
1474 }
1475 }
1476
1477 /* In addition to registers used in instructions, fs_visitor keeps
1478 * direct references to certain special values which must be patched:
1479 */
1480 fs_reg *special[] = {
1481 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1482 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1483 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1484 &delta_x[0], &delta_x[1], &delta_x[2],
1485 &delta_x[3], &delta_x[4], &delta_x[5],
1486 &delta_y[0], &delta_y[1], &delta_y[2],
1487 &delta_y[3], &delta_y[4], &delta_y[5],
1488 };
1489 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1490 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1491
1492 /* Treat all special values as used, to be conservative */
1493 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1494 if (special[i]->file == GRF)
1495 remap_table[special[i]->reg] = 0;
1496 }
1497
1498 /* Compact the GRF arrays. */
1499 int new_index = 0;
1500 for (int i = 0; i < this->virtual_grf_count; i++) {
1501 if (remap_table[i] != -1) {
1502 remap_table[i] = new_index;
1503 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1504 if (live_intervals_valid) {
1505 virtual_grf_start[new_index] = virtual_grf_start[i];
1506 virtual_grf_end[new_index] = virtual_grf_end[i];
1507 }
1508 ++new_index;
1509 }
1510 }
1511
1512 this->virtual_grf_count = new_index;
1513
1514 /* Patch all the instructions to use the newly renumbered registers */
1515 foreach_list(node, &this->instructions) {
1516 fs_inst *inst = (fs_inst *) node;
1517
1518 if (inst->dst.file == GRF)
1519 inst->dst.reg = remap_table[inst->dst.reg];
1520
1521 for (int i = 0; i < 3; i++) {
1522 if (inst->src[i].file == GRF)
1523 inst->src[i].reg = remap_table[inst->src[i].reg];
1524 }
1525 }
1526
1527 /* Patch all the references to special values */
1528 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1529 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1530 special[i]->reg = remap_table[special[i]->reg];
1531 }
1532 }
1533
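/**
 * Drops push-constant params that no instruction actually reads and
 * renumbers the survivors. The remap table is built during the 8-wide
 * compile and reused verbatim by the 16-wide compile.
 */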
1534 bool
1535 fs_visitor::remove_dead_constants()
1536 {
1537 if (dispatch_width == 8) {
1538 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1539 this->nr_params_remap = c->prog_data.nr_params;
1540
1541 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1542 this->params_remap[i] = -1;
1543
1544 /* Find which params are still in use. */
1545 foreach_list(node, &this->instructions) {
1546 fs_inst *inst = (fs_inst *)node;
1547
1548 for (int i = 0; i < 3; i++) {
1549 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1550
1551 if (inst->src[i].file != UNIFORM)
1552 continue;
1553
1554 /* Section 5.11 of the OpenGL 4.3 spec says:
1555 *
1556 * "Out-of-bounds reads return undefined values, which include
1557 * values from other variables of the active program or zero."
1558 */
1559 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1560 constant_nr = 0;
1561 }
1562
1563 /* For now, set this to non-negative. We'll give it the
1564 * actual new number in a moment, in order to keep the
1565 * register numbers nicely ordered.
1566 */
1567 this->params_remap[constant_nr] = 0;
1568 }
1569 }
1570
1571 /* Figure out what the new numbers for the params will be. At some
1572 * point when we're doing uniform array access, we're going to want
1573 * to keep the distinction between .reg and .reg_offset, but for
1574 * now we don't care.
1575 */
1576 unsigned int new_nr_params = 0;
1577 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1578 if (this->params_remap[i] != -1) {
1579 this->params_remap[i] = new_nr_params++;
1580 }
1581 }
1582
1583 /* Update the list of params to be uploaded to match our new numbering. */
1584 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1585 int remapped = this->params_remap[i];
1586
1587 if (remapped == -1)
1588 continue;
1589
1590 c->prog_data.param[remapped] = c->prog_data.param[i];
1591 }
1592
1593 c->prog_data.nr_params = new_nr_params;
1594 } else {
1595 /* This should have been generated in the 8-wide pass already. */
1596 assert(this->params_remap);
1597 }
1598
1599 /* Now do the renumbering of the shader to remove unused params. */
1600 foreach_list(node, &this->instructions) {
1601 fs_inst *inst = (fs_inst *)node;
1602
1603 for (int i = 0; i < 3; i++) {
1604 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1605
1606 if (inst->src[i].file != UNIFORM)
1607 continue;
1608
1609 /* As above, alias out-of-bounds accesses to constant 0. */
1610 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1611 constant_nr = 0;
1612 }
1613 assert(this->params_remap[constant_nr] != -1);
1614 inst->src[i].reg = this->params_remap[constant_nr];
1615 inst->src[i].reg_offset = 0;
1616 }
1617 }
1618
1619 return true;
1620 }
1621
1622 /*
1623 * Implements array access of uniforms by inserting a
1624 * PULL_CONSTANT_LOAD instruction.
1625 *
1626 * Unlike temporary GRF array access (where we don't support it due to
1627 * the difficulty of doing relative addressing on instruction
1628 * destinations), we could potentially do array access of uniforms
1629 * that were loaded in GRF space as push constants. In real-world
1630 * usage we've seen, though, the arrays being used are always larger
1631 * than we could load as push constants, so just always move all
1632 * uniform array access out to a pull constant buffer.
1633 */
1634 void
1635 fs_visitor::move_uniform_array_access_to_pull_constants()
1636 {
1637 int pull_constant_loc[c->prog_data.nr_params];
1638
1639 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1640 pull_constant_loc[i] = -1;
1641 }
1642
1643 /* Walk through and find array access of uniforms. Put a copy of that
1644 * uniform in the pull constant buffer.
1645 *
1646 * Note that we don't move constant-indexed accesses to arrays. No
1647 * testing has been done of the performance impact of this choice.
1648 */
1649 foreach_list_safe(node, &this->instructions) {
1650 fs_inst *inst = (fs_inst *)node;
1651
1652 for (int i = 0 ; i < 3; i++) {
1653 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1654 continue;
1655
1656 int uniform = inst->src[i].reg;
1657
1658 /* If this array isn't already present in the pull constant buffer,
1659 * add it.
1660 */
1661 if (pull_constant_loc[uniform] == -1) {
1662 const float **values = &c->prog_data.param[uniform];
1663
1664 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1665
1666 assert(param_size[uniform]);
1667
1668 for (int j = 0; j < param_size[uniform]; j++) {
1669 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1670 values[j];
1671 }
1672 }
1673
1674 /* Set up the annotation tracking for newly generated instructions. */
1675 base_ir = inst->ir;
1676 current_annotation = inst->annotation;
1677
1678 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1679 fs_reg temp = fs_reg(this, glsl_type::float_type);
1680 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1681 surf_index,
1682 *inst->src[i].reladdr,
1683 pull_constant_loc[uniform] +
1684 inst->src[i].reg_offset);
1685 inst->insert_before(&list);
1686
1687 inst->src[i].file = temp.file;
1688 inst->src[i].reg = temp.reg;
1689 inst->src[i].reg_offset = temp.reg_offset;
1690 inst->src[i].reladdr = NULL;
1691 }
1692 }
1693 }
1694
1695 /**
1696 * Choose accesses from the UNIFORM file to demote to using the pull
1697 * constant buffer.
1698 *
1699 * We allow a fragment shader to have more than the specified minimum
1700 * maximum number of fragment shader uniform components (64). If
1701 * there are too many of these, they'd fill up all of register space.
1702 * So, this will push some of them out to the pull constant buffer and
1703 * update the program to load them.
1704 */
1705 void
1706 fs_visitor::setup_pull_constants()
1707 {
1708 /* Only allow 16 registers (128 uniform components) as push constants. */
1709 unsigned int max_uniform_components = 16 * 8;
1710 if (c->prog_data.nr_params <= max_uniform_components)
1711 return;
1712
1713 if (dispatch_width == 16) {
1714 fail("Pull constants not supported in 16-wide\n");
1715 return;
1716 }
1717
1718 /* Just demote the end of the list. We could probably do better
1719 * here, demoting things that are rarely used in the program first.
1720 */
1721 unsigned int pull_uniform_base = max_uniform_components;
1722
1723 int pull_constant_loc[c->prog_data.nr_params];
1724 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1725 if (i < pull_uniform_base) {
1726 pull_constant_loc[i] = -1;
1727 } else {
1728 pull_constant_loc[i] = -1;
1729 /* If our constant is already being uploaded for reladdr purposes,
1730 * reuse it.
1731 */
1732 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1733 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1734 pull_constant_loc[i] = j;
1735 break;
1736 }
1737 }
1738 if (pull_constant_loc[i] == -1) {
1739 int pull_index = c->prog_data.nr_pull_params++;
1740 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1741 pull_constant_loc[i] = pull_index;
1742 }
1743 }
1744 }
1745 c->prog_data.nr_params = pull_uniform_base;
1746
1747 foreach_list(node, &this->instructions) {
1748 fs_inst *inst = (fs_inst *)node;
1749
1750 for (int i = 0; i < 3; i++) {
1751 if (inst->src[i].file != UNIFORM)
1752 continue;
1753
1754 int pull_index = pull_constant_loc[inst->src[i].reg +
1755 inst->src[i].reg_offset];
1756 if (pull_index == -1)
1757 continue;
1758
1759 assert(!inst->src[i].reladdr);
1760
1761 fs_reg dst = fs_reg(this, glsl_type::float_type);
1762 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1763 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1764 fs_inst *pull =
1765 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1766 dst, index, offset);
1767 pull->ir = inst->ir;
1768 pull->annotation = inst->annotation;
1769
1770 inst->insert_before(pull);
1771
1772 inst->src[i].file = GRF;
1773 inst->src[i].reg = dst.reg;
1774 inst->src[i].reg_offset = 0;
1775 inst->src[i].smear = pull_index & 3;
1776 }
1777 }
1778 }
1779
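/**
 * Performs simple algebraic simplifications on instructions with an
 * immediate second operand: a * 1.0 and a + 0.0 become plain MOVs of a,
 * and a * 0.0 becomes a MOV of 0.0.
 */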
1780 bool
1781 fs_visitor::opt_algebraic()
1782 {
1783 bool progress = false;
1784
1785 foreach_list(node, &this->instructions) {
1786 fs_inst *inst = (fs_inst *)node;
1787
1788 switch (inst->opcode) {
1789 case BRW_OPCODE_MUL:
1790 if (inst->src[1].file != IMM)
1791 continue;
1792
1793 /* a * 1.0 = a */
1794 if (inst->src[1].is_one()) {
1795 inst->opcode = BRW_OPCODE_MOV;
1796 inst->src[1] = reg_undef;
1797 progress = true;
1798 break;
1799 }
1800
1801 /* a * 0.0 = 0.0 */
1802 if (inst->src[1].is_zero()) {
1803 inst->opcode = BRW_OPCODE_MOV;
1804 inst->src[0] = inst->src[1];
1805 inst->src[1] = reg_undef;
1806 progress = true;
1807 break;
1808 }
1809
1810 break;
1811 case BRW_OPCODE_ADD:
1812 if (inst->src[1].file != IMM)
1813 continue;
1814
1815 /* a + 0.0 = a */
1816 if (inst->src[1].is_zero()) {
1817 inst->opcode = BRW_OPCODE_MOV;
1818 inst->src[1] = reg_undef;
1819 progress = true;
1820 break;
1821 }
1822 break;
1823 default:
1824 break;
1825 }
1826 }
1827
1828 return progress;
1829 }
1830
1831 /**
1832 * Removes any instructions writing a VGRF where that VGRF is not used by any
1833 * later instruction.
1834 */
1835 bool
1836 fs_visitor::dead_code_eliminate()
1837 {
1838 bool progress = false;
1839 int pc = 0;
1840
1841 calculate_live_intervals();
1842
1843 foreach_list_safe(node, &this->instructions) {
1844 fs_inst *inst = (fs_inst *)node;
1845
1846 if (inst->dst.file == GRF) {
1847 assert(this->virtual_grf_end[inst->dst.reg] >= pc);
1848 if (this->virtual_grf_end[inst->dst.reg] == pc) {
1849 /* Don't dead code eliminate instructions that write to the
1850 * accumulator as a side-effect. Instead just set the destination
1851 * to the null register to free it.
1852 */
1853 switch (inst->opcode) {
1854 case BRW_OPCODE_ADDC:
1855 case BRW_OPCODE_SUBB:
1856 case BRW_OPCODE_MACH:
1857 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
1858 break;
1859 default:
1860 inst->remove();
1861 break;
1862 }
1863 progress = true;
1864 }
1865 }
1866
1867 pc++;
1868 }
1869
1870 if (progress)
1871 live_intervals_valid = false;
1872
1873 return progress;
1874 }
1875
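/* Key for the per-basic-block table used by dead_code_eliminate_local():
 * one entry per (VGRF, reg_offset) chunk whose most recent write has not
 * been read yet.
 */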
1876 struct dead_code_hash_key
1877 {
1878 int vgrf;
1879 int reg_offset;
1880 };
1881
1882 static bool
1883 dead_code_hash_compare(const void *a, const void *b)
1884 {
1885 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1886 }
1887
1888 static void
1889 clear_dead_code_hash(struct hash_table *ht)
1890 {
1891 struct hash_entry *entry;
1892
1893 hash_table_foreach(ht, entry) {
1894 _mesa_hash_table_remove(ht, entry);
1895 }
1896 }
1897
1898 static void
1899 insert_dead_code_hash(struct hash_table *ht,
1900 int vgrf, int reg_offset, fs_inst *inst)
1901 {
1902 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1903 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1904
1905 key->vgrf = vgrf;
1906 key->reg_offset = reg_offset;
1907
1908 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1909 }
1910
1911 static struct hash_entry *
1912 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1913 {
1914 struct dead_code_hash_key key;
1915
1916 key.vgrf = vgrf;
1917 key.reg_offset = reg_offset;
1918
1919 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1920 }
1921
1922 static void
1923 remove_dead_code_hash(struct hash_table *ht,
1924 int vgrf, int reg_offset)
1925 {
1926 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1927 if (!entry)
1928 return;
1929
1930 _mesa_hash_table_remove(ht, entry);
1931 }
1932
1933 /**
1934 * Walks basic blocks, removing any regs that are written but not read before
1935 * being redefined.
1936 *
1937 * The dead_code_eliminate() function implements a global dead code
1938 * elimination, but it only handles removing the last write to a register
1939 * if it's never read. This one can handle intermediate writes, but only
1940 * within a basic block.
1941 */
1942 bool
1943 fs_visitor::dead_code_eliminate_local()
1944 {
1945 struct hash_table *ht;
1946 bool progress = false;
1947
1948 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1949
1950 foreach_list_safe(node, &this->instructions) {
1951 fs_inst *inst = (fs_inst *)node;
1952
1953 /* At a basic block boundary, empty the HT since we don't understand
1954 * dataflow across it.
1955 */
1956 if (inst->is_control_flow()) {
1957 clear_dead_code_hash(ht);
1958 continue;
1959 }
1960
1961 /* Clear the HT of any instructions that got read. */
1962 for (int i = 0; i < 3; i++) {
1963 fs_reg src = inst->src[i];
1964 if (src.file != GRF)
1965 continue;
1966
1967 int read = 1;
1968 if (inst->is_send_from_grf())
1969 read = virtual_grf_sizes[src.reg] - src.reg_offset;
1970
1971 for (int reg_offset = src.reg_offset;
1972 reg_offset < src.reg_offset + read;
1973 reg_offset++) {
1974 remove_dead_code_hash(ht, src.reg, reg_offset);
1975 }
1976 }
1977
1978 /* Add any update of a GRF to the HT, removing a previous write if it
1979 * wasn't read.
1980 */
1981 if (inst->dst.file == GRF) {
1982 if (inst->regs_written > 1) {
1983 /* We don't know how to trim channels from an instruction's
1984 * writes, so we can't incrementally remove unread channels from
1985 * it. Just remove whatever it overwrites from the table.
1986 */
1987 for (int i = 0; i < inst->regs_written; i++) {
1988 remove_dead_code_hash(ht,
1989 inst->dst.reg,
1990 inst->dst.reg_offset + i);
1991 }
1992 } else {
1993 struct hash_entry *entry =
1994 get_dead_code_hash_entry(ht, inst->dst.reg,
1995 inst->dst.reg_offset);
1996
1997 if (inst->is_partial_write()) {
1998 /* For a partial write, we can't remove any previous dead code
1999 * candidate, since we're just modifying its result, but we can
2000 * be dead code eliminated ourselves.
2001 */
2002 if (entry) {
2003 entry->data = inst;
2004 } else {
2005 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2006 inst);
2007 }
2008 } else {
2009 if (entry) {
2010 /* We're completely updating a channel, and there was a
2011 * previous write to the channel that wasn't read. Kill it!
2012 */
2013 fs_inst *inst = (fs_inst *)entry->data;
2014 inst->remove();
2015 progress = true;
2016 _mesa_hash_table_remove(ht, entry);
2017 }
2018
2019 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2020 inst);
2021 }
2022 }
2023 }
2024 }
2025
2026 _mesa_hash_table_destroy(ht, NULL);
2027
2028 if (progress)
2029 live_intervals_valid = false;
2030
2031 return progress;
2032 }
2033
2034 /**
2035 * Implements a second type of register coalescing: This one checks if
2036 * the two regs involved in a raw move don't interfere, in which case
2037 * they can both be stored in the same place and the MOV removed.
2038 */
2039 bool
2040 fs_visitor::register_coalesce_2()
2041 {
2042 bool progress = false;
2043
2044 calculate_live_intervals();
2045
2046 foreach_list_safe(node, &this->instructions) {
2047 fs_inst *inst = (fs_inst *)node;
2048
2049 if (inst->opcode != BRW_OPCODE_MOV ||
2050 inst->is_partial_write() ||
2051 inst->saturate ||
2052 inst->src[0].file != GRF ||
2053 inst->src[0].negate ||
2054 inst->src[0].abs ||
2055 inst->src[0].smear != -1 ||
2056 inst->dst.file != GRF ||
2057 inst->dst.type != inst->src[0].type ||
2058 virtual_grf_sizes[inst->src[0].reg] != 1 ||
2059 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
2060 continue;
2061 }
2062
2063 int reg_from = inst->src[0].reg;
2064 assert(inst->src[0].reg_offset == 0);
2065 int reg_to = inst->dst.reg;
2066 int reg_to_offset = inst->dst.reg_offset;
2067
2068 foreach_list(node, &this->instructions) {
2069 fs_inst *scan_inst = (fs_inst *)node;
2070
2071 if (scan_inst->dst.file == GRF &&
2072 scan_inst->dst.reg == reg_from) {
2073 scan_inst->dst.reg = reg_to;
2074 scan_inst->dst.reg_offset = reg_to_offset;
2075 }
2076 for (int i = 0; i < 3; i++) {
2077 if (scan_inst->src[i].file == GRF &&
2078 scan_inst->src[i].reg == reg_from) {
2079 scan_inst->src[i].reg = reg_to;
2080 scan_inst->src[i].reg_offset = reg_to_offset;
2081 }
2082 }
2083 }
2084
2085 inst->remove();
2086
2087 /* We don't need to recalculate live intervals inside the loop despite
2088 * flagging live_intervals_valid because we only use live intervals for
2089 * the interferes test, and we must have had a situation where the
2090 * intervals were:
2091 *
2092 * from    to
2093 *  ^
2094 *  |
2095 *  v
2096 *          ^
2097 *          |
2098 *          v
2099 *
2100 * Some register R that might get coalesced with one of these two could
2101 * only be referencing "to", otherwise "from"'s range would have been
2102 * longer. R's range could also only start at the end of "to" or later,
2103 * otherwise it will conflict with "to" when we try to coalesce "to"
2104 * into R anyway.
2105 */
2106 live_intervals_valid = false;
2107
2108 progress = true;
2109 continue;
2110 }
2111
2112 return progress;
2113 }
2114
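/**
 * The basic register coalescing pass: for a raw MOV in straight-line code
 * (outside any loop or IF block) whose source and destination are not
 * written again before the end of the program, rewrites later readers of
 * the destination to read the source and removes the MOV.
 */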
2115 bool
2116 fs_visitor::register_coalesce()
2117 {
2118 bool progress = false;
2119 int if_depth = 0;
2120 int loop_depth = 0;
2121
2122 foreach_list_safe(node, &this->instructions) {
2123 fs_inst *inst = (fs_inst *)node;
2124
2125 /* Make sure that we dominate the instructions we're going to
2126 * scan for interfering with our coalescing, or we won't have
2127 * scanned enough to see if anything interferes with our
2128 * coalescing. We don't dominate the following instructions if
2129 * we're in a loop or an if block.
2130 */
2131 switch (inst->opcode) {
2132 case BRW_OPCODE_DO:
2133 loop_depth++;
2134 break;
2135 case BRW_OPCODE_WHILE:
2136 loop_depth--;
2137 break;
2138 case BRW_OPCODE_IF:
2139 if_depth++;
2140 break;
2141 case BRW_OPCODE_ENDIF:
2142 if_depth--;
2143 break;
2144 default:
2145 break;
2146 }
2147 if (loop_depth || if_depth)
2148 continue;
2149
2150 if (inst->opcode != BRW_OPCODE_MOV ||
2151 inst->is_partial_write() ||
2152 inst->saturate ||
2153 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2154 inst->src[0].file != UNIFORM) ||
2155 inst->dst.type != inst->src[0].type)
2156 continue;
2157
2158 bool has_source_modifiers = (inst->src[0].abs ||
2159 inst->src[0].negate ||
2160 inst->src[0].smear != -1 ||
2161 inst->src[0].file == UNIFORM);
2162
2163 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2164 * them: check for no writes to either one until the exit of the
2165 * program.
2166 */
2167 bool interfered = false;
2168
2169 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2170 !scan_inst->is_tail_sentinel();
2171 scan_inst = (fs_inst *)scan_inst->next) {
2172 if (scan_inst->dst.file == GRF) {
2173 if (scan_inst->overwrites_reg(inst->dst) ||
2174 scan_inst->overwrites_reg(inst->src[0])) {
2175 interfered = true;
2176 break;
2177 }
2178 }
2179
2180 if (has_source_modifiers) {
2181 for (int i = 0; i < 3; i++) {
2182 if (scan_inst->src[i].file == GRF &&
2183 scan_inst->src[i].reg == inst->dst.reg &&
2184 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2185 inst->dst.type != scan_inst->src[i].type)
2186 {
2187 interfered = true;
2188 break;
2189 }
2190 }
2191 }
2192
2193
2194 /* The gen6 MATH instruction can't handle source modifiers or
2195 * unusual register regions, so avoid coalescing those for
2196 * now. We should do something more specific.
2197 */
2198 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2199 interfered = true;
2200 break;
2201 }
2202
2203 /* The accumulator result appears to get used for the
2204 * conditional modifier generation. When negating a UD
2205 * value, there is a 33rd bit generated for the sign in the
2206 * accumulator value, so now you can't check, for example,
2207 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2208 */
2209 if (scan_inst->conditional_mod &&
2210 inst->src[0].negate &&
2211 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2212 interfered = true;
2213 break;
2214 }
2215 }
2216 if (interfered) {
2217 continue;
2218 }
2219
2220 /* Rewrite the later usage to point at the source of the move to
2221 * be removed.
2222 */
2223 for (fs_inst *scan_inst = inst;
2224 !scan_inst->is_tail_sentinel();
2225 scan_inst = (fs_inst *)scan_inst->next) {
2226 for (int i = 0; i < 3; i++) {
2227 if (scan_inst->src[i].file == GRF &&
2228 scan_inst->src[i].reg == inst->dst.reg &&
2229 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2230 fs_reg new_src = inst->src[0];
2231 if (scan_inst->src[i].abs) {
2232 new_src.negate = 0;
2233 new_src.abs = 1;
2234 }
2235 new_src.negate ^= scan_inst->src[i].negate;
2236 scan_inst->src[i] = new_src;
2237 }
2238 }
2239 }
2240
2241 inst->remove();
2242 progress = true;
2243 }
2244
2245 if (progress)
2246 live_intervals_valid = false;
2247
2248 return progress;
2249 }
2250
2251
2252 bool
2253 fs_visitor::compute_to_mrf()
2254 {
2255 bool progress = false;
2256 int next_ip = 0;
2257
2258 calculate_live_intervals();
2259
2260 foreach_list_safe(node, &this->instructions) {
2261 fs_inst *inst = (fs_inst *)node;
2262
2263 int ip = next_ip;
2264 next_ip++;
2265
2266 if (inst->opcode != BRW_OPCODE_MOV ||
2267 inst->is_partial_write() ||
2268 inst->dst.file != MRF || inst->src[0].file != GRF ||
2269 inst->dst.type != inst->src[0].type ||
2270 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2271 continue;
2272
2273 /* Work out which hardware MRF registers are written by this
2274 * instruction.
2275 */
2276 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2277 int mrf_high;
2278 if (inst->dst.reg & BRW_MRF_COMPR4) {
2279 mrf_high = mrf_low + 4;
2280 } else if (dispatch_width == 16 &&
2281 (!inst->force_uncompressed && !inst->force_sechalf)) {
2282 mrf_high = mrf_low + 1;
2283 } else {
2284 mrf_high = mrf_low;
2285 }
2286
2287 /* Can't compute-to-MRF this GRF if someone else was going to
2288 * read it later.
2289 */
2290 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2291 continue;
2292
2293 /* Found a move of a GRF to a MRF. Let's see if we can go
2294 * rewrite the thing that made this GRF to write into the MRF.
2295 */
2296 fs_inst *scan_inst;
2297 for (scan_inst = (fs_inst *)inst->prev;
2298 scan_inst->prev != NULL;
2299 scan_inst = (fs_inst *)scan_inst->prev) {
2300 if (scan_inst->dst.file == GRF &&
2301 scan_inst->dst.reg == inst->src[0].reg) {
2302 /* Found the last thing to write the reg that we want to turn
2303 * into a compute-to-MRF.
2304 */
2305
2306 /* If this one instruction didn't populate all the
2307 * channels, bail. We might be able to rewrite everything
2308 * that writes that reg, but it would require smarter
2309 * tracking to delay the rewriting until complete success.
2310 */
2311 if (scan_inst->is_partial_write())
2312 break;
2313
2314 /* Things returning more than one register would need us to
2315 * understand coalescing out more than one MOV at a time.
2316 */
2317 if (scan_inst->regs_written > 1)
2318 break;
2319
2320 /* SEND instructions can't have MRF as a destination. */
2321 if (scan_inst->mlen)
2322 break;
2323
2324 if (brw->gen == 6) {
2325 /* gen6 math instructions must have the destination be
2326 * GRF, so no compute-to-MRF for them.
2327 */
2328 if (scan_inst->is_math()) {
2329 break;
2330 }
2331 }
2332
2333 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2334 /* Found the creator of our MRF's source value. */
2335 scan_inst->dst.file = MRF;
2336 scan_inst->dst.reg = inst->dst.reg;
2337 scan_inst->saturate |= inst->saturate;
2338 inst->remove();
2339 progress = true;
2340 }
2341 break;
2342 }
2343
2344 /* We don't handle control flow here. Most computation of
2345 * values that end up in MRFs happens shortly before the MRF
2346 * write anyway.
2347 */
2348 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2349 break;
2350
2351 /* You can't read from an MRF, so if someone else reads our
2352 * MRF's source GRF that we wanted to rewrite, that stops us.
2353 */
2354 bool interfered = false;
2355 for (int i = 0; i < 3; i++) {
2356 if (scan_inst->src[i].file == GRF &&
2357 scan_inst->src[i].reg == inst->src[0].reg &&
2358 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2359 interfered = true;
2360 }
2361 }
2362 if (interfered)
2363 break;
2364
2365 if (scan_inst->dst.file == MRF) {
2366 /* If somebody else writes our MRF here, we can't
2367 * compute-to-MRF before that.
2368 */
2369 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2370 int scan_mrf_high;
2371
2372 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2373 scan_mrf_high = scan_mrf_low + 4;
2374 } else if (dispatch_width == 16 &&
2375 (!scan_inst->force_uncompressed &&
2376 !scan_inst->force_sechalf)) {
2377 scan_mrf_high = scan_mrf_low + 1;
2378 } else {
2379 scan_mrf_high = scan_mrf_low;
2380 }
2381
2382 if (mrf_low == scan_mrf_low ||
2383 mrf_low == scan_mrf_high ||
2384 mrf_high == scan_mrf_low ||
2385 mrf_high == scan_mrf_high) {
2386 break;
2387 }
2388 }
2389
2390 if (scan_inst->mlen > 0) {
2391 /* Found a SEND instruction, which means that there are
2392 * live values in MRFs from base_mrf to base_mrf +
2393 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2394 * above it.
2395 */
2396 if (mrf_low >= scan_inst->base_mrf &&
2397 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2398 break;
2399 }
2400 if (mrf_high >= scan_inst->base_mrf &&
2401 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2402 break;
2403 }
2404 }
2405 }
2406 }
2407
2408 if (progress)
2409 live_intervals_valid = false;
2410
2411 return progress;
2412 }
2413
2414 /**
2415 * Walks through basic blocks, looking for repeated MRF writes and
2416 * removing the later ones.
2417 */
2418 bool
2419 fs_visitor::remove_duplicate_mrf_writes()
2420 {
2421 fs_inst *last_mrf_move[16];
2422 bool progress = false;
2423
2424 /* Need to update the MRF tracking for compressed instructions. */
2425 if (dispatch_width == 16)
2426 return false;
2427
2428 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2429
2430 foreach_list_safe(node, &this->instructions) {
2431 fs_inst *inst = (fs_inst *)node;
2432
2433 if (inst->is_control_flow()) {
2434 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2435 }
2436
2437 if (inst->opcode == BRW_OPCODE_MOV &&
2438 inst->dst.file == MRF) {
2439 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2440 if (prev_inst && inst->equals(prev_inst)) {
2441 inst->remove();
2442 progress = true;
2443 continue;
2444 }
2445 }
2446
2447 /* Clear out the last-write records for MRFs that were overwritten. */
2448 if (inst->dst.file == MRF) {
2449 last_mrf_move[inst->dst.reg] = NULL;
2450 }
2451
2452 if (inst->mlen > 0) {
2453 /* Found a SEND instruction, which will include two or fewer
2454 * implied MRF writes. We could do better here.
2455 */
2456 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2457 last_mrf_move[inst->base_mrf + i] = NULL;
2458 }
2459 }
2460
2461 /* Clear out any MRF move records whose sources got overwritten. */
2462 if (inst->dst.file == GRF) {
2463 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2464 if (last_mrf_move[i] &&
2465 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2466 last_mrf_move[i] = NULL;
2467 }
2468 }
2469 }
2470
2471 if (inst->opcode == BRW_OPCODE_MOV &&
2472 inst->dst.file == MRF &&
2473 inst->src[0].file == GRF &&
2474 !inst->is_partial_write()) {
2475 last_mrf_move[inst->dst.reg] = inst;
2476 }
2477 }
2478
2479 if (progress)
2480 live_intervals_valid = false;
2481
2482 return progress;
2483 }
2484
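/* Helper for the gen4 SEND dependency workarounds below: clears the
 * dependency flag for any register in [first_grf, first_grf + grf_len) that
 * this instruction reads, since a read resolves the outstanding dependency.
 */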
2485 static void
2486 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2487 int first_grf, int grf_len)
2488 {
2489 bool inst_16wide = (dispatch_width > 8 &&
2490 !inst->force_uncompressed &&
2491 !inst->force_sechalf);
2492
2493 /* Clear the flag for registers that actually got read (as expected). */
2494 for (int i = 0; i < 3; i++) {
2495 int grf;
2496 if (inst->src[i].file == GRF) {
2497 grf = inst->src[i].reg;
2498 } else if (inst->src[i].file == HW_REG &&
2499 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2500 grf = inst->src[i].fixed_hw_reg.nr;
2501 } else {
2502 continue;
2503 }
2504
2505 if (grf >= first_grf &&
2506 grf < first_grf + grf_len) {
2507 deps[grf - first_grf] = false;
2508 if (inst_16wide)
2509 deps[grf - first_grf + 1] = false;
2510 }
2511 }
2512 }
2513
2514 /**
2515 * Implements this workaround for the original 965:
2516 *
2517 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2518 * check for post destination dependencies on this instruction, software
2519 * must ensure that there is no destination hazard for the case of ‘write
2520 * followed by a posted write’ shown in the following example.
2521 *
2522 * 1. mov r3 0
2523 * 2. send r3.xy <rest of send instruction>
2524 * 3. mov r2 r3
2525 *
2526 * Due to no post-destination dependency check on the ‘send’, the above
2527 * code sequence could have two instructions (1 and 2) in flight at the
2528 * same time that both consider ‘r3’ as the target of their final writes."
2529 */
2530 void
2531 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2532 {
2533 int reg_size = dispatch_width / 8;
2534 int write_len = inst->regs_written * reg_size;
2535 int first_write_grf = inst->dst.reg;
2536 bool needs_dep[BRW_MAX_MRF];
2537 assert(write_len < (int)sizeof(needs_dep) - 1);
2538
2539 memset(needs_dep, false, sizeof(needs_dep));
2540 memset(needs_dep, true, write_len);
2541
2542 clear_deps_for_inst_src(inst, dispatch_width,
2543 needs_dep, first_write_grf, write_len);
2544
2545 /* Walk backwards looking for writes to registers we're writing which
2546 * aren't read since being written. If we hit the start of the program,
2547 * we assume that there are no outstanding dependencies on entry to the
2548 * program.
2549 */
2550 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2551 scan_inst != NULL;
2552 scan_inst = (fs_inst *)scan_inst->prev) {
2553
2554 /* If we hit control flow, assume that there *are* outstanding
2555 * dependencies, and force their cleanup before our instruction.
2556 */
2557 if (scan_inst->is_control_flow()) {
2558 for (int i = 0; i < write_len; i++) {
2559 if (needs_dep[i]) {
2560 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2561 }
2562 }
2563 return;
2564 }
2565
2566 bool scan_inst_16wide = (dispatch_width > 8 &&
2567 !scan_inst->force_uncompressed &&
2568 !scan_inst->force_sechalf);
2569
2570 /* We insert our reads as late as possible on the assumption that any
2571 * instruction but a MOV that might have left us an outstanding
2572 * dependency has more latency than a MOV.
2573 */
2574 if (scan_inst->dst.file == GRF) {
2575 for (int i = 0; i < scan_inst->regs_written; i++) {
2576 int reg = scan_inst->dst.reg + i * reg_size;
2577
2578 if (reg >= first_write_grf &&
2579 reg < first_write_grf + write_len &&
2580 needs_dep[reg - first_write_grf]) {
2581 inst->insert_before(DEP_RESOLVE_MOV(reg));
2582 needs_dep[reg - first_write_grf] = false;
2583 if (scan_inst_16wide)
2584 needs_dep[reg - first_write_grf + 1] = false;
2585 }
2586 }
2587 }
2588
2589 /* Clear the flag for registers that actually got read (as expected). */
2590 clear_deps_for_inst_src(scan_inst, dispatch_width,
2591 needs_dep, first_write_grf, write_len);
2592
2593 /* Continue the loop only if we haven't resolved all the dependencies */
2594 int i;
2595 for (i = 0; i < write_len; i++) {
2596 if (needs_dep[i])
2597 break;
2598 }
2599 if (i == write_len)
2600 return;
2601 }
2602 }
2603
2604 /**
2605 * Implements this workaround for the original 965:
2606 *
2607 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2608 * used as a destination register until after it has been sourced by an
2609 * instruction with a different destination register."
2610 */
2611 void
2612 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2613 {
2614 int write_len = inst->regs_written * dispatch_width / 8;
2615 int first_write_grf = inst->dst.reg;
2616 bool needs_dep[BRW_MAX_MRF];
2617 assert(write_len < (int)sizeof(needs_dep) - 1);
2618
2619 memset(needs_dep, false, sizeof(needs_dep));
2620 memset(needs_dep, true, write_len);
2621 /* Walk forwards looking for writes to registers we're writing which aren't
2622 * read before being written.
2623 */
2624 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2625 !scan_inst->is_tail_sentinel();
2626 scan_inst = (fs_inst *)scan_inst->next) {
2627 /* If we hit control flow, force resolve all remaining dependencies. */
2628 if (scan_inst->is_control_flow()) {
2629 for (int i = 0; i < write_len; i++) {
2630 if (needs_dep[i])
2631 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2632 }
2633 return;
2634 }
2635
2636 /* Clear the flag for registers that actually got read (as expected). */
2637 clear_deps_for_inst_src(scan_inst, dispatch_width,
2638 needs_dep, first_write_grf, write_len);
2639
2640 /* We insert our reads as late as possible since they're reading the
2641 * result of a SEND, which has massive latency.
2642 */
2643 if (scan_inst->dst.file == GRF &&
2644 scan_inst->dst.reg >= first_write_grf &&
2645 scan_inst->dst.reg < first_write_grf + write_len &&
2646 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2647 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2648 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2649 }
2650
2651 /* Continue the loop only if we haven't resolved all the dependencies */
2652 int i;
2653 for (i = 0; i < write_len; i++) {
2654 if (needs_dep[i])
2655 break;
2656 }
2657 if (i == write_len)
2658 return;
2659 }
2660
2661 /* If we hit the end of the program, resolve all remaining dependencies out
2662 * of paranoia.
2663 */
2664 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2665 assert(last_inst->eot);
2666 for (int i = 0; i < write_len; i++) {
2667 if (needs_dep[i])
2668 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2669 }
2670 }
2671
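/**
 * Applies both of the gen4 SEND dependency workarounds above to every SEND
 * that writes a GRF destination (original gen4 only; G4X is unaffected).
 */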
2672 void
2673 fs_visitor::insert_gen4_send_dependency_workarounds()
2674 {
2675 if (brw->gen != 4 || brw->is_g4x)
2676 return;
2677
2678 /* Note that we're done with register allocation, so GRF fs_regs always
2679 * have a .reg_offset of 0.
2680 */
2681
2682 foreach_list_safe(node, &this->instructions) {
2683 fs_inst *inst = (fs_inst *)node;
2684
2685 if (inst->mlen != 0 && inst->dst.file == GRF) {
2686 insert_gen4_pre_send_dependency_workarounds(inst);
2687 insert_gen4_post_send_dependency_workarounds(inst);
2688 }
2689 }
2690 }
2691
2692 /**
2693 * Turns the generic expression-style uniform pull constant load instruction
2694 * into a hardware-specific series of instructions for loading a pull
2695 * constant.
2696 *
2697 * The expression style allows the CSE pass before this to optimize out
2698 * repeated loads from the same offset, and gives the pre-register-allocation
2699 * scheduling full flexibility, while the conversion to native instructions
2700 * allows the post-register-allocation scheduler the best information
2701 * possible.
2702 *
2703 * Note that execution masking for setting up pull constant loads is special:
2704 * the channels that need to be written are unrelated to the current execution
2705 * mask, since a later instruction will use one of the result channels as a
2706 * source operand for all 8 or 16 of its channels.
2707 */
2708 void
2709 fs_visitor::lower_uniform_pull_constant_loads()
2710 {
2711 foreach_list(node, &this->instructions) {
2712 fs_inst *inst = (fs_inst *)node;
2713
2714 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2715 continue;
2716
2717 if (brw->gen >= 7) {
2718 /* The offset arg before was a vec4-aligned byte offset. We need to
2719 * turn it into a dword offset.
2720 */
2721 fs_reg const_offset_reg = inst->src[1];
2722 assert(const_offset_reg.file == IMM &&
2723 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2724 const_offset_reg.imm.u /= 4;
2725 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2726
2727 /* This is actually going to be a MOV, but since only the first dword
2728 * is accessed, we have a special opcode to do just that one. Note
2729 * that this needs to be an operation that will be considered a def
2730 * by live variable analysis, or register allocation will explode.
2731 */
2732 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2733 payload, const_offset_reg);
2734 setup->force_writemask_all = true;
2735
2736 setup->ir = inst->ir;
2737 setup->annotation = inst->annotation;
2738 inst->insert_before(setup);
2739
2740 /* Similarly, this will only populate the first 4 channels of the
2741 * result register (since we only use smear values from 0-3), but we
2742 * don't tell the optimizer.
2743 */
2744 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2745 inst->src[1] = payload;
2746
2747 this->live_intervals_valid = false;
2748 } else {
2749 /* Before register allocation, we didn't tell the scheduler about the
2750 * MRF we use. We know it's safe to use this MRF because nothing
2751 * else does except for register spill/unspill, which generates and
2752 * uses its MRF within a single IR instruction.
2753 */
2754 inst->base_mrf = 14;
2755 inst->mlen = 1;
2756 }
2757 }
2758 }
2759
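/**
 * Prints one FS IR instruction in a readable form for debugging.
 */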
2760 void
2761 fs_visitor::dump_instruction(backend_instruction *be_inst)
2762 {
2763 fs_inst *inst = (fs_inst *)be_inst;
2764
2765 if (inst->predicate) {
2766 printf("(%cf0.%d) ",
2767 inst->predicate_inverse ? '-' : '+',
2768 inst->flag_subreg);
2769 }
2770
2771 printf("%s", brw_instruction_name(inst->opcode));
2772 if (inst->saturate)
2773 printf(".sat");
2774 if (inst->conditional_mod) {
2775 printf(".cmod");
2776 if (!inst->predicate &&
2777 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2778 inst->opcode != BRW_OPCODE_IF &&
2779 inst->opcode != BRW_OPCODE_WHILE))) {
2780 printf(".f0.%d", inst->flag_subreg);
2781 }
2782 }
2783 printf(" ");
2784
2785
2786 switch (inst->dst.file) {
2787 case GRF:
2788 printf("vgrf%d", inst->dst.reg);
2789 if (inst->dst.reg_offset)
2790 printf("+%d", inst->dst.reg_offset);
2791 break;
2792 case MRF:
2793 printf("m%d", inst->dst.reg);
2794 break;
2795 case BAD_FILE:
2796 printf("(null)");
2797 break;
2798 case UNIFORM:
2799 printf("***u%d***", inst->dst.reg);
2800 break;
2801 default:
2802 printf("???");
2803 break;
2804 }
2805 printf(", ");
2806
2807 for (int i = 0; i < 3; i++) {
2808 if (inst->src[i].negate)
2809 printf("-");
2810 if (inst->src[i].abs)
2811 printf("|");
2812 switch (inst->src[i].file) {
2813 case GRF:
2814 printf("vgrf%d", inst->src[i].reg);
2815 if (inst->src[i].reg_offset)
2816 printf("+%d", inst->src[i].reg_offset);
2817 break;
2818 case MRF:
2819 printf("***m%d***", inst->src[i].reg);
2820 break;
2821 case UNIFORM:
2822 printf("u%d", inst->src[i].reg);
2823 if (inst->src[i].reg_offset)
2824 printf(".%d", inst->src[i].reg_offset);
2825 break;
2826 case BAD_FILE:
2827 printf("(null)");
2828 break;
2829 case IMM:
2830 switch (inst->src[i].type) {
2831 case BRW_REGISTER_TYPE_F:
2832 printf("%ff", inst->src[i].imm.f);
2833 break;
2834 case BRW_REGISTER_TYPE_D:
2835 printf("%dd", inst->src[i].imm.i);
2836 break;
2837 case BRW_REGISTER_TYPE_UD:
2838 printf("%uu", inst->src[i].imm.u);
2839 break;
2840 default:
2841 printf("???");
2842 break;
2843 }
2844 break;
2845 default:
2846 printf("???");
2847 break;
2848 }
2849 if (inst->src[i].abs)
2850 printf("|");
2851
2852 if (i < 3)
2853 printf(", ");
2854 }
2855
2856 printf(" ");
2857
2858 if (inst->force_uncompressed)
2859 printf("1sthalf ");
2860
2861 if (inst->force_sechalf)
2862 printf("2ndhalf ");
2863
2864 printf("\n");
2865 }
2866
2867 /**
2868 * Possibly returns an instruction that set up @param reg.
2869 *
2870 * Sometimes we want to take the result of some expression/variable
2871 * dereference tree and rewrite the instruction generating the result
2872 * of the tree. When processing the tree, we know that the
2873 * instructions generated are all writing temporaries that are dead
2874 * outside of this tree. So, if we have some instructions that write
2875 * a temporary, we're free to point that temp write somewhere else.
2876 *
2877 * Note that this doesn't guarantee that the instruction generated
2878 * only reg -- it might be the size=4 destination of a texture instruction.
2879 */
2880 fs_inst *
2881 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2882 fs_inst *end,
2883 fs_reg reg)
2884 {
2885 if (end == start ||
2886 end->is_partial_write() ||
2887 reg.reladdr ||
2888 !reg.equals(end->dst)) {
2889 return NULL;
2890 } else {
2891 return end;
2892 }
2893 }
2894
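/**
 * Lays out the thread payload delivered by the hardware for gen6+ fragment
 * shaders: masks and pixel X/Y, barycentric coordinates, and interpolated
 * depth/W when used.
 */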
2895 void
2896 fs_visitor::setup_payload_gen6()
2897 {
2898 bool uses_depth =
2899 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2900 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2901
2902 assert(brw->gen >= 6);
2903
2904 /* R0-1: masks, pixel X/Y coordinates. */
2905 c->nr_payload_regs = 2;
2906 /* R2: only for 32-pixel dispatch. */
2907
2908 /* R3-26: barycentric interpolation coordinates. These appear in the
2909 * same order that they appear in the brw_wm_barycentric_interp_mode
2910 * enum. Each set of coordinates occupies 2 registers if dispatch width
2911 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2912 * appear if they were enabled using the "Barycentric Interpolation
2913 * Mode" bits in WM_STATE.
2914 */
2915 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2916 if (barycentric_interp_modes & (1 << i)) {
2917 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2918 c->nr_payload_regs += 2;
2919 if (dispatch_width == 16) {
2920 c->nr_payload_regs += 2;
2921 }
2922 }
2923 }
2924
2925 /* R27: interpolated depth if uses source depth */
2926 if (uses_depth) {
2927 c->source_depth_reg = c->nr_payload_regs;
2928 c->nr_payload_regs++;
2929 if (dispatch_width == 16) {
2930 /* R28: interpolated depth if not 8-wide. */
2931 c->nr_payload_regs++;
2932 }
2933 }
2934 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2935 if (uses_depth) {
2936 c->source_w_reg = c->nr_payload_regs;
2937 c->nr_payload_regs++;
2938 if (dispatch_width == 16) {
2939 /* R30: interpolated W if not 8-wide. */
2940 c->nr_payload_regs++;
2941 }
2942 }
2943 /* R31: MSAA position offsets. */
2944 /* R32-: bary for 32-pixel. */
2945 /* R58-59: interp W for 32-pixel. */
2946
2947 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2948 c->source_depth_to_render_target = true;
2949 }
2950 }
2951
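/**
 * Runs the compile for one dispatch width: emits the FS IR, runs the
 * optimization passes to a fixed point, then schedules, allocates registers,
 * and applies the gen4 SEND workarounds.
 */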
2952 bool
2953 fs_visitor::run()
2954 {
2955 sanity_param_count = fp->Base.Parameters->NumParameters;
2956 uint32_t orig_nr_params = c->prog_data.nr_params;
2957
2958 if (brw->gen >= 6)
2959 setup_payload_gen6();
2960 else
2961 setup_payload_gen4();
2962
2963 if (0) {
2964 emit_dummy_fs();
2965 } else {
2966 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2967 emit_shader_time_begin();
2968
2969 calculate_urb_setup();
2970 if (brw->gen < 6)
2971 emit_interpolation_setup_gen4();
2972 else
2973 emit_interpolation_setup_gen6();
2974
2975 /* We handle discards by keeping track of the still-live pixels in f0.1.
2976 * Initialize it with the dispatched pixels.
2977 */
2978 if (fp->UsesKill) {
2979 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2980 discard_init->flag_subreg = 1;
2981 }
2982
2983 /* Generate FS IR for main(). (the visitor only descends into
2984 * functions called "main").
2985 */
2986 if (shader) {
2987 foreach_list(node, &*shader->ir) {
2988 ir_instruction *ir = (ir_instruction *)node;
2989 base_ir = ir;
2990 this->result = reg_undef;
2991 ir->accept(this);
2992 }
2993 } else {
2994 emit_fragment_program_code();
2995 }
2996 base_ir = NULL;
2997 if (failed)
2998 return false;
2999
3000 emit(FS_OPCODE_PLACEHOLDER_HALT);
3001
3002 emit_fb_writes();
3003
3004 split_virtual_grfs();
3005
3006 move_uniform_array_access_to_pull_constants();
3007 setup_pull_constants();
3008
3009 bool progress;
3010 do {
3011 progress = false;
3012
3013 compact_virtual_grfs();
3014
3015 progress = remove_duplicate_mrf_writes() || progress;
3016
3017 progress = opt_algebraic() || progress;
3018 progress = opt_cse() || progress;
3019 progress = opt_copy_propagate() || progress;
3020 progress = dead_code_eliminate() || progress;
3021 progress = dead_code_eliminate_local() || progress;
3022 progress = register_coalesce() || progress;
3023 progress = register_coalesce_2() || progress;
3024 progress = compute_to_mrf() || progress;
3025 } while (progress);
3026
3027 remove_dead_constants();
3028
3029 schedule_instructions(false);
3030
3031 lower_uniform_pull_constant_loads();
3032
3033 assign_curb_setup();
3034 assign_urb_setup();
3035
3036 if (0) {
3037 /* Debug of register spilling: Go spill everything. */
3038 for (int i = 0; i < virtual_grf_count; i++) {
3039 spill_reg(i);
3040 }
3041 }
3042
3043 if (0)
3044 assign_regs_trivial();
3045 else {
3046 while (!assign_regs()) {
3047 if (failed)
3048 break;
3049 }
3050 }
3051 }
3052 assert(force_uncompressed_stack == 0);
3053 assert(force_sechalf_stack == 0);
3054
3055 /* This must come after all optimization and register allocation, since
3056 * it inserts dead code that happens to have side effects, and it does
3057 * so based on the actual physical registers in use.
3058 */
3059 insert_gen4_send_dependency_workarounds();
3060
3061 if (failed)
3062 return false;
3063
3064 schedule_instructions(true);
3065
3066 if (dispatch_width == 8) {
3067 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3068 } else {
3069 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3070
3071 /* Make sure we didn't try to sneak in an extra uniform */
3072 assert(orig_nr_params == c->prog_data.nr_params);
3073 (void) orig_nr_params;
3074 }
3075
3076 /* If any state parameters were appended, then ParameterValues could have
3077 * been realloced, in which case the driver uniform storage set up by
3078 * _mesa_associate_uniform_storage() would point to freed memory. Make
3079 * sure that didn't happen.
3080 */
3081 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3082
3083 return !failed;
3084 }
3085
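/**
 * Compiles the fragment program to native code: always builds an 8-wide
 * program, and additionally a 16-wide one when the hardware and program
 * allow it.
 */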
3086 const unsigned *
3087 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3088 struct gl_fragment_program *fp,
3089 struct gl_shader_program *prog,
3090 unsigned *final_assembly_size)
3091 {
3092 bool start_busy = false;
3093 float start_time = 0;
3094
3095 if (unlikely(brw->perf_debug)) {
3096 start_busy = (brw->batch.last_bo &&
3097 drm_intel_bo_busy(brw->batch.last_bo));
3098 start_time = get_time();
3099 }
3100
3101 struct brw_shader *shader = NULL;
3102 if (prog)
3103 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3104
3105 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3106 if (prog) {
3107 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3108 _mesa_print_ir(shader->ir, NULL);
3109 printf("\n\n");
3110 } else {
3111 printf("ARB_fragment_program %d ir for native fragment shader\n",
3112 fp->Base.Id);
3113 _mesa_print_program(&fp->Base);
3114 }
3115 }
3116
3117 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3118 */
3119 fs_visitor v(brw, c, prog, fp, 8);
3120 if (!v.run()) {
3121 if (prog) {
3122 prog->LinkStatus = false;
3123 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3124 }
3125
3126 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3127 v.fail_msg);
3128
3129 return NULL;
3130 }
3131
3132 exec_list *simd16_instructions = NULL;
3133 fs_visitor v2(brw, c, prog, fp, 16);
3134 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3135 if (c->prog_data.nr_pull_params == 0) {
3136 /* Try a 16-wide compile */
3137 v2.import_uniforms(&v);
3138 if (!v2.run()) {
3139 perf_debug("16-wide shader failed to compile, falling back to "
3140 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3141 } else {
3142 simd16_instructions = &v2.instructions;
3143 }
3144 } else {
3145 perf_debug("Skipping 16-wide due to pull parameters.\n");
3146 }
3147 }
3148
3149 c->prog_data.dispatch_width = 8;
3150
3151 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3152 const unsigned *generated = g.generate_assembly(&v.instructions,
3153 simd16_instructions,
3154 final_assembly_size);
3155
3156 if (unlikely(brw->perf_debug) && shader) {
3157 if (shader->compiled_once)
3158 brw_wm_debug_recompile(brw, prog, &c->key);
3159 shader->compiled_once = true;
3160
3161 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3162 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3163 (get_time() - start_time) * 1000);
3164 }
3165 }
3166
3167 return generated;
3168 }
3169
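/**
 * Precompiles the fragment shader at link time using a guessed program key,
 * then restores the previously bound compiled-program state.
 */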
3170 bool
3171 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3172 {
3173 struct brw_context *brw = brw_context(ctx);
3174 struct brw_wm_prog_key key;
3175
3176 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3177 return true;
3178
3179 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3180 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3181 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3182 bool program_uses_dfdy = fp->UsesDFdy;
3183
3184 memset(&key, 0, sizeof(key));
3185
3186 if (brw->gen < 6) {
3187 if (fp->UsesKill)
3188 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3189
3190 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3191 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3192
3193 /* Just assume depth testing. */
3194 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3195 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3196 }
3197
3198 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3199 BRW_FS_VARYING_INPUT_MASK) > 16)
3200 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3201
3202 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3203
3204 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3205 for (unsigned i = 0; i < sampler_count; i++) {
3206 if (fp->Base.ShadowSamplers & (1 << i)) {
3207 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3208 key.tex.swizzles[i] =
3209 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3210 } else {
3211 /* Color sampler: assume no swizzling. */
3212 key.tex.swizzles[i] = SWIZZLE_XYZW;
3213 }
3214 }
3215
3216 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3217 key.drawable_height = ctx->DrawBuffer->Height;
3218 }
3219
3220 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3221 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3222 }
3223
3224 key.nr_color_regions = 1;
3225
3226 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3227 * quality of the derivatives is likely to be determined by the driconf
3228 * option.
3229 */
3230 key.high_quality_derivatives = brw->disable_derivative_optimization;
3231
3232 key.program_string_id = bfp->id;
3233
3234 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3235 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3236
3237 bool success = do_wm_prog(brw, prog, bfp, &key);
3238
3239 brw->wm.base.prog_offset = old_prog_offset;
3240 brw->wm.prog_data = old_prog_data;
3241
3242 return success;
3243 }