i965: compute DDX in a subspan based only on top row
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "main/uniforms.h"
50 #include "glsl/glsl_types.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63
64 /* This will be the case for almost all instructions. */
65 this->regs_written = 1;
66 }
67
68 fs_inst::fs_inst()
69 {
70 init();
71 }
72
73 fs_inst::fs_inst(enum opcode opcode)
74 {
75 init();
76 this->opcode = opcode;
77 }
78
79 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
80 {
81 init();
82 this->opcode = opcode;
83 this->dst = dst;
84
85 if (dst.file == GRF)
86 assert(dst.reg_offset >= 0);
87 }
88
89 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
90 {
91 init();
92 this->opcode = opcode;
93 this->dst = dst;
94 this->src[0] = src0;
95
96 if (dst.file == GRF)
97 assert(dst.reg_offset >= 0);
98 if (src[0].file == GRF)
99 assert(src[0].reg_offset >= 0);
100 }
101
102 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
103 {
104 init();
105 this->opcode = opcode;
106 this->dst = dst;
107 this->src[0] = src0;
108 this->src[1] = src1;
109
110 if (dst.file == GRF)
111 assert(dst.reg_offset >= 0);
112 if (src[0].file == GRF)
113 assert(src[0].reg_offset >= 0);
114 if (src[1].file == GRF)
115 assert(src[1].reg_offset >= 0);
116 }
117
118 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
119 fs_reg src0, fs_reg src1, fs_reg src2)
120 {
121 init();
122 this->opcode = opcode;
123 this->dst = dst;
124 this->src[0] = src0;
125 this->src[1] = src1;
126 this->src[2] = src2;
127
128 if (dst.file == GRF)
129 assert(dst.reg_offset >= 0);
130 if (src[0].file == GRF)
131 assert(src[0].reg_offset >= 0);
132 if (src[1].file == GRF)
133 assert(src[1].reg_offset >= 0);
134 if (src[2].file == GRF)
135 assert(src[2].reg_offset >= 0);
136 }
137
138 #define ALU1(op) \
139 fs_inst * \
140 fs_visitor::op(fs_reg dst, fs_reg src0) \
141 { \
142 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
143 }
144
145 #define ALU2(op) \
146 fs_inst * \
147 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
148 { \
149 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
150 }
151
152 #define ALU3(op) \
153 fs_inst * \
154 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
155 { \
156 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
157 }
158
159 ALU1(NOT)
160 ALU1(MOV)
161 ALU1(FRC)
162 ALU1(RNDD)
163 ALU1(RNDE)
164 ALU1(RNDZ)
165 ALU2(ADD)
166 ALU2(MUL)
167 ALU2(MACH)
168 ALU2(AND)
169 ALU2(OR)
170 ALU2(XOR)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183
184 /** Gen4 predicated IF. */
185 fs_inst *
186 fs_visitor::IF(uint32_t predicate)
187 {
188 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
189 inst->predicate = predicate;
190 return inst;
191 }
192
193 /** Gen6+ IF with embedded comparison. */
194 fs_inst *
195 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
196 {
197 assert(brw->gen >= 6);
198 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
199 reg_null_d, src0, src1);
200 inst->conditional_mod = condition;
201 return inst;
202 }
203
204 /**
205 * CMP: Sets the low bit of the destination channels with the result
206 * of the comparison, while the upper bits are undefined, and updates
207 * the flag register with the packed 16 bits of the result.
208 */
209 fs_inst *
210 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
211 {
212 fs_inst *inst;
213
214 /* Take the instruction:
215 *
216 * CMP null<d> src0<f> src1<f>
217 *
218 * Original gen4 does type conversion to the destination type before
219 * comparison, producing garbage results for floating point comparisons.
220 * gen5 does the comparison on the execution type (resolved source types),
221 * so dst type doesn't matter. gen6 does comparison and then uses the
222 * result as if it was the dst type with no conversion, which happens to
223 * mostly work out for float-interpreted-as-int since our comparisons are
224 * for >0, =0, <0.
225 */
226 if (brw->gen == 4) {
227 dst.type = src0.type;
228 if (dst.file == HW_REG)
229 dst.fixed_hw_reg.type = dst.type;
230 }
231
232 resolve_ud_negate(&src0);
233 resolve_ud_negate(&src1);
234
235 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
236 inst->conditional_mod = condition;
237
238 return inst;
239 }
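/* A minimal usage sketch for the CMP helper above (operand names are
 * hypothetical): the conditional mod lands in the flag register, so a later
 * IF or predicated instruction can consume it, e.g.
 *
 *    emit(CMP(reg_null_d, x, fs_reg(0.0f), BRW_CONDITIONAL_NZ));
 *    emit(IF(BRW_PREDICATE_NORMAL));
 */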
240
241 exec_list
242 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
243 fs_reg varying_offset,
244 uint32_t const_offset)
245 {
246 exec_list instructions;
247 fs_inst *inst;
248
249 /* We have our constant surface use a pitch of 4 bytes, so our index can
250 * be any component of a vector, and then we load 4 contiguous
251 * components starting from that.
252 *
253 * We break down the const_offset to a portion added to the variable
254 * offset and a portion done using reg_offset, which means that if you
255 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
256 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
257 * CSE can later notice that those loads are all the same and eliminate
258 * the redundant ones.
259 */
260 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
261 instructions.push_tail(ADD(vec4_offset,
262 varying_offset, const_offset & ~3));
263
264 int scale = 1;
265 if (brw->gen == 4 && dispatch_width == 8) {
266 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
267 * u, v, r) as parameters, or we can just use the SIMD16 message
268 * consisting of (header, u). We choose the second, at the cost of a
269 * longer return length.
270 */
271 scale = 2;
272 }
273
274 enum opcode op;
275 if (brw->gen >= 7)
276 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
277 else
278 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
279 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
280 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
281 inst->regs_written = 4 * scale;
282 instructions.push_tail(inst);
283
284 if (brw->gen < 7) {
285 inst->base_mrf = 13;
286 inst->header_present = true;
287 if (brw->gen == 4)
288 inst->mlen = 3;
289 else
290 inst->mlen = 1 + dispatch_width / 8;
291 }
292
293 vec4_result.reg_offset += (const_offset & 3) * scale;
294 instructions.push_tail(MOV(dst, vec4_result));
295
296 return instructions;
297 }
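/* Worked example of the offset split above (hypothetical numbers): with
 * const_offset = 7 and scale = 1, the variable part becomes
 * varying_offset + (7 & ~3) = varying_offset + 4, a full vec4 is loaded from
 * that address, and the final MOV picks component (7 & 3) = 3 out of the
 * returned vec4 via reg_offset.
 */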
298
299 /**
300 * A helper for MOV generation for fixing up broken hardware SEND dependency
301 * handling.
302 */
303 fs_inst *
304 fs_visitor::DEP_RESOLVE_MOV(int grf)
305 {
306 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
307
308 inst->ir = NULL;
309 inst->annotation = "send dependency resolve";
310
311 /* The caller always wants uncompressed to emit the minimal extra
312 * dependencies, and to avoid having to deal with aligning its regs to 2.
313 */
314 inst->force_uncompressed = true;
315
316 return inst;
317 }
318
319 bool
320 fs_inst::equals(fs_inst *inst)
321 {
322 return (opcode == inst->opcode &&
323 dst.equals(inst->dst) &&
324 src[0].equals(inst->src[0]) &&
325 src[1].equals(inst->src[1]) &&
326 src[2].equals(inst->src[2]) &&
327 saturate == inst->saturate &&
328 predicate == inst->predicate &&
329 conditional_mod == inst->conditional_mod &&
330 mlen == inst->mlen &&
331 base_mrf == inst->base_mrf &&
332 sampler == inst->sampler &&
333 target == inst->target &&
334 eot == inst->eot &&
335 header_present == inst->header_present &&
336 shadow_compare == inst->shadow_compare &&
337 offset == inst->offset);
338 }
339
340 bool
341 fs_inst::overwrites_reg(const fs_reg &reg)
342 {
343 return (reg.file == dst.file &&
344 reg.reg == dst.reg &&
345 reg.reg_offset >= dst.reg_offset &&
346 reg.reg_offset < dst.reg_offset + regs_written);
347 }
348
349 bool
350 fs_inst::is_send_from_grf()
351 {
352 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
353 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
354 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
355 src[1].file == GRF));
356 }
357
358 bool
359 fs_visitor::can_do_source_mods(fs_inst *inst)
360 {
361 if (brw->gen == 6 && inst->is_math())
362 return false;
363
364 if (inst->is_send_from_grf())
365 return false;
366
367 return true;
368 }
369
370 void
371 fs_reg::init()
372 {
373 memset(this, 0, sizeof(*this));
374 this->smear = -1;
375 }
376
377 /** Generic unset register constructor. */
378 fs_reg::fs_reg()
379 {
380 init();
381 this->file = BAD_FILE;
382 }
383
384 /** Immediate value constructor. */
385 fs_reg::fs_reg(float f)
386 {
387 init();
388 this->file = IMM;
389 this->type = BRW_REGISTER_TYPE_F;
390 this->imm.f = f;
391 }
392
393 /** Immediate value constructor. */
394 fs_reg::fs_reg(int32_t i)
395 {
396 init();
397 this->file = IMM;
398 this->type = BRW_REGISTER_TYPE_D;
399 this->imm.i = i;
400 }
401
402 /** Immediate value constructor. */
403 fs_reg::fs_reg(uint32_t u)
404 {
405 init();
406 this->file = IMM;
407 this->type = BRW_REGISTER_TYPE_UD;
408 this->imm.u = u;
409 }
410
411 /** Fixed brw_reg Immediate value constructor. */
412 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
413 {
414 init();
415 this->file = HW_REG;
416 this->fixed_hw_reg = fixed_hw_reg;
417 this->type = fixed_hw_reg.type;
418 }
419
420 bool
421 fs_reg::equals(const fs_reg &r) const
422 {
423 return (file == r.file &&
424 reg == r.reg &&
425 reg_offset == r.reg_offset &&
426 type == r.type &&
427 negate == r.negate &&
428 abs == r.abs &&
429 !reladdr && !r.reladdr &&
430 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
431 sizeof(fixed_hw_reg)) == 0 &&
432 smear == r.smear &&
433 imm.u == r.imm.u);
434 }
435
436 bool
437 fs_reg::is_zero() const
438 {
439 if (file != IMM)
440 return false;
441
442 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
443 }
444
445 bool
446 fs_reg::is_one() const
447 {
448 if (file != IMM)
449 return false;
450
451 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
452 }
453
454 bool
455 fs_reg::is_valid_3src() const
456 {
457 return file == GRF || file == UNIFORM;
458 }
459
460 int
461 fs_visitor::type_size(const struct glsl_type *type)
462 {
463 unsigned int size, i;
464
465 switch (type->base_type) {
466 case GLSL_TYPE_UINT:
467 case GLSL_TYPE_INT:
468 case GLSL_TYPE_FLOAT:
469 case GLSL_TYPE_BOOL:
470 return type->components();
471 case GLSL_TYPE_ARRAY:
472 return type_size(type->fields.array) * type->length;
473 case GLSL_TYPE_STRUCT:
474 size = 0;
475 for (i = 0; i < type->length; i++) {
476 size += type_size(type->fields.structure[i].type);
477 }
478 return size;
479 case GLSL_TYPE_SAMPLER:
480 /* Samplers take up no register space, since they're baked in at
481 * link time.
482 */
483 return 0;
484 case GLSL_TYPE_VOID:
485 case GLSL_TYPE_ERROR:
486 case GLSL_TYPE_INTERFACE:
487 assert(!"not reached");
488 break;
489 }
490
491 return 0;
492 }
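/* Illustrative slot counts computed by type_size() (hypothetical GLSL types):
 * a float or bool is 1, a vec4 is 4, a mat3 is 9 (its components), a
 * "vec4 a[20]" array is 4 * 20 = 80, and a struct is the sum of its fields.
 */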
493
494 fs_reg
495 fs_visitor::get_timestamp()
496 {
497 assert(brw->gen >= 7);
498
499 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
500 BRW_ARF_TIMESTAMP,
501 0),
502 BRW_REGISTER_TYPE_UD));
503
504 fs_reg dst = fs_reg(this, glsl_type::uint_type);
505
506 fs_inst *mov = emit(MOV(dst, ts));
507 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
508 * even if it's not enabled in the dispatch.
509 */
510 mov->force_writemask_all = true;
511 mov->force_uncompressed = true;
512
513 /* The caller wants the low 32 bits of the timestamp. Since it's running
514 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
515 * which is plenty of time for our purposes. It is identical across the
516 * EUs, but since it's tracking GPU core speed it will increment at a
517 * varying rate as render P-states change.
518 *
519 * The caller could also check if render P-states have changed (or anything
520 * else that might disrupt timing) by setting smear to 2 and checking if
521 * that field is != 0.
522 */
523 dst.smear = 0;
524
525 return dst;
526 }
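/* Back-of-the-envelope check for the comment above: the low 32 bits of the
 * timestamp wrap after 2^32 ticks, and at a clock of roughly 1.2 GHz that is
 * about 2^32 / 1.2e9 ~= 3.6 seconds, hence the "~3 seconds" figure.
 */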
527
528 void
529 fs_visitor::emit_shader_time_begin()
530 {
531 current_annotation = "shader time start";
532 shader_start_time = get_timestamp();
533 }
534
535 void
536 fs_visitor::emit_shader_time_end()
537 {
538 current_annotation = "shader time end";
539
540 enum shader_time_shader_type type, written_type, reset_type;
541 if (dispatch_width == 8) {
542 type = ST_FS8;
543 written_type = ST_FS8_WRITTEN;
544 reset_type = ST_FS8_RESET;
545 } else {
546 assert(dispatch_width == 16);
547 type = ST_FS16;
548 written_type = ST_FS16_WRITTEN;
549 reset_type = ST_FS16_RESET;
550 }
551
552 fs_reg shader_end_time = get_timestamp();
553
554 /* Check that there weren't any timestamp reset events (assuming these
555 * were the only two timestamp reads that happened).
556 */
557 fs_reg reset = shader_end_time;
558 reset.smear = 2;
559 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
560 test->conditional_mod = BRW_CONDITIONAL_Z;
561 emit(IF(BRW_PREDICATE_NORMAL));
562
563 push_force_uncompressed();
564 fs_reg start = shader_start_time;
565 start.negate = true;
566 fs_reg diff = fs_reg(this, glsl_type::uint_type);
567 emit(ADD(diff, start, shader_end_time));
568
569 /* If there were no instructions between the two timestamp gets, the diff
570 * is 2 cycles. Remove that overhead, so it can be ignored when
571 * trying to determine the time taken for single instructions.
572 */
573 emit(ADD(diff, diff, fs_reg(-2u)));
574
575 emit_shader_time_write(type, diff);
576 emit_shader_time_write(written_type, fs_reg(1u));
577 emit(BRW_OPCODE_ELSE);
578 emit_shader_time_write(reset_type, fs_reg(1u));
579 emit(BRW_OPCODE_ENDIF);
580
581 pop_force_uncompressed();
582 }
583
584 void
585 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
586 fs_reg value)
587 {
588 int shader_time_index =
589 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
590 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
591
592 fs_reg payload;
593 if (dispatch_width == 8)
594 payload = fs_reg(this, glsl_type::uvec2_type);
595 else
596 payload = fs_reg(this, glsl_type::uint_type);
597
598 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
599 fs_reg(), payload, offset, value));
600 }
601
602 void
603 fs_visitor::fail(const char *format, ...)
604 {
605 va_list va;
606 char *msg;
607
608 if (failed)
609 return;
610
611 failed = true;
612
613 va_start(va, format);
614 msg = ralloc_vasprintf(mem_ctx, format, va);
615 va_end(va);
616 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
617
618 this->fail_msg = msg;
619
620 if (INTEL_DEBUG & DEBUG_WM) {
621 fprintf(stderr, "%s", msg);
622 }
623 }
624
625 fs_inst *
626 fs_visitor::emit(enum opcode opcode)
627 {
628 return emit(fs_inst(opcode));
629 }
630
631 fs_inst *
632 fs_visitor::emit(enum opcode opcode, fs_reg dst)
633 {
634 return emit(fs_inst(opcode, dst));
635 }
636
637 fs_inst *
638 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
639 {
640 return emit(fs_inst(opcode, dst, src0));
641 }
642
643 fs_inst *
644 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
645 {
646 return emit(fs_inst(opcode, dst, src0, src1));
647 }
648
649 fs_inst *
650 fs_visitor::emit(enum opcode opcode, fs_reg dst,
651 fs_reg src0, fs_reg src1, fs_reg src2)
652 {
653 return emit(fs_inst(opcode, dst, src0, src1, src2));
654 }
655
656 void
657 fs_visitor::push_force_uncompressed()
658 {
659 force_uncompressed_stack++;
660 }
661
662 void
663 fs_visitor::pop_force_uncompressed()
664 {
665 force_uncompressed_stack--;
666 assert(force_uncompressed_stack >= 0);
667 }
668
669 void
670 fs_visitor::push_force_sechalf()
671 {
672 force_sechalf_stack++;
673 }
674
675 void
676 fs_visitor::pop_force_sechalf()
677 {
678 force_sechalf_stack--;
679 assert(force_sechalf_stack >= 0);
680 }
681
682 /**
683 * Returns true if the instruction has a flag that means it won't
684 * update an entire destination register.
685 *
686 * For example, dead code elimination and live variable analysis want to know
687 * when a write to a variable screens off any preceding values that were in
688 * it.
689 */
690 bool
691 fs_inst::is_partial_write()
692 {
693 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
694 this->force_uncompressed ||
695 this->force_sechalf);
696 }
697
698 /**
699 * Returns how many MRFs an FS opcode will write over.
700 *
701 * Note that this is not the 0 or 1 implied writes in an actual gen
702 * instruction -- the FS opcodes often generate MOVs in addition.
703 */
704 int
705 fs_visitor::implied_mrf_writes(fs_inst *inst)
706 {
707 if (inst->mlen == 0)
708 return 0;
709
710 switch (inst->opcode) {
711 case SHADER_OPCODE_RCP:
712 case SHADER_OPCODE_RSQ:
713 case SHADER_OPCODE_SQRT:
714 case SHADER_OPCODE_EXP2:
715 case SHADER_OPCODE_LOG2:
716 case SHADER_OPCODE_SIN:
717 case SHADER_OPCODE_COS:
718 return 1 * dispatch_width / 8;
719 case SHADER_OPCODE_POW:
720 case SHADER_OPCODE_INT_QUOTIENT:
721 case SHADER_OPCODE_INT_REMAINDER:
722 return 2 * dispatch_width / 8;
723 case SHADER_OPCODE_TEX:
724 case FS_OPCODE_TXB:
725 case SHADER_OPCODE_TXD:
726 case SHADER_OPCODE_TXF:
727 case SHADER_OPCODE_TXF_MS:
728 case SHADER_OPCODE_TXL:
729 case SHADER_OPCODE_TXS:
730 case SHADER_OPCODE_LOD:
731 return 1;
732 case FS_OPCODE_FB_WRITE:
733 return 2;
734 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
735 case FS_OPCODE_UNSPILL:
736 return 1;
737 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
738 return inst->mlen;
739 case FS_OPCODE_SPILL:
740 return 2;
741 default:
742 assert(!"not reached");
743 return inst->mlen;
744 }
745 }
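/* Example of the accounting above: SHADER_OPCODE_POW in 16-wide dispatch
 * reports 2 * 16 / 8 = 4 MRFs written, while the unary math opcodes report
 * 1 * dispatch_width / 8, i.e. one MRF per 8 channels of payload.
 */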
746
747 int
748 fs_visitor::virtual_grf_alloc(int size)
749 {
750 if (virtual_grf_array_size <= virtual_grf_count) {
751 if (virtual_grf_array_size == 0)
752 virtual_grf_array_size = 16;
753 else
754 virtual_grf_array_size *= 2;
755 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
756 virtual_grf_array_size);
757 }
758 virtual_grf_sizes[virtual_grf_count] = size;
759 return virtual_grf_count++;
760 }
761
762 /** Fixed HW reg constructor. */
763 fs_reg::fs_reg(enum register_file file, int reg)
764 {
765 init();
766 this->file = file;
767 this->reg = reg;
768 this->type = BRW_REGISTER_TYPE_F;
769 }
770
771 /** Fixed HW reg constructor. */
772 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
773 {
774 init();
775 this->file = file;
776 this->reg = reg;
777 this->type = type;
778 }
779
780 /** Automatic reg constructor. */
781 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
782 {
783 init();
784
785 this->file = GRF;
786 this->reg = v->virtual_grf_alloc(v->type_size(type));
787 this->reg_offset = 0;
788 this->type = brw_type_for_base_type(type);
789 }
790
791 fs_reg *
792 fs_visitor::variable_storage(ir_variable *var)
793 {
794 return (fs_reg *)hash_table_find(this->variable_ht, var);
795 }
796
797 void
798 import_uniforms_callback(const void *key,
799 void *data,
800 void *closure)
801 {
802 struct hash_table *dst_ht = (struct hash_table *)closure;
803 const fs_reg *reg = (const fs_reg *)data;
804
805 if (reg->file != UNIFORM)
806 return;
807
808 hash_table_insert(dst_ht, data, key);
809 }
810
811 /* For 16-wide, we need to follow the uniform setup of the 8-wide dispatch.
812 * This brings in those uniform definitions.
813 */
814 void
815 fs_visitor::import_uniforms(fs_visitor *v)
816 {
817 hash_table_call_foreach(v->variable_ht,
818 import_uniforms_callback,
819 variable_ht);
820 this->params_remap = v->params_remap;
821 this->nr_params_remap = v->nr_params_remap;
822 }
823
824 /* Our support for uniforms is piggy-backed on the struct
825 * gl_fragment_program, because that's where the values actually
826 * get stored, rather than in some global gl_shader_program uniform
827 * store.
828 */
829 void
830 fs_visitor::setup_uniform_values(ir_variable *ir)
831 {
832 int namelen = strlen(ir->name);
833
834 /* The data for our (non-builtin) uniforms is stored in a series of
835 * gl_uniform_driver_storage structs for each subcomponent that
836 * glGetUniformLocation() could name. We know it's been set up in the same
837 * order we'd walk the type, so walk the list of storage and find anything
838 * with our name, or the prefix of a component that starts with our name.
839 */
840 unsigned params_before = c->prog_data.nr_params;
841 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
842 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
843
844 if (strncmp(ir->name, storage->name, namelen) != 0 ||
845 (storage->name[namelen] != 0 &&
846 storage->name[namelen] != '.' &&
847 storage->name[namelen] != '[')) {
848 continue;
849 }
850
851 unsigned slots = storage->type->component_slots();
852 if (storage->array_elements)
853 slots *= storage->array_elements;
854
855 for (unsigned i = 0; i < slots; i++) {
856 c->prog_data.param[c->prog_data.nr_params++] =
857 &storage->storage[i].f;
858 }
859 }
860
861 /* Make sure we actually initialized the right amount of stuff here. */
862 assert(params_before + ir->type->component_slots() ==
863 c->prog_data.nr_params);
864 (void)params_before;
865 }
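/* Example of the name matching above (hypothetical uniform names): for
 * ir->name == "light", storage entries named "light", "light[2]" and
 * "light.position" are accepted (the character after the prefix is '\0',
 * '[' or '.'), while "lights" is rejected.
 */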
866
867
868 /* Our support for builtin uniforms is even scarier than non-builtin.
869 * It sits on top of the PROG_STATE_VAR parameters that are
870 * automatically updated from GL context state.
871 */
872 void
873 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
874 {
875 const ir_state_slot *const slots = ir->state_slots;
876 assert(ir->state_slots != NULL);
877
878 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
879 /* This state reference has already been set up by ir_to_mesa, but we'll
880 * get the same index back here.
881 */
882 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
883 (gl_state_index *)slots[i].tokens);
884
885 /* Add each of the unique swizzles of the element as a parameter.
886 * This'll end up matching the expected layout of the
887 * array/matrix/structure we're trying to fill in.
888 */
889 int last_swiz = -1;
890 for (unsigned int j = 0; j < 4; j++) {
891 int swiz = GET_SWZ(slots[i].swizzle, j);
892 if (swiz == last_swiz)
893 break;
894 last_swiz = swiz;
895
896 c->prog_data.param[c->prog_data.nr_params++] =
897 &fp->Base.Parameters->ParameterValues[index][swiz].f;
898 }
899 }
900 }
901
902 fs_reg *
903 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
904 {
905 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
906 fs_reg wpos = *reg;
907 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
908
909 /* gl_FragCoord.x */
910 if (ir->pixel_center_integer) {
911 emit(MOV(wpos, this->pixel_x));
912 } else {
913 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
914 }
915 wpos.reg_offset++;
916
917 /* gl_FragCoord.y */
918 if (!flip && ir->pixel_center_integer) {
919 emit(MOV(wpos, this->pixel_y));
920 } else {
921 fs_reg pixel_y = this->pixel_y;
922 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
923
924 if (flip) {
925 pixel_y.negate = true;
926 offset += c->key.drawable_height - 1.0;
927 }
928
929 emit(ADD(wpos, pixel_y, fs_reg(offset)));
930 }
931 wpos.reg_offset++;
932
933 /* gl_FragCoord.z */
934 if (brw->gen >= 6) {
935 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
936 } else {
937 emit(FS_OPCODE_LINTERP, wpos,
938 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
939 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
940 interp_reg(VARYING_SLOT_POS, 2));
941 }
942 wpos.reg_offset++;
943
944 /* gl_FragCoord.w: Already set up in emit_interpolation */
945 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
946
947 return reg;
948 }
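/* Worked example of the gl_FragCoord.y flip above (hypothetical values):
 * with a half-integer pixel center, flip == true and drawable_height == 480,
 * the emitted ADD computes -pixel_y + (0.5 + 480 - 1.0) = 479.5 - pixel_y,
 * mirroring the window-space y coordinate.
 */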
949
950 fs_inst *
951 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
952 glsl_interp_qualifier interpolation_mode,
953 bool is_centroid)
954 {
955 brw_wm_barycentric_interp_mode barycoord_mode;
956 if (brw->gen >= 6) {
957 if (is_centroid) {
958 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
959 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
960 else
961 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
962 } else {
963 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
964 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
965 else
966 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
967 }
968 } else {
969 /* On Ironlake and below, there is only one interpolation mode.
970 * Centroid interpolation doesn't mean anything on this hardware --
971 * there is no multisampling.
972 */
973 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
974 }
975 return emit(FS_OPCODE_LINTERP, attr,
976 this->delta_x[barycoord_mode],
977 this->delta_y[barycoord_mode], interp);
978 }
979
980 fs_reg *
981 fs_visitor::emit_general_interpolation(ir_variable *ir)
982 {
983 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
984 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
985 fs_reg attr = *reg;
986
987 unsigned int array_elements;
988 const glsl_type *type;
989
990 if (ir->type->is_array()) {
991 array_elements = ir->type->length;
992 if (array_elements == 0) {
993 fail("dereferenced array '%s' has length 0\n", ir->name);
994 }
995 type = ir->type->fields.array;
996 } else {
997 array_elements = 1;
998 type = ir->type;
999 }
1000
1001 glsl_interp_qualifier interpolation_mode =
1002 ir->determine_interpolation_mode(c->key.flat_shade);
1003
1004 int location = ir->location;
1005 for (unsigned int i = 0; i < array_elements; i++) {
1006 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1007 if (c->prog_data.urb_setup[location] == -1) {
1008 /* If there's no incoming setup data for this slot, don't
1009 * emit interpolation for it.
1010 */
1011 attr.reg_offset += type->vector_elements;
1012 location++;
1013 continue;
1014 }
1015
1016 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1017 /* Constant interpolation (flat shading) case. The SF has
1018 * handed us defined values in only the constant offset
1019 * field of the setup reg.
1020 */
1021 for (unsigned int k = 0; k < type->vector_elements; k++) {
1022 struct brw_reg interp = interp_reg(location, k);
1023 interp = suboffset(interp, 3);
1024 interp.type = reg->type;
1025 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1026 attr.reg_offset++;
1027 }
1028 } else {
1029 /* Smooth/noperspective interpolation case. */
1030 for (unsigned int k = 0; k < type->vector_elements; k++) {
1031 /* FINISHME: At some point we probably want to push
1032 * this farther by giving similar treatment to the
1033 * other potentially constant components of the
1034 * attribute, as well as making brw_vs_constval.c
1035 * handle varyings other than gl_TexCoord.
1036 */
1037 struct brw_reg interp = interp_reg(location, k);
1038 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1039 ir->centroid);
1040 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1041 /* Get the pixel/sample mask into f0 so that we know
1042 * which pixels are lit. Then, for each channel that is
1043 * unlit, replace the centroid data with non-centroid
1044 * data.
1045 */
1046 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1047 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1048 interpolation_mode, false);
1049 inst->predicate = BRW_PREDICATE_NORMAL;
1050 inst->predicate_inverse = true;
1051 }
1052 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1053 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1054 }
1055 attr.reg_offset++;
1056 }
1057
1058 }
1059 location++;
1060 }
1061 }
1062
1063 return reg;
1064 }
1065
1066 fs_reg *
1067 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1068 {
1069 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1070
1071 /* The frontfacing comes in as a bit in the thread payload. */
1072 if (brw->gen >= 6) {
1073 emit(BRW_OPCODE_ASR, *reg,
1074 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1075 fs_reg(15));
1076 emit(BRW_OPCODE_NOT, *reg, *reg);
1077 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1078 } else {
1079 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1080 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1081 * us front face
1082 */
1083 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1084 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1085 }
1086
1087 return reg;
1088 }
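/* Reading the gen6+ path above: the payload dword is arithmetically shifted
 * right by 15, inverted and masked with 1, so *reg ends up as 1 exactly when
 * bit 15 of g0.0 is clear, which this code treats as "front facing". The
 * pre-gen6 path instead tests bit 31 of r1.6 as described in its comment.
 */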
1089
1090 fs_reg
1091 fs_visitor::fix_math_operand(fs_reg src)
1092 {
1093 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1094 * might be able to do better by doing execsize = 1 math and then
1095 * expanding that result out, but we would need to be careful with
1096 * masking.
1097 *
1098 * The hardware ignores source modifiers (negate and abs) on math
1099 * instructions, so we also move to a temp to set those up.
1100 */
1101 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1102 !src.abs && !src.negate)
1103 return src;
1104
1105 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1106 * operands with math instructions.
1107 */
1108 if (brw->gen >= 7 && src.file != IMM)
1109 return src;
1110
1111 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1112 expanded.type = src.type;
1113 emit(BRW_OPCODE_MOV, expanded, src);
1114 return expanded;
1115 }
1116
1117 fs_inst *
1118 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1119 {
1120 switch (opcode) {
1121 case SHADER_OPCODE_RCP:
1122 case SHADER_OPCODE_RSQ:
1123 case SHADER_OPCODE_SQRT:
1124 case SHADER_OPCODE_EXP2:
1125 case SHADER_OPCODE_LOG2:
1126 case SHADER_OPCODE_SIN:
1127 case SHADER_OPCODE_COS:
1128 break;
1129 default:
1130 assert(!"not reached: bad math opcode");
1131 return NULL;
1132 }
1133
1134 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1135 * might be able to do better by doing execsize = 1 math and then
1136 * expanding that result out, but we would need to be careful with
1137 * masking.
1138 *
1139 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1140 * instructions, so we also move to a temp to set those up.
1141 */
1142 if (brw->gen >= 6)
1143 src = fix_math_operand(src);
1144
1145 fs_inst *inst = emit(opcode, dst, src);
1146
1147 if (brw->gen < 6) {
1148 inst->base_mrf = 2;
1149 inst->mlen = dispatch_width / 8;
1150 }
1151
1152 return inst;
1153 }
1154
1155 fs_inst *
1156 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1157 {
1158 int base_mrf = 2;
1159 fs_inst *inst;
1160
1161 switch (opcode) {
1162 case SHADER_OPCODE_INT_QUOTIENT:
1163 case SHADER_OPCODE_INT_REMAINDER:
1164 if (brw->gen >= 7 && dispatch_width == 16)
1165 fail("16-wide INTDIV unsupported\n");
1166 break;
1167 case SHADER_OPCODE_POW:
1168 break;
1169 default:
1170 assert(!"not reached: unsupported binary math opcode.");
1171 return NULL;
1172 }
1173
1174 if (brw->gen >= 6) {
1175 src0 = fix_math_operand(src0);
1176 src1 = fix_math_operand(src1);
1177
1178 inst = emit(opcode, dst, src0, src1);
1179 } else {
1180 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1181 * "Message Payload":
1182 *
1183 * "Operand0[7]. For the INT DIV functions, this operand is the
1184 * denominator."
1185 * ...
1186 * "Operand1[7]. For the INT DIV functions, this operand is the
1187 * numerator."
1188 */
1189 bool is_int_div = opcode != SHADER_OPCODE_POW;
1190 fs_reg &op0 = is_int_div ? src1 : src0;
1191 fs_reg &op1 = is_int_div ? src0 : src1;
1192
1193 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1194 inst = emit(opcode, dst, op0, reg_null_f);
1195
1196 inst->base_mrf = base_mrf;
1197 inst->mlen = 2 * dispatch_width / 8;
1198 }
1199 return inst;
1200 }
1201
1202 void
1203 fs_visitor::assign_curb_setup()
1204 {
1205 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1206 if (dispatch_width == 8) {
1207 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1208 } else {
1209 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1210 }
1211
1212 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1213 foreach_list(node, &this->instructions) {
1214 fs_inst *inst = (fs_inst *)node;
1215
1216 for (unsigned int i = 0; i < 3; i++) {
1217 if (inst->src[i].file == UNIFORM) {
1218 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1219 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1220 constant_nr / 8,
1221 constant_nr % 8);
1222
1223 inst->src[i].file = HW_REG;
1224 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1225 }
1226 }
1227 }
1228 }
1229
1230 void
1231 fs_visitor::calculate_urb_setup()
1232 {
1233 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1234 c->prog_data.urb_setup[i] = -1;
1235 }
1236
1237 int urb_next = 0;
1238 /* Figure out where each of the incoming setup attributes lands. */
1239 if (brw->gen >= 6) {
1240 if (_mesa_bitcount_64(fp->Base.InputsRead &
1241 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1242 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1243 * first 16 varying inputs, so we can put them wherever we want.
1244 * Just put them in order.
1245 *
1246 * This is useful because it means that (a) inputs not used by the
1247 * fragment shader won't take up valuable register space, and (b) we
1248 * won't have to recompile the fragment shader if it gets paired with
1249 * a different vertex (or geometry) shader.
1250 */
1251 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1252 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1253 BITFIELD64_BIT(i)) {
1254 c->prog_data.urb_setup[i] = urb_next++;
1255 }
1256 }
1257 } else {
1258 /* We have enough input varyings that the SF/SBE pipeline stage can't
1259 * arbitrarily rearrange them to suit our whim; we have to put them
1260 * in an order that matches the output of the previous pipeline stage
1261 * (geometry or vertex shader).
1262 */
1263 struct brw_vue_map prev_stage_vue_map;
1264 brw_compute_vue_map(brw, &prev_stage_vue_map,
1265 c->key.input_slots_valid);
1266 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1267 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1268 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1269 slot++) {
1270 int varying = prev_stage_vue_map.slot_to_varying[slot];
1271 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1272 * unused.
1273 */
1274 if (varying != BRW_VARYING_SLOT_COUNT &&
1275 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1276 BITFIELD64_BIT(varying))) {
1277 c->prog_data.urb_setup[varying] = slot - first_slot;
1278 }
1279 }
1280 urb_next = prev_stage_vue_map.num_slots - first_slot;
1281 }
1282 } else {
1283 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1284 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1285 /* Point size is packed into the header, not as a general attribute */
1286 if (i == VARYING_SLOT_PSIZ)
1287 continue;
1288
1289 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1290 /* The back color slot is skipped when the front color is
1291 * also written to. In addition, some slots can be
1292 * written in the vertex shader and not read in the
1293 * fragment shader. So the register number must always be
1294 * incremented, mapped or not.
1295 */
1296 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1297 c->prog_data.urb_setup[i] = urb_next;
1298 urb_next++;
1299 }
1300 }
1301
1302 /*
1303 * It's a FS only attribute, and we did interpolation for this attribute
1304 * in SF thread. So, count it here, too.
1305 *
1306 * See compile_sf_prog() for more info.
1307 */
1308 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1309 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1310 }
1311
1312 c->prog_data.num_varying_inputs = urb_next;
1313 }
1314
1315 void
1316 fs_visitor::assign_urb_setup()
1317 {
1318 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1319
1320 /* Offset all the urb_setup[] indices by the actual position of the
1321 * setup regs, now that the location of the constants has been chosen.
1322 */
1323 foreach_list(node, &this->instructions) {
1324 fs_inst *inst = (fs_inst *)node;
1325
1326 if (inst->opcode == FS_OPCODE_LINTERP) {
1327 assert(inst->src[2].file == HW_REG);
1328 inst->src[2].fixed_hw_reg.nr += urb_start;
1329 }
1330
1331 if (inst->opcode == FS_OPCODE_CINTERP) {
1332 assert(inst->src[0].file == HW_REG);
1333 inst->src[0].fixed_hw_reg.nr += urb_start;
1334 }
1335 }
1336
1337 /* Each attribute is 4 setup channels, each of which is half a reg. */
1338 this->first_non_payload_grf =
1339 urb_start + c->prog_data.num_varying_inputs * 2;
1340 }
1341
1342 /**
1343 * Split large virtual GRFs into separate components if we can.
1344 *
1345 * This is mostly duplicated with what brw_fs_vector_splitting does,
1346 * but that's really conservative because it's afraid of doing
1347 * splitting that doesn't result in real progress after the rest of
1348 * the optimization phases, which would cause infinite looping in
1349 * optimization. We can do it once here, safely. This also has the
1350 * opportunity to split interpolated values, or maybe even uniforms,
1351 * which we don't have at the IR level.
1352 *
1353 * We want to split, because virtual GRFs are what we register
1354 * allocate and spill (due to contiguousness requirements for some
1355 * instructions), and they're what we naturally generate in the
1356 * codegen process, but most virtual GRFs don't actually need to be
1357 * contiguous sets of GRFs. If we split, we'll end up with reduced
1358 * live intervals and better dead code elimination and coalescing.
1359 */
1360 void
1361 fs_visitor::split_virtual_grfs()
1362 {
1363 int num_vars = this->virtual_grf_count;
1364 bool split_grf[num_vars];
1365 int new_virtual_grf[num_vars];
1366
1367 /* Try to split anything > 0 sized. */
1368 for (int i = 0; i < num_vars; i++) {
1369 if (this->virtual_grf_sizes[i] != 1)
1370 split_grf[i] = true;
1371 else
1372 split_grf[i] = false;
1373 }
1374
1375 if (brw->has_pln &&
1376 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1377 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1378 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1379 * Gen6, that was the only supported interpolation mode, and since Gen6,
1380 * delta_x and delta_y are in fixed hardware registers.
1381 */
1382 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1383 false;
1384 }
1385
1386 foreach_list(node, &this->instructions) {
1387 fs_inst *inst = (fs_inst *)node;
1388
1389 /* If there's a SEND message that requires contiguous destination
1390 * registers, no splitting is allowed.
1391 */
1392 if (inst->regs_written > 1) {
1393 split_grf[inst->dst.reg] = false;
1394 }
1395
1396 /* If we're sending from a GRF, don't split it, on the assumption that
1397 * the send is reading the whole thing.
1398 */
1399 if (inst->is_send_from_grf()) {
1400 for (int i = 0; i < 3; i++) {
1401 if (inst->src[i].file == GRF) {
1402 split_grf[inst->src[i].reg] = false;
1403 }
1404 }
1405 }
1406 }
1407
1408 /* Allocate new space for split regs. Note that the virtual
1409 * numbers will be contiguous.
1410 */
1411 for (int i = 0; i < num_vars; i++) {
1412 if (split_grf[i]) {
1413 new_virtual_grf[i] = virtual_grf_alloc(1);
1414 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1415 int reg = virtual_grf_alloc(1);
1416 assert(reg == new_virtual_grf[i] + j - 1);
1417 (void) reg;
1418 }
1419 this->virtual_grf_sizes[i] = 1;
1420 }
1421 }
1422
1423 foreach_list(node, &this->instructions) {
1424 fs_inst *inst = (fs_inst *)node;
1425
1426 if (inst->dst.file == GRF &&
1427 split_grf[inst->dst.reg] &&
1428 inst->dst.reg_offset != 0) {
1429 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1430 inst->dst.reg_offset - 1);
1431 inst->dst.reg_offset = 0;
1432 }
1433 for (int i = 0; i < 3; i++) {
1434 if (inst->src[i].file == GRF &&
1435 split_grf[inst->src[i].reg] &&
1436 inst->src[i].reg_offset != 0) {
1437 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1438 inst->src[i].reg_offset - 1);
1439 inst->src[i].reg_offset = 0;
1440 }
1441 }
1442 }
1443 this->live_intervals_valid = false;
1444 }
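/* Sketch of the effect of split_virtual_grfs() (hypothetical register
 * numbers): a size-4 VGRF vgrf5 that is only ever accessed one register at a
 * time becomes four size-1 VGRFs, and an access such as vgrf5 with
 * reg_offset 2 is rewritten to point at the new VGRF holding that component,
 * with reg_offset 0.
 */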
1445
1446 /**
1447 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1448 *
1449 * During code generation, we create tons of temporary variables, many of
1450 * which get immediately killed and are never used again. Yet, in later
1451 * optimization and analysis passes, such as compute_live_intervals, we need
1452 * to loop over all the virtual GRFs. Compacting them can save a lot of
1453 * overhead.
1454 */
1455 void
1456 fs_visitor::compact_virtual_grfs()
1457 {
1458 /* Mark which virtual GRFs are used, and count how many. */
1459 int remap_table[this->virtual_grf_count];
1460 memset(remap_table, -1, sizeof(remap_table));
1461
1462 foreach_list(node, &this->instructions) {
1463 const fs_inst *inst = (const fs_inst *) node;
1464
1465 if (inst->dst.file == GRF)
1466 remap_table[inst->dst.reg] = 0;
1467
1468 for (int i = 0; i < 3; i++) {
1469 if (inst->src[i].file == GRF)
1470 remap_table[inst->src[i].reg] = 0;
1471 }
1472 }
1473
1474 /* In addition to registers used in instructions, fs_visitor keeps
1475 * direct references to certain special values which must be patched:
1476 */
1477 fs_reg *special[] = {
1478 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1479 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1480 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1481 &delta_x[0], &delta_x[1], &delta_x[2],
1482 &delta_x[3], &delta_x[4], &delta_x[5],
1483 &delta_y[0], &delta_y[1], &delta_y[2],
1484 &delta_y[3], &delta_y[4], &delta_y[5],
1485 };
1486 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1487 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1488
1489 /* Treat all special values as used, to be conservative */
1490 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1491 if (special[i]->file == GRF)
1492 remap_table[special[i]->reg] = 0;
1493 }
1494
1495 /* Compact the GRF arrays. */
1496 int new_index = 0;
1497 for (int i = 0; i < this->virtual_grf_count; i++) {
1498 if (remap_table[i] != -1) {
1499 remap_table[i] = new_index;
1500 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1501 if (live_intervals_valid) {
1502 virtual_grf_start[new_index] = virtual_grf_start[i];
1503 virtual_grf_end[new_index] = virtual_grf_end[i];
1504 }
1505 ++new_index;
1506 }
1507 }
1508
1509 this->virtual_grf_count = new_index;
1510
1511 /* Patch all the instructions to use the newly renumbered registers */
1512 foreach_list(node, &this->instructions) {
1513 fs_inst *inst = (fs_inst *) node;
1514
1515 if (inst->dst.file == GRF)
1516 inst->dst.reg = remap_table[inst->dst.reg];
1517
1518 for (int i = 0; i < 3; i++) {
1519 if (inst->src[i].file == GRF)
1520 inst->src[i].reg = remap_table[inst->src[i].reg];
1521 }
1522 }
1523
1524 /* Patch all the references to special values */
1525 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1526 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1527 special[i]->reg = remap_table[special[i]->reg];
1528 }
1529 }
1530
1531 bool
1532 fs_visitor::remove_dead_constants()
1533 {
1534 if (dispatch_width == 8) {
1535 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1536 this->nr_params_remap = c->prog_data.nr_params;
1537
1538 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1539 this->params_remap[i] = -1;
1540
1541 /* Find which params are still in use. */
1542 foreach_list(node, &this->instructions) {
1543 fs_inst *inst = (fs_inst *)node;
1544
1545 for (int i = 0; i < 3; i++) {
1546 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1547
1548 if (inst->src[i].file != UNIFORM)
1549 continue;
1550
1551 /* Section 5.11 of the OpenGL 4.3 spec says:
1552 *
1553 * "Out-of-bounds reads return undefined values, which include
1554 * values from other variables of the active program or zero."
1555 */
1556 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1557 constant_nr = 0;
1558 }
1559
1560 /* For now, set this to non-negative. We'll give it the
1561 * actual new number in a moment, in order to keep the
1562 * register numbers nicely ordered.
1563 */
1564 this->params_remap[constant_nr] = 0;
1565 }
1566 }
1567
1568 /* Figure out what the new numbers for the params will be. At some
1569 * point when we're doing uniform array access, we're going to want
1570 * to keep the distinction between .reg and .reg_offset, but for
1571 * now we don't care.
1572 */
1573 unsigned int new_nr_params = 0;
1574 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1575 if (this->params_remap[i] != -1) {
1576 this->params_remap[i] = new_nr_params++;
1577 }
1578 }
1579
1580 /* Update the list of params to be uploaded to match our new numbering. */
1581 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1582 int remapped = this->params_remap[i];
1583
1584 if (remapped == -1)
1585 continue;
1586
1587 c->prog_data.param[remapped] = c->prog_data.param[i];
1588 }
1589
1590 c->prog_data.nr_params = new_nr_params;
1591 } else {
1592 /* This should have been generated in the 8-wide pass already. */
1593 assert(this->params_remap);
1594 }
1595
1596 /* Now do the renumbering of the shader to remove unused params. */
1597 foreach_list(node, &this->instructions) {
1598 fs_inst *inst = (fs_inst *)node;
1599
1600 for (int i = 0; i < 3; i++) {
1601 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1602
1603 if (inst->src[i].file != UNIFORM)
1604 continue;
1605
1606 /* As above, alias out-of-bounds accesses to 0. */
1607 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1608 constant_nr = 0;
1609 }
1610 assert(this->params_remap[constant_nr] != -1);
1611 inst->src[i].reg = this->params_remap[constant_nr];
1612 inst->src[i].reg_offset = 0;
1613 }
1614 }
1615
1616 return true;
1617 }
1618
1619 /*
1620 * Implements array access of uniforms by inserting a
1621 * PULL_CONSTANT_LOAD instruction.
1622 *
1623 * Unlike temporary GRF array access (where we don't support it due to
1624 * the difficulty of doing relative addressing on instruction
1625 * destinations), we could potentially do array access of uniforms
1626 * that were loaded in GRF space as push constants. In real-world
1627 * usage we've seen, though, the arrays being used are always larger
1628 * than we could load as push constants, so just always move all
1629 * uniform array access out to a pull constant buffer.
1630 */
1631 void
1632 fs_visitor::move_uniform_array_access_to_pull_constants()
1633 {
1634 int pull_constant_loc[c->prog_data.nr_params];
1635
1636 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1637 pull_constant_loc[i] = -1;
1638 }
1639
1640 /* Walk through and find array access of uniforms. Put a copy of that
1641 * uniform in the pull constant buffer.
1642 *
1643 * Note that we don't move constant-indexed accesses to arrays. No
1644 * testing has been done of the performance impact of this choice.
1645 */
1646 foreach_list_safe(node, &this->instructions) {
1647 fs_inst *inst = (fs_inst *)node;
1648
1649 for (int i = 0 ; i < 3; i++) {
1650 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1651 continue;
1652
1653 int uniform = inst->src[i].reg;
1654
1655 /* If this array isn't already present in the pull constant buffer,
1656 * add it.
1657 */
1658 if (pull_constant_loc[uniform] == -1) {
1659 const float **values = &c->prog_data.param[uniform];
1660
1661 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1662
1663 assert(param_size[uniform]);
1664
1665 for (int j = 0; j < param_size[uniform]; j++) {
1666 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1667 values[j];
1668 }
1669 }
1670
1671 /* Set up the annotation tracking for new generated instructions. */
1672 base_ir = inst->ir;
1673 current_annotation = inst->annotation;
1674
1675 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1676 fs_reg temp = fs_reg(this, glsl_type::float_type);
1677 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1678 surf_index,
1679 *inst->src[i].reladdr,
1680 pull_constant_loc[uniform] +
1681 inst->src[i].reg_offset);
1682 inst->insert_before(&list);
1683
1684 inst->src[i].file = temp.file;
1685 inst->src[i].reg = temp.reg;
1686 inst->src[i].reg_offset = temp.reg_offset;
1687 inst->src[i].reladdr = NULL;
1688 }
1689 }
1690 }
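/* Tying this back to the "uniform vec4 a[20]" example in the
 * VARYING_PULL_CONSTANT_LOAD comment: the whole array is appended to
 * pull_param the first time a reladdr access to it is seen, and the access
 * is then rewritten as a pull load whose variable offset is the reladdr
 * register and whose constant offset is the array's pull-buffer location
 * plus the source's reg_offset.
 */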
1691
1692 /**
1693 * Choose accesses from the UNIFORM file to demote to using the pull
1694 * constant buffer.
1695 *
1696 * We allow a fragment shader to have more than the specified minimum
1697 * maximum number of fragment shader uniform components (64). If
1698 * there are too many of these, they'd fill up all of register space.
1699 * So, this will push some of them out to the pull constant buffer and
1700 * update the program to load them.
1701 */
1702 void
1703 fs_visitor::setup_pull_constants()
1704 {
1705 /* Only allow 16 registers (128 uniform components) as push constants. */
1706 unsigned int max_uniform_components = 16 * 8;
1707 if (c->prog_data.nr_params <= max_uniform_components)
1708 return;
1709
1710 if (dispatch_width == 16) {
1711 fail("Pull constants not supported in 16-wide\n");
1712 return;
1713 }
1714
1715 /* Just demote the end of the list. We could probably do better
1716 * here, demoting things that are rarely used in the program first.
1717 */
1718 unsigned int pull_uniform_base = max_uniform_components;
1719
1720 int pull_constant_loc[c->prog_data.nr_params];
1721 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1722 if (i < pull_uniform_base) {
1723 pull_constant_loc[i] = -1;
1724 } else {
1725 pull_constant_loc[i] = -1;
1726 /* If our constant is already being uploaded for reladdr purposes,
1727 * reuse it.
1728 */
1729 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1730 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1731 pull_constant_loc[i] = j;
1732 break;
1733 }
1734 }
1735 if (pull_constant_loc[i] == -1) {
1736 int pull_index = c->prog_data.nr_pull_params++;
1737 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1738 pull_constant_loc[i] = pull_index;
1739 }
1740 }
1741 }
1742 c->prog_data.nr_params = pull_uniform_base;
1743
1744 foreach_list(node, &this->instructions) {
1745 fs_inst *inst = (fs_inst *)node;
1746
1747 for (int i = 0; i < 3; i++) {
1748 if (inst->src[i].file != UNIFORM)
1749 continue;
1750
1751 int pull_index = pull_constant_loc[inst->src[i].reg +
1752 inst->src[i].reg_offset];
1753 if (pull_index == -1)
1754 continue;
1755
1756 assert(!inst->src[i].reladdr);
1757
1758 fs_reg dst = fs_reg(this, glsl_type::float_type);
1759 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1760 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1761 fs_inst *pull =
1762 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1763 dst, index, offset);
1764 pull->ir = inst->ir;
1765 pull->annotation = inst->annotation;
1766
1767 inst->insert_before(pull);
1768
1769 inst->src[i].file = GRF;
1770 inst->src[i].reg = dst.reg;
1771 inst->src[i].reg_offset = 0;
1772 inst->src[i].smear = pull_index & 3;
1773 }
1774 }
1775 }
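/* Worked example of the demotion above (hypothetical pull_index): for
 * pull_index == 6 the load fetches the 16-byte-aligned block at offset
 * (6 * 4) & ~15 == 16, and smear == 6 & 3 == 2 selects the third component
 * of that block for the rewritten source.
 */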
1776
1777 bool
1778 fs_visitor::opt_algebraic()
1779 {
1780 bool progress = false;
1781
1782 foreach_list(node, &this->instructions) {
1783 fs_inst *inst = (fs_inst *)node;
1784
1785 switch (inst->opcode) {
1786 case BRW_OPCODE_MUL:
1787 if (inst->src[1].file != IMM)
1788 continue;
1789
1790 /* a * 1.0 = a */
1791 if (inst->src[1].is_one()) {
1792 inst->opcode = BRW_OPCODE_MOV;
1793 inst->src[1] = reg_undef;
1794 progress = true;
1795 break;
1796 }
1797
1798 /* a * 0.0 = 0.0 */
1799 if (inst->src[1].is_zero()) {
1800 inst->opcode = BRW_OPCODE_MOV;
1801 inst->src[0] = inst->src[1];
1802 inst->src[1] = reg_undef;
1803 progress = true;
1804 break;
1805 }
1806
1807 break;
1808 case BRW_OPCODE_ADD:
1809 if (inst->src[1].file != IMM)
1810 continue;
1811
1812 /* a + 0.0 = a */
1813 if (inst->src[1].is_zero()) {
1814 inst->opcode = BRW_OPCODE_MOV;
1815 inst->src[1] = reg_undef;
1816 progress = true;
1817 break;
1818 }
1819 break;
1820 default:
1821 break;
1822 }
1823 }
1824
1825 return progress;
1826 }
1827
1828 /**
1829 * Removes any instructions writing a VGRF where that VGRF is not used by any
1830 * later instruction.
1831 */
1832 bool
1833 fs_visitor::dead_code_eliminate()
1834 {
1835 bool progress = false;
1836 int pc = 0;
1837
1838 calculate_live_intervals();
1839
1840 foreach_list_safe(node, &this->instructions) {
1841 fs_inst *inst = (fs_inst *)node;
1842
1843 if (inst->dst.file == GRF) {
1844 assert(this->virtual_grf_end[inst->dst.reg] >= pc);
1845 if (this->virtual_grf_end[inst->dst.reg] == pc) {
1846 inst->remove();
1847 progress = true;
1848 }
1849 }
1850
1851 pc++;
1852 }
1853
1854 if (progress)
1855 live_intervals_valid = false;
1856
1857 return progress;
1858 }
1859
1860 struct dead_code_hash_key
1861 {
1862 int vgrf;
1863 int reg_offset;
1864 };
1865
1866 static bool
1867 dead_code_hash_compare(const void *a, const void *b)
1868 {
1869 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1870 }
1871
1872 static void
1873 clear_dead_code_hash(struct hash_table *ht)
1874 {
1875 struct hash_entry *entry;
1876
1877 hash_table_foreach(ht, entry) {
1878 _mesa_hash_table_remove(ht, entry);
1879 }
1880 }
1881
1882 static void
1883 insert_dead_code_hash(struct hash_table *ht,
1884 int vgrf, int reg_offset, fs_inst *inst)
1885 {
1886 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1887 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1888
1889 key->vgrf = vgrf;
1890 key->reg_offset = reg_offset;
1891
1892 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1893 }
1894
1895 static struct hash_entry *
1896 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1897 {
1898 struct dead_code_hash_key key;
1899
1900 key.vgrf = vgrf;
1901 key.reg_offset = reg_offset;
1902
1903 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1904 }
1905
1906 static void
1907 remove_dead_code_hash(struct hash_table *ht,
1908 int vgrf, int reg_offset)
1909 {
1910 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1911 if (!entry)
1912 return;
1913
1914 _mesa_hash_table_remove(ht, entry);
1915 }
1916
1917 /**
1918 * Walks basic blocks, removing any regs that are written but not read before
1919 * being redefined.
1920 *
1921 * The dead_code_eliminate() function implements a global dead code
1922 * elimination, but it only handles removing the last write to a register
1923 * if it's never read. This one can handle intermediate writes, but only
1924 * within a basic block.
1925 */
1926 bool
1927 fs_visitor::dead_code_eliminate_local()
1928 {
1929 struct hash_table *ht;
1930 bool progress = false;
1931
1932 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1933
1934 foreach_list_safe(node, &this->instructions) {
1935 fs_inst *inst = (fs_inst *)node;
1936
1937 /* At a basic block boundary, empty the HT since we don't track
1938 * dataflow across blocks.
1939 */
1940 if (inst->is_control_flow()) {
1941 clear_dead_code_hash(ht);
1942 continue;
1943 }
1944
1945 /* Clear the HT of any instructions that got read. */
1946 for (int i = 0; i < 3; i++) {
1947 fs_reg src = inst->src[i];
1948 if (src.file != GRF)
1949 continue;
1950
1951 int read = 1;
1952 if (inst->is_send_from_grf())
1953 read = virtual_grf_sizes[src.reg] - src.reg_offset;
1954
1955 for (int reg_offset = src.reg_offset;
1956 reg_offset < src.reg_offset + read;
1957 reg_offset++) {
1958 remove_dead_code_hash(ht, src.reg, reg_offset);
1959 }
1960 }
1961
1962 /* Add any update of a GRF to the HT, removing a previous write if it
1963 * wasn't read.
1964 */
1965 if (inst->dst.file == GRF) {
1966 if (inst->regs_written > 1) {
1967 /* We don't know how to trim channels from an instruction's
1968 * writes, so we can't incrementally remove unread channels from
1969 * it. Just remove whatever it overwrites from the table.
1970 */
1971 for (int i = 0; i < inst->regs_written; i++) {
1972 remove_dead_code_hash(ht,
1973 inst->dst.reg,
1974 inst->dst.reg_offset + i);
1975 }
1976 } else {
1977 struct hash_entry *entry =
1978 get_dead_code_hash_entry(ht, inst->dst.reg,
1979 inst->dst.reg_offset);
1980
1981 if (inst->is_partial_write()) {
1982 /* For a partial write, we can't remove any previous dead code
1983 * candidate, since we're just modifying its result, but this
1984 * instruction can still be dead code eliminated itself.
1985 */
1986 if (entry) {
1987 entry->data = inst;
1988 } else {
1989 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1990 inst);
1991 }
1992 } else {
1993 if (entry) {
1994 /* We're completely updating a channel, and there was a
1995 * previous write to the channel that wasn't read. Kill it!
1996 */
1997 fs_inst *inst = (fs_inst *)entry->data;
1998 inst->remove();
1999 progress = true;
2000 _mesa_hash_table_remove(ht, entry);
2001 }
2002
2003 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2004 inst);
2005 }
2006 }
2007 }
2008 }
2009
2010 _mesa_hash_table_destroy(ht, NULL);
2011
2012 if (progress)
2013 live_intervals_valid = false;
2014
2015 return progress;
2016 }
2017
2018 /**
2019 * Implements a second type of register coalescing: This one checks if
2020 * the two regs involved in a raw move don't interfere, in which case
2021 * they can both be stored in the same place and the MOV removed.
2022 */
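/* Purely illustrative example (hypothetical IR): for
 *
 *    1. add vgrf3, vgrf1, vgrf2
 *    2. ... no further writes to vgrf3 or vgrf9 ...
 *    3. mov vgrf9, vgrf3
 *
 * if vgrf3 and vgrf9 have non-interfering live intervals, every reference
 * to vgrf3 is simply renamed to vgrf9 and the MOV at instruction 3 is
 * deleted.
 */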
2023 bool
2024 fs_visitor::register_coalesce_2()
2025 {
2026 bool progress = false;
2027
2028 calculate_live_intervals();
2029
2030 foreach_list_safe(node, &this->instructions) {
2031 fs_inst *inst = (fs_inst *)node;
2032
2033 if (inst->opcode != BRW_OPCODE_MOV ||
2034 inst->is_partial_write() ||
2035 inst->saturate ||
2036 inst->src[0].file != GRF ||
2037 inst->src[0].negate ||
2038 inst->src[0].abs ||
2039 inst->src[0].smear != -1 ||
2040 inst->dst.file != GRF ||
2041 inst->dst.type != inst->src[0].type ||
2042 virtual_grf_sizes[inst->src[0].reg] != 1 ||
2043 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
2044 continue;
2045 }
2046
2047 int reg_from = inst->src[0].reg;
2048 assert(inst->src[0].reg_offset == 0);
2049 int reg_to = inst->dst.reg;
2050 int reg_to_offset = inst->dst.reg_offset;
2051
2052 foreach_list(node, &this->instructions) {
2053 fs_inst *scan_inst = (fs_inst *)node;
2054
2055 if (scan_inst->dst.file == GRF &&
2056 scan_inst->dst.reg == reg_from) {
2057 scan_inst->dst.reg = reg_to;
2058 scan_inst->dst.reg_offset = reg_to_offset;
2059 }
2060 for (int i = 0; i < 3; i++) {
2061 if (scan_inst->src[i].file == GRF &&
2062 scan_inst->src[i].reg == reg_from) {
2063 scan_inst->src[i].reg = reg_to;
2064 scan_inst->src[i].reg_offset = reg_to_offset;
2065 }
2066 }
2067 }
2068
2069 inst->remove();
2070
2071 /* We don't need to recalculate live intervals inside the loop despite
2072 * flagging live_intervals_valid because we only use live intervals for
2073 * the interferes test, and we must have had a situation where the
2074 * intervals were:
2075 *
2076 * from    to
2077 *  ^
2078 *  |
2079 *  v
2080 *          ^
2081 *          |
2082 *          v
2083 *
2084 * Some register R that might get coalesced with one of these two could
2085 * only be referencing "to", otherwise "from"'s range would have been
2086 * longer. R's range could also only start at the end of "to" or later,
2087 * otherwise it will conflict with "to" when we try to coalesce "to"
2088 * into R anyway.
2089 */
2090 live_intervals_valid = false;
2091
2092 progress = true;
2093 continue;
2094 }
2095
2096 return progress;
2097 }
2098
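/**
 * Coalesces the destination of a plain MOV back into its source by
 * rewriting later reads of the destination to read the source instead,
 * as long as neither register is overwritten before the end of the
 * program.
 *
 * Purely illustrative example (hypothetical IR):
 *
 *    1. mov vgrf6, vgrf2
 *    2. mul vgrf7, vgrf6, vgrf3
 *
 * becomes
 *
 *    2. mul vgrf7, vgrf2, vgrf3
 */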
2099 bool
2100 fs_visitor::register_coalesce()
2101 {
2102 bool progress = false;
2103 int if_depth = 0;
2104 int loop_depth = 0;
2105
2106 foreach_list_safe(node, &this->instructions) {
2107 fs_inst *inst = (fs_inst *)node;
2108
2109 /* Make sure that we dominate the instructions we're going to
2110 * scan for interfering with our coalescing, or we won't have
2111 * scanned enough to see if anything interferes with our
2112 * coalescing. We don't dominate the following instructions if
2113 * we're in a loop or an if block.
2114 */
2115 switch (inst->opcode) {
2116 case BRW_OPCODE_DO:
2117 loop_depth++;
2118 break;
2119 case BRW_OPCODE_WHILE:
2120 loop_depth--;
2121 break;
2122 case BRW_OPCODE_IF:
2123 if_depth++;
2124 break;
2125 case BRW_OPCODE_ENDIF:
2126 if_depth--;
2127 break;
2128 default:
2129 break;
2130 }
2131 if (loop_depth || if_depth)
2132 continue;
2133
2134 if (inst->opcode != BRW_OPCODE_MOV ||
2135 inst->is_partial_write() ||
2136 inst->saturate ||
2137 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2138 inst->src[0].file != UNIFORM) ||
2139 inst->dst.type != inst->src[0].type)
2140 continue;
2141
2142 bool has_source_modifiers = (inst->src[0].abs ||
2143 inst->src[0].negate ||
2144 inst->src[0].smear != -1 ||
2145 inst->src[0].file == UNIFORM);
2146
2147 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2148 * them: check for no writes to either one until the exit of the
2149 * program.
2150 */
2151 bool interfered = false;
2152
2153 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2154 !scan_inst->is_tail_sentinel();
2155 scan_inst = (fs_inst *)scan_inst->next) {
2156 if (scan_inst->dst.file == GRF) {
2157 if (scan_inst->overwrites_reg(inst->dst) ||
2158 scan_inst->overwrites_reg(inst->src[0])) {
2159 interfered = true;
2160 break;
2161 }
2162 }
2163
2164 if (has_source_modifiers) {
2165 for (int i = 0; i < 3; i++) {
2166 if (scan_inst->src[i].file == GRF &&
2167 scan_inst->src[i].reg == inst->dst.reg &&
2168 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2169 inst->dst.type != scan_inst->src[i].type)
2170 {
2171 interfered = true;
2172 break;
2173 }
2174 }
2175 }
2176
2177
2178 /* The gen6 MATH instruction can't handle source modifiers or
2179 * unusual register regions, so avoid coalescing those for
2180 * now. We should do something more specific.
2181 */
2182 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2183 interfered = true;
2184 break;
2185 }
2186
2187 /* The accumulator result appears to get used for the
2188 * conditional modifier generation. When negating a UD
2189 * value, there is a 33rd bit generated for the sign in the
2190 * accumulator value, so now you can't check, for example,
2191 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2192 */
2193 if (scan_inst->conditional_mod &&
2194 inst->src[0].negate &&
2195 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2196 interfered = true;
2197 break;
2198 }
2199 }
2200 if (interfered) {
2201 continue;
2202 }
2203
2204 /* Rewrite the later usage to point at the source of the move to
2205 * be removed.
2206 */
2207 for (fs_inst *scan_inst = inst;
2208 !scan_inst->is_tail_sentinel();
2209 scan_inst = (fs_inst *)scan_inst->next) {
2210 for (int i = 0; i < 3; i++) {
2211 if (scan_inst->src[i].file == GRF &&
2212 scan_inst->src[i].reg == inst->dst.reg &&
2213 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2214 fs_reg new_src = inst->src[0];
2215 if (scan_inst->src[i].abs) {
2216 new_src.negate = 0;
2217 new_src.abs = 1;
2218 }
2219 new_src.negate ^= scan_inst->src[i].negate;
2220 scan_inst->src[i] = new_src;
2221 }
2222 }
2223 }
2224
2225 inst->remove();
2226 progress = true;
2227 }
2228
2229 if (progress)
2230 live_intervals_valid = false;
2231
2232 return progress;
2233 }
2234
2235
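/**
 * Looks for a MOV of a GRF into an MRF and tries to rewrite the
 * instruction that produced the GRF value to write straight into the MRF
 * instead, eliminating the MOV.
 *
 * Purely illustrative example (hypothetical IR):
 *
 *    1. mul vgrf5, vgrf1, vgrf2
 *    2. mov m4, vgrf5
 *
 * becomes
 *
 *    1. mul m4, vgrf1, vgrf2
 */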
2236 bool
2237 fs_visitor::compute_to_mrf()
2238 {
2239 bool progress = false;
2240 int next_ip = 0;
2241
2242 calculate_live_intervals();
2243
2244 foreach_list_safe(node, &this->instructions) {
2245 fs_inst *inst = (fs_inst *)node;
2246
2247 int ip = next_ip;
2248 next_ip++;
2249
2250 if (inst->opcode != BRW_OPCODE_MOV ||
2251 inst->is_partial_write() ||
2252 inst->dst.file != MRF || inst->src[0].file != GRF ||
2253 inst->dst.type != inst->src[0].type ||
2254 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2255 continue;
2256
2257 /* Work out which hardware MRF registers are written by this
2258 * instruction.
2259 */
2260 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2261 int mrf_high;
2262 if (inst->dst.reg & BRW_MRF_COMPR4) {
2263 mrf_high = mrf_low + 4;
2264 } else if (dispatch_width == 16 &&
2265 (!inst->force_uncompressed && !inst->force_sechalf)) {
2266 mrf_high = mrf_low + 1;
2267 } else {
2268 mrf_high = mrf_low;
2269 }
2270
2271 /* Can't compute-to-MRF this GRF if someone else was going to
2272 * read it later.
2273 */
2274 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2275 continue;
2276
2277 /* Found a move of a GRF to a MRF. Let's see if we can rewrite the
2278 * instruction that produced this GRF value to write into the MRF directly.
2279 */
2280 fs_inst *scan_inst;
2281 for (scan_inst = (fs_inst *)inst->prev;
2282 scan_inst->prev != NULL;
2283 scan_inst = (fs_inst *)scan_inst->prev) {
2284 if (scan_inst->dst.file == GRF &&
2285 scan_inst->dst.reg == inst->src[0].reg) {
2286 /* Found the last instruction to write the register we want to
2287 * turn into a compute-to-MRF.
2288 */
2289
2290 /* If this one instruction didn't populate all the
2291 * channels, bail. We might be able to rewrite everything
2292 * that writes that reg, but it would require smarter
2293 * tracking to delay the rewriting until complete success.
2294 */
2295 if (scan_inst->is_partial_write())
2296 break;
2297
2298 /* Things returning more than one register would need us to
2299 * understand coalescing out more than one MOV at a time.
2300 */
2301 if (scan_inst->regs_written > 1)
2302 break;
2303
2304 /* SEND instructions can't have MRF as a destination. */
2305 if (scan_inst->mlen)
2306 break;
2307
2308 if (brw->gen == 6) {
2309 /* gen6 math instructions must have the destination be
2310 * GRF, so no compute-to-MRF for them.
2311 */
2312 if (scan_inst->is_math()) {
2313 break;
2314 }
2315 }
2316
2317 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2318 /* Found the creator of our MRF's source value. */
2319 scan_inst->dst.file = MRF;
2320 scan_inst->dst.reg = inst->dst.reg;
2321 scan_inst->saturate |= inst->saturate;
2322 inst->remove();
2323 progress = true;
2324 }
2325 break;
2326 }
2327
2328 /* We don't handle control flow here. Most computation of
2329 * values that end up in MRFs happens shortly before the MRF
2330 * write anyway.
2331 */
2332 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2333 break;
2334
2335 /* You can't read from an MRF, so if someone else reads our
2336 * MRF's source GRF that we wanted to rewrite, that stops us.
2337 */
2338 bool interfered = false;
2339 for (int i = 0; i < 3; i++) {
2340 if (scan_inst->src[i].file == GRF &&
2341 scan_inst->src[i].reg == inst->src[0].reg &&
2342 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2343 interfered = true;
2344 }
2345 }
2346 if (interfered)
2347 break;
2348
2349 if (scan_inst->dst.file == MRF) {
2350 /* If somebody else writes our MRF here, we can't
2351 * compute-to-MRF before that.
2352 */
2353 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2354 int scan_mrf_high;
2355
2356 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2357 scan_mrf_high = scan_mrf_low + 4;
2358 } else if (dispatch_width == 16 &&
2359 (!scan_inst->force_uncompressed &&
2360 !scan_inst->force_sechalf)) {
2361 scan_mrf_high = scan_mrf_low + 1;
2362 } else {
2363 scan_mrf_high = scan_mrf_low;
2364 }
2365
2366 if (mrf_low == scan_mrf_low ||
2367 mrf_low == scan_mrf_high ||
2368 mrf_high == scan_mrf_low ||
2369 mrf_high == scan_mrf_high) {
2370 break;
2371 }
2372 }
2373
2374 if (scan_inst->mlen > 0) {
2375 /* Found a SEND instruction, which means that there are
2376 * live values in MRFs from base_mrf to base_mrf +
2377 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2378 * above it.
2379 */
2380 if (mrf_low >= scan_inst->base_mrf &&
2381 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2382 break;
2383 }
2384 if (mrf_high >= scan_inst->base_mrf &&
2385 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2386 break;
2387 }
2388 }
2389 }
2390 }
2391
2392 if (progress)
2393 live_intervals_valid = false;
2394
2395 return progress;
2396 }
2397
2398 /**
2399 * Walks through basic blocks, looking for repeated MRF writes and
2400 * removing the later ones.
2401 */
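/* Purely illustrative example (hypothetical IR): if a basic block contains
 *
 *    1. mov m3, vgrf5
 *    2. mov m4, vgrf6
 *    3. mov m3, vgrf5
 *
 * and nothing rewrites vgrf5 or m3 in between, instruction 3 compares
 * equal to the recorded last write to m3 and is removed.
 */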
2402 bool
2403 fs_visitor::remove_duplicate_mrf_writes()
2404 {
2405 fs_inst *last_mrf_move[16];
2406 bool progress = false;
2407
2408 /* We would need to update the MRF tracking to handle compressed instructions, so skip 16-wide dispatch. */
2409 if (dispatch_width == 16)
2410 return false;
2411
2412 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2413
2414 foreach_list_safe(node, &this->instructions) {
2415 fs_inst *inst = (fs_inst *)node;
2416
2417 if (inst->is_control_flow()) {
2418 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2419 }
2420
2421 if (inst->opcode == BRW_OPCODE_MOV &&
2422 inst->dst.file == MRF) {
2423 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2424 if (prev_inst && inst->equals(prev_inst)) {
2425 inst->remove();
2426 progress = true;
2427 continue;
2428 }
2429 }
2430
2431 /* Clear out the last-write records for MRFs that were overwritten. */
2432 if (inst->dst.file == MRF) {
2433 last_mrf_move[inst->dst.reg] = NULL;
2434 }
2435
2436 if (inst->mlen > 0) {
2437 /* Found a SEND instruction, which will include two or fewer
2438 * implied MRF writes. We could do better here.
2439 */
2440 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2441 last_mrf_move[inst->base_mrf + i] = NULL;
2442 }
2443 }
2444
2445 /* Clear out any MRF move records whose sources got overwritten. */
2446 if (inst->dst.file == GRF) {
2447 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2448 if (last_mrf_move[i] &&
2449 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2450 last_mrf_move[i] = NULL;
2451 }
2452 }
2453 }
2454
2455 if (inst->opcode == BRW_OPCODE_MOV &&
2456 inst->dst.file == MRF &&
2457 inst->src[0].file == GRF &&
2458 !inst->is_partial_write()) {
2459 last_mrf_move[inst->dst.reg] = inst;
2460 }
2461 }
2462
2463 if (progress)
2464 live_intervals_valid = false;
2465
2466 return progress;
2467 }
2468
2469 static void
2470 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2471 int first_grf, int grf_len)
2472 {
2473 bool inst_16wide = (dispatch_width > 8 &&
2474 !inst->force_uncompressed &&
2475 !inst->force_sechalf);
2476
2477 /* Clear the flag for registers that actually got read (as expected). */
2478 for (int i = 0; i < 3; i++) {
2479 int grf;
2480 if (inst->src[i].file == GRF) {
2481 grf = inst->src[i].reg;
2482 } else if (inst->src[i].file == HW_REG &&
2483 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2484 grf = inst->src[i].fixed_hw_reg.nr;
2485 } else {
2486 continue;
2487 }
2488
2489 if (grf >= first_grf &&
2490 grf < first_grf + grf_len) {
2491 deps[grf - first_grf] = false;
2492 if (inst_16wide)
2493 deps[grf - first_grf + 1] = false;
2494 }
2495 }
2496 }
2497
2498 /**
2499 * Implements this workaround for the original 965:
2500 *
2501 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2502 * check for post destination dependencies on this instruction, software
2503 * must ensure that there is no destination hazard for the case of ‘write
2504 * followed by a posted write’ shown in the following example.
2505 *
2506 * 1. mov r3 0
2507 * 2. send r3.xy <rest of send instruction>
2508 * 3. mov r2 r3
2509 *
2510 * Due to no post-destination dependency check on the ‘send’, the above
2511 * code sequence could have two instructions (1 and 2) in flight at the
2512 * same time that both consider ‘r3’ as the target of their final writes.
2513 */
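/* A rough sketch of the fix (illustrative only, reusing the example
 * above): before the send, the pass inserts a DEP_RESOLVE_MOV touching
 * r3, forcing the outstanding write from instruction 1 to be resolved
 * before the send's posted write to r3 can occur:
 *
 *    1. mov r3 0
 *    1a. DEP_RESOLVE_MOV(r3)
 *    2. send r3.xy <rest of send instruction>
 *    3. mov r2 r3
 */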
2514 void
2515 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2516 {
2517 int reg_size = dispatch_width / 8;
2518 int write_len = inst->regs_written * reg_size;
2519 int first_write_grf = inst->dst.reg;
2520 bool needs_dep[BRW_MAX_MRF];
2521 assert(write_len < (int)sizeof(needs_dep) - 1);
2522
2523 memset(needs_dep, false, sizeof(needs_dep));
2524 memset(needs_dep, true, write_len);
2525
2526 clear_deps_for_inst_src(inst, dispatch_width,
2527 needs_dep, first_write_grf, write_len);
2528
2529 /* Walk backwards looking for writes to registers we're writing which
2530 * aren't read since being written. If we hit the start of the program,
2531 * we assume that there are no outstanding dependencies on entry to the
2532 * program.
2533 */
2534 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2535 scan_inst != NULL;
2536 scan_inst = (fs_inst *)scan_inst->prev) {
2537
2538 /* If we hit control flow, assume that there *are* outstanding
2539 * dependencies, and force their cleanup before our instruction.
2540 */
2541 if (scan_inst->is_control_flow()) {
2542 for (int i = 0; i < write_len; i++) {
2543 if (needs_dep[i]) {
2544 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2545 }
2546 }
2547 return;
2548 }
2549
2550 bool scan_inst_16wide = (dispatch_width > 8 &&
2551 !scan_inst->force_uncompressed &&
2552 !scan_inst->force_sechalf);
2553
2554 /* We insert our reads as late as possible on the assumption that any
2555 * instruction but a MOV that might have left us an outstanding
2556 * dependency has more latency than a MOV.
2557 */
2558 if (scan_inst->dst.file == GRF) {
2559 for (int i = 0; i < scan_inst->regs_written; i++) {
2560 int reg = scan_inst->dst.reg + i * reg_size;
2561
2562 if (reg >= first_write_grf &&
2563 reg < first_write_grf + write_len &&
2564 needs_dep[reg - first_write_grf]) {
2565 inst->insert_before(DEP_RESOLVE_MOV(reg));
2566 needs_dep[reg - first_write_grf] = false;
2567 if (scan_inst_16wide)
2568 needs_dep[reg - first_write_grf + 1] = false;
2569 }
2570 }
2571 }
2572
2573 /* Clear the flag for registers that actually got read (as expected). */
2574 clear_deps_for_inst_src(scan_inst, dispatch_width,
2575 needs_dep, first_write_grf, write_len);
2576
2577 /* Continue the loop only if we haven't resolved all the dependencies */
2578 int i;
2579 for (i = 0; i < write_len; i++) {
2580 if (needs_dep[i])
2581 break;
2582 }
2583 if (i == write_len)
2584 return;
2585 }
2586 }
2587
2588 /**
2589 * Implements this workaround for the original 965:
2590 *
2591 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2592 * used as a destination register until after it has been sourced by an
2593 * instruction with a different destination register."
2594 */
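/* A rough sketch (illustrative only): if a send writes r4 and a later
 * instruction wants to overwrite r4 before anything has read it, the pass
 * inserts a DEP_RESOLVE_MOV that sources r4 first:
 *
 *    1. send r4 <rest of send instruction>
 *    1a. DEP_RESOLVE_MOV(r4)
 *    2. mov r4 r5
 */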
2595 void
2596 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2597 {
2598 int write_len = inst->regs_written * dispatch_width / 8;
2599 int first_write_grf = inst->dst.reg;
2600 bool needs_dep[BRW_MAX_MRF];
2601 assert(write_len < (int)sizeof(needs_dep) - 1);
2602
2603 memset(needs_dep, false, sizeof(needs_dep));
2604 memset(needs_dep, true, write_len);
2605 /* Walk forwards looking for writes to registers we're writing which aren't
2606 * read before being written.
2607 */
2608 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2609 !scan_inst->is_tail_sentinel();
2610 scan_inst = (fs_inst *)scan_inst->next) {
2611 /* If we hit control flow, force resolve all remaining dependencies. */
2612 if (scan_inst->is_control_flow()) {
2613 for (int i = 0; i < write_len; i++) {
2614 if (needs_dep[i])
2615 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2616 }
2617 return;
2618 }
2619
2620 /* Clear the flag for registers that actually got read (as expected). */
2621 clear_deps_for_inst_src(scan_inst, dispatch_width,
2622 needs_dep, first_write_grf, write_len);
2623
2624 /* We insert our reads as late as possible since they're reading the
2625 * result of a SEND, which has massive latency.
2626 */
2627 if (scan_inst->dst.file == GRF &&
2628 scan_inst->dst.reg >= first_write_grf &&
2629 scan_inst->dst.reg < first_write_grf + write_len &&
2630 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2631 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2632 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2633 }
2634
2635 /* Continue the loop only if we haven't resolved all the dependencies */
2636 int i;
2637 for (i = 0; i < write_len; i++) {
2638 if (needs_dep[i])
2639 break;
2640 }
2641 if (i == write_len)
2642 return;
2643 }
2644
2645 /* If we hit the end of the program, resolve all remaining dependencies out
2646 * of paranoia.
2647 */
2648 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2649 assert(last_inst->eot);
2650 for (int i = 0; i < write_len; i++) {
2651 if (needs_dep[i])
2652 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2653 }
2654 }
2655
2656 void
2657 fs_visitor::insert_gen4_send_dependency_workarounds()
2658 {
2659 if (brw->gen != 4 || brw->is_g4x)
2660 return;
2661
2662 /* Note that we're done with register allocation, so GRF fs_regs always
2663 * have a .reg_offset of 0.
2664 */
2665
2666 foreach_list_safe(node, &this->instructions) {
2667 fs_inst *inst = (fs_inst *)node;
2668
2669 if (inst->mlen != 0 && inst->dst.file == GRF) {
2670 insert_gen4_pre_send_dependency_workarounds(inst);
2671 insert_gen4_post_send_dependency_workarounds(inst);
2672 }
2673 }
2674 }
2675
2676 /**
2677 * Turns the generic expression-style uniform pull constant load instruction
2678 * into a hardware-specific series of instructions for loading a pull
2679 * constant.
2680 *
2681 * The expression style allows the CSE pass before this to optimize out
2682 * repeated loads from the same offset, and gives the pre-register-allocation
2683 * scheduling full flexibility, while the conversion to native instructions
2684 * allows the post-register-allocation scheduler the best information
2685 * possible.
2686 *
2687 * Note that execution masking for setting up pull constant loads is special:
2688 * the channels that need to be written are unrelated to the current execution
2689 * mask, since a later instruction will use one of the result channels as a
2690 * source operand for all 8 or 16 of its channels.
2691 */
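/* Purely illustrative example of the gen7 path below (hypothetical IR and
 * operand values): a load of the uniform at vec4-aligned byte offset 16,
 *
 *    uniform_pull_const_load vgrf6, surf_index, 16u
 *
 * is rewritten as
 *
 *    set_simd4x2_offset vgrf7, 4u
 *    uniform_pull_const_load_gen7 vgrf6, surf_index, vgrf7
 *
 * where 4u is the dword offset (16 bytes / 4) and the SET_SIMD4X2_OFFSET
 * runs with force_writemask_all so its result is a def for every channel.
 */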
2692 void
2693 fs_visitor::lower_uniform_pull_constant_loads()
2694 {
2695 foreach_list(node, &this->instructions) {
2696 fs_inst *inst = (fs_inst *)node;
2697
2698 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2699 continue;
2700
2701 if (brw->gen >= 7) {
2702 /* The offset arg before was a vec4-aligned byte offset. We need to
2703 * turn it into a dword offset.
2704 */
2705 fs_reg const_offset_reg = inst->src[1];
2706 assert(const_offset_reg.file == IMM &&
2707 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2708 const_offset_reg.imm.u /= 4;
2709 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2710
2711 /* This is actually going to be a MOV, but since only the first dword
2712 * is accessed, we have a special opcode to do just that one. Note
2713 * that this needs to be an operation that will be considered a def
2714 * by live variable analysis, or register allocation will explode.
2715 */
2716 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2717 payload, const_offset_reg);
2718 setup->force_writemask_all = true;
2719
2720 setup->ir = inst->ir;
2721 setup->annotation = inst->annotation;
2722 inst->insert_before(setup);
2723
2724 /* Similarly, this will only populate the first 4 channels of the
2725 * result register (since we only use smear values from 0-3), but we
2726 * don't tell the optimizer.
2727 */
2728 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2729 inst->src[1] = payload;
2730
2731 this->live_intervals_valid = false;
2732 } else {
2733 /* Before register allocation, we didn't tell the scheduler about the
2734 * MRF we use. We know it's safe to use this MRF because nothing
2735 * else does except for register spill/unspill, which generates and
2736 * uses its MRF within a single IR instruction.
2737 */
2738 inst->base_mrf = 14;
2739 inst->mlen = 1;
2740 }
2741 }
2742 }
2743
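/**
 * Prints one IR instruction in human-readable form for debugging.
 *
 * A line of output looks roughly like (illustrative, following the printf
 * calls below):
 *
 *    (+f0.1) add.sat vgrf4, vgrf2, 2.0f, (null),  2ndhalf
 */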
2744 void
2745 fs_visitor::dump_instruction(backend_instruction *be_inst)
2746 {
2747 fs_inst *inst = (fs_inst *)be_inst;
2748
2749 if (inst->predicate) {
2750 printf("(%cf0.%d) ",
2751 inst->predicate_inverse ? '-' : '+',
2752 inst->flag_subreg);
2753 }
2754
2755 printf("%s", brw_instruction_name(inst->opcode));
2756 if (inst->saturate)
2757 printf(".sat");
2758 if (inst->conditional_mod) {
2759 printf(".cmod");
2760 if (!inst->predicate &&
2761 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2762 inst->opcode != BRW_OPCODE_IF &&
2763 inst->opcode != BRW_OPCODE_WHILE))) {
2764 printf(".f0.%d", inst->flag_subreg);
2765 }
2766 }
2767 printf(" ");
2768
2769
2770 switch (inst->dst.file) {
2771 case GRF:
2772 printf("vgrf%d", inst->dst.reg);
2773 if (inst->dst.reg_offset)
2774 printf("+%d", inst->dst.reg_offset);
2775 break;
2776 case MRF:
2777 printf("m%d", inst->dst.reg);
2778 break;
2779 case BAD_FILE:
2780 printf("(null)");
2781 break;
2782 case UNIFORM:
2783 printf("***u%d***", inst->dst.reg);
2784 break;
2785 case ARF:
2786 if (inst->dst.reg == BRW_ARF_NULL)
2787 printf("(null)");
2788 else
2789 printf("arf%d", inst->dst.reg);
2790 break;
2791 default:
2792 printf("???");
2793 break;
2794 }
2795 printf(", ");
2796
2797 for (int i = 0; i < 3; i++) {
2798 if (inst->src[i].negate)
2799 printf("-");
2800 if (inst->src[i].abs)
2801 printf("|");
2802 switch (inst->src[i].file) {
2803 case GRF:
2804 printf("vgrf%d", inst->src[i].reg);
2805 if (inst->src[i].reg_offset)
2806 printf("+%d", inst->src[i].reg_offset);
2807 break;
2808 case MRF:
2809 printf("***m%d***", inst->src[i].reg);
2810 break;
2811 case UNIFORM:
2812 printf("u%d", inst->src[i].reg);
2813 if (inst->src[i].reg_offset)
2814 printf(".%d", inst->src[i].reg_offset);
2815 break;
2816 case BAD_FILE:
2817 printf("(null)");
2818 break;
2819 case IMM:
2820 switch (inst->src[i].type) {
2821 case BRW_REGISTER_TYPE_F:
2822 printf("%ff", inst->src[i].imm.f);
2823 break;
2824 case BRW_REGISTER_TYPE_D:
2825 printf("%dd", inst->src[i].imm.i);
2826 break;
2827 case BRW_REGISTER_TYPE_UD:
2828 printf("%uu", inst->src[i].imm.u);
2829 break;
2830 default:
2831 printf("???");
2832 break;
2833 }
2834 break;
2835 default:
2836 printf("???");
2837 break;
2838 }
2839 if (inst->src[i].abs)
2840 printf("|");
2841
2842 if (i < 3)
2843 printf(", ");
2844 }
2845
2846 printf(" ");
2847
2848 if (inst->force_uncompressed)
2849 printf("1sthalf ");
2850
2851 if (inst->force_sechalf)
2852 printf("2ndhalf ");
2853
2854 printf("\n");
2855 }
2856
2857 /**
2858 * Possibly returns an instruction that set up @param reg.
2859 *
2860 * Sometimes we want to take the result of some expression/variable
2861 * dereference tree and rewrite the instruction generating the result
2862 * of the tree. When processing the tree, we know that the
2863 * instructions generated are all writing temporaries that are dead
2864 * outside of this tree. So, if we have some instructions that write
2865 * a temporary, we're free to point that temp write somewhere else.
2866 *
2867 * Note that this doesn't guarantee that the returned instruction generated
2868 * only reg -- it might be the size=4 destination of a texture instruction.
2869 */
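/* Illustrative use (hypothetical caller code): after emitting the tree for
 * an expression whose result landed in vgrf12, a caller can do roughly
 *
 *    fs_inst *gen = get_instruction_generating_reg(start, end, vgrf12);
 *    if (gen)
 *       gen->dst = the_reg_we_really_wanted;   // hypothetical destination
 *
 * and skip the extra MOV; when NULL is returned the caller has to emit a
 * copy instead.
 */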
2870 fs_inst *
2871 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2872 fs_inst *end,
2873 fs_reg reg)
2874 {
2875 if (end == start ||
2876 end->is_partial_write() ||
2877 reg.reladdr ||
2878 !reg.equals(end->dst)) {
2879 return NULL;
2880 } else {
2881 return end;
2882 }
2883 }
2884
2885 void
2886 fs_visitor::setup_payload_gen6()
2887 {
2888 bool uses_depth =
2889 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2890 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2891
2892 assert(brw->gen >= 6);
2893
2894 /* R0-1: masks, pixel X/Y coordinates. */
2895 c->nr_payload_regs = 2;
2896 /* R2: only for 32-pixel dispatch. */
2897
2898 /* R3-26: barycentric interpolation coordinates. These appear in the
2899 * same order that they appear in the brw_wm_barycentric_interp_mode
2900 * enum. Each set of coordinates occupies 2 registers if dispatch width
2901 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2902 * appear if they were enabled using the "Barycentric Interpolation
2903 * Mode" bits in WM_STATE.
2904 */
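/* For example (an assumed common case, not derived from any particular
 * shader): an 8-wide program that only enables perspective pixel
 * barycentrics gets a single set of coordinates here, so
 * barycentric_coord_reg for that mode becomes 2 and nr_payload_regs
 * advances from 2 to 4 before the depth/W registers below are considered.
 */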
2905 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2906 if (barycentric_interp_modes & (1 << i)) {
2907 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2908 c->nr_payload_regs += 2;
2909 if (dispatch_width == 16) {
2910 c->nr_payload_regs += 2;
2911 }
2912 }
2913 }
2914
2915 /* R27: interpolated depth if uses source depth */
2916 if (uses_depth) {
2917 c->source_depth_reg = c->nr_payload_regs;
2918 c->nr_payload_regs++;
2919 if (dispatch_width == 16) {
2920 /* R28: interpolated depth if not 8-wide. */
2921 c->nr_payload_regs++;
2922 }
2923 }
2924 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2925 if (uses_depth) {
2926 c->source_w_reg = c->nr_payload_regs;
2927 c->nr_payload_regs++;
2928 if (dispatch_width == 16) {
2929 /* R30: interpolated W if not 8-wide. */
2930 c->nr_payload_regs++;
2931 }
2932 }
2933 /* R31: MSAA position offsets. */
2934 /* R32-: bary for 32-pixel. */
2935 /* R58-59: interp W for 32-pixel. */
2936
2937 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2938 c->source_depth_to_render_target = true;
2939 }
2940 }
2941
2942 bool
2943 fs_visitor::run()
2944 {
2945 sanity_param_count = fp->Base.Parameters->NumParameters;
2946 uint32_t orig_nr_params = c->prog_data.nr_params;
2947
2948 if (brw->gen >= 6)
2949 setup_payload_gen6();
2950 else
2951 setup_payload_gen4();
2952
2953 if (0) {
2954 emit_dummy_fs();
2955 } else {
2956 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2957 emit_shader_time_begin();
2958
2959 calculate_urb_setup();
2960 if (brw->gen < 6)
2961 emit_interpolation_setup_gen4();
2962 else
2963 emit_interpolation_setup_gen6();
2964
2965 /* We handle discards by keeping track of the still-live pixels in f0.1.
2966 * Initialize it with the dispatched pixels.
2967 */
2968 if (fp->UsesKill) {
2969 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2970 discard_init->flag_subreg = 1;
2971 }
2972
2973 /* Generate FS IR for main(). (the visitor only descends into
2974 * functions called "main").
2975 */
2976 if (shader) {
2977 foreach_list(node, &*shader->ir) {
2978 ir_instruction *ir = (ir_instruction *)node;
2979 base_ir = ir;
2980 this->result = reg_undef;
2981 ir->accept(this);
2982 }
2983 } else {
2984 emit_fragment_program_code();
2985 }
2986 base_ir = NULL;
2987 if (failed)
2988 return false;
2989
2990 emit(FS_OPCODE_PLACEHOLDER_HALT);
2991
2992 emit_fb_writes();
2993
2994 split_virtual_grfs();
2995
2996 move_uniform_array_access_to_pull_constants();
2997 setup_pull_constants();
2998
2999 bool progress;
3000 do {
3001 progress = false;
3002
3003 compact_virtual_grfs();
3004
3005 progress = remove_duplicate_mrf_writes() || progress;
3006
3007 progress = opt_algebraic() || progress;
3008 progress = opt_cse() || progress;
3009 progress = opt_copy_propagate() || progress;
3010 progress = dead_code_eliminate() || progress;
3011 progress = dead_code_eliminate_local() || progress;
3012 progress = register_coalesce() || progress;
3013 progress = register_coalesce_2() || progress;
3014 progress = compute_to_mrf() || progress;
3015 } while (progress);
3016
3017 remove_dead_constants();
3018
3019 schedule_instructions(false);
3020
3021 lower_uniform_pull_constant_loads();
3022
3023 assign_curb_setup();
3024 assign_urb_setup();
3025
3026 if (0) {
3027 /* Debug of register spilling: Go spill everything. */
3028 for (int i = 0; i < virtual_grf_count; i++) {
3029 spill_reg(i);
3030 }
3031 }
3032
3033 if (0)
3034 assign_regs_trivial();
3035 else {
3036 while (!assign_regs()) {
3037 if (failed)
3038 break;
3039 }
3040 }
3041 }
3042 assert(force_uncompressed_stack == 0);
3043 assert(force_sechalf_stack == 0);
3044
3045 /* This must come after all optimization and register allocation, since
3046 * it inserts dead code that happens to have side effects, and it does
3047 * so based on the actual physical registers in use.
3048 */
3049 insert_gen4_send_dependency_workarounds();
3050
3051 if (failed)
3052 return false;
3053
3054 schedule_instructions(true);
3055
3056 if (dispatch_width == 8) {
3057 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3058 } else {
3059 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3060
3061 /* Make sure we didn't try to sneak in an extra uniform */
3062 assert(orig_nr_params == c->prog_data.nr_params);
3063 (void) orig_nr_params;
3064 }
3065
3066 /* If any state parameters were appended, then ParameterValues could have
3067 * been realloced, in which case the driver uniform storage set up by
3068 * _mesa_associate_uniform_storage() would point to freed memory. Make
3069 * sure that didn't happen.
3070 */
3071 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3072
3073 return !failed;
3074 }
3075
3076 const unsigned *
3077 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3078 struct gl_fragment_program *fp,
3079 struct gl_shader_program *prog,
3080 unsigned *final_assembly_size)
3081 {
3082 bool start_busy = false;
3083 float start_time = 0;
3084
3085 if (unlikely(brw->perf_debug)) {
3086 start_busy = (brw->batch.last_bo &&
3087 drm_intel_bo_busy(brw->batch.last_bo));
3088 start_time = get_time();
3089 }
3090
3091 struct brw_shader *shader = NULL;
3092 if (prog)
3093 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3094
3095 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3096 if (prog) {
3097 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3098 _mesa_print_ir(shader->ir, NULL);
3099 printf("\n\n");
3100 } else {
3101 printf("ARB_fragment_program %d ir for native fragment shader\n",
3102 fp->Base.Id);
3103 _mesa_print_program(&fp->Base);
3104 }
3105 }
3106
3107 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3108 */
3109 fs_visitor v(brw, c, prog, fp, 8);
3110 if (!v.run()) {
3111 if (prog) {
3112 prog->LinkStatus = false;
3113 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3114 }
3115
3116 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3117 v.fail_msg);
3118
3119 return NULL;
3120 }
3121
3122 exec_list *simd16_instructions = NULL;
3123 fs_visitor v2(brw, c, prog, fp, 16);
3124 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3125 if (c->prog_data.nr_pull_params == 0) {
3126 /* Try a 16-wide compile */
3127 v2.import_uniforms(&v);
3128 if (!v2.run()) {
3129 perf_debug("16-wide shader failed to compile, falling back to "
3130 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3131 } else {
3132 simd16_instructions = &v2.instructions;
3133 }
3134 } else {
3135 perf_debug("Skipping 16-wide due to pull parameters.\n");
3136 }
3137 }
3138
3139 c->prog_data.dispatch_width = 8;
3140
3141 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3142 const unsigned *generated = g.generate_assembly(&v.instructions,
3143 simd16_instructions,
3144 final_assembly_size);
3145
3146 if (unlikely(brw->perf_debug) && shader) {
3147 if (shader->compiled_once)
3148 brw_wm_debug_recompile(brw, prog, &c->key);
3149 shader->compiled_once = true;
3150
3151 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3152 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3153 (get_time() - start_time) * 1000);
3154 }
3155 }
3156
3157 return generated;
3158 }
3159
3160 bool
3161 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3162 {
3163 struct brw_context *brw = brw_context(ctx);
3164 struct brw_wm_prog_key key;
3165
3166 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3167 return true;
3168
3169 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3170 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3171 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3172 bool program_uses_dfdy = fp->UsesDFdy;
3173
3174 memset(&key, 0, sizeof(key));
3175
3176 if (brw->gen < 6) {
3177 if (fp->UsesKill)
3178 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3179
3180 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3181 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3182
3183 /* Just assume depth testing. */
3184 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3185 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3186 }
3187
3188 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3189 BRW_FS_VARYING_INPUT_MASK) > 16)
3190 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3191
3192 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3193
3194 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3195 for (unsigned i = 0; i < sampler_count; i++) {
3196 if (fp->Base.ShadowSamplers & (1 << i)) {
3197 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3198 key.tex.swizzles[i] =
3199 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3200 } else {
3201 /* Color sampler: assume no swizzling. */
3202 key.tex.swizzles[i] = SWIZZLE_XYZW;
3203 }
3204 }
3205
3206 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3207 key.drawable_height = ctx->DrawBuffer->Height;
3208 }
3209
3210 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3211 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3212 }
3213
3214 key.nr_color_regions = 1;
3215
3216 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3217 * quality of the derivatives is likely to be determined by the driconf
3218 * option.
3219 */
3220 key.high_quality_derivatives = brw->disable_derivative_optimization;
3221
3222 key.program_string_id = bfp->id;
3223
3224 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3225 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3226
3227 bool success = do_wm_prog(brw, prog, bfp, &key);
3228
3229 brw->wm.base.prog_offset = old_prog_offset;
3230 brw->wm.prog_data = old_prog_data;
3231
3232 return success;
3233 }