i965/vec4: Only zero out unused message components when there are any.
[mesa.git] / src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/uniforms.h"
39 #include "main/fbobject.h"
40 #include "program/prog_parameter.h"
41 #include "program/prog_print.h"
42 #include "program/register_allocate.h"
43 #include "program/sampler.h"
44 #include "program/hash_table.h"
45 #include "brw_context.h"
46 #include "brw_eu.h"
47 #include "brw_wm.h"
48 }
49 #include "brw_fs.h"
50 #include "glsl/glsl_types.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63
64 /* This will be the case for almost all instructions. */
65 this->regs_written = 1;
66 }
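/* For illustration: plain ALU instructions keep regs_written == 1, while
 * message-style opcodes override it after construction, e.g. the varying
 * pull-constant load later in this file does roughly:
 *
 *    inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
 *    inst->regs_written = 4 * scale;
 */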
67
68 fs_inst::fs_inst()
69 {
70 init();
71 }
72
73 fs_inst::fs_inst(enum opcode opcode)
74 {
75 init();
76 this->opcode = opcode;
77 }
78
79 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
80 {
81 init();
82 this->opcode = opcode;
83 this->dst = dst;
84
85 if (dst.file == GRF)
86 assert(dst.reg_offset >= 0);
87 }
88
89 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
90 {
91 init();
92 this->opcode = opcode;
93 this->dst = dst;
94 this->src[0] = src0;
95
96 if (dst.file == GRF)
97 assert(dst.reg_offset >= 0);
98 if (src[0].file == GRF)
99 assert(src[0].reg_offset >= 0);
100 }
101
102 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
103 {
104 init();
105 this->opcode = opcode;
106 this->dst = dst;
107 this->src[0] = src0;
108 this->src[1] = src1;
109
110 if (dst.file == GRF)
111 assert(dst.reg_offset >= 0);
112 if (src[0].file == GRF)
113 assert(src[0].reg_offset >= 0);
114 if (src[1].file == GRF)
115 assert(src[1].reg_offset >= 0);
116 }
117
118 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
119 fs_reg src0, fs_reg src1, fs_reg src2)
120 {
121 init();
122 this->opcode = opcode;
123 this->dst = dst;
124 this->src[0] = src0;
125 this->src[1] = src1;
126 this->src[2] = src2;
127
128 if (dst.file == GRF)
129 assert(dst.reg_offset >= 0);
130 if (src[0].file == GRF)
131 assert(src[0].reg_offset >= 0);
132 if (src[1].file == GRF)
133 assert(src[1].reg_offset >= 0);
134 if (src[2].file == GRF)
135 assert(src[2].reg_offset >= 0);
136 }
137
138 #define ALU1(op) \
139 fs_inst * \
140 fs_visitor::op(fs_reg dst, fs_reg src0) \
141 { \
142 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
143 }
144
145 #define ALU2(op) \
146 fs_inst * \
147 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
148 { \
149 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
150 }
151
152 #define ALU3(op) \
153 fs_inst * \
154 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
155 { \
156 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
157 }
158
159 ALU1(NOT)
160 ALU1(MOV)
161 ALU1(FRC)
162 ALU1(RNDD)
163 ALU1(RNDE)
164 ALU1(RNDZ)
165 ALU2(ADD)
166 ALU2(MUL)
167 ALU2(MACH)
168 ALU2(AND)
169 ALU2(OR)
170 ALU2(XOR)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
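/* For illustration, ALU2(ADD) expands to the emitter helper:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */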
183
184 /** Gen4 predicated IF. */
185 fs_inst *
186 fs_visitor::IF(uint32_t predicate)
187 {
188 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
189 inst->predicate = predicate;
190 return inst;
191 }
192
193 /** Gen6+ IF with embedded comparison. */
194 fs_inst *
195 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
196 {
197 assert(brw->gen >= 6);
198 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
199 reg_null_d, src0, src1);
200 inst->conditional_mod = condition;
201 return inst;
202 }
203
204 /**
205 * CMP: Sets the low bit of the destination channels with the result
206 * of the comparison, while the upper bits are undefined, and updates
207 * the flag register with the packed 16 bits of the result.
208 */
209 fs_inst *
210 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
211 {
212 fs_inst *inst;
213
214 /* Take the instruction:
215 *
216 * CMP null<d> src0<f> src1<f>
217 *
218 * Original gen4 does type conversion to the destination type before
219 * comparison, producing garbage results for floating point comparisons.
220 * gen5 does the comparison on the execution type (resolved source types),
221 * so dst type doesn't matter. gen6 does comparison and then uses the
222 * result as if it was the dst type with no conversion, which happens to
223 * mostly work out for float-interpreted-as-int since our comparisons are
224 * for >0, =0, <0.
225 */
226 if (brw->gen == 4) {
227 dst.type = src0.type;
228 if (dst.file == HW_REG)
229 dst.fixed_hw_reg.type = dst.type;
230 }
231
232 resolve_ud_negate(&src0);
233 resolve_ud_negate(&src1);
234
235 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
236 inst->conditional_mod = condition;
237
238 return inst;
239 }
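/* A typical use, as in emit_frontfacing_interpolation() below, compares two
 * operands and then masks off the defined low bit of each channel:
 *
 *    emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
 *    emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
 */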
240
241 exec_list
242 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
243 fs_reg varying_offset,
244 uint32_t const_offset)
245 {
246 exec_list instructions;
247 fs_inst *inst;
248
249 /* We have our constant surface use a pitch of 4 bytes, so our index can
250 * be any component of a vector, and then we load 4 contiguous
251 * components starting from that.
252 *
253 * We break down the const_offset to a portion added to the variable
254 * offset and a portion done using reg_offset, which means that if you
255 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
256 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
257 * CSE can later notice that those loads are all the same and eliminate
258 * the redundant ones.
259 */
260 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
261 instructions.push_tail(ADD(vec4_offset,
262 varying_offset, const_offset & ~3));
263
264 int scale = 1;
265 if (brw->gen == 4 && dispatch_width == 8) {
266 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
267 * u, v, r) as parameters, or we can just use the SIMD16 message
268 * consisting of (header, u). We choose the second, at the cost of a
269 * longer return length.
270 */
271 scale = 2;
272 }
273
274 enum opcode op;
275 if (brw->gen >= 7)
276 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
277 else
278 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
279 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
280 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
281 inst->regs_written = 4 * scale;
282 instructions.push_tail(inst);
283
284 if (brw->gen < 7) {
285 inst->base_mrf = 13;
286 inst->header_present = true;
287 if (brw->gen == 4)
288 inst->mlen = 3;
289 else
290 inst->mlen = 1 + dispatch_width / 8;
291 }
292
293 vec4_result.reg_offset += (const_offset & 3) * scale;
294 instructions.push_tail(MOV(dst, vec4_result));
295
296 return instructions;
297 }
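/* Worked example of the const_offset split above: with const_offset == 6,
 * the ADD feeds vec4_offset = varying_offset + (6 & ~3) = varying_offset + 4,
 * and the trailing MOV reads component (6 & 3) = 2 of the returned vec4
 * (scaled by `scale` on gen4 SIMD8).
 */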
298
299 /**
300 * A helper for MOV generation for fixing up broken hardware SEND dependency
301 * handling.
302 */
303 fs_inst *
304 fs_visitor::DEP_RESOLVE_MOV(int grf)
305 {
306 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
307
308 inst->ir = NULL;
309 inst->annotation = "send dependency resolve";
310
311 /* The caller always wants uncompressed to emit the minimal extra
312 * dependencies, and to avoid having to deal with aligning its regs to 2.
313 */
314 inst->force_uncompressed = true;
315
316 return inst;
317 }
318
319 bool
320 fs_inst::equals(fs_inst *inst)
321 {
322 return (opcode == inst->opcode &&
323 dst.equals(inst->dst) &&
324 src[0].equals(inst->src[0]) &&
325 src[1].equals(inst->src[1]) &&
326 src[2].equals(inst->src[2]) &&
327 saturate == inst->saturate &&
328 predicate == inst->predicate &&
329 conditional_mod == inst->conditional_mod &&
330 mlen == inst->mlen &&
331 base_mrf == inst->base_mrf &&
332 sampler == inst->sampler &&
333 target == inst->target &&
334 eot == inst->eot &&
335 header_present == inst->header_present &&
336 shadow_compare == inst->shadow_compare &&
337 offset == inst->offset);
338 }
339
340 bool
341 fs_inst::overwrites_reg(const fs_reg &reg)
342 {
343 return (reg.file == dst.file &&
344 reg.reg == dst.reg &&
345 reg.reg_offset >= dst.reg_offset &&
346 reg.reg_offset < dst.reg_offset + regs_written);
347 }
348
349 bool
350 fs_inst::is_send_from_grf()
351 {
352 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
353 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
354 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
355 src[1].file == GRF));
356 }
357
358 bool
359 fs_visitor::can_do_source_mods(fs_inst *inst)
360 {
361 if (brw->gen == 6 && inst->is_math())
362 return false;
363
364 if (inst->is_send_from_grf())
365 return false;
366
367 return true;
368 }
369
370 void
371 fs_reg::init()
372 {
373 memset(this, 0, sizeof(*this));
374 this->smear = -1;
375 }
376
377 /** Generic unset register constructor. */
378 fs_reg::fs_reg()
379 {
380 init();
381 this->file = BAD_FILE;
382 }
383
384 /** Immediate value constructor. */
385 fs_reg::fs_reg(float f)
386 {
387 init();
388 this->file = IMM;
389 this->type = BRW_REGISTER_TYPE_F;
390 this->imm.f = f;
391 }
392
393 /** Immediate value constructor. */
394 fs_reg::fs_reg(int32_t i)
395 {
396 init();
397 this->file = IMM;
398 this->type = BRW_REGISTER_TYPE_D;
399 this->imm.i = i;
400 }
401
402 /** Immediate value constructor. */
403 fs_reg::fs_reg(uint32_t u)
404 {
405 init();
406 this->file = IMM;
407 this->type = BRW_REGISTER_TYPE_UD;
408 this->imm.u = u;
409 }
410
411 /** Fixed brw_reg Immediate value constructor. */
412 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
413 {
414 init();
415 this->file = HW_REG;
416 this->fixed_hw_reg = fixed_hw_reg;
417 this->type = fixed_hw_reg.type;
418 }
419
420 bool
421 fs_reg::equals(const fs_reg &r) const
422 {
423 return (file == r.file &&
424 reg == r.reg &&
425 reg_offset == r.reg_offset &&
426 type == r.type &&
427 negate == r.negate &&
428 abs == r.abs &&
429 !reladdr && !r.reladdr &&
430 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
431 sizeof(fixed_hw_reg)) == 0 &&
432 smear == r.smear &&
433 imm.u == r.imm.u);
434 }
435
436 bool
437 fs_reg::is_zero() const
438 {
439 if (file != IMM)
440 return false;
441
442 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
443 }
444
445 bool
446 fs_reg::is_one() const
447 {
448 if (file != IMM)
449 return false;
450
451 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
452 }
453
454 bool
455 fs_reg::is_valid_3src() const
456 {
457 return file == GRF || file == UNIFORM;
458 }
459
460 int
461 fs_visitor::type_size(const struct glsl_type *type)
462 {
463 unsigned int size, i;
464
465 switch (type->base_type) {
466 case GLSL_TYPE_UINT:
467 case GLSL_TYPE_INT:
468 case GLSL_TYPE_FLOAT:
469 case GLSL_TYPE_BOOL:
470 return type->components();
471 case GLSL_TYPE_ARRAY:
472 return type_size(type->fields.array) * type->length;
473 case GLSL_TYPE_STRUCT:
474 size = 0;
475 for (i = 0; i < type->length; i++) {
476 size += type_size(type->fields.structure[i].type);
477 }
478 return size;
479 case GLSL_TYPE_SAMPLER:
480 /* Samplers take up no register space, since they're baked in at
481 * link time.
482 */
483 return 0;
484 case GLSL_TYPE_VOID:
485 case GLSL_TYPE_ERROR:
486 case GLSL_TYPE_INTERFACE:
487 assert(!"not reached");
488 break;
489 }
490
491 return 0;
492 }
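/* For example, under the rules above: float -> 1, vec4 -> 4, mat3 -> 9,
 * "vec4 a[20]" -> 80, a struct is the sum of its members, and samplers -> 0.
 */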
493
494 fs_reg
495 fs_visitor::get_timestamp()
496 {
497 assert(brw->gen >= 7);
498
499 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
500 BRW_ARF_TIMESTAMP,
501 0),
502 BRW_REGISTER_TYPE_UD));
503
504 fs_reg dst = fs_reg(this, glsl_type::uint_type);
505
506 fs_inst *mov = emit(MOV(dst, ts));
507 /* We want to read the 3 fields we care about (mostly field 0, but also field 2)
508 * even if it's not enabled in the dispatch.
509 */
510 mov->force_writemask_all = true;
511 mov->force_uncompressed = true;
512
513 /* The caller wants the low 32 bits of the timestamp. Since it's running
514 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
515 * which is plenty of time for our purposes. It is identical across the
516 * EUs, but since it's tracking GPU core speed it will increment at a
517 * varying rate as render P-states change.
518 *
519 * The caller could also check if render P-states have changed (or anything
520 * else that might disrupt timing) by setting smear to 2 and checking if
521 * that field is != 0.
522 */
523 dst.smear = 0;
524
525 return dst;
526 }
527
528 void
529 fs_visitor::emit_shader_time_begin()
530 {
531 current_annotation = "shader time start";
532 shader_start_time = get_timestamp();
533 }
534
535 void
536 fs_visitor::emit_shader_time_end()
537 {
538 current_annotation = "shader time end";
539
540 enum shader_time_shader_type type, written_type, reset_type;
541 if (dispatch_width == 8) {
542 type = ST_FS8;
543 written_type = ST_FS8_WRITTEN;
544 reset_type = ST_FS8_RESET;
545 } else {
546 assert(dispatch_width == 16);
547 type = ST_FS16;
548 written_type = ST_FS16_WRITTEN;
549 reset_type = ST_FS16_RESET;
550 }
551
552 fs_reg shader_end_time = get_timestamp();
553
554 /* Check that there weren't any timestamp reset events (assuming these
555 * were the only two timestamp reads that happened).
556 */
557 fs_reg reset = shader_end_time;
558 reset.smear = 2;
559 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
560 test->conditional_mod = BRW_CONDITIONAL_Z;
561 emit(IF(BRW_PREDICATE_NORMAL));
562
563 push_force_uncompressed();
564 fs_reg start = shader_start_time;
565 start.negate = true;
566 fs_reg diff = fs_reg(this, glsl_type::uint_type);
567 emit(ADD(diff, start, shader_end_time));
568
569 /* If there were no instructions between the two timestamp gets, the diff
570 * is 2 cycles. Remove that overhead, so I can forget about that when
571 * trying to determine the time taken for single instructions.
572 */
573 emit(ADD(diff, diff, fs_reg(-2u)));
574
575 emit_shader_time_write(type, diff);
576 emit_shader_time_write(written_type, fs_reg(1u));
577 emit(BRW_OPCODE_ELSE);
578 emit_shader_time_write(reset_type, fs_reg(1u));
579 emit(BRW_OPCODE_ENDIF);
580
581 pop_force_uncompressed();
582 }
583
584 void
585 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
586 fs_reg value)
587 {
588 int shader_time_index =
589 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
590 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
591
592 fs_reg payload;
593 if (dispatch_width == 8)
594 payload = fs_reg(this, glsl_type::uvec2_type);
595 else
596 payload = fs_reg(this, glsl_type::uint_type);
597
598 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
599 fs_reg(), payload, offset, value));
600 }
601
602 void
603 fs_visitor::fail(const char *format, ...)
604 {
605 va_list va;
606 char *msg;
607
608 if (failed)
609 return;
610
611 failed = true;
612
613 va_start(va, format);
614 msg = ralloc_vasprintf(mem_ctx, format, va);
615 va_end(va);
616 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
617
618 this->fail_msg = msg;
619
620 if (INTEL_DEBUG & DEBUG_WM) {
621 fprintf(stderr, "%s", msg);
622 }
623 }
624
625 fs_inst *
626 fs_visitor::emit(enum opcode opcode)
627 {
628 return emit(fs_inst(opcode));
629 }
630
631 fs_inst *
632 fs_visitor::emit(enum opcode opcode, fs_reg dst)
633 {
634 return emit(fs_inst(opcode, dst));
635 }
636
637 fs_inst *
638 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
639 {
640 return emit(fs_inst(opcode, dst, src0));
641 }
642
643 fs_inst *
644 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
645 {
646 return emit(fs_inst(opcode, dst, src0, src1));
647 }
648
649 fs_inst *
650 fs_visitor::emit(enum opcode opcode, fs_reg dst,
651 fs_reg src0, fs_reg src1, fs_reg src2)
652 {
653 return emit(fs_inst(opcode, dst, src0, src1, src2));
654 }
655
656 void
657 fs_visitor::push_force_uncompressed()
658 {
659 force_uncompressed_stack++;
660 }
661
662 void
663 fs_visitor::pop_force_uncompressed()
664 {
665 force_uncompressed_stack--;
666 assert(force_uncompressed_stack >= 0);
667 }
668
669 void
670 fs_visitor::push_force_sechalf()
671 {
672 force_sechalf_stack++;
673 }
674
675 void
676 fs_visitor::pop_force_sechalf()
677 {
678 force_sechalf_stack--;
679 assert(force_sechalf_stack >= 0);
680 }
681
682 /**
683 * Returns true if the instruction has a flag that means it won't
684 * update an entire destination register.
685 *
686 * For example, dead code elimination and live variable analysis want to know
687 * when a write to a variable screens off any preceding values that were in
688 * it.
689 */
690 bool
691 fs_inst::is_partial_write()
692 {
693 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
694 this->force_uncompressed ||
695 this->force_sechalf);
696 }
697
698 /**
699 * Returns how many MRFs an FS opcode will write over.
700 *
701 * Note that this is not the 0 or 1 implied writes in an actual gen
702 * instruction -- the FS opcodes often generate MOVs in addition.
703 */
704 int
705 fs_visitor::implied_mrf_writes(fs_inst *inst)
706 {
707 if (inst->mlen == 0)
708 return 0;
709
710 switch (inst->opcode) {
711 case SHADER_OPCODE_RCP:
712 case SHADER_OPCODE_RSQ:
713 case SHADER_OPCODE_SQRT:
714 case SHADER_OPCODE_EXP2:
715 case SHADER_OPCODE_LOG2:
716 case SHADER_OPCODE_SIN:
717 case SHADER_OPCODE_COS:
718 return 1 * dispatch_width / 8;
719 case SHADER_OPCODE_POW:
720 case SHADER_OPCODE_INT_QUOTIENT:
721 case SHADER_OPCODE_INT_REMAINDER:
722 return 2 * dispatch_width / 8;
723 case SHADER_OPCODE_TEX:
724 case FS_OPCODE_TXB:
725 case SHADER_OPCODE_TXD:
726 case SHADER_OPCODE_TXF:
727 case SHADER_OPCODE_TXF_MS:
728 case SHADER_OPCODE_TXL:
729 case SHADER_OPCODE_TXS:
730 case SHADER_OPCODE_LOD:
731 return 1;
732 case FS_OPCODE_FB_WRITE:
733 return 2;
734 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
735 case FS_OPCODE_UNSPILL:
736 return 1;
737 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
738 return inst->mlen;
739 case FS_OPCODE_SPILL:
740 return 2;
741 default:
742 assert(!"not reached");
743 return inst->mlen;
744 }
745 }
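/* For example: SHADER_OPCODE_POW in SIMD16 (dispatch_width == 16) implies
 * 2 * 16 / 8 = 4 MRF writes, versus 2 in SIMD8.
 */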
746
747 int
748 fs_visitor::virtual_grf_alloc(int size)
749 {
750 if (virtual_grf_array_size <= virtual_grf_count) {
751 if (virtual_grf_array_size == 0)
752 virtual_grf_array_size = 16;
753 else
754 virtual_grf_array_size *= 2;
755 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
756 virtual_grf_array_size);
757 }
758 virtual_grf_sizes[virtual_grf_count] = size;
759 return virtual_grf_count++;
760 }
761
762 /** Fixed HW reg constructor. */
763 fs_reg::fs_reg(enum register_file file, int reg)
764 {
765 init();
766 this->file = file;
767 this->reg = reg;
768 this->type = BRW_REGISTER_TYPE_F;
769 }
770
771 /** Fixed HW reg constructor. */
772 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
773 {
774 init();
775 this->file = file;
776 this->reg = reg;
777 this->type = type;
778 }
779
780 /** Automatic reg constructor. */
781 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
782 {
783 init();
784
785 this->file = GRF;
786 this->reg = v->virtual_grf_alloc(v->type_size(type));
787 this->reg_offset = 0;
788 this->type = brw_type_for_base_type(type);
789 }
790
791 fs_reg *
792 fs_visitor::variable_storage(ir_variable *var)
793 {
794 return (fs_reg *)hash_table_find(this->variable_ht, var);
795 }
796
797 void
798 import_uniforms_callback(const void *key,
799 void *data,
800 void *closure)
801 {
802 struct hash_table *dst_ht = (struct hash_table *)closure;
803 const fs_reg *reg = (const fs_reg *)data;
804
805 if (reg->file != UNIFORM)
806 return;
807
808 hash_table_insert(dst_ht, data, key);
809 }
810
811 /* For 16-wide, we need to follow the uniform setup already done for the
812 * 8-wide dispatch. This brings in those uniform definitions.
813 */
814 void
815 fs_visitor::import_uniforms(fs_visitor *v)
816 {
817 hash_table_call_foreach(v->variable_ht,
818 import_uniforms_callback,
819 variable_ht);
820 this->params_remap = v->params_remap;
821 this->nr_params_remap = v->nr_params_remap;
822 }
823
824 /* Our support for uniforms is piggy-backed on the struct
825 * gl_fragment_program, because that's where the values actually
826 * get stored, rather than in some global gl_shader_program uniform
827 * store.
828 */
829 void
830 fs_visitor::setup_uniform_values(ir_variable *ir)
831 {
832 int namelen = strlen(ir->name);
833
834 /* The data for our (non-builtin) uniforms is stored in a series of
835 * gl_uniform_driver_storage structs for each subcomponent that
836 * glGetUniformLocation() could name. We know it's been set up in the same
837 * order we'd walk the type, so walk the list of storage and find anything
838 * with our name, or the prefix of a component that starts with our name.
839 */
840 unsigned params_before = c->prog_data.nr_params;
841 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
842 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
843
844 if (strncmp(ir->name, storage->name, namelen) != 0 ||
845 (storage->name[namelen] != 0 &&
846 storage->name[namelen] != '.' &&
847 storage->name[namelen] != '[')) {
848 continue;
849 }
850
851 unsigned slots = storage->type->component_slots();
852 if (storage->array_elements)
853 slots *= storage->array_elements;
854
855 for (unsigned i = 0; i < slots; i++) {
856 c->prog_data.param[c->prog_data.nr_params++] =
857 &storage->storage[i].f;
858 }
859 }
860
861 /* Make sure we actually initialized the right amount of stuff here. */
862 assert(params_before + ir->type->component_slots() ==
863 c->prog_data.nr_params);
864 (void)params_before;
865 }
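/* Sketch of the matching above, for a hypothetical "uniform struct { vec4 a;
 * float b; } s;": the storage entries "s.a" and "s.b" both match the prefix
 * "s" followed by '.', appending 4 + 1 = 5 param slots, which equals
 * ir->type->component_slots() as the assert requires.
 */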
866
867
868 /* Our support for builtin uniforms is even scarier than non-builtin.
869 * It sits on top of the PROG_STATE_VAR parameters that are
870 * automatically updated from GL context state.
871 */
872 void
873 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
874 {
875 const ir_state_slot *const slots = ir->state_slots;
876 assert(ir->state_slots != NULL);
877
878 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
879 /* This state reference has already been setup by ir_to_mesa, but we'll
880 * get the same index back here.
881 */
882 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
883 (gl_state_index *)slots[i].tokens);
884
885 /* Add each of the unique swizzles of the element as a parameter.
886 * This'll end up matching the expected layout of the
887 * array/matrix/structure we're trying to fill in.
888 */
889 int last_swiz = -1;
890 for (unsigned int j = 0; j < 4; j++) {
891 int swiz = GET_SWZ(slots[i].swizzle, j);
892 if (swiz == last_swiz)
893 break;
894 last_swiz = swiz;
895
896 c->prog_data.param[c->prog_data.nr_params++] =
897 &fp->Base.Parameters->ParameterValues[index][swiz].f;
898 }
899 }
900 }
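/* Example of the unique-swizzle walk above: a state vec4 swizzled
 * (x, y, z, w) adds four params, one per channel, while a scalar value
 * replicated as (x, x, x, x) stops after one because swiz == last_swiz.
 */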
901
902 fs_reg *
903 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
904 {
905 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
906 fs_reg wpos = *reg;
907 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
908
909 /* gl_FragCoord.x */
910 if (ir->pixel_center_integer) {
911 emit(MOV(wpos, this->pixel_x));
912 } else {
913 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
914 }
915 wpos.reg_offset++;
916
917 /* gl_FragCoord.y */
918 if (!flip && ir->pixel_center_integer) {
919 emit(MOV(wpos, this->pixel_y));
920 } else {
921 fs_reg pixel_y = this->pixel_y;
922 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
923
924 if (flip) {
925 pixel_y.negate = true;
926 offset += c->key.drawable_height - 1.0;
927 }
928
929 emit(ADD(wpos, pixel_y, fs_reg(offset)));
930 }
931 wpos.reg_offset++;
932
933 /* gl_FragCoord.z */
934 if (brw->gen >= 6) {
935 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
936 } else {
937 emit(FS_OPCODE_LINTERP, wpos,
938 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
939 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
940 interp_reg(VARYING_SLOT_POS, 2));
941 }
942 wpos.reg_offset++;
943
944 /* gl_FragCoord.w: Already set up in emit_interpolation */
945 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
946
947 return reg;
948 }
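/* The flipped-Y path above folds the window flip and pixel center into one
 * ADD:
 *
 *    wpos.y = -pixel_y + (drawable_height - 1.0 + center)
 *
 * where center is 0.0 for integer pixel centers and 0.5 otherwise.
 */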
949
950 fs_inst *
951 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
952 glsl_interp_qualifier interpolation_mode,
953 bool is_centroid)
954 {
955 brw_wm_barycentric_interp_mode barycoord_mode;
956 if (brw->gen >= 6) {
957 if (is_centroid) {
958 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
959 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
960 else
961 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
962 } else {
963 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
964 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
965 else
966 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
967 }
968 } else {
969 /* On Ironlake and below, there is only one interpolation mode.
970 * Centroid interpolation doesn't mean anything on this hardware --
971 * there is no multisampling.
972 */
973 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
974 }
975 return emit(FS_OPCODE_LINTERP, attr,
976 this->delta_x[barycoord_mode],
977 this->delta_y[barycoord_mode], interp);
978 }
979
980 fs_reg *
981 fs_visitor::emit_general_interpolation(ir_variable *ir)
982 {
983 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
984 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
985 fs_reg attr = *reg;
986
987 unsigned int array_elements;
988 const glsl_type *type;
989
990 if (ir->type->is_array()) {
991 array_elements = ir->type->length;
992 if (array_elements == 0) {
993 fail("dereferenced array '%s' has length 0\n", ir->name);
994 }
995 type = ir->type->fields.array;
996 } else {
997 array_elements = 1;
998 type = ir->type;
999 }
1000
1001 glsl_interp_qualifier interpolation_mode =
1002 ir->determine_interpolation_mode(c->key.flat_shade);
1003
1004 int location = ir->location;
1005 for (unsigned int i = 0; i < array_elements; i++) {
1006 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1007 if (urb_setup[location] == -1) {
1008 /* If there's no incoming setup data for this slot, don't
1009 * emit interpolation for it.
1010 */
1011 attr.reg_offset += type->vector_elements;
1012 location++;
1013 continue;
1014 }
1015
1016 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1017 /* Constant interpolation (flat shading) case. The SF has
1018 * handed us defined values in only the constant offset
1019 * field of the setup reg.
1020 */
1021 for (unsigned int k = 0; k < type->vector_elements; k++) {
1022 struct brw_reg interp = interp_reg(location, k);
1023 interp = suboffset(interp, 3);
1024 interp.type = reg->type;
1025 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1026 attr.reg_offset++;
1027 }
1028 } else {
1029 /* Smooth/noperspective interpolation case. */
1030 for (unsigned int k = 0; k < type->vector_elements; k++) {
1031 /* FINISHME: At some point we probably want to push
1032 * this farther by giving similar treatment to the
1033 * other potentially constant components of the
1034 * attribute, as well as making brw_vs_constval.c
1035 * handle varyings other than gl_TexCoord.
1036 */
1037 struct brw_reg interp = interp_reg(location, k);
1038 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1039 ir->centroid);
1040 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1041 /* Get the pixel/sample mask into f0 so that we know
1042 * which pixels are lit. Then, for each channel that is
1043 * unlit, replace the centroid data with non-centroid
1044 * data.
1045 */
1046 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1047 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1048 interpolation_mode, false);
1049 inst->predicate = BRW_PREDICATE_NORMAL;
1050 inst->predicate_inverse = true;
1051 }
1052 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1053 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1054 }
1055 attr.reg_offset++;
1056 }
1057
1058 }
1059 location++;
1060 }
1061 }
1062
1063 return reg;
1064 }
1065
1066 fs_reg *
1067 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1068 {
1069 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1070
1071 /* The frontfacing comes in as a bit in the thread payload. */
1072 if (brw->gen >= 6) {
1073 emit(BRW_OPCODE_ASR, *reg,
1074 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1075 fs_reg(15));
1076 emit(BRW_OPCODE_NOT, *reg, *reg);
1077 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1078 } else {
1079 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1080 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1081 * us front face
1082 */
1083 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1084 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1085 }
1086
1087 return reg;
1088 }
1089
1090 fs_reg
1091 fs_visitor::fix_math_operand(fs_reg src)
1092 {
1093 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1094 * might be able to do better by doing execsize = 1 math and then
1095 * expanding that result out, but we would need to be careful with
1096 * masking.
1097 *
1098 * The hardware ignores source modifiers (negate and abs) on math
1099 * instructions, so we also move to a temp to set those up.
1100 */
1101 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1102 !src.abs && !src.negate)
1103 return src;
1104
1105 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1106 * operands to math instructions.
1107 */
1108 if (brw->gen >= 7 && src.file != IMM)
1109 return src;
1110
1111 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1112 expanded.type = src.type;
1113 emit(BRW_OPCODE_MOV, expanded, src);
1114 return expanded;
1115 }
1116
1117 fs_inst *
1118 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1119 {
1120 switch (opcode) {
1121 case SHADER_OPCODE_RCP:
1122 case SHADER_OPCODE_RSQ:
1123 case SHADER_OPCODE_SQRT:
1124 case SHADER_OPCODE_EXP2:
1125 case SHADER_OPCODE_LOG2:
1126 case SHADER_OPCODE_SIN:
1127 case SHADER_OPCODE_COS:
1128 break;
1129 default:
1130 assert(!"not reached: bad math opcode");
1131 return NULL;
1132 }
1133
1134 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1135 * might be able to do better by doing execsize = 1 math and then
1136 * expanding that result out, but we would need to be careful with
1137 * masking.
1138 *
1139 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1140 * instructions, so we also move to a temp to set those up.
1141 */
1142 if (brw->gen >= 6)
1143 src = fix_math_operand(src);
1144
1145 fs_inst *inst = emit(opcode, dst, src);
1146
1147 if (brw->gen < 6) {
1148 inst->base_mrf = 2;
1149 inst->mlen = dispatch_width / 8;
1150 }
1151
1152 return inst;
1153 }
1154
1155 fs_inst *
1156 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1157 {
1158 int base_mrf = 2;
1159 fs_inst *inst;
1160
1161 switch (opcode) {
1162 case SHADER_OPCODE_INT_QUOTIENT:
1163 case SHADER_OPCODE_INT_REMAINDER:
1164 if (brw->gen >= 7 && dispatch_width == 16)
1165 fail("16-wide INTDIV unsupported\n");
1166 break;
1167 case SHADER_OPCODE_POW:
1168 break;
1169 default:
1170 assert(!"not reached: unsupported binary math opcode.");
1171 return NULL;
1172 }
1173
1174 if (brw->gen >= 6) {
1175 src0 = fix_math_operand(src0);
1176 src1 = fix_math_operand(src1);
1177
1178 inst = emit(opcode, dst, src0, src1);
1179 } else {
1180 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1181 * "Message Payload":
1182 *
1183 * "Operand0[7]. For the INT DIV functions, this operand is the
1184 * denominator."
1185 * ...
1186 * "Operand1[7]. For the INT DIV functions, this operand is the
1187 * numerator."
1188 */
1189 bool is_int_div = opcode != SHADER_OPCODE_POW;
1190 fs_reg &op0 = is_int_div ? src1 : src0;
1191 fs_reg &op1 = is_int_div ? src0 : src1;
1192
1193 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1194 inst = emit(opcode, dst, op0, reg_null_f);
1195
1196 inst->base_mrf = base_mrf;
1197 inst->mlen = 2 * dispatch_width / 8;
1198 }
1199 return inst;
1200 }
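/* On the pre-gen6 path above, an integer division a / b therefore sends the
 * denominator as operand 0 and the numerator through the MRF: op0 = b,
 * op1 = a; POW keeps its natural src0, src1 order.
 */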
1201
1202 void
1203 fs_visitor::assign_curb_setup()
1204 {
1205 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1206 if (dispatch_width == 8) {
1207 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1208 } else {
1209 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1210 }
1211
1212 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1213 foreach_list(node, &this->instructions) {
1214 fs_inst *inst = (fs_inst *)node;
1215
1216 for (unsigned int i = 0; i < 3; i++) {
1217 if (inst->src[i].file == UNIFORM) {
1218 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1219 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1220 constant_nr / 8,
1221 constant_nr % 8);
1222
1223 inst->src[i].file = HW_REG;
1224 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1225 }
1226 }
1227 }
1228 }
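/* For illustration, assuming nr_payload_regs == 2: UNIFORM slot 11 maps to
 * brw_vec1_grf(2 + 11 / 8, 11 % 8), i.e. the vec1 at g3.3 in the pushed
 * CURB space.
 */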
1229
1230 void
1231 fs_visitor::calculate_urb_setup()
1232 {
1233 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1234 urb_setup[i] = -1;
1235 }
1236
1237 int urb_next = 0;
1238 /* Figure out where each of the incoming setup attributes lands. */
1239 if (brw->gen >= 6) {
1240 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1241 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1242 urb_setup[i] = urb_next++;
1243 }
1244 }
1245 } else {
1246 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1247 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1248 /* Point size is packed into the header, not as a general attribute */
1249 if (i == VARYING_SLOT_PSIZ)
1250 continue;
1251
1252 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1253 /* The back color slot is skipped when the front color is
1254 * also written to. In addition, some slots can be
1255 * written in the vertex shader and not read in the
1256 * fragment shader. So the register number must always be
1257 * incremented, mapped or not.
1258 */
1259 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1260 urb_setup[i] = urb_next;
1261 urb_next++;
1262 }
1263 }
1264
1265 /*
1266 * It's an FS-only attribute, and we did interpolation for this attribute
1267 * in the SF thread. So, count it here, too.
1268 *
1269 * See compile_sf_prog() for more info.
1270 */
1271 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1272 urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1273 }
1274
1275 /* Each attribute is 4 setup channels, each of which is half a reg. */
1276 c->prog_data.urb_read_length = urb_next * 2;
1277 }
1278
1279 void
1280 fs_visitor::assign_urb_setup()
1281 {
1282 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1283
1284 /* Offset all the urb_setup[] index by the actual position of the
1285 * setup regs, now that the location of the constants has been chosen.
1286 */
1287 foreach_list(node, &this->instructions) {
1288 fs_inst *inst = (fs_inst *)node;
1289
1290 if (inst->opcode == FS_OPCODE_LINTERP) {
1291 assert(inst->src[2].file == HW_REG);
1292 inst->src[2].fixed_hw_reg.nr += urb_start;
1293 }
1294
1295 if (inst->opcode == FS_OPCODE_CINTERP) {
1296 assert(inst->src[0].file == HW_REG);
1297 inst->src[0].fixed_hw_reg.nr += urb_start;
1298 }
1299 }
1300
1301 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1302 }
1303
1304 /**
1305 * Split large virtual GRFs into separate components if we can.
1306 *
1307 * This is mostly duplicated with what brw_fs_vector_splitting does,
1308 * but that's really conservative because it's afraid of doing
1309 * splitting that doesn't result in real progress after the rest of
1310 * the optimization phases, which would cause infinite looping in
1311 * optimization. We can do it once here, safely. This also has the
1312 * opportunity to split interpolated values, or maybe even uniforms,
1313 * which we don't have at the IR level.
1314 *
1315 * We want to split, because virtual GRFs are what we register
1316 * allocate and spill (due to contiguousness requirements for some
1317 * instructions), and they're what we naturally generate in the
1318 * codegen process, but most virtual GRFs don't actually need to be
1319 * contiguous sets of GRFs. If we split, we'll end up with reduced
1320 * live intervals and better dead code elimination and coalescing.
1321 */
1322 void
1323 fs_visitor::split_virtual_grfs()
1324 {
1325 int num_vars = this->virtual_grf_count;
1326 bool split_grf[num_vars];
1327 int new_virtual_grf[num_vars];
1328
1329 /* Try to split anything larger than one register. */
1330 for (int i = 0; i < num_vars; i++) {
1331 if (this->virtual_grf_sizes[i] != 1)
1332 split_grf[i] = true;
1333 else
1334 split_grf[i] = false;
1335 }
1336
1337 if (brw->has_pln &&
1338 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1339 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1340 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1341 * Gen6, that was the only supported interpolation mode, and since Gen6,
1342 * delta_x and delta_y are in fixed hardware registers.
1343 */
1344 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1345 false;
1346 }
1347
1348 foreach_list(node, &this->instructions) {
1349 fs_inst *inst = (fs_inst *)node;
1350
1351 /* If there's a SEND message that requires contiguous destination
1352 * registers, no splitting is allowed.
1353 */
1354 if (inst->regs_written > 1) {
1355 split_grf[inst->dst.reg] = false;
1356 }
1357
1358 /* If we're sending from a GRF, don't split it, on the assumption that
1359 * the send is reading the whole thing.
1360 */
1361 if (inst->is_send_from_grf()) {
1362 for (int i = 0; i < 3; i++) {
1363 if (inst->src[i].file == GRF) {
1364 split_grf[inst->src[i].reg] = false;
1365 }
1366 }
1367 }
1368 }
1369
1370 /* Allocate new space for split regs. Note that the virtual
1371 * numbers will be contiguous.
1372 */
1373 for (int i = 0; i < num_vars; i++) {
1374 if (split_grf[i]) {
1375 new_virtual_grf[i] = virtual_grf_alloc(1);
1376 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1377 int reg = virtual_grf_alloc(1);
1378 assert(reg == new_virtual_grf[i] + j - 1);
1379 (void) reg;
1380 }
1381 this->virtual_grf_sizes[i] = 1;
1382 }
1383 }
1384
1385 foreach_list(node, &this->instructions) {
1386 fs_inst *inst = (fs_inst *)node;
1387
1388 if (inst->dst.file == GRF &&
1389 split_grf[inst->dst.reg] &&
1390 inst->dst.reg_offset != 0) {
1391 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1392 inst->dst.reg_offset - 1);
1393 inst->dst.reg_offset = 0;
1394 }
1395 for (int i = 0; i < 3; i++) {
1396 if (inst->src[i].file == GRF &&
1397 split_grf[inst->src[i].reg] &&
1398 inst->src[i].reg_offset != 0) {
1399 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1400 inst->src[i].reg_offset - 1);
1401 inst->src[i].reg_offset = 0;
1402 }
1403 }
1404 }
1405 this->live_intervals_valid = false;
1406 }
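/* Sketch of the remapping above for a size-4 VGRF v: reg_offset 0 stays at
 * v (whose size is trimmed to 1), while reg_offsets 1..3 move to the newly
 * allocated registers new_virtual_grf[v] + 0..2, each of size 1.
 */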
1407
1408 /**
1409 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1410 *
1411 * During code generation, we create tons of temporary variables, many of
1412 * which get immediately killed and are never used again. Yet, in later
1413 * optimization and analysis passes, such as compute_live_intervals, we need
1414 * to loop over all the virtual GRFs. Compacting them can save a lot of
1415 * overhead.
1416 */
1417 void
1418 fs_visitor::compact_virtual_grfs()
1419 {
1420 /* Mark which virtual GRFs are used, and count how many. */
1421 int remap_table[this->virtual_grf_count];
1422 memset(remap_table, -1, sizeof(remap_table));
1423
1424 foreach_list(node, &this->instructions) {
1425 const fs_inst *inst = (const fs_inst *) node;
1426
1427 if (inst->dst.file == GRF)
1428 remap_table[inst->dst.reg] = 0;
1429
1430 for (int i = 0; i < 3; i++) {
1431 if (inst->src[i].file == GRF)
1432 remap_table[inst->src[i].reg] = 0;
1433 }
1434 }
1435
1436 /* In addition to registers used in instructions, fs_visitor keeps
1437 * direct references to certain special values which must be patched:
1438 */
1439 fs_reg *special[] = {
1440 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1441 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1442 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1443 &delta_x[0], &delta_x[1], &delta_x[2],
1444 &delta_x[3], &delta_x[4], &delta_x[5],
1445 &delta_y[0], &delta_y[1], &delta_y[2],
1446 &delta_y[3], &delta_y[4], &delta_y[5],
1447 };
1448 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1449 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1450
1451 /* Treat all special values as used, to be conservative */
1452 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1453 if (special[i]->file == GRF)
1454 remap_table[special[i]->reg] = 0;
1455 }
1456
1457 /* Compact the GRF arrays. */
1458 int new_index = 0;
1459 for (int i = 0; i < this->virtual_grf_count; i++) {
1460 if (remap_table[i] != -1) {
1461 remap_table[i] = new_index;
1462 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1463 if (live_intervals_valid) {
1464 virtual_grf_start[new_index] = virtual_grf_start[i];
1465 virtual_grf_end[new_index] = virtual_grf_end[i];
1466 }
1467 ++new_index;
1468 }
1469 }
1470
1471 this->virtual_grf_count = new_index;
1472
1473 /* Patch all the instructions to use the newly renumbered registers */
1474 foreach_list(node, &this->instructions) {
1475 fs_inst *inst = (fs_inst *) node;
1476
1477 if (inst->dst.file == GRF)
1478 inst->dst.reg = remap_table[inst->dst.reg];
1479
1480 for (int i = 0; i < 3; i++) {
1481 if (inst->src[i].file == GRF)
1482 inst->src[i].reg = remap_table[inst->src[i].reg];
1483 }
1484 }
1485
1486 /* Patch all the references to special values */
1487 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1488 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1489 special[i]->reg = remap_table[special[i]->reg];
1490 }
1491 }
1492
1493 bool
1494 fs_visitor::remove_dead_constants()
1495 {
1496 if (dispatch_width == 8) {
1497 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1498 this->nr_params_remap = c->prog_data.nr_params;
1499
1500 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1501 this->params_remap[i] = -1;
1502
1503 /* Find which params are still in use. */
1504 foreach_list(node, &this->instructions) {
1505 fs_inst *inst = (fs_inst *)node;
1506
1507 for (int i = 0; i < 3; i++) {
1508 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1509
1510 if (inst->src[i].file != UNIFORM)
1511 continue;
1512
1513 /* Section 5.11 of the OpenGL 4.3 spec says:
1514 *
1515 * "Out-of-bounds reads return undefined values, which include
1516 * values from other variables of the active program or zero."
1517 */
1518 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1519 constant_nr = 0;
1520 }
1521
1522 /* For now, set this to non-negative. We'll give it the
1523 * actual new number in a moment, in order to keep the
1524 * register numbers nicely ordered.
1525 */
1526 this->params_remap[constant_nr] = 0;
1527 }
1528 }
1529
1530 /* Figure out what the new numbers for the params will be. At some
1531 * point when we're doing uniform array access, we're going to want
1532 * to keep the distinction between .reg and .reg_offset, but for
1533 * now we don't care.
1534 */
1535 unsigned int new_nr_params = 0;
1536 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1537 if (this->params_remap[i] != -1) {
1538 this->params_remap[i] = new_nr_params++;
1539 }
1540 }
1541
1542 /* Update the list of params to be uploaded to match our new numbering. */
1543 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1544 int remapped = this->params_remap[i];
1545
1546 if (remapped == -1)
1547 continue;
1548
1549 c->prog_data.param[remapped] = c->prog_data.param[i];
1550 }
1551
1552 c->prog_data.nr_params = new_nr_params;
1553 } else {
1554 /* This should have been generated in the 8-wide pass already. */
1555 assert(this->params_remap);
1556 }
1557
1558 /* Now do the renumbering of the shader to remove unused params. */
1559 foreach_list(node, &this->instructions) {
1560 fs_inst *inst = (fs_inst *)node;
1561
1562 for (int i = 0; i < 3; i++) {
1563 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1564
1565 if (inst->src[i].file != UNIFORM)
1566 continue;
1567
1568 /* As above, alias out-of-bounds accesses to constant 0. */
1569 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1570 constant_nr = 0;
1571 }
1572 assert(this->params_remap[constant_nr] != -1);
1573 inst->src[i].reg = this->params_remap[constant_nr];
1574 inst->src[i].reg_offset = 0;
1575 }
1576 }
1577
1578 return true;
1579 }
1580
1581 /*
1582 * Implements array access of uniforms by inserting a
1583 * PULL_CONSTANT_LOAD instruction.
1584 *
1585 * Unlike temporary GRF array access (where we don't support it due to
1586 * the difficulty of doing relative addressing on instruction
1587 * destinations), we could potentially do array access of uniforms
1588 * that were loaded in GRF space as push constants. In real-world
1589 * usage we've seen, though, the arrays being used are always larger
1590 * than we could load as push constants, so just always move all
1591 * uniform array access out to a pull constant buffer.
1592 */
1593 void
1594 fs_visitor::move_uniform_array_access_to_pull_constants()
1595 {
1596 int pull_constant_loc[c->prog_data.nr_params];
1597
1598 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1599 pull_constant_loc[i] = -1;
1600 }
1601
1602 /* Walk through and find array access of uniforms. Put a copy of that
1603 * uniform in the pull constant buffer.
1604 *
1605 * Note that we don't move constant-indexed accesses to arrays. No
1606 * testing has been done of the performance impact of this choice.
1607 */
1608 foreach_list_safe(node, &this->instructions) {
1609 fs_inst *inst = (fs_inst *)node;
1610
1611 for (int i = 0 ; i < 3; i++) {
1612 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1613 continue;
1614
1615 int uniform = inst->src[i].reg;
1616
1617 /* If this array isn't already present in the pull constant buffer,
1618 * add it.
1619 */
1620 if (pull_constant_loc[uniform] == -1) {
1621 const float **values = &c->prog_data.param[uniform];
1622
1623 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1624
1625 assert(param_size[uniform]);
1626
1627 for (int j = 0; j < param_size[uniform]; j++) {
1628 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1629 values[j];
1630 }
1631 }
1632
1633 /* Set up the annotation tracking for new generated instructions. */
1634 base_ir = inst->ir;
1635 current_annotation = inst->annotation;
1636
1637 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1638 fs_reg temp = fs_reg(this, glsl_type::float_type);
1639 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1640 surf_index,
1641 *inst->src[i].reladdr,
1642 pull_constant_loc[uniform] +
1643 inst->src[i].reg_offset);
1644 inst->insert_before(&list);
1645
1646 inst->src[i].file = temp.file;
1647 inst->src[i].reg = temp.reg;
1648 inst->src[i].reg_offset = temp.reg_offset;
1649 inst->src[i].reladdr = NULL;
1650 }
1651 }
1652 }
1653
1654 /**
1655 * Choose accesses from the UNIFORM file to demote to using the pull
1656 * constant buffer.
1657 *
1658 * We allow a fragment shader to have more than the specified minimum
1659 * maximum number of fragment shader uniform components (64). If
1660 * there are too many of these, they'd fill up all of register space.
1661 * So, this will push some of them out to the pull constant buffer and
1662 * update the program to load them.
1663 */
1664 void
1665 fs_visitor::setup_pull_constants()
1666 {
1667 /* Only allow 16 registers (128 uniform components) as push constants. */
1668 unsigned int max_uniform_components = 16 * 8;
1669 if (c->prog_data.nr_params <= max_uniform_components)
1670 return;
1671
1672 if (dispatch_width == 16) {
1673 fail("Pull constants not supported in 16-wide\n");
1674 return;
1675 }
1676
1677 /* Just demote the end of the list. We could probably do better
1678 * here, demoting things that are rarely used in the program first.
1679 */
1680 unsigned int pull_uniform_base = max_uniform_components;
1681
1682 int pull_constant_loc[c->prog_data.nr_params];
1683 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1684 if (i < pull_uniform_base) {
1685 pull_constant_loc[i] = -1;
1686 } else {
1687 pull_constant_loc[i] = -1;
1688 /* If our constant is already being uploaded for reladdr purposes,
1689 * reuse it.
1690 */
1691 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1692 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1693 pull_constant_loc[i] = j;
1694 break;
1695 }
1696 }
1697 if (pull_constant_loc[i] == -1) {
1698 int pull_index = c->prog_data.nr_pull_params++;
1699 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1700 pull_constant_loc[i] = pull_index;
1701 }
1702 }
1703 }
1704 c->prog_data.nr_params = pull_uniform_base;
1705
1706 foreach_list(node, &this->instructions) {
1707 fs_inst *inst = (fs_inst *)node;
1708
1709 for (int i = 0; i < 3; i++) {
1710 if (inst->src[i].file != UNIFORM)
1711 continue;
1712
1713 int pull_index = pull_constant_loc[inst->src[i].reg +
1714 inst->src[i].reg_offset];
1715 if (pull_index == -1)
1716 continue;
1717
1718 assert(!inst->src[i].reladdr);
1719
1720 fs_reg dst = fs_reg(this, glsl_type::float_type);
1721 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1722 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1723 fs_inst *pull =
1724 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1725 dst, index, offset);
1726 pull->ir = inst->ir;
1727 pull->annotation = inst->annotation;
1728
1729 inst->insert_before(pull);
1730
1731 inst->src[i].file = GRF;
1732 inst->src[i].reg = dst.reg;
1733 inst->src[i].reg_offset = 0;
1734 inst->src[i].smear = pull_index & 3;
1735 }
1736 }
1737 }
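/* Worked example of the demotion above: pull_index == 6 yields the
 * 16-byte-aligned offset (6 * 4) & ~15 == 16, and smear == 6 & 3 == 2 then
 * selects the third dword of the loaded block.
 */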
1738
1739 bool
1740 fs_visitor::opt_algebraic()
1741 {
1742 bool progress = false;
1743
1744 foreach_list(node, &this->instructions) {
1745 fs_inst *inst = (fs_inst *)node;
1746
1747 switch (inst->opcode) {
1748 case BRW_OPCODE_MUL:
1749 if (inst->src[1].file != IMM)
1750 continue;
1751
1752 /* a * 1.0 = a */
1753 if (inst->src[1].is_one()) {
1754 inst->opcode = BRW_OPCODE_MOV;
1755 inst->src[1] = reg_undef;
1756 progress = true;
1757 break;
1758 }
1759
1760 /* a * 0.0 = 0.0 */
1761 if (inst->src[1].is_zero()) {
1762 inst->opcode = BRW_OPCODE_MOV;
1763 inst->src[0] = inst->src[1];
1764 inst->src[1] = reg_undef;
1765 progress = true;
1766 break;
1767 }
1768
1769 break;
1770 case BRW_OPCODE_ADD:
1771 if (inst->src[1].file != IMM)
1772 continue;
1773
1774 /* a + 0.0 = a */
1775 if (inst->src[1].is_zero()) {
1776 inst->opcode = BRW_OPCODE_MOV;
1777 inst->src[1] = reg_undef;
1778 progress = true;
1779 break;
1780 }
1781 break;
1782 default:
1783 break;
1784 }
1785 }
1786
1787 return progress;
1788 }
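/* The rewrites above, in instruction form:
 *
 *    MUL dst, a, 1.0f   ->   MOV dst, a
 *    MUL dst, a, 0.0f   ->   MOV dst, 0.0f
 *    ADD dst, a, 0.0f   ->   MOV dst, a
 */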
1789
1790 /**
1791 * Removes any instructions writing a VGRF where that VGRF is not used by any
1792 * later instruction.
1793 */
1794 bool
1795 fs_visitor::dead_code_eliminate()
1796 {
1797 bool progress = false;
1798 int pc = 0;
1799
1800 calculate_live_intervals();
1801
1802 foreach_list_safe(node, &this->instructions) {
1803 fs_inst *inst = (fs_inst *)node;
1804
1805 if (inst->dst.file == GRF) {
1806 assert(this->virtual_grf_end[inst->dst.reg] >= pc);
1807 if (this->virtual_grf_end[inst->dst.reg] == pc) {
1808 inst->remove();
1809 progress = true;
1810 }
1811 }
1812
1813 pc++;
1814 }
1815
1816 if (progress)
1817 live_intervals_valid = false;
1818
1819 return progress;
1820 }
1821
1822 struct dead_code_hash_key
1823 {
1824 int vgrf;
1825 int reg_offset;
1826 };
1827
1828 static bool
1829 dead_code_hash_compare(const void *a, const void *b)
1830 {
1831 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1832 }
1833
1834 static void
1835 clear_dead_code_hash(struct hash_table *ht)
1836 {
1837 struct hash_entry *entry;
1838
1839 hash_table_foreach(ht, entry) {
1840 _mesa_hash_table_remove(ht, entry);
1841 }
1842 }
1843
1844 static void
1845 insert_dead_code_hash(struct hash_table *ht,
1846 int vgrf, int reg_offset, fs_inst *inst)
1847 {
1848 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1849 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1850
1851 key->vgrf = vgrf;
1852 key->reg_offset = reg_offset;
1853
1854 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1855 }
1856
1857 static struct hash_entry *
1858 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1859 {
1860 struct dead_code_hash_key key;
1861
1862 key.vgrf = vgrf;
1863 key.reg_offset = reg_offset;
1864
1865 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1866 }
1867
1868 static void
1869 remove_dead_code_hash(struct hash_table *ht,
1870 int vgrf, int reg_offset)
1871 {
1872 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1873 if (!entry)
1874 return;
1875
1876 _mesa_hash_table_remove(ht, entry);
1877 }
1878
1879 /**
1880 * Walks basic blocks, removing any regs that are written but not read before
1881 * being redefined.
1882 *
1883 * The dead_code_eliminate() function implements a global dead code
1884 * elimination, but it only handles removing the last write to a register
1885 * if it's never read. This one can handle intermediate writes, but only
1886 * within a basic block.
1887 */
1888 bool
1889 fs_visitor::dead_code_eliminate_local()
1890 {
1891 struct hash_table *ht;
1892 bool progress = false;
1893
1894 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1895
1896 foreach_list_safe(node, &this->instructions) {
1897 fs_inst *inst = (fs_inst *)node;
1898
1899 /* At a basic block, empty the HT since we don't understand dataflow
1900 * here.
1901 */
1902 if (inst->is_control_flow()) {
1903 clear_dead_code_hash(ht);
1904 continue;
1905 }
1906
1907 /* Clear the HT of any instructions that got read. */
1908 for (int i = 0; i < 3; i++) {
1909 fs_reg src = inst->src[i];
1910 if (src.file != GRF)
1911 continue;
1912
1913 int read = 1;
1914 if (inst->is_send_from_grf())
1915 read = virtual_grf_sizes[src.reg] - src.reg_offset;
1916
1917 for (int reg_offset = src.reg_offset;
1918 reg_offset < src.reg_offset + read;
1919 reg_offset++) {
1920 remove_dead_code_hash(ht, src.reg, reg_offset);
1921 }
1922 }
1923
1924 /* Add any update of a GRF to the HT, removing a previous write if it
1925 * wasn't read.
1926 */
1927 if (inst->dst.file == GRF) {
1928 if (inst->regs_written > 1) {
1929 /* We don't know how to trim channels from an instruction's
1930 * writes, so we can't incrementally remove unread channels from
1931 * it. Just remove whatever it overwrites from the table.
1932 */
1933 for (int i = 0; i < inst->regs_written; i++) {
1934 remove_dead_code_hash(ht,
1935 inst->dst.reg,
1936 inst->dst.reg_offset + i);
1937 }
1938 } else {
1939 struct hash_entry *entry =
1940 get_dead_code_hash_entry(ht, inst->dst.reg,
1941 inst->dst.reg_offset);
1942
1943 if (inst->is_partial_write()) {
1944 /* For a partial write, we can't remove any previous dead code
1945 * candidate, since we're just modifying its result, but we can
1946 * be dead code eliminated ourselves.
1947 */
1948 if (entry) {
1949 entry->data = inst;
1950 } else {
1951 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1952 inst);
1953 }
1954 } else {
1955 if (entry) {
1956 /* We're completely updating a channel, and there was a
1957 * previous write to the channel that wasn't read. Kill it!
1958 */
1959                fs_inst *dead_inst = (fs_inst *)entry->data;
1960                dead_inst->remove();
1961 progress = true;
1962 _mesa_hash_table_remove(ht, entry);
1963 }
1964
1965 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1966 inst);
1967 }
1968 }
1969 }
1970 }
1971
1972 _mesa_hash_table_destroy(ht, NULL);
1973
1974 if (progress)
1975 live_intervals_valid = false;
1976
1977 return progress;
1978 }
1979
1980 /**
1981  * Implements a second type of register coalescing: this one checks if
1982  * the two regs involved in a raw move don't interfere, in which case
1983  * they can both be stored in the same place and the MOV removed.
1984 */
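/* A sketch of the case this handles (hypothetical IR, register names
 * invented for illustration):
 *
 *    add vgrf3, vgrf1, vgrf2
 *    mov vgrf4, vgrf3
 *
 * If vgrf3 and vgrf4 don't interfere, every reference to vgrf3 is rewritten
 * to vgrf4 and the MOV is removed.
 */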
1985 bool
1986 fs_visitor::register_coalesce_2()
1987 {
1988 bool progress = false;
1989
1990 calculate_live_intervals();
1991
1992 foreach_list_safe(node, &this->instructions) {
1993 fs_inst *inst = (fs_inst *)node;
1994
1995 if (inst->opcode != BRW_OPCODE_MOV ||
1996 inst->is_partial_write() ||
1997 inst->saturate ||
1998 inst->src[0].file != GRF ||
1999 inst->src[0].negate ||
2000 inst->src[0].abs ||
2001 inst->src[0].smear != -1 ||
2002 inst->dst.file != GRF ||
2003 inst->dst.type != inst->src[0].type ||
2004 virtual_grf_sizes[inst->src[0].reg] != 1 ||
2005 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
2006 continue;
2007 }
2008
2009 int reg_from = inst->src[0].reg;
2010 assert(inst->src[0].reg_offset == 0);
2011 int reg_to = inst->dst.reg;
2012 int reg_to_offset = inst->dst.reg_offset;
2013
2014 foreach_list(node, &this->instructions) {
2015 fs_inst *scan_inst = (fs_inst *)node;
2016
2017 if (scan_inst->dst.file == GRF &&
2018 scan_inst->dst.reg == reg_from) {
2019 scan_inst->dst.reg = reg_to;
2020 scan_inst->dst.reg_offset = reg_to_offset;
2021 }
2022 for (int i = 0; i < 3; i++) {
2023 if (scan_inst->src[i].file == GRF &&
2024 scan_inst->src[i].reg == reg_from) {
2025 scan_inst->src[i].reg = reg_to;
2026 scan_inst->src[i].reg_offset = reg_to_offset;
2027 }
2028 }
2029 }
2030
2031 inst->remove();
2032
2033 /* We don't need to recalculate live intervals inside the loop despite
2034 * flagging live_intervals_valid because we only use live intervals for
2035 * the interferes test, and we must have had a situation where the
2036 * intervals were:
2037 *
2038     *  from  to
2039     *   ^
2040     *   |
2041     *   v
2042     *         ^
2043     *         |
2044     *         v
2045 *
2046 * Some register R that might get coalesced with one of these two could
2047 * only be referencing "to", otherwise "from"'s range would have been
2048 * longer. R's range could also only start at the end of "to" or later,
2049 * otherwise it will conflict with "to" when we try to coalesce "to"
2050     * into R anyway.
2051 */
2052 live_intervals_valid = false;
2053
2054 progress = true;
2055 continue;
2056 }
2057
2058 return progress;
2059 }
2060
2061 bool
2062 fs_visitor::register_coalesce()
2063 {
2064 bool progress = false;
2065 int if_depth = 0;
2066 int loop_depth = 0;
2067
2068 foreach_list_safe(node, &this->instructions) {
2069 fs_inst *inst = (fs_inst *)node;
2070
2071       /* Make sure that we dominate the instructions we're going to
2072        * scan for interference with our coalescing, or we won't have
2073        * scanned far enough to see everything that could interfere.
2074        * We don't dominate the following instructions if we're in a
2075        * loop or an if block.
2076 */
2077 switch (inst->opcode) {
2078 case BRW_OPCODE_DO:
2079 loop_depth++;
2080 break;
2081 case BRW_OPCODE_WHILE:
2082 loop_depth--;
2083 break;
2084 case BRW_OPCODE_IF:
2085 if_depth++;
2086 break;
2087 case BRW_OPCODE_ENDIF:
2088 if_depth--;
2089 break;
2090 default:
2091 break;
2092 }
2093 if (loop_depth || if_depth)
2094 continue;
2095
2096 if (inst->opcode != BRW_OPCODE_MOV ||
2097 inst->is_partial_write() ||
2098 inst->saturate ||
2099 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2100                                   inst->src[0].file != UNIFORM) ||
2101 inst->dst.type != inst->src[0].type)
2102 continue;
2103
2104 bool has_source_modifiers = (inst->src[0].abs ||
2105 inst->src[0].negate ||
2106 inst->src[0].smear != -1 ||
2107 inst->src[0].file == UNIFORM);
2108
2109 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2110 * them: check for no writes to either one until the exit of the
2111 * program.
2112 */
2113 bool interfered = false;
2114
2115 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2116 !scan_inst->is_tail_sentinel();
2117 scan_inst = (fs_inst *)scan_inst->next) {
2118 if (scan_inst->dst.file == GRF) {
2119 if (scan_inst->overwrites_reg(inst->dst) ||
2120 scan_inst->overwrites_reg(inst->src[0])) {
2121 interfered = true;
2122 break;
2123 }
2124 }
2125
2126 if (has_source_modifiers) {
2127 for (int i = 0; i < 3; i++) {
2128 if (scan_inst->src[i].file == GRF &&
2129 scan_inst->src[i].reg == inst->dst.reg &&
2130 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2131 inst->dst.type != scan_inst->src[i].type)
2132 {
2133 interfered = true;
2134 break;
2135 }
2136 }
2137 }
2138
2140 /* The gen6 MATH instruction can't handle source modifiers or
2141 * unusual register regions, so avoid coalescing those for
2142 * now. We should do something more specific.
2143 */
2144 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2145 interfered = true;
2146 break;
2147 }
2148
2149 /* The accumulator result appears to get used for the
2150 * conditional modifier generation. When negating a UD
2151 * value, there is a 33rd bit generated for the sign in the
2152        * accumulator value, so you can no longer check, for example,
2153 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2154 */
2155 if (scan_inst->conditional_mod &&
2156 inst->src[0].negate &&
2157 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2158 interfered = true;
2159 break;
2160 }
2161 }
2162 if (interfered) {
2163 continue;
2164 }
2165
2166 /* Rewrite the later usage to point at the source of the move to
2167 * be removed.
2168 */
2169 for (fs_inst *scan_inst = inst;
2170 !scan_inst->is_tail_sentinel();
2171 scan_inst = (fs_inst *)scan_inst->next) {
2172 for (int i = 0; i < 3; i++) {
2173 if (scan_inst->src[i].file == GRF &&
2174 scan_inst->src[i].reg == inst->dst.reg &&
2175 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2176 fs_reg new_src = inst->src[0];
2177 if (scan_inst->src[i].abs) {
2178 new_src.negate = 0;
2179 new_src.abs = 1;
2180 }
2181 new_src.negate ^= scan_inst->src[i].negate;
2182 scan_inst->src[i] = new_src;
2183 }
2184 }
2185 }
2186
2187 inst->remove();
2188 progress = true;
2189 }
2190
2191 if (progress)
2192 live_intervals_valid = false;
2193
2194 return progress;
2195 }
2196
2197
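/**
 * Rewrites the instruction computing a GRF value so that it writes directly
 * into the MRF that a following raw MOV was copying the value to, then
 * removes the MOV.
 *
 * A sketch of the transformation (hypothetical IR, register names invented
 * for illustration):
 *
 *    add vgrf3, vgrf1, vgrf2        add m4, vgrf1, vgrf2
 *    mov m4, vgrf3            =>    (MOV removed)
 */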
2198 bool
2199 fs_visitor::compute_to_mrf()
2200 {
2201 bool progress = false;
2202 int next_ip = 0;
2203
2204 calculate_live_intervals();
2205
2206 foreach_list_safe(node, &this->instructions) {
2207 fs_inst *inst = (fs_inst *)node;
2208
2209 int ip = next_ip;
2210 next_ip++;
2211
2212 if (inst->opcode != BRW_OPCODE_MOV ||
2213 inst->is_partial_write() ||
2214 inst->dst.file != MRF || inst->src[0].file != GRF ||
2215 inst->dst.type != inst->src[0].type ||
2216 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2217 continue;
2218
2219 /* Work out which hardware MRF registers are written by this
2220 * instruction.
2221 */
2222 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2223 int mrf_high;
2224 if (inst->dst.reg & BRW_MRF_COMPR4) {
2225 mrf_high = mrf_low + 4;
2226 } else if (dispatch_width == 16 &&
2227 (!inst->force_uncompressed && !inst->force_sechalf)) {
2228 mrf_high = mrf_low + 1;
2229 } else {
2230 mrf_high = mrf_low;
2231 }
2232
2233 /* Can't compute-to-MRF this GRF if someone else was going to
2234 * read it later.
2235 */
2236 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2237 continue;
2238
2239 /* Found a move of a GRF to a MRF. Let's see if we can go
2240 * rewrite the thing that made this GRF to write into the MRF.
2241 */
2242 fs_inst *scan_inst;
2243 for (scan_inst = (fs_inst *)inst->prev;
2244 scan_inst->prev != NULL;
2245 scan_inst = (fs_inst *)scan_inst->prev) {
2246 if (scan_inst->dst.file == GRF &&
2247 scan_inst->dst.reg == inst->src[0].reg) {
2248 /* Found the last thing to write our reg we want to turn
2249 * into a compute-to-MRF.
2250 */
2251
2252 /* If this one instruction didn't populate all the
2253 * channels, bail. We might be able to rewrite everything
2254 * that writes that reg, but it would require smarter
2255 * tracking to delay the rewriting until complete success.
2256 */
2257 if (scan_inst->is_partial_write())
2258 break;
2259
2260 /* Things returning more than one register would need us to
2261 * understand coalescing out more than one MOV at a time.
2262 */
2263 if (scan_inst->regs_written > 1)
2264 break;
2265
2266 /* SEND instructions can't have MRF as a destination. */
2267 if (scan_inst->mlen)
2268 break;
2269
2270 if (brw->gen == 6) {
2271 /* gen6 math instructions must have the destination be
2272 * GRF, so no compute-to-MRF for them.
2273 */
2274 if (scan_inst->is_math()) {
2275 break;
2276 }
2277 }
2278
2279 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2280 /* Found the creator of our MRF's source value. */
2281 scan_inst->dst.file = MRF;
2282 scan_inst->dst.reg = inst->dst.reg;
2283 scan_inst->saturate |= inst->saturate;
2284 inst->remove();
2285 progress = true;
2286 }
2287 break;
2288 }
2289
2290 /* We don't handle control flow here. Most computation of
2291        * values that end up in MRFs happens shortly before the MRF
2292 * write anyway.
2293 */
2294 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2295 break;
2296
2297 /* You can't read from an MRF, so if someone else reads our
2298 * MRF's source GRF that we wanted to rewrite, that stops us.
2299 */
2300 bool interfered = false;
2301 for (int i = 0; i < 3; i++) {
2302 if (scan_inst->src[i].file == GRF &&
2303 scan_inst->src[i].reg == inst->src[0].reg &&
2304 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2305 interfered = true;
2306 }
2307 }
2308 if (interfered)
2309 break;
2310
2311 if (scan_inst->dst.file == MRF) {
2312 /* If somebody else writes our MRF here, we can't
2313 * compute-to-MRF before that.
2314 */
2315 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2316 int scan_mrf_high;
2317
2318 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2319 scan_mrf_high = scan_mrf_low + 4;
2320 } else if (dispatch_width == 16 &&
2321 (!scan_inst->force_uncompressed &&
2322 !scan_inst->force_sechalf)) {
2323 scan_mrf_high = scan_mrf_low + 1;
2324 } else {
2325 scan_mrf_high = scan_mrf_low;
2326 }
2327
2328 if (mrf_low == scan_mrf_low ||
2329 mrf_low == scan_mrf_high ||
2330 mrf_high == scan_mrf_low ||
2331 mrf_high == scan_mrf_high) {
2332 break;
2333 }
2334 }
2335
2336 if (scan_inst->mlen > 0) {
2337 /* Found a SEND instruction, which means that there are
2338 * live values in MRFs from base_mrf to base_mrf +
2339 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2340 * above it.
2341 */
2342 if (mrf_low >= scan_inst->base_mrf &&
2343 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2344 break;
2345 }
2346 if (mrf_high >= scan_inst->base_mrf &&
2347 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2348 break;
2349 }
2350 }
2351 }
2352 }
2353
2354 if (progress)
2355 live_intervals_valid = false;
2356
2357 return progress;
2358 }
2359
2360 /**
2361 * Walks through basic blocks, looking for repeated MRF writes and
2362 * removing the later ones.
2363 */
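/* A sketch of the case this handles (hypothetical IR, for illustration):
 *
 *    mov m2, vgrf3
 *    ...                  <- nothing writes m2 or vgrf3 in between
 *    mov m2, vgrf3        <- identical later write; removed
 */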
2364 bool
2365 fs_visitor::remove_duplicate_mrf_writes()
2366 {
2367 fs_inst *last_mrf_move[16];
2368 bool progress = false;
2369
2370    /* The MRF tracking below doesn't handle compressed (16-wide) instructions, so bail. */
2371 if (dispatch_width == 16)
2372 return false;
2373
2374 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2375
2376 foreach_list_safe(node, &this->instructions) {
2377 fs_inst *inst = (fs_inst *)node;
2378
2379 if (inst->is_control_flow()) {
2380 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2381 }
2382
2383 if (inst->opcode == BRW_OPCODE_MOV &&
2384 inst->dst.file == MRF) {
2385 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2386 if (prev_inst && inst->equals(prev_inst)) {
2387 inst->remove();
2388 progress = true;
2389 continue;
2390 }
2391 }
2392
2393 /* Clear out the last-write records for MRFs that were overwritten. */
2394 if (inst->dst.file == MRF) {
2395 last_mrf_move[inst->dst.reg] = NULL;
2396 }
2397
2398 if (inst->mlen > 0) {
2399 /* Found a SEND instruction, which will include two or fewer
2400 * implied MRF writes. We could do better here.
2401 */
2402 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2403 last_mrf_move[inst->base_mrf + i] = NULL;
2404 }
2405 }
2406
2407 /* Clear out any MRF move records whose sources got overwritten. */
2408 if (inst->dst.file == GRF) {
2409 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2410 if (last_mrf_move[i] &&
2411 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2412 last_mrf_move[i] = NULL;
2413 }
2414 }
2415 }
2416
2417 if (inst->opcode == BRW_OPCODE_MOV &&
2418 inst->dst.file == MRF &&
2419 inst->src[0].file == GRF &&
2420 !inst->is_partial_write()) {
2421 last_mrf_move[inst->dst.reg] = inst;
2422 }
2423 }
2424
2425 if (progress)
2426 live_intervals_valid = false;
2427
2428 return progress;
2429 }
2430
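/**
 * Helper for the gen4 SEND dependency workarounds below: for each GRF in
 * [first_grf, first_grf + grf_len) that @inst reads, clears the
 * corresponding needs-dep flag, since a read resolves the outstanding
 * dependency.
 */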
2431 static void
2432 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2433 int first_grf, int grf_len)
2434 {
2435 bool inst_16wide = (dispatch_width > 8 &&
2436 !inst->force_uncompressed &&
2437 !inst->force_sechalf);
2438
2439 /* Clear the flag for registers that actually got read (as expected). */
2440 for (int i = 0; i < 3; i++) {
2441 int grf;
2442 if (inst->src[i].file == GRF) {
2443 grf = inst->src[i].reg;
2444 } else if (inst->src[i].file == HW_REG &&
2445 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2446 grf = inst->src[i].fixed_hw_reg.nr;
2447 } else {
2448 continue;
2449 }
2450
2451 if (grf >= first_grf &&
2452 grf < first_grf + grf_len) {
2453 deps[grf - first_grf] = false;
2454 if (inst_16wide)
2455 deps[grf - first_grf + 1] = false;
2456 }
2457 }
2458 }
2459
2460 /**
2461 * Implements this workaround for the original 965:
2462 *
2463 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2464 * check for post destination dependencies on this instruction, software
2465 * must ensure that there is no destination hazard for the case of ‘write
2466 * followed by a posted write’ shown in the following example.
2467 *
2468 * 1. mov r3 0
2469 * 2. send r3.xy <rest of send instruction>
2470 * 3. mov r2 r3
2471 *
2472 * Due to no post-destination dependency check on the ‘send’, the above
2473 * code sequence could have two instructions (1 and 2) in flight at the
2474  *    same time that both consider ‘r3’ as the target of their final writes."
2475 */
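/* The fix, sketched: before emitting such an instruction, walk backwards
 * over prior writes to its destination registers and insert a
 * dependency-resolving MOV (DEP_RESOLVE_MOV) for any register that hasn't
 * been read since it was written, so the earlier write is forced to
 * complete first.
 */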
2476 void
2477 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2478 {
2479 int reg_size = dispatch_width / 8;
2480 int write_len = inst->regs_written * reg_size;
2481 int first_write_grf = inst->dst.reg;
2482 bool needs_dep[BRW_MAX_MRF];
2483 assert(write_len < (int)sizeof(needs_dep) - 1);
2484
2485 memset(needs_dep, false, sizeof(needs_dep));
2486 memset(needs_dep, true, write_len);
2487
2488 clear_deps_for_inst_src(inst, dispatch_width,
2489 needs_dep, first_write_grf, write_len);
2490
2491 /* Walk backwards looking for writes to registers we're writing which
2492 * aren't read since being written. If we hit the start of the program,
2493 * we assume that there are no outstanding dependencies on entry to the
2494 * program.
2495 */
2496 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2497 scan_inst != NULL;
2498 scan_inst = (fs_inst *)scan_inst->prev) {
2499
2500 /* If we hit control flow, assume that there *are* outstanding
2501 * dependencies, and force their cleanup before our instruction.
2502 */
2503 if (scan_inst->is_control_flow()) {
2504 for (int i = 0; i < write_len; i++) {
2505 if (needs_dep[i]) {
2506 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2507 }
2508 }
2509 return;
2510 }
2511
2512 bool scan_inst_16wide = (dispatch_width > 8 &&
2513 !scan_inst->force_uncompressed &&
2514 !scan_inst->force_sechalf);
2515
2516 /* We insert our reads as late as possible on the assumption that any
2517 * instruction but a MOV that might have left us an outstanding
2518 * dependency has more latency than a MOV.
2519 */
2520 if (scan_inst->dst.file == GRF) {
2521 for (int i = 0; i < scan_inst->regs_written; i++) {
2522 int reg = scan_inst->dst.reg + i * reg_size;
2523
2524 if (reg >= first_write_grf &&
2525 reg < first_write_grf + write_len &&
2526 needs_dep[reg - first_write_grf]) {
2527 inst->insert_before(DEP_RESOLVE_MOV(reg));
2528 needs_dep[reg - first_write_grf] = false;
2529 if (scan_inst_16wide)
2530 needs_dep[reg - first_write_grf + 1] = false;
2531 }
2532 }
2533 }
2534
2535 /* Clear the flag for registers that actually got read (as expected). */
2536 clear_deps_for_inst_src(scan_inst, dispatch_width,
2537 needs_dep, first_write_grf, write_len);
2538
2539 /* Continue the loop only if we haven't resolved all the dependencies */
2540 int i;
2541 for (i = 0; i < write_len; i++) {
2542 if (needs_dep[i])
2543 break;
2544 }
2545 if (i == write_len)
2546 return;
2547 }
2548 }
2549
2550 /**
2551 * Implements this workaround for the original 965:
2552 *
2553 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2554 * used as a destination register until after it has been sourced by an
2555  *    instruction with a different destination register."
2556 */
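/* The fix, sketched (register numbers invented for illustration):
 *
 *    send r3 <rest of send instruction>
 *    mov  rX, r3      <- inserted DEP_RESOLVE_MOV sourcing r3
 *    mov  r3, r2      <- only now is r3 safe to redefine
 */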
2557 void
2558 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2559 {
2560 int write_len = inst->regs_written * dispatch_width / 8;
2561 int first_write_grf = inst->dst.reg;
2562 bool needs_dep[BRW_MAX_MRF];
2563 assert(write_len < (int)sizeof(needs_dep) - 1);
2564
2565 memset(needs_dep, false, sizeof(needs_dep));
2566 memset(needs_dep, true, write_len);
2567 /* Walk forwards looking for writes to registers we're writing which aren't
2568 * read before being written.
2569 */
2570 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2571 !scan_inst->is_tail_sentinel();
2572 scan_inst = (fs_inst *)scan_inst->next) {
2573 /* If we hit control flow, force resolve all remaining dependencies. */
2574 if (scan_inst->is_control_flow()) {
2575 for (int i = 0; i < write_len; i++) {
2576 if (needs_dep[i])
2577 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2578 }
2579 return;
2580 }
2581
2582 /* Clear the flag for registers that actually got read (as expected). */
2583 clear_deps_for_inst_src(scan_inst, dispatch_width,
2584 needs_dep, first_write_grf, write_len);
2585
2586 /* We insert our reads as late as possible since they're reading the
2587 * result of a SEND, which has massive latency.
2588 */
2589 if (scan_inst->dst.file == GRF &&
2590 scan_inst->dst.reg >= first_write_grf &&
2591 scan_inst->dst.reg < first_write_grf + write_len &&
2592 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2593 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2594 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2595 }
2596
2597 /* Continue the loop only if we haven't resolved all the dependencies */
2598 int i;
2599 for (i = 0; i < write_len; i++) {
2600 if (needs_dep[i])
2601 break;
2602 }
2603 if (i == write_len)
2604 return;
2605 }
2606
2607 /* If we hit the end of the program, resolve all remaining dependencies out
2608 * of paranoia.
2609 */
2610 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2611 assert(last_inst->eot);
2612 for (int i = 0; i < write_len; i++) {
2613 if (needs_dep[i])
2614 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2615 }
2616 }
2617
2618 void
2619 fs_visitor::insert_gen4_send_dependency_workarounds()
2620 {
2621 if (brw->gen != 4 || brw->is_g4x)
2622 return;
2623
2624 /* Note that we're done with register allocation, so GRF fs_regs always
2625 * have a .reg_offset of 0.
2626 */
2627
2628 foreach_list_safe(node, &this->instructions) {
2629 fs_inst *inst = (fs_inst *)node;
2630
2631 if (inst->mlen != 0 && inst->dst.file == GRF) {
2632 insert_gen4_pre_send_dependency_workarounds(inst);
2633 insert_gen4_post_send_dependency_workarounds(inst);
2634 }
2635 }
2636 }
2637
2638 /**
2639 * Turns the generic expression-style uniform pull constant load instruction
2640 * into a hardware-specific series of instructions for loading a pull
2641 * constant.
2642 *
2643 * The expression style allows the CSE pass before this to optimize out
2644 * repeated loads from the same offset, and gives the pre-register-allocation
2645 * scheduling full flexibility, while the conversion to native instructions
2646 * allows the post-register-allocation scheduler the best information
2647 * possible.
2648 *
2649 * Note that execution masking for setting up pull constant loads is special:
2650 * the channels that need to be written are unrelated to the current execution
2651 * mask, since a later instruction will use one of the result channels as a
2652 * source operand for all 8 or 16 of its channels.
2653 */
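/* A sketch of the gen7 lowering performed below (opcodes from this file,
 * operand layout simplified for illustration):
 *
 *    vgrf4 = UNIFORM_PULL_CONSTANT_LOAD surf_index, byte_offset
 * becomes
 *    payload = SET_SIMD4X2_OFFSET byte_offset / 4
 *    vgrf4   = UNIFORM_PULL_CONSTANT_LOAD_GEN7 surf_index, payload
 */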
2654 void
2655 fs_visitor::lower_uniform_pull_constant_loads()
2656 {
2657 foreach_list(node, &this->instructions) {
2658 fs_inst *inst = (fs_inst *)node;
2659
2660 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2661 continue;
2662
2663 if (brw->gen >= 7) {
2664 /* The offset arg before was a vec4-aligned byte offset. We need to
2665 * turn it into a dword offset.
2666 */
2667 fs_reg const_offset_reg = inst->src[1];
2668 assert(const_offset_reg.file == IMM &&
2669 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2670 const_offset_reg.imm.u /= 4;
2671 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2672
2673 /* This is actually going to be a MOV, but since only the first dword
2674 * is accessed, we have a special opcode to do just that one. Note
2675 * that this needs to be an operation that will be considered a def
2676 * by live variable analysis, or register allocation will explode.
2677 */
2678 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2679 payload, const_offset_reg);
2680 setup->force_writemask_all = true;
2681
2682 setup->ir = inst->ir;
2683 setup->annotation = inst->annotation;
2684 inst->insert_before(setup);
2685
2686 /* Similarly, this will only populate the first 4 channels of the
2687 * result register (since we only use smear values from 0-3), but we
2688 * don't tell the optimizer.
2689 */
2690 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2691 inst->src[1] = payload;
2692
2693 this->live_intervals_valid = false;
2694 } else {
2695 /* Before register allocation, we didn't tell the scheduler about the
2696 * MRF we use. We know it's safe to use this MRF because nothing
2697 * else does except for register spill/unspill, which generates and
2698 * uses its MRF within a single IR instruction.
2699 */
2700 inst->base_mrf = 14;
2701 inst->mlen = 1;
2702 }
2703 }
2704 }
2705
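/**
 * Prints one IR instruction in a human-readable form for debugging:
 * predicate, opcode with modifiers, the destination, then each source.
 */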
2706 void
2707 fs_visitor::dump_instruction(backend_instruction *be_inst)
2708 {
2709 fs_inst *inst = (fs_inst *)be_inst;
2710
2711 if (inst->predicate) {
2712 printf("(%cf0.%d) ",
2713 inst->predicate_inverse ? '-' : '+',
2714 inst->flag_subreg);
2715 }
2716
2717 printf("%s", brw_instruction_name(inst->opcode));
2718 if (inst->saturate)
2719 printf(".sat");
2720 if (inst->conditional_mod) {
2721 printf(".cmod");
2722 if (!inst->predicate &&
2723 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2724 inst->opcode != BRW_OPCODE_IF &&
2725 inst->opcode != BRW_OPCODE_WHILE))) {
2726 printf(".f0.%d", inst->flag_subreg);
2727 }
2728 }
2729 printf(" ");
2730
2731
2732 switch (inst->dst.file) {
2733 case GRF:
2734 printf("vgrf%d", inst->dst.reg);
2735 if (inst->dst.reg_offset)
2736 printf("+%d", inst->dst.reg_offset);
2737 break;
2738 case MRF:
2739 printf("m%d", inst->dst.reg);
2740 break;
2741 case BAD_FILE:
2742 printf("(null)");
2743 break;
2744 case UNIFORM:
2745 printf("***u%d***", inst->dst.reg);
2746 break;
2747 case ARF:
2748 if (inst->dst.reg == BRW_ARF_NULL)
2749 printf("(null)");
2750 else
2751 printf("arf%d", inst->dst.reg);
2752 break;
2753 default:
2754 printf("???");
2755 break;
2756 }
2757 printf(", ");
2758
2759 for (int i = 0; i < 3; i++) {
2760 if (inst->src[i].negate)
2761 printf("-");
2762 if (inst->src[i].abs)
2763 printf("|");
2764 switch (inst->src[i].file) {
2765 case GRF:
2766 printf("vgrf%d", inst->src[i].reg);
2767 if (inst->src[i].reg_offset)
2768 printf("+%d", inst->src[i].reg_offset);
2769 break;
2770 case MRF:
2771 printf("***m%d***", inst->src[i].reg);
2772 break;
2773 case UNIFORM:
2774 printf("u%d", inst->src[i].reg);
2775 if (inst->src[i].reg_offset)
2776 printf(".%d", inst->src[i].reg_offset);
2777 break;
2778 case BAD_FILE:
2779 printf("(null)");
2780 break;
2781 case IMM:
2782 switch (inst->src[i].type) {
2783 case BRW_REGISTER_TYPE_F:
2784 printf("%ff", inst->src[i].imm.f);
2785 break;
2786 case BRW_REGISTER_TYPE_D:
2787 printf("%dd", inst->src[i].imm.i);
2788 break;
2789 case BRW_REGISTER_TYPE_UD:
2790 printf("%uu", inst->src[i].imm.u);
2791 break;
2792 default:
2793 printf("???");
2794 break;
2795 }
2796 break;
2797 default:
2798 printf("???");
2799 break;
2800 }
2801 if (inst->src[i].abs)
2802 printf("|");
2803
2804       if (i < 2)
2805 printf(", ");
2806 }
2807
2808 printf(" ");
2809
2810 if (inst->force_uncompressed)
2811 printf("1sthalf ");
2812
2813 if (inst->force_sechalf)
2814 printf("2ndhalf ");
2815
2816 printf("\n");
2817 }
2818
2819 /**
2820 * Possibly returns an instruction that set up @param reg.
2821 *
2822 * Sometimes we want to take the result of some expression/variable
2823 * dereference tree and rewrite the instruction generating the result
2824 * of the tree. When processing the tree, we know that the
2825 * instructions generated are all writing temporaries that are dead
2826 * outside of this tree. So, if we have some instructions that write
2827 * a temporary, we're free to point that temp write somewhere else.
2828 *
2829 * Note that this doesn't guarantee that the instruction generated
2830 * only reg -- it might be the size=4 destination of a texture instruction.
2831 */
2832 fs_inst *
2833 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2834 fs_inst *end,
2835 fs_reg reg)
2836 {
2837 if (end == start ||
2838 end->is_partial_write() ||
2839 reg.reladdr ||
2840 !reg.equals(end->dst)) {
2841 return NULL;
2842 } else {
2843 return end;
2844 }
2845 }
2846
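/**
 * Works out where each input lives in the gen6+ thread payload
 * (barycentric coordinates, source depth and W), recording the register
 * locations in the brw_wm_compile struct for later stages to consume.
 */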
2847 void
2848 fs_visitor::setup_payload_gen6()
2849 {
2850 bool uses_depth =
2851 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2852 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2853
2854 assert(brw->gen >= 6);
2855
2856 /* R0-1: masks, pixel X/Y coordinates. */
2857 c->nr_payload_regs = 2;
2858    /* R2: only for 32-pixel dispatch. */
2859
2860 /* R3-26: barycentric interpolation coordinates. These appear in the
2861 * same order that they appear in the brw_wm_barycentric_interp_mode
2862 * enum. Each set of coordinates occupies 2 registers if dispatch width
2863 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2864 * appear if they were enabled using the "Barycentric Interpolation
2865 * Mode" bits in WM_STATE.
2866 */
2867 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2868 if (barycentric_interp_modes & (1 << i)) {
2869 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2870 c->nr_payload_regs += 2;
2871 if (dispatch_width == 16) {
2872 c->nr_payload_regs += 2;
2873 }
2874 }
2875 }
2876
2877 /* R27: interpolated depth if uses source depth */
2878 if (uses_depth) {
2879 c->source_depth_reg = c->nr_payload_regs;
2880 c->nr_payload_regs++;
2881 if (dispatch_width == 16) {
2882 /* R28: interpolated depth if not 8-wide. */
2883 c->nr_payload_regs++;
2884 }
2885 }
2886 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2887 if (uses_depth) {
2888 c->source_w_reg = c->nr_payload_regs;
2889 c->nr_payload_regs++;
2890 if (dispatch_width == 16) {
2891 /* R30: interpolated W if not 8-wide. */
2892 c->nr_payload_regs++;
2893 }
2894 }
2895 /* R31: MSAA position offsets. */
2896 /* R32-: bary for 32-pixel. */
2897 /* R58-59: interp W for 32-pixel. */
2898
2899 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2900 c->source_depth_to_render_target = true;
2901 }
2902 }
2903
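/**
 * Drives the whole compile for one dispatch width: payload setup, IR
 * generation, the optimization loop, scheduling, and register allocation.
 * Returns false if compilation failed.
 */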
2904 bool
2905 fs_visitor::run()
2906 {
2907 sanity_param_count = fp->Base.Parameters->NumParameters;
2908 uint32_t orig_nr_params = c->prog_data.nr_params;
2909
2910 if (brw->gen >= 6)
2911 setup_payload_gen6();
2912 else
2913 setup_payload_gen4();
2914
2915 if (0) {
2916 emit_dummy_fs();
2917 } else {
2918 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2919 emit_shader_time_begin();
2920
2921 calculate_urb_setup();
2922 if (brw->gen < 6)
2923 emit_interpolation_setup_gen4();
2924 else
2925 emit_interpolation_setup_gen6();
2926
2927 /* We handle discards by keeping track of the still-live pixels in f0.1.
2928 * Initialize it with the dispatched pixels.
2929 */
2930 if (fp->UsesKill) {
2931 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2932 discard_init->flag_subreg = 1;
2933 }
2934
2935       /* Generate FS IR for main().  (The visitor only descends into
2936        * functions called "main".)
2937 */
2938 if (shader) {
2939 foreach_list(node, &*shader->ir) {
2940 ir_instruction *ir = (ir_instruction *)node;
2941 base_ir = ir;
2942 this->result = reg_undef;
2943 ir->accept(this);
2944 }
2945 } else {
2946 emit_fragment_program_code();
2947 }
2948 base_ir = NULL;
2949 if (failed)
2950 return false;
2951
2952 emit(FS_OPCODE_PLACEHOLDER_HALT);
2953
2954 emit_fb_writes();
2955
2956 split_virtual_grfs();
2957
2958 move_uniform_array_access_to_pull_constants();
2959 setup_pull_constants();
2960
2961 bool progress;
2962 do {
2963 progress = false;
2964
2965 compact_virtual_grfs();
2966
2967 progress = remove_duplicate_mrf_writes() || progress;
2968
2969 progress = opt_algebraic() || progress;
2970 progress = opt_cse() || progress;
2971 progress = opt_copy_propagate() || progress;
2972 progress = dead_code_eliminate() || progress;
2973 progress = dead_code_eliminate_local() || progress;
2974 progress = register_coalesce() || progress;
2975 progress = register_coalesce_2() || progress;
2976 progress = compute_to_mrf() || progress;
2977 } while (progress);
2978
2979 remove_dead_constants();
2980
2981 schedule_instructions(false);
2982
2983 lower_uniform_pull_constant_loads();
2984
2985 assign_curb_setup();
2986 assign_urb_setup();
2987
2988 if (0) {
2989 /* Debug of register spilling: Go spill everything. */
2990 for (int i = 0; i < virtual_grf_count; i++) {
2991 spill_reg(i);
2992 }
2993 }
2994
2995 if (0)
2996 assign_regs_trivial();
2997 else {
2998 while (!assign_regs()) {
2999 if (failed)
3000 break;
3001 }
3002 }
3003 }
3004 assert(force_uncompressed_stack == 0);
3005 assert(force_sechalf_stack == 0);
3006
3007 /* This must come after all optimization and register allocation, since
3008 * it inserts dead code that happens to have side effects, and it does
3009 * so based on the actual physical registers in use.
3010 */
3011 insert_gen4_send_dependency_workarounds();
3012
3013 if (failed)
3014 return false;
3015
3016 schedule_instructions(true);
3017
3018 if (dispatch_width == 8) {
3019 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3020 } else {
3021 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3022
3023 /* Make sure we didn't try to sneak in an extra uniform */
3024 assert(orig_nr_params == c->prog_data.nr_params);
3025 (void) orig_nr_params;
3026 }
3027
3028 /* If any state parameters were appended, then ParameterValues could have
3029 * been realloced, in which case the driver uniform storage set up by
3030 * _mesa_associate_uniform_storage() would point to freed memory. Make
3031 * sure that didn't happen.
3032 */
3033 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3034
3035 return !failed;
3036 }
3037
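/**
 * Entry point for fragment shader compilation: runs an 8-wide compile,
 * optionally a 16-wide one on gen5+, and generates native code from the
 * resulting instruction lists.
 */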
3038 const unsigned *
3039 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3040 struct gl_fragment_program *fp,
3041 struct gl_shader_program *prog,
3042 unsigned *final_assembly_size)
3043 {
3044 bool start_busy = false;
3045 float start_time = 0;
3046
3047 if (unlikely(brw->perf_debug)) {
3048 start_busy = (brw->batch.last_bo &&
3049 drm_intel_bo_busy(brw->batch.last_bo));
3050 start_time = get_time();
3051 }
3052
3053 struct brw_shader *shader = NULL;
3054 if (prog)
3055 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3056
3057 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3058 if (prog) {
3059 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3060 _mesa_print_ir(shader->ir, NULL);
3061 printf("\n\n");
3062 } else {
3063 printf("ARB_fragment_program %d ir for native fragment shader\n",
3064 fp->Base.Id);
3065 _mesa_print_program(&fp->Base);
3066 }
3067 }
3068
3069 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3070 */
3071 fs_visitor v(brw, c, prog, fp, 8);
3072 if (!v.run()) {
3073 if (prog) {
3074 prog->LinkStatus = false;
3075 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3076 }
3077
3078 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3079 v.fail_msg);
3080
3081 return NULL;
3082 }
3083
3084 exec_list *simd16_instructions = NULL;
3085 fs_visitor v2(brw, c, prog, fp, 16);
3086 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3087 if (c->prog_data.nr_pull_params == 0) {
3088 /* Try a 16-wide compile */
3089 v2.import_uniforms(&v);
3090 if (!v2.run()) {
3091 perf_debug("16-wide shader failed to compile, falling back to "
3092 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3093 } else {
3094 simd16_instructions = &v2.instructions;
3095 }
3096 } else {
3097 perf_debug("Skipping 16-wide due to pull parameters.\n");
3098 }
3099 }
3100
3101 c->prog_data.dispatch_width = 8;
3102
3103 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3104 const unsigned *generated = g.generate_assembly(&v.instructions,
3105 simd16_instructions,
3106 final_assembly_size);
3107
3108 if (unlikely(brw->perf_debug) && shader) {
3109 if (shader->compiled_once)
3110 brw_wm_debug_recompile(brw, prog, &c->key);
3111 shader->compiled_once = true;
3112
3113 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3114 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3115 (get_time() - start_time) * 1000);
3116 }
3117 }
3118
3119 return generated;
3120 }
3121
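/**
 * Precompiles the fragment shader at link time with a guessed program key,
 * so that the likely draw-time state combination already has a compiled
 * program in the cache.
 */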
3122 bool
3123 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3124 {
3125 struct brw_context *brw = brw_context(ctx);
3126 struct brw_wm_prog_key key;
3127
3128 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3129 return true;
3130
3131 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3132 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3133 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3134 bool program_uses_dfdy = fp->UsesDFdy;
3135
3136 memset(&key, 0, sizeof(key));
3137
3138 if (brw->gen < 6) {
3139 if (fp->UsesKill)
3140 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3141
3142 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3143 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3144
3145 /* Just assume depth testing. */
3146 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3147 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3148 }
3149
3150 if (brw->gen < 6)
3151 key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
3152
3153 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
3154 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
3155 continue;
3156
3157 if (brw->gen < 6) {
3158 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
3159 key.input_slots_valid |= BITFIELD64_BIT(i);
3160 }
3161 }
3162
3163 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3164
3165 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3166 for (unsigned i = 0; i < sampler_count; i++) {
3167 if (fp->Base.ShadowSamplers & (1 << i)) {
3168 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3169 key.tex.swizzles[i] =
3170 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3171 } else {
3172 /* Color sampler: assume no swizzling. */
3173 key.tex.swizzles[i] = SWIZZLE_XYZW;
3174 }
3175 }
3176
3177 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3178 key.drawable_height = ctx->DrawBuffer->Height;
3179 }
3180
3181 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3182 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3183 }
3184
3185 key.nr_color_regions = 1;
3186
3187 key.program_string_id = bfp->id;
3188
3189 uint32_t old_prog_offset = brw->wm.prog_offset;
3190 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3191
3192 bool success = do_wm_prog(brw, prog, bfp, &key);
3193
3194 brw->wm.prog_offset = old_prog_offset;
3195 brw->wm.prog_data = old_prog_data;
3196
3197 return success;
3198 }