i965: Move is_math/is_tex/is_control_flow() to backend_instruction.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/uniforms.h"
39 #include "main/fbobject.h"
40 #include "program/prog_parameter.h"
41 #include "program/prog_print.h"
42 #include "program/register_allocate.h"
43 #include "program/sampler.h"
44 #include "program/hash_table.h"
45 #include "brw_context.h"
46 #include "brw_eu.h"
47 #include "brw_wm.h"
48 }
49 #include "brw_fs.h"
50 #include "glsl/glsl_types.h"
51 #include "glsl/ir_print_visitor.h"
52
53 void
54 fs_inst::init()
55 {
56 memset(this, 0, sizeof(*this));
57 this->opcode = BRW_OPCODE_NOP;
58 this->conditional_mod = BRW_CONDITIONAL_NONE;
59
60 this->dst = reg_undef;
61 this->src[0] = reg_undef;
62 this->src[1] = reg_undef;
63 this->src[2] = reg_undef;
64
65 /* This will be the case for almost all instructions. */
66 this->regs_written = 1;
67 }
68
69 fs_inst::fs_inst()
70 {
71 init();
72 }
73
74 fs_inst::fs_inst(enum opcode opcode)
75 {
76 init();
77 this->opcode = opcode;
78 }
79
80 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
81 {
82 init();
83 this->opcode = opcode;
84 this->dst = dst;
85
86 if (dst.file == GRF)
87 assert(dst.reg_offset >= 0);
88 }
89
90 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
91 {
92 init();
93 this->opcode = opcode;
94 this->dst = dst;
95 this->src[0] = src0;
96
97 if (dst.file == GRF)
98 assert(dst.reg_offset >= 0);
99 if (src[0].file == GRF)
100 assert(src[0].reg_offset >= 0);
101 }
102
103 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
104 {
105 init();
106 this->opcode = opcode;
107 this->dst = dst;
108 this->src[0] = src0;
109 this->src[1] = src1;
110
111 if (dst.file == GRF)
112 assert(dst.reg_offset >= 0);
113 if (src[0].file == GRF)
114 assert(src[0].reg_offset >= 0);
115 if (src[1].file == GRF)
116 assert(src[1].reg_offset >= 0);
117 }
118
119 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
120 fs_reg src0, fs_reg src1, fs_reg src2)
121 {
122 init();
123 this->opcode = opcode;
124 this->dst = dst;
125 this->src[0] = src0;
126 this->src[1] = src1;
127 this->src[2] = src2;
128
129 if (dst.file == GRF)
130 assert(dst.reg_offset >= 0);
131 if (src[0].file == GRF)
132 assert(src[0].reg_offset >= 0);
133 if (src[1].file == GRF)
134 assert(src[1].reg_offset >= 0);
135 if (src[2].file == GRF)
136 assert(src[2].reg_offset >= 0);
137 }
138
139 #define ALU1(op) \
140 fs_inst * \
141 fs_visitor::op(fs_reg dst, fs_reg src0) \
142 { \
143 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
144 }
145
146 #define ALU2(op) \
147 fs_inst * \
148 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
149 { \
150 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
151 }
152
153 #define ALU3(op) \
154 fs_inst * \
155 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
156 { \
157 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
158 }
159
160 ALU1(NOT)
161 ALU1(MOV)
162 ALU1(FRC)
163 ALU1(RNDD)
164 ALU1(RNDE)
165 ALU1(RNDZ)
166 ALU2(ADD)
167 ALU2(MUL)
168 ALU2(MACH)
169 ALU2(AND)
170 ALU2(OR)
171 ALU2(XOR)
172 ALU2(SHL)
173 ALU2(SHR)
174 ALU2(ASR)
175 ALU3(LRP)
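/* As a concrete illustration of the macros above, ALU2(ADD) expands to an
 * emit helper along these lines:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * so visitor code can build instructions with emit(ADD(dst, a, b)) rather
 * than constructing fs_inst objects by hand.
 */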
176
177 /** Gen4 predicated IF. */
178 fs_inst *
179 fs_visitor::IF(uint32_t predicate)
180 {
181 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
182 inst->predicate = predicate;
183 return inst;
184 }
185
186 /** Gen6+ IF with embedded comparison. */
187 fs_inst *
188 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
189 {
190 assert(intel->gen >= 6);
191 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
192 reg_null_d, src0, src1);
193 inst->conditional_mod = condition;
194 return inst;
195 }
196
197 /**
198 * CMP: Sets the low bit of the destination channels with the result
199 * of the comparison, while the upper bits are undefined, and updates
200 * the flag register with the packed 16 bits of the result.
201 */
202 fs_inst *
203 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
204 {
205 fs_inst *inst;
206
207 /* Take the instruction:
208 *
209 * CMP null<d> src0<f> src1<f>
210 *
211 * Original gen4 does type conversion to the destination type before
212 * comparison, producing garbage results for floating point comparisons.
213 * gen5 does the comparison on the execution type (resolved source types),
214 * so dst type doesn't matter. gen6 does comparison and then uses the
215 * result as if it was the dst type with no conversion, which happens to
216 * mostly work out for float-interpreted-as-int since our comparisons are
217 * for >0, =0, <0.
218 */
219 if (intel->gen == 4) {
220 dst.type = src0.type;
221 if (dst.file == FIXED_HW_REG)
222 dst.fixed_hw_reg.type = dst.type;
223 }
224
225 resolve_ud_negate(&src0);
226 resolve_ud_negate(&src1);
227
228 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
229 inst->conditional_mod = condition;
230
231 return inst;
232 }
233
234 exec_list
235 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
236 fs_reg varying_offset,
237 uint32_t const_offset)
238 {
239 exec_list instructions;
240 fs_inst *inst;
241
242 /* We have our constant surface use a pitch of 4 bytes, so our index can
243 * be any component of a vector, and then we load 4 contiguous
244 * components starting from that.
245 *
246 * We break down the const_offset to a portion added to the variable
247 * offset and a portion done using reg_offset, which means that if you
248 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
249 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
250 * CSE can later notice that those loads are all the same and eliminate
251 * the redundant ones.
252 */
253 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
254 instructions.push_tail(ADD(vec4_offset,
255 varying_offset, const_offset & ~3));
256
257 int scale = 1;
258 if (intel->gen == 4 && dispatch_width == 8) {
259 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
260 * u, v, r) as parameters, or we can just use the SIMD16 message
261 * consisting of (header, u). We choose the second, at the cost of a
262 * longer return length.
263 */
264 scale = 2;
265 }
266
267 enum opcode op;
268 if (intel->gen >= 7)
269 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
270 else
271 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
272 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
273 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
274 inst->regs_written = 4 * scale;
275 instructions.push_tail(inst);
276
277 if (intel->gen < 7) {
278 inst->base_mrf = 13;
279 inst->header_present = true;
280 if (intel->gen == 4)
281 inst->mlen = 3;
282 else
283 inst->mlen = 1 + dispatch_width / 8;
284 }
285
286 vec4_result.reg_offset += (const_offset & 3) * scale;
287 instructions.push_tail(MOV(dst, vec4_result));
288
289 return instructions;
290 }
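/* Worked example of the const_offset split above: with const_offset = 18,
 * the pull load uses vec4_offset = varying_offset + 16 (18 & ~3), and the
 * trailing MOV selects component 2 (18 & 3) of the returned vec4 by adding
 * 2 * scale to vec4_result.reg_offset.
 */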
291
292 /**
293 * A helper for MOV generation for fixing up broken hardware SEND dependency
294 * handling.
295 */
296 fs_inst *
297 fs_visitor::DEP_RESOLVE_MOV(int grf)
298 {
299 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
300
301 inst->ir = NULL;
302 inst->annotation = "send dependency resolve";
303
304 /* The caller always wants uncompressed to emit the minimal extra
305 * dependencies, and to avoid having to deal with aligning its regs to 2.
306 */
307 inst->force_uncompressed = true;
308
309 return inst;
310 }
311
312 bool
313 fs_inst::equals(fs_inst *inst)
314 {
315 return (opcode == inst->opcode &&
316 dst.equals(inst->dst) &&
317 src[0].equals(inst->src[0]) &&
318 src[1].equals(inst->src[1]) &&
319 src[2].equals(inst->src[2]) &&
320 saturate == inst->saturate &&
321 predicate == inst->predicate &&
322 conditional_mod == inst->conditional_mod &&
323 mlen == inst->mlen &&
324 base_mrf == inst->base_mrf &&
325 sampler == inst->sampler &&
326 target == inst->target &&
327 eot == inst->eot &&
328 header_present == inst->header_present &&
329 shadow_compare == inst->shadow_compare &&
330 offset == inst->offset);
331 }
332
333 bool
334 fs_inst::overwrites_reg(const fs_reg &reg)
335 {
336 return (reg.file == dst.file &&
337 reg.reg == dst.reg &&
338 reg.reg_offset >= dst.reg_offset &&
339 reg.reg_offset < dst.reg_offset + regs_written);
340 }
341
342 bool
343 fs_inst::is_send_from_grf()
344 {
345 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
346 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
347 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
348 src[1].file == GRF));
349 }
350
351 bool
352 fs_visitor::can_do_source_mods(fs_inst *inst)
353 {
354 if (intel->gen == 6 && inst->is_math())
355 return false;
356
357 if (inst->is_send_from_grf())
358 return false;
359
360 return true;
361 }
362
363 void
364 fs_reg::init()
365 {
366 memset(this, 0, sizeof(*this));
367 this->smear = -1;
368 }
369
370 /** Generic unset register constructor. */
371 fs_reg::fs_reg()
372 {
373 init();
374 this->file = BAD_FILE;
375 }
376
377 /** Immediate value constructor. */
378 fs_reg::fs_reg(float f)
379 {
380 init();
381 this->file = IMM;
382 this->type = BRW_REGISTER_TYPE_F;
383 this->imm.f = f;
384 }
385
386 /** Immediate value constructor. */
387 fs_reg::fs_reg(int32_t i)
388 {
389 init();
390 this->file = IMM;
391 this->type = BRW_REGISTER_TYPE_D;
392 this->imm.i = i;
393 }
394
395 /** Immediate value constructor. */
396 fs_reg::fs_reg(uint32_t u)
397 {
398 init();
399 this->file = IMM;
400 this->type = BRW_REGISTER_TYPE_UD;
401 this->imm.u = u;
402 }
403
404 /** Fixed brw_reg Immediate value constructor. */
405 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
406 {
407 init();
408 this->file = FIXED_HW_REG;
409 this->fixed_hw_reg = fixed_hw_reg;
410 this->type = fixed_hw_reg.type;
411 }
412
413 bool
414 fs_reg::equals(const fs_reg &r) const
415 {
416 return (file == r.file &&
417 reg == r.reg &&
418 reg_offset == r.reg_offset &&
419 type == r.type &&
420 negate == r.negate &&
421 abs == r.abs &&
422 !reladdr && !r.reladdr &&
423 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
424 sizeof(fixed_hw_reg)) == 0 &&
425 smear == r.smear &&
426 imm.u == r.imm.u);
427 }
428
429 bool
430 fs_reg::is_zero() const
431 {
432 if (file != IMM)
433 return false;
434
435 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
436 }
437
438 bool
439 fs_reg::is_one() const
440 {
441 if (file != IMM)
442 return false;
443
444 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
445 }
446
447 int
448 fs_visitor::type_size(const struct glsl_type *type)
449 {
450 unsigned int size, i;
451
452 switch (type->base_type) {
453 case GLSL_TYPE_UINT:
454 case GLSL_TYPE_INT:
455 case GLSL_TYPE_FLOAT:
456 case GLSL_TYPE_BOOL:
457 return type->components();
458 case GLSL_TYPE_ARRAY:
459 return type_size(type->fields.array) * type->length;
460 case GLSL_TYPE_STRUCT:
461 size = 0;
462 for (i = 0; i < type->length; i++) {
463 size += type_size(type->fields.structure[i].type);
464 }
465 return size;
466 case GLSL_TYPE_SAMPLER:
467 /* Samplers take up no register space, since they're baked in at
468 * link time.
469 */
470 return 0;
471 case GLSL_TYPE_VOID:
472 case GLSL_TYPE_ERROR:
473 case GLSL_TYPE_INTERFACE:
474 assert(!"not reached");
475 break;
476 }
477
478 return 0;
479 }
480
481 fs_reg
482 fs_visitor::get_timestamp()
483 {
484 assert(intel->gen >= 7);
485
486 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
487 BRW_ARF_TIMESTAMP,
488 0),
489 BRW_REGISTER_TYPE_UD));
490
491 fs_reg dst = fs_reg(this, glsl_type::uint_type);
492
493 fs_inst *mov = emit(MOV(dst, ts));
494 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
495 * even if it's not enabled in the dispatch.
496 */
497 mov->force_writemask_all = true;
498 mov->force_uncompressed = true;
499
500 /* The caller wants the low 32 bits of the timestamp. Since it's running
501 * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
502 * which is plenty of time for our purposes. It is identical across the
503 * EUs, but since it's tracking GPU core speed it will increment at a
504 * varying rate as render P-states change.
505 *
506 * The caller could also check if render P-states have changed (or anything
507 * else that might disrupt timing) by setting smear to 2 and checking if
508 * that field is != 0.
509 */
510 dst.smear = 0;
511
512 return dst;
513 }
514
515 void
516 fs_visitor::emit_shader_time_begin()
517 {
518 current_annotation = "shader time start";
519 shader_start_time = get_timestamp();
520 }
521
522 void
523 fs_visitor::emit_shader_time_end()
524 {
525 current_annotation = "shader time end";
526
527 enum shader_time_shader_type type, written_type, reset_type;
528 if (dispatch_width == 8) {
529 type = ST_FS8;
530 written_type = ST_FS8_WRITTEN;
531 reset_type = ST_FS8_RESET;
532 } else {
533 assert(dispatch_width == 16);
534 type = ST_FS16;
535 written_type = ST_FS16_WRITTEN;
536 reset_type = ST_FS16_RESET;
537 }
538
539 fs_reg shader_end_time = get_timestamp();
540
541 /* Check that there weren't any timestamp reset events (assuming these
542 * were the only two timestamp reads that happened).
543 */
544 fs_reg reset = shader_end_time;
545 reset.smear = 2;
546 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
547 test->conditional_mod = BRW_CONDITIONAL_Z;
548 emit(IF(BRW_PREDICATE_NORMAL));
549
550 push_force_uncompressed();
551 fs_reg start = shader_start_time;
552 start.negate = true;
553 fs_reg diff = fs_reg(this, glsl_type::uint_type);
554 emit(ADD(diff, start, shader_end_time));
555
556 /* If there were no instructions between the two timestamp gets, the diff
557 * is 2 cycles. Remove that overhead, so I can forget about that when
558 * trying to determine the time taken for single instructions.
559 */
560 emit(ADD(diff, diff, fs_reg(-2u)));
561
562 emit_shader_time_write(type, diff);
563 emit_shader_time_write(written_type, fs_reg(1u));
564 emit(BRW_OPCODE_ELSE);
565 emit_shader_time_write(reset_type, fs_reg(1u));
566 emit(BRW_OPCODE_ENDIF);
567
568 pop_force_uncompressed();
569 }
570
571 void
572 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
573 fs_reg value)
574 {
575 int shader_time_index =
576 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
577 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
578
579 fs_reg payload;
580 if (dispatch_width == 8)
581 payload = fs_reg(this, glsl_type::uvec2_type);
582 else
583 payload = fs_reg(this, glsl_type::uint_type);
584
585 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
586 fs_reg(), payload, offset, value));
587 }
588
589 void
590 fs_visitor::fail(const char *format, ...)
591 {
592 va_list va;
593 char *msg;
594
595 if (failed)
596 return;
597
598 failed = true;
599
600 va_start(va, format);
601 msg = ralloc_vasprintf(mem_ctx, format, va);
602 va_end(va);
603 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
604
605 this->fail_msg = msg;
606
607 if (INTEL_DEBUG & DEBUG_WM) {
608 fprintf(stderr, "%s", msg);
609 }
610 }
611
612 fs_inst *
613 fs_visitor::emit(enum opcode opcode)
614 {
615 return emit(fs_inst(opcode));
616 }
617
618 fs_inst *
619 fs_visitor::emit(enum opcode opcode, fs_reg dst)
620 {
621 return emit(fs_inst(opcode, dst));
622 }
623
624 fs_inst *
625 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
626 {
627 return emit(fs_inst(opcode, dst, src0));
628 }
629
630 fs_inst *
631 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
632 {
633 return emit(fs_inst(opcode, dst, src0, src1));
634 }
635
636 fs_inst *
637 fs_visitor::emit(enum opcode opcode, fs_reg dst,
638 fs_reg src0, fs_reg src1, fs_reg src2)
639 {
640 return emit(fs_inst(opcode, dst, src0, src1, src2));
641 }
642
643 void
644 fs_visitor::push_force_uncompressed()
645 {
646 force_uncompressed_stack++;
647 }
648
649 void
650 fs_visitor::pop_force_uncompressed()
651 {
652 force_uncompressed_stack--;
653 assert(force_uncompressed_stack >= 0);
654 }
655
656 void
657 fs_visitor::push_force_sechalf()
658 {
659 force_sechalf_stack++;
660 }
661
662 void
663 fs_visitor::pop_force_sechalf()
664 {
665 force_sechalf_stack--;
666 assert(force_sechalf_stack >= 0);
667 }
668
669 /**
670 * Returns true if the instruction has a flag that means it won't
671 * update an entire destination register.
672 *
673 * For example, dead code elimination and live variable analysis want to know
674 * when a write to a variable screens off any preceding values that were in
675 * it.
676 */
677 bool
678 fs_inst::is_partial_write()
679 {
680 return (this->predicate ||
681 this->force_uncompressed ||
682 this->force_sechalf);
683 }
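/* For example, a MOV with a predicate only writes the enabled channels, and
 * a force_uncompressed or force_sechalf instruction in SIMD16 dispatch only
 * writes half of the destination, so none of these screen off an earlier
 * write to the same register.
 */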
684
685 /**
686 * Returns how many MRFs an FS opcode will write over.
687 *
688 * Note that this is not the 0 or 1 implied writes in an actual gen
689 * instruction -- the FS opcodes often generate MOVs in addition.
690 */
691 int
692 fs_visitor::implied_mrf_writes(fs_inst *inst)
693 {
694 if (inst->mlen == 0)
695 return 0;
696
697 switch (inst->opcode) {
698 case SHADER_OPCODE_RCP:
699 case SHADER_OPCODE_RSQ:
700 case SHADER_OPCODE_SQRT:
701 case SHADER_OPCODE_EXP2:
702 case SHADER_OPCODE_LOG2:
703 case SHADER_OPCODE_SIN:
704 case SHADER_OPCODE_COS:
705 return 1 * dispatch_width / 8;
706 case SHADER_OPCODE_POW:
707 case SHADER_OPCODE_INT_QUOTIENT:
708 case SHADER_OPCODE_INT_REMAINDER:
709 return 2 * dispatch_width / 8;
710 case SHADER_OPCODE_TEX:
711 case FS_OPCODE_TXB:
712 case SHADER_OPCODE_TXD:
713 case SHADER_OPCODE_TXF:
714 case SHADER_OPCODE_TXF_MS:
715 case SHADER_OPCODE_TXL:
716 case SHADER_OPCODE_TXS:
717 case SHADER_OPCODE_LOD:
718 return 1;
719 case FS_OPCODE_FB_WRITE:
720 return 2;
721 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
722 case FS_OPCODE_UNSPILL:
723 return 1;
724 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
725 return inst->mlen;
726 case FS_OPCODE_SPILL:
727 return 2;
728 default:
729 assert(!"not reached");
730 return inst->mlen;
731 }
732 }
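/* For example, SHADER_OPCODE_POW in SIMD16 dispatch reports
 * 2 * dispatch_width / 8 = 4 MRFs: each of its two operands occupies two
 * message registers when the math payload is built for 16 channels.
 */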
733
734 int
735 fs_visitor::virtual_grf_alloc(int size)
736 {
737 if (virtual_grf_array_size <= virtual_grf_count) {
738 if (virtual_grf_array_size == 0)
739 virtual_grf_array_size = 16;
740 else
741 virtual_grf_array_size *= 2;
742 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
743 virtual_grf_array_size);
744 }
745 virtual_grf_sizes[virtual_grf_count] = size;
746 return virtual_grf_count++;
747 }
748
749 /** Fixed HW reg constructor. */
750 fs_reg::fs_reg(enum register_file file, int reg)
751 {
752 init();
753 this->file = file;
754 this->reg = reg;
755 this->type = BRW_REGISTER_TYPE_F;
756 }
757
758 /** Fixed HW reg constructor. */
759 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
760 {
761 init();
762 this->file = file;
763 this->reg = reg;
764 this->type = type;
765 }
766
767 /** Automatic reg constructor. */
768 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
769 {
770 init();
771
772 this->file = GRF;
773 this->reg = v->virtual_grf_alloc(v->type_size(type));
774 this->reg_offset = 0;
775 this->type = brw_type_for_base_type(type);
776 }
777
778 fs_reg *
779 fs_visitor::variable_storage(ir_variable *var)
780 {
781 return (fs_reg *)hash_table_find(this->variable_ht, var);
782 }
783
784 void
785 import_uniforms_callback(const void *key,
786 void *data,
787 void *closure)
788 {
789 struct hash_table *dst_ht = (struct hash_table *)closure;
790 const fs_reg *reg = (const fs_reg *)data;
791
792 if (reg->file != UNIFORM)
793 return;
794
795 hash_table_insert(dst_ht, data, key);
796 }
797
798 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
799 * This brings in those uniform definitions.
800 */
801 void
802 fs_visitor::import_uniforms(fs_visitor *v)
803 {
804 hash_table_call_foreach(v->variable_ht,
805 import_uniforms_callback,
806 variable_ht);
807 this->params_remap = v->params_remap;
808 }
809
810 /* Our support for uniforms is piggy-backed on the struct
811 * gl_fragment_program, because that's where the values actually
812 * get stored, rather than in some global gl_shader_program uniform
813 * store.
814 */
815 void
816 fs_visitor::setup_uniform_values(ir_variable *ir)
817 {
818 int namelen = strlen(ir->name);
819
820 /* The data for our (non-builtin) uniforms is stored in a series of
821 * gl_uniform_driver_storage structs for each subcomponent that
822 * glGetUniformLocation() could name. We know it's been set up in the same
823 * order we'd walk the type, so walk the list of storage and find anything
824 * with our name, or the prefix of a component that starts with our name.
825 */
826 unsigned params_before = c->prog_data.nr_params;
827 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
828 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
829
830 if (strncmp(ir->name, storage->name, namelen) != 0 ||
831 (storage->name[namelen] != 0 &&
832 storage->name[namelen] != '.' &&
833 storage->name[namelen] != '[')) {
834 continue;
835 }
836
837 unsigned slots = storage->type->component_slots();
838 if (storage->array_elements)
839 slots *= storage->array_elements;
840
841 for (unsigned i = 0; i < slots; i++) {
842 c->prog_data.param[c->prog_data.nr_params++] =
843 &storage->storage[i].f;
844 }
845 }
846
847 /* Make sure we actually initialized the right amount of stuff here. */
848 assert(params_before + ir->type->component_slots() ==
849 c->prog_data.nr_params);
850 (void)params_before;
851 }
852
853
854 /* Our support for builtin uniforms is even scarier than non-builtin.
855 * It sits on top of the PROG_STATE_VAR parameters that are
856 * automatically updated from GL context state.
857 */
858 void
859 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
860 {
861 const ir_state_slot *const slots = ir->state_slots;
862 assert(ir->state_slots != NULL);
863
864 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
865 /* This state reference has already been setup by ir_to_mesa, but we'll
866 * get the same index back here.
867 */
868 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
869 (gl_state_index *)slots[i].tokens);
870
871 /* Add each of the unique swizzles of the element as a parameter.
872 * This'll end up matching the expected layout of the
873 * array/matrix/structure we're trying to fill in.
874 */
875 int last_swiz = -1;
876 for (unsigned int j = 0; j < 4; j++) {
877 int swiz = GET_SWZ(slots[i].swizzle, j);
878 if (swiz == last_swiz)
879 break;
880 last_swiz = swiz;
881
882 c->prog_data.param[c->prog_data.nr_params++] =
883 &fp->Base.Parameters->ParameterValues[index][swiz].f;
884 }
885 }
886 }
887
888 fs_reg *
889 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
890 {
891 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
892 fs_reg wpos = *reg;
893 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
894
895 /* gl_FragCoord.x */
896 if (ir->pixel_center_integer) {
897 emit(MOV(wpos, this->pixel_x));
898 } else {
899 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
900 }
901 wpos.reg_offset++;
902
903 /* gl_FragCoord.y */
904 if (!flip && ir->pixel_center_integer) {
905 emit(MOV(wpos, this->pixel_y));
906 } else {
907 fs_reg pixel_y = this->pixel_y;
908 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
909
910 if (flip) {
911 pixel_y.negate = true;
912 offset += c->key.drawable_height - 1.0;
913 }
914
915 emit(ADD(wpos, pixel_y, fs_reg(offset)));
916 }
917 wpos.reg_offset++;
918
919 /* gl_FragCoord.z */
920 if (intel->gen >= 6) {
921 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
922 } else {
923 emit(FS_OPCODE_LINTERP, wpos,
924 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
925 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
926 interp_reg(VARYING_SLOT_POS, 2));
927 }
928 wpos.reg_offset++;
929
930 /* gl_FragCoord.w: Already set up in emit_interpolation */
931 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
932
933 return reg;
934 }
935
936 fs_inst *
937 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
938 glsl_interp_qualifier interpolation_mode,
939 bool is_centroid)
940 {
941 brw_wm_barycentric_interp_mode barycoord_mode;
942 if (intel->gen >= 6) {
943 if (is_centroid) {
944 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
945 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
946 else
947 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
948 } else {
949 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
950 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
951 else
952 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
953 }
954 } else {
955 /* On Ironlake and below, there is only one interpolation mode.
956 * Centroid interpolation doesn't mean anything on this hardware --
957 * there is no multisampling.
958 */
959 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
960 }
961 return emit(FS_OPCODE_LINTERP, attr,
962 this->delta_x[barycoord_mode],
963 this->delta_y[barycoord_mode], interp);
964 }
965
966 fs_reg *
967 fs_visitor::emit_general_interpolation(ir_variable *ir)
968 {
969 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
970 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
971 fs_reg attr = *reg;
972
973 unsigned int array_elements;
974 const glsl_type *type;
975
976 if (ir->type->is_array()) {
977 array_elements = ir->type->length;
978 if (array_elements == 0) {
979 fail("dereferenced array '%s' has length 0\n", ir->name);
980 }
981 type = ir->type->fields.array;
982 } else {
983 array_elements = 1;
984 type = ir->type;
985 }
986
987 glsl_interp_qualifier interpolation_mode =
988 ir->determine_interpolation_mode(c->key.flat_shade);
989
990 int location = ir->location;
991 for (unsigned int i = 0; i < array_elements; i++) {
992 for (unsigned int j = 0; j < type->matrix_columns; j++) {
993 if (urb_setup[location] == -1) {
994 /* If there's no incoming setup data for this slot, don't
995 * emit interpolation for it.
996 */
997 attr.reg_offset += type->vector_elements;
998 location++;
999 continue;
1000 }
1001
1002 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1003 /* Constant interpolation (flat shading) case. The SF has
1004 * handed us defined values in only the constant offset
1005 * field of the setup reg.
1006 */
1007 for (unsigned int k = 0; k < type->vector_elements; k++) {
1008 struct brw_reg interp = interp_reg(location, k);
1009 interp = suboffset(interp, 3);
1010 interp.type = reg->type;
1011 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1012 attr.reg_offset++;
1013 }
1014 } else {
1015 /* Smooth/noperspective interpolation case. */
1016 for (unsigned int k = 0; k < type->vector_elements; k++) {
1017 /* FINISHME: At some point we probably want to push
1018 * this farther by giving similar treatment to the
1019 * other potentially constant components of the
1020 * attribute, as well as making brw_vs_constval.c
1021 * handle varyings other than gl_TexCoord.
1022 */
1023 struct brw_reg interp = interp_reg(location, k);
1024 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1025 ir->centroid);
1026 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1027 /* Get the pixel/sample mask into f0 so that we know
1028 * which pixels are lit. Then, for each channel that is
1029 * unlit, replace the centroid data with non-centroid
1030 * data.
1031 */
1032 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1033 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1034 interpolation_mode, false);
1035 inst->predicate = BRW_PREDICATE_NORMAL;
1036 inst->predicate_inverse = true;
1037 }
1038 if (intel->gen < 6) {
1039 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1040 }
1041 attr.reg_offset++;
1042 }
1043
1044 }
1045 location++;
1046 }
1047 }
1048
1049 return reg;
1050 }
1051
1052 fs_reg *
1053 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1054 {
1055 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1056
1057 /* The frontfacing comes in as a bit in the thread payload. */
1058 if (intel->gen >= 6) {
1059 emit(BRW_OPCODE_ASR, *reg,
1060 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1061 fs_reg(15));
1062 emit(BRW_OPCODE_NOT, *reg, *reg);
1063 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1064 } else {
1065 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1066 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1067 * us front face
1068 */
1069 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1070 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1071 }
1072
1073 return reg;
1074 }
1075
1076 fs_reg
1077 fs_visitor::fix_math_operand(fs_reg src)
1078 {
1079 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1080 * might be able to do better by doing execsize = 1 math and then
1081 * expanding that result out, but we would need to be careful with
1082 * masking.
1083 *
1084 * The hardware ignores source modifiers (negate and abs) on math
1085 * instructions, so we also move to a temp to set those up.
1086 */
1087 if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1088 !src.abs && !src.negate)
1089 return src;
1090
1091 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1092 * operands to math
1093 */
1094 if (intel->gen >= 7 && src.file != IMM)
1095 return src;
1096
1097 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1098 expanded.type = src.type;
1099 emit(BRW_OPCODE_MOV, expanded, src);
1100 return expanded;
1101 }
1102
1103 fs_inst *
1104 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1105 {
1106 switch (opcode) {
1107 case SHADER_OPCODE_RCP:
1108 case SHADER_OPCODE_RSQ:
1109 case SHADER_OPCODE_SQRT:
1110 case SHADER_OPCODE_EXP2:
1111 case SHADER_OPCODE_LOG2:
1112 case SHADER_OPCODE_SIN:
1113 case SHADER_OPCODE_COS:
1114 break;
1115 default:
1116 assert(!"not reached: bad math opcode");
1117 return NULL;
1118 }
1119
1120 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1121 * might be able to do better by doing execsize = 1 math and then
1122 * expanding that result out, but we would need to be careful with
1123 * masking.
1124 *
1125 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1126 * instructions, so we also move to a temp to set those up.
1127 */
1128 if (intel->gen >= 6)
1129 src = fix_math_operand(src);
1130
1131 fs_inst *inst = emit(opcode, dst, src);
1132
1133 if (intel->gen < 6) {
1134 inst->base_mrf = 2;
1135 inst->mlen = dispatch_width / 8;
1136 }
1137
1138 return inst;
1139 }
1140
1141 fs_inst *
1142 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1143 {
1144 int base_mrf = 2;
1145 fs_inst *inst;
1146
1147 switch (opcode) {
1148 case SHADER_OPCODE_INT_QUOTIENT:
1149 case SHADER_OPCODE_INT_REMAINDER:
1150 if (intel->gen >= 7 && dispatch_width == 16)
1151 fail("16-wide INTDIV unsupported\n");
1152 break;
1153 case SHADER_OPCODE_POW:
1154 break;
1155 default:
1156 assert(!"not reached: unsupported binary math opcode.");
1157 return NULL;
1158 }
1159
1160 if (intel->gen >= 6) {
1161 src0 = fix_math_operand(src0);
1162 src1 = fix_math_operand(src1);
1163
1164 inst = emit(opcode, dst, src0, src1);
1165 } else {
1166 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1167 * "Message Payload":
1168 *
1169 * "Operand0[7]. For the INT DIV functions, this operand is the
1170 * denominator."
1171 * ...
1172 * "Operand1[7]. For the INT DIV functions, this operand is the
1173 * numerator."
1174 */
1175 bool is_int_div = opcode != SHADER_OPCODE_POW;
1176 fs_reg &op0 = is_int_div ? src1 : src0;
1177 fs_reg &op1 = is_int_div ? src0 : src1;
1178
1179 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1180 inst = emit(opcode, dst, op0, reg_null_f);
1181
1182 inst->base_mrf = base_mrf;
1183 inst->mlen = 2 * dispatch_width / 8;
1184 }
1185 return inst;
1186 }
1187
1188 void
1189 fs_visitor::assign_curb_setup()
1190 {
1191 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1192 if (dispatch_width == 8) {
1193 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1194 } else {
1195 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1196 }
1197
1198 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1199 foreach_list(node, &this->instructions) {
1200 fs_inst *inst = (fs_inst *)node;
1201
1202 for (unsigned int i = 0; i < 3; i++) {
1203 if (inst->src[i].file == UNIFORM) {
1204 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1205 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1206 constant_nr / 8,
1207 constant_nr % 8);
1208
1209 inst->src[i].file = FIXED_HW_REG;
1210 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1211 }
1212 }
1213 }
1214 }
1215
1216 void
1217 fs_visitor::calculate_urb_setup()
1218 {
1219 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1220 urb_setup[i] = -1;
1221 }
1222
1223 int urb_next = 0;
1224 /* Figure out where each of the incoming setup attributes lands. */
1225 if (intel->gen >= 6) {
1226 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1227 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1228 urb_setup[i] = urb_next++;
1229 }
1230 }
1231 } else {
1232 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1233 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1234 /* Point size is packed into the header, not as a general attribute */
1235 if (i == VARYING_SLOT_PSIZ)
1236 continue;
1237
1238 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1239 /* The back color slot is skipped when the front color is
1240 * also written to. In addition, some slots can be
1241 * written in the vertex shader and not read in the
1242 * fragment shader. So the register number must always be
1243 * incremented, mapped or not.
1244 */
1245 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1246 urb_setup[i] = urb_next;
1247 urb_next++;
1248 }
1249 }
1250
1251 /*
1252 * It's a FS only attribute, and we did interpolation for this attribute
1253 * in SF thread. So, count it here, too.
1254 *
1255 * See compile_sf_prog() for more info.
1256 */
1257 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1258 urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1259 }
1260
1261 /* Each attribute is 4 setup channels, each of which is half a reg. */
1262 c->prog_data.urb_read_length = urb_next * 2;
1263 }
1264
1265 void
1266 fs_visitor::assign_urb_setup()
1267 {
1268 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1269
1270 /* Offset all the urb_setup[] index by the actual position of the
1271 * setup regs, now that the location of the constants has been chosen.
1272 */
1273 foreach_list(node, &this->instructions) {
1274 fs_inst *inst = (fs_inst *)node;
1275
1276 if (inst->opcode == FS_OPCODE_LINTERP) {
1277 assert(inst->src[2].file == FIXED_HW_REG);
1278 inst->src[2].fixed_hw_reg.nr += urb_start;
1279 }
1280
1281 if (inst->opcode == FS_OPCODE_CINTERP) {
1282 assert(inst->src[0].file == FIXED_HW_REG);
1283 inst->src[0].fixed_hw_reg.nr += urb_start;
1284 }
1285 }
1286
1287 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1288 }
1289
1290 /**
1291 * Split large virtual GRFs into separate components if we can.
1292 *
1293 * This is mostly duplicated with what brw_fs_vector_splitting does,
1294 * but that's really conservative because it's afraid of doing
1295 * splitting that doesn't result in real progress after the rest of
1296 * the optimization phases, which would cause infinite looping in
1297 * optimization. We can do it once here, safely. This also has the
1298 * opportunity to split interpolated values, or maybe even uniforms,
1299 * which we don't have at the IR level.
1300 *
1301 * We want to split, because virtual GRFs are what we register
1302 * allocate and spill (due to contiguousness requirements for some
1303 * instructions), and they're what we naturally generate in the
1304 * codegen process, but most virtual GRFs don't actually need to be
1305 * contiguous sets of GRFs. If we split, we'll end up with reduced
1306 * live intervals and better dead code elimination and coalescing.
1307 */
1308 void
1309 fs_visitor::split_virtual_grfs()
1310 {
1311 int num_vars = this->virtual_grf_count;
1312 bool split_grf[num_vars];
1313 int new_virtual_grf[num_vars];
1314
1315 /* Try to split anything > 0 sized. */
1316 for (int i = 0; i < num_vars; i++) {
1317 if (this->virtual_grf_sizes[i] != 1)
1318 split_grf[i] = true;
1319 else
1320 split_grf[i] = false;
1321 }
1322
1323 if (brw->has_pln &&
1324 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1325 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1326 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1327 * Gen6, that was the only supported interpolation mode, and since Gen6,
1328 * delta_x and delta_y are in fixed hardware registers.
1329 */
1330 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1331 false;
1332 }
1333
1334 foreach_list(node, &this->instructions) {
1335 fs_inst *inst = (fs_inst *)node;
1336
1337 /* If there's a SEND message that requires contiguous destination
1338 * registers, no splitting is allowed.
1339 */
1340 if (inst->regs_written > 1) {
1341 split_grf[inst->dst.reg] = false;
1342 }
1343
1344 /* If we're sending from a GRF, don't split it, on the assumption that
1345 * the send is reading the whole thing.
1346 */
1347 if (inst->is_send_from_grf()) {
1348 split_grf[inst->src[0].reg] = false;
1349 }
1350 }
1351
1352 /* Allocate new space for split regs. Note that the virtual
1353 * numbers will be contiguous.
1354 */
1355 for (int i = 0; i < num_vars; i++) {
1356 if (split_grf[i]) {
1357 new_virtual_grf[i] = virtual_grf_alloc(1);
1358 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1359 int reg = virtual_grf_alloc(1);
1360 assert(reg == new_virtual_grf[i] + j - 1);
1361 (void) reg;
1362 }
1363 this->virtual_grf_sizes[i] = 1;
1364 }
1365 }
1366
1367 foreach_list(node, &this->instructions) {
1368 fs_inst *inst = (fs_inst *)node;
1369
1370 if (inst->dst.file == GRF &&
1371 split_grf[inst->dst.reg] &&
1372 inst->dst.reg_offset != 0) {
1373 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1374 inst->dst.reg_offset - 1);
1375 inst->dst.reg_offset = 0;
1376 }
1377 for (int i = 0; i < 3; i++) {
1378 if (inst->src[i].file == GRF &&
1379 split_grf[inst->src[i].reg] &&
1380 inst->src[i].reg_offset != 0) {
1381 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1382 inst->src[i].reg_offset - 1);
1383 inst->src[i].reg_offset = 0;
1384 }
1385 }
1386 }
1387 this->live_intervals_valid = false;
1388 }
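/* For example, a size-3 virtual GRF (say, a vec3 temporary) is replaced by
 * three consecutive size-1 virtual GRFs, and every dst/src that addressed
 * the old register at reg_offset 1 or 2 is rewritten to use the matching
 * new register with reg_offset 0, shrinking its live interval.
 */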
1389
1390 /**
1391 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1392 *
1393 * During code generation, we create tons of temporary variables, many of
1394 * which get immediately killed and are never used again. Yet, in later
1395 * optimization and analysis passes, such as compute_live_intervals, we need
1396 * to loop over all the virtual GRFs. Compacting them can save a lot of
1397 * overhead.
1398 */
1399 void
1400 fs_visitor::compact_virtual_grfs()
1401 {
1402 /* Mark which virtual GRFs are used, and count how many. */
1403 int remap_table[this->virtual_grf_count];
1404 memset(remap_table, -1, sizeof(remap_table));
1405
1406 foreach_list(node, &this->instructions) {
1407 const fs_inst *inst = (const fs_inst *) node;
1408
1409 if (inst->dst.file == GRF)
1410 remap_table[inst->dst.reg] = 0;
1411
1412 for (int i = 0; i < 3; i++) {
1413 if (inst->src[i].file == GRF)
1414 remap_table[inst->src[i].reg] = 0;
1415 }
1416 }
1417
1418 /* In addition to registers used in instructions, fs_visitor keeps
1419 * direct references to certain special values which must be patched:
1420 */
1421 fs_reg *special[] = {
1422 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1423 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1424 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1425 &delta_x[0], &delta_x[1], &delta_x[2],
1426 &delta_x[3], &delta_x[4], &delta_x[5],
1427 &delta_y[0], &delta_y[1], &delta_y[2],
1428 &delta_y[3], &delta_y[4], &delta_y[5],
1429 };
1430 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1431 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1432
1433 /* Treat all special values as used, to be conservative */
1434 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1435 if (special[i]->file == GRF)
1436 remap_table[special[i]->reg] = 0;
1437 }
1438
1439 /* Compact the GRF arrays. */
1440 int new_index = 0;
1441 for (int i = 0; i < this->virtual_grf_count; i++) {
1442 if (remap_table[i] != -1) {
1443 remap_table[i] = new_index;
1444 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1445 if (live_intervals_valid) {
1446 virtual_grf_use[new_index] = virtual_grf_use[i];
1447 virtual_grf_def[new_index] = virtual_grf_def[i];
1448 }
1449 ++new_index;
1450 }
1451 }
1452
1453 this->virtual_grf_count = new_index;
1454
1455 /* Patch all the instructions to use the newly renumbered registers */
1456 foreach_list(node, &this->instructions) {
1457 fs_inst *inst = (fs_inst *) node;
1458
1459 if (inst->dst.file == GRF)
1460 inst->dst.reg = remap_table[inst->dst.reg];
1461
1462 for (int i = 0; i < 3; i++) {
1463 if (inst->src[i].file == GRF)
1464 inst->src[i].reg = remap_table[inst->src[i].reg];
1465 }
1466 }
1467
1468 /* Patch all the references to special values */
1469 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1470 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1471 special[i]->reg = remap_table[special[i]->reg];
1472 }
1473 }
1474
1475 bool
1476 fs_visitor::remove_dead_constants()
1477 {
1478 if (dispatch_width == 8) {
1479 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1480
1481 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1482 this->params_remap[i] = -1;
1483
1484 /* Find which params are still in use. */
1485 foreach_list(node, &this->instructions) {
1486 fs_inst *inst = (fs_inst *)node;
1487
1488 for (int i = 0; i < 3; i++) {
1489 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1490
1491 if (inst->src[i].file != UNIFORM)
1492 continue;
1493
1494 assert(constant_nr < (int)c->prog_data.nr_params);
1495
1496 /* For now, set this to non-negative. We'll give it the
1497 * actual new number in a moment, in order to keep the
1498 * register numbers nicely ordered.
1499 */
1500 this->params_remap[constant_nr] = 0;
1501 }
1502 }
1503
1504 /* Figure out what the new numbers for the params will be. At some
1505 * point when we're doing uniform array access, we're going to want
1506 * to keep the distinction between .reg and .reg_offset, but for
1507 * now we don't care.
1508 */
1509 unsigned int new_nr_params = 0;
1510 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1511 if (this->params_remap[i] != -1) {
1512 this->params_remap[i] = new_nr_params++;
1513 }
1514 }
1515
1516 /* Update the list of params to be uploaded to match our new numbering. */
1517 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1518 int remapped = this->params_remap[i];
1519
1520 if (remapped == -1)
1521 continue;
1522
1523 c->prog_data.param[remapped] = c->prog_data.param[i];
1524 }
1525
1526 c->prog_data.nr_params = new_nr_params;
1527 } else {
1528 /* This should have been generated in the 8-wide pass already. */
1529 assert(this->params_remap);
1530 }
1531
1532 /* Now do the renumbering of the shader to remove unused params. */
1533 foreach_list(node, &this->instructions) {
1534 fs_inst *inst = (fs_inst *)node;
1535
1536 for (int i = 0; i < 3; i++) {
1537 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1538
1539 if (inst->src[i].file != UNIFORM)
1540 continue;
1541
1542 assert(this->params_remap[constant_nr] != -1);
1543 inst->src[i].reg = this->params_remap[constant_nr];
1544 inst->src[i].reg_offset = 0;
1545 }
1546 }
1547
1548 return true;
1549 }
1550
1551 /*
1552 * Implements array access of uniforms by inserting a
1553 * PULL_CONSTANT_LOAD instruction.
1554 *
1555 * Unlike temporary GRF array access (where we don't support it due to
1556 * the difficulty of doing relative addressing on instruction
1557 * destinations), we could potentially do array access of uniforms
1558 * that were loaded in GRF space as push constants. In real-world
1559 * usage we've seen, though, the arrays being used are always larger
1560 * than we could load as push constants, so just always move all
1561 * uniform array access out to a pull constant buffer.
1562 */
1563 void
1564 fs_visitor::move_uniform_array_access_to_pull_constants()
1565 {
1566 int pull_constant_loc[c->prog_data.nr_params];
1567
1568 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1569 pull_constant_loc[i] = -1;
1570 }
1571
1572 /* Walk through and find array access of uniforms. Put a copy of that
1573 * uniform in the pull constant buffer.
1574 *
1575 * Note that we don't move constant-indexed accesses to arrays. No
1576 * testing has been done of the performance impact of this choice.
1577 */
1578 foreach_list_safe(node, &this->instructions) {
1579 fs_inst *inst = (fs_inst *)node;
1580
1581 for (int i = 0 ; i < 3; i++) {
1582 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1583 continue;
1584
1585 int uniform = inst->src[i].reg;
1586
1587 /* If this array isn't already present in the pull constant buffer,
1588 * add it.
1589 */
1590 if (pull_constant_loc[uniform] == -1) {
1591 const float **values = &c->prog_data.param[uniform];
1592
1593 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1594
1595 assert(param_size[uniform]);
1596
1597 for (int j = 0; j < param_size[uniform]; j++) {
1598 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1599 values[j];
1600 }
1601 }
1602
1603 /* Set up the annotation tracking for new generated instructions. */
1604 base_ir = inst->ir;
1605 current_annotation = inst->annotation;
1606
1607 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1608 fs_reg temp = fs_reg(this, glsl_type::float_type);
1609 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1610 surf_index,
1611 *inst->src[i].reladdr,
1612 pull_constant_loc[uniform] +
1613 inst->src[i].reg_offset);
1614 inst->insert_before(&list);
1615
1616 inst->src[i].file = temp.file;
1617 inst->src[i].reg = temp.reg;
1618 inst->src[i].reg_offset = temp.reg_offset;
1619 inst->src[i].reladdr = NULL;
1620 }
1621 }
1622 }
1623
1624 /**
1625 * Choose accesses from the UNIFORM file to demote to using the pull
1626 * constant buffer.
1627 *
1628 * We allow a fragment shader to have more than the specified minimum
1629 * maximum number of fragment shader uniform components (64). If
1630 * there are too many of these, they'd fill up all of register space.
1631 * So, this will push some of them out to the pull constant buffer and
1632 * update the program to load them.
1633 */
1634 void
1635 fs_visitor::setup_pull_constants()
1636 {
1637 /* Only allow 16 registers (128 uniform components) as push constants. */
1638 unsigned int max_uniform_components = 16 * 8;
1639 if (c->prog_data.nr_params <= max_uniform_components)
1640 return;
1641
1642 if (dispatch_width == 16) {
1643 fail("Pull constants not supported in 16-wide\n");
1644 return;
1645 }
1646
1647 /* Just demote the end of the list. We could probably do better
1648 * here, demoting things that are rarely used in the program first.
1649 */
1650 unsigned int pull_uniform_base = max_uniform_components;
1651
1652 int pull_constant_loc[c->prog_data.nr_params];
1653 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1654 if (i < pull_uniform_base) {
1655 pull_constant_loc[i] = -1;
1656 } else {
1657 pull_constant_loc[i] = -1;
1658 /* If our constant is already being uploaded for reladdr purposes,
1659 * reuse it.
1660 */
1661 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1662 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1663 pull_constant_loc[i] = j;
1664 break;
1665 }
1666 }
1667 if (pull_constant_loc[i] == -1) {
1668 int pull_index = c->prog_data.nr_pull_params++;
1669 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1670 pull_constant_loc[i] = pull_index;
1671 }
1672 }
1673 }
1674 c->prog_data.nr_params = pull_uniform_base;
1675
1676 foreach_list(node, &this->instructions) {
1677 fs_inst *inst = (fs_inst *)node;
1678
1679 for (int i = 0; i < 3; i++) {
1680 if (inst->src[i].file != UNIFORM)
1681 continue;
1682
1683 int pull_index = pull_constant_loc[inst->src[i].reg +
1684 inst->src[i].reg_offset];
1685 if (pull_index == -1)
1686 continue;
1687
1688 assert(!inst->src[i].reladdr);
1689
1690 fs_reg dst = fs_reg(this, glsl_type::float_type);
1691 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1692 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1693 fs_inst *pull =
1694 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1695 dst, index, offset);
1696 pull->ir = inst->ir;
1697 pull->annotation = inst->annotation;
1698
1699 inst->insert_before(pull);
1700
1701 inst->src[i].file = GRF;
1702 inst->src[i].reg = dst.reg;
1703 inst->src[i].reg_offset = 0;
1704 inst->src[i].smear = pull_index & 3;
1705 }
1706 }
1707 }
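/* Worked example of the demotion above: a uniform at pull_index = 6 is
 * loaded from the 16-byte-aligned byte offset (6 * 4) & ~15 = 16, and
 * smear = 6 & 3 = 2 then picks the third dword out of the vec4 that the
 * pull constant load returns.
 */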
1708
1709 bool
1710 fs_visitor::opt_algebraic()
1711 {
1712 bool progress = false;
1713
1714 foreach_list(node, &this->instructions) {
1715 fs_inst *inst = (fs_inst *)node;
1716
1717 switch (inst->opcode) {
1718 case BRW_OPCODE_MUL:
1719 if (inst->src[1].file != IMM)
1720 continue;
1721
1722 /* a * 1.0 = a */
1723 if (inst->src[1].is_one()) {
1724 inst->opcode = BRW_OPCODE_MOV;
1725 inst->src[1] = reg_undef;
1726 progress = true;
1727 break;
1728 }
1729
1730 /* a * 0.0 = 0.0 */
1731 if (inst->src[1].is_zero()) {
1732 inst->opcode = BRW_OPCODE_MOV;
1733 inst->src[0] = inst->src[1];
1734 inst->src[1] = reg_undef;
1735 progress = true;
1736 break;
1737 }
1738
1739 break;
1740 case BRW_OPCODE_ADD:
1741 if (inst->src[1].file != IMM)
1742 continue;
1743
1744 /* a + 0.0 = a */
1745 if (inst->src[1].is_zero()) {
1746 inst->opcode = BRW_OPCODE_MOV;
1747 inst->src[1] = reg_undef;
1748 progress = true;
1749 break;
1750 }
1751 break;
1752 default:
1753 break;
1754 }
1755 }
1756
1757 return progress;
1758 }
1759
1760 /**
1761 * Must be called after calculate_live_intervals() to remove unused
1762 * writes to registers -- register allocation will fail otherwise
1763 * because something def'd but not used won't be considered to
1764 * interfere with other regs.
1765 */
1766 bool
1767 fs_visitor::dead_code_eliminate()
1768 {
1769 bool progress = false;
1770 int pc = 0;
1771
1772 calculate_live_intervals();
1773
1774 foreach_list_safe(node, &this->instructions) {
1775 fs_inst *inst = (fs_inst *)node;
1776
1777 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1778 inst->remove();
1779 progress = true;
1780 }
1781
1782 pc++;
1783 }
1784
1785 if (progress)
1786 live_intervals_valid = false;
1787
1788 return progress;
1789 }
1790
1791 struct dead_code_hash_key
1792 {
1793 int vgrf;
1794 int reg_offset;
1795 };
1796
1797 static bool
1798 dead_code_hash_compare(const void *a, const void *b)
1799 {
1800 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1801 }
1802
1803 static void
1804 clear_dead_code_hash(struct hash_table *ht)
1805 {
1806 struct hash_entry *entry;
1807
1808 hash_table_foreach(ht, entry) {
1809 _mesa_hash_table_remove(ht, entry);
1810 }
1811 }
1812
1813 static void
1814 insert_dead_code_hash(struct hash_table *ht,
1815 int vgrf, int reg_offset, fs_inst *inst)
1816 {
1817 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1818 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1819
1820 key->vgrf = vgrf;
1821 key->reg_offset = reg_offset;
1822
1823 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1824 }
1825
1826 static struct hash_entry *
1827 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1828 {
1829 struct dead_code_hash_key key;
1830
1831 key.vgrf = vgrf;
1832 key.reg_offset = reg_offset;
1833
1834 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1835 }
1836
1837 static void
1838 remove_dead_code_hash(struct hash_table *ht,
1839 int vgrf, int reg_offset)
1840 {
1841 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1842 if (!entry)
1843 return;
1844
1845 _mesa_hash_table_remove(ht, entry);
1846 }
1847
1848 /**
1849 * Walks basic blocks, removing any regs that are written but not read before
1850 * being redefined.
1851 *
1852 * The dead_code_eliminate() function implements a global dead code
1853 * elimination, but it only handles the removing the last write to a register
1854 * if it's never read. This one can handle intermediate writes, but only
1855 * within a basic block.
1856 */
1857 bool
1858 fs_visitor::dead_code_eliminate_local()
1859 {
1860 struct hash_table *ht;
1861 bool progress = false;
1862
1863 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1864
1865 foreach_list_safe(node, &this->instructions) {
1866 fs_inst *inst = (fs_inst *)node;
1867
1868 /* At a basic block, empty the HT since we don't understand dataflow
1869 * here.
1870 */
1871 if (inst->is_control_flow()) {
1872 clear_dead_code_hash(ht);
1873 continue;
1874 }
1875
1876 /* Clear the HT of any instructions that got read. */
1877 for (int i = 0; i < 3; i++) {
1878 fs_reg src = inst->src[i];
1879 if (src.file != GRF)
1880 continue;
1881
1882 int read = 1;
1883 if (inst->is_send_from_grf())
1884 read = virtual_grf_sizes[src.reg] - src.reg_offset;
1885
1886 for (int reg_offset = src.reg_offset;
1887 reg_offset < src.reg_offset + read;
1888 reg_offset++) {
1889 remove_dead_code_hash(ht, src.reg, reg_offset);
1890 }
1891 }
1892
1893 /* Add any update of a GRF to the HT, removing a previous write if it
1894 * wasn't read.
1895 */
1896 if (inst->dst.file == GRF) {
1897 if (inst->regs_written > 1) {
1898 /* We don't know how to trim channels from an instruction's
1899 * writes, so we can't incrementally remove unread channels from
1900 * it. Just remove whatever it overwrites from the table.
1901 */
1902 for (int i = 0; i < inst->regs_written; i++) {
1903 remove_dead_code_hash(ht,
1904 inst->dst.reg,
1905 inst->dst.reg_offset + i);
1906 }
1907 } else {
1908 struct hash_entry *entry =
1909 get_dead_code_hash_entry(ht, inst->dst.reg,
1910 inst->dst.reg_offset);
1911
1912 if (inst->is_partial_write()) {
1913 /* For a partial write, we can't remove any previous dead code
1914 * candidate, since we're just modifying their result, but we can
1915 * be dead code eliminiated ourselves.
1916 */
1917 if (entry) {
1918 entry->data = inst;
1919 } else {
1920 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1921 inst);
1922 }
1923 } else {
1924 if (entry) {
1925 /* We're completely updating a channel, and there was a
1926 * previous write to the channel that wasn't read. Kill it!
1927 */
1928 fs_inst *inst = (fs_inst *)entry->data;
1929 inst->remove();
1930 progress = true;
1931 _mesa_hash_table_remove(ht, entry);
1932 }
1933
1934 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1935 inst);
1936 }
1937 }
1938 }
1939 }
1940
1941 _mesa_hash_table_destroy(ht, NULL);
1942
1943 if (progress)
1944 live_intervals_valid = false;
1945
1946 return progress;
1947 }
1948
1949 /**
1950 * Implements a second type of register coalescing: This one checks if
1951 * the two regs involved in a raw move don't interfere, in which case
1952  * they can both be stored in the same place and the MOV removed.
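      *
      * For example, in a hypothetical case:
      *
      *    mov vgrf6, vgrf5
      *
      * if vgrf5 occupies a single register and never interferes with vgrf6,
      * every reference to vgrf5 is rewritten to vgrf6 and the MOV is removed.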
1953 */
1954 bool
1955 fs_visitor::register_coalesce_2()
1956 {
1957 bool progress = false;
1958
1959 calculate_live_intervals();
1960
1961 foreach_list_safe(node, &this->instructions) {
1962 fs_inst *inst = (fs_inst *)node;
1963
1964 if (inst->opcode != BRW_OPCODE_MOV ||
1965 inst->is_partial_write() ||
1966 inst->saturate ||
1967 inst->src[0].file != GRF ||
1968 inst->src[0].negate ||
1969 inst->src[0].abs ||
1970 inst->src[0].smear != -1 ||
1971 inst->dst.file != GRF ||
1972 inst->dst.type != inst->src[0].type ||
1973 virtual_grf_sizes[inst->src[0].reg] != 1 ||
1974 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1975 continue;
1976 }
1977
1978 int reg_from = inst->src[0].reg;
1979 assert(inst->src[0].reg_offset == 0);
1980 int reg_to = inst->dst.reg;
1981 int reg_to_offset = inst->dst.reg_offset;
1982
1983 foreach_list(node, &this->instructions) {
1984 fs_inst *scan_inst = (fs_inst *)node;
1985
1986 if (scan_inst->dst.file == GRF &&
1987 scan_inst->dst.reg == reg_from) {
1988 scan_inst->dst.reg = reg_to;
1989 scan_inst->dst.reg_offset = reg_to_offset;
1990 }
1991 for (int i = 0; i < 3; i++) {
1992 if (scan_inst->src[i].file == GRF &&
1993 scan_inst->src[i].reg == reg_from) {
1994 scan_inst->src[i].reg = reg_to;
1995 scan_inst->src[i].reg_offset = reg_to_offset;
1996 }
1997 }
1998 }
1999
2000 inst->remove();
2001
2002 /* We don't need to recalculate live intervals inside the loop despite
2003        * clearing live_intervals_valid because we only use live intervals for
2004 * the interferes test, and we must have had a situation where the
2005 * intervals were:
2006 *
2007 * from to
2008 * ^
2009 * |
2010 * v
2011       *        ^
2012       *        |
2013       *        v
2014 *
2015 * Some register R that might get coalesced with one of these two could
2016 * only be referencing "to", otherwise "from"'s range would have been
2017 * longer. R's range could also only start at the end of "to" or later,
2018 * otherwise it will conflict with "to" when we try to coalesce "to"
2019        * into R anyway.
2020 */
2021 live_intervals_valid = false;
2022
2023 progress = true;
2024 continue;
2025 }
2026
2027 return progress;
2028 }
2029
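     /**
      * Attempts to remove a raw MOV from a GRF or uniform into a GRF by
      * rewriting later reads of the MOV's destination to read its source
      * directly, provided neither register is overwritten afterwards.
      *
      * For example (hypothetical):
      *
      *    mov vgrf7, vgrf2
      *    add vgrf9, vgrf7, vgrf8    <- becomes: add vgrf9, vgrf2, vgrf8
      */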
2030 bool
2031 fs_visitor::register_coalesce()
2032 {
2033 bool progress = false;
2034 int if_depth = 0;
2035 int loop_depth = 0;
2036
2037 foreach_list_safe(node, &this->instructions) {
2038 fs_inst *inst = (fs_inst *)node;
2039
2040 /* Make sure that we dominate the instructions we're going to
2041        * scan for interference with our coalescing, or we won't have
2042        * scanned enough to see if anything interferes with it.  We don't
2043        * dominate the following instructions if we're in a loop or an if
2044        * block.
2045 */
2046 switch (inst->opcode) {
2047 case BRW_OPCODE_DO:
2048 loop_depth++;
2049 break;
2050 case BRW_OPCODE_WHILE:
2051 loop_depth--;
2052 break;
2053 case BRW_OPCODE_IF:
2054 if_depth++;
2055 break;
2056 case BRW_OPCODE_ENDIF:
2057 if_depth--;
2058 break;
2059 default:
2060 break;
2061 }
2062 if (loop_depth || if_depth)
2063 continue;
2064
2065 if (inst->opcode != BRW_OPCODE_MOV ||
2066 inst->is_partial_write() ||
2067 inst->saturate ||
2068 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2069                                   inst->src[0].file != UNIFORM) ||
2070 inst->dst.type != inst->src[0].type)
2071 continue;
2072
2073 bool has_source_modifiers = (inst->src[0].abs ||
2074 inst->src[0].negate ||
2075 inst->src[0].smear != -1 ||
2076 inst->src[0].file == UNIFORM);
2077
2078       /* Found a move of a GRF or uniform to a GRF.  Let's see if we can
2079        * coalesce them: check for no writes to either one until the exit
2080        * of the program.
2081 */
2082 bool interfered = false;
2083
2084 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2085 !scan_inst->is_tail_sentinel();
2086 scan_inst = (fs_inst *)scan_inst->next) {
2087 if (scan_inst->dst.file == GRF) {
2088 if (scan_inst->overwrites_reg(inst->dst) ||
2089 scan_inst->overwrites_reg(inst->src[0])) {
2090 interfered = true;
2091 break;
2092 }
2093 }
2094
2095 /* The gen6 MATH instruction can't handle source modifiers or
2096 * unusual register regions, so avoid coalescing those for
2097 * now. We should do something more specific.
2098 */
2099 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2100 interfered = true;
2101 break;
2102 }
2103
2104 /* The accumulator result appears to get used for the
2105 * conditional modifier generation. When negating a UD
2106 * value, there is a 33rd bit generated for the sign in the
2107 * accumulator value, so now you can't check, for example,
2108 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2109 */
2110 if (scan_inst->conditional_mod &&
2111 inst->src[0].negate &&
2112 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2113 interfered = true;
2114 break;
2115 }
2116 }
2117 if (interfered) {
2118 continue;
2119 }
2120
2121 /* Rewrite the later usage to point at the source of the move to
2122 * be removed.
2123 */
2124 for (fs_inst *scan_inst = inst;
2125 !scan_inst->is_tail_sentinel();
2126 scan_inst = (fs_inst *)scan_inst->next) {
2127 for (int i = 0; i < 3; i++) {
2128 if (scan_inst->src[i].file == GRF &&
2129 scan_inst->src[i].reg == inst->dst.reg &&
2130 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2131 fs_reg new_src = inst->src[0];
2132 if (scan_inst->src[i].abs) {
2133 new_src.negate = 0;
2134 new_src.abs = 1;
2135 }
2136 new_src.negate ^= scan_inst->src[i].negate;
2137 scan_inst->src[i] = new_src;
2138 }
2139 }
2140 }
2141
2142 inst->remove();
2143 progress = true;
2144 }
2145
2146 if (progress)
2147 live_intervals_valid = false;
2148
2149 return progress;
2150 }
2151
2152
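     /**
      * Looks for a MOV of a GRF into an MRF where the GRF is not read again
      * afterwards, and retargets the instruction that produced the GRF value
      * to write the MRF directly, removing the MOV.
      *
      * For example (hypothetical):
      *
      *    add vgrf4, vgrf2, vgrf3    <- becomes: add m2, vgrf2, vgrf3
      *    mov m2, vgrf4              <- removed
      */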
2153 bool
2154 fs_visitor::compute_to_mrf()
2155 {
2156 bool progress = false;
2157 int next_ip = 0;
2158
2159 calculate_live_intervals();
2160
2161 foreach_list_safe(node, &this->instructions) {
2162 fs_inst *inst = (fs_inst *)node;
2163
2164 int ip = next_ip;
2165 next_ip++;
2166
2167 if (inst->opcode != BRW_OPCODE_MOV ||
2168 inst->is_partial_write() ||
2169 inst->dst.file != MRF || inst->src[0].file != GRF ||
2170 inst->dst.type != inst->src[0].type ||
2171 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2172 continue;
2173
2174 /* Work out which hardware MRF registers are written by this
2175 * instruction.
2176 */
2177 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2178 int mrf_high;
2179 if (inst->dst.reg & BRW_MRF_COMPR4) {
2180 mrf_high = mrf_low + 4;
2181 } else if (dispatch_width == 16 &&
2182 (!inst->force_uncompressed && !inst->force_sechalf)) {
2183 mrf_high = mrf_low + 1;
2184 } else {
2185 mrf_high = mrf_low;
2186 }
2187
2188 /* Can't compute-to-MRF this GRF if someone else was going to
2189 * read it later.
2190 */
2191 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2192 continue;
2193
2194 /* Found a move of a GRF to a MRF. Let's see if we can go
2195 * rewrite the thing that made this GRF to write into the MRF.
2196 */
2197 fs_inst *scan_inst;
2198 for (scan_inst = (fs_inst *)inst->prev;
2199 scan_inst->prev != NULL;
2200 scan_inst = (fs_inst *)scan_inst->prev) {
2201 if (scan_inst->dst.file == GRF &&
2202 scan_inst->dst.reg == inst->src[0].reg) {
2203             /* Found the last instruction to write the reg we want to
2204              * turn into a compute-to-MRF.
2205 */
2206
2207 /* If this one instruction didn't populate all the
2208 * channels, bail. We might be able to rewrite everything
2209 * that writes that reg, but it would require smarter
2210 * tracking to delay the rewriting until complete success.
2211 */
2212 if (scan_inst->is_partial_write())
2213 break;
2214
2215 /* Things returning more than one register would need us to
2216 * understand coalescing out more than one MOV at a time.
2217 */
2218 if (scan_inst->regs_written > 1)
2219 break;
2220
2221 /* SEND instructions can't have MRF as a destination. */
2222 if (scan_inst->mlen)
2223 break;
2224
2225 if (intel->gen == 6) {
2226 /* gen6 math instructions must have the destination be
2227 * GRF, so no compute-to-MRF for them.
2228 */
2229 if (scan_inst->is_math()) {
2230 break;
2231 }
2232 }
2233
2234 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2235 /* Found the creator of our MRF's source value. */
2236 scan_inst->dst.file = MRF;
2237 scan_inst->dst.reg = inst->dst.reg;
2238 scan_inst->saturate |= inst->saturate;
2239 inst->remove();
2240 progress = true;
2241 }
2242 break;
2243 }
2244
2245 /* We don't handle control flow here. Most computation of
2246        * values that end up in MRFs is done shortly before the MRF
2247 * write anyway.
2248 */
2249 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2250 break;
2251
2252 /* You can't read from an MRF, so if someone else reads our
2253 * MRF's source GRF that we wanted to rewrite, that stops us.
2254 */
2255 bool interfered = false;
2256 for (int i = 0; i < 3; i++) {
2257 if (scan_inst->src[i].file == GRF &&
2258 scan_inst->src[i].reg == inst->src[0].reg &&
2259 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2260 interfered = true;
2261 }
2262 }
2263 if (interfered)
2264 break;
2265
2266 if (scan_inst->dst.file == MRF) {
2267 /* If somebody else writes our MRF here, we can't
2268 * compute-to-MRF before that.
2269 */
2270 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2271 int scan_mrf_high;
2272
2273 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2274 scan_mrf_high = scan_mrf_low + 4;
2275 } else if (dispatch_width == 16 &&
2276 (!scan_inst->force_uncompressed &&
2277 !scan_inst->force_sechalf)) {
2278 scan_mrf_high = scan_mrf_low + 1;
2279 } else {
2280 scan_mrf_high = scan_mrf_low;
2281 }
2282
2283 if (mrf_low == scan_mrf_low ||
2284 mrf_low == scan_mrf_high ||
2285 mrf_high == scan_mrf_low ||
2286 mrf_high == scan_mrf_high) {
2287 break;
2288 }
2289 }
2290
2291 if (scan_inst->mlen > 0) {
2292 /* Found a SEND instruction, which means that there are
2293 * live values in MRFs from base_mrf to base_mrf +
2294 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2295 * above it.
2296 */
2297 if (mrf_low >= scan_inst->base_mrf &&
2298 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2299 break;
2300 }
2301 if (mrf_high >= scan_inst->base_mrf &&
2302 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2303 break;
2304 }
2305 }
2306 }
2307 }
2308
2309 if (progress)
2310 live_intervals_valid = false;
2311
2312 return progress;
2313 }
2314
2315 /**
2316 * Walks through basic blocks, looking for repeated MRF writes and
2317 * removing the later ones.
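      *
      * For example (hypothetical):
      *
      *    mov m3, vgrf5
      *    ...                  <- nothing writes m3 or vgrf5 in between
      *    mov m3, vgrf5        <- removed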
2318 */
2319 bool
2320 fs_visitor::remove_duplicate_mrf_writes()
2321 {
2322 fs_inst *last_mrf_move[16];
2323 bool progress = false;
2324
2325    /* We'd need to update the MRF tracking for compressed instructions. */
2326 if (dispatch_width == 16)
2327 return false;
2328
2329 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2330
2331 foreach_list_safe(node, &this->instructions) {
2332 fs_inst *inst = (fs_inst *)node;
2333
2334 if (inst->is_control_flow()) {
2335 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2336 }
2337
2338 if (inst->opcode == BRW_OPCODE_MOV &&
2339 inst->dst.file == MRF) {
2340 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2341 if (prev_inst && inst->equals(prev_inst)) {
2342 inst->remove();
2343 progress = true;
2344 continue;
2345 }
2346 }
2347
2348 /* Clear out the last-write records for MRFs that were overwritten. */
2349 if (inst->dst.file == MRF) {
2350 last_mrf_move[inst->dst.reg] = NULL;
2351 }
2352
2353 if (inst->mlen > 0) {
2354 /* Found a SEND instruction, which will include two or fewer
2355 * implied MRF writes. We could do better here.
2356 */
2357 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2358 last_mrf_move[inst->base_mrf + i] = NULL;
2359 }
2360 }
2361
2362 /* Clear out any MRF move records whose sources got overwritten. */
2363 if (inst->dst.file == GRF) {
2364 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2365 if (last_mrf_move[i] &&
2366 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2367 last_mrf_move[i] = NULL;
2368 }
2369 }
2370 }
2371
2372 if (inst->opcode == BRW_OPCODE_MOV &&
2373 inst->dst.file == MRF &&
2374 inst->src[0].file == GRF &&
2375 !inst->is_partial_write()) {
2376 last_mrf_move[inst->dst.reg] = inst;
2377 }
2378 }
2379
2380 if (progress)
2381 live_intervals_valid = false;
2382
2383 return progress;
2384 }
2385
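     /**
      * Clears the dependency flag for any GRF in [first_grf, first_grf +
      * grf_len) that the given instruction reads, since reading the register
      * satisfies the outstanding dependency being tracked.  For 16-wide
      * instructions the second register of the pair is cleared as well.
      */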
2386 static void
2387 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2388 int first_grf, int grf_len)
2389 {
2390 bool inst_16wide = (dispatch_width > 8 &&
2391 !inst->force_uncompressed &&
2392 !inst->force_sechalf);
2393
2394 /* Clear the flag for registers that actually got read (as expected). */
2395 for (int i = 0; i < 3; i++) {
2396 int grf;
2397 if (inst->src[i].file == GRF) {
2398 grf = inst->src[i].reg;
2399 } else if (inst->src[i].file == FIXED_HW_REG &&
2400 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2401 grf = inst->src[i].fixed_hw_reg.nr;
2402 } else {
2403 continue;
2404 }
2405
2406 if (grf >= first_grf &&
2407 grf < first_grf + grf_len) {
2408 deps[grf - first_grf] = false;
2409 if (inst_16wide)
2410 deps[grf - first_grf + 1] = false;
2411 }
2412 }
2413 }
2414
2415 /**
2416 * Implements this workaround for the original 965:
2417 *
2418 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2419 * check for post destination dependencies on this instruction, software
2420 * must ensure that there is no destination hazard for the case of ‘write
2421 * followed by a posted write’ shown in the following example.
2422 *
2423 * 1. mov r3 0
2424 * 2. send r3.xy <rest of send instruction>
2425 * 3. mov r2 r3
2426 *
2427 * Due to no post-destination dependency check on the ‘send’, the above
2428 * code sequence could have two instructions (1 and 2) in flight at the
2429 * same time that both consider ‘r3’ as the target of their final writes.
2430 */
2431 void
2432 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2433 {
2434 int reg_size = dispatch_width / 8;
2435 int write_len = inst->regs_written * reg_size;
2436 int first_write_grf = inst->dst.reg;
2437 bool needs_dep[BRW_MAX_MRF];
2438 assert(write_len < (int)sizeof(needs_dep) - 1);
2439
2440 memset(needs_dep, false, sizeof(needs_dep));
2441 memset(needs_dep, true, write_len);
2442
2443 clear_deps_for_inst_src(inst, dispatch_width,
2444 needs_dep, first_write_grf, write_len);
2445
2446 /* Walk backwards looking for writes to registers we're writing which
2447 * aren't read since being written. If we hit the start of the program,
2448 * we assume that there are no outstanding dependencies on entry to the
2449 * program.
2450 */
2451 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2452 scan_inst != NULL;
2453 scan_inst = (fs_inst *)scan_inst->prev) {
2454
2455 /* If we hit control flow, assume that there *are* outstanding
2456 * dependencies, and force their cleanup before our instruction.
2457 */
2458 if (scan_inst->is_control_flow()) {
2459 for (int i = 0; i < write_len; i++) {
2460 if (needs_dep[i]) {
2461 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2462 }
2463 }
2464 return;
2465 }
2466
2467 bool scan_inst_16wide = (dispatch_width > 8 &&
2468 !scan_inst->force_uncompressed &&
2469 !scan_inst->force_sechalf);
2470
2471 /* We insert our reads as late as possible on the assumption that any
2472 * instruction but a MOV that might have left us an outstanding
2473 * dependency has more latency than a MOV.
2474 */
2475 if (scan_inst->dst.file == GRF) {
2476 for (int i = 0; i < scan_inst->regs_written; i++) {
2477 int reg = scan_inst->dst.reg + i * reg_size;
2478
2479 if (reg >= first_write_grf &&
2480 reg < first_write_grf + write_len &&
2481 needs_dep[reg - first_write_grf]) {
2482 inst->insert_before(DEP_RESOLVE_MOV(reg));
2483 needs_dep[reg - first_write_grf] = false;
2484 if (scan_inst_16wide)
2485 needs_dep[reg - first_write_grf + 1] = false;
2486 }
2487 }
2488 }
2489
2490 /* Clear the flag for registers that actually got read (as expected). */
2491 clear_deps_for_inst_src(scan_inst, dispatch_width,
2492 needs_dep, first_write_grf, write_len);
2493
2494 /* Continue the loop only if we haven't resolved all the dependencies */
2495 int i;
2496 for (i = 0; i < write_len; i++) {
2497 if (needs_dep[i])
2498 break;
2499 }
2500 if (i == write_len)
2501 return;
2502 }
2503 }
2504
2505 /**
2506 * Implements this workaround for the original 965:
2507 *
2508 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2509 * used as a destination register until after it has been sourced by an
2510  *   instruction with a different destination register."
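      *
      * We resolve this by inserting a dependency-clearing MOV (DEP_RESOLVE_MOV)
      * on the affected register ahead of any later instruction that would reuse
      * it as a destination before it has been sourced.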
2511 */
2512 void
2513 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2514 {
2515 int write_len = inst->regs_written * dispatch_width / 8;
2516 int first_write_grf = inst->dst.reg;
2517 bool needs_dep[BRW_MAX_MRF];
2518 assert(write_len < (int)sizeof(needs_dep) - 1);
2519
2520 memset(needs_dep, false, sizeof(needs_dep));
2521 memset(needs_dep, true, write_len);
2522 /* Walk forwards looking for writes to registers we're writing which aren't
2523 * read before being written.
2524 */
2525 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2526 !scan_inst->is_tail_sentinel();
2527 scan_inst = (fs_inst *)scan_inst->next) {
2528 /* If we hit control flow, force resolve all remaining dependencies. */
2529 if (scan_inst->is_control_flow()) {
2530 for (int i = 0; i < write_len; i++) {
2531 if (needs_dep[i])
2532 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2533 }
2534 return;
2535 }
2536
2537 /* Clear the flag for registers that actually got read (as expected). */
2538 clear_deps_for_inst_src(scan_inst, dispatch_width,
2539 needs_dep, first_write_grf, write_len);
2540
2541 /* We insert our reads as late as possible since they're reading the
2542 * result of a SEND, which has massive latency.
2543 */
2544 if (scan_inst->dst.file == GRF &&
2545 scan_inst->dst.reg >= first_write_grf &&
2546 scan_inst->dst.reg < first_write_grf + write_len &&
2547 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2548 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2549 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2550 }
2551
2552 /* Continue the loop only if we haven't resolved all the dependencies */
2553 int i;
2554 for (i = 0; i < write_len; i++) {
2555 if (needs_dep[i])
2556 break;
2557 }
2558 if (i == write_len)
2559 return;
2560 }
2561
2562 /* If we hit the end of the program, resolve all remaining dependencies out
2563 * of paranoia.
2564 */
2565 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2566 assert(last_inst->eot);
2567 for (int i = 0; i < write_len; i++) {
2568 if (needs_dep[i])
2569 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2570 }
2571 }
2572
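     /**
      * Applies both gen4 SEND dependency workarounds above to every
      * message-sending instruction that writes a GRF.
      */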
2573 void
2574 fs_visitor::insert_gen4_send_dependency_workarounds()
2575 {
2576 if (intel->gen != 4 || intel->is_g4x)
2577 return;
2578
2579 /* Note that we're done with register allocation, so GRF fs_regs always
2580 * have a .reg_offset of 0.
2581 */
2582
2583 foreach_list_safe(node, &this->instructions) {
2584 fs_inst *inst = (fs_inst *)node;
2585
2586 if (inst->mlen != 0 && inst->dst.file == GRF) {
2587 insert_gen4_pre_send_dependency_workarounds(inst);
2588 insert_gen4_post_send_dependency_workarounds(inst);
2589 }
2590 }
2591 }
2592
2593 /**
2594 * Turns the generic expression-style uniform pull constant load instruction
2595 * into a hardware-specific series of instructions for loading a pull
2596 * constant.
2597 *
2598 * The expression style allows the CSE pass before this to optimize out
2599 * repeated loads from the same offset, and gives the pre-register-allocation
2600 * scheduling full flexibility, while the conversion to native instructions
2601 * allows the post-register-allocation scheduler the best information
2602 * possible.
2603 *
2604 * Note that execution masking for setting up pull constant loads is special:
2605 * the channels that need to be written are unrelated to the current execution
2606 * mask, since a later instruction will use one of the result channels as a
2607 * source operand for all 8 or 16 of its channels.
2608 */
2609 void
2610 fs_visitor::lower_uniform_pull_constant_loads()
2611 {
2612 foreach_list(node, &this->instructions) {
2613 fs_inst *inst = (fs_inst *)node;
2614
2615 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2616 continue;
2617
2618 if (intel->gen >= 7) {
2619 /* The offset arg before was a vec4-aligned byte offset. We need to
2620 * turn it into a dword offset.
2621 */
2622 fs_reg const_offset_reg = inst->src[1];
2623 assert(const_offset_reg.file == IMM &&
2624 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2625 const_offset_reg.imm.u /= 4;
2626 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2627
2628 /* This is actually going to be a MOV, but since only the first dword
2629 * is accessed, we have a special opcode to do just that one. Note
2630 * that this needs to be an operation that will be considered a def
2631 * by live variable analysis, or register allocation will explode.
2632 */
2633 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2634 payload, const_offset_reg);
2635 setup->force_writemask_all = true;
2636
2637 setup->ir = inst->ir;
2638 setup->annotation = inst->annotation;
2639 inst->insert_before(setup);
2640
2641 /* Similarly, this will only populate the first 4 channels of the
2642 * result register (since we only use smear values from 0-3), but we
2643 * don't tell the optimizer.
2644 */
2645 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2646 inst->src[1] = payload;
2647
2648 this->live_intervals_valid = false;
2649 } else {
2650 /* Before register allocation, we didn't tell the scheduler about the
2651 * MRF we use. We know it's safe to use this MRF because nothing
2652 * else does except for register spill/unspill, which generates and
2653 * uses its MRF within a single IR instruction.
2654 */
2655 inst->base_mrf = 14;
2656 inst->mlen = 1;
2657 }
2658 }
2659 }
2660
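     /**
      * Prints a single IR instruction in a human-readable form, producing
      * (hypothetically) something along the lines of:
      *
      *    (+f0.0) add.sat vgrf7, vgrf3+1, u1, (null)
      */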
2661 void
2662 fs_visitor::dump_instruction(fs_inst *inst)
2663 {
2664 if (inst->predicate) {
2665 printf("(%cf0.%d) ",
2666 inst->predicate_inverse ? '-' : '+',
2667 inst->flag_subreg);
2668 }
2669
2670 printf("%s", brw_instruction_name(inst->opcode));
2671 if (inst->saturate)
2672 printf(".sat");
2673 if (inst->conditional_mod) {
2674 printf(".cmod");
2675 if (!inst->predicate &&
2676 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2677 inst->opcode != BRW_OPCODE_IF &&
2678 inst->opcode != BRW_OPCODE_WHILE))) {
2679          printf(".f0.%d", inst->flag_subreg);
2680 }
2681 }
2682 printf(" ");
2683
2684
2685 switch (inst->dst.file) {
2686 case GRF:
2687 printf("vgrf%d", inst->dst.reg);
2688 if (inst->dst.reg_offset)
2689 printf("+%d", inst->dst.reg_offset);
2690 break;
2691 case MRF:
2692 printf("m%d", inst->dst.reg);
2693 break;
2694 case BAD_FILE:
2695 printf("(null)");
2696 break;
2697 case UNIFORM:
2698 printf("***u%d***", inst->dst.reg);
2699 break;
2700 default:
2701 printf("???");
2702 break;
2703 }
2704 printf(", ");
2705
2706 for (int i = 0; i < 3; i++) {
2707 if (inst->src[i].negate)
2708 printf("-");
2709 if (inst->src[i].abs)
2710 printf("|");
2711 switch (inst->src[i].file) {
2712 case GRF:
2713 printf("vgrf%d", inst->src[i].reg);
2714 if (inst->src[i].reg_offset)
2715 printf("+%d", inst->src[i].reg_offset);
2716 break;
2717 case MRF:
2718 printf("***m%d***", inst->src[i].reg);
2719 break;
2720 case UNIFORM:
2721 printf("u%d", inst->src[i].reg);
2722 if (inst->src[i].reg_offset)
2723 printf(".%d", inst->src[i].reg_offset);
2724 break;
2725 case BAD_FILE:
2726 printf("(null)");
2727 break;
2728 case IMM:
2729 switch (inst->src[i].type) {
2730 case BRW_REGISTER_TYPE_F:
2731 printf("%ff", inst->src[i].imm.f);
2732 break;
2733 case BRW_REGISTER_TYPE_D:
2734 printf("%dd", inst->src[i].imm.i);
2735 break;
2736 case BRW_REGISTER_TYPE_UD:
2737 printf("%uu", inst->src[i].imm.u);
2738 break;
2739 default:
2740 printf("???");
2741 break;
2742 }
2743 break;
2744 default:
2745 printf("???");
2746 break;
2747 }
2748 if (inst->src[i].abs)
2749 printf("|");
2750
2751       if (i < 2)
2752 printf(", ");
2753 }
2754
2755 printf(" ");
2756
2757 if (inst->force_uncompressed)
2758 printf("1sthalf ");
2759
2760 if (inst->force_sechalf)
2761 printf("2ndhalf ");
2762
2763 printf("\n");
2764 }
2765
2766 void
2767 fs_visitor::dump_instructions()
2768 {
2769 int ip = 0;
2770 foreach_list(node, &this->instructions) {
2771 fs_inst *inst = (fs_inst *)node;
2772 printf("%d: ", ip++);
2773 dump_instruction(inst);
2774 }
2775 }
2776
2777 /**
2778  * Possibly returns an instruction that set up the given reg.
2779 *
2780 * Sometimes we want to take the result of some expression/variable
2781 * dereference tree and rewrite the instruction generating the result
2782 * of the tree. When processing the tree, we know that the
2783 * instructions generated are all writing temporaries that are dead
2784 * outside of this tree. So, if we have some instructions that write
2785 * a temporary, we're free to point that temp write somewhere else.
2786 *
2787  * Note that this doesn't guarantee that the returned instruction wrote
2788  * only reg -- it might be the size=4 destination of a texture instruction.
2789 */
2790 fs_inst *
2791 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2792 fs_inst *end,
2793 fs_reg reg)
2794 {
2795 if (end == start ||
2796 end->is_partial_write() ||
2797 reg.reladdr ||
2798 !reg.equals(end->dst)) {
2799 return NULL;
2800 } else {
2801 return end;
2802 }
2803 }
2804
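     /**
      * Records where each piece of the gen6+ fragment shader thread payload
      * (barycentric coordinates, interpolated source depth and W) will be
      * delivered, based on the dispatch width and which inputs the program
      * actually uses.
      */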
2805 void
2806 fs_visitor::setup_payload_gen6()
2807 {
2808 bool uses_depth =
2809 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2810 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2811
2812 assert(intel->gen >= 6);
2813
2814 /* R0-1: masks, pixel X/Y coordinates. */
2815 c->nr_payload_regs = 2;
2816    /* R2: only for 32-pixel dispatch. */
2817
2818 /* R3-26: barycentric interpolation coordinates. These appear in the
2819 * same order that they appear in the brw_wm_barycentric_interp_mode
2820 * enum. Each set of coordinates occupies 2 registers if dispatch width
2821 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2822 * appear if they were enabled using the "Barycentric Interpolation
2823 * Mode" bits in WM_STATE.
2824 */
2825 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2826 if (barycentric_interp_modes & (1 << i)) {
2827 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2828 c->nr_payload_regs += 2;
2829 if (dispatch_width == 16) {
2830 c->nr_payload_regs += 2;
2831 }
2832 }
2833 }
2834
2835 /* R27: interpolated depth if uses source depth */
2836 if (uses_depth) {
2837 c->source_depth_reg = c->nr_payload_regs;
2838 c->nr_payload_regs++;
2839 if (dispatch_width == 16) {
2840 /* R28: interpolated depth if not 8-wide. */
2841 c->nr_payload_regs++;
2842 }
2843 }
2844 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2845 if (uses_depth) {
2846 c->source_w_reg = c->nr_payload_regs;
2847 c->nr_payload_regs++;
2848 if (dispatch_width == 16) {
2849 /* R30: interpolated W if not 8-wide. */
2850 c->nr_payload_regs++;
2851 }
2852 }
2853 /* R31: MSAA position offsets. */
2854 /* R32-: bary for 32-pixel. */
2855 /* R58-59: interp W for 32-pixel. */
2856
2857 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2858 c->source_depth_to_render_target = true;
2859 }
2860 }
2861
2862 bool
2863 fs_visitor::run()
2864 {
2865 sanity_param_count = fp->Base.Parameters->NumParameters;
2866 uint32_t orig_nr_params = c->prog_data.nr_params;
2867
2868 if (intel->gen >= 6)
2869 setup_payload_gen6();
2870 else
2871 setup_payload_gen4();
2872
2873 if (0) {
2874 emit_dummy_fs();
2875 } else {
2876 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2877 emit_shader_time_begin();
2878
2879 calculate_urb_setup();
2880 if (intel->gen < 6)
2881 emit_interpolation_setup_gen4();
2882 else
2883 emit_interpolation_setup_gen6();
2884
2885 /* We handle discards by keeping track of the still-live pixels in f0.1.
2886 * Initialize it with the dispatched pixels.
2887 */
2888 if (fp->UsesKill) {
2889 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2890 discard_init->flag_subreg = 1;
2891 }
2892
2893 /* Generate FS IR for main(). (the visitor only descends into
2894 * functions called "main").
2895 */
2896 if (shader) {
2897 foreach_list(node, &*shader->ir) {
2898 ir_instruction *ir = (ir_instruction *)node;
2899 base_ir = ir;
2900 this->result = reg_undef;
2901 ir->accept(this);
2902 }
2903 } else {
2904 emit_fragment_program_code();
2905 }
2906 base_ir = NULL;
2907 if (failed)
2908 return false;
2909
2910 emit(FS_OPCODE_PLACEHOLDER_HALT);
2911
2912 emit_fb_writes();
2913
2914 split_virtual_grfs();
2915
2916 move_uniform_array_access_to_pull_constants();
2917 setup_pull_constants();
2918
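           /* Iterate the optimization passes until none of them makes any
            * further progress, since each pass can expose new opportunities
            * for the others.
            */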
2919 bool progress;
2920 do {
2921 progress = false;
2922
2923 compact_virtual_grfs();
2924
2925 progress = remove_duplicate_mrf_writes() || progress;
2926
2927 progress = opt_algebraic() || progress;
2928 progress = opt_cse() || progress;
2929 progress = opt_copy_propagate() || progress;
2930 progress = dead_code_eliminate() || progress;
2931 progress = dead_code_eliminate_local() || progress;
2932 progress = register_coalesce() || progress;
2933 progress = register_coalesce_2() || progress;
2934 progress = compute_to_mrf() || progress;
2935 } while (progress);
2936
2937 remove_dead_constants();
2938
2939 schedule_instructions(false);
2940
2941 lower_uniform_pull_constant_loads();
2942
2943 assign_curb_setup();
2944 assign_urb_setup();
2945
2946 if (0) {
2947 /* Debug of register spilling: Go spill everything. */
2948 for (int i = 0; i < virtual_grf_count; i++) {
2949 spill_reg(i);
2950 }
2951 }
2952
2953 if (0)
2954 assign_regs_trivial();
2955 else {
2956 while (!assign_regs()) {
2957 if (failed)
2958 break;
2959 }
2960 }
2961 }
2962 assert(force_uncompressed_stack == 0);
2963 assert(force_sechalf_stack == 0);
2964
2965 /* This must come after all optimization and register allocation, since
2966 * it inserts dead code that happens to have side effects, and it does
2967 * so based on the actual physical registers in use.
2968 */
2969 insert_gen4_send_dependency_workarounds();
2970
2971 if (failed)
2972 return false;
2973
2974 schedule_instructions(true);
2975
2976 if (dispatch_width == 8) {
2977 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2978 } else {
2979 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2980
2981 /* Make sure we didn't try to sneak in an extra uniform */
2982 assert(orig_nr_params == c->prog_data.nr_params);
2983 (void) orig_nr_params;
2984 }
2985
2986 /* If any state parameters were appended, then ParameterValues could have
2987 * been realloced, in which case the driver uniform storage set up by
2988 * _mesa_associate_uniform_storage() would point to freed memory. Make
2989 * sure that didn't happen.
2990 */
2991 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2992
2993 return !failed;
2994 }
2995
2996 const unsigned *
2997 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2998 struct gl_fragment_program *fp,
2999 struct gl_shader_program *prog,
3000 unsigned *final_assembly_size)
3001 {
3002 struct intel_context *intel = &brw->intel;
3003 bool start_busy = false;
3004 float start_time = 0;
3005
3006 if (unlikely(intel->perf_debug)) {
3007 start_busy = (intel->batch.last_bo &&
3008 drm_intel_bo_busy(intel->batch.last_bo));
3009 start_time = get_time();
3010 }
3011
3012 struct brw_shader *shader = NULL;
3013 if (prog)
3014 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3015
3016 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3017 if (prog) {
3018 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3019 _mesa_print_ir(shader->ir, NULL);
3020 printf("\n\n");
3021 } else {
3022 printf("ARB_fragment_program %d ir for native fragment shader\n",
3023 fp->Base.Id);
3024 _mesa_print_program(&fp->Base);
3025 }
3026 }
3027
3028 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3029 */
3030 fs_visitor v(brw, c, prog, fp, 8);
3031 if (!v.run()) {
3032 if (prog) {
3033 prog->LinkStatus = false;
3034 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3035 }
3036
3037 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3038 v.fail_msg);
3039
3040 return NULL;
3041 }
3042
3043 exec_list *simd16_instructions = NULL;
3044 fs_visitor v2(brw, c, prog, fp, 16);
3045 bool no16 = INTEL_DEBUG & DEBUG_NO16;
3046 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
3047 v2.import_uniforms(&v);
3048 if (!v2.run()) {
3049 perf_debug("16-wide shader failed to compile, falling back to "
3050 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3051 } else {
3052 simd16_instructions = &v2.instructions;
3053 }
3054 }
3055
3056 c->prog_data.dispatch_width = 8;
3057
3058 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3059 const unsigned *generated = g.generate_assembly(&v.instructions,
3060 simd16_instructions,
3061 final_assembly_size);
3062
3063 if (unlikely(intel->perf_debug) && shader) {
3064 if (shader->compiled_once)
3065 brw_wm_debug_recompile(brw, prog, &c->key);
3066 shader->compiled_once = true;
3067
3068 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
3069 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3070 (get_time() - start_time) * 1000);
3071 }
3072 }
3073
3074 return generated;
3075 }
3076
3077 bool
3078 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3079 {
3080 struct brw_context *brw = brw_context(ctx);
3081 struct intel_context *intel = &brw->intel;
3082 struct brw_wm_prog_key key;
3083
3084 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3085 return true;
3086
3087 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3088 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3089 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3090 bool program_uses_dfdy = fp->UsesDFdy;
3091
3092 memset(&key, 0, sizeof(key));
3093
3094 if (intel->gen < 6) {
3095 if (fp->UsesKill)
3096 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3097
3098 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3099 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3100
3101 /* Just assume depth testing. */
3102 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3103 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3104 }
3105
3106 if (intel->gen < 6)
3107 key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
3108
3109 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
3110 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
3111 continue;
3112
3113 if (intel->gen < 6) {
3114 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
3115 key.input_slots_valid |= BITFIELD64_BIT(i);
3116 }
3117 }
3118
3119 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3120
3121 for (int i = 0; i < MAX_SAMPLERS; i++) {
3122 if (fp->Base.ShadowSamplers & (1 << i)) {
3123 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3124 key.tex.swizzles[i] =
3125 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3126 } else {
3127 /* Color sampler: assume no swizzling. */
3128 key.tex.swizzles[i] = SWIZZLE_XYZW;
3129 }
3130 }
3131
3132 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3133 key.drawable_height = ctx->DrawBuffer->Height;
3134 }
3135
3136 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3137 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3138 }
3139
3140 key.nr_color_regions = 1;
3141
3142 key.program_string_id = bfp->id;
3143
3144 uint32_t old_prog_offset = brw->wm.prog_offset;
3145 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3146
3147 bool success = do_wm_prog(brw, prog, bfp, &key);
3148
3149 brw->wm.prog_offset = old_prog_offset;
3150 brw->wm.prog_data = old_prog_data;
3151
3152 return success;
3153 }