/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/hash_table.h"
#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;

   /* This will be the case for almost all instructions. */
   this->regs_written = 1;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2); \
   }

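/* Instantiate the emit helpers. Each ALUn(op) use below expands to a
 * one-line fs_visitor method that allocates an fs_inst out of mem_ctx; for
 * example, ALU2(ADD) produces:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */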
ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg varying_offset,
                                       uint32_t const_offset)
{
   exec_list instructions;
   fs_inst *inst;

   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable
    * offset and a portion done using reg_offset, which means that if you
    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
    * CSE can later notice that those loads are all the same and eliminate
    * the redundant ones.
    */
   fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
   instructions.push_tail(ADD(vec4_offset,
                              varying_offset, const_offset & ~3));

   int scale = 1;
   if (intel->gen == 4 && dispatch_width == 8) {
      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
       * u, v, r) as parameters, or we can just use the SIMD16 message
       * consisting of (header, u).  We choose the second, at the cost of a
       * longer return length.
       */
      scale = 2;
   }

   enum opcode op;
   if (intel->gen >= 7)
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   else
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
   fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
   inst->regs_written = 4 * scale;
   instructions.push_tail(inst);

   if (intel->gen < 7) {
      inst->base_mrf = 13;
      inst->header_present = true;
      if (intel->gen == 4)
         inst->mlen = 3;
      else
         inst->mlen = 1 + dispatch_width / 8;
   }

   vec4_result.reg_offset += (const_offset & 3) * scale;
   instructions.push_tail(MOV(dst, vec4_result));

   return instructions;
}

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   inst->ir = NULL;
   inst->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   inst->force_uncompressed = true;

   return inst;
}

bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written);
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (intel->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

bool
fs_reg::is_valid_3src() const
{
   return file == GRF || file == UNIFORM;
}

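/**
 * Returns how many scalar slots a GLSL type occupies in our virtual GRF
 * allocation: e.g. a float is 1, a vec4 is 4, a mat3 is 9 (its
 * components()), arrays multiply by their length, and structs sum their
 * fields.
 */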
int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(!"not reached");
      break;
   }

   return 0;
}

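/**
 * Reads the low dword of the TIMESTAMP architecture register into a fresh
 * GRF (gen7+ only, per the assert below).  The MOV is marked
 * force_writemask_all/force_uncompressed so the read happens regardless of
 * which channels are enabled in the dispatch.
 */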
fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   int shader_time_index =
      brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);

   fs_reg payload;
   if (dispatch_width == 8)
      payload = fs_reg(this, glsl_type::uvec2_type);
   else
      payload = fs_reg(this, glsl_type::uint_type);

   emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
                fs_reg(), payload, offset, value));
}

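/**
 * Marks the compile as failed and records a printf-formatted reason in
 * fail_msg.  Only the first failure is recorded; the message is also
 * printed to stderr when the WM debug flag is set.
 */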
void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns true if the instruction has a flag that means it won't
 * update an entire destination register.
 *
 * For example, dead code elimination and live variable analysis want to know
 * when a write to a variable screens off any preceding values that were in
 * it.
 */
bool
fs_inst::is_partial_write()
{
   return (this->predicate ||
           this->force_uncompressed ||
           this->force_sechalf);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_MS:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->mlen;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

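/**
 * Allocates a new virtual GRF of the given size (in full registers) and
 * returns its index, growing the virtual_grf_sizes array (doubling from an
 * initial 16 entries) as needed.
 */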
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow the uniform setup of the 8-wide dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
   (void)params_before;
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}

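/**
 * Sets up the gl_FragCoord value: X/Y from the computed pixel coordinates
 * (flipping Y for FBO rendering and applying the half-pixel offset unless
 * pixel-center-integer is in effect), Z from the source depth payload on
 * gen6+ (interpolated on older gens), and W from the already-computed
 * wpos_w.
 */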
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(VARYING_SLOT_POS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (intel->gen >= 6) {
      if (is_centroid) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
      } else {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
      }
   } else {
      /* On Ironlake and below, there is only one interpolation mode.
       * Centroid interpolation doesn't mean anything on this hardware --
       * there is no multisampling.
       */
      barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

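/**
 * Emits the interpolation setup (flat-shaded CINTERP or
 * smooth/noperspective LINTERP) for each element of a varying input,
 * walking array elements and matrix columns and skipping slots with no
 * incoming setup data.
 */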
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case.  The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               struct brw_reg interp = interp_reg(location, k);
               emit_linterp(attr, fs_reg(interp), interpolation_mode,
                            ir->centroid);
               if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                  /* Get the pixel/sample mask into f0 so that we know
                   * which pixels are lit.  Then, for each channel that is
                   * unlit, replace the centroid data with non-centroid
                   * data.
                   */
                  emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
                  fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                               interpolation_mode, false);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = true;
               }
               if (intel->gen < 6) {
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (intel->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (intel->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

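/**
 * Rewrites UNIFORM-file sources to the fixed payload GRFs where the push
 * constants (CURB) were loaded: constant N lives in component N % 8 of
 * payload register nr_payload_regs + N / 8.
 */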
void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

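/**
 * Decides which URB slot (if any) delivers each varying input, filling in
 * urb_setup[].  Each attribute occupies four setup channels at half a
 * register each, hence the resulting urb_read_length of urb_next * 2.
 */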
void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's an FS-only attribute, and we did interpolation for this
       * attribute in the SF thread.  So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] indices by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
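/* For example, a vec4 temporary that was allocated as one size-4 virtual
 * GRF becomes four independent size-1 virtual GRFs here, so each component
 * gets its own live interval.
 */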
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written > 1) {
         split_grf[inst->dst.reg] = false;
      }

      /* If we're sending from a GRF, don't split it, on the assumption that
       * the send is reading the whole thing.
       */
      if (inst->is_send_from_grf()) {
         split_grf[inst->src[0].reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

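/**
 * Drops uniform parameters that no instruction still references and
 * renumbers the survivors, shrinking nr_params.  The remap table is built
 * during the 8-wide compile and reused (via import_uniforms) by the
 * 16-wide one.
 */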
bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index,
                                                     *inst->src[i].reladdr,
                                                     pull_constant_loc[uniform] +
                                                     inst->src[i].reg_offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}

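/**
 * Local algebraic simplification: turns MUL/ADD with a trivial immediate
 * second operand into a MOV (a * 1.0 => a, a * 0.0 => 0.0, a + 0.0 => a).
 * Returns whether any instruction was rewritten.
 */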
bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something defined but not used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

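/* The local dead code pass below tracks, per (virtual GRF, reg_offset)
 * channel, the most recent write that hasn't been read yet; these helpers
 * manage that hash table.
 */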
struct dead_code_hash_key
{
   int vgrf;
   int reg_offset;
};

static bool
dead_code_hash_compare(const void *a, const void *b)
{
   return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
}

static void
clear_dead_code_hash(struct hash_table *ht)
{
   struct hash_entry *entry;

   hash_table_foreach(ht, entry) {
      _mesa_hash_table_remove(ht, entry);
   }
}

static void
insert_dead_code_hash(struct hash_table *ht,
                      int vgrf, int reg_offset, fs_inst *inst)
{
   /* We don't bother freeing keys, because they'll be GCed with the ht. */
   struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);

   key->vgrf = vgrf;
   key->reg_offset = reg_offset;

   _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
}

static struct hash_entry *
get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
{
   struct dead_code_hash_key key;

   key.vgrf = vgrf;
   key.reg_offset = reg_offset;

   return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
}

static void
remove_dead_code_hash(struct hash_table *ht,
                      int vgrf, int reg_offset)
{
   struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
   if (!entry)
      return;

   _mesa_hash_table_remove(ht, entry);
}

/**
 * Walks basic blocks, removing any regs that are written but not read before
 * being redefined.
 *
 * The dead_code_eliminate() function implements a global dead code
 * elimination, but it only handles removing the last write to a register
 * if it's never read.  This one can handle intermediate writes, but only
 * within a basic block.
 */
bool
fs_visitor::dead_code_eliminate_local()
{
   struct hash_table *ht;
   bool progress = false;

   ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* At a basic block, empty the HT since we don't understand dataflow
       * here.
       */
      if (inst->is_control_flow()) {
         clear_dead_code_hash(ht);
         continue;
      }

      /* Clear the HT of any instructions that got read. */
      for (int i = 0; i < 3; i++) {
         fs_reg src = inst->src[i];
         if (src.file != GRF)
            continue;

         int read = 1;
         if (inst->is_send_from_grf())
            read = virtual_grf_sizes[src.reg] - src.reg_offset;

         for (int reg_offset = src.reg_offset;
              reg_offset < src.reg_offset + read;
              reg_offset++) {
            remove_dead_code_hash(ht, src.reg, reg_offset);
         }
      }

      /* Add any update of a GRF to the HT, removing a previous write if it
       * wasn't read.
       */
      if (inst->dst.file == GRF) {
         if (inst->regs_written > 1) {
            /* We don't know how to trim channels from an instruction's
             * writes, so we can't incrementally remove unread channels from
             * it.  Just remove whatever it overwrites from the table.
             */
            for (int i = 0; i < inst->regs_written; i++) {
               remove_dead_code_hash(ht,
                                     inst->dst.reg,
                                     inst->dst.reg_offset + i);
            }
         } else {
            struct hash_entry *entry =
               get_dead_code_hash_entry(ht, inst->dst.reg,
                                        inst->dst.reg_offset);

            if (inst->is_partial_write()) {
               /* For a partial write, we can't remove any previous dead code
                * candidate, since we're just modifying its result, but we can
                * be dead code eliminated ourselves.
                */
               if (entry) {
                  entry->data = inst;
               } else {
                  insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
                                        inst);
               }
            } else {
               if (entry) {
                  /* We're completely updating a channel, and there was a
                   * previous write to the channel that wasn't read.  Kill it!
                   */
                  fs_inst *inst = (fs_inst *)entry->data;
                  inst->remove();
                  progress = true;
                  _mesa_hash_table_remove(ht, entry);
               }

               insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
                                     inst);
            }
         }
      }
   }

   _mesa_hash_table_destroy(ht, NULL);

   if (progress)
      live_intervals_valid = false;

   return progress;
}
1961
1962 /**
1963 * Implements a second type of register coalescing: This one checks if
1964 * the two regs involved in a raw move don't interfere, in which case
1965 * they can both be stored in the same place and the MOV removed.
1966 */
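/* A sketch (hypothetical IR): given "mov vgrf5, vgrf2" where vgrf2 is a
 * single register whose live range doesn't interfere with vgrf5's, the MOV
 * is removed and every def and use of vgrf2 is renamed to vgrf5.
 */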
1967 bool
1968 fs_visitor::register_coalesce_2()
1969 {
1970 bool progress = false;
1971
1972 calculate_live_intervals();
1973
1974 foreach_list_safe(node, &this->instructions) {
1975 fs_inst *inst = (fs_inst *)node;
1976
1977 if (inst->opcode != BRW_OPCODE_MOV ||
1978 inst->is_partial_write() ||
1979 inst->saturate ||
1980 inst->src[0].file != GRF ||
1981 inst->src[0].negate ||
1982 inst->src[0].abs ||
1983 inst->src[0].smear != -1 ||
1984 inst->dst.file != GRF ||
1985 inst->dst.type != inst->src[0].type ||
1986 virtual_grf_sizes[inst->src[0].reg] != 1 ||
1987 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1988 continue;
1989 }
1990
1991 int reg_from = inst->src[0].reg;
1992 assert(inst->src[0].reg_offset == 0);
1993 int reg_to = inst->dst.reg;
1994 int reg_to_offset = inst->dst.reg_offset;
1995
1996 foreach_list(node, &this->instructions) {
1997 fs_inst *scan_inst = (fs_inst *)node;
1998
1999 if (scan_inst->dst.file == GRF &&
2000 scan_inst->dst.reg == reg_from) {
2001 scan_inst->dst.reg = reg_to;
2002 scan_inst->dst.reg_offset = reg_to_offset;
2003 }
2004 for (int i = 0; i < 3; i++) {
2005 if (scan_inst->src[i].file == GRF &&
2006 scan_inst->src[i].reg == reg_from) {
2007 scan_inst->src[i].reg = reg_to;
2008 scan_inst->src[i].reg_offset = reg_to_offset;
2009 }
2010 }
2011 }
2012
2013 inst->remove();
2014
2015 /* We don't need to recalculate live intervals inside the loop despite
2016 * clearing live_intervals_valid, because we only use live intervals for
2017 * the interferes test, and we must have had a situation where the
2018 * intervals were:
2019 *
2020 *    from  to
2021 *     ^
2022 *     |
2023 *     v
2024 *           ^
2025 *           |
2026 *           v
2027 *
2028 * Some register R that might get coalesced with one of these two could
2029 * only be referencing "to", otherwise "from"'s range would have been
2030 * longer. R's range could also only start at the end of "to" or later,
2031 * otherwise it will conflict with "to" when we try to coalesce "to"
2032 * into R anyway.
2033 */
2034 live_intervals_valid = false;
2035
2036 progress = true;
2037 continue;
2038 }
2039
2040 return progress;
2041 }
2042
2043 bool
2044 fs_visitor::register_coalesce()
2045 {
2046 bool progress = false;
2047 int if_depth = 0;
2048 int loop_depth = 0;
2049
2050 foreach_list_safe(node, &this->instructions) {
2051 fs_inst *inst = (fs_inst *)node;
2052
2053 /* Make sure that we dominate the instructions we're going to
2054 * scan for interfering with our coalescing, or we won't have
2055 * scanned enough to see if anything interferes with our
2056 * coalescing. We don't dominate the following instructions if
2057 * we're in a loop or an if block.
2058 */
2059 switch (inst->opcode) {
2060 case BRW_OPCODE_DO:
2061 loop_depth++;
2062 break;
2063 case BRW_OPCODE_WHILE:
2064 loop_depth--;
2065 break;
2066 case BRW_OPCODE_IF:
2067 if_depth++;
2068 break;
2069 case BRW_OPCODE_ENDIF:
2070 if_depth--;
2071 break;
2072 default:
2073 break;
2074 }
2075 if (loop_depth || if_depth)
2076 continue;
2077
2078 if (inst->opcode != BRW_OPCODE_MOV ||
2079 inst->is_partial_write() ||
2080 inst->saturate ||
2081 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2082 inst->src[0].file != UNIFORM) ||
2083 inst->dst.type != inst->src[0].type)
2084 continue;
2085
2086 bool has_source_modifiers = (inst->src[0].abs ||
2087 inst->src[0].negate ||
2088 inst->src[0].smear != -1 ||
2089 inst->src[0].file == UNIFORM);
2090
2091 /* Found a move of a GRF or uniform to a GRF. Let's see if we can coalesce
2092 * them: check for no writes to either one until the exit of the
2093 * program.
2094 */
2095 bool interfered = false;
2096
2097 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2098 !scan_inst->is_tail_sentinel();
2099 scan_inst = (fs_inst *)scan_inst->next) {
2100 if (scan_inst->dst.file == GRF) {
2101 if (scan_inst->overwrites_reg(inst->dst) ||
2102 scan_inst->overwrites_reg(inst->src[0])) {
2103 interfered = true;
2104 break;
2105 }
2106 }
2107
2108 /* The gen6 MATH instruction can't handle source modifiers or
2109 * unusual register regions, so avoid coalescing those for
2110 * now. We should do something more specific.
2111 */
2112 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2113 interfered = true;
2114 break;
2115 }
2116
2117 /* The accumulator result appears to get used for the
2118 * conditional modifier generation. When negating a UD
2119 * value, there is a 33rd bit generated for the sign in the
2120 * accumulator value, so now you can't check, for example,
2121 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2122 */
2123 if (scan_inst->conditional_mod &&
2124 inst->src[0].negate &&
2125 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2126 interfered = true;
2127 break;
2128 }
2129 }
2130 if (interfered) {
2131 continue;
2132 }
2133
2134 /* Rewrite the later usage to point at the source of the move to
2135 * be removed.
2136 */
2137 for (fs_inst *scan_inst = inst;
2138 !scan_inst->is_tail_sentinel();
2139 scan_inst = (fs_inst *)scan_inst->next) {
2140 for (int i = 0; i < 3; i++) {
2141 if (scan_inst->src[i].file == GRF &&
2142 scan_inst->src[i].reg == inst->dst.reg &&
2143 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2144 fs_reg new_src = inst->src[0];
2145 if (scan_inst->src[i].abs) {
2146 new_src.negate = 0;
2147 new_src.abs = 1;
2148 }
2149 new_src.negate ^= scan_inst->src[i].negate;
2150 scan_inst->src[i] = new_src;
2151 }
2152 }
2153 }
2154
2155 inst->remove();
2156 progress = true;
2157 }
2158
2159 if (progress)
2160 live_intervals_valid = false;
2161
2162 return progress;
2163 }
2164
2165
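/* Tries to rewrite the instruction that computed a GRF so it writes the
 * destination MRF directly, removing the copy. A sketch (hypothetical IR):
 *
 *    add vgrf8, vgrf6, vgrf7
 *    mov m2, vgrf8              ->    add m2, vgrf6, vgrf7
 */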
2166 bool
2167 fs_visitor::compute_to_mrf()
2168 {
2169 bool progress = false;
2170 int next_ip = 0;
2171
2172 calculate_live_intervals();
2173
2174 foreach_list_safe(node, &this->instructions) {
2175 fs_inst *inst = (fs_inst *)node;
2176
2177 int ip = next_ip;
2178 next_ip++;
2179
2180 if (inst->opcode != BRW_OPCODE_MOV ||
2181 inst->is_partial_write() ||
2182 inst->dst.file != MRF || inst->src[0].file != GRF ||
2183 inst->dst.type != inst->src[0].type ||
2184 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2185 continue;
2186
2187 /* Work out which hardware MRF registers are written by this
2188 * instruction.
2189 */
2190 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2191 int mrf_high;
2192 if (inst->dst.reg & BRW_MRF_COMPR4) {
2193 mrf_high = mrf_low + 4;
2194 } else if (dispatch_width == 16 &&
2195 (!inst->force_uncompressed && !inst->force_sechalf)) {
2196 mrf_high = mrf_low + 1;
2197 } else {
2198 mrf_high = mrf_low;
2199 }
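/* For instance (illustrative): a COMPR4 write to m3 lands in m3 and m7,
 * an ordinary 16-wide write to m3 lands in m3 and m4, and an 8-wide (or
 * half-forced) write touches only m3.
 */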
2200
2201 /* Can't compute-to-MRF this GRF if someone else was going to
2202 * read it later.
2203 */
2204 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2205 continue;
2206
2207 /* Found a move of a GRF to a MRF. Let's see if we can go
2208 * rewrite the thing that made this GRF to write into the MRF.
2209 */
2210 fs_inst *scan_inst;
2211 for (scan_inst = (fs_inst *)inst->prev;
2212 scan_inst->prev != NULL;
2213 scan_inst = (fs_inst *)scan_inst->prev) {
2214 if (scan_inst->dst.file == GRF &&
2215 scan_inst->dst.reg == inst->src[0].reg) {
2216 /* Found the last thing to write our reg we want to turn
2217 * into a compute-to-MRF.
2218 */
2219
2220 /* If this one instruction didn't populate all the
2221 * channels, bail. We might be able to rewrite everything
2222 * that writes that reg, but it would require smarter
2223 * tracking to delay the rewriting until complete success.
2224 */
2225 if (scan_inst->is_partial_write())
2226 break;
2227
2228 /* Things returning more than one register would need us to
2229 * understand coalescing out more than one MOV at a time.
2230 */
2231 if (scan_inst->regs_written > 1)
2232 break;
2233
2234 /* SEND instructions can't have MRF as a destination. */
2235 if (scan_inst->mlen)
2236 break;
2237
2238 if (intel->gen == 6) {
2239 /* gen6 math instructions must have the destination be
2240 * GRF, so no compute-to-MRF for them.
2241 */
2242 if (scan_inst->is_math()) {
2243 break;
2244 }
2245 }
2246
2247 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2248 /* Found the creator of our MRF's source value. */
2249 scan_inst->dst.file = MRF;
2250 scan_inst->dst.reg = inst->dst.reg;
2251 scan_inst->saturate |= inst->saturate;
2252 inst->remove();
2253 progress = true;
2254 }
2255 break;
2256 }
2257
2258 /* We don't handle control flow here. Most computation of
2259 * values that end up in MRFs happens shortly before the MRF
2260 * write anyway.
2261 */
2262 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2263 break;
2264
2265 /* You can't read from an MRF, so if someone else reads our
2266 * MRF's source GRF that we wanted to rewrite, that stops us.
2267 */
2268 bool interfered = false;
2269 for (int i = 0; i < 3; i++) {
2270 if (scan_inst->src[i].file == GRF &&
2271 scan_inst->src[i].reg == inst->src[0].reg &&
2272 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2273 interfered = true;
2274 }
2275 }
2276 if (interfered)
2277 break;
2278
2279 if (scan_inst->dst.file == MRF) {
2280 /* If somebody else writes our MRF here, we can't
2281 * compute-to-MRF before that.
2282 */
2283 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2284 int scan_mrf_high;
2285
2286 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2287 scan_mrf_high = scan_mrf_low + 4;
2288 } else if (dispatch_width == 16 &&
2289 (!scan_inst->force_uncompressed &&
2290 !scan_inst->force_sechalf)) {
2291 scan_mrf_high = scan_mrf_low + 1;
2292 } else {
2293 scan_mrf_high = scan_mrf_low;
2294 }
2295
2296 if (mrf_low == scan_mrf_low ||
2297 mrf_low == scan_mrf_high ||
2298 mrf_high == scan_mrf_low ||
2299 mrf_high == scan_mrf_high) {
2300 break;
2301 }
2302 }
2303
2304 if (scan_inst->mlen > 0) {
2305 /* Found a SEND instruction, which means that there are
2306 * live values in MRFs from base_mrf to base_mrf +
2307 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2308 * above it.
2309 */
2310 if (mrf_low >= scan_inst->base_mrf &&
2311 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2312 break;
2313 }
2314 if (mrf_high >= scan_inst->base_mrf &&
2315 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2316 break;
2317 }
2318 }
2319 }
2320 }
2321
2322 if (progress)
2323 live_intervals_valid = false;
2324
2325 return progress;
2326 }
2327
2328 /**
2329 * Walks through basic blocks, looking for repeated MRF writes and
2330 * removing the later ones.
2331 */
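/* A sketch of the redundancy this catches (hypothetical IR):
 *
 *    mov m4, vgrf2
 *    mul vgrf5, vgrf3, vgrf4
 *    mov m4, vgrf2    <- equal to the still-valid earlier write: removed
 */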
2332 bool
2333 fs_visitor::remove_duplicate_mrf_writes()
2334 {
2335 fs_inst *last_mrf_move[16];
2336 bool progress = false;
2337
2338 /* The MRF tracking below doesn't yet handle compressed (16-wide) instructions. */
2339 if (dispatch_width == 16)
2340 return false;
2341
2342 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2343
2344 foreach_list_safe(node, &this->instructions) {
2345 fs_inst *inst = (fs_inst *)node;
2346
2347 if (inst->is_control_flow()) {
2348 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2349 }
2350
2351 if (inst->opcode == BRW_OPCODE_MOV &&
2352 inst->dst.file == MRF) {
2353 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2354 if (prev_inst && inst->equals(prev_inst)) {
2355 inst->remove();
2356 progress = true;
2357 continue;
2358 }
2359 }
2360
2361 /* Clear out the last-write records for MRFs that were overwritten. */
2362 if (inst->dst.file == MRF) {
2363 last_mrf_move[inst->dst.reg] = NULL;
2364 }
2365
2366 if (inst->mlen > 0) {
2367 /* Found a SEND instruction, which will include two or fewer
2368 * implied MRF writes. We could do better here.
2369 */
2370 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2371 last_mrf_move[inst->base_mrf + i] = NULL;
2372 }
2373 }
2374
2375 /* Clear out any MRF move records whose sources got overwritten. */
2376 if (inst->dst.file == GRF) {
2377 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2378 if (last_mrf_move[i] &&
2379 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2380 last_mrf_move[i] = NULL;
2381 }
2382 }
2383 }
2384
2385 if (inst->opcode == BRW_OPCODE_MOV &&
2386 inst->dst.file == MRF &&
2387 inst->src[0].file == GRF &&
2388 !inst->is_partial_write()) {
2389 last_mrf_move[inst->dst.reg] = inst;
2390 }
2391 }
2392
2393 if (progress)
2394 live_intervals_valid = false;
2395
2396 return progress;
2397 }
2398
2399 static void
2400 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2401 int first_grf, int grf_len)
2402 {
2403 bool inst_16wide = (dispatch_width > 8 &&
2404 !inst->force_uncompressed &&
2405 !inst->force_sechalf);
2406
2407 /* Clear the flag for registers that actually got read (as expected). */
2408 for (int i = 0; i < 3; i++) {
2409 int grf;
2410 if (inst->src[i].file == GRF) {
2411 grf = inst->src[i].reg;
2412 } else if (inst->src[i].file == HW_REG &&
2413 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2414 grf = inst->src[i].fixed_hw_reg.nr;
2415 } else {
2416 continue;
2417 }
2418
2419 if (grf >= first_grf &&
2420 grf < first_grf + grf_len) {
2421 deps[grf - first_grf] = false;
2422 if (inst_16wide)
2423 deps[grf - first_grf + 1] = false;
2424 }
2425 }
2426 }
2427
2428 /**
2429 * Implements this workaround for the original 965:
2430 *
2431 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2432 * check for post destination dependencies on this instruction, software
2433 * must ensure that there is no destination hazard for the case of ‘write
2434 * followed by a posted write’ shown in the following example.
2435 *
2436 * 1. mov r3 0
2437 * 2. send r3.xy <rest of send instruction>
2438 * 3. mov r2 r3
2439 *
2440 * Due to no post-destination dependency check on the ‘send’, the above
2441 * code sequence could have two instructions (1 and 2) in flight at the
2442 * same time that both consider ‘r3’ as the target of their final writes."
2443 */
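/* On the example above, the pass restores correctness by sourcing r3 before
 * the send issues its posted write (illustrative; the resolve is a self-MOV):
 *
 *    1. mov r3 0
 *       mov r3 r3     <- dependency-resolve MOV inserted by this pass
 *    2. send r3.xy <rest of send instruction>
 *    3. mov r2 r3
 */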
2444 void
2445 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2446 {
2447 int reg_size = dispatch_width / 8;
2448 int write_len = inst->regs_written * reg_size;
2449 int first_write_grf = inst->dst.reg;
2450 bool needs_dep[BRW_MAX_MRF];
2451 assert(write_len < (int)sizeof(needs_dep) - 1);
2452
2453 memset(needs_dep, false, sizeof(needs_dep));
2454 memset(needs_dep, true, write_len);
2455
2456 clear_deps_for_inst_src(inst, dispatch_width,
2457 needs_dep, first_write_grf, write_len);
2458
2459 /* Walk backwards looking for writes to registers we're writing which
2460 * aren't read since being written. If we hit the start of the program,
2461 * we assume that there are no outstanding dependencies on entry to the
2462 * program.
2463 */
2464 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2465 scan_inst != NULL;
2466 scan_inst = (fs_inst *)scan_inst->prev) {
2467
2468 /* If we hit control flow, assume that there *are* outstanding
2469 * dependencies, and force their cleanup before our instruction.
2470 */
2471 if (scan_inst->is_control_flow()) {
2472 for (int i = 0; i < write_len; i++) {
2473 if (needs_dep[i]) {
2474 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2475 }
2476 }
2477 return;
2478 }
2479
2480 bool scan_inst_16wide = (dispatch_width > 8 &&
2481 !scan_inst->force_uncompressed &&
2482 !scan_inst->force_sechalf);
2483
2484 /* We insert our reads as late as possible on the assumption that any
2485 * instruction but a MOV that might have left us an outstanding
2486 * dependency has more latency than a MOV.
2487 */
2488 if (scan_inst->dst.file == GRF) {
2489 for (int i = 0; i < scan_inst->regs_written; i++) {
2490 int reg = scan_inst->dst.reg + i * reg_size;
2491
2492 if (reg >= first_write_grf &&
2493 reg < first_write_grf + write_len &&
2494 needs_dep[reg - first_write_grf]) {
2495 inst->insert_before(DEP_RESOLVE_MOV(reg));
2496 needs_dep[reg - first_write_grf] = false;
2497 if (scan_inst_16wide)
2498 needs_dep[reg - first_write_grf + 1] = false;
2499 }
2500 }
2501 }
2502
2503 /* Clear the flag for registers that actually got read (as expected). */
2504 clear_deps_for_inst_src(scan_inst, dispatch_width,
2505 needs_dep, first_write_grf, write_len);
2506
2507 /* Continue the loop only if we haven't resolved all the dependencies. */
2508 int i;
2509 for (i = 0; i < write_len; i++) {
2510 if (needs_dep[i])
2511 break;
2512 }
2513 if (i == write_len)
2514 return;
2515 }
2516 }
2517
2518 /**
2519 * Implements this workaround for the original 965:
2520 *
2521 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2522 * used as a destination register until after it has been sourced by an
2523 * instruction with a different destination register."
2524 */
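/* Illustrative case: after "send" writes r3, a following "mov r3 r5" may not
 * target r3 until r3 has been sourced; when no intervening read exists, this
 * pass inserts a resolve MOV that reads r3 just before the offending write.
 */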
2525 void
2526 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2527 {
2528 int write_len = inst->regs_written * dispatch_width / 8;
2529 int first_write_grf = inst->dst.reg;
2530 bool needs_dep[BRW_MAX_MRF];
2531 assert(write_len < (int)sizeof(needs_dep) - 1);
2532
2533 memset(needs_dep, false, sizeof(needs_dep));
2534 memset(needs_dep, true, write_len);
2535 /* Walk forwards looking for writes to registers we're writing which aren't
2536 * read before being written.
2537 */
2538 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2539 !scan_inst->is_tail_sentinel();
2540 scan_inst = (fs_inst *)scan_inst->next) {
2541 /* If we hit control flow, force resolve all remaining dependencies. */
2542 if (scan_inst->is_control_flow()) {
2543 for (int i = 0; i < write_len; i++) {
2544 if (needs_dep[i])
2545 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2546 }
2547 return;
2548 }
2549
2550 /* Clear the flag for registers that actually got read (as expected). */
2551 clear_deps_for_inst_src(scan_inst, dispatch_width,
2552 needs_dep, first_write_grf, write_len);
2553
2554 /* We insert our reads as late as possible since they're reading the
2555 * result of a SEND, which has massive latency.
2556 */
2557 if (scan_inst->dst.file == GRF &&
2558 scan_inst->dst.reg >= first_write_grf &&
2559 scan_inst->dst.reg < first_write_grf + write_len &&
2560 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2561 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2562 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2563 }
2564
2565 /* Continue the loop only if we haven't resolved all the dependencies. */
2566 int i;
2567 for (i = 0; i < write_len; i++) {
2568 if (needs_dep[i])
2569 break;
2570 }
2571 if (i == write_len)
2572 return;
2573 }
2574
2575 /* If we hit the end of the program, resolve all remaining dependencies out
2576 * of paranoia.
2577 */
2578 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2579 assert(last_inst->eot);
2580 for (int i = 0; i < write_len; i++) {
2581 if (needs_dep[i])
2582 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2583 }
2584 }
2585
2586 void
2587 fs_visitor::insert_gen4_send_dependency_workarounds()
2588 {
2589 if (intel->gen != 4 || intel->is_g4x)
2590 return;
2591
2592 /* Note that we're done with register allocation, so GRF fs_regs always
2593 * have a .reg_offset of 0.
2594 */
2595
2596 foreach_list_safe(node, &this->instructions) {
2597 fs_inst *inst = (fs_inst *)node;
2598
2599 if (inst->mlen != 0 && inst->dst.file == GRF) {
2600 insert_gen4_pre_send_dependency_workarounds(inst);
2601 insert_gen4_post_send_dependency_workarounds(inst);
2602 }
2603 }
2604 }
2605
2606 /**
2607 * Turns the generic expression-style uniform pull constant load instruction
2608 * into a hardware-specific series of instructions for loading a pull
2609 * constant.
2610 *
2611 * The expression style allows the CSE pass before this to optimize out
2612 * repeated loads from the same offset, and gives the pre-register-allocation
2613 * scheduling full flexibility, while the conversion to native instructions
2614 * allows the post-register-allocation scheduler the best information
2615 * possible.
2616 *
2617 * Note that execution masking for setting up pull constant loads is special:
2618 * the channels that need to be written are unrelated to the current execution
2619 * mask, since a later instruction will use one of the result channels as a
2620 * source operand for all 8 or 16 of its channels.
2621 */
2622 void
2623 fs_visitor::lower_uniform_pull_constant_loads()
2624 {
2625 foreach_list(node, &this->instructions) {
2626 fs_inst *inst = (fs_inst *)node;
2627
2628 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2629 continue;
2630
2631 if (intel->gen >= 7) {
2632 /* The offset arg before was a vec4-aligned byte offset. We need to
2633 * turn it into a dword offset.
2634 */
2635 fs_reg const_offset_reg = inst->src[1];
2636 assert(const_offset_reg.file == IMM &&
2637 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2638 const_offset_reg.imm.u /= 4;
2639 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2640
2641 /* This is actually going to be a MOV, but since only the first dword
2642 * is accessed, we have a special opcode to do just that one. Note
2643 * that this needs to be an operation that will be considered a def
2644 * by live variable analysis, or register allocation will explode.
2645 */
2646 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2647 payload, const_offset_reg);
2648 setup->force_writemask_all = true;
2649
2650 setup->ir = inst->ir;
2651 setup->annotation = inst->annotation;
2652 inst->insert_before(setup);
2653
2654 /* Similarly, this will only populate the first 4 channels of the
2655 * result register (since we only use smear values from 0-3), but we
2656 * don't tell the optimizer.
2657 */
2658 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2659 inst->src[1] = payload;
2660
2661 this->live_intervals_valid = false;
2662 } else {
2663 /* Before register allocation, we didn't tell the scheduler about the
2664 * MRF we use. We know it's safe to use this MRF because nothing
2665 * else does except for register spill/unspill, which generates and
2666 * uses its MRF within a single IR instruction.
2667 */
2668 inst->base_mrf = 14;
2669 inst->mlen = 1;
2670 }
2671 }
2672 }
2673
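/* Prints one IR instruction for debugging. Sample output (made up):
 *
 *    (+f0.1) mov.sat vgrf7+1, u2.1, (null), (null) 2ndhalf
 */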
2674 void
2675 fs_visitor::dump_instruction(backend_instruction *be_inst)
2676 {
2677 fs_inst *inst = (fs_inst *)be_inst;
2678
2679 if (inst->predicate) {
2680 printf("(%cf0.%d) ",
2681 inst->predicate_inverse ? '-' : '+',
2682 inst->flag_subreg);
2683 }
2684
2685 printf("%s", brw_instruction_name(inst->opcode));
2686 if (inst->saturate)
2687 printf(".sat");
2688 if (inst->conditional_mod) {
2689 printf(".cmod");
2690 if (!inst->predicate &&
2691 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2692 inst->opcode != BRW_OPCODE_IF &&
2693 inst->opcode != BRW_OPCODE_WHILE))) {
2694 printf(".f0.%d\n", inst->flag_subreg);
2695 }
2696 }
2697 printf(" ");
2698
2699
2700 switch (inst->dst.file) {
2701 case GRF:
2702 printf("vgrf%d", inst->dst.reg);
2703 if (inst->dst.reg_offset)
2704 printf("+%d", inst->dst.reg_offset);
2705 break;
2706 case MRF:
2707 printf("m%d", inst->dst.reg);
2708 break;
2709 case BAD_FILE:
2710 printf("(null)");
2711 break;
2712 case UNIFORM:
2713 printf("***u%d***", inst->dst.reg);
2714 break;
2715 default:
2716 printf("???");
2717 break;
2718 }
2719 printf(", ");
2720
2721 for (int i = 0; i < 3; i++) {
2722 if (inst->src[i].negate)
2723 printf("-");
2724 if (inst->src[i].abs)
2725 printf("|");
2726 switch (inst->src[i].file) {
2727 case GRF:
2728 printf("vgrf%d", inst->src[i].reg);
2729 if (inst->src[i].reg_offset)
2730 printf("+%d", inst->src[i].reg_offset);
2731 break;
2732 case MRF:
2733 printf("***m%d***", inst->src[i].reg);
2734 break;
2735 case UNIFORM:
2736 printf("u%d", inst->src[i].reg);
2737 if (inst->src[i].reg_offset)
2738 printf(".%d", inst->src[i].reg_offset);
2739 break;
2740 case BAD_FILE:
2741 printf("(null)");
2742 break;
2743 case IMM:
2744 switch (inst->src[i].type) {
2745 case BRW_REGISTER_TYPE_F:
2746 printf("%ff", inst->src[i].imm.f);
2747 break;
2748 case BRW_REGISTER_TYPE_D:
2749 printf("%dd", inst->src[i].imm.i);
2750 break;
2751 case BRW_REGISTER_TYPE_UD:
2752 printf("%uu", inst->src[i].imm.u);
2753 break;
2754 default:
2755 printf("???");
2756 break;
2757 }
2758 break;
2759 default:
2760 printf("???");
2761 break;
2762 }
2763 if (inst->src[i].abs)
2764 printf("|");
2765
2766 if (i < 2)
2767 printf(", ");
2768 }
2769
2770 printf(" ");
2771
2772 if (inst->force_uncompressed)
2773 printf("1sthalf ");
2774
2775 if (inst->force_sechalf)
2776 printf("2ndhalf ");
2777
2778 printf("\n");
2779 }
2780
2781 /**
2782 * Possibly returns an instruction that set up @param reg.
2783 *
2784 * Sometimes we want to take the result of some expression/variable
2785 * dereference tree and rewrite the instruction generating the result
2786 * of the tree. When processing the tree, we know that the
2787 * instructions generated are all writing temporaries that are dead
2788 * outside of this tree. So, if we have some instructions that write
2789 * a temporary, we're free to point that temp write somewhere else.
2790 *
2791 * Note that this doesn't guarantee that the returned instruction wrote
2792 * only reg -- it might be the size=4 destination of a texture instruction.
2793 */
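/* For instance (hypothetical): with end = "add vgrf3, vgrf1, vgrf2" and
 * reg = vgrf3, the add is returned so the caller can retarget its
 * destination; a partial, reladdr, or mismatched write yields NULL.
 */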
2794 fs_inst *
2795 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2796 fs_inst *end,
2797 fs_reg reg)
2798 {
2799 if (end == start ||
2800 end->is_partial_write() ||
2801 reg.reladdr ||
2802 !reg.equals(end->dst)) {
2803 return NULL;
2804 } else {
2805 return end;
2806 }
2807 }
2808
2809 void
2810 fs_visitor::setup_payload_gen6()
2811 {
2812 bool uses_depth =
2813 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2814 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2815
2816 assert(intel->gen >= 6);
2817
2818 /* R0-1: masks, pixel X/Y coordinates. */
2819 c->nr_payload_regs = 2;
2820 /* R2: only for 32-pixel dispatch. */
2821
2822 /* R3-26: barycentric interpolation coordinates. These appear in the
2823 * same order that they appear in the brw_wm_barycentric_interp_mode
2824 * enum. Each set of coordinates occupies 2 registers if dispatch width
2825 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2826 * appear if they were enabled using the "Barycentric Interpolation
2827 * Mode" bits in WM_STATE.
2828 */
2829 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2830 if (barycentric_interp_modes & (1 << i)) {
2831 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2832 c->nr_payload_regs += 2;
2833 if (dispatch_width == 16) {
2834 c->nr_payload_regs += 2;
2835 }
2836 }
2837 }
2838
2839 /* R27: interpolated depth if uses source depth */
2840 if (uses_depth) {
2841 c->source_depth_reg = c->nr_payload_regs;
2842 c->nr_payload_regs++;
2843 if (dispatch_width == 16) {
2844 /* R28: interpolated depth if not 8-wide. */
2845 c->nr_payload_regs++;
2846 }
2847 }
2848 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2849 if (uses_depth) {
2850 c->source_w_reg = c->nr_payload_regs;
2851 c->nr_payload_regs++;
2852 if (dispatch_width == 16) {
2853 /* R30: interpolated W if not 8-wide. */
2854 c->nr_payload_regs++;
2855 }
2856 }
2857 /* R31: MSAA position offsets. */
2858 /* R32-: bary for 32-pixel. */
2859 /* R58-59: interp W for 32-pixel. */
2860
2861 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2862 c->source_depth_to_render_target = true;
2863 }
2864 }
2865
2866 bool
2867 fs_visitor::run()
2868 {
2869 sanity_param_count = fp->Base.Parameters->NumParameters;
2870 uint32_t orig_nr_params = c->prog_data.nr_params;
2871
2872 if (intel->gen >= 6)
2873 setup_payload_gen6();
2874 else
2875 setup_payload_gen4();
2876
2877 if (0) {
2878 emit_dummy_fs();
2879 } else {
2880 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2881 emit_shader_time_begin();
2882
2883 calculate_urb_setup();
2884 if (intel->gen < 6)
2885 emit_interpolation_setup_gen4();
2886 else
2887 emit_interpolation_setup_gen6();
2888
2889 /* We handle discards by keeping track of the still-live pixels in f0.1.
2890 * Initialize it with the dispatched pixels.
2891 */
2892 if (fp->UsesKill) {
2893 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2894 discard_init->flag_subreg = 1;
2895 }
2896
2897 /* Generate FS IR for main(). (The visitor only descends into
2898 * functions called "main".)
2899 */
2900 if (shader) {
2901 foreach_list(node, &*shader->ir) {
2902 ir_instruction *ir = (ir_instruction *)node;
2903 base_ir = ir;
2904 this->result = reg_undef;
2905 ir->accept(this);
2906 }
2907 } else {
2908 emit_fragment_program_code();
2909 }
2910 base_ir = NULL;
2911 if (failed)
2912 return false;
2913
2914 emit(FS_OPCODE_PLACEHOLDER_HALT);
2915
2916 emit_fb_writes();
2917
2918 split_virtual_grfs();
2919
2920 move_uniform_array_access_to_pull_constants();
2921 setup_pull_constants();
2922
2923 bool progress;
2924 do {
2925 progress = false;
2926
2927 compact_virtual_grfs();
2928
2929 progress = remove_duplicate_mrf_writes() || progress;
2930
2931 progress = opt_algebraic() || progress;
2932 progress = opt_cse() || progress;
2933 progress = opt_copy_propagate() || progress;
2934 progress = dead_code_eliminate() || progress;
2935 progress = dead_code_eliminate_local() || progress;
2936 progress = register_coalesce() || progress;
2937 progress = register_coalesce_2() || progress;
2938 progress = compute_to_mrf() || progress;
2939 } while (progress);
2940
2941 remove_dead_constants();
2942
2943 schedule_instructions(false);
2944
2945 lower_uniform_pull_constant_loads();
2946
2947 assign_curb_setup();
2948 assign_urb_setup();
2949
2950 if (0) {
2951 /* Debug of register spilling: Go spill everything. */
2952 for (int i = 0; i < virtual_grf_count; i++) {
2953 spill_reg(i);
2954 }
2955 }
2956
2957 if (0)
2958 assign_regs_trivial();
2959 else {
2960 while (!assign_regs()) {
2961 if (failed)
2962 break;
2963 }
2964 }
2965 }
2966 assert(force_uncompressed_stack == 0);
2967 assert(force_sechalf_stack == 0);
2968
2969 /* This must come after all optimization and register allocation, since
2970 * it inserts dead code that happens to have side effects, and it does
2971 * so based on the actual physical registers in use.
2972 */
2973 insert_gen4_send_dependency_workarounds();
2974
2975 if (failed)
2976 return false;
2977
2978 schedule_instructions(true);
2979
2980 if (dispatch_width == 8) {
2981 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2982 } else {
2983 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2984
2985 /* Make sure we didn't try to sneak in an extra uniform */
2986 assert(orig_nr_params == c->prog_data.nr_params);
2987 (void) orig_nr_params;
2988 }
2989
2990 /* If any state parameters were appended, then ParameterValues could have
2991 * been realloced, in which case the driver uniform storage set up by
2992 * _mesa_associate_uniform_storage() would point to freed memory. Make
2993 * sure that didn't happen.
2994 */
2995 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2996
2997 return !failed;
2998 }
2999
3000 const unsigned *
3001 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3002 struct gl_fragment_program *fp,
3003 struct gl_shader_program *prog,
3004 unsigned *final_assembly_size)
3005 {
3006 struct intel_context *intel = &brw->intel;
3007 bool start_busy = false;
3008 float start_time = 0;
3009
3010 if (unlikely(intel->perf_debug)) {
3011 start_busy = (intel->batch.last_bo &&
3012 drm_intel_bo_busy(intel->batch.last_bo));
3013 start_time = get_time();
3014 }
3015
3016 struct brw_shader *shader = NULL;
3017 if (prog)
3018 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3019
3020 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3021 if (prog) {
3022 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3023 _mesa_print_ir(shader->ir, NULL);
3024 printf("\n\n");
3025 } else {
3026 printf("ARB_fragment_program %d ir for native fragment shader\n",
3027 fp->Base.Id);
3028 _mesa_print_program(&fp->Base);
3029 }
3030 }
3031
3032 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3033 */
3034 fs_visitor v(brw, c, prog, fp, 8);
3035 if (!v.run()) {
3036 if (prog) {
3037 prog->LinkStatus = false;
3038 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3039 }
3040
3041 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3042 v.fail_msg);
3043
3044 return NULL;
3045 }
3046
3047 exec_list *simd16_instructions = NULL;
3048 fs_visitor v2(brw, c, prog, fp, 16);
3049 bool no16 = INTEL_DEBUG & DEBUG_NO16;
3050 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
3051 v2.import_uniforms(&v);
3052 if (!v2.run()) {
3053 perf_debug("16-wide shader failed to compile, falling back to "
3054 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3055 } else {
3056 simd16_instructions = &v2.instructions;
3057 }
3058 }
3059
3060 c->prog_data.dispatch_width = 8;
3061
3062 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3063 const unsigned *generated = g.generate_assembly(&v.instructions,
3064 simd16_instructions,
3065 final_assembly_size);
3066
3067 if (unlikely(intel->perf_debug) && shader) {
3068 if (shader->compiled_once)
3069 brw_wm_debug_recompile(brw, prog, &c->key);
3070 shader->compiled_once = true;
3071
3072 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
3073 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3074 (get_time() - start_time) * 1000);
3075 }
3076 }
3077
3078 return generated;
3079 }
3080
3081 bool
3082 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3083 {
3084 struct brw_context *brw = brw_context(ctx);
3085 struct intel_context *intel = &brw->intel;
3086 struct brw_wm_prog_key key;
3087
3088 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3089 return true;
3090
3091 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3092 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3093 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3094 bool program_uses_dfdy = fp->UsesDFdy;
3095
3096 memset(&key, 0, sizeof(key));
3097
3098 if (intel->gen < 6) {
3099 if (fp->UsesKill)
3100 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3101
3102 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3103 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3104
3105 /* Just assume depth testing. */
3106 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3107 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3108 }
3109
3110 if (intel->gen < 6)
3111 key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
3112
3113 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
3114 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
3115 continue;
3116
3117 if (intel->gen < 6) {
3118 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
3119 key.input_slots_valid |= BITFIELD64_BIT(i);
3120 }
3121 }
3122
3123 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3124
3125 for (int i = 0; i < MAX_SAMPLERS; i++) {
3126 if (fp->Base.ShadowSamplers & (1 << i)) {
3127 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3128 key.tex.swizzles[i] =
3129 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3130 } else {
3131 /* Color sampler: assume no swizzling. */
3132 key.tex.swizzles[i] = SWIZZLE_XYZW;
3133 }
3134 }
3135
3136 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3137 key.drawable_height = ctx->DrawBuffer->Height;
3138 }
3139
3140 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3141 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3142 }
3143
3144 key.nr_color_regions = 1;
3145
3146 key.program_string_id = bfp->id;
3147
3148 uint32_t old_prog_offset = brw->wm.prog_offset;
3149 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3150
3151 bool success = do_wm_prog(brw, prog, bfp, &key);
3152
3153 brw->wm.prog_offset = old_prog_offset;
3154 brw->wm.prog_data = old_prog_data;
3155
3156 return success;
3157 }