i965/fs: Add basic-block-level dead code elimination.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/uniforms.h"
39 #include "main/fbobject.h"
40 #include "program/prog_parameter.h"
41 #include "program/prog_print.h"
42 #include "program/register_allocate.h"
43 #include "program/sampler.h"
44 #include "program/hash_table.h"
45 #include "brw_context.h"
46 #include "brw_eu.h"
47 #include "brw_wm.h"
48 }
49 #include "brw_fs.h"
50 #include "glsl/glsl_types.h"
51 #include "glsl/ir_print_visitor.h"
52
53 void
54 fs_inst::init()
55 {
56 memset(this, 0, sizeof(*this));
57 this->opcode = BRW_OPCODE_NOP;
58 this->conditional_mod = BRW_CONDITIONAL_NONE;
59
60 this->dst = reg_undef;
61 this->src[0] = reg_undef;
62 this->src[1] = reg_undef;
63 this->src[2] = reg_undef;
64
65 /* This will be the case for almost all instructions. */
66 this->regs_written = 1;
67 }
68
69 fs_inst::fs_inst()
70 {
71 init();
72 }
73
74 fs_inst::fs_inst(enum opcode opcode)
75 {
76 init();
77 this->opcode = opcode;
78 }
79
80 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
81 {
82 init();
83 this->opcode = opcode;
84 this->dst = dst;
85
86 if (dst.file == GRF)
87 assert(dst.reg_offset >= 0);
88 }
89
90 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
91 {
92 init();
93 this->opcode = opcode;
94 this->dst = dst;
95 this->src[0] = src0;
96
97 if (dst.file == GRF)
98 assert(dst.reg_offset >= 0);
99 if (src[0].file == GRF)
100 assert(src[0].reg_offset >= 0);
101 }
102
103 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
104 {
105 init();
106 this->opcode = opcode;
107 this->dst = dst;
108 this->src[0] = src0;
109 this->src[1] = src1;
110
111 if (dst.file == GRF)
112 assert(dst.reg_offset >= 0);
113 if (src[0].file == GRF)
114 assert(src[0].reg_offset >= 0);
115 if (src[1].file == GRF)
116 assert(src[1].reg_offset >= 0);
117 }
118
119 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
120 fs_reg src0, fs_reg src1, fs_reg src2)
121 {
122 init();
123 this->opcode = opcode;
124 this->dst = dst;
125 this->src[0] = src0;
126 this->src[1] = src1;
127 this->src[2] = src2;
128
129 if (dst.file == GRF)
130 assert(dst.reg_offset >= 0);
131 if (src[0].file == GRF)
132 assert(src[0].reg_offset >= 0);
133 if (src[1].file == GRF)
134 assert(src[1].reg_offset >= 0);
135 if (src[2].file == GRF)
136 assert(src[2].reg_offset >= 0);
137 }
138
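/* ALU1/ALU2/ALU3 define small fs_visitor factory methods (NOT(), ADD(),
 * LRP(), ...) that just allocate a new fs_inst in mem_ctx for the matching
 * BRW opcode; the caller is still responsible for emit()ing it.
 */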
139 #define ALU1(op) \
140 fs_inst * \
141 fs_visitor::op(fs_reg dst, fs_reg src0) \
142 { \
143 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
144 }
145
146 #define ALU2(op) \
147 fs_inst * \
148 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
149 { \
150 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
151 }
152
153 #define ALU3(op) \
154 fs_inst * \
155 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
156 { \
157 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
158 }
159
160 ALU1(NOT)
161 ALU1(MOV)
162 ALU1(FRC)
163 ALU1(RNDD)
164 ALU1(RNDE)
165 ALU1(RNDZ)
166 ALU2(ADD)
167 ALU2(MUL)
168 ALU2(MACH)
169 ALU2(AND)
170 ALU2(OR)
171 ALU2(XOR)
172 ALU2(SHL)
173 ALU2(SHR)
174 ALU2(ASR)
175 ALU3(LRP)
176
177 /** Gen4 predicated IF. */
178 fs_inst *
179 fs_visitor::IF(uint32_t predicate)
180 {
181 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
182 inst->predicate = predicate;
183 return inst;
184 }
185
186 /** Gen6+ IF with embedded comparison. */
187 fs_inst *
188 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
189 {
190 assert(intel->gen >= 6);
191 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
192 reg_null_d, src0, src1);
193 inst->conditional_mod = condition;
194 return inst;
195 }
196
197 /**
198 * CMP: Sets the low bit of the destination channels with the result
199 * of the comparison, while the upper bits are undefined, and updates
200 * the flag register with the packed 16 bits of the result.
201 */
202 fs_inst *
203 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
204 {
205 fs_inst *inst;
206
207 /* Take the instruction:
208 *
209 * CMP null<d> src0<f> src1<f>
210 *
211 * Original gen4 does type conversion to the destination type before
212 * comparison, producing garbage results for floating point comparisons.
213 * gen5 does the comparison on the execution type (resolved source types),
214 * so dst type doesn't matter. gen6 does comparison and then uses the
215 * result as if it was the dst type with no conversion, which happens to
216 * mostly work out for float-interpreted-as-int since our comparisons are
217 * for >0, =0, <0.
218 */
219 if (intel->gen == 4) {
220 dst.type = src0.type;
221 if (dst.file == FIXED_HW_REG)
222 dst.fixed_hw_reg.type = dst.type;
223 }
224
225 resolve_ud_negate(&src0);
226 resolve_ud_negate(&src1);
227
228 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
229 inst->conditional_mod = condition;
230
231 return inst;
232 }
233
234 exec_list
235 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
236 fs_reg varying_offset,
237 uint32_t const_offset)
238 {
239 exec_list instructions;
240 fs_inst *inst;
241
242 /* We have our constant surface use a pitch of 4 bytes, so our index can
243 * be any component of a vector, and then we load 4 contiguous
244 * components starting from that.
245 *
246 * We break down the const_offset to a portion added to the variable
247 * offset and a portion done using reg_offset, which means that if you
248 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
249 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
250 * CSE can later notice that those loads are all the same and eliminate
251 * the redundant ones.
252 */
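   /* Worked example (illustrative only): for const_offset == 7, vec4_offset
    * below becomes varying_offset + 4, and the remaining (7 & 3) == 3 is
    * applied to vec4_result.reg_offset (times scale) to select the component.
    */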
253 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
254 instructions.push_tail(ADD(vec4_offset,
255 varying_offset, const_offset & ~3));
256
257 int scale = 1;
258 if (intel->gen == 4 && dispatch_width == 8) {
259 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
260 * u, v, r) as parameters, or we can just use the SIMD16 message
261 * consisting of (header, u). We choose the second, at the cost of a
262 * longer return length.
263 */
264 scale = 2;
265 }
266
267 enum opcode op;
268 if (intel->gen >= 7)
269 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
270 else
271 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
272 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
273 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
274 inst->regs_written = 4 * scale;
275 instructions.push_tail(inst);
276
277 if (intel->gen < 7) {
278 inst->base_mrf = 13;
279 inst->header_present = true;
280 if (intel->gen == 4)
281 inst->mlen = 3;
282 else
283 inst->mlen = 1 + dispatch_width / 8;
284 }
285
286 vec4_result.reg_offset += (const_offset & 3) * scale;
287 instructions.push_tail(MOV(dst, vec4_result));
288
289 return instructions;
290 }
291
292 /**
293 * A helper for MOV generation for fixing up broken hardware SEND dependency
294 * handling.
295 */
296 fs_inst *
297 fs_visitor::DEP_RESOLVE_MOV(int grf)
298 {
299 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
300
301 inst->ir = NULL;
302 inst->annotation = "send dependency resolve";
303
304 /* The caller always wants uncompressed to emit the minimal extra
305 * dependencies, and to avoid having to deal with aligning its regs to 2.
306 */
307 inst->force_uncompressed = true;
308
309 return inst;
310 }
311
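/**
 * Field-by-field equality test between two instructions, covering the
 * operands and all of the message-related state compared below.
 */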
312 bool
313 fs_inst::equals(fs_inst *inst)
314 {
315 return (opcode == inst->opcode &&
316 dst.equals(inst->dst) &&
317 src[0].equals(inst->src[0]) &&
318 src[1].equals(inst->src[1]) &&
319 src[2].equals(inst->src[2]) &&
320 saturate == inst->saturate &&
321 predicate == inst->predicate &&
322 conditional_mod == inst->conditional_mod &&
323 mlen == inst->mlen &&
324 base_mrf == inst->base_mrf &&
325 sampler == inst->sampler &&
326 target == inst->target &&
327 eot == inst->eot &&
328 header_present == inst->header_present &&
329 shadow_compare == inst->shadow_compare &&
330 offset == inst->offset);
331 }
332
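/**
 * Returns true if \p reg falls within the region this instruction writes:
 * same file and register, with reg_offset inside
 * [dst.reg_offset, dst.reg_offset + regs_written).
 */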
333 bool
334 fs_inst::overwrites_reg(const fs_reg &reg)
335 {
336 return (reg.file == dst.file &&
337 reg.reg == dst.reg &&
338 reg.reg_offset >= dst.reg_offset &&
339 reg.reg_offset < dst.reg_offset + regs_written);
340 }
341
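/** Returns true for any of the texturing (sampler message) opcodes. */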
342 bool
343 fs_inst::is_tex()
344 {
345 return (opcode == SHADER_OPCODE_TEX ||
346 opcode == FS_OPCODE_TXB ||
347 opcode == SHADER_OPCODE_TXD ||
348 opcode == SHADER_OPCODE_TXF ||
349 opcode == SHADER_OPCODE_TXF_MS ||
350 opcode == SHADER_OPCODE_TXL ||
351 opcode == SHADER_OPCODE_TXS ||
352 opcode == SHADER_OPCODE_LOD);
353 }
354
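/** Returns true for the extended math (math unit) opcodes. */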
355 bool
356 fs_inst::is_math()
357 {
358 return (opcode == SHADER_OPCODE_RCP ||
359 opcode == SHADER_OPCODE_RSQ ||
360 opcode == SHADER_OPCODE_SQRT ||
361 opcode == SHADER_OPCODE_EXP2 ||
362 opcode == SHADER_OPCODE_LOG2 ||
363 opcode == SHADER_OPCODE_SIN ||
364 opcode == SHADER_OPCODE_COS ||
365 opcode == SHADER_OPCODE_INT_QUOTIENT ||
366 opcode == SHADER_OPCODE_INT_REMAINDER ||
367 opcode == SHADER_OPCODE_POW);
368 }
369
370 bool
371 fs_inst::is_control_flow()
372 {
373 switch (opcode) {
374 case BRW_OPCODE_DO:
375 case BRW_OPCODE_WHILE:
376 case BRW_OPCODE_IF:
377 case BRW_OPCODE_ELSE:
378 case BRW_OPCODE_ENDIF:
379 case BRW_OPCODE_BREAK:
380 case BRW_OPCODE_CONTINUE:
381 return true;
382 default:
383 return false;
384 }
385 }
386
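/**
 * Returns true for send-like opcodes whose message payload lives in a GRF
 * rather than being built up in MRFs.
 */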
387 bool
388 fs_inst::is_send_from_grf()
389 {
390 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
391 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
392 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
393 src[1].file == GRF));
394 }
395
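/**
 * Returns whether an instruction's sources may carry negate/abs modifiers:
 * gen6 math ignores source modifiers, and sends whose payload comes straight
 * from a GRF need the payload read as-is.
 */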
396 bool
397 fs_visitor::can_do_source_mods(fs_inst *inst)
398 {
399 if (intel->gen == 6 && inst->is_math())
400 return false;
401
402 if (inst->is_send_from_grf())
403 return false;
404
405 return true;
406 }
407
408 void
409 fs_reg::init()
410 {
411 memset(this, 0, sizeof(*this));
412 this->smear = -1;
413 }
414
415 /** Generic unset register constructor. */
416 fs_reg::fs_reg()
417 {
418 init();
419 this->file = BAD_FILE;
420 }
421
422 /** Immediate value constructor. */
423 fs_reg::fs_reg(float f)
424 {
425 init();
426 this->file = IMM;
427 this->type = BRW_REGISTER_TYPE_F;
428 this->imm.f = f;
429 }
430
431 /** Immediate value constructor. */
432 fs_reg::fs_reg(int32_t i)
433 {
434 init();
435 this->file = IMM;
436 this->type = BRW_REGISTER_TYPE_D;
437 this->imm.i = i;
438 }
439
440 /** Immediate value constructor. */
441 fs_reg::fs_reg(uint32_t u)
442 {
443 init();
444 this->file = IMM;
445 this->type = BRW_REGISTER_TYPE_UD;
446 this->imm.u = u;
447 }
448
449 /** Fixed brw_reg Immediate value constructor. */
450 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
451 {
452 init();
453 this->file = FIXED_HW_REG;
454 this->fixed_hw_reg = fixed_hw_reg;
455 this->type = fixed_hw_reg.type;
456 }
457
458 bool
459 fs_reg::equals(const fs_reg &r) const
460 {
461 return (file == r.file &&
462 reg == r.reg &&
463 reg_offset == r.reg_offset &&
464 type == r.type &&
465 negate == r.negate &&
466 abs == r.abs &&
467 !reladdr && !r.reladdr &&
468 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
469 sizeof(fixed_hw_reg)) == 0 &&
470 smear == r.smear &&
471 imm.u == r.imm.u);
472 }
473
474 bool
475 fs_reg::is_zero() const
476 {
477 if (file != IMM)
478 return false;
479
480 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
481 }
482
483 bool
484 fs_reg::is_one() const
485 {
486 if (file != IMM)
487 return false;
488
489 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
490 }
491
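/**
 * Returns how many scalar slots a variable of the given GLSL type occupies
 * in the virtual GRF file (samplers take none, since they're baked in at
 * link time).
 */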
492 int
493 fs_visitor::type_size(const struct glsl_type *type)
494 {
495 unsigned int size, i;
496
497 switch (type->base_type) {
498 case GLSL_TYPE_UINT:
499 case GLSL_TYPE_INT:
500 case GLSL_TYPE_FLOAT:
501 case GLSL_TYPE_BOOL:
502 return type->components();
503 case GLSL_TYPE_ARRAY:
504 return type_size(type->fields.array) * type->length;
505 case GLSL_TYPE_STRUCT:
506 size = 0;
507 for (i = 0; i < type->length; i++) {
508 size += type_size(type->fields.structure[i].type);
509 }
510 return size;
511 case GLSL_TYPE_SAMPLER:
512 /* Samplers take up no register space, since they're baked in at
513 * link time.
514 */
515 return 0;
516 case GLSL_TYPE_VOID:
517 case GLSL_TYPE_ERROR:
518 case GLSL_TYPE_INTERFACE:
519 assert(!"not reached");
520 break;
521 }
522
523 return 0;
524 }
525
526 fs_reg
527 fs_visitor::get_timestamp()
528 {
529 assert(intel->gen >= 7);
530
531 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
532 BRW_ARF_TIMESTAMP,
533 0),
534 BRW_REGISTER_TYPE_UD));
535
536 fs_reg dst = fs_reg(this, glsl_type::uint_type);
537
538 fs_inst *mov = emit(MOV(dst, ts));
539 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
540 * even if it's not enabled in the dispatch.
541 */
542 mov->force_writemask_all = true;
543 mov->force_uncompressed = true;
544
545 /* The caller wants the low 32 bits of the timestamp. Since it's running
546 * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
547 * which is plenty of time for our purposes. It is identical across the
548 * EUs, but since it's tracking GPU core speed it will increment at a
549 * varying rate as render P-states change.
550 *
551 * The caller could also check if render P-states have changed (or anything
552 * else that might disrupt timing) by setting smear to 2 and checking if
553 * that field is != 0.
554 */
555 dst.smear = 0;
556
557 return dst;
558 }
559
560 void
561 fs_visitor::emit_shader_time_begin()
562 {
563 current_annotation = "shader time start";
564 shader_start_time = get_timestamp();
565 }
566
567 void
568 fs_visitor::emit_shader_time_end()
569 {
570 current_annotation = "shader time end";
571
572 enum shader_time_shader_type type, written_type, reset_type;
573 if (dispatch_width == 8) {
574 type = ST_FS8;
575 written_type = ST_FS8_WRITTEN;
576 reset_type = ST_FS8_RESET;
577 } else {
578 assert(dispatch_width == 16);
579 type = ST_FS16;
580 written_type = ST_FS16_WRITTEN;
581 reset_type = ST_FS16_RESET;
582 }
583
584 fs_reg shader_end_time = get_timestamp();
585
586 /* Check that there weren't any timestamp reset events (assuming these
587 * were the only two timestamp reads that happened).
588 */
589 fs_reg reset = shader_end_time;
590 reset.smear = 2;
591 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
592 test->conditional_mod = BRW_CONDITIONAL_Z;
593 emit(IF(BRW_PREDICATE_NORMAL));
594
595 push_force_uncompressed();
596 fs_reg start = shader_start_time;
597 start.negate = true;
598 fs_reg diff = fs_reg(this, glsl_type::uint_type);
599 emit(ADD(diff, start, shader_end_time));
600
601 /* If there were no instructions between the two timestamp gets, the diff
602 * is 2 cycles. Remove that overhead, so I can forget about that when
603 * trying to determine the time taken for single instructions.
604 */
605 emit(ADD(diff, diff, fs_reg(-2u)));
606
607 emit_shader_time_write(type, diff);
608 emit_shader_time_write(written_type, fs_reg(1u));
609 emit(BRW_OPCODE_ELSE);
610 emit_shader_time_write(reset_type, fs_reg(1u));
611 emit(BRW_OPCODE_ENDIF);
612
613 pop_force_uncompressed();
614 }
615
616 void
617 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
618 fs_reg value)
619 {
620 int shader_time_index =
621 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
622 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
623
624 fs_reg payload;
625 if (dispatch_width == 8)
626 payload = fs_reg(this, glsl_type::uvec2_type);
627 else
628 payload = fs_reg(this, glsl_type::uint_type);
629
630 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
631 fs_reg(), payload, offset, value));
632 }
633
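/**
 * Marks the compile as failed and records the message; only the first
 * failure is kept, and it is printed when DEBUG_WM is set.
 */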
634 void
635 fs_visitor::fail(const char *format, ...)
636 {
637 va_list va;
638 char *msg;
639
640 if (failed)
641 return;
642
643 failed = true;
644
645 va_start(va, format);
646 msg = ralloc_vasprintf(mem_ctx, format, va);
647 va_end(va);
648 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
649
650 this->fail_msg = msg;
651
652 if (INTEL_DEBUG & DEBUG_WM) {
653 fprintf(stderr, "%s", msg);
654 }
655 }
656
657 fs_inst *
658 fs_visitor::emit(enum opcode opcode)
659 {
660 return emit(fs_inst(opcode));
661 }
662
663 fs_inst *
664 fs_visitor::emit(enum opcode opcode, fs_reg dst)
665 {
666 return emit(fs_inst(opcode, dst));
667 }
668
669 fs_inst *
670 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
671 {
672 return emit(fs_inst(opcode, dst, src0));
673 }
674
675 fs_inst *
676 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
677 {
678 return emit(fs_inst(opcode, dst, src0, src1));
679 }
680
681 fs_inst *
682 fs_visitor::emit(enum opcode opcode, fs_reg dst,
683 fs_reg src0, fs_reg src1, fs_reg src2)
684 {
685 return emit(fs_inst(opcode, dst, src0, src1, src2));
686 }
687
688 void
689 fs_visitor::push_force_uncompressed()
690 {
691 force_uncompressed_stack++;
692 }
693
694 void
695 fs_visitor::pop_force_uncompressed()
696 {
697 force_uncompressed_stack--;
698 assert(force_uncompressed_stack >= 0);
699 }
700
701 void
702 fs_visitor::push_force_sechalf()
703 {
704 force_sechalf_stack++;
705 }
706
707 void
708 fs_visitor::pop_force_sechalf()
709 {
710 force_sechalf_stack--;
711 assert(force_sechalf_stack >= 0);
712 }
713
714 /**
715 * Returns true if the instruction has a flag that means it won't
716 * update an entire destination register.
717 *
718 * For example, dead code elimination and live variable analysis want to know
719 * when a write to a variable screens off any preceding values that were in
720 * it.
721 */
722 bool
723 fs_inst::is_partial_write()
724 {
725 return (this->predicate ||
726 this->force_uncompressed ||
727 this->force_sechalf);
728 }
729
730 /**
731 * Returns how many MRFs an FS opcode will write over.
732 *
733 * Note that this is not the 0 or 1 implied writes in an actual gen
734 * instruction -- the FS opcodes often generate MOVs in addition.
735 */
736 int
737 fs_visitor::implied_mrf_writes(fs_inst *inst)
738 {
739 if (inst->mlen == 0)
740 return 0;
741
742 switch (inst->opcode) {
743 case SHADER_OPCODE_RCP:
744 case SHADER_OPCODE_RSQ:
745 case SHADER_OPCODE_SQRT:
746 case SHADER_OPCODE_EXP2:
747 case SHADER_OPCODE_LOG2:
748 case SHADER_OPCODE_SIN:
749 case SHADER_OPCODE_COS:
750 return 1 * dispatch_width / 8;
751 case SHADER_OPCODE_POW:
752 case SHADER_OPCODE_INT_QUOTIENT:
753 case SHADER_OPCODE_INT_REMAINDER:
754 return 2 * dispatch_width / 8;
755 case SHADER_OPCODE_TEX:
756 case FS_OPCODE_TXB:
757 case SHADER_OPCODE_TXD:
758 case SHADER_OPCODE_TXF:
759 case SHADER_OPCODE_TXF_MS:
760 case SHADER_OPCODE_TXL:
761 case SHADER_OPCODE_TXS:
762 case SHADER_OPCODE_LOD:
763 return 1;
764 case FS_OPCODE_FB_WRITE:
765 return 2;
766 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
767 case FS_OPCODE_UNSPILL:
768 return 1;
769 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
770 return inst->mlen;
771 case FS_OPCODE_SPILL:
772 return 2;
773 default:
774 assert(!"not reached");
775 return inst->mlen;
776 }
777 }
778
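/**
 * Allocates a new virtual GRF of \p size registers and returns its index,
 * growing the virtual_grf_sizes[] array as needed.
 */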
779 int
780 fs_visitor::virtual_grf_alloc(int size)
781 {
782 if (virtual_grf_array_size <= virtual_grf_count) {
783 if (virtual_grf_array_size == 0)
784 virtual_grf_array_size = 16;
785 else
786 virtual_grf_array_size *= 2;
787 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
788 virtual_grf_array_size);
789 }
790 virtual_grf_sizes[virtual_grf_count] = size;
791 return virtual_grf_count++;
792 }
793
794 /** Fixed HW reg constructor. */
795 fs_reg::fs_reg(enum register_file file, int reg)
796 {
797 init();
798 this->file = file;
799 this->reg = reg;
800 this->type = BRW_REGISTER_TYPE_F;
801 }
802
803 /** Fixed HW reg constructor. */
804 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
805 {
806 init();
807 this->file = file;
808 this->reg = reg;
809 this->type = type;
810 }
811
812 /** Automatic reg constructor. */
813 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
814 {
815 init();
816
817 this->file = GRF;
818 this->reg = v->virtual_grf_alloc(v->type_size(type));
819 this->reg_offset = 0;
820 this->type = brw_type_for_base_type(type);
821 }
822
823 fs_reg *
824 fs_visitor::variable_storage(ir_variable *var)
825 {
826 return (fs_reg *)hash_table_find(this->variable_ht, var);
827 }
828
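/* hash_table_call_foreach() callback: copies each UNIFORM-file variable
 * mapping from the source visitor's hash table into the destination one.
 */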
829 void
830 import_uniforms_callback(const void *key,
831 void *data,
832 void *closure)
833 {
834 struct hash_table *dst_ht = (struct hash_table *)closure;
835 const fs_reg *reg = (const fs_reg *)data;
836
837 if (reg->file != UNIFORM)
838 return;
839
840 hash_table_insert(dst_ht, data, key);
841 }
842
843 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
844  * This brings in those uniform definitions.
845 */
846 void
847 fs_visitor::import_uniforms(fs_visitor *v)
848 {
849 hash_table_call_foreach(v->variable_ht,
850 import_uniforms_callback,
851 variable_ht);
852 this->params_remap = v->params_remap;
853 }
854
855 /* Our support for uniforms is piggy-backed on the struct
856 * gl_fragment_program, because that's where the values actually
857 * get stored, rather than in some global gl_shader_program uniform
858 * store.
859 */
860 void
861 fs_visitor::setup_uniform_values(ir_variable *ir)
862 {
863 int namelen = strlen(ir->name);
864
865 /* The data for our (non-builtin) uniforms is stored in a series of
866 * gl_uniform_driver_storage structs for each subcomponent that
867 * glGetUniformLocation() could name. We know it's been set up in the same
868 * order we'd walk the type, so walk the list of storage and find anything
869 * with our name, or the prefix of a component that starts with our name.
870 */
871 unsigned params_before = c->prog_data.nr_params;
872 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
873 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
874
875 if (strncmp(ir->name, storage->name, namelen) != 0 ||
876 (storage->name[namelen] != 0 &&
877 storage->name[namelen] != '.' &&
878 storage->name[namelen] != '[')) {
879 continue;
880 }
881
882 unsigned slots = storage->type->component_slots();
883 if (storage->array_elements)
884 slots *= storage->array_elements;
885
886 for (unsigned i = 0; i < slots; i++) {
887 c->prog_data.param[c->prog_data.nr_params++] =
888 &storage->storage[i].f;
889 }
890 }
891
892 /* Make sure we actually initialized the right amount of stuff here. */
893 assert(params_before + ir->type->component_slots() ==
894 c->prog_data.nr_params);
895 }
896
897
898 /* Our support for builtin uniforms is even scarier than non-builtin.
899 * It sits on top of the PROG_STATE_VAR parameters that are
900 * automatically updated from GL context state.
901 */
902 void
903 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
904 {
905 const ir_state_slot *const slots = ir->state_slots;
906 assert(ir->state_slots != NULL);
907
908 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
909 /* This state reference has already been setup by ir_to_mesa, but we'll
910 * get the same index back here.
911 */
912 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
913 (gl_state_index *)slots[i].tokens);
914
915 /* Add each of the unique swizzles of the element as a parameter.
916 * This'll end up matching the expected layout of the
917 * array/matrix/structure we're trying to fill in.
918 */
919 int last_swiz = -1;
920 for (unsigned int j = 0; j < 4; j++) {
921 int swiz = GET_SWZ(slots[i].swizzle, j);
922 if (swiz == last_swiz)
923 break;
924 last_swiz = swiz;
925
926 c->prog_data.param[c->prog_data.nr_params++] =
927 &fp->Base.Parameters->ParameterValues[index][swiz].f;
928 }
929 }
930 }
931
932 fs_reg *
933 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
934 {
935 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
936 fs_reg wpos = *reg;
937 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
938
939 /* gl_FragCoord.x */
940 if (ir->pixel_center_integer) {
941 emit(MOV(wpos, this->pixel_x));
942 } else {
943 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
944 }
945 wpos.reg_offset++;
946
947 /* gl_FragCoord.y */
948 if (!flip && ir->pixel_center_integer) {
949 emit(MOV(wpos, this->pixel_y));
950 } else {
951 fs_reg pixel_y = this->pixel_y;
952 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
953
954 if (flip) {
955 pixel_y.negate = true;
956 offset += c->key.drawable_height - 1.0;
957 }
958
959 emit(ADD(wpos, pixel_y, fs_reg(offset)));
960 }
961 wpos.reg_offset++;
962
963 /* gl_FragCoord.z */
964 if (intel->gen >= 6) {
965 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
966 } else {
967 emit(FS_OPCODE_LINTERP, wpos,
968 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
969 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
970 interp_reg(VARYING_SLOT_POS, 2));
971 }
972 wpos.reg_offset++;
973
974 /* gl_FragCoord.w: Already set up in emit_interpolation */
975 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
976
977 return reg;
978 }
979
980 fs_inst *
981 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
982 glsl_interp_qualifier interpolation_mode,
983 bool is_centroid)
984 {
985 brw_wm_barycentric_interp_mode barycoord_mode;
986 if (is_centroid) {
987 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
988 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
989 else
990 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
991 } else {
992 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
993 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
994 else
995 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
996 }
997 return emit(FS_OPCODE_LINTERP, attr,
998 this->delta_x[barycoord_mode],
999 this->delta_y[barycoord_mode], interp);
1000 }
1001
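/**
 * Emits interpolation (or constant moves, for flat shading) for a varying
 * input, one component at a time, walking array elements and matrix columns.
 */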
1002 fs_reg *
1003 fs_visitor::emit_general_interpolation(ir_variable *ir)
1004 {
1005 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1006 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1007 fs_reg attr = *reg;
1008
1009 unsigned int array_elements;
1010 const glsl_type *type;
1011
1012 if (ir->type->is_array()) {
1013 array_elements = ir->type->length;
1014 if (array_elements == 0) {
1015 fail("dereferenced array '%s' has length 0\n", ir->name);
1016 }
1017 type = ir->type->fields.array;
1018 } else {
1019 array_elements = 1;
1020 type = ir->type;
1021 }
1022
1023 glsl_interp_qualifier interpolation_mode =
1024 ir->determine_interpolation_mode(c->key.flat_shade);
1025
1026 int location = ir->location;
1027 for (unsigned int i = 0; i < array_elements; i++) {
1028 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1029 if (urb_setup[location] == -1) {
1030 /* If there's no incoming setup data for this slot, don't
1031 * emit interpolation for it.
1032 */
1033 attr.reg_offset += type->vector_elements;
1034 location++;
1035 continue;
1036 }
1037
1038 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1039 /* Constant interpolation (flat shading) case. The SF has
1040 * handed us defined values in only the constant offset
1041 * field of the setup reg.
1042 */
1043 for (unsigned int k = 0; k < type->vector_elements; k++) {
1044 struct brw_reg interp = interp_reg(location, k);
1045 interp = suboffset(interp, 3);
1046 interp.type = reg->type;
1047 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1048 attr.reg_offset++;
1049 }
1050 } else {
1051 /* Smooth/noperspective interpolation case. */
1052 for (unsigned int k = 0; k < type->vector_elements; k++) {
1053 /* FINISHME: At some point we probably want to push
1054 * this farther by giving similar treatment to the
1055 * other potentially constant components of the
1056 * attribute, as well as making brw_vs_constval.c
1057 * handle varyings other than gl_TexCoord.
1058 */
1059 struct brw_reg interp = interp_reg(location, k);
1060 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1061 ir->centroid);
1062 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1063 /* Get the pixel/sample mask into f0 so that we know
1064 * which pixels are lit. Then, for each channel that is
1065 * unlit, replace the centroid data with non-centroid
1066 * data.
1067 */
1068 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1069 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1070 interpolation_mode, false);
1071 inst->predicate = BRW_PREDICATE_NORMAL;
1072 inst->predicate_inverse = true;
1073 }
1074 if (intel->gen < 6) {
1075 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1076 }
1077 attr.reg_offset++;
1078 }
1079
1080 }
1081 location++;
1082 }
1083 }
1084
1085 return reg;
1086 }
1087
1088 fs_reg *
1089 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1090 {
1091 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1092
1093 /* The frontfacing comes in as a bit in the thread payload. */
1094 if (intel->gen >= 6) {
1095 emit(BRW_OPCODE_ASR, *reg,
1096 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1097 fs_reg(15));
1098 emit(BRW_OPCODE_NOT, *reg, *reg);
1099 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1100 } else {
1101 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1102 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1103 * us front face
1104 */
1105 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1106 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1107 }
1108
1109 return reg;
1110 }
1111
1112 fs_reg
1113 fs_visitor::fix_math_operand(fs_reg src)
1114 {
1115 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1116 * might be able to do better by doing execsize = 1 math and then
1117 * expanding that result out, but we would need to be careful with
1118 * masking.
1119 *
1120 * The hardware ignores source modifiers (negate and abs) on math
1121 * instructions, so we also move to a temp to set those up.
1122 */
1123 if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1124 !src.abs && !src.negate)
1125 return src;
1126
1127 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1128 * operands to math
1129 */
1130 if (intel->gen >= 7 && src.file != IMM)
1131 return src;
1132
1133 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1134 expanded.type = src.type;
1135 emit(BRW_OPCODE_MOV, expanded, src);
1136 return expanded;
1137 }
1138
1139 fs_inst *
1140 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1141 {
1142 switch (opcode) {
1143 case SHADER_OPCODE_RCP:
1144 case SHADER_OPCODE_RSQ:
1145 case SHADER_OPCODE_SQRT:
1146 case SHADER_OPCODE_EXP2:
1147 case SHADER_OPCODE_LOG2:
1148 case SHADER_OPCODE_SIN:
1149 case SHADER_OPCODE_COS:
1150 break;
1151 default:
1152 assert(!"not reached: bad math opcode");
1153 return NULL;
1154 }
1155
1156 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1157 * might be able to do better by doing execsize = 1 math and then
1158 * expanding that result out, but we would need to be careful with
1159 * masking.
1160 *
1161 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1162 * instructions, so we also move to a temp to set those up.
1163 */
1164 if (intel->gen >= 6)
1165 src = fix_math_operand(src);
1166
1167 fs_inst *inst = emit(opcode, dst, src);
1168
1169 if (intel->gen < 6) {
1170 inst->base_mrf = 2;
1171 inst->mlen = dispatch_width / 8;
1172 }
1173
1174 return inst;
1175 }
1176
1177 fs_inst *
1178 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1179 {
1180 int base_mrf = 2;
1181 fs_inst *inst;
1182
1183 switch (opcode) {
1184 case SHADER_OPCODE_INT_QUOTIENT:
1185 case SHADER_OPCODE_INT_REMAINDER:
1186 if (intel->gen >= 7 && dispatch_width == 16)
1187 fail("16-wide INTDIV unsupported\n");
1188 break;
1189 case SHADER_OPCODE_POW:
1190 break;
1191 default:
1192 assert(!"not reached: unsupported binary math opcode.");
1193 return NULL;
1194 }
1195
1196 if (intel->gen >= 6) {
1197 src0 = fix_math_operand(src0);
1198 src1 = fix_math_operand(src1);
1199
1200 inst = emit(opcode, dst, src0, src1);
1201 } else {
1202 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1203 * "Message Payload":
1204 *
1205 * "Operand0[7]. For the INT DIV functions, this operand is the
1206 * denominator."
1207 * ...
1208 * "Operand1[7]. For the INT DIV functions, this operand is the
1209 * numerator."
1210 */
1211 bool is_int_div = opcode != SHADER_OPCODE_POW;
1212 fs_reg &op0 = is_int_div ? src1 : src0;
1213 fs_reg &op1 = is_int_div ? src0 : src1;
1214
1215 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1216 inst = emit(opcode, dst, op0, reg_null_f);
1217
1218 inst->base_mrf = base_mrf;
1219 inst->mlen = 2 * dispatch_width / 8;
1220 }
1221 return inst;
1222 }
1223
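/**
 * Assigns the push-constant (CURBE) part of the payload: computes how many
 * registers the uniforms occupy and rewrites UNIFORM-file sources as fixed
 * HW registers within that payload.
 */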
1224 void
1225 fs_visitor::assign_curb_setup()
1226 {
1227 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1228 if (dispatch_width == 8) {
1229 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1230 } else {
1231 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1232 }
1233
1234 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1235 foreach_list(node, &this->instructions) {
1236 fs_inst *inst = (fs_inst *)node;
1237
1238 for (unsigned int i = 0; i < 3; i++) {
1239 if (inst->src[i].file == UNIFORM) {
1240 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1241 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1242 constant_nr / 8,
1243 constant_nr % 8);
1244
1245 inst->src[i].file = FIXED_HW_REG;
1246 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1247 }
1248 }
1249 }
1250 }
1251
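/**
 * Decides which incoming setup slot each fragment shader input is read from,
 * filling urb_setup[] and computing the URB read length.
 */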
1252 void
1253 fs_visitor::calculate_urb_setup()
1254 {
1255 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1256 urb_setup[i] = -1;
1257 }
1258
1259 int urb_next = 0;
1260 /* Figure out where each of the incoming setup attributes lands. */
1261 if (intel->gen >= 6) {
1262 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1263 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1264 urb_setup[i] = urb_next++;
1265 }
1266 }
1267 } else {
1268 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1269 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1270 /* Point size is packed into the header, not as a general attribute */
1271 if (i == VARYING_SLOT_PSIZ)
1272 continue;
1273
1274 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1275 /* The back color slot is skipped when the front color is
1276 * also written to. In addition, some slots can be
1277 * written in the vertex shader and not read in the
1278 * fragment shader. So the register number must always be
1279 * incremented, mapped or not.
1280 */
1281 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1282 urb_setup[i] = urb_next;
1283 urb_next++;
1284 }
1285 }
1286
1287 /*
1288        * It's an FS-only attribute, and we did the interpolation for this attribute
1289        * in the SF thread. So, count it here, too.
1290 *
1291 * See compile_sf_prog() for more info.
1292 */
1293 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1294 urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1295 }
1296
1297 /* Each attribute is 4 setup channels, each of which is half a reg. */
1298 c->prog_data.urb_read_length = urb_next * 2;
1299 }
1300
1301 void
1302 fs_visitor::assign_urb_setup()
1303 {
1304 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1305
1306    /* Offset all the urb_setup[] indices by the actual position of the
1307 * setup regs, now that the location of the constants has been chosen.
1308 */
1309 foreach_list(node, &this->instructions) {
1310 fs_inst *inst = (fs_inst *)node;
1311
1312 if (inst->opcode == FS_OPCODE_LINTERP) {
1313 assert(inst->src[2].file == FIXED_HW_REG);
1314 inst->src[2].fixed_hw_reg.nr += urb_start;
1315 }
1316
1317 if (inst->opcode == FS_OPCODE_CINTERP) {
1318 assert(inst->src[0].file == FIXED_HW_REG);
1319 inst->src[0].fixed_hw_reg.nr += urb_start;
1320 }
1321 }
1322
1323 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1324 }
1325
1326 /**
1327 * Split large virtual GRFs into separate components if we can.
1328 *
1329 * This is mostly duplicated with what brw_fs_vector_splitting does,
1330 * but that's really conservative because it's afraid of doing
1331 * splitting that doesn't result in real progress after the rest of
1332 * the optimization phases, which would cause infinite looping in
1333 * optimization. We can do it once here, safely. This also has the
1334 * opportunity to split interpolated values, or maybe even uniforms,
1335 * which we don't have at the IR level.
1336 *
1337 * We want to split, because virtual GRFs are what we register
1338 * allocate and spill (due to contiguousness requirements for some
1339 * instructions), and they're what we naturally generate in the
1340 * codegen process, but most virtual GRFs don't actually need to be
1341 * contiguous sets of GRFs. If we split, we'll end up with reduced
1342 * live intervals and better dead code elimination and coalescing.
1343 */
1344 void
1345 fs_visitor::split_virtual_grfs()
1346 {
1347 int num_vars = this->virtual_grf_count;
1348 bool split_grf[num_vars];
1349 int new_virtual_grf[num_vars];
1350
1351    /* Try to split anything larger than one register. */
1352 for (int i = 0; i < num_vars; i++) {
1353 if (this->virtual_grf_sizes[i] != 1)
1354 split_grf[i] = true;
1355 else
1356 split_grf[i] = false;
1357 }
1358
1359 if (brw->has_pln &&
1360 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1361 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1362 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1363 * Gen6, that was the only supported interpolation mode, and since Gen6,
1364 * delta_x and delta_y are in fixed hardware registers.
1365 */
1366 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1367 false;
1368 }
1369
1370 foreach_list(node, &this->instructions) {
1371 fs_inst *inst = (fs_inst *)node;
1372
1373 /* If there's a SEND message that requires contiguous destination
1374 * registers, no splitting is allowed.
1375 */
1376 if (inst->regs_written > 1) {
1377 split_grf[inst->dst.reg] = false;
1378 }
1379
1380 /* If we're sending from a GRF, don't split it, on the assumption that
1381 * the send is reading the whole thing.
1382 */
1383 if (inst->is_send_from_grf()) {
1384 split_grf[inst->src[0].reg] = false;
1385 }
1386 }
1387
1388 /* Allocate new space for split regs. Note that the virtual
1389 * numbers will be contiguous.
1390 */
1391 for (int i = 0; i < num_vars; i++) {
1392 if (split_grf[i]) {
1393 new_virtual_grf[i] = virtual_grf_alloc(1);
1394 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1395 int reg = virtual_grf_alloc(1);
1396 assert(reg == new_virtual_grf[i] + j - 1);
1397 (void) reg;
1398 }
1399 this->virtual_grf_sizes[i] = 1;
1400 }
1401 }
1402
1403 foreach_list(node, &this->instructions) {
1404 fs_inst *inst = (fs_inst *)node;
1405
1406 if (inst->dst.file == GRF &&
1407 split_grf[inst->dst.reg] &&
1408 inst->dst.reg_offset != 0) {
1409 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1410 inst->dst.reg_offset - 1);
1411 inst->dst.reg_offset = 0;
1412 }
1413 for (int i = 0; i < 3; i++) {
1414 if (inst->src[i].file == GRF &&
1415 split_grf[inst->src[i].reg] &&
1416 inst->src[i].reg_offset != 0) {
1417 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1418 inst->src[i].reg_offset - 1);
1419 inst->src[i].reg_offset = 0;
1420 }
1421 }
1422 }
1423 this->live_intervals_valid = false;
1424 }
1425
1426 /**
1427 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1428 *
1429 * During code generation, we create tons of temporary variables, many of
1430 * which get immediately killed and are never used again. Yet, in later
1431 * optimization and analysis passes, such as compute_live_intervals, we need
1432 * to loop over all the virtual GRFs. Compacting them can save a lot of
1433 * overhead.
1434 */
1435 void
1436 fs_visitor::compact_virtual_grfs()
1437 {
1438 /* Mark which virtual GRFs are used, and count how many. */
1439 int remap_table[this->virtual_grf_count];
1440 memset(remap_table, -1, sizeof(remap_table));
1441
1442 foreach_list(node, &this->instructions) {
1443 const fs_inst *inst = (const fs_inst *) node;
1444
1445 if (inst->dst.file == GRF)
1446 remap_table[inst->dst.reg] = 0;
1447
1448 for (int i = 0; i < 3; i++) {
1449 if (inst->src[i].file == GRF)
1450 remap_table[inst->src[i].reg] = 0;
1451 }
1452 }
1453
1454 /* In addition to registers used in instructions, fs_visitor keeps
1455 * direct references to certain special values which must be patched:
1456 */
1457 fs_reg *special[] = {
1458 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1459 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1460 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1461 &delta_x[0], &delta_x[1], &delta_x[2],
1462 &delta_x[3], &delta_x[4], &delta_x[5],
1463 &delta_y[0], &delta_y[1], &delta_y[2],
1464 &delta_y[3], &delta_y[4], &delta_y[5],
1465 };
1466 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1467 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1468
1469 /* Treat all special values as used, to be conservative */
1470 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1471 if (special[i]->file == GRF)
1472 remap_table[special[i]->reg] = 0;
1473 }
1474
1475 /* Compact the GRF arrays. */
1476 int new_index = 0;
1477 for (int i = 0; i < this->virtual_grf_count; i++) {
1478 if (remap_table[i] != -1) {
1479 remap_table[i] = new_index;
1480 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1481 if (live_intervals_valid) {
1482 virtual_grf_use[new_index] = virtual_grf_use[i];
1483 virtual_grf_def[new_index] = virtual_grf_def[i];
1484 }
1485 ++new_index;
1486 }
1487 }
1488
1489 this->virtual_grf_count = new_index;
1490
1491 /* Patch all the instructions to use the newly renumbered registers */
1492 foreach_list(node, &this->instructions) {
1493 fs_inst *inst = (fs_inst *) node;
1494
1495 if (inst->dst.file == GRF)
1496 inst->dst.reg = remap_table[inst->dst.reg];
1497
1498 for (int i = 0; i < 3; i++) {
1499 if (inst->src[i].file == GRF)
1500 inst->src[i].reg = remap_table[inst->src[i].reg];
1501 }
1502 }
1503
1504 /* Patch all the references to special values */
1505 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1506 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1507 special[i]->reg = remap_table[special[i]->reg];
1508 }
1509 }
1510
1511 bool
1512 fs_visitor::remove_dead_constants()
1513 {
1514 if (dispatch_width == 8) {
1515 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1516
1517 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1518 this->params_remap[i] = -1;
1519
1520 /* Find which params are still in use. */
1521 foreach_list(node, &this->instructions) {
1522 fs_inst *inst = (fs_inst *)node;
1523
1524 for (int i = 0; i < 3; i++) {
1525 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1526
1527 if (inst->src[i].file != UNIFORM)
1528 continue;
1529
1530 assert(constant_nr < (int)c->prog_data.nr_params);
1531
1532 /* For now, set this to non-negative. We'll give it the
1533 * actual new number in a moment, in order to keep the
1534 * register numbers nicely ordered.
1535 */
1536 this->params_remap[constant_nr] = 0;
1537 }
1538 }
1539
1540 /* Figure out what the new numbers for the params will be. At some
1541 * point when we're doing uniform array access, we're going to want
1542 * to keep the distinction between .reg and .reg_offset, but for
1543 * now we don't care.
1544 */
1545 unsigned int new_nr_params = 0;
1546 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1547 if (this->params_remap[i] != -1) {
1548 this->params_remap[i] = new_nr_params++;
1549 }
1550 }
1551
1552 /* Update the list of params to be uploaded to match our new numbering. */
1553 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1554 int remapped = this->params_remap[i];
1555
1556 if (remapped == -1)
1557 continue;
1558
1559 c->prog_data.param[remapped] = c->prog_data.param[i];
1560 }
1561
1562 c->prog_data.nr_params = new_nr_params;
1563 } else {
1564 /* This should have been generated in the 8-wide pass already. */
1565 assert(this->params_remap);
1566 }
1567
1568 /* Now do the renumbering of the shader to remove unused params. */
1569 foreach_list(node, &this->instructions) {
1570 fs_inst *inst = (fs_inst *)node;
1571
1572 for (int i = 0; i < 3; i++) {
1573 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1574
1575 if (inst->src[i].file != UNIFORM)
1576 continue;
1577
1578 assert(this->params_remap[constant_nr] != -1);
1579 inst->src[i].reg = this->params_remap[constant_nr];
1580 inst->src[i].reg_offset = 0;
1581 }
1582 }
1583
1584 return true;
1585 }
1586
1587 /*
1588 * Implements array access of uniforms by inserting a
1589 * PULL_CONSTANT_LOAD instruction.
1590 *
1591 * Unlike temporary GRF array access (where we don't support it due to
1592 * the difficulty of doing relative addressing on instruction
1593 * destinations), we could potentially do array access of uniforms
1594 * that were loaded in GRF space as push constants. In real-world
1595 * usage we've seen, though, the arrays being used are always larger
1596 * than we could load as push constants, so just always move all
1597 * uniform array access out to a pull constant buffer.
1598 */
1599 void
1600 fs_visitor::move_uniform_array_access_to_pull_constants()
1601 {
1602 int pull_constant_loc[c->prog_data.nr_params];
1603
1604 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1605 pull_constant_loc[i] = -1;
1606 }
1607
1608 /* Walk through and find array access of uniforms. Put a copy of that
1609 * uniform in the pull constant buffer.
1610 *
1611 * Note that we don't move constant-indexed accesses to arrays. No
1612 * testing has been done of the performance impact of this choice.
1613 */
1614 foreach_list_safe(node, &this->instructions) {
1615 fs_inst *inst = (fs_inst *)node;
1616
1617 for (int i = 0 ; i < 3; i++) {
1618 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1619 continue;
1620
1621 int uniform = inst->src[i].reg;
1622
1623 /* If this array isn't already present in the pull constant buffer,
1624 * add it.
1625 */
1626 if (pull_constant_loc[uniform] == -1) {
1627 const float **values = &c->prog_data.param[uniform];
1628
1629 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1630
1631 assert(param_size[uniform]);
1632
1633 for (int j = 0; j < param_size[uniform]; j++) {
1634 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1635 values[j];
1636 }
1637 }
1638
1639 /* Set up the annotation tracking for new generated instructions. */
1640 base_ir = inst->ir;
1641 current_annotation = inst->annotation;
1642
1643 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1644 fs_reg temp = fs_reg(this, glsl_type::float_type);
1645 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1646 surf_index,
1647 *inst->src[i].reladdr,
1648 pull_constant_loc[uniform] +
1649 inst->src[i].reg_offset);
1650 inst->insert_before(&list);
1651
1652 inst->src[i].file = temp.file;
1653 inst->src[i].reg = temp.reg;
1654 inst->src[i].reg_offset = temp.reg_offset;
1655 inst->src[i].reladdr = NULL;
1656 }
1657 }
1658 }
1659
1660 /**
1661 * Choose accesses from the UNIFORM file to demote to using the pull
1662 * constant buffer.
1663 *
1664 * We allow a fragment shader to have more than the specified minimum
1665 * maximum number of fragment shader uniform components (64). If
1666 * there are too many of these, they'd fill up all of register space.
1667 * So, this will push some of them out to the pull constant buffer and
1668 * update the program to load them.
1669 */
1670 void
1671 fs_visitor::setup_pull_constants()
1672 {
1673 /* Only allow 16 registers (128 uniform components) as push constants. */
1674 unsigned int max_uniform_components = 16 * 8;
1675 if (c->prog_data.nr_params <= max_uniform_components)
1676 return;
1677
1678 if (dispatch_width == 16) {
1679 fail("Pull constants not supported in 16-wide\n");
1680 return;
1681 }
1682
1683 /* Just demote the end of the list. We could probably do better
1684 * here, demoting things that are rarely used in the program first.
1685 */
1686 unsigned int pull_uniform_base = max_uniform_components;
1687
1688 int pull_constant_loc[c->prog_data.nr_params];
1689 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1690 if (i < pull_uniform_base) {
1691 pull_constant_loc[i] = -1;
1692 } else {
1693 pull_constant_loc[i] = -1;
1694 /* If our constant is already being uploaded for reladdr purposes,
1695 * reuse it.
1696 */
1697 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1698 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1699 pull_constant_loc[i] = j;
1700 break;
1701 }
1702 }
1703 if (pull_constant_loc[i] == -1) {
1704 int pull_index = c->prog_data.nr_pull_params++;
1705 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1706             pull_constant_loc[i] = pull_index;
1707 }
1708 }
1709 }
1710 c->prog_data.nr_params = pull_uniform_base;
1711
1712 foreach_list(node, &this->instructions) {
1713 fs_inst *inst = (fs_inst *)node;
1714
1715 for (int i = 0; i < 3; i++) {
1716 if (inst->src[i].file != UNIFORM)
1717 continue;
1718
1719 int pull_index = pull_constant_loc[inst->src[i].reg +
1720 inst->src[i].reg_offset];
1721 if (pull_index == -1)
1722 continue;
1723
1724 assert(!inst->src[i].reladdr);
1725
1726 fs_reg dst = fs_reg(this, glsl_type::float_type);
1727 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1728 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1729 fs_inst *pull =
1730 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1731 dst, index, offset);
1732 pull->ir = inst->ir;
1733 pull->annotation = inst->annotation;
1734
1735 inst->insert_before(pull);
1736
1737 inst->src[i].file = GRF;
1738 inst->src[i].reg = dst.reg;
1739 inst->src[i].reg_offset = 0;
1740 inst->src[i].smear = pull_index & 3;
1741 }
1742 }
1743 }
1744
1745 bool
1746 fs_visitor::opt_algebraic()
1747 {
1748 bool progress = false;
1749
1750 foreach_list(node, &this->instructions) {
1751 fs_inst *inst = (fs_inst *)node;
1752
1753 switch (inst->opcode) {
1754 case BRW_OPCODE_MUL:
1755 if (inst->src[1].file != IMM)
1756 continue;
1757
1758 /* a * 1.0 = a */
1759 if (inst->src[1].is_one()) {
1760 inst->opcode = BRW_OPCODE_MOV;
1761 inst->src[1] = reg_undef;
1762 progress = true;
1763 break;
1764 }
1765
1766 /* a * 0.0 = 0.0 */
1767 if (inst->src[1].is_zero()) {
1768 inst->opcode = BRW_OPCODE_MOV;
1769 inst->src[0] = inst->src[1];
1770 inst->src[1] = reg_undef;
1771 progress = true;
1772 break;
1773 }
1774
1775 break;
1776 case BRW_OPCODE_ADD:
1777 if (inst->src[1].file != IMM)
1778 continue;
1779
1780 /* a + 0.0 = a */
1781 if (inst->src[1].is_zero()) {
1782 inst->opcode = BRW_OPCODE_MOV;
1783 inst->src[1] = reg_undef;
1784 progress = true;
1785 break;
1786 }
1787 break;
1788 default:
1789 break;
1790 }
1791 }
1792
1793 return progress;
1794 }
1795
1796 /**
1797  * Must be called after calculate_live_intervals() to remove unused
1798  * writes to registers -- register allocation will fail otherwise
1799  * because something def'd but not used won't be considered to
1800 * interfere with other regs.
1801 */
1802 bool
1803 fs_visitor::dead_code_eliminate()
1804 {
1805 bool progress = false;
1806 int pc = 0;
1807
1808 calculate_live_intervals();
1809
1810 foreach_list_safe(node, &this->instructions) {
1811 fs_inst *inst = (fs_inst *)node;
1812
1813 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1814 inst->remove();
1815 progress = true;
1816 }
1817
1818 pc++;
1819 }
1820
1821 if (progress)
1822 live_intervals_valid = false;
1823
1824 return progress;
1825 }
1826
1827 struct dead_code_hash_key
1828 {
1829 int vgrf;
1830 int reg_offset;
1831 };
1832
1833 static bool
1834 dead_code_hash_compare(const void *a, const void *b)
1835 {
1836 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1837 }
1838
1839 static void
1840 clear_dead_code_hash(struct hash_table *ht)
1841 {
1842 struct hash_entry *entry;
1843
1844 hash_table_foreach(ht, entry) {
1845 _mesa_hash_table_remove(ht, entry);
1846 }
1847 }
1848
1849 static void
1850 insert_dead_code_hash(struct hash_table *ht,
1851 int vgrf, int reg_offset, fs_inst *inst)
1852 {
1853    /* We don't bother freeing keys: they're ralloc'd off the ht, so they'll be freed along with it. */
1854 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1855
1856 key->vgrf = vgrf;
1857 key->reg_offset = reg_offset;
1858
1859 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1860 }
1861
1862 static struct hash_entry *
1863 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1864 {
1865 struct dead_code_hash_key key;
1866
1867 key.vgrf = vgrf;
1868 key.reg_offset = reg_offset;
1869
1870 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1871 }
1872
1873 static void
1874 remove_dead_code_hash(struct hash_table *ht,
1875 int vgrf, int reg_offset)
1876 {
1877 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1878 if (!entry)
1879 return;
1880
1881 _mesa_hash_table_remove(ht, entry);
1882 }
1883
1884 /**
1885 * Walks basic blocks, removing any regs that are written but not read before
1886 * being redefined.
1887 *
1888 * The dead_code_eliminate() function implements a global dead code
1889  * elimination, but it only handles removing the last write to a register
1890 * if it's never read. This one can handle intermediate writes, but only
1891 * within a basic block.
1892 */
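/* For example (illustrative), within one basic block:
 *
 *    MOV vgrf4, a    <- removed by this pass: overwritten below before being read
 *    MOV vgrf4, b
 *    ADD vgrf5, vgrf4, c
 */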
1893 bool
1894 fs_visitor::dead_code_eliminate_local()
1895 {
1896 struct hash_table *ht;
1897 bool progress = false;
1898
1899 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1900
1901 foreach_list_safe(node, &this->instructions) {
1902 fs_inst *inst = (fs_inst *)node;
1903
1904       /* At a basic block boundary, empty the HT, since we don't track
1905        * dataflow across blocks.
1906 */
1907 if (inst->is_control_flow()) {
1908 clear_dead_code_hash(ht);
1909 continue;
1910 }
1911
1912       /* Remove from the HT any pending writes whose registers got read. */
1913 for (int i = 0; i < 3; i++) {
1914 fs_reg src = inst->src[i];
1915 if (src.file != GRF)
1916 continue;
1917
1918 int read = 1;
1919 if (inst->is_send_from_grf())
1920 read = virtual_grf_sizes[src.reg] - src.reg_offset;
1921
1922 for (int reg_offset = src.reg_offset;
1923 reg_offset < src.reg_offset + read;
1924 reg_offset++) {
1925 remove_dead_code_hash(ht, src.reg, reg_offset);
1926 }
1927 }
1928
1929 /* Add any update of a GRF to the HT, removing a previous write if it
1930 * wasn't read.
1931 */
1932 if (inst->dst.file == GRF) {
1933 if (inst->regs_written > 1) {
1934 /* We don't know how to trim channels from an instruction's
1935 * writes, so we can't incrementally remove unread channels from
1936              * it.  Just remove whatever it overwrites from the table.
1937 */
1938 for (int i = 0; i < inst->regs_written; i++) {
1939 remove_dead_code_hash(ht,
1940 inst->dst.reg,
1941 inst->dst.reg_offset + i);
1942 }
1943 } else {
1944 struct hash_entry *entry =
1945 get_dead_code_hash_entry(ht, inst->dst.reg,
1946 inst->dst.reg_offset);
1947
1948 if (inst->is_partial_write()) {
1949 /* For a partial write, we can't remove any previous dead code
1950              * candidate, since we're just modifying its result, but we can
1951              * be dead code eliminated ourselves.
1952 */
1953 if (entry) {
1954 entry->data = inst;
1955 } else {
1956 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1957 inst);
1958 }
1959 } else {
1960 if (entry) {
1961 /* We're completely updating a channel, and there was a
1962 * previous write to the channel that wasn't read. Kill it!
1963 */
1964 fs_inst *inst = (fs_inst *)entry->data;
1965 inst->remove();
1966 progress = true;
1967 _mesa_hash_table_remove(ht, entry);
1968 }
1969
1970 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1971 inst);
1972 }
1973 }
1974 }
1975 }
1976
1977 _mesa_hash_table_destroy(ht, NULL);
1978
1979 if (progress)
1980 live_intervals_valid = false;
1981
1982 return progress;
1983 }
1984
1985 /**
1986 * Implements a second type of register coalescing: This one checks if
1987 * the two regs involved in a raw move don't interfere, in which case
1988  * they can both be stored in the same place and the MOV removed.
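 *
 * Illustrative example (vgrf numbers are arbitrary): given
 *
 *    mov vgrf2, vgrf1
 *
 * where vgrf1 and vgrf2 have non-overlapping live ranges, every write and
 * read of vgrf1 is redirected to vgrf2 and the MOV is deleted.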
1989 */
1990 bool
1991 fs_visitor::register_coalesce_2()
1992 {
1993 bool progress = false;
1994
1995 calculate_live_intervals();
1996
1997 foreach_list_safe(node, &this->instructions) {
1998 fs_inst *inst = (fs_inst *)node;
1999
2000 if (inst->opcode != BRW_OPCODE_MOV ||
2001 inst->predicate ||
2002 inst->saturate ||
2003 inst->src[0].file != GRF ||
2004 inst->src[0].negate ||
2005 inst->src[0].abs ||
2006 inst->src[0].smear != -1 ||
2007 inst->dst.file != GRF ||
2008 inst->dst.type != inst->src[0].type ||
2009 virtual_grf_sizes[inst->src[0].reg] != 1 ||
2010 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
2011 continue;
2012 }
2013
2014 int reg_from = inst->src[0].reg;
2015 assert(inst->src[0].reg_offset == 0);
2016 int reg_to = inst->dst.reg;
2017 int reg_to_offset = inst->dst.reg_offset;
2018
2019 foreach_list(node, &this->instructions) {
2020 fs_inst *scan_inst = (fs_inst *)node;
2021
2022 if (scan_inst->dst.file == GRF &&
2023 scan_inst->dst.reg == reg_from) {
2024 scan_inst->dst.reg = reg_to;
2025 scan_inst->dst.reg_offset = reg_to_offset;
2026 }
2027 for (int i = 0; i < 3; i++) {
2028 if (scan_inst->src[i].file == GRF &&
2029 scan_inst->src[i].reg == reg_from) {
2030 scan_inst->src[i].reg = reg_to;
2031 scan_inst->src[i].reg_offset = reg_to_offset;
2032 }
2033 }
2034 }
2035
2036 inst->remove();
2037
2038 /* We don't need to recalculate live intervals inside the loop despite
2039 * flagging live_intervals_valid because we only use live intervals for
2040 * the interferes test, and we must have had a situation where the
2041 * intervals were:
2042 *
2043        *  from      to
2044        *   ^
2045        *   |
2046        *   v
2047        *             ^
2048        *             |
2049        *             v
2050 *
2051 * Some register R that might get coalesced with one of these two could
2052 * only be referencing "to", otherwise "from"'s range would have been
2053 * longer. R's range could also only start at the end of "to" or later,
2054 * otherwise it will conflict with "to" when we try to coalesce "to"
2055        * into R anyway.
2056 */
2057 live_intervals_valid = false;
2058
2059 progress = true;
2060 continue;
2061 }
2062
2063 return progress;
2064 }
2065
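/**
 * Attempts to remove a raw MOV by making its later readers read the MOV's
 * source directly.
 *
 * Illustrative example (vgrf numbers are arbitrary):
 *
 *    mov vgrf2, vgrf1
 *    add vgrf3, vgrf2, vgrf4
 *
 * becomes
 *
 *    add vgrf3, vgrf1, vgrf4
 *
 * provided neither register is overwritten before the end of the program and
 * the MOV is not inside an if block or loop.
 */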
2066 bool
2067 fs_visitor::register_coalesce()
2068 {
2069 bool progress = false;
2070 int if_depth = 0;
2071 int loop_depth = 0;
2072
2073 foreach_list_safe(node, &this->instructions) {
2074 fs_inst *inst = (fs_inst *)node;
2075
2076       /* Make sure that we dominate the instructions we're going to
2077        * scan for interference; otherwise we won't have scanned enough
2078        * of the program to know whether anything interferes with our
2079        * coalescing.  We don't dominate the following instructions if
2080        * we're inside a loop or an if block.
2081        */
2082 switch (inst->opcode) {
2083 case BRW_OPCODE_DO:
2084 loop_depth++;
2085 break;
2086 case BRW_OPCODE_WHILE:
2087 loop_depth--;
2088 break;
2089 case BRW_OPCODE_IF:
2090 if_depth++;
2091 break;
2092 case BRW_OPCODE_ENDIF:
2093 if_depth--;
2094 break;
2095 default:
2096 break;
2097 }
2098 if (loop_depth || if_depth)
2099 continue;
2100
2101 if (inst->opcode != BRW_OPCODE_MOV ||
2102 inst->predicate ||
2103 inst->saturate ||
2104 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2105           inst->dst.file != GRF || (inst->src[0].file != GRF &&
                                         inst->src[0].file != UNIFORM) ||
2106 inst->dst.type != inst->src[0].type)
2107 continue;
2108
2109 bool has_source_modifiers = (inst->src[0].abs ||
2110 inst->src[0].negate ||
2111 inst->src[0].smear != -1 ||
2112 inst->src[0].file == UNIFORM);
2113
2114 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2115 * them: check for no writes to either one until the exit of the
2116 * program.
2117 */
2118 bool interfered = false;
2119
2120 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2121 !scan_inst->is_tail_sentinel();
2122 scan_inst = (fs_inst *)scan_inst->next) {
2123 if (scan_inst->dst.file == GRF) {
2124 if (scan_inst->overwrites_reg(inst->dst) ||
2125 scan_inst->overwrites_reg(inst->src[0])) {
2126 interfered = true;
2127 break;
2128 }
2129 }
2130
2131 /* The gen6 MATH instruction can't handle source modifiers or
2132 * unusual register regions, so avoid coalescing those for
2133 * now. We should do something more specific.
2134 */
2135 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2136 interfered = true;
2137 break;
2138 }
2139
2140 /* The accumulator result appears to get used for the
2141 * conditional modifier generation. When negating a UD
2142 * value, there is a 33rd bit generated for the sign in the
2143 * accumulator value, so now you can't check, for example,
2144 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2145 */
2146 if (scan_inst->conditional_mod &&
2147 inst->src[0].negate &&
2148 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2149 interfered = true;
2150 break;
2151 }
2152 }
2153 if (interfered) {
2154 continue;
2155 }
2156
2157 /* Rewrite the later usage to point at the source of the move to
2158 * be removed.
2159 */
2160 for (fs_inst *scan_inst = inst;
2161 !scan_inst->is_tail_sentinel();
2162 scan_inst = (fs_inst *)scan_inst->next) {
2163 for (int i = 0; i < 3; i++) {
2164 if (scan_inst->src[i].file == GRF &&
2165 scan_inst->src[i].reg == inst->dst.reg &&
2166 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2167 fs_reg new_src = inst->src[0];
2168 if (scan_inst->src[i].abs) {
2169 new_src.negate = 0;
2170 new_src.abs = 1;
2171 }
2172 new_src.negate ^= scan_inst->src[i].negate;
2173 scan_inst->src[i] = new_src;
2174 }
2175 }
2176 }
2177
2178 inst->remove();
2179 progress = true;
2180 }
2181
2182 if (progress)
2183 live_intervals_valid = false;
2184
2185 return progress;
2186 }
2187
2188
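/**
 * Attempts to turn a MOV from a GRF into an MRF into a direct
 * compute-to-MRF: the instruction that produced the GRF value is rewritten
 * to write the MRF itself and the MOV is removed.
 *
 * Illustrative example (register numbers are arbitrary):
 *
 *    add vgrf3, vgrf1, vgrf2
 *    mov m4, vgrf3
 *
 * becomes
 *
 *    add m4, vgrf1, vgrf2
 *
 * This only happens when the GRF is not read again afterwards and nothing
 * between the two instructions interferes with the rewrite.
 */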
2189 bool
2190 fs_visitor::compute_to_mrf()
2191 {
2192 bool progress = false;
2193 int next_ip = 0;
2194
2195 calculate_live_intervals();
2196
2197 foreach_list_safe(node, &this->instructions) {
2198 fs_inst *inst = (fs_inst *)node;
2199
2200 int ip = next_ip;
2201 next_ip++;
2202
2203 if (inst->opcode != BRW_OPCODE_MOV ||
2204 inst->predicate ||
2205 inst->dst.file != MRF || inst->src[0].file != GRF ||
2206 inst->dst.type != inst->src[0].type ||
2207 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2208 continue;
2209
2210 /* Work out which hardware MRF registers are written by this
2211 * instruction.
2212 */
2213 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2214 int mrf_high;
2215 if (inst->dst.reg & BRW_MRF_COMPR4) {
2216 mrf_high = mrf_low + 4;
2217 } else if (dispatch_width == 16 &&
2218 (!inst->force_uncompressed && !inst->force_sechalf)) {
2219 mrf_high = mrf_low + 1;
2220 } else {
2221 mrf_high = mrf_low;
2222 }
2223
2224 /* Can't compute-to-MRF this GRF if someone else was going to
2225 * read it later.
2226 */
2227 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2228 continue;
2229
2230 /* Found a move of a GRF to a MRF. Let's see if we can go
2231 * rewrite the thing that made this GRF to write into the MRF.
2232 */
2233 fs_inst *scan_inst;
2234 for (scan_inst = (fs_inst *)inst->prev;
2235 scan_inst->prev != NULL;
2236 scan_inst = (fs_inst *)scan_inst->prev) {
2237 if (scan_inst->dst.file == GRF &&
2238 scan_inst->dst.reg == inst->src[0].reg) {
2239 /* Found the last thing to write our reg we want to turn
2240 * into a compute-to-MRF.
2241 */
2242
2243 /* If this one instruction didn't populate all the
2244 * channels, bail. We might be able to rewrite everything
2245 * that writes that reg, but it would require smarter
2246 * tracking to delay the rewriting until complete success.
2247 */
2248 if (scan_inst->is_partial_write())
2249 break;
2250
2251 /* Things returning more than one register would need us to
2252 * understand coalescing out more than one MOV at a time.
2253 */
2254 if (scan_inst->regs_written > 1)
2255 break;
2256
2257 /* SEND instructions can't have MRF as a destination. */
2258 if (scan_inst->mlen)
2259 break;
2260
2261 if (intel->gen == 6) {
2262 /* gen6 math instructions must have the destination be
2263 * GRF, so no compute-to-MRF for them.
2264 */
2265 if (scan_inst->is_math()) {
2266 break;
2267 }
2268 }
2269
2270 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2271 /* Found the creator of our MRF's source value. */
2272 scan_inst->dst.file = MRF;
2273 scan_inst->dst.reg = inst->dst.reg;
2274 scan_inst->saturate |= inst->saturate;
2275 inst->remove();
2276 progress = true;
2277 }
2278 break;
2279 }
2280
2281 /* We don't handle control flow here. Most computation of
2282           * values that end up in MRFs happens shortly before the MRF
2283 * write anyway.
2284 */
2285 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2286 break;
2287
2288 /* You can't read from an MRF, so if someone else reads our
2289 * MRF's source GRF that we wanted to rewrite, that stops us.
2290 */
2291 bool interfered = false;
2292 for (int i = 0; i < 3; i++) {
2293 if (scan_inst->src[i].file == GRF &&
2294 scan_inst->src[i].reg == inst->src[0].reg &&
2295 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2296 interfered = true;
2297 }
2298 }
2299 if (interfered)
2300 break;
2301
2302 if (scan_inst->dst.file == MRF) {
2303 /* If somebody else writes our MRF here, we can't
2304 * compute-to-MRF before that.
2305 */
2306 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2307 int scan_mrf_high;
2308
2309 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2310 scan_mrf_high = scan_mrf_low + 4;
2311 } else if (dispatch_width == 16 &&
2312 (!scan_inst->force_uncompressed &&
2313 !scan_inst->force_sechalf)) {
2314 scan_mrf_high = scan_mrf_low + 1;
2315 } else {
2316 scan_mrf_high = scan_mrf_low;
2317 }
2318
2319 if (mrf_low == scan_mrf_low ||
2320 mrf_low == scan_mrf_high ||
2321 mrf_high == scan_mrf_low ||
2322 mrf_high == scan_mrf_high) {
2323 break;
2324 }
2325 }
2326
2327 if (scan_inst->mlen > 0) {
2328 /* Found a SEND instruction, which means that there are
2329 * live values in MRFs from base_mrf to base_mrf +
2330 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2331 * above it.
2332 */
2333 if (mrf_low >= scan_inst->base_mrf &&
2334 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2335 break;
2336 }
2337 if (mrf_high >= scan_inst->base_mrf &&
2338 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2339 break;
2340 }
2341 }
2342 }
2343 }
2344
2345 if (progress)
2346 live_intervals_valid = false;
2347
2348 return progress;
2349 }
2350
2351 /**
2352 * Walks through basic blocks, looking for repeated MRF writes and
2353 * removing the later ones.
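 *
 * Illustrative example (register numbers are arbitrary): two identical
 * "mov m2, vgrf5" instructions with no intervening write to m2 or vgrf5
 * leave the second one redundant, so it is removed.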
2354 */
2355 bool
2356 fs_visitor::remove_duplicate_mrf_writes()
2357 {
2358 fs_inst *last_mrf_move[16];
2359 bool progress = false;
2360
2361    /* We would need to update the MRF tracking for compressed (16-wide) instructions, so skip this pass for now. */
2362 if (dispatch_width == 16)
2363 return false;
2364
2365 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2366
2367 foreach_list_safe(node, &this->instructions) {
2368 fs_inst *inst = (fs_inst *)node;
2369
2370 if (inst->is_control_flow()) {
2371 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2372 }
2373
2374 if (inst->opcode == BRW_OPCODE_MOV &&
2375 inst->dst.file == MRF) {
2376 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2377 if (prev_inst && inst->equals(prev_inst)) {
2378 inst->remove();
2379 progress = true;
2380 continue;
2381 }
2382 }
2383
2384 /* Clear out the last-write records for MRFs that were overwritten. */
2385 if (inst->dst.file == MRF) {
2386 last_mrf_move[inst->dst.reg] = NULL;
2387 }
2388
2389 if (inst->mlen > 0) {
2390 /* Found a SEND instruction, which will include two or fewer
2391 * implied MRF writes. We could do better here.
2392 */
2393 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2394 last_mrf_move[inst->base_mrf + i] = NULL;
2395 }
2396 }
2397
2398 /* Clear out any MRF move records whose sources got overwritten. */
2399 if (inst->dst.file == GRF) {
2400 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2401 if (last_mrf_move[i] &&
2402 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2403 last_mrf_move[i] = NULL;
2404 }
2405 }
2406 }
2407
2408 if (inst->opcode == BRW_OPCODE_MOV &&
2409 inst->dst.file == MRF &&
2410 inst->src[0].file == GRF &&
2411 !inst->predicate) {
2412 last_mrf_move[inst->dst.reg] = inst;
2413 }
2414 }
2415
2416 if (progress)
2417 live_intervals_valid = false;
2418
2419 return progress;
2420 }
2421
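/**
 * Helper for the gen4 SEND dependency workarounds below: clears the
 * needs-dependency flag for any register in [first_grf, first_grf + grf_len)
 * that this instruction reads, since a read already resolves the hazard.
 */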
2422 static void
2423 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2424 int first_grf, int grf_len)
2425 {
2426 bool inst_16wide = (dispatch_width > 8 &&
2427 !inst->force_uncompressed &&
2428 !inst->force_sechalf);
2429
2430 /* Clear the flag for registers that actually got read (as expected). */
2431 for (int i = 0; i < 3; i++) {
2432 int grf;
2433 if (inst->src[i].file == GRF) {
2434 grf = inst->src[i].reg;
2435 } else if (inst->src[i].file == FIXED_HW_REG &&
2436 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2437 grf = inst->src[i].fixed_hw_reg.nr;
2438 } else {
2439 continue;
2440 }
2441
2442 if (grf >= first_grf &&
2443 grf < first_grf + grf_len) {
2444 deps[grf - first_grf] = false;
2445 if (inst_16wide)
2446 deps[grf - first_grf + 1] = false;
2447 }
2448 }
2449 }
2450
2451 /**
2452 * Implements this workaround for the original 965:
2453 *
2454 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2455 * check for post destination dependencies on this instruction, software
2456 * must ensure that there is no destination hazard for the case of ‘write
2457 * followed by a posted write’ shown in the following example.
2458 *
2459 * 1. mov r3 0
2460 * 2. send r3.xy <rest of send instruction>
2461 * 3. mov r2 r3
2462 *
2463 * Due to no post-destination dependency check on the ‘send’, the above
2464 * code sequence could have two instructions (1 and 2) in flight at the
2465 * same time that both consider ‘r3’ as the target of their final writes.
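 *
 * The workaround implemented below walks backwards from the SEND and, for
 * each destination register with such an outstanding unread write, inserts
 * a dependency-resolving MOV that reads the register just before the SEND,
 * forcing the earlier write to complete first.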
2466 */
2467 void
2468 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2469 {
2470 int reg_size = dispatch_width / 8;
2471 int write_len = inst->regs_written * reg_size;
2472 int first_write_grf = inst->dst.reg;
2473 bool needs_dep[BRW_MAX_MRF];
2474 assert(write_len < (int)sizeof(needs_dep) - 1);
2475
2476 memset(needs_dep, false, sizeof(needs_dep));
2477 memset(needs_dep, true, write_len);
2478
2479 clear_deps_for_inst_src(inst, dispatch_width,
2480 needs_dep, first_write_grf, write_len);
2481
2482 /* Walk backwards looking for writes to registers we're writing which
2483 * aren't read since being written. If we hit the start of the program,
2484 * we assume that there are no outstanding dependencies on entry to the
2485 * program.
2486 */
2487 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2488 scan_inst != NULL;
2489 scan_inst = (fs_inst *)scan_inst->prev) {
2490
2491 /* If we hit control flow, assume that there *are* outstanding
2492 * dependencies, and force their cleanup before our instruction.
2493 */
2494 if (scan_inst->is_control_flow()) {
2495 for (int i = 0; i < write_len; i++) {
2496 if (needs_dep[i]) {
2497 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2498 }
2499 }
2500 return;
2501 }
2502
2503 bool scan_inst_16wide = (dispatch_width > 8 &&
2504 !scan_inst->force_uncompressed &&
2505 !scan_inst->force_sechalf);
2506
2507 /* We insert our reads as late as possible on the assumption that any
2508 * instruction but a MOV that might have left us an outstanding
2509 * dependency has more latency than a MOV.
2510 */
2511 if (scan_inst->dst.file == GRF) {
2512 for (int i = 0; i < scan_inst->regs_written; i++) {
2513 int reg = scan_inst->dst.reg + i * reg_size;
2514
2515 if (reg >= first_write_grf &&
2516 reg < first_write_grf + write_len &&
2517 needs_dep[reg - first_write_grf]) {
2518 inst->insert_before(DEP_RESOLVE_MOV(reg));
2519 needs_dep[reg - first_write_grf] = false;
2520 if (scan_inst_16wide)
2521 needs_dep[reg - first_write_grf + 1] = false;
2522 }
2523 }
2524 }
2525
2526 /* Clear the flag for registers that actually got read (as expected). */
2527 clear_deps_for_inst_src(scan_inst, dispatch_width,
2528 needs_dep, first_write_grf, write_len);
2529
2530 /* Continue the loop only if we haven't resolved all the dependencies */
2531 int i;
2532 for (i = 0; i < write_len; i++) {
2533 if (needs_dep[i])
2534 break;
2535 }
2536 if (i == write_len)
2537 return;
2538 }
2539 }
2540
2541 /**
2542 * Implements this workaround for the original 965:
2543 *
2544 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2545 * used as a destination register until after it has been sourced by an
2546 * instruction with a different destination register.
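 *
 * The workaround implemented below walks forwards from the SEND and inserts
 * a dependency-resolving MOV that sources the SEND's destination register
 * before any instruction that would overwrite it without having read it.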
2547 */
2548 void
2549 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2550 {
2551 int write_len = inst->regs_written * dispatch_width / 8;
2552 int first_write_grf = inst->dst.reg;
2553 bool needs_dep[BRW_MAX_MRF];
2554 assert(write_len < (int)sizeof(needs_dep) - 1);
2555
2556 memset(needs_dep, false, sizeof(needs_dep));
2557 memset(needs_dep, true, write_len);
2558 /* Walk forwards looking for writes to registers we're writing which aren't
2559 * read before being written.
2560 */
2561 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2562 !scan_inst->is_tail_sentinel();
2563 scan_inst = (fs_inst *)scan_inst->next) {
2564 /* If we hit control flow, force resolve all remaining dependencies. */
2565 if (scan_inst->is_control_flow()) {
2566 for (int i = 0; i < write_len; i++) {
2567 if (needs_dep[i])
2568 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2569 }
2570 return;
2571 }
2572
2573 /* Clear the flag for registers that actually got read (as expected). */
2574 clear_deps_for_inst_src(scan_inst, dispatch_width,
2575 needs_dep, first_write_grf, write_len);
2576
2577 /* We insert our reads as late as possible since they're reading the
2578 * result of a SEND, which has massive latency.
2579 */
2580 if (scan_inst->dst.file == GRF &&
2581 scan_inst->dst.reg >= first_write_grf &&
2582 scan_inst->dst.reg < first_write_grf + write_len &&
2583 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2584 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2585 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2586 }
2587
2588 /* Continue the loop only if we haven't resolved all the dependencies */
2589 int i;
2590 for (i = 0; i < write_len; i++) {
2591 if (needs_dep[i])
2592 break;
2593 }
2594 if (i == write_len)
2595 return;
2596 }
2597
2598 /* If we hit the end of the program, resolve all remaining dependencies out
2599 * of paranoia.
2600 */
2601 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2602 assert(last_inst->eot);
2603 for (int i = 0; i < write_len; i++) {
2604 if (needs_dep[i])
2605 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2606 }
2607 }
2608
2609 void
2610 fs_visitor::insert_gen4_send_dependency_workarounds()
2611 {
2612 if (intel->gen != 4 || intel->is_g4x)
2613 return;
2614
2615 /* Note that we're done with register allocation, so GRF fs_regs always
2616 * have a .reg_offset of 0.
2617 */
2618
2619 foreach_list_safe(node, &this->instructions) {
2620 fs_inst *inst = (fs_inst *)node;
2621
2622 if (inst->mlen != 0 && inst->dst.file == GRF) {
2623 insert_gen4_pre_send_dependency_workarounds(inst);
2624 insert_gen4_post_send_dependency_workarounds(inst);
2625 }
2626 }
2627 }
2628
2629 /**
2630 * Turns the generic expression-style uniform pull constant load instruction
2631 * into a hardware-specific series of instructions for loading a pull
2632 * constant.
2633 *
2634 * The expression style allows the CSE pass before this to optimize out
2635 * repeated loads from the same offset, and gives the pre-register-allocation
2636 * scheduling full flexibility, while the conversion to native instructions
2637 * allows the post-register-allocation scheduler the best information
2638 * possible.
2639 *
2640 * Note that execution masking for setting up pull constant loads is special:
2641 * the channels that need to be written are unrelated to the current execution
2642 * mask, since a later instruction will use one of the result channels as a
2643 * source operand for all 8 or 16 of its channels.
2644 */
2645 void
2646 fs_visitor::lower_uniform_pull_constant_loads()
2647 {
2648 foreach_list(node, &this->instructions) {
2649 fs_inst *inst = (fs_inst *)node;
2650
2651 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2652 continue;
2653
2654 if (intel->gen >= 7) {
2655 /* The offset arg before was a vec4-aligned byte offset. We need to
2656 * turn it into a dword offset.
2657 */
2658 fs_reg const_offset_reg = inst->src[1];
2659 assert(const_offset_reg.file == IMM &&
2660 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2661 const_offset_reg.imm.u /= 4;
2662 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2663
2664 /* This is actually going to be a MOV, but since only the first dword
2665 * is accessed, we have a special opcode to do just that one. Note
2666 * that this needs to be an operation that will be considered a def
2667 * by live variable analysis, or register allocation will explode.
2668 */
2669 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2670 payload, const_offset_reg);
2671 setup->force_writemask_all = true;
2672
2673 setup->ir = inst->ir;
2674 setup->annotation = inst->annotation;
2675 inst->insert_before(setup);
2676
2677 /* Similarly, this will only populate the first 4 channels of the
2678 * result register (since we only use smear values from 0-3), but we
2679 * don't tell the optimizer.
2680 */
2681 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2682 inst->src[1] = payload;
2683
2684 this->live_intervals_valid = false;
2685 } else {
2686 /* Before register allocation, we didn't tell the scheduler about the
2687 * MRF we use. We know it's safe to use this MRF because nothing
2688 * else does except for register spill/unspill, which generates and
2689 * uses its MRF within a single IR instruction.
2690 */
2691 inst->base_mrf = 14;
2692 inst->mlen = 1;
2693 }
2694 }
2695 }
2696
2697 void
2698 fs_visitor::dump_instruction(fs_inst *inst)
2699 {
2700 if (inst->predicate) {
2701 printf("(%cf0.%d) ",
2702 inst->predicate_inverse ? '-' : '+',
2703 inst->flag_subreg);
2704 }
2705
2706 printf("%s", brw_instruction_name(inst->opcode));
2707 if (inst->saturate)
2708 printf(".sat");
2709 if (inst->conditional_mod) {
2710 printf(".cmod");
2711 if (!inst->predicate &&
2712 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2713 inst->opcode != BRW_OPCODE_IF &&
2714 inst->opcode != BRW_OPCODE_WHILE))) {
2715 printf(".f0.%d\n", inst->flag_subreg);
2716 }
2717 }
2718 printf(" ");
2719
2720
2721 switch (inst->dst.file) {
2722 case GRF:
2723 printf("vgrf%d", inst->dst.reg);
2724 if (inst->dst.reg_offset)
2725 printf("+%d", inst->dst.reg_offset);
2726 break;
2727 case MRF:
2728 printf("m%d", inst->dst.reg);
2729 break;
2730 case BAD_FILE:
2731 printf("(null)");
2732 break;
2733 case UNIFORM:
2734 printf("***u%d***", inst->dst.reg);
2735 break;
2736 default:
2737 printf("???");
2738 break;
2739 }
2740 printf(", ");
2741
2742 for (int i = 0; i < 3; i++) {
2743 if (inst->src[i].negate)
2744 printf("-");
2745 if (inst->src[i].abs)
2746 printf("|");
2747 switch (inst->src[i].file) {
2748 case GRF:
2749 printf("vgrf%d", inst->src[i].reg);
2750 if (inst->src[i].reg_offset)
2751 printf("+%d", inst->src[i].reg_offset);
2752 break;
2753 case MRF:
2754 printf("***m%d***", inst->src[i].reg);
2755 break;
2756 case UNIFORM:
2757 printf("u%d", inst->src[i].reg);
2758 if (inst->src[i].reg_offset)
2759 printf(".%d", inst->src[i].reg_offset);
2760 break;
2761 case BAD_FILE:
2762 printf("(null)");
2763 break;
2764 case IMM:
2765 switch (inst->src[i].type) {
2766 case BRW_REGISTER_TYPE_F:
2767 printf("%ff", inst->src[i].imm.f);
2768 break;
2769 case BRW_REGISTER_TYPE_D:
2770 printf("%dd", inst->src[i].imm.i);
2771 break;
2772 case BRW_REGISTER_TYPE_UD:
2773 printf("%uu", inst->src[i].imm.u);
2774 break;
2775 default:
2776 printf("???");
2777 break;
2778 }
2779 break;
2780 default:
2781 printf("???");
2782 break;
2783 }
2784 if (inst->src[i].abs)
2785 printf("|");
2786
2787 if (i < 3)
2788 printf(", ");
2789 }
2790
2791 printf(" ");
2792
2793 if (inst->force_uncompressed)
2794 printf("1sthalf ");
2795
2796 if (inst->force_sechalf)
2797 printf("2ndhalf ");
2798
2799 printf("\n");
2800 }
2801
2802 void
2803 fs_visitor::dump_instructions()
2804 {
2805 int ip = 0;
2806 foreach_list(node, &this->instructions) {
2807 fs_inst *inst = (fs_inst *)node;
2808 printf("%d: ", ip++);
2809 dump_instruction(inst);
2810 }
2811 }
2812
2813 /**
2814 * Possibly returns an instruction that set up @param reg.
2815 *
2816 * Sometimes we want to take the result of some expression/variable
2817 * dereference tree and rewrite the instruction generating the result
2818 * of the tree. When processing the tree, we know that the
2819 * instructions generated are all writing temporaries that are dead
2820 * outside of this tree. So, if we have some instructions that write
2821 * a temporary, we're free to point that temp write somewhere else.
2822 *
2823  * Note that this doesn't guarantee that the returned instruction wrote
2824  * only reg -- it might be the size=4 destination of a texture instruction.
2825 */
2826 fs_inst *
2827 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2828 fs_inst *end,
2829 fs_reg reg)
2830 {
2831 if (end == start ||
2832 end->is_partial_write() ||
2833 reg.reladdr ||
2834 !reg.equals(end->dst)) {
2835 return NULL;
2836 } else {
2837 return end;
2838 }
2839 }
2840
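/**
 * Lays out the gen6+ thread payload: masks and pixel X/Y in R0-1, then one
 * set of barycentric coordinates per enabled interpolation mode, then
 * interpolated depth and W when the shader reads gl_FragCoord.
 */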
2841 void
2842 fs_visitor::setup_payload_gen6()
2843 {
2844 struct intel_context *intel = &brw->intel;
2845 bool uses_depth =
2846 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2847 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2848
2849 assert(intel->gen >= 6);
2850
2851 /* R0-1: masks, pixel X/Y coordinates. */
2852 c->nr_payload_regs = 2;
2853    /* R2: only for 32-pixel dispatch. */
2854
2855 /* R3-26: barycentric interpolation coordinates. These appear in the
2856 * same order that they appear in the brw_wm_barycentric_interp_mode
2857 * enum. Each set of coordinates occupies 2 registers if dispatch width
2858 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2859 * appear if they were enabled using the "Barycentric Interpolation
2860 * Mode" bits in WM_STATE.
2861 */
2862 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2863 if (barycentric_interp_modes & (1 << i)) {
2864 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2865 c->nr_payload_regs += 2;
2866 if (dispatch_width == 16) {
2867 c->nr_payload_regs += 2;
2868 }
2869 }
2870 }
2871
2872 /* R27: interpolated depth if uses source depth */
2873 if (uses_depth) {
2874 c->source_depth_reg = c->nr_payload_regs;
2875 c->nr_payload_regs++;
2876 if (dispatch_width == 16) {
2877 /* R28: interpolated depth if not 8-wide. */
2878 c->nr_payload_regs++;
2879 }
2880 }
2881 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2882 if (uses_depth) {
2883 c->source_w_reg = c->nr_payload_regs;
2884 c->nr_payload_regs++;
2885 if (dispatch_width == 16) {
2886 /* R30: interpolated W if not 8-wide. */
2887 c->nr_payload_regs++;
2888 }
2889 }
2890 /* R31: MSAA position offsets. */
2891 /* R32-: bary for 32-pixel. */
2892 /* R58-59: interp W for 32-pixel. */
2893
2894 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2895 c->source_depth_to_render_target = true;
2896 }
2897 }
2898
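/**
 * Drives the compile for a single dispatch width: sets up the payload,
 * generates FS IR from the GLSL IR or ARB fragment program, runs the
 * optimization loop, schedules instructions, and allocates registers.
 * Returns false on failure.
 */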
2899 bool
2900 fs_visitor::run()
2901 {
2902 sanity_param_count = fp->Base.Parameters->NumParameters;
2903 uint32_t orig_nr_params = c->prog_data.nr_params;
2904
2905 if (intel->gen >= 6)
2906 setup_payload_gen6();
2907 else
2908 setup_payload_gen4();
2909
2910 if (0) {
2911 emit_dummy_fs();
2912 } else {
2913 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2914 emit_shader_time_begin();
2915
2916 calculate_urb_setup();
2917 if (intel->gen < 6)
2918 emit_interpolation_setup_gen4();
2919 else
2920 emit_interpolation_setup_gen6();
2921
2922 /* We handle discards by keeping track of the still-live pixels in f0.1.
2923 * Initialize it with the dispatched pixels.
2924 */
2925 if (fp->UsesKill) {
2926 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2927 discard_init->flag_subreg = 1;
2928 }
2929
2930 /* Generate FS IR for main(). (the visitor only descends into
2931 * functions called "main").
2932 */
2933 if (shader) {
2934 foreach_list(node, &*shader->ir) {
2935 ir_instruction *ir = (ir_instruction *)node;
2936 base_ir = ir;
2937 this->result = reg_undef;
2938 ir->accept(this);
2939 }
2940 } else {
2941 emit_fragment_program_code();
2942 }
2943 base_ir = NULL;
2944 if (failed)
2945 return false;
2946
2947 emit(FS_OPCODE_PLACEHOLDER_HALT);
2948
2949 emit_fb_writes();
2950
2951 split_virtual_grfs();
2952
2953 move_uniform_array_access_to_pull_constants();
2954 setup_pull_constants();
2955
2956 bool progress;
2957 do {
2958 progress = false;
2959
2960 compact_virtual_grfs();
2961
2962 progress = remove_duplicate_mrf_writes() || progress;
2963
2964 progress = opt_algebraic() || progress;
2965 progress = opt_cse() || progress;
2966 progress = opt_copy_propagate() || progress;
2967 progress = dead_code_eliminate() || progress;
2968 progress = dead_code_eliminate_local() || progress;
2969 progress = register_coalesce() || progress;
2970 progress = register_coalesce_2() || progress;
2971 progress = compute_to_mrf() || progress;
2972 } while (progress);
2973
2974 remove_dead_constants();
2975
2976 schedule_instructions(false);
2977
2978 lower_uniform_pull_constant_loads();
2979
2980 assign_curb_setup();
2981 assign_urb_setup();
2982
2983 if (0) {
2984 /* Debug of register spilling: Go spill everything. */
2985 for (int i = 0; i < virtual_grf_count; i++) {
2986 spill_reg(i);
2987 }
2988 }
2989
2990 if (0)
2991 assign_regs_trivial();
2992 else {
2993 while (!assign_regs()) {
2994 if (failed)
2995 break;
2996 }
2997 }
2998 }
2999 assert(force_uncompressed_stack == 0);
3000 assert(force_sechalf_stack == 0);
3001
3002 /* This must come after all optimization and register allocation, since
3003 * it inserts dead code that happens to have side effects, and it does
3004 * so based on the actual physical registers in use.
3005 */
3006 insert_gen4_send_dependency_workarounds();
3007
3008 if (failed)
3009 return false;
3010
3011 schedule_instructions(true);
3012
3013 if (dispatch_width == 8) {
3014 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3015 } else {
3016 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3017
3018 /* Make sure we didn't try to sneak in an extra uniform */
3019 assert(orig_nr_params == c->prog_data.nr_params);
3020 (void) orig_nr_params;
3021 }
3022
3023 /* If any state parameters were appended, then ParameterValues could have
3024 * been realloced, in which case the driver uniform storage set up by
3025 * _mesa_associate_uniform_storage() would point to freed memory. Make
3026 * sure that didn't happen.
3027 */
3028 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3029
3030 return !failed;
3031 }
3032
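/**
 * Compiles the fragment shader at 8-wide, additionally at 16-wide when the
 * hardware and shader permit it, and generates native code for the
 * resulting program(s).
 */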
3033 const unsigned *
3034 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3035 struct gl_fragment_program *fp,
3036 struct gl_shader_program *prog,
3037 unsigned *final_assembly_size)
3038 {
3039 struct intel_context *intel = &brw->intel;
3040 bool start_busy = false;
3041 float start_time = 0;
3042
3043 if (unlikely(intel->perf_debug)) {
3044 start_busy = (intel->batch.last_bo &&
3045 drm_intel_bo_busy(intel->batch.last_bo));
3046 start_time = get_time();
3047 }
3048
3049 struct brw_shader *shader = NULL;
3050 if (prog)
3051 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3052
3053 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3054 if (prog) {
3055 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3056 _mesa_print_ir(shader->ir, NULL);
3057 printf("\n\n");
3058 } else {
3059 printf("ARB_fragment_program %d ir for native fragment shader\n",
3060 fp->Base.Id);
3061 _mesa_print_program(&fp->Base);
3062 }
3063 }
3064
3065 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3066 */
3067 fs_visitor v(brw, c, prog, fp, 8);
3068 if (!v.run()) {
3069 if (prog) {
3070 prog->LinkStatus = false;
3071 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3072 }
3073
3074 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3075 v.fail_msg);
3076
3077 return NULL;
3078 }
3079
3080 exec_list *simd16_instructions = NULL;
3081 fs_visitor v2(brw, c, prog, fp, 16);
3082 bool no16 = INTEL_DEBUG & DEBUG_NO16;
3083 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
3084 v2.import_uniforms(&v);
3085 if (!v2.run()) {
3086 perf_debug("16-wide shader failed to compile, falling back to "
3087 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3088 } else {
3089 simd16_instructions = &v2.instructions;
3090 }
3091 }
3092
3093 c->prog_data.dispatch_width = 8;
3094
3095 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3096 const unsigned *generated = g.generate_assembly(&v.instructions,
3097 simd16_instructions,
3098 final_assembly_size);
3099
3100 if (unlikely(intel->perf_debug) && shader) {
3101 if (shader->compiled_once)
3102 brw_wm_debug_recompile(brw, prog, &c->key);
3103 shader->compiled_once = true;
3104
3105 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
3106 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3107 (get_time() - start_time) * 1000);
3108 }
3109 }
3110
3111 return generated;
3112 }
3113
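/**
 * Precompiles the fragment shader at link time using a guessed program key,
 * so a plausible variant exists before the first draw; the previously
 * compiled program is restored afterwards.
 */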
3114 bool
3115 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3116 {
3117 struct brw_context *brw = brw_context(ctx);
3118 struct intel_context *intel = &brw->intel;
3119 struct brw_wm_prog_key key;
3120
3121 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3122 return true;
3123
3124 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3125 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3126 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3127 bool program_uses_dfdy = fp->UsesDFdy;
3128
3129 memset(&key, 0, sizeof(key));
3130
3131 if (intel->gen < 6) {
3132 if (fp->UsesKill)
3133 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3134
3135 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3136 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3137
3138 /* Just assume depth testing. */
3139 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3140 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3141 }
3142
3143 if (intel->gen < 6)
3144 key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
3145
3146 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
3147 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
3148 continue;
3149
3150 if (intel->gen < 6) {
3151 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
3152 key.input_slots_valid |= BITFIELD64_BIT(i);
3153 }
3154 }
3155
3156 key.clamp_fragment_color = true;
3157
3158 for (int i = 0; i < MAX_SAMPLERS; i++) {
3159 if (fp->Base.ShadowSamplers & (1 << i)) {
3160 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3161 key.tex.swizzles[i] =
3162 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3163 } else {
3164 /* Color sampler: assume no swizzling. */
3165 key.tex.swizzles[i] = SWIZZLE_XYZW;
3166 }
3167 }
3168
3169 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3170 key.drawable_height = ctx->DrawBuffer->Height;
3171 }
3172
3173 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3174 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3175 }
3176
3177 key.nr_color_regions = 1;
3178
3179 key.program_string_id = bfp->id;
3180
3181 uint32_t old_prog_offset = brw->wm.prog_offset;
3182 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3183
3184 bool success = do_wm_prog(brw, prog, bfp, &key);
3185
3186 brw->wm.prog_offset = old_prog_offset;
3187 brw->wm.prog_data = old_prog_data;
3188
3189 return success;
3190 }