glsl: Add image type to the GLSL IR.
src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->opcode = BRW_OPCODE_NOP;
59 this->conditional_mod = BRW_CONDITIONAL_NONE;
60
61 this->dst = reg_undef;
62 this->src[0] = reg_undef;
63 this->src[1] = reg_undef;
64 this->src[2] = reg_undef;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68 }
69
70 fs_inst::fs_inst()
71 {
72 init();
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
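/* Convenience constructors for common ALU opcodes.  Each ALUn(op) macro
 * below defines fs_visitor::op(dst, src...), which only allocates a new
 * fs_inst for BRW_OPCODE_##op out of mem_ctx; the caller still has to pass
 * the result to emit() (e.g. emit(MOV(dst, src)) or emit(ADD(dst, a, b)))
 * or push_tail() it to append it to the instruction stream.
 */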
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187 ALU2(SEL)
188
189 /** Gen4 predicated IF. */
190 fs_inst *
191 fs_visitor::IF(uint32_t predicate)
192 {
193 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195 return inst;
196 }
197
198 /** Gen6 IF with embedded comparison. */
199 fs_inst *
200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
201 {
202 assert(brw->gen == 6);
203 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
204 reg_null_d, src0, src1);
205 inst->conditional_mod = condition;
206 return inst;
207 }
208
209 /**
210 * CMP: Sets the low bit of the destination channels with the result
211 * of the comparison, while the upper bits are undefined, and updates
212 * the flag register with the packed 16 bits of the result.
213 */
214 fs_inst *
215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
216 {
217 fs_inst *inst;
218
219 /* Take the instruction:
220 *
221 * CMP null<d> src0<f> src1<f>
222 *
223 * Original gen4 does type conversion to the destination type before
224 * comparison, producing garbage results for floating point comparisons.
225 * gen5 does the comparison on the execution type (resolved source types),
226 * so dst type doesn't matter. gen6 does comparison and then uses the
227 * result as if it was the dst type with no conversion, which happens to
228 * mostly work out for float-interpreted-as-int since our comparisons are
229 * for >0, =0, <0.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 exec_list
247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
248 fs_reg varying_offset,
249 uint32_t const_offset)
250 {
251 exec_list instructions;
252 fs_inst *inst;
253
254 /* We have our constant surface use a pitch of 4 bytes, so our index can
255 * be any component of a vector, and then we load 4 contiguous
256 * components starting from that.
257 *
258 * We break down the const_offset to a portion added to the variable
259 * offset and a portion done using reg_offset, which means that if you
260 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
261 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
262 * CSE can later notice that those loads are all the same and eliminate
263 * the redundant ones.
264 */
265 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
266 instructions.push_tail(ADD(vec4_offset,
267 varying_offset, const_offset & ~3));
268
269 int scale = 1;
270 if (brw->gen == 4 && dispatch_width == 8) {
271 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
272 * u, v, r) as parameters, or we can just use the SIMD16 message
273 * consisting of (header, u). We choose the second, at the cost of a
274 * longer return length.
275 */
276 scale = 2;
277 }
278
279 enum opcode op;
280 if (brw->gen >= 7)
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
282 else
283 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
284 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
285 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
286 inst->regs_written = 4 * scale;
287 instructions.push_tail(inst);
288
289 if (brw->gen < 7) {
290 inst->base_mrf = 13;
291 inst->header_present = true;
292 if (brw->gen == 4)
293 inst->mlen = 3;
294 else
295 inst->mlen = 1 + dispatch_width / 8;
296 }
297
298 vec4_result.reg_offset += (const_offset & 3) * scale;
299 instructions.push_tail(MOV(dst, vec4_result));
300
301 return instructions;
302 }
303
304 /**
305 * A helper for generating MOVs that work around broken hardware SEND
306 * dependency handling.
307 */
308 fs_inst *
309 fs_visitor::DEP_RESOLVE_MOV(int grf)
310 {
311 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
312
313 inst->ir = NULL;
314 inst->annotation = "send dependency resolve";
315
316 /* The caller always wants uncompressed to emit the minimal extra
317 * dependencies, and to avoid having to deal with aligning its regs to 2.
318 */
319 inst->force_uncompressed = true;
320
321 return inst;
322 }
323
324 bool
325 fs_inst::equals(fs_inst *inst)
326 {
327 return (opcode == inst->opcode &&
328 dst.equals(inst->dst) &&
329 src[0].equals(inst->src[0]) &&
330 src[1].equals(inst->src[1]) &&
331 src[2].equals(inst->src[2]) &&
332 saturate == inst->saturate &&
333 predicate == inst->predicate &&
334 conditional_mod == inst->conditional_mod &&
335 mlen == inst->mlen &&
336 base_mrf == inst->base_mrf &&
337 sampler == inst->sampler &&
338 target == inst->target &&
339 eot == inst->eot &&
340 header_present == inst->header_present &&
341 shadow_compare == inst->shadow_compare &&
342 offset == inst->offset);
343 }
344
345 bool
346 fs_inst::overwrites_reg(const fs_reg &reg)
347 {
348 return (reg.file == dst.file &&
349 reg.reg == dst.reg &&
350 reg.reg_offset >= dst.reg_offset &&
351 reg.reg_offset < dst.reg_offset + regs_written);
352 }
353
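/* True if this instruction's message payload is read directly from GRFs
 * rather than being copied to MRFs first.  Such sources need to stay
 * contiguous, which matters for passes like split_virtual_grfs() below.
 */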
354 bool
355 fs_inst::is_send_from_grf()
356 {
357 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
358 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
359 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
360 src[1].file == GRF) ||
361 (is_tex() && src[0].file == GRF));
362 }
363
364 bool
365 fs_visitor::can_do_source_mods(fs_inst *inst)
366 {
367 if (brw->gen == 6 && inst->is_math())
368 return false;
369
370 if (inst->is_send_from_grf())
371 return false;
372
373 if (!inst->can_do_source_mods())
374 return false;
375
376 return true;
377 }
378
379 void
380 fs_reg::init()
381 {
382 memset(this, 0, sizeof(*this));
383 this->smear = -1;
384 }
385
386 /** Generic unset register constructor. */
387 fs_reg::fs_reg()
388 {
389 init();
390 this->file = BAD_FILE;
391 }
392
393 /** Immediate value constructor. */
394 fs_reg::fs_reg(float f)
395 {
396 init();
397 this->file = IMM;
398 this->type = BRW_REGISTER_TYPE_F;
399 this->imm.f = f;
400 }
401
402 /** Immediate value constructor. */
403 fs_reg::fs_reg(int32_t i)
404 {
405 init();
406 this->file = IMM;
407 this->type = BRW_REGISTER_TYPE_D;
408 this->imm.i = i;
409 }
410
411 /** Immediate value constructor. */
412 fs_reg::fs_reg(uint32_t u)
413 {
414 init();
415 this->file = IMM;
416 this->type = BRW_REGISTER_TYPE_UD;
417 this->imm.u = u;
418 }
419
420 /** Fixed brw_reg. */
421 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
422 {
423 init();
424 this->file = HW_REG;
425 this->fixed_hw_reg = fixed_hw_reg;
426 this->type = fixed_hw_reg.type;
427 }
428
429 bool
430 fs_reg::equals(const fs_reg &r) const
431 {
432 return (file == r.file &&
433 reg == r.reg &&
434 reg_offset == r.reg_offset &&
435 type == r.type &&
436 negate == r.negate &&
437 abs == r.abs &&
438 !reladdr && !r.reladdr &&
439 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
440 sizeof(fixed_hw_reg)) == 0 &&
441 smear == r.smear &&
442 imm.u == r.imm.u);
443 }
444
445 fs_reg
446 fs_reg::retype(uint32_t type)
447 {
448 fs_reg result = *this;
449 result.type = type;
450 return result;
451 }
452
453 bool
454 fs_reg::is_zero() const
455 {
456 if (file != IMM)
457 return false;
458
459 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
460 }
461
462 bool
463 fs_reg::is_one() const
464 {
465 if (file != IMM)
466 return false;
467
468 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
469 }
470
471 bool
472 fs_reg::is_null() const
473 {
474 return file == HW_REG &&
475 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
476 fixed_hw_reg.nr == BRW_ARF_NULL;
477 }
478
479 bool
480 fs_reg::is_valid_3src() const
481 {
482 return file == GRF || file == UNIFORM;
483 }
484
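/* Returns the storage size of a GLSL type in scalar components, the unit
 * that virtual_grf_alloc() and reg_offset work in: vectors count their
 * components, arrays and structs are summed recursively, and opaque types
 * (samplers, atomics) occupy no register space.
 */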
485 int
486 fs_visitor::type_size(const struct glsl_type *type)
487 {
488 unsigned int size, i;
489
490 switch (type->base_type) {
491 case GLSL_TYPE_UINT:
492 case GLSL_TYPE_INT:
493 case GLSL_TYPE_FLOAT:
494 case GLSL_TYPE_BOOL:
495 return type->components();
496 case GLSL_TYPE_ARRAY:
497 return type_size(type->fields.array) * type->length;
498 case GLSL_TYPE_STRUCT:
499 size = 0;
500 for (i = 0; i < type->length; i++) {
501 size += type_size(type->fields.structure[i].type);
502 }
503 return size;
504 case GLSL_TYPE_SAMPLER:
505 /* Samplers take up no register space, since they're baked in at
506 * link time.
507 */
508 return 0;
509 case GLSL_TYPE_ATOMIC_UINT:
510 return 0;
511 case GLSL_TYPE_IMAGE:
512 case GLSL_TYPE_VOID:
513 case GLSL_TYPE_ERROR:
514 case GLSL_TYPE_INTERFACE:
515 assert(!"not reached");
516 break;
517 }
518
519 return 0;
520 }
521
522 fs_reg
523 fs_visitor::get_timestamp()
524 {
525 assert(brw->gen >= 7);
526
527 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
528 BRW_ARF_TIMESTAMP,
529 0),
530 BRW_REGISTER_TYPE_UD));
531
532 fs_reg dst = fs_reg(this, glsl_type::uint_type);
533
534 fs_inst *mov = emit(MOV(dst, ts));
535 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
536 * even if it's not enabled in the dispatch.
537 */
538 mov->force_writemask_all = true;
539 mov->force_uncompressed = true;
540
541 /* The caller wants the low 32 bits of the timestamp. Since it's running
542 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
543 * which is plenty of time for our purposes. It is identical across the
544 * EUs, but since it's tracking GPU core speed it will increment at a
545 * varying rate as render P-states change.
546 *
547 * The caller could also check if render P-states have changed (or anything
548 * else that might disrupt timing) by setting smear to 2 and checking if
549 * that field is != 0.
550 */
551 dst.smear = 0;
552
553 return dst;
554 }
555
556 void
557 fs_visitor::emit_shader_time_begin()
558 {
559 current_annotation = "shader time start";
560 shader_start_time = get_timestamp();
561 }
562
563 void
564 fs_visitor::emit_shader_time_end()
565 {
566 current_annotation = "shader time end";
567
568 enum shader_time_shader_type type, written_type, reset_type;
569 if (dispatch_width == 8) {
570 type = ST_FS8;
571 written_type = ST_FS8_WRITTEN;
572 reset_type = ST_FS8_RESET;
573 } else {
574 assert(dispatch_width == 16);
575 type = ST_FS16;
576 written_type = ST_FS16_WRITTEN;
577 reset_type = ST_FS16_RESET;
578 }
579
580 fs_reg shader_end_time = get_timestamp();
581
582 /* Check that there weren't any timestamp reset events (assuming these
583 * were the only two timestamp reads that happened).
584 */
585 fs_reg reset = shader_end_time;
586 reset.smear = 2;
587 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
588 test->conditional_mod = BRW_CONDITIONAL_Z;
589 emit(IF(BRW_PREDICATE_NORMAL));
590
591 push_force_uncompressed();
592 fs_reg start = shader_start_time;
593 start.negate = true;
594 fs_reg diff = fs_reg(this, glsl_type::uint_type);
595 emit(ADD(diff, start, shader_end_time));
596
597 /* If there were no instructions between the two timestamp gets, the diff
598 * is 2 cycles. Remove that overhead, so I can forget about that when
599 * trying to determine the time taken for single instructions.
600 */
601 emit(ADD(diff, diff, fs_reg(-2u)));
602
603 emit_shader_time_write(type, diff);
604 emit_shader_time_write(written_type, fs_reg(1u));
605 emit(BRW_OPCODE_ELSE);
606 emit_shader_time_write(reset_type, fs_reg(1u));
607 emit(BRW_OPCODE_ENDIF);
608
609 pop_force_uncompressed();
610 }
611
612 void
613 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
614 fs_reg value)
615 {
616 int shader_time_index =
617 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
618 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
619
620 fs_reg payload;
621 if (dispatch_width == 8)
622 payload = fs_reg(this, glsl_type::uvec2_type);
623 else
624 payload = fs_reg(this, glsl_type::uint_type);
625
626 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
627 fs_reg(), payload, offset, value));
628 }
629
630 void
631 fs_visitor::fail(const char *format, ...)
632 {
633 va_list va;
634 char *msg;
635
636 if (failed)
637 return;
638
639 failed = true;
640
641 va_start(va, format);
642 msg = ralloc_vasprintf(mem_ctx, format, va);
643 va_end(va);
644 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
645
646 this->fail_msg = msg;
647
648 if (INTEL_DEBUG & DEBUG_WM) {
649 fprintf(stderr, "%s", msg);
650 }
651 }
652
653 fs_inst *
654 fs_visitor::emit(enum opcode opcode)
655 {
656 return emit(fs_inst(opcode));
657 }
658
659 fs_inst *
660 fs_visitor::emit(enum opcode opcode, fs_reg dst)
661 {
662 return emit(fs_inst(opcode, dst));
663 }
664
665 fs_inst *
666 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
667 {
668 return emit(fs_inst(opcode, dst, src0));
669 }
670
671 fs_inst *
672 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
673 {
674 return emit(fs_inst(opcode, dst, src0, src1));
675 }
676
677 fs_inst *
678 fs_visitor::emit(enum opcode opcode, fs_reg dst,
679 fs_reg src0, fs_reg src1, fs_reg src2)
680 {
681 return emit(fs_inst(opcode, dst, src0, src1, src2));
682 }
683
684 void
685 fs_visitor::push_force_uncompressed()
686 {
687 force_uncompressed_stack++;
688 }
689
690 void
691 fs_visitor::pop_force_uncompressed()
692 {
693 force_uncompressed_stack--;
694 assert(force_uncompressed_stack >= 0);
695 }
696
697 /**
698 * Returns true if the instruction has a flag that means it won't
699 * update an entire destination register.
700 *
701 * For example, dead code elimination and live variable analysis want to know
702 * when a write to a variable screens off any preceding values that were in
703 * it.
704 */
705 bool
706 fs_inst::is_partial_write()
707 {
708 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
709 this->force_uncompressed ||
710 this->force_sechalf);
711 }
712
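/* Number of virtual GRFs read by source 'arg'.  A texture message sent from
 * a GRF reads its whole payload: mlen is counted in physical registers, and
 * in SIMD16 a virtual GRF spans two physical registers, hence the rounded-up
 * halving.  Everything else reads a single register.
 */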
713 int
714 fs_inst::regs_read(fs_visitor *v, int arg)
715 {
716 if (is_tex() && arg == 0 && src[0].file == GRF) {
717 if (v->dispatch_width == 16)
718 return (mlen + 1) / 2;
719 else
720 return mlen;
721 }
722 return 1;
723 }
724
725 bool
726 fs_inst::reads_flag()
727 {
728 return predicate;
729 }
730
731 bool
732 fs_inst::writes_flag()
733 {
734 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
735 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
736 }
737
738 /**
739 * Returns how many MRFs an FS opcode will write over.
740 *
741 * Note that this is not the 0 or 1 implied writes in an actual gen
742 * instruction -- the FS opcodes often generate MOVs in addition.
743 */
744 int
745 fs_visitor::implied_mrf_writes(fs_inst *inst)
746 {
747 if (inst->mlen == 0)
748 return 0;
749
750 if (inst->base_mrf == -1)
751 return 0;
752
753 switch (inst->opcode) {
754 case SHADER_OPCODE_RCP:
755 case SHADER_OPCODE_RSQ:
756 case SHADER_OPCODE_SQRT:
757 case SHADER_OPCODE_EXP2:
758 case SHADER_OPCODE_LOG2:
759 case SHADER_OPCODE_SIN:
760 case SHADER_OPCODE_COS:
761 return 1 * dispatch_width / 8;
762 case SHADER_OPCODE_POW:
763 case SHADER_OPCODE_INT_QUOTIENT:
764 case SHADER_OPCODE_INT_REMAINDER:
765 return 2 * dispatch_width / 8;
766 case SHADER_OPCODE_TEX:
767 case FS_OPCODE_TXB:
768 case SHADER_OPCODE_TXD:
769 case SHADER_OPCODE_TXF:
770 case SHADER_OPCODE_TXF_CMS:
771 case SHADER_OPCODE_TXF_MCS:
772 case SHADER_OPCODE_TG4:
773 case SHADER_OPCODE_TG4_OFFSET:
774 case SHADER_OPCODE_TXL:
775 case SHADER_OPCODE_TXS:
776 case SHADER_OPCODE_LOD:
777 return 1;
778 case FS_OPCODE_FB_WRITE:
779 return 2;
780 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
781 case SHADER_OPCODE_GEN4_SCRATCH_READ:
782 return 1;
783 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
784 return inst->mlen;
785 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
786 return 2;
787 case SHADER_OPCODE_UNTYPED_ATOMIC:
788 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
789 return 0;
790 default:
791 assert(!"not reached");
792 return inst->mlen;
793 }
794 }
795
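/* Allocates a new virtual GRF of 'size' contiguous register units and
 * returns its index, growing the virtual_grf_sizes[] array geometrically as
 * needed.
 */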
796 int
797 fs_visitor::virtual_grf_alloc(int size)
798 {
799 if (virtual_grf_array_size <= virtual_grf_count) {
800 if (virtual_grf_array_size == 0)
801 virtual_grf_array_size = 16;
802 else
803 virtual_grf_array_size *= 2;
804 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
805 virtual_grf_array_size);
806 }
807 virtual_grf_sizes[virtual_grf_count] = size;
808 return virtual_grf_count++;
809 }
810
811 /** Register file/number constructor; the type defaults to float. */
812 fs_reg::fs_reg(enum register_file file, int reg)
813 {
814 init();
815 this->file = file;
816 this->reg = reg;
817 this->type = BRW_REGISTER_TYPE_F;
818 }
819
820 /** Register file/number/type constructor. */
821 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
822 {
823 init();
824 this->file = file;
825 this->reg = reg;
826 this->type = type;
827 }
828
829 /** Automatic reg constructor. */
830 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
831 {
832 init();
833
834 this->file = GRF;
835 this->reg = v->virtual_grf_alloc(v->type_size(type));
836 this->reg_offset = 0;
837 this->type = brw_type_for_base_type(type);
838 }
839
840 fs_reg *
841 fs_visitor::variable_storage(ir_variable *var)
842 {
843 return (fs_reg *)hash_table_find(this->variable_ht, var);
844 }
845
846 void
847 import_uniforms_callback(const void *key,
848 void *data,
849 void *closure)
850 {
851 struct hash_table *dst_ht = (struct hash_table *)closure;
852 const fs_reg *reg = (const fs_reg *)data;
853
854 if (reg->file != UNIFORM)
855 return;
856
857 hash_table_insert(dst_ht, data, key);
858 }
859
860 /* For SIMD16, we need to follow the uniform setup from the SIMD8 dispatch.
861 * This brings in those uniform definitions.
862 */
863 void
864 fs_visitor::import_uniforms(fs_visitor *v)
865 {
866 hash_table_call_foreach(v->variable_ht,
867 import_uniforms_callback,
868 variable_ht);
869 this->params_remap = v->params_remap;
870 this->nr_params_remap = v->nr_params_remap;
871 }
872
873 /* Our support for uniforms is piggy-backed on the struct
874 * gl_fragment_program, because that's where the values actually
875 * get stored, rather than in some global gl_shader_program uniform
876 * store.
877 */
878 void
879 fs_visitor::setup_uniform_values(ir_variable *ir)
880 {
881 int namelen = strlen(ir->name);
882
883 /* The data for our (non-builtin) uniforms is stored in a series of
884 * gl_uniform_driver_storage structs for each subcomponent that
885 * glGetUniformLocation() could name. We know it's been set up in the same
886 * order we'd walk the type, so walk the list of storage and find anything
887 * with our name, or the prefix of a component that starts with our name.
888 */
889 unsigned params_before = c->prog_data.nr_params;
890 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
891 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
892
893 if (strncmp(ir->name, storage->name, namelen) != 0 ||
894 (storage->name[namelen] != 0 &&
895 storage->name[namelen] != '.' &&
896 storage->name[namelen] != '[')) {
897 continue;
898 }
899
900 unsigned slots = storage->type->component_slots();
901 if (storage->array_elements)
902 slots *= storage->array_elements;
903
904 for (unsigned i = 0; i < slots; i++) {
905 c->prog_data.param[c->prog_data.nr_params++] =
906 &storage->storage[i].f;
907 }
908 }
909
910 /* Make sure we actually initialized the right amount of stuff here. */
911 assert(params_before + ir->type->component_slots() ==
912 c->prog_data.nr_params);
913 (void)params_before;
914 }
915
916
917 /* Our support for builtin uniforms is even scarier than non-builtin.
918 * It sits on top of the PROG_STATE_VAR parameters that are
919 * automatically updated from GL context state.
920 */
921 void
922 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
923 {
924 const ir_state_slot *const slots = ir->state_slots;
925 assert(ir->state_slots != NULL);
926
927 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
928 /* This state reference has already been setup by ir_to_mesa, but we'll
929 * get the same index back here.
930 */
931 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
932 (gl_state_index *)slots[i].tokens);
933
934 /* Add each of the unique swizzles of the element as a parameter.
935 * This'll end up matching the expected layout of the
936 * array/matrix/structure we're trying to fill in.
937 */
938 int last_swiz = -1;
939 for (unsigned int j = 0; j < 4; j++) {
940 int swiz = GET_SWZ(slots[i].swizzle, j);
941 if (swiz == last_swiz)
942 break;
943 last_swiz = swiz;
944
945 c->prog_data.param[c->prog_data.nr_params++] =
946 &fp->Base.Parameters->ParameterValues[index][swiz].f;
947 }
948 }
949 }
950
951 fs_reg *
952 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
953 {
954 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
955 fs_reg wpos = *reg;
956 bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
957
958 /* gl_FragCoord.x */
959 if (ir->data.pixel_center_integer) {
960 emit(MOV(wpos, this->pixel_x));
961 } else {
962 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
963 }
964 wpos.reg_offset++;
965
966 /* gl_FragCoord.y */
967 if (!flip && ir->data.pixel_center_integer) {
968 emit(MOV(wpos, this->pixel_y));
969 } else {
970 fs_reg pixel_y = this->pixel_y;
971 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
972
973 if (flip) {
974 pixel_y.negate = true;
975 offset += c->key.drawable_height - 1.0;
976 }
977
978 emit(ADD(wpos, pixel_y, fs_reg(offset)));
979 }
980 wpos.reg_offset++;
981
982 /* gl_FragCoord.z */
983 if (brw->gen >= 6) {
984 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
985 } else {
986 emit(FS_OPCODE_LINTERP, wpos,
987 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
988 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
989 interp_reg(VARYING_SLOT_POS, 2));
990 }
991 wpos.reg_offset++;
992
993 /* gl_FragCoord.w: Already set up in emit_interpolation */
994 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
995
996 return reg;
997 }
998
999 fs_inst *
1000 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1001 glsl_interp_qualifier interpolation_mode,
1002 bool is_centroid, bool is_sample)
1003 {
1004 brw_wm_barycentric_interp_mode barycoord_mode;
1005 if (brw->gen >= 6) {
1006 if (is_centroid) {
1007 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1008 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1009 else
1010 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1011 } else if (is_sample) {
1012 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1013 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1014 else
1015 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1016 } else {
1017 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1018 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1019 else
1020 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1021 }
1022 } else {
1023 /* On Ironlake and below, there is only one interpolation mode.
1024 * Centroid interpolation doesn't mean anything on this hardware --
1025 * there is no multisampling.
1026 */
1027 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1028 }
1029 return emit(FS_OPCODE_LINTERP, attr,
1030 this->delta_x[barycoord_mode],
1031 this->delta_y[barycoord_mode], interp);
1032 }
1033
1034 fs_reg *
1035 fs_visitor::emit_general_interpolation(ir_variable *ir)
1036 {
1037 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1038 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1039 fs_reg attr = *reg;
1040
1041 unsigned int array_elements;
1042 const glsl_type *type;
1043
1044 if (ir->type->is_array()) {
1045 array_elements = ir->type->length;
1046 if (array_elements == 0) {
1047 fail("dereferenced array '%s' has length 0\n", ir->name);
1048 }
1049 type = ir->type->fields.array;
1050 } else {
1051 array_elements = 1;
1052 type = ir->type;
1053 }
1054
1055 glsl_interp_qualifier interpolation_mode =
1056 ir->determine_interpolation_mode(c->key.flat_shade);
1057
1058 int location = ir->data.location;
1059 for (unsigned int i = 0; i < array_elements; i++) {
1060 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1061 if (c->prog_data.urb_setup[location] == -1) {
1062 /* If there's no incoming setup data for this slot, don't
1063 * emit interpolation for it.
1064 */
1065 attr.reg_offset += type->vector_elements;
1066 location++;
1067 continue;
1068 }
1069
1070 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1071 /* Constant interpolation (flat shading) case. The SF has
1072 * handed us defined values in only the constant offset
1073 * field of the setup reg.
1074 */
1075 for (unsigned int k = 0; k < type->vector_elements; k++) {
1076 struct brw_reg interp = interp_reg(location, k);
1077 interp = suboffset(interp, 3);
1078 interp.type = reg->type;
1079 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1080 attr.reg_offset++;
1081 }
1082 } else {
1083 /* Smooth/noperspective interpolation case. */
1084 for (unsigned int k = 0; k < type->vector_elements; k++) {
1085 /* FINISHME: At some point we probably want to push
1086 * this farther by giving similar treatment to the
1087 * other potentially constant components of the
1088 * attribute, as well as making brw_vs_constval.c
1089 * handle varyings other than gl_TexCoord.
1090 */
1091 struct brw_reg interp = interp_reg(location, k);
1092 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1093 ir->data.centroid && !c->key.persample_shading,
1094 ir->data.sample || c->key.persample_shading);
1095 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1096 /* Get the pixel/sample mask into f0 so that we know
1097 * which pixels are lit. Then, for each channel that is
1098 * unlit, replace the centroid data with non-centroid
1099 * data.
1100 */
1101 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1102 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1103 interpolation_mode,
1104 false, false);
1105 inst->predicate = BRW_PREDICATE_NORMAL;
1106 inst->predicate_inverse = true;
1107 }
1108 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1109 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1110 }
1111 attr.reg_offset++;
1112 }
1113
1114 }
1115 location++;
1116 }
1117 }
1118
1119 return reg;
1120 }
1121
1122 fs_reg *
1123 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1124 {
1125 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1126
1127 /* The frontfacing comes in as a bit in the thread payload. */
1128 if (brw->gen >= 6) {
1129 emit(BRW_OPCODE_ASR, *reg,
1130 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1131 fs_reg(15));
1132 emit(BRW_OPCODE_NOT, *reg, *reg);
1133 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1134 } else {
1135 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1136 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1137 * us front face
1138 */
1139 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1140 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1141 }
1142
1143 return reg;
1144 }
1145
1146 void
1147 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1148 {
1149 assert(dst.type == BRW_REGISTER_TYPE_F);
1150
1151 if (c->key.compute_pos_offset) {
1152 /* Convert int_sample_pos to floating point */
1153 emit(MOV(dst, int_sample_pos));
1154 /* Scale to the range [0, 1] */
1155 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1156 }
1157 else {
1158 /* From ARB_sample_shading specification:
1159 * "When rendering to a non-multisample buffer, or if multisample
1160 * rasterization is disabled, gl_SamplePosition will always be
1161 * (0.5, 0.5).
1162 */
1163 emit(MOV(dst, fs_reg(0.5f)));
1164 }
1165 }
1166
1167 fs_reg *
1168 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1169 {
1170 assert(brw->gen >= 6);
1171 assert(ir->type == glsl_type::vec2_type);
1172
1173 this->current_annotation = "compute sample position";
1174 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1175 fs_reg pos = *reg;
1176 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1177 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1178
1179 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1180 * mode will be enabled.
1181 *
1182 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1183 * R31.1:0 Position Offset X/Y for Slot[3:0]
1184 * R31.3:2 Position Offset X/Y for Slot[7:4]
1185 * .....
1186 *
1187 * The X, Y sample positions come in as bytes in thread payload. So, read
1188 * the positions using vstride=16, width=8, hstride=2.
1189 */
1190 struct brw_reg sample_pos_reg =
1191 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1192 BRW_REGISTER_TYPE_B), 16, 8, 2);
1193
1194 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1195 if (dispatch_width == 16) {
1196 int_sample_x.sechalf = true;
1197 fs_inst *inst = emit(MOV(int_sample_x,
1198 fs_reg(suboffset(sample_pos_reg, 16))));
1199 inst->force_sechalf = true;
1200 int_sample_x.sechalf = false;
1201 }
1202 /* Compute gl_SamplePosition.x */
1203 compute_sample_position(pos, int_sample_x);
1204 pos.reg_offset++;
1205 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1206 if (dispatch_width == 16) {
1207 int_sample_y.sechalf = true;
1208 fs_inst *inst = emit(MOV(int_sample_y,
1209 fs_reg(suboffset(sample_pos_reg, 17))));
1210 inst->force_sechalf = true;
1211 int_sample_y.sechalf = false;
1212 }
1213 /* Compute gl_SamplePosition.y */
1214 compute_sample_position(pos, int_sample_y);
1215 return reg;
1216 }
1217
1218 fs_reg *
1219 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1220 {
1221 assert(brw->gen >= 6);
1222
1223 this->current_annotation = "compute sample id";
1224 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1225
1226 if (c->key.compute_sample_id) {
1227 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1228 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1229 t2.type = BRW_REGISTER_TYPE_UW;
1230
1231 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1232 * 8x multisampling, subspan 0 will represent sample N (where N
1233 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1234 * 7. We can find the value of N by looking at R0.0 bits 7:6
1235 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1236 * (since samples are always delivered in pairs). That is, we
1237 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1238 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1239 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1240 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1241 * populating a temporary variable with the sequence (0, 1, 2, 3),
1242 * and then reading from it using vstride=1, width=4, hstride=0.
1243 * These computations hold good for 4x multisampling as well.
1244 */
1245 emit(BRW_OPCODE_AND, t1,
1246 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1247 fs_reg(brw_imm_d(0xc0)));
1248 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1249 /* This works for both SIMD8 and SIMD16 */
1250 emit(MOV(t2, brw_imm_v(0x3210)));
1251 /* This special instruction takes care of setting vstride=1,
1252 * width=4, hstride=0 of t2 during an ADD instruction.
1253 */
1254 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1255 } else {
1256 /* As per GL_ARB_sample_shading specification:
1257 * "When rendering to a non-multisample buffer, or if multisample
1258 * rasterization is disabled, gl_SampleID will always be zero."
1259 */
1260 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1261 }
1262
1263 return reg;
1264 }
1265
1266 fs_reg *
1267 fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
1268 {
1269 assert(brw->gen >= 7);
1270 this->current_annotation = "compute gl_SampleMaskIn";
1271 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1272 emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
1273 return reg;
1274 }
1275
1276 fs_reg
1277 fs_visitor::fix_math_operand(fs_reg src)
1278 {
1279 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1280 * might be able to do better by doing execsize = 1 math and then
1281 * expanding that result out, but we would need to be careful with
1282 * masking.
1283 *
1284 * The hardware ignores source modifiers (negate and abs) on math
1285 * instructions, so we also move to a temp to set those up.
1286 */
1287 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1288 !src.abs && !src.negate)
1289 return src;
1290
1291 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1292 * operands to math
1293 */
1294 if (brw->gen >= 7 && src.file != IMM)
1295 return src;
1296
1297 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1298 expanded.type = src.type;
1299 emit(BRW_OPCODE_MOV, expanded, src);
1300 return expanded;
1301 }
1302
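/* Emits a unary math instruction, working around per-generation
 * restrictions: on Gen6+ the operand may need to be copied to a temporary
 * (see fix_math_operand()), while on Gen4/5 math is a send that reads its
 * operand from MRFs, so base_mrf/mlen are set up here.
 */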
1303 fs_inst *
1304 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1305 {
1306 switch (opcode) {
1307 case SHADER_OPCODE_RCP:
1308 case SHADER_OPCODE_RSQ:
1309 case SHADER_OPCODE_SQRT:
1310 case SHADER_OPCODE_EXP2:
1311 case SHADER_OPCODE_LOG2:
1312 case SHADER_OPCODE_SIN:
1313 case SHADER_OPCODE_COS:
1314 break;
1315 default:
1316 assert(!"not reached: bad math opcode");
1317 return NULL;
1318 }
1319
1320 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1321 * might be able to do better by doing execsize = 1 math and then
1322 * expanding that result out, but we would need to be careful with
1323 * masking.
1324 *
1325 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1326 * instructions, so we also move to a temp to set those up.
1327 */
1328 if (brw->gen >= 6)
1329 src = fix_math_operand(src);
1330
1331 fs_inst *inst = emit(opcode, dst, src);
1332
1333 if (brw->gen < 6) {
1334 inst->base_mrf = 2;
1335 inst->mlen = dispatch_width / 8;
1336 }
1337
1338 return inst;
1339 }
1340
1341 fs_inst *
1342 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1343 {
1344 int base_mrf = 2;
1345 fs_inst *inst;
1346
1347 switch (opcode) {
1348 case SHADER_OPCODE_INT_QUOTIENT:
1349 case SHADER_OPCODE_INT_REMAINDER:
1350 if (brw->gen >= 7 && dispatch_width == 16)
1351 fail("SIMD16 INTDIV unsupported\n");
1352 break;
1353 case SHADER_OPCODE_POW:
1354 break;
1355 default:
1356 assert(!"not reached: unsupported binary math opcode.");
1357 return NULL;
1358 }
1359
1360 if (brw->gen >= 6) {
1361 src0 = fix_math_operand(src0);
1362 src1 = fix_math_operand(src1);
1363
1364 inst = emit(opcode, dst, src0, src1);
1365 } else {
1366 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1367 * "Message Payload":
1368 *
1369 * "Operand0[7]. For the INT DIV functions, this operand is the
1370 * denominator."
1371 * ...
1372 * "Operand1[7]. For the INT DIV functions, this operand is the
1373 * numerator."
1374 */
1375 bool is_int_div = opcode != SHADER_OPCODE_POW;
1376 fs_reg &op0 = is_int_div ? src1 : src0;
1377 fs_reg &op1 = is_int_div ? src0 : src1;
1378
1379 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1380 inst = emit(opcode, dst, op0, reg_null_f);
1381
1382 inst->base_mrf = base_mrf;
1383 inst->mlen = 2 * dispatch_width / 8;
1384 }
1385 return inst;
1386 }
1387
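/* Maps UNIFORM-file registers onto the fixed CURBE (push constant) GRFs that
 * the hardware loads immediately after the payload registers, and records
 * how many registers' worth of constants that takes.
 */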
1388 void
1389 fs_visitor::assign_curb_setup()
1390 {
1391 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1392 if (dispatch_width == 8) {
1393 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1394 } else {
1395 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1396 }
1397
1398 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1399 foreach_list(node, &this->instructions) {
1400 fs_inst *inst = (fs_inst *)node;
1401
1402 for (unsigned int i = 0; i < 3; i++) {
1403 if (inst->src[i].file == UNIFORM) {
1404 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1405 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1406 constant_nr / 8,
1407 constant_nr % 8);
1408
1409 inst->src[i].file = HW_REG;
1410 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1411 }
1412 }
1413 }
1414 }
1415
1416 void
1417 fs_visitor::calculate_urb_setup()
1418 {
1419 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1420 c->prog_data.urb_setup[i] = -1;
1421 }
1422
1423 int urb_next = 0;
1424 /* Figure out where each of the incoming setup attributes lands. */
1425 if (brw->gen >= 6) {
1426 if (_mesa_bitcount_64(fp->Base.InputsRead &
1427 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1428 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1429 * first 16 varying inputs, so we can put them wherever we want.
1430 * Just put them in order.
1431 *
1432 * This is useful because it means that (a) inputs not used by the
1433 * fragment shader won't take up valuable register space, and (b) we
1434 * won't have to recompile the fragment shader if it gets paired with
1435 * a different vertex (or geometry) shader.
1436 */
1437 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1438 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1439 BITFIELD64_BIT(i)) {
1440 c->prog_data.urb_setup[i] = urb_next++;
1441 }
1442 }
1443 } else {
1444 /* We have enough input varyings that the SF/SBE pipeline stage can't
1445 * arbitrarily rearrange them to suit our whim; we have to put them
1446 * in an order that matches the output of the previous pipeline stage
1447 * (geometry or vertex shader).
1448 */
1449 struct brw_vue_map prev_stage_vue_map;
1450 brw_compute_vue_map(brw, &prev_stage_vue_map,
1451 c->key.input_slots_valid);
1452 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1453 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1454 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1455 slot++) {
1456 int varying = prev_stage_vue_map.slot_to_varying[slot];
1457 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1458 * unused.
1459 */
1460 if (varying != BRW_VARYING_SLOT_COUNT &&
1461 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1462 BITFIELD64_BIT(varying))) {
1463 c->prog_data.urb_setup[varying] = slot - first_slot;
1464 }
1465 }
1466 urb_next = prev_stage_vue_map.num_slots - first_slot;
1467 }
1468 } else {
1469 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1470 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1471 /* Point size is packed into the header, not as a general attribute */
1472 if (i == VARYING_SLOT_PSIZ)
1473 continue;
1474
1475 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1476 /* The back color slot is skipped when the front color is
1477 * also written to. In addition, some slots can be
1478 * written in the vertex shader and not read in the
1479 * fragment shader. So the register number must always be
1480 * incremented, mapped or not.
1481 */
1482 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1483 c->prog_data.urb_setup[i] = urb_next;
1484 urb_next++;
1485 }
1486 }
1487
1488 /*
1489 * It's an FS-only attribute, and we did interpolation for this attribute
1490 * in the SF thread. So, count it here, too.
1491 *
1492 * See compile_sf_prog() for more info.
1493 */
1494 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1495 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1496 }
1497
1498 c->prog_data.num_varying_inputs = urb_next;
1499 }
1500
1501 void
1502 fs_visitor::assign_urb_setup()
1503 {
1504 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1505
1506 /* Offset all the urb_setup[] index by the actual position of the
1507 * setup regs, now that the location of the constants has been chosen.
1508 */
1509 foreach_list(node, &this->instructions) {
1510 fs_inst *inst = (fs_inst *)node;
1511
1512 if (inst->opcode == FS_OPCODE_LINTERP) {
1513 assert(inst->src[2].file == HW_REG);
1514 inst->src[2].fixed_hw_reg.nr += urb_start;
1515 }
1516
1517 if (inst->opcode == FS_OPCODE_CINTERP) {
1518 assert(inst->src[0].file == HW_REG);
1519 inst->src[0].fixed_hw_reg.nr += urb_start;
1520 }
1521 }
1522
1523 /* Each attribute is 4 setup channels, each of which is half a reg. */
1524 this->first_non_payload_grf =
1525 urb_start + c->prog_data.num_varying_inputs * 2;
1526 }
1527
1528 /**
1529 * Split large virtual GRFs into separate components if we can.
1530 *
1531 * This is mostly duplicated with what brw_fs_vector_splitting does,
1532 * but that's really conservative because it's afraid of doing
1533 * splitting that doesn't result in real progress after the rest of
1534 * the optimization phases, which would cause infinite looping in
1535 * optimization. We can do it once here, safely. This also has the
1536 * opportunity to split interpolated values, or maybe even uniforms,
1537 * which we don't have at the IR level.
1538 *
1539 * We want to split, because virtual GRFs are what we register
1540 * allocate and spill (due to contiguousness requirements for some
1541 * instructions), and they're what we naturally generate in the
1542 * codegen process, but most virtual GRFs don't actually need to be
1543 * contiguous sets of GRFs. If we split, we'll end up with reduced
1544 * live intervals and better dead code elimination and coalescing.
1545 */
1546 void
1547 fs_visitor::split_virtual_grfs()
1548 {
1549 int num_vars = this->virtual_grf_count;
1550 bool split_grf[num_vars];
1551 int new_virtual_grf[num_vars];
1552
1553 /* Try to split anything > 0 sized. */
1554 for (int i = 0; i < num_vars; i++) {
1555 if (this->virtual_grf_sizes[i] != 1)
1556 split_grf[i] = true;
1557 else
1558 split_grf[i] = false;
1559 }
1560
1561 if (brw->has_pln &&
1562 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1563 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1564 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1565 * Gen6, that was the only supported interpolation mode, and since Gen6,
1566 * delta_x and delta_y are in fixed hardware registers.
1567 */
1568 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1569 false;
1570 }
1571
1572 foreach_list(node, &this->instructions) {
1573 fs_inst *inst = (fs_inst *)node;
1574
1575 /* If there's a SEND message that requires contiguous destination
1576 * registers, no splitting is allowed.
1577 */
1578 if (inst->regs_written > 1) {
1579 split_grf[inst->dst.reg] = false;
1580 }
1581
1582 /* If we're sending from a GRF, don't split it, on the assumption that
1583 * the send is reading the whole thing.
1584 */
1585 if (inst->is_send_from_grf()) {
1586 for (int i = 0; i < 3; i++) {
1587 if (inst->src[i].file == GRF) {
1588 split_grf[inst->src[i].reg] = false;
1589 }
1590 }
1591 }
1592 }
1593
1594 /* Allocate new space for split regs. Note that the virtual
1595 * numbers will be contiguous.
1596 */
1597 for (int i = 0; i < num_vars; i++) {
1598 if (split_grf[i]) {
1599 new_virtual_grf[i] = virtual_grf_alloc(1);
1600 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1601 int reg = virtual_grf_alloc(1);
1602 assert(reg == new_virtual_grf[i] + j - 1);
1603 (void) reg;
1604 }
1605 this->virtual_grf_sizes[i] = 1;
1606 }
1607 }
1608
1609 foreach_list(node, &this->instructions) {
1610 fs_inst *inst = (fs_inst *)node;
1611
1612 if (inst->dst.file == GRF &&
1613 split_grf[inst->dst.reg] &&
1614 inst->dst.reg_offset != 0) {
1615 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1616 inst->dst.reg_offset - 1);
1617 inst->dst.reg_offset = 0;
1618 }
1619 for (int i = 0; i < 3; i++) {
1620 if (inst->src[i].file == GRF &&
1621 split_grf[inst->src[i].reg] &&
1622 inst->src[i].reg_offset != 0) {
1623 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1624 inst->src[i].reg_offset - 1);
1625 inst->src[i].reg_offset = 0;
1626 }
1627 }
1628 }
1629 invalidate_live_intervals();
1630 }
1631
1632 /**
1633 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1634 *
1635 * During code generation, we create tons of temporary variables, many of
1636 * which get immediately killed and are never used again. Yet, in later
1637 * optimization and analysis passes, such as compute_live_intervals, we need
1638 * to loop over all the virtual GRFs. Compacting them can save a lot of
1639 * overhead.
1640 */
1641 void
1642 fs_visitor::compact_virtual_grfs()
1643 {
1644 /* Mark which virtual GRFs are used, and count how many. */
1645 int remap_table[this->virtual_grf_count];
1646 memset(remap_table, -1, sizeof(remap_table));
1647
1648 foreach_list(node, &this->instructions) {
1649 const fs_inst *inst = (const fs_inst *) node;
1650
1651 if (inst->dst.file == GRF)
1652 remap_table[inst->dst.reg] = 0;
1653
1654 for (int i = 0; i < 3; i++) {
1655 if (inst->src[i].file == GRF)
1656 remap_table[inst->src[i].reg] = 0;
1657 }
1658 }
1659
1660 /* In addition to registers used in instructions, fs_visitor keeps
1661 * direct references to certain special values which must be patched:
1662 */
1663 fs_reg *special[] = {
1664 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1665 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1666 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1667 &delta_x[0], &delta_x[1], &delta_x[2],
1668 &delta_x[3], &delta_x[4], &delta_x[5],
1669 &delta_y[0], &delta_y[1], &delta_y[2],
1670 &delta_y[3], &delta_y[4], &delta_y[5],
1671 };
1672 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1673 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1674
1675 /* Treat all special values as used, to be conservative */
1676 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1677 if (special[i]->file == GRF)
1678 remap_table[special[i]->reg] = 0;
1679 }
1680
1681 /* Compact the GRF arrays. */
1682 int new_index = 0;
1683 for (int i = 0; i < this->virtual_grf_count; i++) {
1684 if (remap_table[i] != -1) {
1685 remap_table[i] = new_index;
1686 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1687 invalidate_live_intervals();
1688 ++new_index;
1689 }
1690 }
1691
1692 this->virtual_grf_count = new_index;
1693
1694 /* Patch all the instructions to use the newly renumbered registers */
1695 foreach_list(node, &this->instructions) {
1696 fs_inst *inst = (fs_inst *) node;
1697
1698 if (inst->dst.file == GRF)
1699 inst->dst.reg = remap_table[inst->dst.reg];
1700
1701 for (int i = 0; i < 3; i++) {
1702 if (inst->src[i].file == GRF)
1703 inst->src[i].reg = remap_table[inst->src[i].reg];
1704 }
1705 }
1706
1707 /* Patch all the references to special values */
1708 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1709 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1710 special[i]->reg = remap_table[special[i]->reg];
1711 }
1712 }
1713
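/* Drops push constants that are never read by the program.  The SIMD8 pass
 * builds params_remap[] by scanning every UNIFORM source and compacting
 * c->prog_data.param[]; the SIMD16 pass reuses that mapping so both dispatch
 * widths agree on the uniform layout.
 */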
1714 bool
1715 fs_visitor::remove_dead_constants()
1716 {
1717 if (dispatch_width == 8) {
1718 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1719 this->nr_params_remap = c->prog_data.nr_params;
1720
1721 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1722 this->params_remap[i] = -1;
1723
1724 /* Find which params are still in use. */
1725 foreach_list(node, &this->instructions) {
1726 fs_inst *inst = (fs_inst *)node;
1727
1728 for (int i = 0; i < 3; i++) {
1729 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1730
1731 if (inst->src[i].file != UNIFORM)
1732 continue;
1733
1734 /* Section 5.11 of the OpenGL 4.3 spec says:
1735 *
1736 * "Out-of-bounds reads return undefined values, which include
1737 * values from other variables of the active program or zero."
1738 */
1739 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1740 constant_nr = 0;
1741 }
1742
1743 /* For now, set this to non-negative. We'll give it the
1744 * actual new number in a moment, in order to keep the
1745 * register numbers nicely ordered.
1746 */
1747 this->params_remap[constant_nr] = 0;
1748 }
1749 }
1750
1751 /* Figure out what the new numbers for the params will be. At some
1752 * point when we're doing uniform array access, we're going to want
1753 * to keep the distinction between .reg and .reg_offset, but for
1754 * now we don't care.
1755 */
1756 unsigned int new_nr_params = 0;
1757 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1758 if (this->params_remap[i] != -1) {
1759 this->params_remap[i] = new_nr_params++;
1760 }
1761 }
1762
1763 /* Update the list of params to be uploaded to match our new numbering. */
1764 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1765 int remapped = this->params_remap[i];
1766
1767 if (remapped == -1)
1768 continue;
1769
1770 c->prog_data.param[remapped] = c->prog_data.param[i];
1771 }
1772
1773 c->prog_data.nr_params = new_nr_params;
1774 } else {
1775 /* This should have been generated in the SIMD8 pass already. */
1776 assert(this->params_remap);
1777 }
1778
1779 /* Now do the renumbering of the shader to remove unused params. */
1780 foreach_list(node, &this->instructions) {
1781 fs_inst *inst = (fs_inst *)node;
1782
1783 for (int i = 0; i < 3; i++) {
1784 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1785
1786 if (inst->src[i].file != UNIFORM)
1787 continue;
1788
1789 /* As above, alias out-of-bounds constants to 0. */
1790 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1791 constant_nr = 0;
1792 }
1793 assert(this->params_remap[constant_nr] != -1);
1794 inst->src[i].reg = this->params_remap[constant_nr];
1795 inst->src[i].reg_offset = 0;
1796 }
1797 }
1798
1799 return true;
1800 }
1801
1802 /*
1803 * Implements array access of uniforms by inserting a
1804 * PULL_CONSTANT_LOAD instruction.
1805 *
1806 * Unlike temporary GRF array access (where we don't support it due to
1807 * the difficulty of doing relative addressing on instruction
1808 * destinations), we could potentially do array access of uniforms
1809 * that were loaded in GRF space as push constants. In real-world
1810 * usage we've seen, though, the arrays being used are always larger
1811 * than we could load as push constants, so just always move all
1812 * uniform array access out to a pull constant buffer.
1813 */
1814 void
1815 fs_visitor::move_uniform_array_access_to_pull_constants()
1816 {
1817 int pull_constant_loc[c->prog_data.nr_params];
1818
1819 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1820 pull_constant_loc[i] = -1;
1821 }
1822
1823 /* Walk through and find array access of uniforms. Put a copy of that
1824 * uniform in the pull constant buffer.
1825 *
1826 * Note that we don't move constant-indexed accesses to arrays. No
1827 * testing has been done of the performance impact of this choice.
1828 */
1829 foreach_list_safe(node, &this->instructions) {
1830 fs_inst *inst = (fs_inst *)node;
1831
1832 for (int i = 0 ; i < 3; i++) {
1833 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1834 continue;
1835
1836 int uniform = inst->src[i].reg;
1837
1838 /* If this array isn't already present in the pull constant buffer,
1839 * add it.
1840 */
1841 if (pull_constant_loc[uniform] == -1) {
1842 const float **values = &c->prog_data.param[uniform];
1843
1844 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1845
1846 assert(param_size[uniform]);
1847
1848 for (int j = 0; j < param_size[uniform]; j++) {
1849 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1850 values[j];
1851 }
1852 }
1853
1854 /* Set up the annotation tracking for newly generated instructions. */
1855 base_ir = inst->ir;
1856 current_annotation = inst->annotation;
1857
1858 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1859 fs_reg temp = fs_reg(this, glsl_type::float_type);
1860 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1861 surf_index,
1862 *inst->src[i].reladdr,
1863 pull_constant_loc[uniform] +
1864 inst->src[i].reg_offset);
1865 inst->insert_before(&list);
1866
1867 inst->src[i].file = temp.file;
1868 inst->src[i].reg = temp.reg;
1869 inst->src[i].reg_offset = temp.reg_offset;
1870 inst->src[i].reladdr = NULL;
1871 }
1872 }
1873 }
1874
1875 /**
1876 * Choose accesses from the UNIFORM file to demote to using the pull
1877 * constant buffer.
1878 *
1879 * We allow a fragment shader to use more than the spec-required minimum
1880 * value of the maximum number of fragment shader uniform components (64).
1881 * If there are too many of them, they'd fill up all of the register space.
1882 * So, this will push some of them out to the pull constant buffer and
1883 * update the program to load them.
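 *
 * After this pass, a source that referred to a demoted uniform component
 * (hypothetically, component 130) instead reads a GRF that is filled by an
 * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, with .smear selecting the channel
 * within the loaded vec4.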
1884 */
1885 void
1886 fs_visitor::setup_pull_constants()
1887 {
1888 /* Only allow 16 registers (128 uniform components) as push constants. */
1889 unsigned int max_uniform_components = 16 * 8;
1890 if (c->prog_data.nr_params <= max_uniform_components)
1891 return;
1892
1893 if (dispatch_width == 16) {
1894 fail("Pull constants not supported in SIMD16\n");
1895 return;
1896 }
1897
1898 /* Just demote the end of the list. We could probably do better
1899 * here, demoting things that are rarely used in the program first.
1900 */
1901 unsigned int pull_uniform_base = max_uniform_components;
1902
1903 int pull_constant_loc[c->prog_data.nr_params];
1904 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1905 if (i < pull_uniform_base) {
1906 pull_constant_loc[i] = -1;
1907 } else {
1908 pull_constant_loc[i] = -1;
1909 /* If our constant is already being uploaded for reladdr purposes,
1910 * reuse it.
1911 */
1912 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1913 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1914 pull_constant_loc[i] = j;
1915 break;
1916 }
1917 }
1918 if (pull_constant_loc[i] == -1) {
1919 int pull_index = c->prog_data.nr_pull_params++;
1920 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1921 pull_constant_loc[i] = pull_index;
1922 }
1923 }
1924 }
1925 c->prog_data.nr_params = pull_uniform_base;
1926
1927 foreach_list(node, &this->instructions) {
1928 fs_inst *inst = (fs_inst *)node;
1929
1930 for (int i = 0; i < 3; i++) {
1931 if (inst->src[i].file != UNIFORM)
1932 continue;
1933
1934 int pull_index = pull_constant_loc[inst->src[i].reg +
1935 inst->src[i].reg_offset];
1936 if (pull_index == -1)
1937 continue;
1938
1939 assert(!inst->src[i].reladdr);
1940
1941 fs_reg dst = fs_reg(this, glsl_type::float_type);
1942 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1943 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1944 fs_inst *pull =
1945 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1946 dst, index, offset);
1947 pull->ir = inst->ir;
1948 pull->annotation = inst->annotation;
1949
1950 inst->insert_before(pull);
1951
1952 inst->src[i].file = GRF;
1953 inst->src[i].reg = dst.reg;
1954 inst->src[i].reg_offset = 0;
1955 inst->src[i].smear = pull_index & 3;
1956 }
1957 }
1958 }
1959
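/**
 * Performs simple peephole algebraic simplifications, for example
 * (illustrative IR only):
 *
 *   mul vgrf4:F, vgrf3:F, 1.0f       ->  mov vgrf4:F, vgrf3:F
 *   mul vgrf4:F, vgrf3:F, 0.0f       ->  mov vgrf4:F, 0.0f
 *   add vgrf4:F, vgrf3:F, 0.0f       ->  mov vgrf4:F, vgrf3:F
 *   or  vgrf4:UD, vgrf3:UD, vgrf3:UD ->  mov vgrf4:UD, vgrf3:UD
 */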
1960 bool
1961 fs_visitor::opt_algebraic()
1962 {
1963 bool progress = false;
1964
1965 foreach_list(node, &this->instructions) {
1966 fs_inst *inst = (fs_inst *)node;
1967
1968 switch (inst->opcode) {
1969 case BRW_OPCODE_MUL:
1970 if (inst->src[1].file != IMM)
1971 continue;
1972
1973 /* a * 1.0 = a */
1974 if (inst->src[1].is_one()) {
1975 inst->opcode = BRW_OPCODE_MOV;
1976 inst->src[1] = reg_undef;
1977 progress = true;
1978 break;
1979 }
1980
1981 /* a * 0.0 = 0.0 */
1982 if (inst->src[1].is_zero()) {
1983 inst->opcode = BRW_OPCODE_MOV;
1984 inst->src[0] = inst->src[1];
1985 inst->src[1] = reg_undef;
1986 progress = true;
1987 break;
1988 }
1989
1990 break;
1991 case BRW_OPCODE_ADD:
1992 if (inst->src[1].file != IMM)
1993 continue;
1994
1995 /* a + 0.0 = a */
1996 if (inst->src[1].is_zero()) {
1997 inst->opcode = BRW_OPCODE_MOV;
1998 inst->src[1] = reg_undef;
1999 progress = true;
2000 break;
2001 }
2002 break;
2003 case BRW_OPCODE_OR:
2004 if (inst->src[0].equals(inst->src[1])) {
2005 inst->opcode = BRW_OPCODE_MOV;
2006 inst->src[1] = reg_undef;
2007 progress = true;
2008 break;
2009 }
2010 break;
2011 case BRW_OPCODE_LRP:
2012 if (inst->src[1].equals(inst->src[2])) {
2013 inst->opcode = BRW_OPCODE_MOV;
2014 inst->src[0] = inst->src[1];
2015 inst->src[1] = reg_undef;
2016 inst->src[2] = reg_undef;
2017 progress = true;
2018 break;
2019 }
2020 break;
2021 case BRW_OPCODE_SEL:
2022 if (inst->saturate && inst->src[1].file == IMM) {
2023 switch (inst->conditional_mod) {
2024 case BRW_CONDITIONAL_LE:
2025 case BRW_CONDITIONAL_L:
2026 switch (inst->src[1].type) {
2027 case BRW_REGISTER_TYPE_F:
2028 if (inst->src[1].imm.f >= 1.0f) {
2029 inst->opcode = BRW_OPCODE_MOV;
2030 inst->src[1] = reg_undef;
2031 progress = true;
2032 }
2033 break;
2034 default:
2035 break;
2036 }
2037 break;
2038 case BRW_CONDITIONAL_GE:
2039 case BRW_CONDITIONAL_G:
2040 switch (inst->src[1].type) {
2041 case BRW_REGISTER_TYPE_F:
2042 if (inst->src[1].imm.f <= 0.0f) {
2043 inst->opcode = BRW_OPCODE_MOV;
2044 inst->src[1] = reg_undef;
2045 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2046 progress = true;
2047 }
2048 break;
2049 default:
2050 break;
2051 }
2052 default:
2053 break;
2054 }
2055 }
2056 break;
2057 default:
2058 break;
2059 }
2060 }
2061
2062 return progress;
2063 }
2064
2065 /**
2066 * Removes any instructions writing a VGRF where that VGRF is not used by any
2067 * later instruction.
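 *
 * E.g. (hypothetical IR), a MOV into vgrf7 whose live interval ends at the
 * MOV itself is removed; ADDC, SUBB and MACH instead get a null destination
 * since they also write the accumulator as a side effect.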
2068 */
2069 bool
2070 fs_visitor::dead_code_eliminate()
2071 {
2072 bool progress = false;
2073 int pc = 0;
2074
2075 calculate_live_intervals();
2076
2077 foreach_list_safe(node, &this->instructions) {
2078 fs_inst *inst = (fs_inst *)node;
2079
2080 if (inst->dst.file == GRF && !inst->has_side_effects()) {
2081 bool dead = true;
2082
2083 for (int i = 0; i < inst->regs_written; i++) {
2084 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2085 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2086 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2087 dead = false;
2088 break;
2089 }
2090 }
2091
2092 if (dead) {
2093 /* Don't dead code eliminate instructions that write to the
2094 * accumulator as a side-effect. Instead just set the destination
2095 * to the null register to free it.
2096 */
2097 switch (inst->opcode) {
2098 case BRW_OPCODE_ADDC:
2099 case BRW_OPCODE_SUBB:
2100 case BRW_OPCODE_MACH:
2101 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2102 break;
2103 default:
2104 inst->remove();
2105 progress = true;
2106 break;
2107 }
2108 }
2109 }
2110
2111 pc++;
2112 }
2113
2114 if (progress)
2115 invalidate_live_intervals();
2116
2117 return progress;
2118 }
2119
2120 struct dead_code_hash_key
2121 {
2122 int vgrf;
2123 int reg_offset;
2124 };
2125
2126 static bool
2127 dead_code_hash_compare(const void *a, const void *b)
2128 {
2129 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2130 }
2131
2132 static void
2133 clear_dead_code_hash(struct hash_table *ht)
2134 {
2135 struct hash_entry *entry;
2136
2137 hash_table_foreach(ht, entry) {
2138 _mesa_hash_table_remove(ht, entry);
2139 }
2140 }
2141
2142 static void
2143 insert_dead_code_hash(struct hash_table *ht,
2144 int vgrf, int reg_offset, fs_inst *inst)
2145 {
2146 /* We don't bother freeing keys, because they'll be GCed with the ht. */
2147 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2148
2149 key->vgrf = vgrf;
2150 key->reg_offset = reg_offset;
2151
2152 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2153 }
2154
2155 static struct hash_entry *
2156 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2157 {
2158 struct dead_code_hash_key key;
2159
2160 key.vgrf = vgrf;
2161 key.reg_offset = reg_offset;
2162
2163 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2164 }
2165
2166 static void
2167 remove_dead_code_hash(struct hash_table *ht,
2168 int vgrf, int reg_offset)
2169 {
2170 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2171 if (!entry)
2172 return;
2173
2174 _mesa_hash_table_remove(ht, entry);
2175 }
2176
2177 /**
2178 * Walks basic blocks, removing any regs that are written but not read before
2179 * being redefined.
2180 *
2181 * The dead_code_eliminate() function implements a global dead code
2182 * elimination, but it only handles removing the last write to a register
2183 * if it's never read. This one can handle intermediate writes, but only
2184 * within a basic block.
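 *
 * For instance (hypothetical IR), in
 *
 *   mov vgrf4:F, vgrf1:F
 *   mov vgrf4:F, vgrf2:F
 *   add vgrf5:F, vgrf4:F, vgrf3:F
 *
 * the first MOV is never read before vgrf4 is completely rewritten within
 * the same block, so it can be removed.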
2185 */
2186 bool
2187 fs_visitor::dead_code_eliminate_local()
2188 {
2189 struct hash_table *ht;
2190 bool progress = false;
2191
2192 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2193
2194 if (ht == NULL) {
2195 return false;
2196 }
2197
2198 foreach_list_safe(node, &this->instructions) {
2199 fs_inst *inst = (fs_inst *)node;
2200
2201 /* At a basic block boundary, empty the HT since we don't understand
2202 * dataflow across control flow.
2203 */
2204 if (inst->is_control_flow()) {
2205 clear_dead_code_hash(ht);
2206 continue;
2207 }
2208
2209 /* Clear the HT of any instructions that got read. */
2210 for (int i = 0; i < 3; i++) {
2211 fs_reg src = inst->src[i];
2212 if (src.file != GRF)
2213 continue;
2214
2215 int read = 1;
2216 if (inst->is_send_from_grf())
2217 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2218
2219 for (int reg_offset = src.reg_offset;
2220 reg_offset < src.reg_offset + read;
2221 reg_offset++) {
2222 remove_dead_code_hash(ht, src.reg, reg_offset);
2223 }
2224 }
2225
2226 /* Add any update of a GRF to the HT, removing a previous write if it
2227 * wasn't read.
2228 */
2229 if (inst->dst.file == GRF) {
2230 if (inst->regs_written > 1) {
2231 /* We don't know how to trim channels from an instruction's
2232 * writes, so we can't incrementally remove unread channels from
2233 * it. Just remove whatever it overwrites from the table
2234 */
2235 for (int i = 0; i < inst->regs_written; i++) {
2236 remove_dead_code_hash(ht,
2237 inst->dst.reg,
2238 inst->dst.reg_offset + i);
2239 }
2240 } else {
2241 struct hash_entry *entry =
2242 get_dead_code_hash_entry(ht, inst->dst.reg,
2243 inst->dst.reg_offset);
2244
2245 if (entry) {
2246 if (inst->is_partial_write()) {
2247 /* For a partial write, we can't remove any previous dead code
2248 * candidate, since we're just modifying its result.
2249 */
2250 } else {
2251 /* We're completely updating a channel, and there was a
2252 * previous write to the channel that wasn't read. Kill it!
2253 */
2254 fs_inst *inst = (fs_inst *)entry->data;
2255 inst->remove();
2256 progress = true;
2257 }
2258
2259 _mesa_hash_table_remove(ht, entry);
2260 }
2261
2262 if (!inst->has_side_effects())
2263 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2264 inst);
2265 }
2266 }
2267 }
2268
2269 _mesa_hash_table_destroy(ht, NULL);
2270
2271 if (progress)
2272 invalidate_live_intervals();
2273
2274 return progress;
2275 }
2276
2277 /**
2278 * Implements register coalescing: Checks if the two registers involved in a
2279 * raw move don't interfere, in which case they can both be stored in the same
2280 * place and the MOV removed.
2281 *
2282 * To do this, all uses of the source of the MOV in the shader are replaced
2283 * with the destination of the MOV. For example:
2284 *
2285 * add vgrf3:F, vgrf1:F, vgrf2:F
2286 * mov vgrf4:F, vgrf3:F
2287 * mul vgrf5:F, vgrf5:F, vgrf4:F
2288 *
2289 * becomes
2290 *
2291 * add vgrf4:F, vgrf1:F, vgrf2:F
2292 * mul vgrf5:F, vgrf5:F, vgrf4:F
2293 */
2294 bool
2295 fs_visitor::register_coalesce()
2296 {
2297 bool progress = false;
2298
2299 calculate_live_intervals();
2300
2301 int src_size = 0;
2302 int channels_remaining = 0;
2303 int reg_from = -1, reg_to = -1;
2304 int reg_to_offset[MAX_SAMPLER_MESSAGE_SIZE];
2305 fs_inst *mov[MAX_SAMPLER_MESSAGE_SIZE];
2306
2307 foreach_list(node, &this->instructions) {
2308 fs_inst *inst = (fs_inst *)node;
2309
2310 if (inst->opcode != BRW_OPCODE_MOV ||
2311 inst->is_partial_write() ||
2312 inst->saturate ||
2313 inst->src[0].file != GRF ||
2314 inst->src[0].negate ||
2315 inst->src[0].abs ||
2316 inst->src[0].smear != -1 ||
2317 inst->dst.file != GRF ||
2318 inst->dst.type != inst->src[0].type) {
2319 continue;
2320 }
2321
2322 if (virtual_grf_sizes[inst->src[0].reg] >
2323 virtual_grf_sizes[inst->dst.reg])
2324 continue;
2325
2326 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2327 int var_to = live_intervals->var_from_reg(&inst->dst);
2328
2329 if (live_intervals->vars_interfere(var_from, var_to) &&
2330 !inst->dst.equals(inst->src[0])) {
2331
2332 /* We know that the live ranges of A (var_from) and B (var_to)
2333 * interfere because of the ->vars_interfere() call above. If the end
2334 * of B's live range is after the end of A's range, then we know two
2335 * things:
2336 * - the start of B's live range must be in A's live range (since we
2337 * already know the two ranges interfere, this is the only remaining
2338 * possibility)
2339 * - the interference isn't of the form we're looking for (where B is
2340 * entirely inside A)
2341 */
2342 if (live_intervals->end[var_to] > live_intervals->end[var_from])
2343 continue;
2344
2345 bool overwritten = false;
2346 int scan_ip = -1;
2347
2348 foreach_list(n, &this->instructions) {
2349 fs_inst *scan_inst = (fs_inst *)n;
2350 scan_ip++;
2351
2352 if (scan_inst->is_control_flow()) {
2353 overwritten = true;
2354 break;
2355 }
2356
2357 if (scan_ip <= live_intervals->start[var_to])
2358 continue;
2359
2360 if (scan_ip > live_intervals->end[var_to])
2361 break;
2362
2363 if (scan_inst->dst.equals(inst->dst) ||
2364 scan_inst->dst.equals(inst->src[0])) {
2365 overwritten = true;
2366 break;
2367 }
2368 }
2369
2370 if (overwritten)
2371 continue;
2372 }
2373
2374 if (reg_from != inst->src[0].reg) {
2375 reg_from = inst->src[0].reg;
2376
2377 src_size = virtual_grf_sizes[inst->src[0].reg];
2378 assert(src_size <= MAX_SAMPLER_MESSAGE_SIZE);
2379
2380 channels_remaining = src_size;
2381 memset(mov, 0, sizeof(mov));
2382
2383 reg_to = inst->dst.reg;
2384 }
2385
2386 if (reg_to != inst->dst.reg)
2387 continue;
2388
2389 const int offset = inst->src[0].reg_offset;
2390 reg_to_offset[offset] = inst->dst.reg_offset;
2391 mov[offset] = inst;
2392 channels_remaining--;
2393
2394 if (channels_remaining)
2395 continue;
2396
2397 bool removed = false;
2398 for (int i = 0; i < src_size; i++) {
2399 if (mov[i]) {
2400 removed = true;
2401
2402 mov[i]->opcode = BRW_OPCODE_NOP;
2403 mov[i]->conditional_mod = BRW_CONDITIONAL_NONE;
2404 mov[i]->dst = reg_undef;
2405 mov[i]->src[0] = reg_undef;
2406 mov[i]->src[1] = reg_undef;
2407 mov[i]->src[2] = reg_undef;
2408 }
2409 }
2410
2411 foreach_list(node, &this->instructions) {
2412 fs_inst *scan_inst = (fs_inst *)node;
2413
2414 for (int i = 0; i < src_size; i++) {
2415 if (mov[i]) {
2416 if (scan_inst->dst.file == GRF &&
2417 scan_inst->dst.reg == reg_from &&
2418 scan_inst->dst.reg_offset == i) {
2419 scan_inst->dst.reg = reg_to;
2420 scan_inst->dst.reg_offset = reg_to_offset[i];
2421 }
2422 for (int j = 0; j < 3; j++) {
2423 if (scan_inst->src[j].file == GRF &&
2424 scan_inst->src[j].reg == reg_from &&
2425 scan_inst->src[j].reg_offset == i) {
2426 scan_inst->src[j].reg = reg_to;
2427 scan_inst->src[j].reg_offset = reg_to_offset[i];
2428 }
2429 }
2430 }
2431 }
2432 }
2433
2434 if (removed) {
2435 live_intervals->start[var_to] = MIN2(live_intervals->start[var_to],
2436 live_intervals->start[var_from]);
2437 live_intervals->end[var_to] = MAX2(live_intervals->end[var_to],
2438 live_intervals->end[var_from]);
2439 reg_from = -1;
2440 }
2441 }
2442
2443 foreach_list_safe(node, &this->instructions) {
2444 fs_inst *inst = (fs_inst *)node;
2445
2446 if (inst->opcode == BRW_OPCODE_NOP) {
2447 inst->remove();
2448 progress = true;
2449 }
2450 }
2451
2452 if (progress)
2453 invalidate_live_intervals();
2454
2455 return progress;
2456 }
2457
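/**
 * Tries to fold a GRF-to-MRF MOV back into the instruction that computed
 * the GRF, e.g. (illustrative IR):
 *
 *   add vgrf4:F, vgrf2:F, vgrf3:F
 *   mov m2:F, vgrf4:F
 *
 * becomes
 *
 *   add m2:F, vgrf2:F, vgrf3:F
 *
 * when vgrf4 has no later readers and no other instruction interferes.
 */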
2458 bool
2459 fs_visitor::compute_to_mrf()
2460 {
2461 bool progress = false;
2462 int next_ip = 0;
2463
2464 calculate_live_intervals();
2465
2466 foreach_list_safe(node, &this->instructions) {
2467 fs_inst *inst = (fs_inst *)node;
2468
2469 int ip = next_ip;
2470 next_ip++;
2471
2472 if (inst->opcode != BRW_OPCODE_MOV ||
2473 inst->is_partial_write() ||
2474 inst->dst.file != MRF || inst->src[0].file != GRF ||
2475 inst->dst.type != inst->src[0].type ||
2476 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2477 continue;
2478
2479 /* Work out which hardware MRF registers are written by this
2480 * instruction.
2481 */
2482 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2483 int mrf_high;
2484 if (inst->dst.reg & BRW_MRF_COMPR4) {
2485 mrf_high = mrf_low + 4;
2486 } else if (dispatch_width == 16 &&
2487 (!inst->force_uncompressed && !inst->force_sechalf)) {
2488 mrf_high = mrf_low + 1;
2489 } else {
2490 mrf_high = mrf_low;
2491 }
2492
2493 /* Can't compute-to-MRF this GRF if someone else was going to
2494 * read it later.
2495 */
2496 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2497 continue;
2498
2499 /* Found a move of a GRF to a MRF. Let's see if we can go
2500 * rewrite the thing that made this GRF to write into the MRF.
2501 */
2502 fs_inst *scan_inst;
2503 for (scan_inst = (fs_inst *)inst->prev;
2504 scan_inst->prev != NULL;
2505 scan_inst = (fs_inst *)scan_inst->prev) {
2506 if (scan_inst->dst.file == GRF &&
2507 scan_inst->dst.reg == inst->src[0].reg) {
2508 /* Found the last thing to write our reg we want to turn
2509 * into a compute-to-MRF.
2510 */
2511
2512 /* If this one instruction didn't populate all the
2513 * channels, bail. We might be able to rewrite everything
2514 * that writes that reg, but it would require smarter
2515 * tracking to delay the rewriting until complete success.
2516 */
2517 if (scan_inst->is_partial_write())
2518 break;
2519
2520 /* Things returning more than one register would need us to
2521 * understand coalescing out more than one MOV at a time.
2522 */
2523 if (scan_inst->regs_written > 1)
2524 break;
2525
2526 /* SEND instructions can't have MRF as a destination. */
2527 if (scan_inst->mlen)
2528 break;
2529
2530 if (brw->gen == 6) {
2531 /* gen6 math instructions must have the destination be
2532 * GRF, so no compute-to-MRF for them.
2533 */
2534 if (scan_inst->is_math()) {
2535 break;
2536 }
2537 }
2538
2539 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2540 /* Found the creator of our MRF's source value. */
2541 scan_inst->dst.file = MRF;
2542 scan_inst->dst.reg = inst->dst.reg;
2543 scan_inst->saturate |= inst->saturate;
2544 inst->remove();
2545 progress = true;
2546 }
2547 break;
2548 }
2549
2550 /* We don't handle control flow here. Most computation of
2551 * values that end up in MRFs happens shortly before the MRF
2552 * write anyway.
2553 */
2554 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2555 break;
2556
2557 /* You can't read from an MRF, so if someone else reads our
2558 * MRF's source GRF that we wanted to rewrite, that stops us.
2559 */
2560 bool interfered = false;
2561 for (int i = 0; i < 3; i++) {
2562 if (scan_inst->src[i].file == GRF &&
2563 scan_inst->src[i].reg == inst->src[0].reg &&
2564 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2565 interfered = true;
2566 }
2567 }
2568 if (interfered)
2569 break;
2570
2571 if (scan_inst->dst.file == MRF) {
2572 /* If somebody else writes our MRF here, we can't
2573 * compute-to-MRF before that.
2574 */
2575 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2576 int scan_mrf_high;
2577
2578 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2579 scan_mrf_high = scan_mrf_low + 4;
2580 } else if (dispatch_width == 16 &&
2581 (!scan_inst->force_uncompressed &&
2582 !scan_inst->force_sechalf)) {
2583 scan_mrf_high = scan_mrf_low + 1;
2584 } else {
2585 scan_mrf_high = scan_mrf_low;
2586 }
2587
2588 if (mrf_low == scan_mrf_low ||
2589 mrf_low == scan_mrf_high ||
2590 mrf_high == scan_mrf_low ||
2591 mrf_high == scan_mrf_high) {
2592 break;
2593 }
2594 }
2595
2596 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2597 /* Found a SEND instruction, which means that there are
2598 * live values in MRFs from base_mrf to base_mrf +
2599 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2600 * above it.
2601 */
2602 if (mrf_low >= scan_inst->base_mrf &&
2603 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2604 break;
2605 }
2606 if (mrf_high >= scan_inst->base_mrf &&
2607 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2608 break;
2609 }
2610 }
2611 }
2612 }
2613
2614 if (progress)
2615 invalidate_live_intervals();
2616
2617 return progress;
2618 }
2619
2620 /**
2621 * Walks through basic blocks, looking for repeated MRF writes and
2622 * removing the later ones.
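 *
 * For example (hypothetical IR), if two identical
 *
 *   mov m2:F, vgrf3:F
 *
 * instructions appear in a block with no intervening write to m2 or vgrf3,
 * the second one is removed.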
2623 */
2624 bool
2625 fs_visitor::remove_duplicate_mrf_writes()
2626 {
2627 fs_inst *last_mrf_move[16];
2628 bool progress = false;
2629
2630 /* Need to update the MRF tracking for compressed instructions. */
2631 if (dispatch_width == 16)
2632 return false;
2633
2634 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2635
2636 foreach_list_safe(node, &this->instructions) {
2637 fs_inst *inst = (fs_inst *)node;
2638
2639 if (inst->is_control_flow()) {
2640 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2641 }
2642
2643 if (inst->opcode == BRW_OPCODE_MOV &&
2644 inst->dst.file == MRF) {
2645 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2646 if (prev_inst && inst->equals(prev_inst)) {
2647 inst->remove();
2648 progress = true;
2649 continue;
2650 }
2651 }
2652
2653 /* Clear out the last-write records for MRFs that were overwritten. */
2654 if (inst->dst.file == MRF) {
2655 last_mrf_move[inst->dst.reg] = NULL;
2656 }
2657
2658 if (inst->mlen > 0 && inst->base_mrf != -1) {
2659 /* Found a SEND instruction, which will include two or fewer
2660 * implied MRF writes. We could do better here.
2661 */
2662 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2663 last_mrf_move[inst->base_mrf + i] = NULL;
2664 }
2665 }
2666
2667 /* Clear out any MRF move records whose sources got overwritten. */
2668 if (inst->dst.file == GRF) {
2669 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2670 if (last_mrf_move[i] &&
2671 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2672 last_mrf_move[i] = NULL;
2673 }
2674 }
2675 }
2676
2677 if (inst->opcode == BRW_OPCODE_MOV &&
2678 inst->dst.file == MRF &&
2679 inst->src[0].file == GRF &&
2680 !inst->is_partial_write()) {
2681 last_mrf_move[inst->dst.reg] = inst;
2682 }
2683 }
2684
2685 if (progress)
2686 invalidate_live_intervals();
2687
2688 return progress;
2689 }
2690
2691 static void
2692 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2693 int first_grf, int grf_len)
2694 {
2695 bool inst_simd16 = (dispatch_width > 8 &&
2696 !inst->force_uncompressed &&
2697 !inst->force_sechalf);
2698
2699 /* Clear the flag for registers that actually got read (as expected). */
2700 for (int i = 0; i < 3; i++) {
2701 int grf;
2702 if (inst->src[i].file == GRF) {
2703 grf = inst->src[i].reg;
2704 } else if (inst->src[i].file == HW_REG &&
2705 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2706 grf = inst->src[i].fixed_hw_reg.nr;
2707 } else {
2708 continue;
2709 }
2710
2711 if (grf >= first_grf &&
2712 grf < first_grf + grf_len) {
2713 deps[grf - first_grf] = false;
2714 if (inst_simd16)
2715 deps[grf - first_grf + 1] = false;
2716 }
2717 }
2718 }
2719
2720 /**
2721 * Implements this workaround for the original 965:
2722 *
2723 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2724 * check for post destination dependencies on this instruction, software
2725 * must ensure that there is no destination hazard for the case of ‘write
2726 * followed by a posted write’ shown in the following example.
2727 *
2728 * 1. mov r3 0
2729 * 2. send r3.xy <rest of send instruction>
2730 * 3. mov r2 r3
2731 *
2732 * Due to no post-destination dependency check on the ‘send’, the above
2733 * code sequence could have two instructions (1 and 2) in flight at the
2734 * same time that both consider ‘r3’ as the target of their final writes.
2735 */
2736 void
2737 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2738 {
2739 int reg_size = dispatch_width / 8;
2740 int write_len = inst->regs_written * reg_size;
2741 int first_write_grf = inst->dst.reg;
2742 bool needs_dep[BRW_MAX_MRF];
2743 assert(write_len < (int)sizeof(needs_dep) - 1);
2744
2745 memset(needs_dep, false, sizeof(needs_dep));
2746 memset(needs_dep, true, write_len);
2747
2748 clear_deps_for_inst_src(inst, dispatch_width,
2749 needs_dep, first_write_grf, write_len);
2750
2751 /* Walk backwards looking for writes to registers we're writing which
2752 * aren't read since being written. If we hit the start of the program,
2753 * we assume that there are no outstanding dependencies on entry to the
2754 * program.
2755 */
2756 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2757 scan_inst != NULL;
2758 scan_inst = (fs_inst *)scan_inst->prev) {
2759
2760 /* If we hit control flow, assume that there *are* outstanding
2761 * dependencies, and force their cleanup before our instruction.
2762 */
2763 if (scan_inst->is_control_flow()) {
2764 for (int i = 0; i < write_len; i++) {
2765 if (needs_dep[i]) {
2766 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2767 }
2768 }
2769 return;
2770 }
2771
2772 bool scan_inst_simd16 = (dispatch_width > 8 &&
2773 !scan_inst->force_uncompressed &&
2774 !scan_inst->force_sechalf);
2775
2776 /* We insert our reads as late as possible on the assumption that any
2777 * instruction but a MOV that might have left us an outstanding
2778 * dependency has more latency than a MOV.
2779 */
2780 if (scan_inst->dst.file == GRF) {
2781 for (int i = 0; i < scan_inst->regs_written; i++) {
2782 int reg = scan_inst->dst.reg + i * reg_size;
2783
2784 if (reg >= first_write_grf &&
2785 reg < first_write_grf + write_len &&
2786 needs_dep[reg - first_write_grf]) {
2787 inst->insert_before(DEP_RESOLVE_MOV(reg));
2788 needs_dep[reg - first_write_grf] = false;
2789 if (scan_inst_simd16)
2790 needs_dep[reg - first_write_grf + 1] = false;
2791 }
2792 }
2793 }
2794
2795 /* Clear the flag for registers that actually got read (as expected). */
2796 clear_deps_for_inst_src(scan_inst, dispatch_width,
2797 needs_dep, first_write_grf, write_len);
2798
2799 /* Continue the loop only if we haven't resolved all the dependencies */
2800 int i;
2801 for (i = 0; i < write_len; i++) {
2802 if (needs_dep[i])
2803 break;
2804 }
2805 if (i == write_len)
2806 return;
2807 }
2808 }
2809
2810 /**
2811 * Implements this workaround for the original 965:
2812 *
2813 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2814 * used as a destination register until after it has been sourced by an
2815 * instruction with a different destination register.
2816 */
2817 void
2818 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2819 {
2820 int write_len = inst->regs_written * dispatch_width / 8;
2821 int first_write_grf = inst->dst.reg;
2822 bool needs_dep[BRW_MAX_MRF];
2823 assert(write_len < (int)sizeof(needs_dep) - 1);
2824
2825 memset(needs_dep, false, sizeof(needs_dep));
2826 memset(needs_dep, true, write_len);
2827 /* Walk forwards looking for writes to registers we're writing which aren't
2828 * read before being written.
2829 */
2830 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2831 !scan_inst->is_tail_sentinel();
2832 scan_inst = (fs_inst *)scan_inst->next) {
2833 /* If we hit control flow, force resolve all remaining dependencies. */
2834 if (scan_inst->is_control_flow()) {
2835 for (int i = 0; i < write_len; i++) {
2836 if (needs_dep[i])
2837 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2838 }
2839 return;
2840 }
2841
2842 /* Clear the flag for registers that actually got read (as expected). */
2843 clear_deps_for_inst_src(scan_inst, dispatch_width,
2844 needs_dep, first_write_grf, write_len);
2845
2846 /* We insert our reads as late as possible since they're reading the
2847 * result of a SEND, which has massive latency.
2848 */
2849 if (scan_inst->dst.file == GRF &&
2850 scan_inst->dst.reg >= first_write_grf &&
2851 scan_inst->dst.reg < first_write_grf + write_len &&
2852 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2853 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2854 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2855 }
2856
2857 /* Continue the loop only if we haven't resolved all the dependencies */
2858 int i;
2859 for (i = 0; i < write_len; i++) {
2860 if (needs_dep[i])
2861 break;
2862 }
2863 if (i == write_len)
2864 return;
2865 }
2866
2867 /* If we hit the end of the program, resolve all remaining dependencies out
2868 * of paranoia.
2869 */
2870 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2871 assert(last_inst->eot);
2872 for (int i = 0; i < write_len; i++) {
2873 if (needs_dep[i])
2874 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2875 }
2876 }
2877
2878 void
2879 fs_visitor::insert_gen4_send_dependency_workarounds()
2880 {
2881 if (brw->gen != 4 || brw->is_g4x)
2882 return;
2883
2884 /* Note that we're done with register allocation, so GRF fs_regs always
2885 * have a .reg_offset of 0.
2886 */
2887
2888 foreach_list_safe(node, &this->instructions) {
2889 fs_inst *inst = (fs_inst *)node;
2890
2891 if (inst->mlen != 0 && inst->dst.file == GRF) {
2892 insert_gen4_pre_send_dependency_workarounds(inst);
2893 insert_gen4_post_send_dependency_workarounds(inst);
2894 }
2895 }
2896 }
2897
2898 /**
2899 * Turns the generic expression-style uniform pull constant load instruction
2900 * into a hardware-specific series of instructions for loading a pull
2901 * constant.
2902 *
2903 * The expression style allows the CSE pass before this to optimize out
2904 * repeated loads from the same offset, and gives the pre-register-allocation
2905 * scheduling full flexibility, while the conversion to native instructions
2906 * allows the post-register-allocation scheduler the best information
2907 * possible.
2908 *
2909 * Note that execution masking for setting up pull constant loads is special:
2910 * the channels that need to be written are unrelated to the current execution
2911 * mask, since a later instruction will use one of the result channels as a
2912 * source operand for all 8 or 16 of its channels.
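 *
 * On Gen7+, for instance, the generic load is lowered (illustratively) to a
 * SET_SIMD4X2_OFFSET that writes the dword offset into a payload register,
 * followed by a UNIFORM_PULL_CONSTANT_LOAD_GEN7 that takes that payload as
 * its second source; on older generations the existing instruction is
 * simply assigned an MRF for its message payload.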
2913 */
2914 void
2915 fs_visitor::lower_uniform_pull_constant_loads()
2916 {
2917 foreach_list(node, &this->instructions) {
2918 fs_inst *inst = (fs_inst *)node;
2919
2920 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2921 continue;
2922
2923 if (brw->gen >= 7) {
2924 /* The offset arg before was a vec4-aligned byte offset. We need to
2925 * turn it into a dword offset.
2926 */
2927 fs_reg const_offset_reg = inst->src[1];
2928 assert(const_offset_reg.file == IMM &&
2929 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2930 const_offset_reg.imm.u /= 4;
2931 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2932
2933 /* This is actually going to be a MOV, but since only the first dword
2934 * is accessed, we have a special opcode to do just that one. Note
2935 * that this needs to be an operation that will be considered a def
2936 * by live variable analysis, or register allocation will explode.
2937 */
2938 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2939 payload, const_offset_reg);
2940 setup->force_writemask_all = true;
2941
2942 setup->ir = inst->ir;
2943 setup->annotation = inst->annotation;
2944 inst->insert_before(setup);
2945
2946 /* Similarly, this will only populate the first 4 channels of the
2947 * result register (since we only use smear values from 0-3), but we
2948 * don't tell the optimizer.
2949 */
2950 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2951 inst->src[1] = payload;
2952
2953 invalidate_live_intervals();
2954 } else {
2955 /* Before register allocation, we didn't tell the scheduler about the
2956 * MRF we use. We know it's safe to use this MRF because nothing
2957 * else does except for register spill/unspill, which generates and
2958 * uses its MRF within a single IR instruction.
2959 */
2960 inst->base_mrf = 14;
2961 inst->mlen = 1;
2962 }
2963 }
2964 }
2965
2966 void
2967 fs_visitor::dump_instructions()
2968 {
2969 calculate_register_pressure();
2970
2971 int ip = 0, max_pressure = 0;
2972 foreach_list(node, &this->instructions) {
2973 backend_instruction *inst = (backend_instruction *)node;
2974 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2975 printf("{%3d} %4d: ", regs_live_at_ip[ip], ip);
2976 dump_instruction(inst);
2977 ++ip;
2978 }
2979 printf("Maximum %3d registers live at once.\n", max_pressure);
2980 }
2981
2982 void
2983 fs_visitor::dump_instruction(backend_instruction *be_inst)
2984 {
2985 fs_inst *inst = (fs_inst *)be_inst;
2986
2987 if (inst->predicate) {
2988 printf("(%cf0.%d) ",
2989 inst->predicate_inverse ? '-' : '+',
2990 inst->flag_subreg);
2991 }
2992
2993 printf("%s", brw_instruction_name(inst->opcode));
2994 if (inst->saturate)
2995 printf(".sat");
2996 if (inst->conditional_mod) {
2997 printf("%s", conditional_modifier[inst->conditional_mod]);
2998 if (!inst->predicate &&
2999 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3000 inst->opcode != BRW_OPCODE_IF &&
3001 inst->opcode != BRW_OPCODE_WHILE))) {
3002 printf(".f0.%d", inst->flag_subreg);
3003 }
3004 }
3005 printf(" ");
3006
3007
3008 switch (inst->dst.file) {
3009 case GRF:
3010 printf("vgrf%d", inst->dst.reg);
3011 if (virtual_grf_sizes[inst->dst.reg] != 1)
3012 printf("+%d", inst->dst.reg_offset);
3013 break;
3014 case MRF:
3015 printf("m%d", inst->dst.reg);
3016 break;
3017 case BAD_FILE:
3018 printf("(null)");
3019 break;
3020 case UNIFORM:
3021 printf("***u%d***", inst->dst.reg);
3022 break;
3023 case HW_REG:
3024 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3025 switch (inst->dst.fixed_hw_reg.nr) {
3026 case BRW_ARF_NULL:
3027 printf("null");
3028 break;
3029 case BRW_ARF_ADDRESS:
3030 printf("a0.%d", inst->dst.fixed_hw_reg.subnr);
3031 break;
3032 case BRW_ARF_ACCUMULATOR:
3033 printf("acc%d", inst->dst.fixed_hw_reg.subnr);
3034 break;
3035 case BRW_ARF_FLAG:
3036 printf("f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3037 inst->dst.fixed_hw_reg.subnr);
3038 break;
3039 default:
3040 printf("arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3041 inst->dst.fixed_hw_reg.subnr);
3042 break;
3043 }
3044 } else {
3045 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
3046 }
3047 if (inst->dst.fixed_hw_reg.subnr)
3048 printf("+%d", inst->dst.fixed_hw_reg.subnr);
3049 break;
3050 default:
3051 printf("???");
3052 break;
3053 }
3054 printf(":%s, ", reg_encoding[inst->dst.type]);
3055
3056 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
3057 if (inst->src[i].negate)
3058 printf("-");
3059 if (inst->src[i].abs)
3060 printf("|");
3061 switch (inst->src[i].file) {
3062 case GRF:
3063 printf("vgrf%d", inst->src[i].reg);
3064 if (virtual_grf_sizes[inst->src[i].reg] != 1)
3065 printf("+%d", inst->src[i].reg_offset);
3066 break;
3067 case MRF:
3068 printf("***m%d***", inst->src[i].reg);
3069 break;
3070 case UNIFORM:
3071 printf("u%d", inst->src[i].reg);
3072 if (virtual_grf_sizes[inst->src[i].reg] != 1)
3073 printf(".%d", inst->src[i].reg_offset);
3074 break;
3075 case BAD_FILE:
3076 printf("(null)");
3077 break;
3078 case IMM:
3079 switch (inst->src[i].type) {
3080 case BRW_REGISTER_TYPE_F:
3081 printf("%ff", inst->src[i].imm.f);
3082 break;
3083 case BRW_REGISTER_TYPE_D:
3084 printf("%dd", inst->src[i].imm.i);
3085 break;
3086 case BRW_REGISTER_TYPE_UD:
3087 printf("%uu", inst->src[i].imm.u);
3088 break;
3089 default:
3090 printf("???");
3091 break;
3092 }
3093 break;
3094 case HW_REG:
3095 if (inst->src[i].fixed_hw_reg.negate)
3096 printf("-");
3097 if (inst->src[i].fixed_hw_reg.abs)
3098 printf("|");
3099 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3100 switch (inst->src[i].fixed_hw_reg.nr) {
3101 case BRW_ARF_NULL:
3102 printf("null");
3103 break;
3104 case BRW_ARF_ADDRESS:
3105 printf("a0.%d", inst->src[i].fixed_hw_reg.subnr);
3106 break;
3107 case BRW_ARF_ACCUMULATOR:
3108 printf("acc%d", inst->src[i].fixed_hw_reg.subnr);
3109 break;
3110 case BRW_ARF_FLAG:
3111 printf("f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3112 inst->src[i].fixed_hw_reg.subnr);
3113 break;
3114 default:
3115 printf("arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3116 inst->src[i].fixed_hw_reg.subnr);
3117 break;
3118 }
3119 } else {
3120 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3121 }
3122 if (inst->src[i].fixed_hw_reg.subnr)
3123 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
3124 if (inst->src[i].fixed_hw_reg.abs)
3125 printf("|");
3126 break;
3127 default:
3128 printf("???");
3129 break;
3130 }
3131 if (inst->src[i].abs)
3132 printf("|");
3133
3134 if (inst->src[i].file != IMM) {
3135 printf(":%s", brw_reg_type_letters(inst->src[i].type));
3136 }
3137
3138 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
3139 printf(", ");
3140 }
3141
3142 printf(" ");
3143
3144 if (inst->force_uncompressed)
3145 printf("1sthalf ");
3146
3147 if (inst->force_sechalf)
3148 printf("2ndhalf ");
3149
3150 printf("\n");
3151 }
3152
3153 /**
3154 * Possibly returns an instruction that set up @param reg.
3155 *
3156 * Sometimes we want to take the result of some expression/variable
3157 * dereference tree and rewrite the instruction generating the result
3158 * of the tree. When processing the tree, we know that the
3159 * instructions generated are all writing temporaries that are dead
3160 * outside of this tree. So, if we have some instructions that write
3161 * a temporary, we're free to point that temp write somewhere else.
3162 *
3163 * Note that this doesn't guarantee that the returned instruction wrote
3164 * only reg -- it might be the size=4 destination of a texture instruction.
3165 */
3166 fs_inst *
3167 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3168 fs_inst *end,
3169 fs_reg reg)
3170 {
3171 if (end == start ||
3172 end->is_partial_write() ||
3173 reg.reladdr ||
3174 !reg.equals(end->dst)) {
3175 return NULL;
3176 } else {
3177 return end;
3178 }
3179 }
3180
3181 void
3182 fs_visitor::setup_payload_gen6()
3183 {
3184 bool uses_depth =
3185 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3186 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3187
3188 assert(brw->gen >= 6);
3189
3190 /* R0-1: masks, pixel X/Y coordinates. */
3191 c->nr_payload_regs = 2;
3192 /* R2: only for 32-pixel dispatch. */
3193
3194 /* R3-26: barycentric interpolation coordinates. These appear in the
3195 * same order that they appear in the brw_wm_barycentric_interp_mode
3196 * enum. Each set of coordinates occupies 2 registers if dispatch width
3197 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3198 * appear if they were enabled using the "Barycentric Interpolation
3199 * Mode" bits in WM_STATE.
3200 */
3201 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3202 if (barycentric_interp_modes & (1 << i)) {
3203 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3204 c->nr_payload_regs += 2;
3205 if (dispatch_width == 16) {
3206 c->nr_payload_regs += 2;
3207 }
3208 }
3209 }
3210
3211 /* R27: interpolated depth if uses source depth */
3212 if (uses_depth) {
3213 c->source_depth_reg = c->nr_payload_regs;
3214 c->nr_payload_regs++;
3215 if (dispatch_width == 16) {
3216 /* R28: interpolated depth if not SIMD8. */
3217 c->nr_payload_regs++;
3218 }
3219 }
3220 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3221 if (uses_depth) {
3222 c->source_w_reg = c->nr_payload_regs;
3223 c->nr_payload_regs++;
3224 if (dispatch_width == 16) {
3225 /* R30: interpolated W if not SIMD8. */
3226 c->nr_payload_regs++;
3227 }
3228 }
3229
3230 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3231 /* R31: MSAA position offsets. */
3232 if (c->prog_data.uses_pos_offset) {
3233 c->sample_pos_reg = c->nr_payload_regs;
3234 c->nr_payload_regs++;
3235 }
3236
3237 /* R32: MSAA input coverage mask */
3238 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3239 assert(brw->gen >= 7);
3240 c->sample_mask_reg = c->nr_payload_regs;
3241 c->nr_payload_regs++;
3242 if (dispatch_width == 16) {
3243 /* R33: input coverage mask if not SIMD8. */
3244 c->nr_payload_regs++;
3245 }
3246 }
3247
3248 /* R34-: bary for 32-pixel. */
3249 /* R58-59: interp W for 32-pixel. */
3250
3251 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3252 c->source_depth_to_render_target = true;
3253 }
3254 }
3255
3256 void
3257 fs_visitor::assign_binding_table_offsets()
3258 {
3259 uint32_t next_binding_table_offset = 0;
3260
3261 /* If there are no color regions, we still perform an FB write to a null
3262 * renderbuffer, which we place at surface index 0.
3263 */
3264 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3265 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3266
3267 assign_common_binding_table_offsets(next_binding_table_offset);
3268 }
3269
3270 void
3271 fs_visitor::calculate_register_pressure()
3272 {
3273 calculate_live_intervals();
3274
3275 int num_instructions = 0;
3276 foreach_list(node, &this->instructions) {
3277 ++num_instructions;
3278 }
3279
3280 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3281
3282 for (int reg = 0; reg < virtual_grf_count; reg++) {
3283 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3284 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3285 }
3286 }
3287
3288 bool
3289 fs_visitor::run()
3290 {
3291 sanity_param_count = fp->Base.Parameters->NumParameters;
3292 uint32_t orig_nr_params = c->prog_data.nr_params;
3293 bool allocated_without_spills;
3294
3295 assign_binding_table_offsets();
3296
3297 if (brw->gen >= 6)
3298 setup_payload_gen6();
3299 else
3300 setup_payload_gen4();
3301
3302 if (0) {
3303 emit_dummy_fs();
3304 } else {
3305 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3306 emit_shader_time_begin();
3307
3308 calculate_urb_setup();
3309 if (fp->Base.InputsRead > 0) {
3310 if (brw->gen < 6)
3311 emit_interpolation_setup_gen4();
3312 else
3313 emit_interpolation_setup_gen6();
3314 }
3315
3316 /* We handle discards by keeping track of the still-live pixels in f0.1.
3317 * Initialize it with the dispatched pixels.
3318 */
3319 if (fp->UsesKill || c->key.alpha_test_func) {
3320 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3321 discard_init->flag_subreg = 1;
3322 }
3323
3324 /* Generate FS IR for main(). (the visitor only descends into
3325 * functions called "main").
3326 */
3327 if (shader) {
3328 foreach_list(node, &*shader->base.ir) {
3329 ir_instruction *ir = (ir_instruction *)node;
3330 base_ir = ir;
3331 this->result = reg_undef;
3332 ir->accept(this);
3333 }
3334 } else {
3335 emit_fragment_program_code();
3336 }
3337 base_ir = NULL;
3338 if (failed)
3339 return false;
3340
3341 emit(FS_OPCODE_PLACEHOLDER_HALT);
3342
3343 if (c->key.alpha_test_func)
3344 emit_alpha_test();
3345
3346 emit_fb_writes();
3347
3348 split_virtual_grfs();
3349
3350 move_uniform_array_access_to_pull_constants();
3351 remove_dead_constants();
3352 setup_pull_constants();
3353
3354 bool progress;
3355 do {
3356 progress = false;
3357
3358 compact_virtual_grfs();
3359
3360 progress = remove_duplicate_mrf_writes() || progress;
3361
3362 progress = opt_algebraic() || progress;
3363 progress = opt_cse() || progress;
3364 progress = opt_copy_propagate() || progress;
3365 progress = opt_peephole_predicated_break() || progress;
3366 progress = dead_code_eliminate() || progress;
3367 progress = dead_code_eliminate_local() || progress;
3368 progress = opt_peephole_sel() || progress;
3369 progress = dead_control_flow_eliminate(this) || progress;
3370 progress = opt_saturate_propagation() || progress;
3371 progress = register_coalesce() || progress;
3372 progress = compute_to_mrf() || progress;
3373 } while (progress);
3374
3375 lower_uniform_pull_constant_loads();
3376
3377 assign_curb_setup();
3378 assign_urb_setup();
3379
3380 static enum instruction_scheduler_mode pre_modes[] = {
3381 SCHEDULE_PRE,
3382 SCHEDULE_PRE_NON_LIFO,
3383 SCHEDULE_PRE_LIFO,
3384 };
3385
3386 /* Try each scheduling heuristic to see if it can successfully register
3387 * allocate without spilling. They should be ordered by decreasing
3388 * performance but increasing likelihood of allocating.
3389 */
3390 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3391 schedule_instructions(pre_modes[i]);
3392
3393 if (0) {
3394 assign_regs_trivial();
3395 allocated_without_spills = true;
3396 } else {
3397 allocated_without_spills = assign_regs(false);
3398 }
3399 if (allocated_without_spills)
3400 break;
3401 }
3402
3403 if (!allocated_without_spills) {
3404 /* We assume that any spilling is worse than just dropping back to
3405 * SIMD8. There's probably actually some intermediate point where
3406 * SIMD16 with a couple of spills is still better.
3407 */
3408 if (dispatch_width == 16) {
3409 fail("Failure to register allocate. Reduce number of "
3410 "live scalar values to avoid this.");
3411 }
3412
3413 /* Since we're out of heuristics, just go spill registers until we
3414 * get an allocation.
3415 */
3416 while (!assign_regs(true)) {
3417 if (failed)
3418 break;
3419 }
3420 }
3421 }
3422 assert(force_uncompressed_stack == 0);
3423
3424 /* This must come after all optimization and register allocation, since
3425 * it inserts dead code that happens to have side effects, and it does
3426 * so based on the actual physical registers in use.
3427 */
3428 insert_gen4_send_dependency_workarounds();
3429
3430 if (failed)
3431 return false;
3432
3433 if (!allocated_without_spills)
3434 schedule_instructions(SCHEDULE_POST);
3435
3436 if (dispatch_width == 8) {
3437 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3438 } else {
3439 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3440
3441 /* Make sure we didn't try to sneak in an extra uniform */
3442 assert(orig_nr_params == c->prog_data.nr_params);
3443 (void) orig_nr_params;
3444 }
3445
3446 /* If any state parameters were appended, then ParameterValues could have
3447 * been realloced, in which case the driver uniform storage set up by
3448 * _mesa_associate_uniform_storage() would point to freed memory. Make
3449 * sure that didn't happen.
3450 */
3451 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3452
3453 return !failed;
3454 }
3455
3456 const unsigned *
3457 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3458 struct gl_fragment_program *fp,
3459 struct gl_shader_program *prog,
3460 unsigned *final_assembly_size)
3461 {
3462 bool start_busy = false;
3463 float start_time = 0;
3464
3465 if (unlikely(brw->perf_debug)) {
3466 start_busy = (brw->batch.last_bo &&
3467 drm_intel_bo_busy(brw->batch.last_bo));
3468 start_time = get_time();
3469 }
3470
3471 struct brw_shader *shader = NULL;
3472 if (prog)
3473 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3474
3475 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3476 if (prog) {
3477 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3478 _mesa_print_ir(shader->base.ir, NULL);
3479 printf("\n\n");
3480 } else {
3481 printf("ARB_fragment_program %d ir for native fragment shader\n",
3482 fp->Base.Id);
3483 _mesa_print_program(&fp->Base);
3484 }
3485 }
3486
3487 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3488 */
3489 fs_visitor v(brw, c, prog, fp, 8);
3490 if (!v.run()) {
3491 if (prog) {
3492 prog->LinkStatus = false;
3493 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3494 }
3495
3496 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3497 v.fail_msg);
3498
3499 return NULL;
3500 }
3501
3502 exec_list *simd16_instructions = NULL;
3503 fs_visitor v2(brw, c, prog, fp, 16);
3504 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3505 if (c->prog_data.nr_pull_params == 0) {
3506 /* Try a SIMD16 compile */
3507 v2.import_uniforms(&v);
3508 if (!v2.run()) {
3509 perf_debug("SIMD16 shader failed to compile, falling back to "
3510 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3511 } else {
3512 simd16_instructions = &v2.instructions;
3513 }
3514 } else {
3515 perf_debug("Skipping SIMD16 due to pull parameters.\n");
3516 }
3517 }
3518
3519 const unsigned *assembly = NULL;
3520 if (brw->gen >= 8) {
3521 gen8_fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3522 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3523 final_assembly_size);
3524 } else {
3525 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3526 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3527 final_assembly_size);
3528 }
3529
3530 if (unlikely(brw->perf_debug) && shader) {
3531 if (shader->compiled_once)
3532 brw_wm_debug_recompile(brw, prog, &c->key);
3533 shader->compiled_once = true;
3534
3535 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3536 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3537 (get_time() - start_time) * 1000);
3538 }
3539 }
3540
3541 return assembly;
3542 }
3543
3544 bool
3545 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3546 {
3547 struct brw_context *brw = brw_context(ctx);
3548 struct brw_wm_prog_key key;
3549
3550 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3551 return true;
3552
3553 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3554 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3555 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3556 bool program_uses_dfdy = fp->UsesDFdy;
3557
3558 memset(&key, 0, sizeof(key));
3559
3560 if (brw->gen < 6) {
3561 if (fp->UsesKill)
3562 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3563
3564 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3565 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3566
3567 /* Just assume depth testing. */
3568 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3569 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3570 }
3571
3572 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3573 BRW_FS_VARYING_INPUT_MASK) > 16)
3574 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3575
3576 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3577
3578 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3579 for (unsigned i = 0; i < sampler_count; i++) {
3580 if (fp->Base.ShadowSamplers & (1 << i)) {
3581 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3582 key.tex.swizzles[i] =
3583 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3584 } else {
3585 /* Color sampler: assume no swizzling. */
3586 key.tex.swizzles[i] = SWIZZLE_XYZW;
3587 }
3588 }
3589
3590 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3591 key.drawable_height = ctx->DrawBuffer->Height;
3592 }
3593
3594 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3595 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3596 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3597
3598 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3599 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3600 key.nr_color_regions > 1;
3601 }
3602
3603 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3604 * quality of the derivatives is likely to be determined by the driconf
3605 * option.
3606 */
3607 key.high_quality_derivatives = brw->disable_derivative_optimization;
3608
3609 key.program_string_id = bfp->id;
3610
3611 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3612 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3613
3614 bool success = do_wm_prog(brw, prog, bfp, &key);
3615
3616 brw->wm.base.prog_offset = old_prog_offset;
3617 brw->wm.prog_data = old_prog_data;
3618
3619 return success;
3620 }