[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->opcode = BRW_OPCODE_NOP;
59 this->conditional_mod = BRW_CONDITIONAL_NONE;
60
61 this->dst = reg_undef;
62 this->src[0] = reg_undef;
63 this->src[1] = reg_undef;
64 this->src[2] = reg_undef;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68 }
69
70 fs_inst::fs_inst()
71 {
72 init();
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187 ALU2(SEL)
188
189 /** Gen4 predicated IF. */
190 fs_inst *
191 fs_visitor::IF(uint32_t predicate)
192 {
193 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195 return inst;
196 }
197
198 /** Gen6 IF with embedded comparison. */
199 fs_inst *
200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
201 {
202 assert(brw->gen == 6);
203 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
204 reg_null_d, src0, src1);
205 inst->conditional_mod = condition;
206 return inst;
207 }
208
209 /**
210 * CMP: Sets the low bit of the destination channels with the result
211 * of the comparison, while the upper bits are undefined, and updates
212 * the flag register with the packed 16 bits of the result.
213 */
214 fs_inst *
215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
216 {
217 fs_inst *inst;
218
219 /* Take the instruction:
220 *
221 * CMP null<d> src0<f> src1<f>
222 *
223 * Original gen4 does type conversion to the destination type before
224 * comparison, producing garbage results for floating point comparisons.
225 * gen5 does the comparison on the execution type (resolved source types),
226 * so dst type doesn't matter. gen6 does comparison and then uses the
227 * result as if it was the dst type with no conversion, which happens to
228 * mostly work out for float-interpreted-as-int since our comparisons are
229 * for >0, =0, <0.
230 */
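/* Concretely: in the CMP null<d> example above, gen4 would convert the
 * float sources to the integer dst type before comparing, so forcing the
 * dst type to match src0 below keeps the comparison in the float domain.
 */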
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 exec_list
247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
248 fs_reg varying_offset,
249 uint32_t const_offset)
250 {
251 exec_list instructions;
252 fs_inst *inst;
253
254 /* We have our constant surface use a pitch of 4 bytes, so our index can
255 * be any component of a vector, and then we load 4 contiguous
256 * components starting from that.
257 *
258 * We break down the const_offset to a portion added to the variable
259 * offset and a portion done using reg_offset, which means that if you
260 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
261 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
262 * CSE can later notice that those loads are all the same and eliminate
263 * the redundant ones.
264 */
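/* For example, with const_offset == 7 the ADD below computes
 * vec4_offset = varying_offset + 4 (7 & ~3), and the reg_offset bump at
 * the end selects component 7 & 3 == 3 of the loaded vec4 (times scale).
 */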
265 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
266 instructions.push_tail(ADD(vec4_offset,
267 varying_offset, const_offset & ~3));
268
269 int scale = 1;
270 if (brw->gen == 4 && dispatch_width == 8) {
271 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
272 * u, v, r) as parameters, or we can just use the SIMD16 message
273 * consisting of (header, u). We choose the second, at the cost of a
274 * longer return length.
275 */
276 scale = 2;
277 }
278
279 enum opcode op;
280 if (brw->gen >= 7)
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
282 else
283 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
284 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
285 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
286 inst->regs_written = 4 * scale;
287 instructions.push_tail(inst);
288
289 if (brw->gen < 7) {
290 inst->base_mrf = 13;
291 inst->header_present = true;
292 if (brw->gen == 4)
293 inst->mlen = 3;
294 else
295 inst->mlen = 1 + dispatch_width / 8;
296 }
297
298 vec4_result.reg_offset += (const_offset & 3) * scale;
299 instructions.push_tail(MOV(dst, vec4_result));
300
301 return instructions;
302 }
303
304 /**
305 * A helper for MOV generation for fixing up broken hardware SEND dependency
306 * handling.
307 */
308 fs_inst *
309 fs_visitor::DEP_RESOLVE_MOV(int grf)
310 {
311 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
312
313 inst->ir = NULL;
314 inst->annotation = "send dependency resolve";
315
316 /* The caller always wants uncompressed to emit the minimal extra
317 * dependencies, and to avoid having to deal with aligning its regs to 2.
318 */
319 inst->force_uncompressed = true;
320
321 return inst;
322 }
323
324 bool
325 fs_inst::equals(fs_inst *inst)
326 {
327 return (opcode == inst->opcode &&
328 dst.equals(inst->dst) &&
329 src[0].equals(inst->src[0]) &&
330 src[1].equals(inst->src[1]) &&
331 src[2].equals(inst->src[2]) &&
332 saturate == inst->saturate &&
333 predicate == inst->predicate &&
334 conditional_mod == inst->conditional_mod &&
335 mlen == inst->mlen &&
336 base_mrf == inst->base_mrf &&
337 sampler == inst->sampler &&
338 target == inst->target &&
339 eot == inst->eot &&
340 header_present == inst->header_present &&
341 shadow_compare == inst->shadow_compare &&
342 offset == inst->offset);
343 }
344
345 bool
346 fs_inst::overwrites_reg(const fs_reg &reg)
347 {
348 return (reg.file == dst.file &&
349 reg.reg == dst.reg &&
350 reg.reg_offset >= dst.reg_offset &&
351 reg.reg_offset < dst.reg_offset + regs_written);
352 }
353
354 bool
355 fs_inst::is_send_from_grf()
356 {
357 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
358 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
359 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
360 src[1].file == GRF) ||
361 (is_tex() && src[0].file == GRF));
362 }
363
364 bool
365 fs_visitor::can_do_source_mods(fs_inst *inst)
366 {
367 if (brw->gen == 6 && inst->is_math())
368 return false;
369
370 if (inst->is_send_from_grf())
371 return false;
372
373 if (!inst->can_do_source_mods())
374 return false;
375
376 return true;
377 }
378
379 void
380 fs_reg::init()
381 {
382 memset(this, 0, sizeof(*this));
383 this->smear = -1;
384 }
385
386 /** Generic unset register constructor. */
387 fs_reg::fs_reg()
388 {
389 init();
390 this->file = BAD_FILE;
391 }
392
393 /** Immediate value constructor. */
394 fs_reg::fs_reg(float f)
395 {
396 init();
397 this->file = IMM;
398 this->type = BRW_REGISTER_TYPE_F;
399 this->imm.f = f;
400 }
401
402 /** Immediate value constructor. */
403 fs_reg::fs_reg(int32_t i)
404 {
405 init();
406 this->file = IMM;
407 this->type = BRW_REGISTER_TYPE_D;
408 this->imm.i = i;
409 }
410
411 /** Immediate value constructor. */
412 fs_reg::fs_reg(uint32_t u)
413 {
414 init();
415 this->file = IMM;
416 this->type = BRW_REGISTER_TYPE_UD;
417 this->imm.u = u;
418 }
419
420 /** Fixed brw_reg. */
421 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
422 {
423 init();
424 this->file = HW_REG;
425 this->fixed_hw_reg = fixed_hw_reg;
426 this->type = fixed_hw_reg.type;
427 }
428
429 bool
430 fs_reg::equals(const fs_reg &r) const
431 {
432 return (file == r.file &&
433 reg == r.reg &&
434 reg_offset == r.reg_offset &&
435 subreg_offset == r.subreg_offset &&
436 type == r.type &&
437 negate == r.negate &&
438 abs == r.abs &&
439 !reladdr && !r.reladdr &&
440 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
441 sizeof(fixed_hw_reg)) == 0 &&
442 smear == r.smear &&
443 imm.u == r.imm.u);
444 }
445
446 fs_reg
447 fs_reg::retype(uint32_t type)
448 {
449 fs_reg result = *this;
450 result.type = type;
451 return result;
452 }
453
454 bool
455 fs_reg::is_zero() const
456 {
457 if (file != IMM)
458 return false;
459
460 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
461 }
462
463 bool
464 fs_reg::is_one() const
465 {
466 if (file != IMM)
467 return false;
468
469 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
470 }
471
472 bool
473 fs_reg::is_null() const
474 {
475 return file == HW_REG &&
476 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
477 fixed_hw_reg.nr == BRW_ARF_NULL;
478 }
479
480 bool
481 fs_reg::is_valid_3src() const
482 {
483 return file == GRF || file == UNIFORM;
484 }
485
486 int
487 fs_visitor::type_size(const struct glsl_type *type)
488 {
489 unsigned int size, i;
490
491 switch (type->base_type) {
492 case GLSL_TYPE_UINT:
493 case GLSL_TYPE_INT:
494 case GLSL_TYPE_FLOAT:
495 case GLSL_TYPE_BOOL:
496 return type->components();
497 case GLSL_TYPE_ARRAY:
498 return type_size(type->fields.array) * type->length;
499 case GLSL_TYPE_STRUCT:
500 size = 0;
501 for (i = 0; i < type->length; i++) {
502 size += type_size(type->fields.structure[i].type);
503 }
504 return size;
505 case GLSL_TYPE_SAMPLER:
506 /* Samplers take up no register space, since they're baked in at
507 * link time.
508 */
509 return 0;
510 case GLSL_TYPE_ATOMIC_UINT:
511 return 0;
512 case GLSL_TYPE_IMAGE:
513 case GLSL_TYPE_VOID:
514 case GLSL_TYPE_ERROR:
515 case GLSL_TYPE_INTERFACE:
516 assert(!"not reached");
517 break;
518 }
519
520 return 0;
521 }
522
523 fs_reg
524 fs_visitor::get_timestamp()
525 {
526 assert(brw->gen >= 7);
527
528 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
529 BRW_ARF_TIMESTAMP,
530 0),
531 BRW_REGISTER_TYPE_UD));
532
533 fs_reg dst = fs_reg(this, glsl_type::uint_type);
534
535 fs_inst *mov = emit(MOV(dst, ts));
536 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
537 * even if it's not enabled in the dispatch.
538 */
539 mov->force_writemask_all = true;
540 mov->force_uncompressed = true;
541
542 /* The caller wants the low 32 bits of the timestamp. Since it's running
543      * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
544 * which is plenty of time for our purposes. It is identical across the
545 * EUs, but since it's tracking GPU core speed it will increment at a
546 * varying rate as render P-states change.
547 *
548 * The caller could also check if render P-states have changed (or anything
549 * else that might disrupt timing) by setting smear to 2 and checking if
550 * that field is != 0.
551 */
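/* emit_shader_time_end() below does exactly that: it sets smear to 2 on a
 * second timestamp read and ANDs that field with 1 to detect a reset.
 */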
552 dst.smear = 0;
553
554 return dst;
555 }
556
557 void
558 fs_visitor::emit_shader_time_begin()
559 {
560 current_annotation = "shader time start";
561 shader_start_time = get_timestamp();
562 }
563
564 void
565 fs_visitor::emit_shader_time_end()
566 {
567 current_annotation = "shader time end";
568
569 enum shader_time_shader_type type, written_type, reset_type;
570 if (dispatch_width == 8) {
571 type = ST_FS8;
572 written_type = ST_FS8_WRITTEN;
573 reset_type = ST_FS8_RESET;
574 } else {
575 assert(dispatch_width == 16);
576 type = ST_FS16;
577 written_type = ST_FS16_WRITTEN;
578 reset_type = ST_FS16_RESET;
579 }
580
581 fs_reg shader_end_time = get_timestamp();
582
583 /* Check that there weren't any timestamp reset events (assuming these
584 * were the only two timestamp reads that happened).
585 */
586 fs_reg reset = shader_end_time;
587 reset.smear = 2;
588 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
589 test->conditional_mod = BRW_CONDITIONAL_Z;
590 emit(IF(BRW_PREDICATE_NORMAL));
591
592 push_force_uncompressed();
593 fs_reg start = shader_start_time;
594 start.negate = true;
595 fs_reg diff = fs_reg(this, glsl_type::uint_type);
596 emit(ADD(diff, start, shader_end_time));
597
598 /* If there were no instructions between the two timestamp gets, the diff
599 * is 2 cycles. Remove that overhead, so I can forget about that when
600 * trying to determine the time taken for single instructions.
601 */
602 emit(ADD(diff, diff, fs_reg(-2u)));
603
604 emit_shader_time_write(type, diff);
605 emit_shader_time_write(written_type, fs_reg(1u));
606 emit(BRW_OPCODE_ELSE);
607 emit_shader_time_write(reset_type, fs_reg(1u));
608 emit(BRW_OPCODE_ENDIF);
609
610 pop_force_uncompressed();
611 }
612
613 void
614 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
615 fs_reg value)
616 {
617 int shader_time_index =
618 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
619 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
620
621 fs_reg payload;
622 if (dispatch_width == 8)
623 payload = fs_reg(this, glsl_type::uvec2_type);
624 else
625 payload = fs_reg(this, glsl_type::uint_type);
626
627 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
628 fs_reg(), payload, offset, value));
629 }
630
631 void
632 fs_visitor::fail(const char *format, ...)
633 {
634 va_list va;
635 char *msg;
636
637 if (failed)
638 return;
639
640 failed = true;
641
642 va_start(va, format);
643 msg = ralloc_vasprintf(mem_ctx, format, va);
644 va_end(va);
645 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
646
647 this->fail_msg = msg;
648
649 if (INTEL_DEBUG & DEBUG_WM) {
650 fprintf(stderr, "%s", msg);
651 }
652 }
653
654 fs_inst *
655 fs_visitor::emit(enum opcode opcode)
656 {
657 return emit(fs_inst(opcode));
658 }
659
660 fs_inst *
661 fs_visitor::emit(enum opcode opcode, fs_reg dst)
662 {
663 return emit(fs_inst(opcode, dst));
664 }
665
666 fs_inst *
667 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
668 {
669 return emit(fs_inst(opcode, dst, src0));
670 }
671
672 fs_inst *
673 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
674 {
675 return emit(fs_inst(opcode, dst, src0, src1));
676 }
677
678 fs_inst *
679 fs_visitor::emit(enum opcode opcode, fs_reg dst,
680 fs_reg src0, fs_reg src1, fs_reg src2)
681 {
682 return emit(fs_inst(opcode, dst, src0, src1, src2));
683 }
684
685 void
686 fs_visitor::push_force_uncompressed()
687 {
688 force_uncompressed_stack++;
689 }
690
691 void
692 fs_visitor::pop_force_uncompressed()
693 {
694 force_uncompressed_stack--;
695 assert(force_uncompressed_stack >= 0);
696 }
697
698 /**
699 * Returns true if the instruction has a flag that means it won't
700 * update an entire destination register.
701 *
702 * For example, dead code elimination and live variable analysis want to know
703 * when a write to a variable screens off any preceding values that were in
704 * it.
705 */
706 bool
707 fs_inst::is_partial_write()
708 {
709 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
710 this->force_uncompressed ||
711 this->force_sechalf);
712 }
713
714 int
715 fs_inst::regs_read(fs_visitor *v, int arg)
716 {
717 if (is_tex() && arg == 0 && src[0].file == GRF) {
718 if (v->dispatch_width == 16)
719 return (mlen + 1) / 2;
720 else
721 return mlen;
722 }
723 return 1;
724 }
725
726 bool
727 fs_inst::reads_flag()
728 {
729 return predicate;
730 }
731
732 bool
733 fs_inst::writes_flag()
734 {
735 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
736 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
737 }
738
739 /**
740 * Returns how many MRFs an FS opcode will write over.
741 *
742 * Note that this is not the 0 or 1 implied writes in an actual gen
743 * instruction -- the FS opcodes often generate MOVs in addition.
744 */
745 int
746 fs_visitor::implied_mrf_writes(fs_inst *inst)
747 {
748 if (inst->mlen == 0)
749 return 0;
750
751 if (inst->base_mrf == -1)
752 return 0;
753
754 switch (inst->opcode) {
755 case SHADER_OPCODE_RCP:
756 case SHADER_OPCODE_RSQ:
757 case SHADER_OPCODE_SQRT:
758 case SHADER_OPCODE_EXP2:
759 case SHADER_OPCODE_LOG2:
760 case SHADER_OPCODE_SIN:
761 case SHADER_OPCODE_COS:
762 return 1 * dispatch_width / 8;
763 case SHADER_OPCODE_POW:
764 case SHADER_OPCODE_INT_QUOTIENT:
765 case SHADER_OPCODE_INT_REMAINDER:
766 return 2 * dispatch_width / 8;
767 case SHADER_OPCODE_TEX:
768 case FS_OPCODE_TXB:
769 case SHADER_OPCODE_TXD:
770 case SHADER_OPCODE_TXF:
771 case SHADER_OPCODE_TXF_CMS:
772 case SHADER_OPCODE_TXF_MCS:
773 case SHADER_OPCODE_TG4:
774 case SHADER_OPCODE_TG4_OFFSET:
775 case SHADER_OPCODE_TXL:
776 case SHADER_OPCODE_TXS:
777 case SHADER_OPCODE_LOD:
778 return 1;
779 case FS_OPCODE_FB_WRITE:
780 return 2;
781 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
782 case SHADER_OPCODE_GEN4_SCRATCH_READ:
783 return 1;
784 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
785 return inst->mlen;
786 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
787 return 2;
788 case SHADER_OPCODE_UNTYPED_ATOMIC:
789 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
790 return 0;
791 default:
792 assert(!"not reached");
793 return inst->mlen;
794 }
795 }
796
797 int
798 fs_visitor::virtual_grf_alloc(int size)
799 {
800 if (virtual_grf_array_size <= virtual_grf_count) {
801 if (virtual_grf_array_size == 0)
802 virtual_grf_array_size = 16;
803 else
804 virtual_grf_array_size *= 2;
805 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
806 virtual_grf_array_size);
807 }
808 virtual_grf_sizes[virtual_grf_count] = size;
809 return virtual_grf_count++;
810 }
811
812 /** Fixed HW reg constructor. */
813 fs_reg::fs_reg(enum register_file file, int reg)
814 {
815 init();
816 this->file = file;
817 this->reg = reg;
818 this->type = BRW_REGISTER_TYPE_F;
819 }
820
821 /** Fixed HW reg constructor. */
822 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
823 {
824 init();
825 this->file = file;
826 this->reg = reg;
827 this->type = type;
828 }
829
830 /** Automatic reg constructor. */
831 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
832 {
833 init();
834
835 this->file = GRF;
836 this->reg = v->virtual_grf_alloc(v->type_size(type));
837 this->reg_offset = 0;
838 this->type = brw_type_for_base_type(type);
839 }
840
841 fs_reg *
842 fs_visitor::variable_storage(ir_variable *var)
843 {
844 return (fs_reg *)hash_table_find(this->variable_ht, var);
845 }
846
847 void
848 import_uniforms_callback(const void *key,
849 void *data,
850 void *closure)
851 {
852 struct hash_table *dst_ht = (struct hash_table *)closure;
853 const fs_reg *reg = (const fs_reg *)data;
854
855 if (reg->file != UNIFORM)
856 return;
857
858 hash_table_insert(dst_ht, data, key);
859 }
860
861 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
862  * This brings in those uniform definitions.
863 */
864 void
865 fs_visitor::import_uniforms(fs_visitor *v)
866 {
867 hash_table_call_foreach(v->variable_ht,
868 import_uniforms_callback,
869 variable_ht);
870 this->params_remap = v->params_remap;
871 this->nr_params_remap = v->nr_params_remap;
872 }
873
874 /* Our support for uniforms is piggy-backed on the struct
875 * gl_fragment_program, because that's where the values actually
876 * get stored, rather than in some global gl_shader_program uniform
877 * store.
878 */
879 void
880 fs_visitor::setup_uniform_values(ir_variable *ir)
881 {
882 int namelen = strlen(ir->name);
883
884 /* The data for our (non-builtin) uniforms is stored in a series of
885 * gl_uniform_driver_storage structs for each subcomponent that
886 * glGetUniformLocation() could name. We know it's been set up in the same
887 * order we'd walk the type, so walk the list of storage and find anything
888 * with our name, or the prefix of a component that starts with our name.
889 */
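/* For instance, a "uniform vec4 color[2]" whose storage entry passes the
 * name check gives component_slots() == 4 and array_elements == 2, so the
 * loop below appends 8 param pointers for it.
 */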
890 unsigned params_before = c->prog_data.nr_params;
891 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
892 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
893
894 if (strncmp(ir->name, storage->name, namelen) != 0 ||
895 (storage->name[namelen] != 0 &&
896 storage->name[namelen] != '.' &&
897 storage->name[namelen] != '[')) {
898 continue;
899 }
900
901 unsigned slots = storage->type->component_slots();
902 if (storage->array_elements)
903 slots *= storage->array_elements;
904
905 for (unsigned i = 0; i < slots; i++) {
906 c->prog_data.param[c->prog_data.nr_params++] =
907 &storage->storage[i].f;
908 }
909 }
910
911 /* Make sure we actually initialized the right amount of stuff here. */
912 assert(params_before + ir->type->component_slots() ==
913 c->prog_data.nr_params);
914 (void)params_before;
915 }
916
917
918 /* Our support for builtin uniforms is even scarier than non-builtin.
919 * It sits on top of the PROG_STATE_VAR parameters that are
920 * automatically updated from GL context state.
921 */
922 void
923 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
924 {
925 const ir_state_slot *const slots = ir->state_slots;
926 assert(ir->state_slots != NULL);
927
928 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
929 /* This state reference has already been setup by ir_to_mesa, but we'll
930 * get the same index back here.
931 */
932 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
933 (gl_state_index *)slots[i].tokens);
934
935 /* Add each of the unique swizzles of the element as a parameter.
936 * This'll end up matching the expected layout of the
937 * array/matrix/structure we're trying to fill in.
938 */
939 int last_swiz = -1;
940 for (unsigned int j = 0; j < 4; j++) {
941 int swiz = GET_SWZ(slots[i].swizzle, j);
942 if (swiz == last_swiz)
943 break;
944 last_swiz = swiz;
945
946 c->prog_data.param[c->prog_data.nr_params++] =
947 &fp->Base.Parameters->ParameterValues[index][swiz].f;
948 }
949 }
950 }
951
952 fs_reg *
953 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
954 {
955 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
956 fs_reg wpos = *reg;
957 bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
958
959 /* gl_FragCoord.x */
960 if (ir->data.pixel_center_integer) {
961 emit(MOV(wpos, this->pixel_x));
962 } else {
963 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
964 }
965 wpos.reg_offset++;
966
967 /* gl_FragCoord.y */
968 if (!flip && ir->data.pixel_center_integer) {
969 emit(MOV(wpos, this->pixel_y));
970 } else {
971 fs_reg pixel_y = this->pixel_y;
972 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
973
974 if (flip) {
975 pixel_y.negate = true;
976 offset += c->key.drawable_height - 1.0;
977 }
978
979 emit(ADD(wpos, pixel_y, fs_reg(offset)));
980 }
981 wpos.reg_offset++;
982
983 /* gl_FragCoord.z */
984 if (brw->gen >= 6) {
985 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
986 } else {
987 emit(FS_OPCODE_LINTERP, wpos,
988 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
989 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
990 interp_reg(VARYING_SLOT_POS, 2));
991 }
992 wpos.reg_offset++;
993
994 /* gl_FragCoord.w: Already set up in emit_interpolation */
995 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
996
997 return reg;
998 }
999
1000 fs_inst *
1001 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1002 glsl_interp_qualifier interpolation_mode,
1003 bool is_centroid, bool is_sample)
1004 {
1005 brw_wm_barycentric_interp_mode barycoord_mode;
1006 if (brw->gen >= 6) {
1007 if (is_centroid) {
1008 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1009 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1010 else
1011 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1012 } else if (is_sample) {
1013 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1014 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1015 else
1016 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1017 } else {
1018 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1019 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1020 else
1021 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1022 }
1023 } else {
1024 /* On Ironlake and below, there is only one interpolation mode.
1025 * Centroid interpolation doesn't mean anything on this hardware --
1026 * there is no multisampling.
1027 */
1028 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1029 }
1030 return emit(FS_OPCODE_LINTERP, attr,
1031 this->delta_x[barycoord_mode],
1032 this->delta_y[barycoord_mode], interp);
1033 }
1034
1035 fs_reg *
1036 fs_visitor::emit_general_interpolation(ir_variable *ir)
1037 {
1038 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1039 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1040 fs_reg attr = *reg;
1041
1042 unsigned int array_elements;
1043 const glsl_type *type;
1044
1045 if (ir->type->is_array()) {
1046 array_elements = ir->type->length;
1047 if (array_elements == 0) {
1048 fail("dereferenced array '%s' has length 0\n", ir->name);
1049 }
1050 type = ir->type->fields.array;
1051 } else {
1052 array_elements = 1;
1053 type = ir->type;
1054 }
1055
1056 glsl_interp_qualifier interpolation_mode =
1057 ir->determine_interpolation_mode(c->key.flat_shade);
1058
1059 int location = ir->data.location;
1060 for (unsigned int i = 0; i < array_elements; i++) {
1061 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1062 if (c->prog_data.urb_setup[location] == -1) {
1063 /* If there's no incoming setup data for this slot, don't
1064 * emit interpolation for it.
1065 */
1066 attr.reg_offset += type->vector_elements;
1067 location++;
1068 continue;
1069 }
1070
1071 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1072 /* Constant interpolation (flat shading) case. The SF has
1073 * handed us defined values in only the constant offset
1074 * field of the setup reg.
1075 */
1076 for (unsigned int k = 0; k < type->vector_elements; k++) {
1077 struct brw_reg interp = interp_reg(location, k);
1078 interp = suboffset(interp, 3);
1079 interp.type = reg->type;
1080 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1081 attr.reg_offset++;
1082 }
1083 } else {
1084 /* Smooth/noperspective interpolation case. */
1085 for (unsigned int k = 0; k < type->vector_elements; k++) {
1086 /* FINISHME: At some point we probably want to push
1087 * this farther by giving similar treatment to the
1088 * other potentially constant components of the
1089 * attribute, as well as making brw_vs_constval.c
1090 * handle varyings other than gl_TexCoord.
1091 */
1092 struct brw_reg interp = interp_reg(location, k);
1093 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1094 ir->data.centroid && !c->key.persample_shading,
1095 ir->data.sample || c->key.persample_shading);
1096 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1097 /* Get the pixel/sample mask into f0 so that we know
1098 * which pixels are lit. Then, for each channel that is
1099 * unlit, replace the centroid data with non-centroid
1100 * data.
1101 */
1102 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1103 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1104 interpolation_mode,
1105 false, false);
1106 inst->predicate = BRW_PREDICATE_NORMAL;
1107 inst->predicate_inverse = true;
1108 }
1109 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1110 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1111 }
1112 attr.reg_offset++;
1113 }
1114
1115 }
1116 location++;
1117 }
1118 }
1119
1120 return reg;
1121 }
1122
1123 fs_reg *
1124 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1125 {
1126 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1127
1128 /* The frontfacing comes in as a bit in the thread payload. */
1129 if (brw->gen >= 6) {
1130 emit(BRW_OPCODE_ASR, *reg,
1131 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1132 fs_reg(15));
1133 emit(BRW_OPCODE_NOT, *reg, *reg);
1134 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1135 } else {
1136 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1137 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1138 * us front face
1139 */
1140 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1141 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1142 }
1143
1144 return reg;
1145 }
1146
1147 void
1148 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1149 {
1150 assert(dst.type == BRW_REGISTER_TYPE_F);
1151
1152 if (c->key.compute_pos_offset) {
1153 /* Convert int_sample_pos to floating point */
1154 emit(MOV(dst, int_sample_pos));
1155 /* Scale to the range [0, 1] */
1156 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1157 }
1158 else {
1159 /* From ARB_sample_shading specification:
1160 * "When rendering to a non-multisample buffer, or if multisample
1161 * rasterization is disabled, gl_SamplePosition will always be
1162      *  (0.5, 0.5)."
1163 */
1164 emit(MOV(dst, fs_reg(0.5f)));
1165 }
1166 }
1167
1168 fs_reg *
1169 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1170 {
1171 assert(brw->gen >= 6);
1172 assert(ir->type == glsl_type::vec2_type);
1173
1174 this->current_annotation = "compute sample position";
1175 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1176 fs_reg pos = *reg;
1177 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1178 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1179
1180 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1181 * mode will be enabled.
1182 *
1183 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1184 * R31.1:0 Position Offset X/Y for Slot[3:0]
1185 * R31.3:2 Position Offset X/Y for Slot[7:4]
1186 * .....
1187 *
1188 * The X, Y sample positions come in as bytes in thread payload. So, read
1189 * the positions using vstride=16, width=8, hstride=2.
1190 */
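/* With that region, sample_pos_reg walks every other byte: offset 0 picks
 * up the X bytes of the first 8 channels, suboffset 1 the Y bytes, and
 * suboffsets 16/17 the second half of a SIMD16 dispatch.
 */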
1191 struct brw_reg sample_pos_reg =
1192 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1193 BRW_REGISTER_TYPE_B), 16, 8, 2);
1194
1195 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1196 if (dispatch_width == 16) {
1197 int_sample_x.sechalf = true;
1198 fs_inst *inst = emit(MOV(int_sample_x,
1199 fs_reg(suboffset(sample_pos_reg, 16))));
1200 inst->force_sechalf = true;
1201 int_sample_x.sechalf = false;
1202 }
1203 /* Compute gl_SamplePosition.x */
1204 compute_sample_position(pos, int_sample_x);
1205 pos.reg_offset++;
1206 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1207 if (dispatch_width == 16) {
1208 int_sample_y.sechalf = true;
1209 fs_inst *inst = emit(MOV(int_sample_y,
1210 fs_reg(suboffset(sample_pos_reg, 17))));
1211 inst->force_sechalf = true;
1212 int_sample_y.sechalf = false;
1213 }
1214 /* Compute gl_SamplePosition.y */
1215 compute_sample_position(pos, int_sample_y);
1216 return reg;
1217 }
1218
1219 fs_reg *
1220 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1221 {
1222 assert(brw->gen >= 6);
1223
1224 this->current_annotation = "compute sample id";
1225 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1226
1227 if (c->key.compute_sample_id) {
1228 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1229 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1230 t2.type = BRW_REGISTER_TYPE_UW;
1231
1232 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1233 * 8x multisampling, subspan 0 will represent sample N (where N
1234 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1235 * 7. We can find the value of N by looking at R0.0 bits 7:6
1236 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1237 * (since samples are always delivered in pairs). That is, we
1238 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1239 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1240 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1241 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1242 * populating a temporary variable with the sequence (0, 1, 2, 3),
1243 * and then reading from it using vstride=1, width=4, hstride=0.
1244 * These computations hold good for 4x multisampling as well.
1245 */
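/* For example, if R0.0 bits 7:6 hold SSPI == 1, then (R0.0 & 0xc0) >> 5 is
 * 2, and adding the (0, 0, 0, 0, 1, 1, 1, 1) sequence gives sample IDs
 * 2, 2, 2, 2, 3, 3, 3, 3 for the SIMD8 channels.
 */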
1246 emit(BRW_OPCODE_AND, t1,
1247 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1248 fs_reg(brw_imm_d(0xc0)));
1249 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1250 /* This works for both SIMD8 and SIMD16 */
1251 emit(MOV(t2, brw_imm_v(0x3210)));
1252 /* This special instruction takes care of setting vstride=1,
1253 * width=4, hstride=0 of t2 during an ADD instruction.
1254 */
1255 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1256 } else {
1257 /* As per GL_ARB_sample_shading specification:
1258 * "When rendering to a non-multisample buffer, or if multisample
1259 * rasterization is disabled, gl_SampleID will always be zero."
1260 */
1261 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1262 }
1263
1264 return reg;
1265 }
1266
1267 fs_reg *
1268 fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
1269 {
1270 assert(brw->gen >= 7);
1271 this->current_annotation = "compute gl_SampleMaskIn";
1272 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1273 emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
1274 return reg;
1275 }
1276
1277 fs_reg
1278 fs_visitor::fix_math_operand(fs_reg src)
1279 {
1280 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1281 * might be able to do better by doing execsize = 1 math and then
1282 * expanding that result out, but we would need to be careful with
1283 * masking.
1284 *
1285 * The hardware ignores source modifiers (negate and abs) on math
1286 * instructions, so we also move to a temp to set those up.
1287 */
1288 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1289 !src.abs && !src.negate)
1290 return src;
1291
1292 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1293 * operands to math
1294 */
1295 if (brw->gen >= 7 && src.file != IMM)
1296 return src;
1297
1298 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1299 expanded.type = src.type;
1300 emit(BRW_OPCODE_MOV, expanded, src);
1301 return expanded;
1302 }
1303
1304 fs_inst *
1305 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1306 {
1307 switch (opcode) {
1308 case SHADER_OPCODE_RCP:
1309 case SHADER_OPCODE_RSQ:
1310 case SHADER_OPCODE_SQRT:
1311 case SHADER_OPCODE_EXP2:
1312 case SHADER_OPCODE_LOG2:
1313 case SHADER_OPCODE_SIN:
1314 case SHADER_OPCODE_COS:
1315 break;
1316 default:
1317 assert(!"not reached: bad math opcode");
1318 return NULL;
1319 }
1320
1321 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1322 * might be able to do better by doing execsize = 1 math and then
1323 * expanding that result out, but we would need to be careful with
1324 * masking.
1325 *
1326 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1327 * instructions, so we also move to a temp to set those up.
1328 */
1329 if (brw->gen >= 6)
1330 src = fix_math_operand(src);
1331
1332 fs_inst *inst = emit(opcode, dst, src);
1333
1334 if (brw->gen < 6) {
1335 inst->base_mrf = 2;
1336 inst->mlen = dispatch_width / 8;
1337 }
1338
1339 return inst;
1340 }
1341
1342 fs_inst *
1343 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1344 {
1345 int base_mrf = 2;
1346 fs_inst *inst;
1347
1348 switch (opcode) {
1349 case SHADER_OPCODE_INT_QUOTIENT:
1350 case SHADER_OPCODE_INT_REMAINDER:
1351 if (brw->gen >= 7 && dispatch_width == 16)
1352 fail("SIMD16 INTDIV unsupported\n");
1353 break;
1354 case SHADER_OPCODE_POW:
1355 break;
1356 default:
1357 assert(!"not reached: unsupported binary math opcode.");
1358 return NULL;
1359 }
1360
1361 if (brw->gen >= 6) {
1362 src0 = fix_math_operand(src0);
1363 src1 = fix_math_operand(src1);
1364
1365 inst = emit(opcode, dst, src0, src1);
1366 } else {
1367 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1368 * "Message Payload":
1369 *
1370 * "Operand0[7]. For the INT DIV functions, this operand is the
1371 * denominator."
1372 * ...
1373 * "Operand1[7]. For the INT DIV functions, this operand is the
1374 * numerator."
1375 */
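/* So for INT DIV the operands are swapped below: src1 (the denominator)
 * becomes Operand0 on the instruction itself, and src0 (the numerator) is
 * moved into the MRF as Operand1.
 */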
1376 bool is_int_div = opcode != SHADER_OPCODE_POW;
1377 fs_reg &op0 = is_int_div ? src1 : src0;
1378 fs_reg &op1 = is_int_div ? src0 : src1;
1379
1380 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1381 inst = emit(opcode, dst, op0, reg_null_f);
1382
1383 inst->base_mrf = base_mrf;
1384 inst->mlen = 2 * dispatch_width / 8;
1385 }
1386 return inst;
1387 }
1388
1389 void
1390 fs_visitor::assign_curb_setup()
1391 {
1392 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1393 if (dispatch_width == 8) {
1394 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1395 } else {
1396 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1397 }
1398
1399 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1400 foreach_list(node, &this->instructions) {
1401 fs_inst *inst = (fs_inst *)node;
1402
1403 for (unsigned int i = 0; i < 3; i++) {
1404 if (inst->src[i].file == UNIFORM) {
1405 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1406 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1407 constant_nr / 8,
1408 constant_nr % 8);
1409
1410 inst->src[i].file = HW_REG;
1411 inst->src[i].fixed_hw_reg = byte_offset(
1412 retype(brw_reg, inst->src[i].type),
1413 inst->src[i].subreg_offset);
1414 }
1415 }
1416 }
1417 }
1418
1419 void
1420 fs_visitor::calculate_urb_setup()
1421 {
1422 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1423 c->prog_data.urb_setup[i] = -1;
1424 }
1425
1426 int urb_next = 0;
1427 /* Figure out where each of the incoming setup attributes lands. */
1428 if (brw->gen >= 6) {
1429 if (_mesa_bitcount_64(fp->Base.InputsRead &
1430 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1431 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1432 * first 16 varying inputs, so we can put them wherever we want.
1433 * Just put them in order.
1434 *
1435 * This is useful because it means that (a) inputs not used by the
1436 * fragment shader won't take up valuable register space, and (b) we
1437 * won't have to recompile the fragment shader if it gets paired with
1438 * a different vertex (or geometry) shader.
1439 */
1440 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1441 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1442 BITFIELD64_BIT(i)) {
1443 c->prog_data.urb_setup[i] = urb_next++;
1444 }
1445 }
1446 } else {
1447 /* We have enough input varyings that the SF/SBE pipeline stage can't
1448 * arbitrarily rearrange them to suit our whim; we have to put them
1449 * in an order that matches the output of the previous pipeline stage
1450 * (geometry or vertex shader).
1451 */
1452 struct brw_vue_map prev_stage_vue_map;
1453 brw_compute_vue_map(brw, &prev_stage_vue_map,
1454 c->key.input_slots_valid);
1455 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1456 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1457 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1458 slot++) {
1459 int varying = prev_stage_vue_map.slot_to_varying[slot];
1460 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1461 * unused.
1462 */
1463 if (varying != BRW_VARYING_SLOT_COUNT &&
1464 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1465 BITFIELD64_BIT(varying))) {
1466 c->prog_data.urb_setup[varying] = slot - first_slot;
1467 }
1468 }
1469 urb_next = prev_stage_vue_map.num_slots - first_slot;
1470 }
1471 } else {
1472 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1473 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1474 /* Point size is packed into the header, not as a general attribute */
1475 if (i == VARYING_SLOT_PSIZ)
1476 continue;
1477
1478 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1479 /* The back color slot is skipped when the front color is
1480 * also written to. In addition, some slots can be
1481 * written in the vertex shader and not read in the
1482 * fragment shader. So the register number must always be
1483 * incremented, mapped or not.
1484 */
1485 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1486 c->prog_data.urb_setup[i] = urb_next;
1487 urb_next++;
1488 }
1489 }
1490
1491 /*
1492  * It's an FS-only attribute, and the SF thread did the interpolation for
1493  * this attribute, so count it here, too.
1494 *
1495 * See compile_sf_prog() for more info.
1496 */
1497 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1498 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1499 }
1500
1501 c->prog_data.num_varying_inputs = urb_next;
1502 }
1503
1504 void
1505 fs_visitor::assign_urb_setup()
1506 {
1507 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1508
1509 /* Offset all the urb_setup[] index by the actual position of the
1510 * setup regs, now that the location of the constants has been chosen.
1511 */
1512 foreach_list(node, &this->instructions) {
1513 fs_inst *inst = (fs_inst *)node;
1514
1515 if (inst->opcode == FS_OPCODE_LINTERP) {
1516 assert(inst->src[2].file == HW_REG);
1517 inst->src[2].fixed_hw_reg.nr += urb_start;
1518 }
1519
1520 if (inst->opcode == FS_OPCODE_CINTERP) {
1521 assert(inst->src[0].file == HW_REG);
1522 inst->src[0].fixed_hw_reg.nr += urb_start;
1523 }
1524 }
1525
1526 /* Each attribute is 4 setup channels, each of which is half a reg. */
1527 this->first_non_payload_grf =
1528 urb_start + c->prog_data.num_varying_inputs * 2;
1529 }
1530
1531 /**
1532 * Split large virtual GRFs into separate components if we can.
1533 *
1534 * This is mostly duplicated with what brw_fs_vector_splitting does,
1535 * but that's really conservative because it's afraid of doing
1536 * splitting that doesn't result in real progress after the rest of
1537 * the optimization phases, which would cause infinite looping in
1538 * optimization. We can do it once here, safely. This also has the
1539 * opportunity to split interpolated values, or maybe even uniforms,
1540 * which we don't have at the IR level.
1541 *
1542 * We want to split, because virtual GRFs are what we register
1543 * allocate and spill (due to contiguousness requirements for some
1544 * instructions), and they're what we naturally generate in the
1545 * codegen process, but most virtual GRFs don't actually need to be
1546 * contiguous sets of GRFs. If we split, we'll end up with reduced
1547 * live intervals and better dead code elimination and coalescing.
1548 */
1549 void
1550 fs_visitor::split_virtual_grfs()
1551 {
1552 int num_vars = this->virtual_grf_count;
1553 bool split_grf[num_vars];
1554 int new_virtual_grf[num_vars];
1555
1556 /* Try to split anything > 0 sized. */
1557 for (int i = 0; i < num_vars; i++) {
1558 if (this->virtual_grf_sizes[i] != 1)
1559 split_grf[i] = true;
1560 else
1561 split_grf[i] = false;
1562 }
1563
1564 if (brw->has_pln &&
1565 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1566 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1567 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1568 * Gen6, that was the only supported interpolation mode, and since Gen6,
1569 * delta_x and delta_y are in fixed hardware registers.
1570 */
1571 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1572 false;
1573 }
1574
1575 foreach_list(node, &this->instructions) {
1576 fs_inst *inst = (fs_inst *)node;
1577
1578 /* If there's a SEND message that requires contiguous destination
1579 * registers, no splitting is allowed.
1580 */
1581 if (inst->regs_written > 1) {
1582 split_grf[inst->dst.reg] = false;
1583 }
1584
1585 /* If we're sending from a GRF, don't split it, on the assumption that
1586 * the send is reading the whole thing.
1587 */
1588 if (inst->is_send_from_grf()) {
1589 for (int i = 0; i < 3; i++) {
1590 if (inst->src[i].file == GRF) {
1591 split_grf[inst->src[i].reg] = false;
1592 }
1593 }
1594 }
1595 }
1596
1597 /* Allocate new space for split regs. Note that the virtual
1598 * numbers will be contiguous.
1599 */
1600 for (int i = 0; i < num_vars; i++) {
1601 if (split_grf[i]) {
1602 new_virtual_grf[i] = virtual_grf_alloc(1);
1603 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1604 int reg = virtual_grf_alloc(1);
1605 assert(reg == new_virtual_grf[i] + j - 1);
1606 (void) reg;
1607 }
1608 this->virtual_grf_sizes[i] = 1;
1609 }
1610 }
1611
1612 foreach_list(node, &this->instructions) {
1613 fs_inst *inst = (fs_inst *)node;
1614
1615 if (inst->dst.file == GRF &&
1616 split_grf[inst->dst.reg] &&
1617 inst->dst.reg_offset != 0) {
1618 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1619 inst->dst.reg_offset - 1);
1620 inst->dst.reg_offset = 0;
1621 }
1622 for (int i = 0; i < 3; i++) {
1623 if (inst->src[i].file == GRF &&
1624 split_grf[inst->src[i].reg] &&
1625 inst->src[i].reg_offset != 0) {
1626 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1627 inst->src[i].reg_offset - 1);
1628 inst->src[i].reg_offset = 0;
1629 }
1630 }
1631 }
1632 invalidate_live_intervals();
1633 }
1634
1635 /**
1636 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1637 *
1638 * During code generation, we create tons of temporary variables, many of
1639 * which get immediately killed and are never used again. Yet, in later
1640 * optimization and analysis passes, such as compute_live_intervals, we need
1641 * to loop over all the virtual GRFs. Compacting them can save a lot of
1642 * overhead.
1643 */
1644 void
1645 fs_visitor::compact_virtual_grfs()
1646 {
1647 /* Mark which virtual GRFs are used, and count how many. */
1648 int remap_table[this->virtual_grf_count];
1649 memset(remap_table, -1, sizeof(remap_table));
1650
1651 foreach_list(node, &this->instructions) {
1652 const fs_inst *inst = (const fs_inst *) node;
1653
1654 if (inst->dst.file == GRF)
1655 remap_table[inst->dst.reg] = 0;
1656
1657 for (int i = 0; i < 3; i++) {
1658 if (inst->src[i].file == GRF)
1659 remap_table[inst->src[i].reg] = 0;
1660 }
1661 }
1662
1663 /* In addition to registers used in instructions, fs_visitor keeps
1664 * direct references to certain special values which must be patched:
1665 */
1666 fs_reg *special[] = {
1667 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1668 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1669 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1670 &delta_x[0], &delta_x[1], &delta_x[2],
1671 &delta_x[3], &delta_x[4], &delta_x[5],
1672 &delta_y[0], &delta_y[1], &delta_y[2],
1673 &delta_y[3], &delta_y[4], &delta_y[5],
1674 };
1675 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1676 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1677
1678 /* Treat all special values as used, to be conservative */
1679 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1680 if (special[i]->file == GRF)
1681 remap_table[special[i]->reg] = 0;
1682 }
1683
1684 /* Compact the GRF arrays. */
1685 int new_index = 0;
1686 for (int i = 0; i < this->virtual_grf_count; i++) {
1687 if (remap_table[i] != -1) {
1688 remap_table[i] = new_index;
1689 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1690 invalidate_live_intervals();
1691 ++new_index;
1692 }
1693 }
1694
1695 this->virtual_grf_count = new_index;
1696
1697 /* Patch all the instructions to use the newly renumbered registers */
1698 foreach_list(node, &this->instructions) {
1699 fs_inst *inst = (fs_inst *) node;
1700
1701 if (inst->dst.file == GRF)
1702 inst->dst.reg = remap_table[inst->dst.reg];
1703
1704 for (int i = 0; i < 3; i++) {
1705 if (inst->src[i].file == GRF)
1706 inst->src[i].reg = remap_table[inst->src[i].reg];
1707 }
1708 }
1709
1710 /* Patch all the references to special values */
1711 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1712 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1713 special[i]->reg = remap_table[special[i]->reg];
1714 }
1715 }
1716
1717 bool
1718 fs_visitor::remove_dead_constants()
1719 {
1720 if (dispatch_width == 8) {
1721 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1722 this->nr_params_remap = c->prog_data.nr_params;
1723
1724 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1725 this->params_remap[i] = -1;
1726
1727 /* Find which params are still in use. */
1728 foreach_list(node, &this->instructions) {
1729 fs_inst *inst = (fs_inst *)node;
1730
1731 for (int i = 0; i < 3; i++) {
1732 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1733
1734 if (inst->src[i].file != UNIFORM)
1735 continue;
1736
1737 /* Section 5.11 of the OpenGL 4.3 spec says:
1738 *
1739 * "Out-of-bounds reads return undefined values, which include
1740 * values from other variables of the active program or zero."
1741 */
1742 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1743 constant_nr = 0;
1744 }
1745
1746 /* For now, set this to non-negative. We'll give it the
1747 * actual new number in a moment, in order to keep the
1748 * register numbers nicely ordered.
1749 */
1750 this->params_remap[constant_nr] = 0;
1751 }
1752 }
1753
1754 /* Figure out what the new numbers for the params will be. At some
1755 * point when we're doing uniform array access, we're going to want
1756 * to keep the distinction between .reg and .reg_offset, but for
1757 * now we don't care.
1758 */
1759 unsigned int new_nr_params = 0;
1760 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1761 if (this->params_remap[i] != -1) {
1762 this->params_remap[i] = new_nr_params++;
1763 }
1764 }
1765
1766 /* Update the list of params to be uploaded to match our new numbering. */
1767 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1768 int remapped = this->params_remap[i];
1769
1770 if (remapped == -1)
1771 continue;
1772
1773 c->prog_data.param[remapped] = c->prog_data.param[i];
1774 }
1775
1776 c->prog_data.nr_params = new_nr_params;
1777 } else {
1778 /* This should have been generated in the SIMD8 pass already. */
1779 assert(this->params_remap);
1780 }
1781
1782 /* Now do the renumbering of the shader to remove unused params. */
1783 foreach_list(node, &this->instructions) {
1784 fs_inst *inst = (fs_inst *)node;
1785
1786 for (int i = 0; i < 3; i++) {
1787 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1788
1789 if (inst->src[i].file != UNIFORM)
1790 continue;
1791
1792 /* as above alias to 0 */
1793 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1794 constant_nr = 0;
1795 }
1796 assert(this->params_remap[constant_nr] != -1);
1797 inst->src[i].reg = this->params_remap[constant_nr];
1798 inst->src[i].reg_offset = 0;
1799 }
1800 }
1801
1802 return true;
1803 }
1804
1805 /*
1806 * Implements array access of uniforms by inserting a
1807 * PULL_CONSTANT_LOAD instruction.
1808 *
1809 * Unlike temporary GRF array access (where we don't support it due to
1810 * the difficulty of doing relative addressing on instruction
1811 * destinations), we could potentially do array access of uniforms
1812 * that were loaded in GRF space as push constants. In real-world
1813 * usage we've seen, though, the arrays being used are always larger
1814 * than we could load as push constants, so just always move all
1815 * uniform array access out to a pull constant buffer.
1816 */
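/*
 * For example, a reladdr access like "uniform vec4 a[20]; ... a[i]" gets all
 * of a's components copied into pull_param[], and the instruction's source is
 * rewritten to read the result of a VARYING_PULL_CONSTANT_LOAD instead.
 */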
1817 void
1818 fs_visitor::move_uniform_array_access_to_pull_constants()
1819 {
1820 int pull_constant_loc[c->prog_data.nr_params];
1821
1822 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1823 pull_constant_loc[i] = -1;
1824 }
1825
1826 /* Walk through and find array access of uniforms. Put a copy of that
1827 * uniform in the pull constant buffer.
1828 *
1829 * Note that we don't move constant-indexed accesses to arrays. No
1830 * testing has been done of the performance impact of this choice.
1831 */
1832 foreach_list_safe(node, &this->instructions) {
1833 fs_inst *inst = (fs_inst *)node;
1834
1835 for (int i = 0 ; i < 3; i++) {
1836 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1837 continue;
1838
1839 int uniform = inst->src[i].reg;
1840
1841 /* If this array isn't already present in the pull constant buffer,
1842 * add it.
1843 */
1844 if (pull_constant_loc[uniform] == -1) {
1845 const float **values = &c->prog_data.param[uniform];
1846
1847 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1848
1849 assert(param_size[uniform]);
1850
1851 for (int j = 0; j < param_size[uniform]; j++) {
1852 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1853 values[j];
1854 }
1855 }
1856
1857 /* Set up the annotation tracking for new generated instructions. */
1858 base_ir = inst->ir;
1859 current_annotation = inst->annotation;
1860
1861 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1862 fs_reg temp = fs_reg(this, glsl_type::float_type);
1863 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1864 surf_index,
1865 *inst->src[i].reladdr,
1866 pull_constant_loc[uniform] +
1867 inst->src[i].reg_offset);
1868 inst->insert_before(&list);
1869
1870 inst->src[i].file = temp.file;
1871 inst->src[i].reg = temp.reg;
1872 inst->src[i].reg_offset = temp.reg_offset;
1873 inst->src[i].reladdr = NULL;
1874 }
1875 }
1876 }
1877
1878 /**
1879 * Choose accesses from the UNIFORM file to demote to using the pull
1880 * constant buffer.
1881 *
1882  * We allow a fragment shader to have more than the GL-specified minimum
1883  * maximum number of fragment shader uniform components (64). If there are
1884  * too many of these, they'd fill up all of the register space.
1885 * So, this will push some of them out to the pull constant buffer and
1886 * update the program to load them.
1887 */
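/* A quick worked example of the split below (the numbers are hypothetical):
 * with max_uniform_components = 16 * 8 = 128, a shader using 200 float
 * uniform components keeps params 0..127 as push constants and demotes
 * params 128..199 to the pull constant buffer, unless a param was already
 * uploaded for reladdr purposes, in which case that pull slot is reused.
 */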
1888 void
1889 fs_visitor::setup_pull_constants()
1890 {
1891 /* Only allow 16 registers (128 uniform components) as push constants. */
1892 unsigned int max_uniform_components = 16 * 8;
1893 if (c->prog_data.nr_params <= max_uniform_components)
1894 return;
1895
1896 if (dispatch_width == 16) {
1897 fail("Pull constants not supported in SIMD16\n");
1898 return;
1899 }
1900
1901 /* Just demote the end of the list. We could probably do better
1902 * here, demoting things that are rarely used in the program first.
1903 */
1904 unsigned int pull_uniform_base = max_uniform_components;
1905
1906 int pull_constant_loc[c->prog_data.nr_params];
1907 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1908 if (i < pull_uniform_base) {
1909 pull_constant_loc[i] = -1;
1910 } else {
1911 pull_constant_loc[i] = -1;
1912 /* If our constant is already being uploaded for reladdr purposes,
1913 * reuse it.
1914 */
1915 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1916 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1917 pull_constant_loc[i] = j;
1918 break;
1919 }
1920 }
1921 if (pull_constant_loc[i] == -1) {
1922 int pull_index = c->prog_data.nr_pull_params++;
1923 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1924 	    pull_constant_loc[i] = pull_index;
1925 }
1926 }
1927 }
1928 c->prog_data.nr_params = pull_uniform_base;
1929
1930 foreach_list(node, &this->instructions) {
1931 fs_inst *inst = (fs_inst *)node;
1932
1933 for (int i = 0; i < 3; i++) {
1934 if (inst->src[i].file != UNIFORM)
1935 continue;
1936
1937 int pull_index = pull_constant_loc[inst->src[i].reg +
1938 inst->src[i].reg_offset];
1939 if (pull_index == -1)
1940 continue;
1941
1942 assert(!inst->src[i].reladdr);
1943
1944 fs_reg dst = fs_reg(this, glsl_type::float_type);
1945 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1946 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1947 fs_inst *pull =
1948 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1949 dst, index, offset);
1950 pull->ir = inst->ir;
1951 pull->annotation = inst->annotation;
1952
1953 inst->insert_before(pull);
1954
1955 inst->src[i].file = GRF;
1956 inst->src[i].reg = dst.reg;
1957 inst->src[i].reg_offset = 0;
1958 inst->src[i].smear = pull_index & 3;
1959 }
1960 }
1961 }
1962
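/* A few illustrative rewrites performed below (vgrf numbers are made up):
 *
 *    mul vgrf4:F, vgrf3:F, 1.0f   ->  mov vgrf4:F, vgrf3:F
 *    mul vgrf4:F, vgrf3:F, 0.0f   ->  mov vgrf4:F, 0.0f
 *    add vgrf4:F, vgrf3:F, 0.0f   ->  mov vgrf4:F, vgrf3:F
 *    or  vgrf4,   vgrf3,   vgrf3  ->  mov vgrf4,   vgrf3
 */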
1963 bool
1964 fs_visitor::opt_algebraic()
1965 {
1966 bool progress = false;
1967
1968 foreach_list(node, &this->instructions) {
1969 fs_inst *inst = (fs_inst *)node;
1970
1971 switch (inst->opcode) {
1972 case BRW_OPCODE_MUL:
1973 if (inst->src[1].file != IMM)
1974 continue;
1975
1976 /* a * 1.0 = a */
1977 if (inst->src[1].is_one()) {
1978 inst->opcode = BRW_OPCODE_MOV;
1979 inst->src[1] = reg_undef;
1980 progress = true;
1981 break;
1982 }
1983
1984 /* a * 0.0 = 0.0 */
1985 if (inst->src[1].is_zero()) {
1986 inst->opcode = BRW_OPCODE_MOV;
1987 inst->src[0] = inst->src[1];
1988 inst->src[1] = reg_undef;
1989 progress = true;
1990 break;
1991 }
1992
1993 break;
1994 case BRW_OPCODE_ADD:
1995 if (inst->src[1].file != IMM)
1996 continue;
1997
1998 /* a + 0.0 = a */
1999 if (inst->src[1].is_zero()) {
2000 inst->opcode = BRW_OPCODE_MOV;
2001 inst->src[1] = reg_undef;
2002 progress = true;
2003 break;
2004 }
2005 break;
2006 case BRW_OPCODE_OR:
2007 if (inst->src[0].equals(inst->src[1])) {
2008 inst->opcode = BRW_OPCODE_MOV;
2009 inst->src[1] = reg_undef;
2010 progress = true;
2011 break;
2012 }
2013 break;
2014 case BRW_OPCODE_LRP:
2015 if (inst->src[1].equals(inst->src[2])) {
2016 inst->opcode = BRW_OPCODE_MOV;
2017 inst->src[0] = inst->src[1];
2018 inst->src[1] = reg_undef;
2019 inst->src[2] = reg_undef;
2020 progress = true;
2021 break;
2022 }
2023 break;
2024 case BRW_OPCODE_SEL:
2025 if (inst->saturate && inst->src[1].file == IMM) {
2026 switch (inst->conditional_mod) {
2027 case BRW_CONDITIONAL_LE:
2028 case BRW_CONDITIONAL_L:
2029 switch (inst->src[1].type) {
2030 case BRW_REGISTER_TYPE_F:
2031 if (inst->src[1].imm.f >= 1.0f) {
2032 inst->opcode = BRW_OPCODE_MOV;
2033 inst->src[1] = reg_undef;
2034 progress = true;
2035 }
2036 break;
2037 default:
2038 break;
2039 }
2040 break;
2041 case BRW_CONDITIONAL_GE:
2042 case BRW_CONDITIONAL_G:
2043 switch (inst->src[1].type) {
2044 case BRW_REGISTER_TYPE_F:
2045 if (inst->src[1].imm.f <= 0.0f) {
2046 inst->opcode = BRW_OPCODE_MOV;
2047 inst->src[1] = reg_undef;
2048 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2049 progress = true;
2050 }
2051 break;
2052 default:
2053 break;
2054 }
2055 default:
2056 break;
2057 }
2058 }
2059 break;
2060 default:
2061 break;
2062 }
2063 }
2064
2065 return progress;
2066 }
2067
2068 /**
2069 * Removes any instructions writing a VGRF where that VGRF is not used by any
2070 * later instruction.
2071 */
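/* Illustrative sketch (made-up registers): if vgrf7 is never read after
 *
 *    add vgrf7:F, vgrf2:F, vgrf3:F
 *
 * and the ADD has no side effects, the instruction is removed outright.
 * A dead MACH/ADDC/SUBB instead gets a null destination, since its implicit
 * accumulator write must be preserved.
 */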
2072 bool
2073 fs_visitor::dead_code_eliminate()
2074 {
2075 bool progress = false;
2076 int pc = 0;
2077
2078 calculate_live_intervals();
2079
2080 foreach_list_safe(node, &this->instructions) {
2081 fs_inst *inst = (fs_inst *)node;
2082
2083 if (inst->dst.file == GRF && !inst->has_side_effects()) {
2084 bool dead = true;
2085
2086 for (int i = 0; i < inst->regs_written; i++) {
2087 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2088 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2089 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2090 dead = false;
2091 break;
2092 }
2093 }
2094
2095 if (dead) {
2096 /* Don't dead code eliminate instructions that write to the
2097 * accumulator as a side-effect. Instead just set the destination
2098 * to the null register to free it.
2099 */
2100 switch (inst->opcode) {
2101 case BRW_OPCODE_ADDC:
2102 case BRW_OPCODE_SUBB:
2103 case BRW_OPCODE_MACH:
2104 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2105 break;
2106 default:
2107 inst->remove();
2108 progress = true;
2109 break;
2110 }
2111 }
2112 }
2113
2114 pc++;
2115 }
2116
2117 if (progress)
2118 invalidate_live_intervals();
2119
2120 return progress;
2121 }
2122
2123 struct dead_code_hash_key
2124 {
2125 int vgrf;
2126 int reg_offset;
2127 };
2128
2129 static bool
2130 dead_code_hash_compare(const void *a, const void *b)
2131 {
2132 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2133 }
2134
2135 static void
2136 clear_dead_code_hash(struct hash_table *ht)
2137 {
2138 struct hash_entry *entry;
2139
2140 hash_table_foreach(ht, entry) {
2141 _mesa_hash_table_remove(ht, entry);
2142 }
2143 }
2144
2145 static void
2146 insert_dead_code_hash(struct hash_table *ht,
2147 int vgrf, int reg_offset, fs_inst *inst)
2148 {
2149 /* We don't bother freeing keys, because they'll be GCed with the ht. */
2150 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2151
2152 key->vgrf = vgrf;
2153 key->reg_offset = reg_offset;
2154
2155 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2156 }
2157
2158 static struct hash_entry *
2159 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2160 {
2161 struct dead_code_hash_key key;
2162
2163 key.vgrf = vgrf;
2164 key.reg_offset = reg_offset;
2165
2166 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2167 }
2168
2169 static void
2170 remove_dead_code_hash(struct hash_table *ht,
2171 int vgrf, int reg_offset)
2172 {
2173 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2174 if (!entry)
2175 return;
2176
2177 _mesa_hash_table_remove(ht, entry);
2178 }
2179
2180 /**
2181 * Walks basic blocks, removing any regs that are written but not read before
2182 * being redefined.
2183 *
2184 * The dead_code_eliminate() function implements a global dead code
2185 * elimination, but it only handles removing the last write to a register
2186 * if it's never read. This one can handle intermediate writes, but only
2187 * within a basic block.
2188 */
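/* Illustrative sketch (made-up registers) of what this pass can catch that
 * the global pass above cannot:
 *
 *    mov vgrf6:F, 1.0f     <- overwritten below without ever being read
 *    mov vgrf6:F, 2.0f
 *    add vgrf7:F, vgrf6:F, vgrf5:F
 *
 * The first MOV is removed, since the second MOV fully rewrites the channel
 * within the same basic block.
 */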
2189 bool
2190 fs_visitor::dead_code_eliminate_local()
2191 {
2192 struct hash_table *ht;
2193 bool progress = false;
2194
2195 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2196
2197 if (ht == NULL) {
2198 return false;
2199 }
2200
2201 foreach_list_safe(node, &this->instructions) {
2202 fs_inst *inst = (fs_inst *)node;
2203
2204       /* At a basic block boundary, empty the HT since we don't track
2205        * dataflow across blocks.
2206 */
2207 if (inst->is_control_flow()) {
2208 clear_dead_code_hash(ht);
2209 continue;
2210 }
2211
2212 /* Clear the HT of any instructions that got read. */
2213 for (int i = 0; i < 3; i++) {
2214 fs_reg src = inst->src[i];
2215 if (src.file != GRF)
2216 continue;
2217
2218 int read = 1;
2219 if (inst->is_send_from_grf())
2220 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2221
2222 for (int reg_offset = src.reg_offset;
2223 reg_offset < src.reg_offset + read;
2224 reg_offset++) {
2225 remove_dead_code_hash(ht, src.reg, reg_offset);
2226 }
2227 }
2228
2229 /* Add any update of a GRF to the HT, removing a previous write if it
2230 * wasn't read.
2231 */
2232 if (inst->dst.file == GRF) {
2233 if (inst->regs_written > 1) {
2234 /* We don't know how to trim channels from an instruction's
2235 * writes, so we can't incrementally remove unread channels from
2236             * it.  Just remove whatever it overwrites from the table.
2237 */
2238 for (int i = 0; i < inst->regs_written; i++) {
2239 remove_dead_code_hash(ht,
2240 inst->dst.reg,
2241 inst->dst.reg_offset + i);
2242 }
2243 } else {
2244 struct hash_entry *entry =
2245 get_dead_code_hash_entry(ht, inst->dst.reg,
2246 inst->dst.reg_offset);
2247
2248 if (entry) {
2249 if (inst->is_partial_write()) {
2250 /* For a partial write, we can't remove any previous dead code
2251 * candidate, since we're just modifying their result.
2252 */
2253 } else {
2254 /* We're completely updating a channel, and there was a
2255 * previous write to the channel that wasn't read. Kill it!
2256 */
2257 fs_inst *inst = (fs_inst *)entry->data;
2258 inst->remove();
2259 progress = true;
2260 }
2261
2262 _mesa_hash_table_remove(ht, entry);
2263 }
2264
2265 if (!inst->has_side_effects())
2266 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2267 inst);
2268 }
2269 }
2270 }
2271
2272 _mesa_hash_table_destroy(ht, NULL);
2273
2274 if (progress)
2275 invalidate_live_intervals();
2276
2277 return progress;
2278 }
2279
2280 /**
2281 * Implements register coalescing: Checks if the two registers involved in a
2282 * raw move don't interfere, in which case they can both be stored in the same
2283 * place and the MOV removed.
2284 *
2285 * To do this, all uses of the source of the MOV in the shader are replaced
2286 * with the destination of the MOV. For example:
2287 *
2288 * add vgrf3:F, vgrf1:F, vgrf2:F
2289 * mov vgrf4:F, vgrf3:F
2290 * mul vgrf5:F, vgrf5:F, vgrf4:F
2291 *
2292 * becomes
2293 *
2294 * add vgrf4:F, vgrf1:F, vgrf2:F
2295 * mul vgrf5:F, vgrf5:F, vgrf4:F
2296 */
2297 bool
2298 fs_visitor::register_coalesce()
2299 {
2300 bool progress = false;
2301
2302 calculate_live_intervals();
2303
2304 int src_size = 0;
2305 int channels_remaining = 0;
2306 int reg_from = -1, reg_to = -1;
2307 int reg_to_offset[MAX_SAMPLER_MESSAGE_SIZE];
2308 fs_inst *mov[MAX_SAMPLER_MESSAGE_SIZE];
2309
2310 foreach_list(node, &this->instructions) {
2311 fs_inst *inst = (fs_inst *)node;
2312
2313 if (inst->opcode != BRW_OPCODE_MOV ||
2314 inst->is_partial_write() ||
2315 inst->saturate ||
2316 inst->src[0].file != GRF ||
2317 inst->src[0].negate ||
2318 inst->src[0].abs ||
2319 inst->src[0].smear != -1 ||
2320 inst->dst.file != GRF ||
2321 inst->dst.type != inst->src[0].type) {
2322 continue;
2323 }
2324
2325 if (virtual_grf_sizes[inst->src[0].reg] >
2326 virtual_grf_sizes[inst->dst.reg])
2327 continue;
2328
2329 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2330 int var_to = live_intervals->var_from_reg(&inst->dst);
2331
2332 if (live_intervals->vars_interfere(var_from, var_to) &&
2333 !inst->dst.equals(inst->src[0])) {
2334
2335 /* We know that the live ranges of A (var_from) and B (var_to)
2336 * interfere because of the ->vars_interfere() call above. If the end
2337 * of B's live range is after the end of A's range, then we know two
2338 * things:
2339 * - the start of B's live range must be in A's live range (since we
2340 * already know the two ranges interfere, this is the only remaining
2341 * possibility)
2342 * - the interference isn't of the form we're looking for (where B is
2343 * entirely inside A)
2344 */
2345 if (live_intervals->end[var_to] > live_intervals->end[var_from])
2346 continue;
2347
2348 bool overwritten = false;
2349 int scan_ip = -1;
2350
2351 foreach_list(n, &this->instructions) {
2352 fs_inst *scan_inst = (fs_inst *)n;
2353 scan_ip++;
2354
2355 if (scan_inst->is_control_flow()) {
2356 overwritten = true;
2357 break;
2358 }
2359
2360 if (scan_ip <= live_intervals->start[var_to])
2361 continue;
2362
2363 if (scan_ip > live_intervals->end[var_to])
2364 break;
2365
2366 if (scan_inst->dst.equals(inst->dst) ||
2367 scan_inst->dst.equals(inst->src[0])) {
2368 overwritten = true;
2369 break;
2370 }
2371 }
2372
2373 if (overwritten)
2374 continue;
2375 }
2376
2377 if (reg_from != inst->src[0].reg) {
2378 reg_from = inst->src[0].reg;
2379
2380 src_size = virtual_grf_sizes[inst->src[0].reg];
2381 assert(src_size <= MAX_SAMPLER_MESSAGE_SIZE);
2382
2383 channels_remaining = src_size;
2384 memset(mov, 0, sizeof(mov));
2385
2386 reg_to = inst->dst.reg;
2387 }
2388
2389 if (reg_to != inst->dst.reg)
2390 continue;
2391
2392 const int offset = inst->src[0].reg_offset;
2393 reg_to_offset[offset] = inst->dst.reg_offset;
2394 mov[offset] = inst;
2395 channels_remaining--;
2396
2397 if (channels_remaining)
2398 continue;
2399
2400 bool removed = false;
2401 for (int i = 0; i < src_size; i++) {
2402 if (mov[i]) {
2403 removed = true;
2404
2405 mov[i]->opcode = BRW_OPCODE_NOP;
2406 mov[i]->conditional_mod = BRW_CONDITIONAL_NONE;
2407 mov[i]->dst = reg_undef;
2408 mov[i]->src[0] = reg_undef;
2409 mov[i]->src[1] = reg_undef;
2410 mov[i]->src[2] = reg_undef;
2411 }
2412 }
2413
2414 foreach_list(node, &this->instructions) {
2415 fs_inst *scan_inst = (fs_inst *)node;
2416
2417 for (int i = 0; i < src_size; i++) {
2418 if (mov[i]) {
2419 if (scan_inst->dst.file == GRF &&
2420 scan_inst->dst.reg == reg_from &&
2421 scan_inst->dst.reg_offset == i) {
2422 scan_inst->dst.reg = reg_to;
2423 scan_inst->dst.reg_offset = reg_to_offset[i];
2424 }
2425 for (int j = 0; j < 3; j++) {
2426 if (scan_inst->src[j].file == GRF &&
2427 scan_inst->src[j].reg == reg_from &&
2428 scan_inst->src[j].reg_offset == i) {
2429 scan_inst->src[j].reg = reg_to;
2430 scan_inst->src[j].reg_offset = reg_to_offset[i];
2431 }
2432 }
2433 }
2434 }
2435 }
2436
2437 if (removed) {
2438 live_intervals->start[var_to] = MIN2(live_intervals->start[var_to],
2439 live_intervals->start[var_from]);
2440 live_intervals->end[var_to] = MAX2(live_intervals->end[var_to],
2441 live_intervals->end[var_from]);
2442 reg_from = -1;
2443 }
2444 }
2445
2446 foreach_list_safe(node, &this->instructions) {
2447 fs_inst *inst = (fs_inst *)node;
2448
2449 if (inst->opcode == BRW_OPCODE_NOP) {
2450 inst->remove();
2451 progress = true;
2452 }
2453 }
2454
2455 if (progress)
2456 invalidate_live_intervals();
2457
2458 return progress;
2459 }
2460
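/* Sketch of the transformation below (made-up registers):
 *
 *    add vgrf7:F, vgrf3:F, vgrf4:F
 *    mov m4:F, vgrf7:F
 *
 * becomes, when vgrf7 has no later readers and nothing else interferes,
 *
 *    add m4:F, vgrf3:F, vgrf4:F
 */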
2461 bool
2462 fs_visitor::compute_to_mrf()
2463 {
2464 bool progress = false;
2465 int next_ip = 0;
2466
2467 calculate_live_intervals();
2468
2469 foreach_list_safe(node, &this->instructions) {
2470 fs_inst *inst = (fs_inst *)node;
2471
2472 int ip = next_ip;
2473 next_ip++;
2474
2475 if (inst->opcode != BRW_OPCODE_MOV ||
2476 inst->is_partial_write() ||
2477 inst->dst.file != MRF || inst->src[0].file != GRF ||
2478 inst->dst.type != inst->src[0].type ||
2479 inst->src[0].abs || inst->src[0].negate ||
2480 inst->src[0].smear != -1 || inst->src[0].subreg_offset)
2481 continue;
2482
2483 /* Work out which hardware MRF registers are written by this
2484 * instruction.
2485 */
2486 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2487 int mrf_high;
2488 if (inst->dst.reg & BRW_MRF_COMPR4) {
2489 mrf_high = mrf_low + 4;
2490 } else if (dispatch_width == 16 &&
2491 (!inst->force_uncompressed && !inst->force_sechalf)) {
2492 mrf_high = mrf_low + 1;
2493 } else {
2494 mrf_high = mrf_low;
2495 }
2496
2497 /* Can't compute-to-MRF this GRF if someone else was going to
2498 * read it later.
2499 */
2500 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2501 continue;
2502
2503 /* Found a move of a GRF to a MRF. Let's see if we can go
2504 * rewrite the thing that made this GRF to write into the MRF.
2505 */
2506 fs_inst *scan_inst;
2507 for (scan_inst = (fs_inst *)inst->prev;
2508 scan_inst->prev != NULL;
2509 scan_inst = (fs_inst *)scan_inst->prev) {
2510 if (scan_inst->dst.file == GRF &&
2511 scan_inst->dst.reg == inst->src[0].reg) {
2512 	    /* Found the last instruction to write the reg we want to turn
2513 * into a compute-to-MRF.
2514 */
2515
2516 /* If this one instruction didn't populate all the
2517 * channels, bail. We might be able to rewrite everything
2518 * that writes that reg, but it would require smarter
2519 * tracking to delay the rewriting until complete success.
2520 */
2521 if (scan_inst->is_partial_write())
2522 break;
2523
2524 /* Things returning more than one register would need us to
2525 * understand coalescing out more than one MOV at a time.
2526 */
2527 if (scan_inst->regs_written > 1)
2528 break;
2529
2530 /* SEND instructions can't have MRF as a destination. */
2531 if (scan_inst->mlen)
2532 break;
2533
2534 if (brw->gen == 6) {
2535 /* gen6 math instructions must have the destination be
2536 * GRF, so no compute-to-MRF for them.
2537 */
2538 if (scan_inst->is_math()) {
2539 break;
2540 }
2541 }
2542
2543 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2544 /* Found the creator of our MRF's source value. */
2545 scan_inst->dst.file = MRF;
2546 scan_inst->dst.reg = inst->dst.reg;
2547 scan_inst->saturate |= inst->saturate;
2548 inst->remove();
2549 progress = true;
2550 }
2551 break;
2552 }
2553
2554 	 * We don't handle control flow here.  Most computations of
2555 	 * values that end up in MRFs happen shortly before the MRF
2556 * write anyway.
2557 */
2558 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2559 break;
2560
2561 /* You can't read from an MRF, so if someone else reads our
2562 * MRF's source GRF that we wanted to rewrite, that stops us.
2563 */
2564 bool interfered = false;
2565 for (int i = 0; i < 3; i++) {
2566 if (scan_inst->src[i].file == GRF &&
2567 scan_inst->src[i].reg == inst->src[0].reg &&
2568 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2569 interfered = true;
2570 }
2571 }
2572 if (interfered)
2573 break;
2574
2575 if (scan_inst->dst.file == MRF) {
2576 /* If somebody else writes our MRF here, we can't
2577 * compute-to-MRF before that.
2578 */
2579 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2580 int scan_mrf_high;
2581
2582 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2583 scan_mrf_high = scan_mrf_low + 4;
2584 } else if (dispatch_width == 16 &&
2585 (!scan_inst->force_uncompressed &&
2586 !scan_inst->force_sechalf)) {
2587 scan_mrf_high = scan_mrf_low + 1;
2588 } else {
2589 scan_mrf_high = scan_mrf_low;
2590 }
2591
2592 if (mrf_low == scan_mrf_low ||
2593 mrf_low == scan_mrf_high ||
2594 mrf_high == scan_mrf_low ||
2595 mrf_high == scan_mrf_high) {
2596 break;
2597 }
2598 }
2599
2600 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2601 /* Found a SEND instruction, which means that there are
2602 * live values in MRFs from base_mrf to base_mrf +
2603 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2604 * above it.
2605 */
2606 if (mrf_low >= scan_inst->base_mrf &&
2607 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2608 break;
2609 }
2610 if (mrf_high >= scan_inst->base_mrf &&
2611 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2612 break;
2613 }
2614 }
2615 }
2616 }
2617
2618 if (progress)
2619 invalidate_live_intervals();
2620
2621 return progress;
2622 }
2623
2624 /**
2625 * Walks through basic blocks, looking for repeated MRF writes and
2626 * removing the later ones.
2627 */
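/* Sketch of the redundancy removed below (made-up registers):
 *
 *    mov m2:F, vgrf5:F
 *    ... instructions that touch neither m2 nor vgrf5 ...
 *    mov m2:F, vgrf5:F     <- identical MRF write in the same block, removed
 */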
2628 bool
2629 fs_visitor::remove_duplicate_mrf_writes()
2630 {
2631 fs_inst *last_mrf_move[16];
2632 bool progress = false;
2633
2634 /* Need to update the MRF tracking for compressed instructions. */
2635 if (dispatch_width == 16)
2636 return false;
2637
2638 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2639
2640 foreach_list_safe(node, &this->instructions) {
2641 fs_inst *inst = (fs_inst *)node;
2642
2643 if (inst->is_control_flow()) {
2644 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2645 }
2646
2647 if (inst->opcode == BRW_OPCODE_MOV &&
2648 inst->dst.file == MRF) {
2649 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2650 if (prev_inst && inst->equals(prev_inst)) {
2651 inst->remove();
2652 progress = true;
2653 continue;
2654 }
2655 }
2656
2657 /* Clear out the last-write records for MRFs that were overwritten. */
2658 if (inst->dst.file == MRF) {
2659 last_mrf_move[inst->dst.reg] = NULL;
2660 }
2661
2662 if (inst->mlen > 0 && inst->base_mrf != -1) {
2663 /* Found a SEND instruction, which will include two or fewer
2664 * implied MRF writes. We could do better here.
2665 */
2666 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2667 last_mrf_move[inst->base_mrf + i] = NULL;
2668 }
2669 }
2670
2671 /* Clear out any MRF move records whose sources got overwritten. */
2672 if (inst->dst.file == GRF) {
2673 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2674 if (last_mrf_move[i] &&
2675 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2676 last_mrf_move[i] = NULL;
2677 }
2678 }
2679 }
2680
2681 if (inst->opcode == BRW_OPCODE_MOV &&
2682 inst->dst.file == MRF &&
2683 inst->src[0].file == GRF &&
2684 !inst->is_partial_write()) {
2685 last_mrf_move[inst->dst.reg] = inst;
2686 }
2687 }
2688
2689 if (progress)
2690 invalidate_live_intervals();
2691
2692 return progress;
2693 }
2694
2695 static void
2696 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2697 int first_grf, int grf_len)
2698 {
2699 bool inst_simd16 = (dispatch_width > 8 &&
2700 !inst->force_uncompressed &&
2701 !inst->force_sechalf);
2702
2703 /* Clear the flag for registers that actually got read (as expected). */
2704 for (int i = 0; i < 3; i++) {
2705 int grf;
2706 if (inst->src[i].file == GRF) {
2707 grf = inst->src[i].reg;
2708 } else if (inst->src[i].file == HW_REG &&
2709 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2710 grf = inst->src[i].fixed_hw_reg.nr;
2711 } else {
2712 continue;
2713 }
2714
2715 if (grf >= first_grf &&
2716 grf < first_grf + grf_len) {
2717 deps[grf - first_grf] = false;
2718 if (inst_simd16)
2719 deps[grf - first_grf + 1] = false;
2720 }
2721 }
2722 }
2723
2724 /**
2725 * Implements this workaround for the original 965:
2726 *
2727 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2728 * check for post destination dependencies on this instruction, software
2729 * must ensure that there is no destination hazard for the case of ‘write
2730 * followed by a posted write’ shown in the following example.
2731 *
2732 * 1. mov r3 0
2733 * 2. send r3.xy <rest of send instruction>
2734 * 3. mov r2 r3
2735 *
2736 * Due to no post-destination dependency check on the ‘send’, the above
2737 * code sequence could have two instructions (1 and 2) in flight at the
2738 * same time that both consider ‘r3’ as the target of their final writes.
2739 */
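/* The pass below resolves that hazard by inserting a dependency-resolving
 * MOV (DEP_RESOLVE_MOV) that sources the affected register right before the
 * send, so the earlier write must complete first.  Roughly (the register
 * numbers are illustrative and the exact form of the resolve MOV is elided):
 *
 *    1. mov r3 0
 *       mov <resolve> r3        <- inserted before the send
 *    2. send r3.xy <rest of send instruction>
 *    3. mov r2 r3
 */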
2740 void
2741 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2742 {
2743 int reg_size = dispatch_width / 8;
2744 int write_len = inst->regs_written * reg_size;
2745 int first_write_grf = inst->dst.reg;
2746 bool needs_dep[BRW_MAX_MRF];
2747 assert(write_len < (int)sizeof(needs_dep) - 1);
2748
2749 memset(needs_dep, false, sizeof(needs_dep));
2750 memset(needs_dep, true, write_len);
2751
2752 clear_deps_for_inst_src(inst, dispatch_width,
2753 needs_dep, first_write_grf, write_len);
2754
2755 /* Walk backwards looking for writes to registers we're writing which
2756 * aren't read since being written. If we hit the start of the program,
2757 * we assume that there are no outstanding dependencies on entry to the
2758 * program.
2759 */
2760 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2761 scan_inst != NULL;
2762 scan_inst = (fs_inst *)scan_inst->prev) {
2763
2764 /* If we hit control flow, assume that there *are* outstanding
2765 * dependencies, and force their cleanup before our instruction.
2766 */
2767 if (scan_inst->is_control_flow()) {
2768 for (int i = 0; i < write_len; i++) {
2769 if (needs_dep[i]) {
2770 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2771 }
2772 }
2773 return;
2774 }
2775
2776 bool scan_inst_simd16 = (dispatch_width > 8 &&
2777 !scan_inst->force_uncompressed &&
2778 !scan_inst->force_sechalf);
2779
2780 /* We insert our reads as late as possible on the assumption that any
2781 * instruction but a MOV that might have left us an outstanding
2782 * dependency has more latency than a MOV.
2783 */
2784 if (scan_inst->dst.file == GRF) {
2785 for (int i = 0; i < scan_inst->regs_written; i++) {
2786 int reg = scan_inst->dst.reg + i * reg_size;
2787
2788 if (reg >= first_write_grf &&
2789 reg < first_write_grf + write_len &&
2790 needs_dep[reg - first_write_grf]) {
2791 inst->insert_before(DEP_RESOLVE_MOV(reg));
2792 needs_dep[reg - first_write_grf] = false;
2793 if (scan_inst_simd16)
2794 needs_dep[reg - first_write_grf + 1] = false;
2795 }
2796 }
2797 }
2798
2799 /* Clear the flag for registers that actually got read (as expected). */
2800 clear_deps_for_inst_src(scan_inst, dispatch_width,
2801 needs_dep, first_write_grf, write_len);
2802
2803 /* Continue the loop only if we haven't resolved all the dependencies */
2804 int i;
2805 for (i = 0; i < write_len; i++) {
2806 if (needs_dep[i])
2807 break;
2808 }
2809 if (i == write_len)
2810 return;
2811 }
2812 }
2813
2814 /**
2815 * Implements this workaround for the original 965:
2816 *
2817 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2818 * used as a destination register until after it has been sourced by an
2819 * instruction with a different destination register.
2820 */
2821 void
2822 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2823 {
2824 int write_len = inst->regs_written * dispatch_width / 8;
2825 int first_write_grf = inst->dst.reg;
2826 bool needs_dep[BRW_MAX_MRF];
2827 assert(write_len < (int)sizeof(needs_dep) - 1);
2828
2829 memset(needs_dep, false, sizeof(needs_dep));
2830 memset(needs_dep, true, write_len);
2831 /* Walk forwards looking for writes to registers we're writing which aren't
2832 * read before being written.
2833 */
2834 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2835 !scan_inst->is_tail_sentinel();
2836 scan_inst = (fs_inst *)scan_inst->next) {
2837 /* If we hit control flow, force resolve all remaining dependencies. */
2838 if (scan_inst->is_control_flow()) {
2839 for (int i = 0; i < write_len; i++) {
2840 if (needs_dep[i])
2841 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2842 }
2843 return;
2844 }
2845
2846 /* Clear the flag for registers that actually got read (as expected). */
2847 clear_deps_for_inst_src(scan_inst, dispatch_width,
2848 needs_dep, first_write_grf, write_len);
2849
2850 /* We insert our reads as late as possible since they're reading the
2851 * result of a SEND, which has massive latency.
2852 */
2853 if (scan_inst->dst.file == GRF &&
2854 scan_inst->dst.reg >= first_write_grf &&
2855 scan_inst->dst.reg < first_write_grf + write_len &&
2856 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2857 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2858 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2859 }
2860
2861 /* Continue the loop only if we haven't resolved all the dependencies */
2862 int i;
2863 for (i = 0; i < write_len; i++) {
2864 if (needs_dep[i])
2865 break;
2866 }
2867 if (i == write_len)
2868 return;
2869 }
2870
2871 /* If we hit the end of the program, resolve all remaining dependencies out
2872 * of paranoia.
2873 */
2874 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2875 assert(last_inst->eot);
2876 for (int i = 0; i < write_len; i++) {
2877 if (needs_dep[i])
2878 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2879 }
2880 }
2881
2882 void
2883 fs_visitor::insert_gen4_send_dependency_workarounds()
2884 {
2885 if (brw->gen != 4 || brw->is_g4x)
2886 return;
2887
2888 /* Note that we're done with register allocation, so GRF fs_regs always
2889 * have a .reg_offset of 0.
2890 */
2891
2892 foreach_list_safe(node, &this->instructions) {
2893 fs_inst *inst = (fs_inst *)node;
2894
2895 if (inst->mlen != 0 && inst->dst.file == GRF) {
2896 insert_gen4_pre_send_dependency_workarounds(inst);
2897 insert_gen4_post_send_dependency_workarounds(inst);
2898 }
2899 }
2900 }
2901
2902 /**
2903 * Turns the generic expression-style uniform pull constant load instruction
2904 * into a hardware-specific series of instructions for loading a pull
2905 * constant.
2906 *
2907 * The expression style allows the CSE pass before this to optimize out
2908 * repeated loads from the same offset, and gives the pre-register-allocation
2909 * scheduling full flexibility, while the conversion to native instructions
2910 * allows the post-register-allocation scheduler the best information
2911 * possible.
2912 *
2913 * Note that execution masking for setting up pull constant loads is special:
2914 * the channels that need to be written are unrelated to the current execution
2915 * mask, since a later instruction will use one of the result channels as a
2916 * source operand for all 8 or 16 of its channels.
2917 */
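/* Rough sketch of the gen7 lowering done below (made-up registers, and the
 * mnemonics are only approximate):
 *
 *    uniform_pull_const_load vgrf6:F, surf_index, 16u
 *
 * becomes
 *
 *    set_simd4x2_offset vgrf7:UD, 4u           <- 16 bytes / 4 = dword offset
 *    uniform_pull_const_load_gen7 vgrf6:F, surf_index, vgrf7:UD
 *
 * On older gens the instruction instead just gets base_mrf/mlen assigned so
 * the generator can emit the message payload from MRF space.
 */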
2918 void
2919 fs_visitor::lower_uniform_pull_constant_loads()
2920 {
2921 foreach_list(node, &this->instructions) {
2922 fs_inst *inst = (fs_inst *)node;
2923
2924 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2925 continue;
2926
2927 if (brw->gen >= 7) {
2928 /* The offset arg before was a vec4-aligned byte offset. We need to
2929 * turn it into a dword offset.
2930 */
2931 fs_reg const_offset_reg = inst->src[1];
2932 assert(const_offset_reg.file == IMM &&
2933 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2934 const_offset_reg.imm.u /= 4;
2935 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2936
2937 /* This is actually going to be a MOV, but since only the first dword
2938 * is accessed, we have a special opcode to do just that one. Note
2939 * that this needs to be an operation that will be considered a def
2940 * by live variable analysis, or register allocation will explode.
2941 */
2942 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2943 payload, const_offset_reg);
2944 setup->force_writemask_all = true;
2945
2946 setup->ir = inst->ir;
2947 setup->annotation = inst->annotation;
2948 inst->insert_before(setup);
2949
2950 /* Similarly, this will only populate the first 4 channels of the
2951 * result register (since we only use smear values from 0-3), but we
2952 * don't tell the optimizer.
2953 */
2954 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2955 inst->src[1] = payload;
2956
2957 invalidate_live_intervals();
2958 } else {
2959 /* Before register allocation, we didn't tell the scheduler about the
2960 * MRF we use. We know it's safe to use this MRF because nothing
2961 * else does except for register spill/unspill, which generates and
2962 * uses its MRF within a single IR instruction.
2963 */
2964 inst->base_mrf = 14;
2965 inst->mlen = 1;
2966 }
2967 }
2968 }
2969
2970 void
2971 fs_visitor::dump_instructions()
2972 {
2973 calculate_register_pressure();
2974
2975 int ip = 0, max_pressure = 0;
2976 foreach_list(node, &this->instructions) {
2977 backend_instruction *inst = (backend_instruction *)node;
2978 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2979 printf("{%3d} %4d: ", regs_live_at_ip[ip], ip);
2980 dump_instruction(inst);
2981 ++ip;
2982 }
2983 printf("Maximum %3d registers live at once.\n", max_pressure);
2984 }
2985
2986 void
2987 fs_visitor::dump_instruction(backend_instruction *be_inst)
2988 {
2989 fs_inst *inst = (fs_inst *)be_inst;
2990
2991 if (inst->predicate) {
2992 printf("(%cf0.%d) ",
2993 inst->predicate_inverse ? '-' : '+',
2994 inst->flag_subreg);
2995 }
2996
2997 printf("%s", brw_instruction_name(inst->opcode));
2998 if (inst->saturate)
2999 printf(".sat");
3000 if (inst->conditional_mod) {
3001 printf("%s", conditional_modifier[inst->conditional_mod]);
3002 if (!inst->predicate &&
3003 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3004 inst->opcode != BRW_OPCODE_IF &&
3005 inst->opcode != BRW_OPCODE_WHILE))) {
3006 printf(".f0.%d", inst->flag_subreg);
3007 }
3008 }
3009 printf(" ");
3010
3011
3012 switch (inst->dst.file) {
3013 case GRF:
3014 printf("vgrf%d", inst->dst.reg);
3015 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
3016 inst->dst.subreg_offset)
3017 printf("+%d.%d", inst->dst.reg_offset, inst->dst.subreg_offset);
3018 break;
3019 case MRF:
3020 printf("m%d", inst->dst.reg);
3021 break;
3022 case BAD_FILE:
3023 printf("(null)");
3024 break;
3025 case UNIFORM:
3026 printf("***u%d***", inst->dst.reg);
3027 break;
3028 case HW_REG:
3029 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3030 switch (inst->dst.fixed_hw_reg.nr) {
3031 case BRW_ARF_NULL:
3032 printf("null");
3033 break;
3034 case BRW_ARF_ADDRESS:
3035 printf("a0.%d", inst->dst.fixed_hw_reg.subnr);
3036 break;
3037 case BRW_ARF_ACCUMULATOR:
3038 printf("acc%d", inst->dst.fixed_hw_reg.subnr);
3039 break;
3040 case BRW_ARF_FLAG:
3041 printf("f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3042 inst->dst.fixed_hw_reg.subnr);
3043 break;
3044 default:
3045 printf("arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3046 inst->dst.fixed_hw_reg.subnr);
3047 break;
3048 }
3049 } else {
3050 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
3051 }
3052 if (inst->dst.fixed_hw_reg.subnr)
3053 printf("+%d", inst->dst.fixed_hw_reg.subnr);
3054 break;
3055 default:
3056 printf("???");
3057 break;
3058 }
3059 printf(":%s, ", reg_encoding[inst->dst.type]);
3060
3061 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
3062 if (inst->src[i].negate)
3063 printf("-");
3064 if (inst->src[i].abs)
3065 printf("|");
3066 switch (inst->src[i].file) {
3067 case GRF:
3068 printf("vgrf%d", inst->src[i].reg);
3069 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3070 inst->src[i].subreg_offset)
3071 printf("+%d.%d", inst->src[i].reg_offset,
3072 inst->src[i].subreg_offset);
3073 break;
3074 case MRF:
3075 printf("***m%d***", inst->src[i].reg);
3076 break;
3077 case UNIFORM:
3078 printf("u%d", inst->src[i].reg);
3079 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3080 inst->src[i].subreg_offset)
3081 printf("+%d.%d", inst->src[i].reg_offset,
3082 inst->src[i].subreg_offset);
3083 break;
3084 case BAD_FILE:
3085 printf("(null)");
3086 break;
3087 case IMM:
3088 switch (inst->src[i].type) {
3089 case BRW_REGISTER_TYPE_F:
3090 printf("%ff", inst->src[i].imm.f);
3091 break;
3092 case BRW_REGISTER_TYPE_D:
3093 printf("%dd", inst->src[i].imm.i);
3094 break;
3095 case BRW_REGISTER_TYPE_UD:
3096 printf("%uu", inst->src[i].imm.u);
3097 break;
3098 default:
3099 printf("???");
3100 break;
3101 }
3102 break;
3103 case HW_REG:
3104 if (inst->src[i].fixed_hw_reg.negate)
3105 printf("-");
3106 if (inst->src[i].fixed_hw_reg.abs)
3107 printf("|");
3108 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3109 switch (inst->src[i].fixed_hw_reg.nr) {
3110 case BRW_ARF_NULL:
3111 printf("null");
3112 break;
3113 case BRW_ARF_ADDRESS:
3114 printf("a0.%d", inst->src[i].fixed_hw_reg.subnr);
3115 break;
3116 case BRW_ARF_ACCUMULATOR:
3117 printf("acc%d", inst->src[i].fixed_hw_reg.subnr);
3118 break;
3119 case BRW_ARF_FLAG:
3120 printf("f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3121 inst->src[i].fixed_hw_reg.subnr);
3122 break;
3123 default:
3124 printf("arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3125 inst->src[i].fixed_hw_reg.subnr);
3126 break;
3127 }
3128 } else {
3129 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3130 }
3131 if (inst->src[i].fixed_hw_reg.subnr)
3132 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
3133 if (inst->src[i].fixed_hw_reg.abs)
3134 printf("|");
3135 break;
3136 default:
3137 printf("???");
3138 break;
3139 }
3140 if (inst->src[i].abs)
3141 printf("|");
3142
3143 if (inst->src[i].file != IMM) {
3144 printf(":%s", brw_reg_type_letters(inst->src[i].type));
3145 }
3146
3147 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
3148 printf(", ");
3149 }
3150
3151 printf(" ");
3152
3153 if (inst->force_uncompressed)
3154 printf("1sthalf ");
3155
3156 if (inst->force_sechalf)
3157 printf("2ndhalf ");
3158
3159 printf("\n");
3160 }
3161
3162 /**
3163 * Possibly returns an instruction that set up @param reg.
3164 *
3165 * Sometimes we want to take the result of some expression/variable
3166 * dereference tree and rewrite the instruction generating the result
3167 * of the tree. When processing the tree, we know that the
3168 * instructions generated are all writing temporaries that are dead
3169 * outside of this tree. So, if we have some instructions that write
3170 * a temporary, we're free to point that temp write somewhere else.
3171 *
3172 * Note that this doesn't guarantee that the returned instruction wrote
3173 * only reg -- it might be the size=4 destination of a texture instruction.
3174 */
3175 fs_inst *
3176 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3177 fs_inst *end,
3178 fs_reg reg)
3179 {
3180 if (end == start ||
3181 end->is_partial_write() ||
3182 reg.reladdr ||
3183 !reg.equals(end->dst)) {
3184 return NULL;
3185 } else {
3186 return end;
3187 }
3188 }
3189
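/* Worked example of the payload layout computed below (hypothetical case):
 * a SIMD16 shader using one barycentric mode, source depth/W, and neither
 * the position offset nor the coverage mask would get
 *
 *    2 (masks, pixel X/Y) + 4 (barycentric) + 2 (depth) + 2 (W) = 10
 *
 * payload registers, with the per-item starting register numbers recorded
 * in barycentric_coord_reg[], source_depth_reg and source_w_reg.
 */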
3190 void
3191 fs_visitor::setup_payload_gen6()
3192 {
3193 bool uses_depth =
3194 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3195 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3196
3197 assert(brw->gen >= 6);
3198
3199 /* R0-1: masks, pixel X/Y coordinates. */
3200 c->nr_payload_regs = 2;
3201 /* R2: only for 32-pixel dispatch. */
3202
3203 /* R3-26: barycentric interpolation coordinates. These appear in the
3204 * same order that they appear in the brw_wm_barycentric_interp_mode
3205 * enum. Each set of coordinates occupies 2 registers if dispatch width
3206 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3207 * appear if they were enabled using the "Barycentric Interpolation
3208 * Mode" bits in WM_STATE.
3209 */
3210 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3211 if (barycentric_interp_modes & (1 << i)) {
3212 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3213 c->nr_payload_regs += 2;
3214 if (dispatch_width == 16) {
3215 c->nr_payload_regs += 2;
3216 }
3217 }
3218 }
3219
3220 /* R27: interpolated depth if uses source depth */
3221 if (uses_depth) {
3222 c->source_depth_reg = c->nr_payload_regs;
3223 c->nr_payload_regs++;
3224 if (dispatch_width == 16) {
3225 /* R28: interpolated depth if not SIMD8. */
3226 c->nr_payload_regs++;
3227 }
3228 }
3229 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3230 if (uses_depth) {
3231 c->source_w_reg = c->nr_payload_regs;
3232 c->nr_payload_regs++;
3233 if (dispatch_width == 16) {
3234 /* R30: interpolated W if not SIMD8. */
3235 c->nr_payload_regs++;
3236 }
3237 }
3238
3239 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3240 /* R31: MSAA position offsets. */
3241 if (c->prog_data.uses_pos_offset) {
3242 c->sample_pos_reg = c->nr_payload_regs;
3243 c->nr_payload_regs++;
3244 }
3245
3246 /* R32: MSAA input coverage mask */
3247 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3248 assert(brw->gen >= 7);
3249 c->sample_mask_reg = c->nr_payload_regs;
3250 c->nr_payload_regs++;
3251 if (dispatch_width == 16) {
3252 /* R33: input coverage mask if not SIMD8. */
3253 c->nr_payload_regs++;
3254 }
3255 }
3256
3257 /* R34-: bary for 32-pixel. */
3258 /* R58-59: interp W for 32-pixel. */
3259
3260 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3261 c->source_depth_to_render_target = true;
3262 }
3263 }
3264
3265 void
3266 fs_visitor::assign_binding_table_offsets()
3267 {
3268 uint32_t next_binding_table_offset = 0;
3269
3270 /* If there are no color regions, we still perform an FB write to a null
3271 * renderbuffer, which we place at surface index 0.
3272 */
3273 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3274 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3275
3276 assign_common_binding_table_offsets(next_binding_table_offset);
3277 }
3278
3279 void
3280 fs_visitor::calculate_register_pressure()
3281 {
3282 calculate_live_intervals();
3283
3284 int num_instructions = 0;
3285 foreach_list(node, &this->instructions) {
3286 ++num_instructions;
3287 }
3288
3289 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3290
3291 for (int reg = 0; reg < virtual_grf_count; reg++) {
3292 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3293 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3294 }
3295 }
3296
3297 bool
3298 fs_visitor::run()
3299 {
3300 sanity_param_count = fp->Base.Parameters->NumParameters;
3301 uint32_t orig_nr_params = c->prog_data.nr_params;
3302 bool allocated_without_spills;
3303
3304 assign_binding_table_offsets();
3305
3306 if (brw->gen >= 6)
3307 setup_payload_gen6();
3308 else
3309 setup_payload_gen4();
3310
3311 if (0) {
3312 emit_dummy_fs();
3313 } else {
3314 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3315 emit_shader_time_begin();
3316
3317 calculate_urb_setup();
3318 if (fp->Base.InputsRead > 0) {
3319 if (brw->gen < 6)
3320 emit_interpolation_setup_gen4();
3321 else
3322 emit_interpolation_setup_gen6();
3323 }
3324
3325 /* We handle discards by keeping track of the still-live pixels in f0.1.
3326 * Initialize it with the dispatched pixels.
3327 */
3328 if (fp->UsesKill || c->key.alpha_test_func) {
3329 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3330 discard_init->flag_subreg = 1;
3331 }
3332
3333 /* Generate FS IR for main(). (the visitor only descends into
3334 * functions called "main").
3335 */
3336 if (shader) {
3337 foreach_list(node, &*shader->base.ir) {
3338 ir_instruction *ir = (ir_instruction *)node;
3339 base_ir = ir;
3340 this->result = reg_undef;
3341 ir->accept(this);
3342 }
3343 } else {
3344 emit_fragment_program_code();
3345 }
3346 base_ir = NULL;
3347 if (failed)
3348 return false;
3349
3350 emit(FS_OPCODE_PLACEHOLDER_HALT);
3351
3352 if (c->key.alpha_test_func)
3353 emit_alpha_test();
3354
3355 emit_fb_writes();
3356
3357 split_virtual_grfs();
3358
3359 move_uniform_array_access_to_pull_constants();
3360 remove_dead_constants();
3361 setup_pull_constants();
3362
3363 bool progress;
3364 do {
3365 progress = false;
3366
3367 compact_virtual_grfs();
3368
3369 progress = remove_duplicate_mrf_writes() || progress;
3370
3371 progress = opt_algebraic() || progress;
3372 progress = opt_cse() || progress;
3373 progress = opt_copy_propagate() || progress;
3374 progress = opt_peephole_predicated_break() || progress;
3375 progress = dead_code_eliminate() || progress;
3376 progress = dead_code_eliminate_local() || progress;
3377 progress = opt_peephole_sel() || progress;
3378 progress = dead_control_flow_eliminate(this) || progress;
3379 progress = opt_saturate_propagation() || progress;
3380 progress = register_coalesce() || progress;
3381 progress = compute_to_mrf() || progress;
3382 } while (progress);
3383
3384 lower_uniform_pull_constant_loads();
3385
3386 assign_curb_setup();
3387 assign_urb_setup();
3388
3389 static enum instruction_scheduler_mode pre_modes[] = {
3390 SCHEDULE_PRE,
3391 SCHEDULE_PRE_NON_LIFO,
3392 SCHEDULE_PRE_LIFO,
3393 };
3394
3395 /* Try each scheduling heuristic to see if it can successfully register
3396 * allocate without spilling. They should be ordered by decreasing
3397 * performance but increasing likelihood of allocating.
3398 */
3399 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3400 schedule_instructions(pre_modes[i]);
3401
3402 if (0) {
3403 assign_regs_trivial();
3404 allocated_without_spills = true;
3405 } else {
3406 allocated_without_spills = assign_regs(false);
3407 }
3408 if (allocated_without_spills)
3409 break;
3410 }
3411
3412 if (!allocated_without_spills) {
3413 /* We assume that any spilling is worse than just dropping back to
3414 * SIMD8. There's probably actually some intermediate point where
3415 * SIMD16 with a couple of spills is still better.
3416 */
3417 if (dispatch_width == 16) {
3418 fail("Failure to register allocate. Reduce number of "
3419 "live scalar values to avoid this.");
3420 }
3421
3422 /* Since we're out of heuristics, just go spill registers until we
3423 * get an allocation.
3424 */
3425 while (!assign_regs(true)) {
3426 if (failed)
3427 break;
3428 }
3429 }
3430 }
3431 assert(force_uncompressed_stack == 0);
3432
3433 /* This must come after all optimization and register allocation, since
3434 * it inserts dead code that happens to have side effects, and it does
3435 * so based on the actual physical registers in use.
3436 */
3437 insert_gen4_send_dependency_workarounds();
3438
3439 if (failed)
3440 return false;
3441
3442 if (!allocated_without_spills)
3443 schedule_instructions(SCHEDULE_POST);
3444
3445 if (dispatch_width == 8) {
3446 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3447 } else {
3448 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3449
3450 /* Make sure we didn't try to sneak in an extra uniform */
3451 assert(orig_nr_params == c->prog_data.nr_params);
3452 (void) orig_nr_params;
3453 }
3454
3455 /* If any state parameters were appended, then ParameterValues could have
3456 * been realloced, in which case the driver uniform storage set up by
3457 * _mesa_associate_uniform_storage() would point to freed memory. Make
3458 * sure that didn't happen.
3459 */
3460 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3461
3462 return !failed;
3463 }
3464
3465 const unsigned *
3466 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3467 struct gl_fragment_program *fp,
3468 struct gl_shader_program *prog,
3469 unsigned *final_assembly_size)
3470 {
3471 bool start_busy = false;
3472 float start_time = 0;
3473
3474 if (unlikely(brw->perf_debug)) {
3475 start_busy = (brw->batch.last_bo &&
3476 drm_intel_bo_busy(brw->batch.last_bo));
3477 start_time = get_time();
3478 }
3479
3480 struct brw_shader *shader = NULL;
3481 if (prog)
3482 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3483
3484 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3485 if (prog) {
3486 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3487 _mesa_print_ir(shader->base.ir, NULL);
3488 printf("\n\n");
3489 } else {
3490 printf("ARB_fragment_program %d ir for native fragment shader\n",
3491 fp->Base.Id);
3492 _mesa_print_program(&fp->Base);
3493 }
3494 }
3495
3496 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3497 */
3498 fs_visitor v(brw, c, prog, fp, 8);
3499 if (!v.run()) {
3500 if (prog) {
3501 prog->LinkStatus = false;
3502 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3503 }
3504
3505 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3506 v.fail_msg);
3507
3508 return NULL;
3509 }
3510
3511 exec_list *simd16_instructions = NULL;
3512 fs_visitor v2(brw, c, prog, fp, 16);
3513 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3514 if (c->prog_data.nr_pull_params == 0) {
3515 /* Try a SIMD16 compile */
3516 v2.import_uniforms(&v);
3517 if (!v2.run()) {
3518 perf_debug("SIMD16 shader failed to compile, falling back to "
3519 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3520 } else {
3521 simd16_instructions = &v2.instructions;
3522 }
3523 } else {
3524 perf_debug("Skipping SIMD16 due to pull parameters.\n");
3525 }
3526 }
3527
3528 const unsigned *assembly = NULL;
3529 if (brw->gen >= 8) {
3530 gen8_fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3531 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3532 final_assembly_size);
3533 } else {
3534 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3535 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3536 final_assembly_size);
3537 }
3538
3539 if (unlikely(brw->perf_debug) && shader) {
3540 if (shader->compiled_once)
3541 brw_wm_debug_recompile(brw, prog, &c->key);
3542 shader->compiled_once = true;
3543
3544 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3545 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3546 (get_time() - start_time) * 1000);
3547 }
3548 }
3549
3550 return assembly;
3551 }
3552
3553 bool
3554 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3555 {
3556 struct brw_context *brw = brw_context(ctx);
3557 struct brw_wm_prog_key key;
3558
3559 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3560 return true;
3561
3562 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3563 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3564 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3565 bool program_uses_dfdy = fp->UsesDFdy;
3566
3567 memset(&key, 0, sizeof(key));
3568
3569 if (brw->gen < 6) {
3570 if (fp->UsesKill)
3571 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3572
3573 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3574 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3575
3576 /* Just assume depth testing. */
3577 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3578 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3579 }
3580
3581 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3582 BRW_FS_VARYING_INPUT_MASK) > 16)
3583 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3584
3585 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3586
3587 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3588 for (unsigned i = 0; i < sampler_count; i++) {
3589 if (fp->Base.ShadowSamplers & (1 << i)) {
3590 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3591 key.tex.swizzles[i] =
3592 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3593 } else {
3594 /* Color sampler: assume no swizzling. */
3595 key.tex.swizzles[i] = SWIZZLE_XYZW;
3596 }
3597 }
3598
3599 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3600 key.drawable_height = ctx->DrawBuffer->Height;
3601 }
3602
3603 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3604 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3605 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3606
3607 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3608 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3609 key.nr_color_regions > 1;
3610 }
3611
3612 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3613 * quality of the derivatives is likely to be determined by the driconf
3614 * option.
3615 */
3616 key.high_quality_derivatives = brw->disable_derivative_optimization;
3617
3618 key.program_string_id = bfp->id;
3619
3620 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3621 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3622
3623 bool success = do_wm_prog(brw, prog, bfp, &key);
3624
3625 brw->wm.base.prog_offset = old_prog_offset;
3626 brw->wm.prog_data = old_prog_data;
3627
3628 return success;
3629 }