i965/fs: Let register_coalesce_2() eliminate self-moves.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->opcode = BRW_OPCODE_NOP;
59 this->conditional_mod = BRW_CONDITIONAL_NONE;
60
61 this->dst = reg_undef;
62 this->src[0] = reg_undef;
63 this->src[1] = reg_undef;
64 this->src[2] = reg_undef;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68 }
69
70 fs_inst::fs_inst()
71 {
72 init();
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187
188 /** Gen4 predicated IF. */
189 fs_inst *
190 fs_visitor::IF(uint32_t predicate)
191 {
192 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
193 inst->predicate = predicate;
194 return inst;
195 }
196
197 /** Gen6 IF with embedded comparison. */
198 fs_inst *
199 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
200 {
201 assert(brw->gen == 6);
202 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
203 reg_null_d, src0, src1);
204 inst->conditional_mod = condition;
205 return inst;
206 }
207
208 /**
209 * CMP: Sets the low bit of the destination channels with the result
210 * of the comparison, while the upper bits are undefined, and updates
211 * the flag register with the packed 16 bits of the result.
212 */
213 fs_inst *
214 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
215 {
216 fs_inst *inst;
217
218 /* Take the instruction:
219 *
220 * CMP null<d> src0<f> src1<f>
221 *
222 * Original gen4 does type conversion to the destination type before
223 * comparison, producing garbage results for floating point comparisons.
224 * gen5 does the comparison on the execution type (resolved source types),
225 * so dst type doesn't matter. gen6 does comparison and then uses the
226 * result as if it was the dst type with no conversion, which happens to
227 * mostly work out for float-interpreted-as-int since our comparisons are
228 * for >0, =0, <0.
229 */
230 if (brw->gen == 4) {
231 dst.type = src0.type;
232 if (dst.file == HW_REG)
233 dst.fixed_hw_reg.type = dst.type;
234 }
235
236 resolve_ud_negate(&src0);
237 resolve_ud_negate(&src1);
238
239 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
240 inst->conditional_mod = condition;
241
242 return inst;
243 }
244
245 exec_list
246 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
247 fs_reg varying_offset,
248 uint32_t const_offset)
249 {
250 exec_list instructions;
251 fs_inst *inst;
252
253 /* We have our constant surface use a pitch of 4 bytes, so our index can
254 * be any component of a vector, and then we load 4 contiguous
255 * components starting from that.
256 *
257 * We break down the const_offset to a portion added to the variable
258 * offset and a portion done using reg_offset, which means that if you
259 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
260 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
261 * CSE can later notice that those loads are all the same and eliminate
262 * the redundant ones.
263 */
264 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
265 instructions.push_tail(ADD(vec4_offset,
266 varying_offset, const_offset & ~3));
267
268 int scale = 1;
269 if (brw->gen == 4 && dispatch_width == 8) {
270 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
271 * u, v, r) as parameters, or we can just use the SIMD16 message
272 * consisting of (header, u). We choose the second, at the cost of a
273 * longer return length.
274 */
275 scale = 2;
276 }
277
278 enum opcode op;
279 if (brw->gen >= 7)
280 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
281 else
282 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
283 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
284 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
285 inst->regs_written = 4 * scale;
286 instructions.push_tail(inst);
287
288 if (brw->gen < 7) {
289 inst->base_mrf = 13;
290 inst->header_present = true;
291 if (brw->gen == 4)
292 inst->mlen = 3;
293 else
294 inst->mlen = 1 + dispatch_width / 8;
295 }
296
297 vec4_result.reg_offset += (const_offset & 3) * scale;
298 instructions.push_tail(MOV(dst, vec4_result));
299
300 return instructions;
301 }
302
303 /**
304 * A helper for MOV generation for fixing up broken hardware SEND dependency
305 * handling.
306 */
307 fs_inst *
308 fs_visitor::DEP_RESOLVE_MOV(int grf)
309 {
310 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
311
312 inst->ir = NULL;
313 inst->annotation = "send dependency resolve";
314
315 /* The caller always wants uncompressed to emit the minimal extra
316 * dependencies, and to avoid having to deal with aligning its regs to 2.
317 */
318 inst->force_uncompressed = true;
319
320 return inst;
321 }
322
323 bool
324 fs_inst::equals(fs_inst *inst)
325 {
326 return (opcode == inst->opcode &&
327 dst.equals(inst->dst) &&
328 src[0].equals(inst->src[0]) &&
329 src[1].equals(inst->src[1]) &&
330 src[2].equals(inst->src[2]) &&
331 saturate == inst->saturate &&
332 predicate == inst->predicate &&
333 conditional_mod == inst->conditional_mod &&
334 mlen == inst->mlen &&
335 base_mrf == inst->base_mrf &&
336 sampler == inst->sampler &&
337 target == inst->target &&
338 eot == inst->eot &&
339 header_present == inst->header_present &&
340 shadow_compare == inst->shadow_compare &&
341 offset == inst->offset);
342 }
343
344 bool
345 fs_inst::overwrites_reg(const fs_reg &reg)
346 {
347 return (reg.file == dst.file &&
348 reg.reg == dst.reg &&
349 reg.reg_offset >= dst.reg_offset &&
350 reg.reg_offset < dst.reg_offset + regs_written);
351 }
352
353 bool
354 fs_inst::is_send_from_grf()
355 {
356 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
357 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
358 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
359 src[1].file == GRF) ||
360 (is_tex() && src[0].file == GRF));
361 }
362
363 bool
364 fs_visitor::can_do_source_mods(fs_inst *inst)
365 {
366 if (brw->gen == 6 && inst->is_math())
367 return false;
368
369 if (inst->is_send_from_grf())
370 return false;
371
372 if (!inst->can_do_source_mods())
373 return false;
374
375 return true;
376 }
377
378 void
379 fs_reg::init()
380 {
381 memset(this, 0, sizeof(*this));
382 this->smear = -1;
383 }
384
385 /** Generic unset register constructor. */
386 fs_reg::fs_reg()
387 {
388 init();
389 this->file = BAD_FILE;
390 }
391
392 /** Immediate value constructor. */
393 fs_reg::fs_reg(float f)
394 {
395 init();
396 this->file = IMM;
397 this->type = BRW_REGISTER_TYPE_F;
398 this->imm.f = f;
399 }
400
401 /** Immediate value constructor. */
402 fs_reg::fs_reg(int32_t i)
403 {
404 init();
405 this->file = IMM;
406 this->type = BRW_REGISTER_TYPE_D;
407 this->imm.i = i;
408 }
409
410 /** Immediate value constructor. */
411 fs_reg::fs_reg(uint32_t u)
412 {
413 init();
414 this->file = IMM;
415 this->type = BRW_REGISTER_TYPE_UD;
416 this->imm.u = u;
417 }
418
419 /** Fixed brw_reg. */
420 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
421 {
422 init();
423 this->file = HW_REG;
424 this->fixed_hw_reg = fixed_hw_reg;
425 this->type = fixed_hw_reg.type;
426 }
427
428 bool
429 fs_reg::equals(const fs_reg &r) const
430 {
431 return (file == r.file &&
432 reg == r.reg &&
433 reg_offset == r.reg_offset &&
434 type == r.type &&
435 negate == r.negate &&
436 abs == r.abs &&
437 !reladdr && !r.reladdr &&
438 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
439 sizeof(fixed_hw_reg)) == 0 &&
440 smear == r.smear &&
441 imm.u == r.imm.u);
442 }
443
444 fs_reg
445 fs_reg::retype(uint32_t type)
446 {
447 fs_reg result = *this;
448 result.type = type;
449 return result;
450 }
451
452 bool
453 fs_reg::is_zero() const
454 {
455 if (file != IMM)
456 return false;
457
458 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
459 }
460
461 bool
462 fs_reg::is_one() const
463 {
464 if (file != IMM)
465 return false;
466
467 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
468 }
469
470 bool
471 fs_reg::is_null() const
472 {
473 return file == HW_REG &&
474 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
475 fixed_hw_reg.nr == BRW_ARF_NULL;
476 }
477
478 bool
479 fs_reg::is_valid_3src() const
480 {
481 return file == GRF || file == UNIFORM;
482 }
483
484 int
485 fs_visitor::type_size(const struct glsl_type *type)
486 {
487 unsigned int size, i;
488
489 switch (type->base_type) {
490 case GLSL_TYPE_UINT:
491 case GLSL_TYPE_INT:
492 case GLSL_TYPE_FLOAT:
493 case GLSL_TYPE_BOOL:
494 return type->components();
495 case GLSL_TYPE_ARRAY:
496 return type_size(type->fields.array) * type->length;
497 case GLSL_TYPE_STRUCT:
498 size = 0;
499 for (i = 0; i < type->length; i++) {
500 size += type_size(type->fields.structure[i].type);
501 }
502 return size;
503 case GLSL_TYPE_SAMPLER:
504 /* Samplers take up no register space, since they're baked in at
505 * link time.
506 */
507 return 0;
508 case GLSL_TYPE_ATOMIC_UINT:
509 return 0;
510 case GLSL_TYPE_VOID:
511 case GLSL_TYPE_ERROR:
512 case GLSL_TYPE_INTERFACE:
513 assert(!"not reached");
514 break;
515 }
516
517 return 0;
518 }
519
520 fs_reg
521 fs_visitor::get_timestamp()
522 {
523 assert(brw->gen >= 7);
524
525 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
526 BRW_ARF_TIMESTAMP,
527 0),
528 BRW_REGISTER_TYPE_UD));
529
530 fs_reg dst = fs_reg(this, glsl_type::uint_type);
531
532 fs_inst *mov = emit(MOV(dst, ts));
533 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
534 * even if it's not enabled in the dispatch.
535 */
536 mov->force_writemask_all = true;
537 mov->force_uncompressed = true;
538
539 /* The caller wants the low 32 bits of the timestamp. Since it's running
540 * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
541 * which is plenty of time for our purposes. It is identical across the
542 * EUs, but since it's tracking GPU core speed it will increment at a
543 * varying rate as render P-states change.
544 *
545 * The caller could also check if render P-states have changed (or anything
546 * else that might disrupt timing) by setting smear to 2 and checking if
547 * that field is != 0.
548 */
549 dst.smear = 0;
550
551 return dst;
552 }
553
554 void
555 fs_visitor::emit_shader_time_begin()
556 {
557 current_annotation = "shader time start";
558 shader_start_time = get_timestamp();
559 }
560
561 void
562 fs_visitor::emit_shader_time_end()
563 {
564 current_annotation = "shader time end";
565
566 enum shader_time_shader_type type, written_type, reset_type;
567 if (dispatch_width == 8) {
568 type = ST_FS8;
569 written_type = ST_FS8_WRITTEN;
570 reset_type = ST_FS8_RESET;
571 } else {
572 assert(dispatch_width == 16);
573 type = ST_FS16;
574 written_type = ST_FS16_WRITTEN;
575 reset_type = ST_FS16_RESET;
576 }
577
578 fs_reg shader_end_time = get_timestamp();
579
580 /* Check that there weren't any timestamp reset events (assuming these
581 * were the only two timestamp reads that happened).
582 */
583 fs_reg reset = shader_end_time;
584 reset.smear = 2;
585 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
586 test->conditional_mod = BRW_CONDITIONAL_Z;
587 emit(IF(BRW_PREDICATE_NORMAL));
588
589 push_force_uncompressed();
590 fs_reg start = shader_start_time;
591 start.negate = true;
592 fs_reg diff = fs_reg(this, glsl_type::uint_type);
593 emit(ADD(diff, start, shader_end_time));
594
595 /* If there were no instructions between the two timestamp gets, the diff
596 * is 2 cycles. Remove that overhead, so I can forget about that when
597 * trying to determine the time taken for single instructions.
598 */
599 emit(ADD(diff, diff, fs_reg(-2u)));
600
601 emit_shader_time_write(type, diff);
602 emit_shader_time_write(written_type, fs_reg(1u));
603 emit(BRW_OPCODE_ELSE);
604 emit_shader_time_write(reset_type, fs_reg(1u));
605 emit(BRW_OPCODE_ENDIF);
606
607 pop_force_uncompressed();
608 }
609
610 void
611 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
612 fs_reg value)
613 {
614 int shader_time_index =
615 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
616 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
617
618 fs_reg payload;
619 if (dispatch_width == 8)
620 payload = fs_reg(this, glsl_type::uvec2_type);
621 else
622 payload = fs_reg(this, glsl_type::uint_type);
623
624 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
625 fs_reg(), payload, offset, value));
626 }
627
628 void
629 fs_visitor::fail(const char *format, ...)
630 {
631 va_list va;
632 char *msg;
633
634 if (failed)
635 return;
636
637 failed = true;
638
639 va_start(va, format);
640 msg = ralloc_vasprintf(mem_ctx, format, va);
641 va_end(va);
642 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
643
644 this->fail_msg = msg;
645
646 if (INTEL_DEBUG & DEBUG_WM) {
647 fprintf(stderr, "%s", msg);
648 }
649 }
650
651 fs_inst *
652 fs_visitor::emit(enum opcode opcode)
653 {
654 return emit(fs_inst(opcode));
655 }
656
657 fs_inst *
658 fs_visitor::emit(enum opcode opcode, fs_reg dst)
659 {
660 return emit(fs_inst(opcode, dst));
661 }
662
663 fs_inst *
664 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
665 {
666 return emit(fs_inst(opcode, dst, src0));
667 }
668
669 fs_inst *
670 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
671 {
672 return emit(fs_inst(opcode, dst, src0, src1));
673 }
674
675 fs_inst *
676 fs_visitor::emit(enum opcode opcode, fs_reg dst,
677 fs_reg src0, fs_reg src1, fs_reg src2)
678 {
679 return emit(fs_inst(opcode, dst, src0, src1, src2));
680 }
681
682 void
683 fs_visitor::push_force_uncompressed()
684 {
685 force_uncompressed_stack++;
686 }
687
688 void
689 fs_visitor::pop_force_uncompressed()
690 {
691 force_uncompressed_stack--;
692 assert(force_uncompressed_stack >= 0);
693 }
694
695 /**
696 * Returns true if the instruction has a flag that means it won't
697 * update an entire destination register.
698 *
699 * For example, dead code elimination and live variable analysis want to know
700 * when a write to a variable screens off any preceding values that were in
701 * it.
702 */
703 bool
704 fs_inst::is_partial_write()
705 {
706 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
707 this->force_uncompressed ||
708 this->force_sechalf);
709 }
710
711 int
712 fs_inst::regs_read(fs_visitor *v, int arg)
713 {
714 if (is_tex() && arg == 0 && src[0].file == GRF) {
715 if (v->dispatch_width == 16)
716 return (mlen + 1) / 2;
717 else
718 return mlen;
719 }
720 return 1;
721 }
722
723 bool
724 fs_inst::reads_flag()
725 {
726 return predicate;
727 }
728
729 bool
730 fs_inst::writes_flag()
731 {
732 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
733 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
734 }
735
736 /**
737 * Returns how many MRFs an FS opcode will write over.
738 *
739 * Note that this is not the 0 or 1 implied writes in an actual gen
740 * instruction -- the FS opcodes often generate MOVs in addition.
741 */
742 int
743 fs_visitor::implied_mrf_writes(fs_inst *inst)
744 {
745 if (inst->mlen == 0)
746 return 0;
747
748 if (inst->base_mrf == -1)
749 return 0;
750
751 switch (inst->opcode) {
752 case SHADER_OPCODE_RCP:
753 case SHADER_OPCODE_RSQ:
754 case SHADER_OPCODE_SQRT:
755 case SHADER_OPCODE_EXP2:
756 case SHADER_OPCODE_LOG2:
757 case SHADER_OPCODE_SIN:
758 case SHADER_OPCODE_COS:
759 return 1 * dispatch_width / 8;
760 case SHADER_OPCODE_POW:
761 case SHADER_OPCODE_INT_QUOTIENT:
762 case SHADER_OPCODE_INT_REMAINDER:
763 return 2 * dispatch_width / 8;
764 case SHADER_OPCODE_TEX:
765 case FS_OPCODE_TXB:
766 case SHADER_OPCODE_TXD:
767 case SHADER_OPCODE_TXF:
768 case SHADER_OPCODE_TXF_MS:
769 case SHADER_OPCODE_TG4:
770 case SHADER_OPCODE_TG4_OFFSET:
771 case SHADER_OPCODE_TXL:
772 case SHADER_OPCODE_TXS:
773 case SHADER_OPCODE_LOD:
774 return 1;
775 case FS_OPCODE_FB_WRITE:
776 return 2;
777 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
778 case SHADER_OPCODE_GEN4_SCRATCH_READ:
779 return 1;
780 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
781 return inst->mlen;
782 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
783 return 2;
784 case SHADER_OPCODE_UNTYPED_ATOMIC:
785 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
786 return 0;
787 default:
788 assert(!"not reached");
789 return inst->mlen;
790 }
791 }
792
793 int
794 fs_visitor::virtual_grf_alloc(int size)
795 {
796 if (virtual_grf_array_size <= virtual_grf_count) {
797 if (virtual_grf_array_size == 0)
798 virtual_grf_array_size = 16;
799 else
800 virtual_grf_array_size *= 2;
801 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
802 virtual_grf_array_size);
803 }
804 virtual_grf_sizes[virtual_grf_count] = size;
805 return virtual_grf_count++;
806 }
807
808 /** Fixed HW reg constructor. */
809 fs_reg::fs_reg(enum register_file file, int reg)
810 {
811 init();
812 this->file = file;
813 this->reg = reg;
814 this->type = BRW_REGISTER_TYPE_F;
815 }
816
817 /** Fixed HW reg constructor. */
818 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
819 {
820 init();
821 this->file = file;
822 this->reg = reg;
823 this->type = type;
824 }
825
826 /** Automatic reg constructor. */
827 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
828 {
829 init();
830
831 this->file = GRF;
832 this->reg = v->virtual_grf_alloc(v->type_size(type));
833 this->reg_offset = 0;
834 this->type = brw_type_for_base_type(type);
835 }
836
837 fs_reg *
838 fs_visitor::variable_storage(ir_variable *var)
839 {
840 return (fs_reg *)hash_table_find(this->variable_ht, var);
841 }
842
843 void
844 import_uniforms_callback(const void *key,
845 void *data,
846 void *closure)
847 {
848 struct hash_table *dst_ht = (struct hash_table *)closure;
849 const fs_reg *reg = (const fs_reg *)data;
850
851 if (reg->file != UNIFORM)
852 return;
853
854 hash_table_insert(dst_ht, data, key);
855 }
856
857 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
858 * This brings in those uniform definitions
859 */
860 void
861 fs_visitor::import_uniforms(fs_visitor *v)
862 {
863 hash_table_call_foreach(v->variable_ht,
864 import_uniforms_callback,
865 variable_ht);
866 this->params_remap = v->params_remap;
867 this->nr_params_remap = v->nr_params_remap;
868 }
869
870 /* Our support for uniforms is piggy-backed on the struct
871 * gl_fragment_program, because that's where the values actually
872 * get stored, rather than in some global gl_shader_program uniform
873 * store.
874 */
875 void
876 fs_visitor::setup_uniform_values(ir_variable *ir)
877 {
878 int namelen = strlen(ir->name);
879
880 /* The data for our (non-builtin) uniforms is stored in a series of
881 * gl_uniform_driver_storage structs for each subcomponent that
882 * glGetUniformLocation() could name. We know it's been set up in the same
883 * order we'd walk the type, so walk the list of storage and find anything
884 * with our name, or the prefix of a component that starts with our name.
885 */
886 unsigned params_before = c->prog_data.nr_params;
887 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
888 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
889
890 if (strncmp(ir->name, storage->name, namelen) != 0 ||
891 (storage->name[namelen] != 0 &&
892 storage->name[namelen] != '.' &&
893 storage->name[namelen] != '[')) {
894 continue;
895 }
896
897 unsigned slots = storage->type->component_slots();
898 if (storage->array_elements)
899 slots *= storage->array_elements;
900
901 for (unsigned i = 0; i < slots; i++) {
902 c->prog_data.param[c->prog_data.nr_params++] =
903 &storage->storage[i].f;
904 }
905 }
906
907 /* Make sure we actually initialized the right amount of stuff here. */
908 assert(params_before + ir->type->component_slots() ==
909 c->prog_data.nr_params);
910 (void)params_before;
911 }
912
913
914 /* Our support for builtin uniforms is even scarier than non-builtin.
915 * It sits on top of the PROG_STATE_VAR parameters that are
916 * automatically updated from GL context state.
917 */
918 void
919 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
920 {
921 const ir_state_slot *const slots = ir->state_slots;
922 assert(ir->state_slots != NULL);
923
924 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
925 /* This state reference has already been setup by ir_to_mesa, but we'll
926 * get the same index back here.
927 */
928 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
929 (gl_state_index *)slots[i].tokens);
930
931 /* Add each of the unique swizzles of the element as a parameter.
932 * This'll end up matching the expected layout of the
933 * array/matrix/structure we're trying to fill in.
934 */
935 int last_swiz = -1;
936 for (unsigned int j = 0; j < 4; j++) {
937 int swiz = GET_SWZ(slots[i].swizzle, j);
938 if (swiz == last_swiz)
939 break;
940 last_swiz = swiz;
941
942 c->prog_data.param[c->prog_data.nr_params++] =
943 &fp->Base.Parameters->ParameterValues[index][swiz].f;
944 }
945 }
946 }
947
948 fs_reg *
949 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
950 {
951 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
952 fs_reg wpos = *reg;
953 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
954
955 /* gl_FragCoord.x */
956 if (ir->pixel_center_integer) {
957 emit(MOV(wpos, this->pixel_x));
958 } else {
959 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
960 }
961 wpos.reg_offset++;
962
963 /* gl_FragCoord.y */
964 if (!flip && ir->pixel_center_integer) {
965 emit(MOV(wpos, this->pixel_y));
966 } else {
967 fs_reg pixel_y = this->pixel_y;
968 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
969
970 if (flip) {
971 pixel_y.negate = true;
972 offset += c->key.drawable_height - 1.0;
973 }
974
975 emit(ADD(wpos, pixel_y, fs_reg(offset)));
976 }
977 wpos.reg_offset++;
978
979 /* gl_FragCoord.z */
980 if (brw->gen >= 6) {
981 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
982 } else {
983 emit(FS_OPCODE_LINTERP, wpos,
984 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
985 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
986 interp_reg(VARYING_SLOT_POS, 2));
987 }
988 wpos.reg_offset++;
989
990 /* gl_FragCoord.w: Already set up in emit_interpolation */
991 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
992
993 return reg;
994 }
995
996 fs_inst *
997 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
998 glsl_interp_qualifier interpolation_mode,
999 bool is_centroid)
1000 {
1001 brw_wm_barycentric_interp_mode barycoord_mode;
1002 if (brw->gen >= 6) {
1003 if (is_centroid) {
1004 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1005 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1006 else
1007 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1008 } else {
1009 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1010 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1011 else
1012 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1013 }
1014 } else {
1015 /* On Ironlake and below, there is only one interpolation mode.
1016 * Centroid interpolation doesn't mean anything on this hardware --
1017 * there is no multisampling.
1018 */
1019 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1020 }
1021 return emit(FS_OPCODE_LINTERP, attr,
1022 this->delta_x[barycoord_mode],
1023 this->delta_y[barycoord_mode], interp);
1024 }
1025
1026 fs_reg *
1027 fs_visitor::emit_general_interpolation(ir_variable *ir)
1028 {
1029 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1030 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1031 fs_reg attr = *reg;
1032
1033 unsigned int array_elements;
1034 const glsl_type *type;
1035
1036 if (ir->type->is_array()) {
1037 array_elements = ir->type->length;
1038 if (array_elements == 0) {
1039 fail("dereferenced array '%s' has length 0\n", ir->name);
1040 }
1041 type = ir->type->fields.array;
1042 } else {
1043 array_elements = 1;
1044 type = ir->type;
1045 }
1046
1047 glsl_interp_qualifier interpolation_mode =
1048 ir->determine_interpolation_mode(c->key.flat_shade);
1049
1050 int location = ir->location;
1051 for (unsigned int i = 0; i < array_elements; i++) {
1052 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1053 if (c->prog_data.urb_setup[location] == -1) {
1054 /* If there's no incoming setup data for this slot, don't
1055 * emit interpolation for it.
1056 */
1057 attr.reg_offset += type->vector_elements;
1058 location++;
1059 continue;
1060 }
1061
1062 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1063 /* Constant interpolation (flat shading) case. The SF has
1064 * handed us defined values in only the constant offset
1065 * field of the setup reg.
1066 */
1067 for (unsigned int k = 0; k < type->vector_elements; k++) {
1068 struct brw_reg interp = interp_reg(location, k);
1069 interp = suboffset(interp, 3);
1070 interp.type = reg->type;
1071 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1072 attr.reg_offset++;
1073 }
1074 } else {
1075 /* Smooth/noperspective interpolation case. */
1076 for (unsigned int k = 0; k < type->vector_elements; k++) {
1077 /* FINISHME: At some point we probably want to push
1078 * this farther by giving similar treatment to the
1079 * other potentially constant components of the
1080 * attribute, as well as making brw_vs_constval.c
1081 * handle varyings other than gl_TexCoord.
1082 */
1083 struct brw_reg interp = interp_reg(location, k);
1084 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1085 ir->centroid);
1086 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1087 /* Get the pixel/sample mask into f0 so that we know
1088 * which pixels are lit. Then, for each channel that is
1089 * unlit, replace the centroid data with non-centroid
1090 * data.
1091 */
1092 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1093 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1094 interpolation_mode, false);
1095 inst->predicate = BRW_PREDICATE_NORMAL;
1096 inst->predicate_inverse = true;
1097 }
1098 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1099 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1100 }
1101 attr.reg_offset++;
1102 }
1103
1104 }
1105 location++;
1106 }
1107 }
1108
1109 return reg;
1110 }
1111
1112 fs_reg *
1113 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1114 {
1115 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1116
1117 /* The frontfacing comes in as a bit in the thread payload. */
1118 if (brw->gen >= 6) {
1119 emit(BRW_OPCODE_ASR, *reg,
1120 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1121 fs_reg(15));
1122 emit(BRW_OPCODE_NOT, *reg, *reg);
1123 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1124 } else {
1125 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1126 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1127 * us front face
1128 */
1129 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1130 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1131 }
1132
1133 return reg;
1134 }
1135
1136 void
1137 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1138 {
1139 assert(dst.type == BRW_REGISTER_TYPE_F);
1140
1141 if (c->key.compute_pos_offset) {
1142 /* Convert int_sample_pos to floating point */
1143 emit(MOV(dst, int_sample_pos));
1144 /* Scale to the range [0, 1] */
1145 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1146 }
1147 else {
1148 /* From ARB_sample_shading specification:
1149 * "When rendering to a non-multisample buffer, or if multisample
1150 * rasterization is disabled, gl_SamplePosition will always be
1151 * (0.5, 0.5).
1152 */
1153 emit(MOV(dst, fs_reg(0.5f)));
1154 }
1155 }
1156
1157 fs_reg *
1158 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1159 {
1160 assert(brw->gen >= 6);
1161 assert(ir->type == glsl_type::vec2_type);
1162
1163 this->current_annotation = "compute sample position";
1164 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1165 fs_reg pos = *reg;
1166 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1167 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1168
1169 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1170 * mode will be enabled.
1171 *
1172 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1173 * R31.1:0 Position Offset X/Y for Slot[3:0]
1174 * R31.3:2 Position Offset X/Y for Slot[7:4]
1175 * .....
1176 *
1177 * The X, Y sample positions come in as bytes in thread payload. So, read
1178 * the positions using vstride=16, width=8, hstride=2.
1179 */
1180 struct brw_reg sample_pos_reg =
1181 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1182 BRW_REGISTER_TYPE_B), 16, 8, 2);
1183
1184 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1185 if (dispatch_width == 16) {
1186 int_sample_x.sechalf = true;
1187 fs_inst *inst = emit(MOV(int_sample_x,
1188 fs_reg(suboffset(sample_pos_reg, 16))));
1189 inst->force_sechalf = true;
1190 int_sample_x.sechalf = false;
1191 }
1192 /* Compute gl_SamplePosition.x */
1193 compute_sample_position(pos, int_sample_x);
1194 pos.reg_offset++;
1195 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1196 if (dispatch_width == 16) {
1197 int_sample_y.sechalf = true;
1198 fs_inst *inst = emit(MOV(int_sample_y,
1199 fs_reg(suboffset(sample_pos_reg, 17))));
1200 inst->force_sechalf = true;
1201 int_sample_y.sechalf = false;
1202 }
1203 /* Compute gl_SamplePosition.y */
1204 compute_sample_position(pos, int_sample_y);
1205 return reg;
1206 }
1207
1208 fs_reg *
1209 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1210 {
1211 assert(brw->gen >= 6);
1212
1213 this->current_annotation = "compute sample id";
1214 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1215
1216 if (c->key.compute_sample_id) {
1217 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1218 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1219 t2.type = BRW_REGISTER_TYPE_UW;
1220
1221 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1222 * 8x multisampling, subspan 0 will represent sample N (where N
1223 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1224 * 7. We can find the value of N by looking at R0.0 bits 7:6
1225 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1226 * (since samples are always delivered in pairs). That is, we
1227 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1228 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1229 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1230 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1231 * populating a temporary variable with the sequence (0, 1, 2, 3),
1232 * and then reading from it using vstride=1, width=4, hstride=0.
1233 * These computations hold good for 4x multisampling as well.
1234 */
1235 emit(BRW_OPCODE_AND, t1,
1236 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1237 fs_reg(brw_imm_d(0xc0)));
1238 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1239 /* This works for both SIMD8 and SIMD16 */
1240 emit(MOV(t2, brw_imm_v(0x3210)));
1241 /* This special instruction takes care of setting vstride=1,
1242 * width=4, hstride=0 of t2 during an ADD instruction.
1243 */
1244 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1245 } else {
1246 /* As per GL_ARB_sample_shading specification:
1247 * "When rendering to a non-multisample buffer, or if multisample
1248 * rasterization is disabled, gl_SampleID will always be zero."
1249 */
1250 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1251 }
1252
1253 return reg;
1254 }
1255
1256 fs_reg
1257 fs_visitor::fix_math_operand(fs_reg src)
1258 {
1259 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1260 * might be able to do better by doing execsize = 1 math and then
1261 * expanding that result out, but we would need to be careful with
1262 * masking.
1263 *
1264 * The hardware ignores source modifiers (negate and abs) on math
1265 * instructions, so we also move to a temp to set those up.
1266 */
1267 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1268 !src.abs && !src.negate)
1269 return src;
1270
1271 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1272 * operands to math
1273 */
1274 if (brw->gen >= 7 && src.file != IMM)
1275 return src;
1276
1277 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1278 expanded.type = src.type;
1279 emit(BRW_OPCODE_MOV, expanded, src);
1280 return expanded;
1281 }
1282
1283 fs_inst *
1284 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1285 {
1286 switch (opcode) {
1287 case SHADER_OPCODE_RCP:
1288 case SHADER_OPCODE_RSQ:
1289 case SHADER_OPCODE_SQRT:
1290 case SHADER_OPCODE_EXP2:
1291 case SHADER_OPCODE_LOG2:
1292 case SHADER_OPCODE_SIN:
1293 case SHADER_OPCODE_COS:
1294 break;
1295 default:
1296 assert(!"not reached: bad math opcode");
1297 return NULL;
1298 }
1299
1300 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1301 * might be able to do better by doing execsize = 1 math and then
1302 * expanding that result out, but we would need to be careful with
1303 * masking.
1304 *
1305 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1306 * instructions, so we also move to a temp to set those up.
1307 */
1308 if (brw->gen >= 6)
1309 src = fix_math_operand(src);
1310
1311 fs_inst *inst = emit(opcode, dst, src);
1312
1313 if (brw->gen < 6) {
1314 inst->base_mrf = 2;
1315 inst->mlen = dispatch_width / 8;
1316 }
1317
1318 return inst;
1319 }
1320
1321 fs_inst *
1322 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1323 {
1324 int base_mrf = 2;
1325 fs_inst *inst;
1326
1327 switch (opcode) {
1328 case SHADER_OPCODE_INT_QUOTIENT:
1329 case SHADER_OPCODE_INT_REMAINDER:
1330 if (brw->gen >= 7 && dispatch_width == 16)
1331 fail("16-wide INTDIV unsupported\n");
1332 break;
1333 case SHADER_OPCODE_POW:
1334 break;
1335 default:
1336 assert(!"not reached: unsupported binary math opcode.");
1337 return NULL;
1338 }
1339
1340 if (brw->gen >= 6) {
1341 src0 = fix_math_operand(src0);
1342 src1 = fix_math_operand(src1);
1343
1344 inst = emit(opcode, dst, src0, src1);
1345 } else {
1346 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1347 * "Message Payload":
1348 *
1349 * "Operand0[7]. For the INT DIV functions, this operand is the
1350 * denominator."
1351 * ...
1352 * "Operand1[7]. For the INT DIV functions, this operand is the
1353 * numerator."
1354 */
1355 bool is_int_div = opcode != SHADER_OPCODE_POW;
1356 fs_reg &op0 = is_int_div ? src1 : src0;
1357 fs_reg &op1 = is_int_div ? src0 : src1;
1358
1359 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1360 inst = emit(opcode, dst, op0, reg_null_f);
1361
1362 inst->base_mrf = base_mrf;
1363 inst->mlen = 2 * dispatch_width / 8;
1364 }
1365 return inst;
1366 }
1367
1368 void
1369 fs_visitor::assign_curb_setup()
1370 {
1371 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1372 if (dispatch_width == 8) {
1373 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1374 } else {
1375 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1376 }
1377
1378 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1379 foreach_list(node, &this->instructions) {
1380 fs_inst *inst = (fs_inst *)node;
1381
1382 for (unsigned int i = 0; i < 3; i++) {
1383 if (inst->src[i].file == UNIFORM) {
1384 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1385 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1386 constant_nr / 8,
1387 constant_nr % 8);
1388
1389 inst->src[i].file = HW_REG;
1390 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1391 }
1392 }
1393 }
1394 }
1395
1396 void
1397 fs_visitor::calculate_urb_setup()
1398 {
1399 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1400 c->prog_data.urb_setup[i] = -1;
1401 }
1402
1403 int urb_next = 0;
1404 /* Figure out where each of the incoming setup attributes lands. */
1405 if (brw->gen >= 6) {
1406 if (_mesa_bitcount_64(fp->Base.InputsRead &
1407 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1408 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1409 * first 16 varying inputs, so we can put them wherever we want.
1410 * Just put them in order.
1411 *
1412 * This is useful because it means that (a) inputs not used by the
1413 * fragment shader won't take up valuable register space, and (b) we
1414 * won't have to recompile the fragment shader if it gets paired with
1415 * a different vertex (or geometry) shader.
1416 */
1417 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1418 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1419 BITFIELD64_BIT(i)) {
1420 c->prog_data.urb_setup[i] = urb_next++;
1421 }
1422 }
1423 } else {
1424 /* We have enough input varyings that the SF/SBE pipeline stage can't
1425 * arbitrarily rearrange them to suit our whim; we have to put them
1426 * in an order that matches the output of the previous pipeline stage
1427 * (geometry or vertex shader).
1428 */
1429 struct brw_vue_map prev_stage_vue_map;
1430 brw_compute_vue_map(brw, &prev_stage_vue_map,
1431 c->key.input_slots_valid);
1432 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1433 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1434 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1435 slot++) {
1436 int varying = prev_stage_vue_map.slot_to_varying[slot];
1437 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1438 * unused.
1439 */
1440 if (varying != BRW_VARYING_SLOT_COUNT &&
1441 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1442 BITFIELD64_BIT(varying))) {
1443 c->prog_data.urb_setup[varying] = slot - first_slot;
1444 }
1445 }
1446 urb_next = prev_stage_vue_map.num_slots - first_slot;
1447 }
1448 } else {
1449 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1450 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1451 /* Point size is packed into the header, not as a general attribute */
1452 if (i == VARYING_SLOT_PSIZ)
1453 continue;
1454
1455 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1456 /* The back color slot is skipped when the front color is
1457 * also written to. In addition, some slots can be
1458 * written in the vertex shader and not read in the
1459 * fragment shader. So the register number must always be
1460 * incremented, mapped or not.
1461 */
1462 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1463 c->prog_data.urb_setup[i] = urb_next;
1464 urb_next++;
1465 }
1466 }
1467
1468 /*
1469 * It's a FS only attribute, and we did interpolation for this attribute
1470 * in SF thread. So, count it here, too.
1471 *
1472 * See compile_sf_prog() for more info.
1473 */
1474 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1475 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1476 }
1477
1478 c->prog_data.num_varying_inputs = urb_next;
1479 }
1480
1481 void
1482 fs_visitor::assign_urb_setup()
1483 {
1484 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1485
1486 /* Offset all the urb_setup[] index by the actual position of the
1487 * setup regs, now that the location of the constants has been chosen.
1488 */
1489 foreach_list(node, &this->instructions) {
1490 fs_inst *inst = (fs_inst *)node;
1491
1492 if (inst->opcode == FS_OPCODE_LINTERP) {
1493 assert(inst->src[2].file == HW_REG);
1494 inst->src[2].fixed_hw_reg.nr += urb_start;
1495 }
1496
1497 if (inst->opcode == FS_OPCODE_CINTERP) {
1498 assert(inst->src[0].file == HW_REG);
1499 inst->src[0].fixed_hw_reg.nr += urb_start;
1500 }
1501 }
1502
1503 /* Each attribute is 4 setup channels, each of which is half a reg. */
1504 this->first_non_payload_grf =
1505 urb_start + c->prog_data.num_varying_inputs * 2;
1506 }
1507
1508 /**
1509 * Split large virtual GRFs into separate components if we can.
1510 *
1511 * This is mostly duplicated with what brw_fs_vector_splitting does,
1512 * but that's really conservative because it's afraid of doing
1513 * splitting that doesn't result in real progress after the rest of
1514 * the optimization phases, which would cause infinite looping in
1515 * optimization. We can do it once here, safely. This also has the
1516 * opportunity to split interpolated values, or maybe even uniforms,
1517 * which we don't have at the IR level.
1518 *
1519 * We want to split, because virtual GRFs are what we register
1520 * allocate and spill (due to contiguousness requirements for some
1521 * instructions), and they're what we naturally generate in the
1522 * codegen process, but most virtual GRFs don't actually need to be
1523 * contiguous sets of GRFs. If we split, we'll end up with reduced
1524 * live intervals and better dead code elimination and coalescing.
1525 */
1526 void
1527 fs_visitor::split_virtual_grfs()
1528 {
1529 int num_vars = this->virtual_grf_count;
1530 bool split_grf[num_vars];
1531 int new_virtual_grf[num_vars];
1532
1533 /* Try to split anything > 0 sized. */
1534 for (int i = 0; i < num_vars; i++) {
1535 if (this->virtual_grf_sizes[i] != 1)
1536 split_grf[i] = true;
1537 else
1538 split_grf[i] = false;
1539 }
1540
1541 if (brw->has_pln &&
1542 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1543 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1544 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1545 * Gen6, that was the only supported interpolation mode, and since Gen6,
1546 * delta_x and delta_y are in fixed hardware registers.
1547 */
1548 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1549 false;
1550 }
1551
1552 foreach_list(node, &this->instructions) {
1553 fs_inst *inst = (fs_inst *)node;
1554
1555 /* If there's a SEND message that requires contiguous destination
1556 * registers, no splitting is allowed.
1557 */
1558 if (inst->regs_written > 1) {
1559 split_grf[inst->dst.reg] = false;
1560 }
1561
1562 /* If we're sending from a GRF, don't split it, on the assumption that
1563 * the send is reading the whole thing.
1564 */
1565 if (inst->is_send_from_grf()) {
1566 for (int i = 0; i < 3; i++) {
1567 if (inst->src[i].file == GRF) {
1568 split_grf[inst->src[i].reg] = false;
1569 }
1570 }
1571 }
1572 }
1573
1574 /* Allocate new space for split regs. Note that the virtual
1575 * numbers will be contiguous.
1576 */
1577 for (int i = 0; i < num_vars; i++) {
1578 if (split_grf[i]) {
1579 new_virtual_grf[i] = virtual_grf_alloc(1);
1580 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1581 int reg = virtual_grf_alloc(1);
1582 assert(reg == new_virtual_grf[i] + j - 1);
1583 (void) reg;
1584 }
1585 this->virtual_grf_sizes[i] = 1;
1586 }
1587 }
1588
1589 foreach_list(node, &this->instructions) {
1590 fs_inst *inst = (fs_inst *)node;
1591
1592 if (inst->dst.file == GRF &&
1593 split_grf[inst->dst.reg] &&
1594 inst->dst.reg_offset != 0) {
1595 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1596 inst->dst.reg_offset - 1);
1597 inst->dst.reg_offset = 0;
1598 }
1599 for (int i = 0; i < 3; i++) {
1600 if (inst->src[i].file == GRF &&
1601 split_grf[inst->src[i].reg] &&
1602 inst->src[i].reg_offset != 0) {
1603 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1604 inst->src[i].reg_offset - 1);
1605 inst->src[i].reg_offset = 0;
1606 }
1607 }
1608 }
1609 invalidate_live_intervals();
1610 }
1611
1612 /**
1613 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1614 *
1615 * During code generation, we create tons of temporary variables, many of
1616 * which get immediately killed and are never used again. Yet, in later
1617 * optimization and analysis passes, such as compute_live_intervals, we need
1618 * to loop over all the virtual GRFs. Compacting them can save a lot of
1619 * overhead.
1620 */
1621 void
1622 fs_visitor::compact_virtual_grfs()
1623 {
1624 /* Mark which virtual GRFs are used, and count how many. */
1625 int remap_table[this->virtual_grf_count];
1626 memset(remap_table, -1, sizeof(remap_table));
1627
1628 foreach_list(node, &this->instructions) {
1629 const fs_inst *inst = (const fs_inst *) node;
1630
1631 if (inst->dst.file == GRF)
1632 remap_table[inst->dst.reg] = 0;
1633
1634 for (int i = 0; i < 3; i++) {
1635 if (inst->src[i].file == GRF)
1636 remap_table[inst->src[i].reg] = 0;
1637 }
1638 }
1639
1640 /* In addition to registers used in instructions, fs_visitor keeps
1641 * direct references to certain special values which must be patched:
1642 */
1643 fs_reg *special[] = {
1644 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1645 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1646 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1647 &delta_x[0], &delta_x[1], &delta_x[2],
1648 &delta_x[3], &delta_x[4], &delta_x[5],
1649 &delta_y[0], &delta_y[1], &delta_y[2],
1650 &delta_y[3], &delta_y[4], &delta_y[5],
1651 };
1652 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1653 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1654
1655 /* Treat all special values as used, to be conservative */
1656 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1657 if (special[i]->file == GRF)
1658 remap_table[special[i]->reg] = 0;
1659 }
1660
1661 /* Compact the GRF arrays. */
1662 int new_index = 0;
1663 for (int i = 0; i < this->virtual_grf_count; i++) {
1664 if (remap_table[i] != -1) {
1665 remap_table[i] = new_index;
1666 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1667 invalidate_live_intervals();
1668 ++new_index;
1669 }
1670 }
1671
1672 this->virtual_grf_count = new_index;
1673
1674 /* Patch all the instructions to use the newly renumbered registers */
1675 foreach_list(node, &this->instructions) {
1676 fs_inst *inst = (fs_inst *) node;
1677
1678 if (inst->dst.file == GRF)
1679 inst->dst.reg = remap_table[inst->dst.reg];
1680
1681 for (int i = 0; i < 3; i++) {
1682 if (inst->src[i].file == GRF)
1683 inst->src[i].reg = remap_table[inst->src[i].reg];
1684 }
1685 }
1686
1687 /* Patch all the references to special values */
1688 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1689 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1690 special[i]->reg = remap_table[special[i]->reg];
1691 }
1692 }
1693
1694 bool
1695 fs_visitor::remove_dead_constants()
1696 {
1697 if (dispatch_width == 8) {
1698 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1699 this->nr_params_remap = c->prog_data.nr_params;
1700
1701 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1702 this->params_remap[i] = -1;
1703
1704 /* Find which params are still in use. */
1705 foreach_list(node, &this->instructions) {
1706 fs_inst *inst = (fs_inst *)node;
1707
1708 for (int i = 0; i < 3; i++) {
1709 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1710
1711 if (inst->src[i].file != UNIFORM)
1712 continue;
1713
1714 /* Section 5.11 of the OpenGL 4.3 spec says:
1715 *
1716 * "Out-of-bounds reads return undefined values, which include
1717 * values from other variables of the active program or zero."
1718 */
1719 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1720 constant_nr = 0;
1721 }
1722
1723 /* For now, set this to non-negative. We'll give it the
1724 * actual new number in a moment, in order to keep the
1725 * register numbers nicely ordered.
1726 */
1727 this->params_remap[constant_nr] = 0;
1728 }
1729 }
1730
1731 /* Figure out what the new numbers for the params will be. At some
1732 * point when we're doing uniform array access, we're going to want
1733 * to keep the distinction between .reg and .reg_offset, but for
1734 * now we don't care.
1735 */
1736 unsigned int new_nr_params = 0;
1737 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1738 if (this->params_remap[i] != -1) {
1739 this->params_remap[i] = new_nr_params++;
1740 }
1741 }
1742
1743 /* Update the list of params to be uploaded to match our new numbering. */
1744 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1745 int remapped = this->params_remap[i];
1746
1747 if (remapped == -1)
1748 continue;
1749
1750 c->prog_data.param[remapped] = c->prog_data.param[i];
1751 }
1752
1753 c->prog_data.nr_params = new_nr_params;
1754 } else {
1755 /* This should have been generated in the 8-wide pass already. */
1756 assert(this->params_remap);
1757 }
1758
1759 /* Now do the renumbering of the shader to remove unused params. */
1760 foreach_list(node, &this->instructions) {
1761 fs_inst *inst = (fs_inst *)node;
1762
1763 for (int i = 0; i < 3; i++) {
1764 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1765
1766 if (inst->src[i].file != UNIFORM)
1767 continue;
1768
1769 /* as above alias to 0 */
1770 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1771 constant_nr = 0;
1772 }
1773 assert(this->params_remap[constant_nr] != -1);
1774 inst->src[i].reg = this->params_remap[constant_nr];
1775 inst->src[i].reg_offset = 0;
1776 }
1777 }
1778
1779 return true;
1780 }
1781
1782 /*
1783 * Implements array access of uniforms by inserting a
1784 * PULL_CONSTANT_LOAD instruction.
1785 *
1786 * Unlike temporary GRF array access (where we don't support it due to
1787 * the difficulty of doing relative addressing on instruction
1788 * destinations), we could potentially do array access of uniforms
1789 * that were loaded in GRF space as push constants. In real-world
1790 * usage we've seen, though, the arrays being used are always larger
1791 * than we could load as push constants, so just always move all
1792 * uniform array access out to a pull constant buffer.
1793 */
1794 void
1795 fs_visitor::move_uniform_array_access_to_pull_constants()
1796 {
1797 int pull_constant_loc[c->prog_data.nr_params];
1798
1799 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1800 pull_constant_loc[i] = -1;
1801 }
1802
1803 /* Walk through and find array access of uniforms. Put a copy of that
1804 * uniform in the pull constant buffer.
1805 *
1806 * Note that we don't move constant-indexed accesses to arrays. No
1807 * testing has been done of the performance impact of this choice.
1808 */
1809 foreach_list_safe(node, &this->instructions) {
1810 fs_inst *inst = (fs_inst *)node;
1811
1812 for (int i = 0 ; i < 3; i++) {
1813 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1814 continue;
1815
1816 int uniform = inst->src[i].reg;
1817
1818 /* If this array isn't already present in the pull constant buffer,
1819 * add it.
1820 */
1821 if (pull_constant_loc[uniform] == -1) {
1822 const float **values = &c->prog_data.param[uniform];
1823
1824 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1825
1826 assert(param_size[uniform]);
1827
1828 for (int j = 0; j < param_size[uniform]; j++) {
1829 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1830 values[j];
1831 }
1832 }
1833
1834 /* Set up the annotation tracking for new generated instructions. */
1835 base_ir = inst->ir;
1836 current_annotation = inst->annotation;
1837
1838 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1839 fs_reg temp = fs_reg(this, glsl_type::float_type);
1840 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1841 surf_index,
1842 *inst->src[i].reladdr,
1843 pull_constant_loc[uniform] +
1844 inst->src[i].reg_offset);
1845 inst->insert_before(&list);
1846
1847 inst->src[i].file = temp.file;
1848 inst->src[i].reg = temp.reg;
1849 inst->src[i].reg_offset = temp.reg_offset;
1850 inst->src[i].reladdr = NULL;
1851 }
1852 }
1853 }
1854
1855 /**
1856 * Choose accesses from the UNIFORM file to demote to using the pull
1857 * constant buffer.
1858 *
1859 * We allow a fragment shader to have more than the specified minimum
1860 * maximum number of fragment shader uniform components (64). If
1861 * there are too many of these, they'd fill up all of register space.
1862 * So, this will push some of them out to the pull constant buffer and
1863 * update the program to load them.
1864 */
1865 void
1866 fs_visitor::setup_pull_constants()
1867 {
1868 /* Only allow 16 registers (128 uniform components) as push constants. */
1869 unsigned int max_uniform_components = 16 * 8;
1870 if (c->prog_data.nr_params <= max_uniform_components)
1871 return;
1872
1873 if (dispatch_width == 16) {
1874 fail("Pull constants not supported in 16-wide\n");
1875 return;
1876 }
1877
1878 /* Just demote the end of the list. We could probably do better
1879 * here, demoting things that are rarely used in the program first.
1880 */
1881 unsigned int pull_uniform_base = max_uniform_components;
1882
1883 int pull_constant_loc[c->prog_data.nr_params];
1884 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1885 if (i < pull_uniform_base) {
1886 pull_constant_loc[i] = -1;
1887 } else {
1888 pull_constant_loc[i] = -1;
1889 /* If our constant is already being uploaded for reladdr purposes,
1890 * reuse it.
1891 */
1892 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1893 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1894 pull_constant_loc[i] = j;
1895 break;
1896 }
1897 }
1898 if (pull_constant_loc[i] == -1) {
1899 int pull_index = c->prog_data.nr_pull_params++;
1900 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1901 pull_constant_loc[i] = pull_index;;
1902 }
1903 }
1904 }
1905 c->prog_data.nr_params = pull_uniform_base;
1906
1907 foreach_list(node, &this->instructions) {
1908 fs_inst *inst = (fs_inst *)node;
1909
1910 for (int i = 0; i < 3; i++) {
1911 if (inst->src[i].file != UNIFORM)
1912 continue;
1913
1914 int pull_index = pull_constant_loc[inst->src[i].reg +
1915 inst->src[i].reg_offset];
1916 if (pull_index == -1)
1917 continue;
1918
1919 assert(!inst->src[i].reladdr);
1920
1921 fs_reg dst = fs_reg(this, glsl_type::float_type);
1922 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1923 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1924 fs_inst *pull =
1925 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1926 dst, index, offset);
1927 pull->ir = inst->ir;
1928 pull->annotation = inst->annotation;
1929
1930 inst->insert_before(pull);
1931
1932 inst->src[i].file = GRF;
1933 inst->src[i].reg = dst.reg;
1934 inst->src[i].reg_offset = 0;
1935 inst->src[i].smear = pull_index & 3;
1936 }
1937 }
1938 }
1939
1940 bool
1941 fs_visitor::opt_algebraic()
1942 {
1943 bool progress = false;
1944
1945 foreach_list(node, &this->instructions) {
1946 fs_inst *inst = (fs_inst *)node;
1947
1948 switch (inst->opcode) {
1949 case BRW_OPCODE_MUL:
1950 if (inst->src[1].file != IMM)
1951 continue;
1952
1953 /* a * 1.0 = a */
1954 if (inst->src[1].is_one()) {
1955 inst->opcode = BRW_OPCODE_MOV;
1956 inst->src[1] = reg_undef;
1957 progress = true;
1958 break;
1959 }
1960
1961 /* a * 0.0 = 0.0 */
1962 if (inst->src[1].is_zero()) {
1963 inst->opcode = BRW_OPCODE_MOV;
1964 inst->src[0] = inst->src[1];
1965 inst->src[1] = reg_undef;
1966 progress = true;
1967 break;
1968 }
1969
1970 break;
1971 case BRW_OPCODE_ADD:
1972 if (inst->src[1].file != IMM)
1973 continue;
1974
1975 /* a + 0.0 = a */
1976 if (inst->src[1].is_zero()) {
1977 inst->opcode = BRW_OPCODE_MOV;
1978 inst->src[1] = reg_undef;
1979 progress = true;
1980 break;
1981 }
1982 break;
1983 case BRW_OPCODE_OR:
1984 if (inst->src[0].equals(inst->src[1])) {
1985 inst->opcode = BRW_OPCODE_MOV;
1986 inst->src[1] = reg_undef;
1987 progress = true;
1988 break;
1989 }
1990 break;
1991 case BRW_OPCODE_SEL:
1992 if (inst->saturate && inst->src[1].file == IMM) {
1993 switch (inst->conditional_mod) {
1994 case BRW_CONDITIONAL_LE:
1995 case BRW_CONDITIONAL_L:
1996 switch (inst->src[1].type) {
1997 case BRW_REGISTER_TYPE_F:
1998 if (inst->src[1].imm.f >= 1.0f) {
1999 inst->opcode = BRW_OPCODE_MOV;
2000 inst->src[1] = reg_undef;
2001 progress = true;
2002 }
2003 break;
2004 default:
2005 break;
2006 }
2007 break;
2008 case BRW_CONDITIONAL_GE:
2009 case BRW_CONDITIONAL_G:
2010 switch (inst->src[1].type) {
2011 case BRW_REGISTER_TYPE_F:
2012 if (inst->src[1].imm.f <= 0.0f) {
2013 inst->opcode = BRW_OPCODE_MOV;
2014 inst->src[1] = reg_undef;
2015 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2016 progress = true;
2017 }
2018 break;
2019 default:
2020 break;
2021 }
2022 default:
2023 break;
2024 }
2025 }
2026 break;
2027 default:
2028 break;
2029 }
2030 }
2031
2032 return progress;
2033 }
2034
2035 /**
2036 * Removes any instructions writing a VGRF where that VGRF is not used by any
2037 * later instruction.
2038 */
2039 bool
2040 fs_visitor::dead_code_eliminate()
2041 {
2042 bool progress = false;
2043 int pc = 0;
2044
2045 calculate_live_intervals();
2046
2047 foreach_list_safe(node, &this->instructions) {
2048 fs_inst *inst = (fs_inst *)node;
2049
2050 if (inst->dst.file == GRF && !inst->has_side_effects()) {
2051 bool dead = true;
2052
2053 for (int i = 0; i < inst->regs_written; i++) {
2054 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2055 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2056 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2057 dead = false;
2058 break;
2059 }
2060 }
2061
2062 if (dead) {
2063 /* Don't dead code eliminate instructions that write to the
2064 * accumulator as a side-effect. Instead just set the destination
2065 * to the null register to free it.
2066 */
2067 switch (inst->opcode) {
2068 case BRW_OPCODE_ADDC:
2069 case BRW_OPCODE_SUBB:
2070 case BRW_OPCODE_MACH:
2071 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2072 break;
2073 default:
2074 inst->remove();
2075 progress = true;
2076 break;
2077 }
2078 }
2079 }
2080
2081 pc++;
2082 }
2083
2084 if (progress)
2085 invalidate_live_intervals();
2086
2087 return progress;
2088 }
2089
2090 struct dead_code_hash_key
2091 {
2092 int vgrf;
2093 int reg_offset;
2094 };
2095
2096 static bool
2097 dead_code_hash_compare(const void *a, const void *b)
2098 {
2099 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2100 }
2101
2102 static void
2103 clear_dead_code_hash(struct hash_table *ht)
2104 {
2105 struct hash_entry *entry;
2106
2107 hash_table_foreach(ht, entry) {
2108 _mesa_hash_table_remove(ht, entry);
2109 }
2110 }
2111
2112 static void
2113 insert_dead_code_hash(struct hash_table *ht,
2114 int vgrf, int reg_offset, fs_inst *inst)
2115 {
2116 /* We don't bother freeing keys, because they'll be GCed with the ht. */
2117 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2118
2119 key->vgrf = vgrf;
2120 key->reg_offset = reg_offset;
2121
2122 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2123 }
2124
2125 static struct hash_entry *
2126 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2127 {
2128 struct dead_code_hash_key key;
2129
2130 key.vgrf = vgrf;
2131 key.reg_offset = reg_offset;
2132
2133 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2134 }
2135
2136 static void
2137 remove_dead_code_hash(struct hash_table *ht,
2138 int vgrf, int reg_offset)
2139 {
2140 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2141 if (!entry)
2142 return;
2143
2144 _mesa_hash_table_remove(ht, entry);
2145 }
2146
2147 /**
2148 * Walks basic blocks, removing any regs that are written but not read before
2149 * being redefined.
2150 *
2151 * The dead_code_eliminate() function implements a global dead code
2152 * elimination, but it only handles the removing the last write to a register
2153 * if it's never read. This one can handle intermediate writes, but only
2154 * within a basic block.
2155 */
2156 bool
2157 fs_visitor::dead_code_eliminate_local()
2158 {
2159 struct hash_table *ht;
2160 bool progress = false;
2161
2162 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2163
2164 foreach_list_safe(node, &this->instructions) {
2165 fs_inst *inst = (fs_inst *)node;
2166
2167 /* At a basic block, empty the HT since we don't understand dataflow
2168 * here.
2169 */
2170 if (inst->is_control_flow()) {
2171 clear_dead_code_hash(ht);
2172 continue;
2173 }
2174
2175 /* Clear the HT of any instructions that got read. */
2176 for (int i = 0; i < 3; i++) {
2177 fs_reg src = inst->src[i];
2178 if (src.file != GRF)
2179 continue;
2180
2181 int read = 1;
2182 if (inst->is_send_from_grf())
2183 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2184
2185 for (int reg_offset = src.reg_offset;
2186 reg_offset < src.reg_offset + read;
2187 reg_offset++) {
2188 remove_dead_code_hash(ht, src.reg, reg_offset);
2189 }
2190 }
2191
2192 /* Add any update of a GRF to the HT, removing a previous write if it
2193 * wasn't read.
2194 */
2195 if (inst->dst.file == GRF) {
2196 if (inst->regs_written > 1) {
2197 /* We don't know how to trim channels from an instruction's
2198 * writes, so we can't incrementally remove unread channels from
2199 * it. Just remove whatever it overwrites from the table
2200 */
2201 for (int i = 0; i < inst->regs_written; i++) {
2202 remove_dead_code_hash(ht,
2203 inst->dst.reg,
2204 inst->dst.reg_offset + i);
2205 }
2206 } else {
2207 struct hash_entry *entry =
2208 get_dead_code_hash_entry(ht, inst->dst.reg,
2209 inst->dst.reg_offset);
2210
2211 if (entry) {
2212 if (inst->is_partial_write()) {
2213 /* For a partial write, we can't remove any previous dead code
2214 * candidate, since we're just modifying their result.
2215 */
2216 } else {
2217 /* We're completely updating a channel, and there was a
2218 * previous write to the channel that wasn't read. Kill it!
2219 */
2220 fs_inst *inst = (fs_inst *)entry->data;
2221 inst->remove();
2222 progress = true;
2223 }
2224
2225 _mesa_hash_table_remove(ht, entry);
2226 }
2227
2228 if (!inst->has_side_effects())
2229 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2230 inst);
2231 }
2232 }
2233 }
2234
2235 _mesa_hash_table_destroy(ht, NULL);
2236
2237 if (progress)
2238 invalidate_live_intervals();
2239
2240 return progress;
2241 }
2242
2243 /**
2244 * Implements a second type of register coalescing: This one checks if
2245 * the two regs involved in a raw move don't interfere, in which case
2246 * they can both by stored in the same place and the MOV removed.
2247 */
2248 bool
2249 fs_visitor::register_coalesce_2()
2250 {
2251 bool progress = false;
2252
2253 calculate_live_intervals();
2254
2255 foreach_list_safe(node, &this->instructions) {
2256 fs_inst *inst = (fs_inst *)node;
2257
2258 if (inst->opcode != BRW_OPCODE_MOV ||
2259 inst->is_partial_write() ||
2260 inst->saturate ||
2261 inst->src[0].file != GRF ||
2262 inst->src[0].negate ||
2263 inst->src[0].abs ||
2264 inst->src[0].smear != -1 ||
2265 inst->dst.file != GRF ||
2266 inst->dst.type != inst->src[0].type ||
2267 virtual_grf_sizes[inst->src[0].reg] != 1) {
2268 continue;
2269 }
2270
2271 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2272 int var_to = live_intervals->var_from_reg(&inst->dst);
2273
2274 if (live_intervals->vars_interfere(var_from, var_to) &&
2275 !inst->dst.equals(inst->src[0]))
2276 continue;
2277
2278 int reg_from = inst->src[0].reg;
2279 assert(inst->src[0].reg_offset == 0);
2280 int reg_to = inst->dst.reg;
2281 int reg_to_offset = inst->dst.reg_offset;
2282
2283 foreach_list(node, &this->instructions) {
2284 fs_inst *scan_inst = (fs_inst *)node;
2285
2286 if (scan_inst->dst.file == GRF &&
2287 scan_inst->dst.reg == reg_from) {
2288 scan_inst->dst.reg = reg_to;
2289 scan_inst->dst.reg_offset = reg_to_offset;
2290 }
2291 for (int i = 0; i < 3; i++) {
2292 if (scan_inst->src[i].file == GRF &&
2293 scan_inst->src[i].reg == reg_from) {
2294 scan_inst->src[i].reg = reg_to;
2295 scan_inst->src[i].reg_offset = reg_to_offset;
2296 }
2297 }
2298 }
2299
2300 inst->remove();
2301 progress = true;
2302 continue;
2303 }
2304
2305 if (progress)
2306 invalidate_live_intervals();
2307
2308 return progress;
2309 }
2310
2311 bool
2312 fs_visitor::register_coalesce()
2313 {
2314 bool progress = false;
2315 int if_depth = 0;
2316 int loop_depth = 0;
2317
2318 foreach_list_safe(node, &this->instructions) {
2319 fs_inst *inst = (fs_inst *)node;
2320
2321 /* Make sure that we dominate the instructions we're going to
2322 * scan for interfering with our coalescing, or we won't have
2323 * scanned enough to see if anything interferes with our
2324 * coalescing. We don't dominate the following instructions if
2325 * we're in a loop or an if block.
2326 */
2327 switch (inst->opcode) {
2328 case BRW_OPCODE_DO:
2329 loop_depth++;
2330 break;
2331 case BRW_OPCODE_WHILE:
2332 loop_depth--;
2333 break;
2334 case BRW_OPCODE_IF:
2335 if_depth++;
2336 break;
2337 case BRW_OPCODE_ENDIF:
2338 if_depth--;
2339 break;
2340 default:
2341 break;
2342 }
2343 if (loop_depth || if_depth)
2344 continue;
2345
2346 if (inst->opcode != BRW_OPCODE_MOV ||
2347 inst->is_partial_write() ||
2348 inst->saturate ||
2349 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2350 inst->src[0].file != UNIFORM)||
2351 inst->dst.type != inst->src[0].type)
2352 continue;
2353
2354 bool has_source_modifiers = (inst->src[0].abs ||
2355 inst->src[0].negate ||
2356 inst->src[0].smear != -1 ||
2357 inst->src[0].file == UNIFORM);
2358
2359 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2360 * them: check for no writes to either one until the exit of the
2361 * program.
2362 */
2363 bool interfered = false;
2364
2365 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2366 !scan_inst->is_tail_sentinel();
2367 scan_inst = (fs_inst *)scan_inst->next) {
2368 if (scan_inst->dst.file == GRF) {
2369 if (scan_inst->overwrites_reg(inst->dst) ||
2370 scan_inst->overwrites_reg(inst->src[0])) {
2371 interfered = true;
2372 break;
2373 }
2374 }
2375
2376 if (has_source_modifiers) {
2377 for (int i = 0; i < 3; i++) {
2378 if (scan_inst->src[i].file == GRF &&
2379 scan_inst->src[i].reg == inst->dst.reg &&
2380 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2381 inst->dst.type != scan_inst->src[i].type)
2382 {
2383 interfered = true;
2384 break;
2385 }
2386 }
2387 }
2388
2389
2390 /* The gen6 MATH instruction can't handle source modifiers or
2391 * unusual register regions, so avoid coalescing those for
2392 * now. We should do something more specific.
2393 */
2394 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2395 interfered = true;
2396 break;
2397 }
2398
2399 if (scan_inst->mlen > 0 && scan_inst->base_mrf == -1 &&
2400 scan_inst->src[0].file == GRF &&
2401 scan_inst->src[0].reg == inst->dst.reg) {
2402 interfered = true;
2403 break;
2404 }
2405
2406 /* The accumulator result appears to get used for the
2407 * conditional modifier generation. When negating a UD
2408 * value, there is a 33rd bit generated for the sign in the
2409 * accumulator value, so now you can't check, for example,
2410 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2411 */
2412 if (scan_inst->conditional_mod &&
2413 inst->src[0].negate &&
2414 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2415 interfered = true;
2416 break;
2417 }
2418 }
2419 if (interfered) {
2420 continue;
2421 }
2422
2423 /* Rewrite the later usage to point at the source of the move to
2424 * be removed.
2425 */
2426 for (fs_inst *scan_inst = inst;
2427 !scan_inst->is_tail_sentinel();
2428 scan_inst = (fs_inst *)scan_inst->next) {
2429 for (int i = 0; i < 3; i++) {
2430 if (scan_inst->src[i].file == GRF &&
2431 scan_inst->src[i].reg == inst->dst.reg &&
2432 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2433 fs_reg new_src = inst->src[0];
2434 new_src.type = scan_inst->src[i].type;
2435 if (scan_inst->src[i].abs) {
2436 new_src.negate = 0;
2437 new_src.abs = 1;
2438 }
2439 new_src.negate ^= scan_inst->src[i].negate;
2440 new_src.sechalf = scan_inst->src[i].sechalf;
2441 scan_inst->src[i] = new_src;
2442 }
2443 }
2444 }
2445
2446 inst->remove();
2447 progress = true;
2448 }
2449
2450 if (progress)
2451 invalidate_live_intervals();
2452
2453 return progress;
2454 }
2455
2456
2457 bool
2458 fs_visitor::compute_to_mrf()
2459 {
2460 bool progress = false;
2461 int next_ip = 0;
2462
2463 calculate_live_intervals();
2464
2465 foreach_list_safe(node, &this->instructions) {
2466 fs_inst *inst = (fs_inst *)node;
2467
2468 int ip = next_ip;
2469 next_ip++;
2470
2471 if (inst->opcode != BRW_OPCODE_MOV ||
2472 inst->is_partial_write() ||
2473 inst->dst.file != MRF || inst->src[0].file != GRF ||
2474 inst->dst.type != inst->src[0].type ||
2475 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2476 continue;
2477
2478 /* Work out which hardware MRF registers are written by this
2479 * instruction.
2480 */
2481 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2482 int mrf_high;
2483 if (inst->dst.reg & BRW_MRF_COMPR4) {
2484 mrf_high = mrf_low + 4;
2485 } else if (dispatch_width == 16 &&
2486 (!inst->force_uncompressed && !inst->force_sechalf)) {
2487 mrf_high = mrf_low + 1;
2488 } else {
2489 mrf_high = mrf_low;
2490 }
2491
2492 /* Can't compute-to-MRF this GRF if someone else was going to
2493 * read it later.
2494 */
2495 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2496 continue;
2497
2498 /* Found a move of a GRF to a MRF. Let's see if we can go
2499 * rewrite the thing that made this GRF to write into the MRF.
2500 */
2501 fs_inst *scan_inst;
2502 for (scan_inst = (fs_inst *)inst->prev;
2503 scan_inst->prev != NULL;
2504 scan_inst = (fs_inst *)scan_inst->prev) {
2505 if (scan_inst->dst.file == GRF &&
2506 scan_inst->dst.reg == inst->src[0].reg) {
2507 /* Found the last thing to write our reg we want to turn
2508 * into a compute-to-MRF.
2509 */
2510
2511 /* If this one instruction didn't populate all the
2512 * channels, bail. We might be able to rewrite everything
2513 * that writes that reg, but it would require smarter
2514 * tracking to delay the rewriting until complete success.
2515 */
2516 if (scan_inst->is_partial_write())
2517 break;
2518
2519 /* Things returning more than one register would need us to
2520 * understand coalescing out more than one MOV at a time.
2521 */
2522 if (scan_inst->regs_written > 1)
2523 break;
2524
2525 /* SEND instructions can't have MRF as a destination. */
2526 if (scan_inst->mlen)
2527 break;
2528
2529 if (brw->gen == 6) {
2530 /* gen6 math instructions must have the destination be
2531 * GRF, so no compute-to-MRF for them.
2532 */
2533 if (scan_inst->is_math()) {
2534 break;
2535 }
2536 }
2537
2538 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2539 /* Found the creator of our MRF's source value. */
2540 scan_inst->dst.file = MRF;
2541 scan_inst->dst.reg = inst->dst.reg;
2542 scan_inst->saturate |= inst->saturate;
2543 inst->remove();
2544 progress = true;
2545 }
2546 break;
2547 }
2548
2549 /* We don't handle control flow here. Most computation of
2550 * values that end up in MRFs are shortly before the MRF
2551 * write anyway.
2552 */
2553 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2554 break;
2555
2556 /* You can't read from an MRF, so if someone else reads our
2557 * MRF's source GRF that we wanted to rewrite, that stops us.
2558 */
2559 bool interfered = false;
2560 for (int i = 0; i < 3; i++) {
2561 if (scan_inst->src[i].file == GRF &&
2562 scan_inst->src[i].reg == inst->src[0].reg &&
2563 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2564 interfered = true;
2565 }
2566 }
2567 if (interfered)
2568 break;
2569
2570 if (scan_inst->dst.file == MRF) {
2571 /* If somebody else writes our MRF here, we can't
2572 * compute-to-MRF before that.
2573 */
2574 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2575 int scan_mrf_high;
2576
2577 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2578 scan_mrf_high = scan_mrf_low + 4;
2579 } else if (dispatch_width == 16 &&
2580 (!scan_inst->force_uncompressed &&
2581 !scan_inst->force_sechalf)) {
2582 scan_mrf_high = scan_mrf_low + 1;
2583 } else {
2584 scan_mrf_high = scan_mrf_low;
2585 }
2586
2587 if (mrf_low == scan_mrf_low ||
2588 mrf_low == scan_mrf_high ||
2589 mrf_high == scan_mrf_low ||
2590 mrf_high == scan_mrf_high) {
2591 break;
2592 }
2593 }
2594
2595 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2596 /* Found a SEND instruction, which means that there are
2597 * live values in MRFs from base_mrf to base_mrf +
2598 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2599 * above it.
2600 */
2601 if (mrf_low >= scan_inst->base_mrf &&
2602 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2603 break;
2604 }
2605 if (mrf_high >= scan_inst->base_mrf &&
2606 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2607 break;
2608 }
2609 }
2610 }
2611 }
2612
2613 if (progress)
2614 invalidate_live_intervals();
2615
2616 return progress;
2617 }
2618
2619 /**
2620 * Walks through basic blocks, looking for repeated MRF writes and
2621 * removing the later ones.
2622 */
2623 bool
2624 fs_visitor::remove_duplicate_mrf_writes()
2625 {
2626 fs_inst *last_mrf_move[16];
2627 bool progress = false;
2628
2629 /* Need to update the MRF tracking for compressed instructions. */
2630 if (dispatch_width == 16)
2631 return false;
2632
2633 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2634
2635 foreach_list_safe(node, &this->instructions) {
2636 fs_inst *inst = (fs_inst *)node;
2637
2638 if (inst->is_control_flow()) {
2639 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2640 }
2641
2642 if (inst->opcode == BRW_OPCODE_MOV &&
2643 inst->dst.file == MRF) {
2644 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2645 if (prev_inst && inst->equals(prev_inst)) {
2646 inst->remove();
2647 progress = true;
2648 continue;
2649 }
2650 }
2651
2652 /* Clear out the last-write records for MRFs that were overwritten. */
2653 if (inst->dst.file == MRF) {
2654 last_mrf_move[inst->dst.reg] = NULL;
2655 }
2656
2657 if (inst->mlen > 0 && inst->base_mrf != -1) {
2658 /* Found a SEND instruction, which will include two or fewer
2659 * implied MRF writes. We could do better here.
2660 */
2661 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2662 last_mrf_move[inst->base_mrf + i] = NULL;
2663 }
2664 }
2665
2666 /* Clear out any MRF move records whose sources got overwritten. */
2667 if (inst->dst.file == GRF) {
2668 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2669 if (last_mrf_move[i] &&
2670 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2671 last_mrf_move[i] = NULL;
2672 }
2673 }
2674 }
2675
2676 if (inst->opcode == BRW_OPCODE_MOV &&
2677 inst->dst.file == MRF &&
2678 inst->src[0].file == GRF &&
2679 !inst->is_partial_write()) {
2680 last_mrf_move[inst->dst.reg] = inst;
2681 }
2682 }
2683
2684 if (progress)
2685 invalidate_live_intervals();
2686
2687 return progress;
2688 }
2689
2690 static void
2691 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2692 int first_grf, int grf_len)
2693 {
2694 bool inst_16wide = (dispatch_width > 8 &&
2695 !inst->force_uncompressed &&
2696 !inst->force_sechalf);
2697
2698 /* Clear the flag for registers that actually got read (as expected). */
2699 for (int i = 0; i < 3; i++) {
2700 int grf;
2701 if (inst->src[i].file == GRF) {
2702 grf = inst->src[i].reg;
2703 } else if (inst->src[i].file == HW_REG &&
2704 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2705 grf = inst->src[i].fixed_hw_reg.nr;
2706 } else {
2707 continue;
2708 }
2709
2710 if (grf >= first_grf &&
2711 grf < first_grf + grf_len) {
2712 deps[grf - first_grf] = false;
2713 if (inst_16wide)
2714 deps[grf - first_grf + 1] = false;
2715 }
2716 }
2717 }
2718
2719 /**
2720 * Implements this workaround for the original 965:
2721 *
2722 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2723 * check for post destination dependencies on this instruction, software
2724 * must ensure that there is no destination hazard for the case of ‘write
2725 * followed by a posted write’ shown in the following example.
2726 *
2727 * 1. mov r3 0
2728 * 2. send r3.xy <rest of send instruction>
2729 * 3. mov r2 r3
2730 *
2731 * Due to no post-destination dependency check on the ‘send’, the above
2732 * code sequence could have two instructions (1 and 2) in flight at the
2733 * same time that both consider ‘r3’ as the target of their final writes.
2734 */
2735 void
2736 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2737 {
2738 int reg_size = dispatch_width / 8;
2739 int write_len = inst->regs_written * reg_size;
2740 int first_write_grf = inst->dst.reg;
2741 bool needs_dep[BRW_MAX_MRF];
2742 assert(write_len < (int)sizeof(needs_dep) - 1);
2743
2744 memset(needs_dep, false, sizeof(needs_dep));
2745 memset(needs_dep, true, write_len);
2746
2747 clear_deps_for_inst_src(inst, dispatch_width,
2748 needs_dep, first_write_grf, write_len);
2749
2750 /* Walk backwards looking for writes to registers we're writing which
2751 * aren't read since being written. If we hit the start of the program,
2752 * we assume that there are no outstanding dependencies on entry to the
2753 * program.
2754 */
2755 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2756 scan_inst != NULL;
2757 scan_inst = (fs_inst *)scan_inst->prev) {
2758
2759 /* If we hit control flow, assume that there *are* outstanding
2760 * dependencies, and force their cleanup before our instruction.
2761 */
2762 if (scan_inst->is_control_flow()) {
2763 for (int i = 0; i < write_len; i++) {
2764 if (needs_dep[i]) {
2765 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2766 }
2767 }
2768 return;
2769 }
2770
2771 bool scan_inst_16wide = (dispatch_width > 8 &&
2772 !scan_inst->force_uncompressed &&
2773 !scan_inst->force_sechalf);
2774
2775 /* We insert our reads as late as possible on the assumption that any
2776 * instruction but a MOV that might have left us an outstanding
2777 * dependency has more latency than a MOV.
2778 */
2779 if (scan_inst->dst.file == GRF) {
2780 for (int i = 0; i < scan_inst->regs_written; i++) {
2781 int reg = scan_inst->dst.reg + i * reg_size;
2782
2783 if (reg >= first_write_grf &&
2784 reg < first_write_grf + write_len &&
2785 needs_dep[reg - first_write_grf]) {
2786 inst->insert_before(DEP_RESOLVE_MOV(reg));
2787 needs_dep[reg - first_write_grf] = false;
2788 if (scan_inst_16wide)
2789 needs_dep[reg - first_write_grf + 1] = false;
2790 }
2791 }
2792 }
2793
2794 /* Clear the flag for registers that actually got read (as expected). */
2795 clear_deps_for_inst_src(scan_inst, dispatch_width,
2796 needs_dep, first_write_grf, write_len);
2797
2798 /* Continue the loop only if we haven't resolved all the dependencies */
2799 int i;
2800 for (i = 0; i < write_len; i++) {
2801 if (needs_dep[i])
2802 break;
2803 }
2804 if (i == write_len)
2805 return;
2806 }
2807 }
2808
2809 /**
2810 * Implements this workaround for the original 965:
2811 *
2812 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2813 * used as a destination register until after it has been sourced by an
2814 * instruction with a different destination register.
2815 */
2816 void
2817 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2818 {
2819 int write_len = inst->regs_written * dispatch_width / 8;
2820 int first_write_grf = inst->dst.reg;
2821 bool needs_dep[BRW_MAX_MRF];
2822 assert(write_len < (int)sizeof(needs_dep) - 1);
2823
2824 memset(needs_dep, false, sizeof(needs_dep));
2825 memset(needs_dep, true, write_len);
2826 /* Walk forwards looking for writes to registers we're writing which aren't
2827 * read before being written.
2828 */
2829 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2830 !scan_inst->is_tail_sentinel();
2831 scan_inst = (fs_inst *)scan_inst->next) {
2832 /* If we hit control flow, force resolve all remaining dependencies. */
2833 if (scan_inst->is_control_flow()) {
2834 for (int i = 0; i < write_len; i++) {
2835 if (needs_dep[i])
2836 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2837 }
2838 return;
2839 }
2840
2841 /* Clear the flag for registers that actually got read (as expected). */
2842 clear_deps_for_inst_src(scan_inst, dispatch_width,
2843 needs_dep, first_write_grf, write_len);
2844
2845 /* We insert our reads as late as possible since they're reading the
2846 * result of a SEND, which has massive latency.
2847 */
2848 if (scan_inst->dst.file == GRF &&
2849 scan_inst->dst.reg >= first_write_grf &&
2850 scan_inst->dst.reg < first_write_grf + write_len &&
2851 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2852 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2853 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2854 }
2855
2856 /* Continue the loop only if we haven't resolved all the dependencies */
2857 int i;
2858 for (i = 0; i < write_len; i++) {
2859 if (needs_dep[i])
2860 break;
2861 }
2862 if (i == write_len)
2863 return;
2864 }
2865
2866 /* If we hit the end of the program, resolve all remaining dependencies out
2867 * of paranoia.
2868 */
2869 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2870 assert(last_inst->eot);
2871 for (int i = 0; i < write_len; i++) {
2872 if (needs_dep[i])
2873 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2874 }
2875 }
2876
2877 void
2878 fs_visitor::insert_gen4_send_dependency_workarounds()
2879 {
2880 if (brw->gen != 4 || brw->is_g4x)
2881 return;
2882
2883 /* Note that we're done with register allocation, so GRF fs_regs always
2884 * have a .reg_offset of 0.
2885 */
2886
2887 foreach_list_safe(node, &this->instructions) {
2888 fs_inst *inst = (fs_inst *)node;
2889
2890 if (inst->mlen != 0 && inst->dst.file == GRF) {
2891 insert_gen4_pre_send_dependency_workarounds(inst);
2892 insert_gen4_post_send_dependency_workarounds(inst);
2893 }
2894 }
2895 }
2896
2897 /**
2898 * Turns the generic expression-style uniform pull constant load instruction
2899 * into a hardware-specific series of instructions for loading a pull
2900 * constant.
2901 *
2902 * The expression style allows the CSE pass before this to optimize out
2903 * repeated loads from the same offset, and gives the pre-register-allocation
2904 * scheduling full flexibility, while the conversion to native instructions
2905 * allows the post-register-allocation scheduler the best information
2906 * possible.
2907 *
2908 * Note that execution masking for setting up pull constant loads is special:
2909 * the channels that need to be written are unrelated to the current execution
2910 * mask, since a later instruction will use one of the result channels as a
2911 * source operand for all 8 or 16 of its channels.
2912 */
2913 void
2914 fs_visitor::lower_uniform_pull_constant_loads()
2915 {
2916 foreach_list(node, &this->instructions) {
2917 fs_inst *inst = (fs_inst *)node;
2918
2919 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2920 continue;
2921
2922 if (brw->gen >= 7) {
2923 /* The offset arg before was a vec4-aligned byte offset. We need to
2924 * turn it into a dword offset.
2925 */
2926 fs_reg const_offset_reg = inst->src[1];
2927 assert(const_offset_reg.file == IMM &&
2928 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2929 const_offset_reg.imm.u /= 4;
2930 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2931
2932 /* This is actually going to be a MOV, but since only the first dword
2933 * is accessed, we have a special opcode to do just that one. Note
2934 * that this needs to be an operation that will be considered a def
2935 * by live variable analysis, or register allocation will explode.
2936 */
2937 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2938 payload, const_offset_reg);
2939 setup->force_writemask_all = true;
2940
2941 setup->ir = inst->ir;
2942 setup->annotation = inst->annotation;
2943 inst->insert_before(setup);
2944
2945 /* Similarly, this will only populate the first 4 channels of the
2946 * result register (since we only use smear values from 0-3), but we
2947 * don't tell the optimizer.
2948 */
2949 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2950 inst->src[1] = payload;
2951
2952 invalidate_live_intervals();
2953 } else {
2954 /* Before register allocation, we didn't tell the scheduler about the
2955 * MRF we use. We know it's safe to use this MRF because nothing
2956 * else does except for register spill/unspill, which generates and
2957 * uses its MRF within a single IR instruction.
2958 */
2959 inst->base_mrf = 14;
2960 inst->mlen = 1;
2961 }
2962 }
2963 }
2964
2965 void
2966 fs_visitor::dump_instruction(backend_instruction *be_inst)
2967 {
2968 fs_inst *inst = (fs_inst *)be_inst;
2969
2970 if (inst->predicate) {
2971 printf("(%cf0.%d) ",
2972 inst->predicate_inverse ? '-' : '+',
2973 inst->flag_subreg);
2974 }
2975
2976 printf("%s", brw_instruction_name(inst->opcode));
2977 if (inst->saturate)
2978 printf(".sat");
2979 if (inst->conditional_mod) {
2980 printf(".cmod");
2981 if (!inst->predicate &&
2982 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2983 inst->opcode != BRW_OPCODE_IF &&
2984 inst->opcode != BRW_OPCODE_WHILE))) {
2985 printf(".f0.%d", inst->flag_subreg);
2986 }
2987 }
2988 printf(" ");
2989
2990
2991 switch (inst->dst.file) {
2992 case GRF:
2993 printf("vgrf%d", inst->dst.reg);
2994 if (inst->dst.reg_offset)
2995 printf("+%d", inst->dst.reg_offset);
2996 break;
2997 case MRF:
2998 printf("m%d", inst->dst.reg);
2999 break;
3000 case BAD_FILE:
3001 printf("(null)");
3002 break;
3003 case UNIFORM:
3004 printf("***u%d***", inst->dst.reg);
3005 break;
3006 case HW_REG:
3007 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
3008 if (inst->dst.fixed_hw_reg.subnr)
3009 printf("+%d", inst->dst.fixed_hw_reg.subnr);
3010 break;
3011 default:
3012 printf("???");
3013 break;
3014 }
3015 printf(", ");
3016
3017 for (int i = 0; i < 3; i++) {
3018 if (inst->src[i].negate)
3019 printf("-");
3020 if (inst->src[i].abs)
3021 printf("|");
3022 switch (inst->src[i].file) {
3023 case GRF:
3024 printf("vgrf%d", inst->src[i].reg);
3025 if (inst->src[i].reg_offset)
3026 printf("+%d", inst->src[i].reg_offset);
3027 break;
3028 case MRF:
3029 printf("***m%d***", inst->src[i].reg);
3030 break;
3031 case UNIFORM:
3032 printf("u%d", inst->src[i].reg);
3033 if (inst->src[i].reg_offset)
3034 printf(".%d", inst->src[i].reg_offset);
3035 break;
3036 case BAD_FILE:
3037 printf("(null)");
3038 break;
3039 case IMM:
3040 switch (inst->src[i].type) {
3041 case BRW_REGISTER_TYPE_F:
3042 printf("%ff", inst->src[i].imm.f);
3043 break;
3044 case BRW_REGISTER_TYPE_D:
3045 printf("%dd", inst->src[i].imm.i);
3046 break;
3047 case BRW_REGISTER_TYPE_UD:
3048 printf("%uu", inst->src[i].imm.u);
3049 break;
3050 default:
3051 printf("???");
3052 break;
3053 }
3054 break;
3055 case HW_REG:
3056 if (inst->src[i].fixed_hw_reg.negate)
3057 printf("-");
3058 if (inst->src[i].fixed_hw_reg.abs)
3059 printf("|");
3060 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3061 if (inst->src[i].fixed_hw_reg.subnr)
3062 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
3063 if (inst->src[i].fixed_hw_reg.abs)
3064 printf("|");
3065 break;
3066 default:
3067 printf("???");
3068 break;
3069 }
3070 if (inst->src[i].abs)
3071 printf("|");
3072
3073 if (i < 3)
3074 printf(", ");
3075 }
3076
3077 printf(" ");
3078
3079 if (inst->force_uncompressed)
3080 printf("1sthalf ");
3081
3082 if (inst->force_sechalf)
3083 printf("2ndhalf ");
3084
3085 printf("\n");
3086 }
3087
3088 /**
3089 * Possibly returns an instruction that set up @param reg.
3090 *
3091 * Sometimes we want to take the result of some expression/variable
3092 * dereference tree and rewrite the instruction generating the result
3093 * of the tree. When processing the tree, we know that the
3094 * instructions generated are all writing temporaries that are dead
3095 * outside of this tree. So, if we have some instructions that write
3096 * a temporary, we're free to point that temp write somewhere else.
3097 *
3098 * Note that this doesn't guarantee that the instruction generated
3099 * only reg -- it might be the size=4 destination of a texture instruction.
3100 */
3101 fs_inst *
3102 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3103 fs_inst *end,
3104 fs_reg reg)
3105 {
3106 if (end == start ||
3107 end->is_partial_write() ||
3108 reg.reladdr ||
3109 !reg.equals(end->dst)) {
3110 return NULL;
3111 } else {
3112 return end;
3113 }
3114 }
3115
3116 void
3117 fs_visitor::setup_payload_gen6()
3118 {
3119 bool uses_depth =
3120 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3121 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3122
3123 assert(brw->gen >= 6);
3124
3125 /* R0-1: masks, pixel X/Y coordinates. */
3126 c->nr_payload_regs = 2;
3127 /* R2: only for 32-pixel dispatch.*/
3128
3129 /* R3-26: barycentric interpolation coordinates. These appear in the
3130 * same order that they appear in the brw_wm_barycentric_interp_mode
3131 * enum. Each set of coordinates occupies 2 registers if dispatch width
3132 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3133 * appear if they were enabled using the "Barycentric Interpolation
3134 * Mode" bits in WM_STATE.
3135 */
3136 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3137 if (barycentric_interp_modes & (1 << i)) {
3138 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3139 c->nr_payload_regs += 2;
3140 if (dispatch_width == 16) {
3141 c->nr_payload_regs += 2;
3142 }
3143 }
3144 }
3145
3146 /* R27: interpolated depth if uses source depth */
3147 if (uses_depth) {
3148 c->source_depth_reg = c->nr_payload_regs;
3149 c->nr_payload_regs++;
3150 if (dispatch_width == 16) {
3151 /* R28: interpolated depth if not 8-wide. */
3152 c->nr_payload_regs++;
3153 }
3154 }
3155 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3156 if (uses_depth) {
3157 c->source_w_reg = c->nr_payload_regs;
3158 c->nr_payload_regs++;
3159 if (dispatch_width == 16) {
3160 /* R30: interpolated W if not 8-wide. */
3161 c->nr_payload_regs++;
3162 }
3163 }
3164
3165 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3166 /* R31: MSAA position offsets. */
3167 if (c->prog_data.uses_pos_offset) {
3168 c->sample_pos_reg = c->nr_payload_regs;
3169 c->nr_payload_regs++;
3170 }
3171
3172 /* R32-: bary for 32-pixel. */
3173 /* R58-59: interp W for 32-pixel. */
3174
3175 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3176 c->source_depth_to_render_target = true;
3177 }
3178 }
3179
3180 void
3181 fs_visitor::assign_binding_table_offsets()
3182 {
3183 uint32_t next_binding_table_offset = 0;
3184
3185 /* If there are no color regions, we still perform an FB write to a null
3186 * renderbuffer, which we place at surface index 0.
3187 */
3188 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3189 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3190
3191 assign_common_binding_table_offsets(next_binding_table_offset);
3192 }
3193
3194 bool
3195 fs_visitor::run()
3196 {
3197 sanity_param_count = fp->Base.Parameters->NumParameters;
3198 uint32_t orig_nr_params = c->prog_data.nr_params;
3199 bool allocated_without_spills;
3200
3201 assign_binding_table_offsets();
3202
3203 if (brw->gen >= 6)
3204 setup_payload_gen6();
3205 else
3206 setup_payload_gen4();
3207
3208 if (0) {
3209 emit_dummy_fs();
3210 } else {
3211 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3212 emit_shader_time_begin();
3213
3214 calculate_urb_setup();
3215 if (fp->Base.InputsRead > 0) {
3216 if (brw->gen < 6)
3217 emit_interpolation_setup_gen4();
3218 else
3219 emit_interpolation_setup_gen6();
3220 }
3221
3222 /* We handle discards by keeping track of the still-live pixels in f0.1.
3223 * Initialize it with the dispatched pixels.
3224 */
3225 if (fp->UsesKill || c->key.alpha_test_func) {
3226 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3227 discard_init->flag_subreg = 1;
3228 }
3229
3230 /* Generate FS IR for main(). (the visitor only descends into
3231 * functions called "main").
3232 */
3233 if (shader) {
3234 foreach_list(node, &*shader->ir) {
3235 ir_instruction *ir = (ir_instruction *)node;
3236 base_ir = ir;
3237 this->result = reg_undef;
3238 ir->accept(this);
3239 }
3240 } else {
3241 emit_fragment_program_code();
3242 }
3243 base_ir = NULL;
3244 if (failed)
3245 return false;
3246
3247 emit(FS_OPCODE_PLACEHOLDER_HALT);
3248
3249 if (c->key.alpha_test_func)
3250 emit_alpha_test();
3251
3252 emit_fb_writes();
3253
3254 split_virtual_grfs();
3255
3256 move_uniform_array_access_to_pull_constants();
3257 remove_dead_constants();
3258 setup_pull_constants();
3259
3260 bool progress;
3261 do {
3262 progress = false;
3263
3264 compact_virtual_grfs();
3265
3266 progress = remove_duplicate_mrf_writes() || progress;
3267
3268 progress = opt_algebraic() || progress;
3269 progress = opt_cse() || progress;
3270 progress = opt_copy_propagate() || progress;
3271 progress = dead_code_eliminate() || progress;
3272 progress = dead_code_eliminate_local() || progress;
3273 progress = dead_control_flow_eliminate(this) || progress;
3274 progress = register_coalesce() || progress;
3275 progress = register_coalesce_2() || progress;
3276 progress = compute_to_mrf() || progress;
3277 } while (progress);
3278
3279 lower_uniform_pull_constant_loads();
3280
3281 assign_curb_setup();
3282 assign_urb_setup();
3283
3284 static enum instruction_scheduler_mode pre_modes[] = {
3285 SCHEDULE_PRE,
3286 SCHEDULE_PRE_NON_LIFO,
3287 SCHEDULE_PRE_LIFO,
3288 };
3289
3290 /* Try each scheduling heuristic to see if it can successfully register
3291 * allocate without spilling. They should be ordered by decreasing
3292 * performance but increasing likelihood of allocating.
3293 */
3294 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3295 schedule_instructions(pre_modes[i]);
3296
3297 if (0) {
3298 assign_regs_trivial();
3299 allocated_without_spills = true;
3300 } else {
3301 allocated_without_spills = assign_regs(false);
3302 }
3303 if (allocated_without_spills)
3304 break;
3305 }
3306
3307 if (!allocated_without_spills) {
3308 /* We assume that any spilling is worse than just dropping back to
3309 * SIMD8. There's probably actually some intermediate point where
3310 * SIMD16 with a couple of spills is still better.
3311 */
3312 if (dispatch_width == 16) {
3313 fail("Failure to register allocate. Reduce number of "
3314 "live scalar values to avoid this.");
3315 }
3316
3317 /* Since we're out of heuristics, just go spill registers until we
3318 * get an allocation.
3319 */
3320 while (!assign_regs(true)) {
3321 if (failed)
3322 break;
3323 }
3324 }
3325 }
3326 assert(force_uncompressed_stack == 0);
3327
3328 /* This must come after all optimization and register allocation, since
3329 * it inserts dead code that happens to have side effects, and it does
3330 * so based on the actual physical registers in use.
3331 */
3332 insert_gen4_send_dependency_workarounds();
3333
3334 if (failed)
3335 return false;
3336
3337 if (!allocated_without_spills)
3338 schedule_instructions(SCHEDULE_POST);
3339
3340 if (dispatch_width == 8) {
3341 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3342 } else {
3343 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3344
3345 /* Make sure we didn't try to sneak in an extra uniform */
3346 assert(orig_nr_params == c->prog_data.nr_params);
3347 (void) orig_nr_params;
3348 }
3349
3350 /* If any state parameters were appended, then ParameterValues could have
3351 * been realloced, in which case the driver uniform storage set up by
3352 * _mesa_associate_uniform_storage() would point to freed memory. Make
3353 * sure that didn't happen.
3354 */
3355 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3356
3357 return !failed;
3358 }
3359
3360 const unsigned *
3361 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3362 struct gl_fragment_program *fp,
3363 struct gl_shader_program *prog,
3364 unsigned *final_assembly_size)
3365 {
3366 bool start_busy = false;
3367 float start_time = 0;
3368
3369 if (unlikely(brw->perf_debug)) {
3370 start_busy = (brw->batch.last_bo &&
3371 drm_intel_bo_busy(brw->batch.last_bo));
3372 start_time = get_time();
3373 }
3374
3375 struct brw_shader *shader = NULL;
3376 if (prog)
3377 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3378
3379 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3380 if (prog) {
3381 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3382 _mesa_print_ir(shader->ir, NULL);
3383 printf("\n\n");
3384 } else {
3385 printf("ARB_fragment_program %d ir for native fragment shader\n",
3386 fp->Base.Id);
3387 _mesa_print_program(&fp->Base);
3388 }
3389 }
3390
3391 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3392 */
3393 fs_visitor v(brw, c, prog, fp, 8);
3394 if (!v.run()) {
3395 if (prog) {
3396 prog->LinkStatus = false;
3397 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3398 }
3399
3400 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3401 v.fail_msg);
3402
3403 return NULL;
3404 }
3405
3406 exec_list *simd16_instructions = NULL;
3407 fs_visitor v2(brw, c, prog, fp, 16);
3408 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3409 if (c->prog_data.nr_pull_params == 0) {
3410 /* Try a 16-wide compile */
3411 v2.import_uniforms(&v);
3412 if (!v2.run()) {
3413 perf_debug("16-wide shader failed to compile, falling back to "
3414 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3415 } else {
3416 simd16_instructions = &v2.instructions;
3417 }
3418 } else {
3419 perf_debug("Skipping 16-wide due to pull parameters.\n");
3420 }
3421 }
3422
3423 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3424 const unsigned *generated = g.generate_assembly(&v.instructions,
3425 simd16_instructions,
3426 final_assembly_size);
3427
3428 if (unlikely(brw->perf_debug) && shader) {
3429 if (shader->compiled_once)
3430 brw_wm_debug_recompile(brw, prog, &c->key);
3431 shader->compiled_once = true;
3432
3433 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3434 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3435 (get_time() - start_time) * 1000);
3436 }
3437 }
3438
3439 return generated;
3440 }
3441
3442 bool
3443 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3444 {
3445 struct brw_context *brw = brw_context(ctx);
3446 struct brw_wm_prog_key key;
3447
3448 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3449 return true;
3450
3451 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3452 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3453 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3454 bool program_uses_dfdy = fp->UsesDFdy;
3455
3456 memset(&key, 0, sizeof(key));
3457
3458 if (brw->gen < 6) {
3459 if (fp->UsesKill)
3460 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3461
3462 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3463 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3464
3465 /* Just assume depth testing. */
3466 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3467 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3468 }
3469
3470 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3471 BRW_FS_VARYING_INPUT_MASK) > 16)
3472 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3473
3474 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3475
3476 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3477 for (unsigned i = 0; i < sampler_count; i++) {
3478 if (fp->Base.ShadowSamplers & (1 << i)) {
3479 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3480 key.tex.swizzles[i] =
3481 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3482 } else {
3483 /* Color sampler: assume no swizzling. */
3484 key.tex.swizzles[i] = SWIZZLE_XYZW;
3485 }
3486 }
3487
3488 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3489 key.drawable_height = ctx->DrawBuffer->Height;
3490 }
3491
3492 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3493 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3494 }
3495
3496 key.nr_color_regions = 1;
3497
3498 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3499 * quality of the derivatives is likely to be determined by the driconf
3500 * option.
3501 */
3502 key.high_quality_derivatives = brw->disable_derivative_optimization;
3503
3504 key.program_string_id = bfp->id;
3505
3506 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3507 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3508
3509 bool success = do_wm_prog(brw, prog, bfp, &key);
3510
3511 brw->wm.base.prog_offset = old_prog_offset;
3512 brw->wm.prog_data = old_prog_data;
3513
3514 return success;
3515 }