i965/fs: Make lower_load_payload etc. appear in INTEL_DEBUG=optimizer.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/hash_table.h"
43 #include "brw_context.h"
44 #include "brw_eu.h"
45 #include "brw_wm.h"
46 }
47 #include "brw_fs.h"
48 #include "brw_cfg.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53 #include "program/sampler.h"
54
55 void
56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
57 fs_reg *src, int sources)
58 {
59 memset(this, 0, sizeof(*this));
60
61 this->opcode = opcode;
62 this->dst = dst;
63 this->src = src;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (int i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (int i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
127 break;
128 case BAD_FILE:
129 this->regs_written = 0;
130 break;
131 case IMM:
132 case UNIFORM:
133 unreachable("Invalid destination register file");
134 default:
135 unreachable("Invalid register file");
136 }
137
138 this->writes_accumulator = false;
139 }
140
141 fs_inst::fs_inst()
142 {
143 fs_reg *src = ralloc_array(this, fs_reg, 3);
144 init(BRW_OPCODE_NOP, 8, dst, src, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 fs_reg *src = ralloc_array(this, fs_reg, 3);
150 init(opcode, exec_size, reg_undef, src, 0);
151 }
152
153 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
154 {
155 fs_reg *src = ralloc_array(this, fs_reg, 3);
156 init(opcode, 0, dst, src, 0);
157 }
158
159 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
160 const fs_reg &src0)
161 {
162 fs_reg *src = ralloc_array(this, fs_reg, 3);
163 src[0] = src0;
164 init(opcode, exec_size, dst, src, 1);
165 }
166
167 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
168 {
169 fs_reg *src = ralloc_array(this, fs_reg, 3);
170 src[0] = src0;
171 init(opcode, 0, dst, src, 1);
172 }
173
174 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
175 const fs_reg &src0, const fs_reg &src1)
176 {
177 fs_reg *src = ralloc_array(this, fs_reg, 3);
178 src[0] = src0;
179 src[1] = src1;
180 init(opcode, exec_size, dst, src, 2);
181 }
182
183 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
184 const fs_reg &src1)
185 {
186 fs_reg *src = ralloc_array(this, fs_reg, 3);
187 src[0] = src0;
188 src[1] = src1;
189 init(opcode, 0, dst, src, 2);
190 }
191
192 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
193 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
194 {
195 fs_reg *src = ralloc_array(this, fs_reg, 3);
196 src[0] = src0;
197 src[1] = src1;
198 src[2] = src2;
199 init(opcode, exec_size, dst, src, 3);
200 }
201
202 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
203 const fs_reg &src1, const fs_reg &src2)
204 {
205 fs_reg *src = ralloc_array(this, fs_reg, 3);
206 src[0] = src0;
207 src[1] = src1;
208 src[2] = src2;
209 init(opcode, 0, dst, src, 3);
210 }
211
212 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
213 {
214 init(opcode, 0, dst, src, sources);
215 }
216
217 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
218 fs_reg src[], int sources)
219 {
220 init(opcode, exec_width, dst, src, sources);
221 }
222
223 fs_inst::fs_inst(const fs_inst &that)
224 {
225 memcpy(this, &that, sizeof(that));
226
227 this->src = ralloc_array(this, fs_reg, that.sources);
228
229 for (int i = 0; i < that.sources; i++)
230 this->src[i] = that.src[i];
231 }
232
233 void
234 fs_inst::resize_sources(uint8_t num_sources)
235 {
236 if (this->sources != num_sources) {
237 this->src = reralloc(this, this->src, fs_reg, num_sources);
238 this->sources = num_sources;
239 }
240 }
241
242 #define ALU1(op) \
243 fs_inst * \
244 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
245 { \
246 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
247 }
248
249 #define ALU2(op) \
250 fs_inst * \
251 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
252 const fs_reg &src1) \
253 { \
254 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
255 }
256
257 #define ALU2_ACC(op) \
258 fs_inst * \
259 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
260 const fs_reg &src1) \
261 { \
262 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
263 inst->writes_accumulator = true; \
264 return inst; \
265 }
266
267 #define ALU3(op) \
268 fs_inst * \
269 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
270 const fs_reg &src1, const fs_reg &src2) \
271 { \
272 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
273 }
274
275 ALU1(NOT)
276 ALU1(MOV)
277 ALU1(FRC)
278 ALU1(RNDD)
279 ALU1(RNDE)
280 ALU1(RNDZ)
281 ALU2(ADD)
282 ALU2(MUL)
283 ALU2_ACC(MACH)
284 ALU2(AND)
285 ALU2(OR)
286 ALU2(XOR)
287 ALU2(SHL)
288 ALU2(SHR)
289 ALU2(ASR)
290 ALU3(LRP)
291 ALU1(BFREV)
292 ALU3(BFE)
293 ALU2(BFI1)
294 ALU3(BFI2)
295 ALU1(FBH)
296 ALU1(FBL)
297 ALU1(CBIT)
298 ALU3(MAD)
299 ALU2_ACC(ADDC)
300 ALU2_ACC(SUBB)
301 ALU2(SEL)
302 ALU2(MAC)
303
304 /** Gen4 predicated IF. */
305 fs_inst *
306 fs_visitor::IF(enum brw_predicate predicate)
307 {
308 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
309 inst->predicate = predicate;
310 return inst;
311 }
312
313 /** Gen6 IF with embedded comparison. */
314 fs_inst *
315 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
316 enum brw_conditional_mod condition)
317 {
318 assert(brw->gen == 6);
319 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
320 reg_null_d, src0, src1);
321 inst->conditional_mod = condition;
322 return inst;
323 }
324
325 /**
326 * CMP: Sets the low bit of the destination channels with the result
327 * of the comparison, while the upper bits are undefined, and updates
328 * the flag register with the packed 16 bits of the result.
329 */
330 fs_inst *
331 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
332 enum brw_conditional_mod condition)
333 {
334 fs_inst *inst;
335
336 /* Take the instruction:
337 *
338 * CMP null<d> src0<f> src1<f>
339 *
340 * Original gen4 does type conversion to the destination type before
341 * comparison, producing garbage results for floating point comparisons.
342 * gen5 does the comparison on the execution type (resolved source types),
343 * so dst type doesn't matter. gen6 does comparison and then uses the
344 * result as if it was the dst type with no conversion, which happens to
345 * mostly work out for float-interpreted-as-int since our comparisons are
346 * for >0, =0, <0.
347 */
348 if (brw->gen == 4) {
349 dst.type = src0.type;
350 if (dst.file == HW_REG)
351 dst.fixed_hw_reg.type = dst.type;
352 }
353
354 resolve_ud_negate(&src0);
355 resolve_ud_negate(&src1);
356
357 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
358 inst->conditional_mod = condition;
359
360 return inst;
361 }
362
363 fs_inst *
364 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
365 {
366 uint8_t exec_size = dst.width;
367 for (int i = 0; i < sources; ++i) {
368 assert(src[i].width % dst.width == 0);
369 if (src[i].width > exec_size)
370 exec_size = src[i].width;
371 }
372
373 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
374 dst, src, sources);
375 inst->regs_written = 0;
376 for (int i = 0; i < sources; ++i) {
377 /* The LOAD_PAYLOAD instruction only really makes sense if we are
378 * dealing with whole registers. If this ever changes, we can deal
379 * with it later.
380 */
381 int size = src[i].effective_width * type_sz(src[i].type);
382 assert(size % 32 == 0);
383 inst->regs_written += (size + 31) / 32;
384 }
385
386 return inst;
387 }
388
389 exec_list
390 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
391 const fs_reg &surf_index,
392 const fs_reg &varying_offset,
393 uint32_t const_offset)
394 {
395 exec_list instructions;
396 fs_inst *inst;
397
398 /* We have our constant surface use a pitch of 4 bytes, so our index can
399 * be any component of a vector, and then we load 4 contiguous
400 * components starting from that.
401 *
402 * We break down the const_offset to a portion added to the variable
403 * offset and a portion done using reg_offset, which means that if you
404 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
405 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
406 * CSE can later notice that those loads are all the same and eliminate
407 * the redundant ones.
408 */
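   /* A worked example of the split described above (values chosen purely for
    * illustration): with const_offset == 6, the ADD below folds (6 & ~3) == 4
    * into vec4_offset, and the final MOV selects component (6 & 3) == 2 of
    * the loaded vec4 (multiplied by "scale" on Gen4 SIMD8).
    */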
409 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
410 instructions.push_tail(ADD(vec4_offset,
411 varying_offset, fs_reg(const_offset & ~3)));
412
413 int scale = 1;
414 if (brw->gen == 4 && dst.width == 8) {
415 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
416 * u, v, r) as parameters, or we can just use the SIMD16 message
417 * consisting of (header, u). We choose the second, at the cost of a
418 * longer return length.
419 */
420 scale = 2;
421 }
422
423 enum opcode op;
424 if (brw->gen >= 7)
425 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
426 else
427 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
428
429 assert(dst.width % 8 == 0);
430 int regs_written = 4 * (dst.width / 8) * scale;
431 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
432 dst.type, dst.width);
433 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
434 inst->regs_written = regs_written;
435 instructions.push_tail(inst);
436
437 if (brw->gen < 7) {
438 inst->base_mrf = 13;
439 inst->header_present = true;
440 if (brw->gen == 4)
441 inst->mlen = 3;
442 else
443 inst->mlen = 1 + dispatch_width / 8;
444 }
445
446 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
447 instructions.push_tail(MOV(dst, result));
448
449 return instructions;
450 }
451
452 /**
453 * A helper for MOV generation for fixing up broken hardware SEND dependency
454 * handling.
455 */
456 fs_inst *
457 fs_visitor::DEP_RESOLVE_MOV(int grf)
458 {
459 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
460
461 inst->ir = NULL;
462 inst->annotation = "send dependency resolve";
463
464      /* The caller always wants an uncompressed (SIMD8) MOV, to emit the minimal
465       * extra dependencies and to avoid having to deal with aligning its regs to 2.
466 */
467 inst->exec_size = 8;
468
469 return inst;
470 }
471
472 bool
473 fs_inst::equals(fs_inst *inst) const
474 {
475 return (opcode == inst->opcode &&
476 dst.equals(inst->dst) &&
477 src[0].equals(inst->src[0]) &&
478 src[1].equals(inst->src[1]) &&
479 src[2].equals(inst->src[2]) &&
480 saturate == inst->saturate &&
481 predicate == inst->predicate &&
482 conditional_mod == inst->conditional_mod &&
483 mlen == inst->mlen &&
484 base_mrf == inst->base_mrf &&
485 target == inst->target &&
486 eot == inst->eot &&
487 header_present == inst->header_present &&
488 shadow_compare == inst->shadow_compare &&
489 exec_size == inst->exec_size &&
490 offset == inst->offset);
491 }
492
493 bool
494 fs_inst::overwrites_reg(const fs_reg &reg) const
495 {
496 return (reg.file == dst.file &&
497 reg.reg == dst.reg &&
498 reg.reg_offset >= dst.reg_offset &&
499 reg.reg_offset < dst.reg_offset + regs_written);
500 }
501
502 bool
503 fs_inst::is_send_from_grf() const
504 {
505 switch (opcode) {
506 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
507 case SHADER_OPCODE_SHADER_TIME_ADD:
508 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
509 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
510 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
511 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
512 case SHADER_OPCODE_UNTYPED_ATOMIC:
513 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
514 case SHADER_OPCODE_URB_WRITE_SIMD8:
515 return true;
516 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
517 return src[1].file == GRF;
518 case FS_OPCODE_FB_WRITE:
519 return src[0].file == GRF;
520 default:
521 if (is_tex())
522 return src[0].file == GRF;
523
524 return false;
525 }
526 }
527
528 bool
529 fs_inst::can_do_source_mods(struct brw_context *brw)
530 {
531 if (brw->gen == 6 && is_math())
532 return false;
533
534 if (is_send_from_grf())
535 return false;
536
537 if (!backend_instruction::can_do_source_mods())
538 return false;
539
540 return true;
541 }
542
543 void
544 fs_reg::init()
545 {
546 memset(this, 0, sizeof(*this));
547 stride = 1;
548 }
549
550 /** Generic unset register constructor. */
551 fs_reg::fs_reg()
552 {
553 init();
554 this->file = BAD_FILE;
555 }
556
557 /** Immediate value constructor. */
558 fs_reg::fs_reg(float f)
559 {
560 init();
561 this->file = IMM;
562 this->type = BRW_REGISTER_TYPE_F;
563 this->fixed_hw_reg.dw1.f = f;
564 this->width = 1;
565 }
566
567 /** Immediate value constructor. */
568 fs_reg::fs_reg(int32_t i)
569 {
570 init();
571 this->file = IMM;
572 this->type = BRW_REGISTER_TYPE_D;
573 this->fixed_hw_reg.dw1.d = i;
574 this->width = 1;
575 }
576
577 /** Immediate value constructor. */
578 fs_reg::fs_reg(uint32_t u)
579 {
580 init();
581 this->file = IMM;
582 this->type = BRW_REGISTER_TYPE_UD;
583 this->fixed_hw_reg.dw1.ud = u;
584 this->width = 1;
585 }
586
587 /** Vector float immediate value constructor. */
588 fs_reg::fs_reg(uint8_t vf[4])
589 {
590 init();
591 this->file = IMM;
592 this->type = BRW_REGISTER_TYPE_VF;
593 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
594 }
595
596 /** Vector float immediate value constructor. */
597 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
598 {
599 init();
600 this->file = IMM;
601 this->type = BRW_REGISTER_TYPE_VF;
602 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
603 (vf1 << 8) |
604 (vf2 << 16) |
605 (vf3 << 24);
606 }
607
608 /** Fixed brw_reg. */
609 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
610 {
611 init();
612 this->file = HW_REG;
613 this->fixed_hw_reg = fixed_hw_reg;
614 this->type = fixed_hw_reg.type;
615 this->width = 1 << fixed_hw_reg.width;
616 }
617
618 bool
619 fs_reg::equals(const fs_reg &r) const
620 {
621 return (file == r.file &&
622 reg == r.reg &&
623 reg_offset == r.reg_offset &&
624 subreg_offset == r.subreg_offset &&
625 type == r.type &&
626 negate == r.negate &&
627 abs == r.abs &&
628 !reladdr && !r.reladdr &&
629 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
630 width == r.width &&
631 stride == r.stride);
632 }
633
634 fs_reg &
635 fs_reg::set_smear(unsigned subreg)
636 {
637 assert(file != HW_REG && file != IMM);
638 subreg_offset = subreg * type_sz(type);
639 stride = 0;
640 return *this;
641 }
642
643 bool
644 fs_reg::is_contiguous() const
645 {
646 return stride == 1;
647 }
648
649 int
650 fs_visitor::type_size(const struct glsl_type *type)
651 {
652 unsigned int size, i;
653
654 switch (type->base_type) {
655 case GLSL_TYPE_UINT:
656 case GLSL_TYPE_INT:
657 case GLSL_TYPE_FLOAT:
658 case GLSL_TYPE_BOOL:
659 return type->components();
660 case GLSL_TYPE_ARRAY:
661 return type_size(type->fields.array) * type->length;
662 case GLSL_TYPE_STRUCT:
663 size = 0;
664 for (i = 0; i < type->length; i++) {
665 size += type_size(type->fields.structure[i].type);
666 }
667 return size;
668 case GLSL_TYPE_SAMPLER:
669 /* Samplers take up no register space, since they're baked in at
670 * link time.
671 */
672 return 0;
673 case GLSL_TYPE_ATOMIC_UINT:
674 return 0;
675 case GLSL_TYPE_IMAGE:
676 case GLSL_TYPE_VOID:
677 case GLSL_TYPE_ERROR:
678 case GLSL_TYPE_INTERFACE:
679 unreachable("not reached");
680 }
681
682 return 0;
683 }
684
685 fs_reg
686 fs_visitor::get_timestamp()
687 {
688 assert(brw->gen >= 7);
689
690 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
691 BRW_ARF_TIMESTAMP,
692 0),
693 BRW_REGISTER_TYPE_UD));
694
695 fs_reg dst = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 4);
696
697 fs_inst *mov = emit(MOV(dst, ts));
698    /* We want to read the 3 fields we care about regardless of whether the
699     * channel is enabled in the dispatch.
700 */
701 mov->force_writemask_all = true;
702
703 /* The caller wants the low 32 bits of the timestamp. Since it's running
704     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
705 * which is plenty of time for our purposes. It is identical across the
706 * EUs, but since it's tracking GPU core speed it will increment at a
707 * varying rate as render P-states change.
708 *
709 * The caller could also check if render P-states have changed (or anything
710 * else that might disrupt timing) by setting smear to 2 and checking if
711 * that field is != 0.
712 */
713 dst.set_smear(0);
714
715 return dst;
716 }
717
718 void
719 fs_visitor::emit_shader_time_begin()
720 {
721 current_annotation = "shader time start";
722 shader_start_time = get_timestamp();
723 }
724
725 void
726 fs_visitor::emit_shader_time_end()
727 {
728 current_annotation = "shader time end";
729
730 enum shader_time_shader_type type, written_type, reset_type;
731 if (dispatch_width == 8) {
732 type = ST_FS8;
733 written_type = ST_FS8_WRITTEN;
734 reset_type = ST_FS8_RESET;
735 } else {
736 assert(dispatch_width == 16);
737 type = ST_FS16;
738 written_type = ST_FS16_WRITTEN;
739 reset_type = ST_FS16_RESET;
740 }
741
742 fs_reg shader_end_time = get_timestamp();
743
744 /* Check that there weren't any timestamp reset events (assuming these
745 * were the only two timestamp reads that happened).
746 */
747 fs_reg reset = shader_end_time;
748 reset.set_smear(2);
749 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
750 test->conditional_mod = BRW_CONDITIONAL_Z;
751 emit(IF(BRW_PREDICATE_NORMAL));
752
753 fs_reg start = shader_start_time;
754 start.negate = true;
755 fs_reg diff = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 1);
756 emit(ADD(diff, start, shader_end_time));
757
758 /* If there were no instructions between the two timestamp gets, the diff
759 * is 2 cycles. Remove that overhead, so I can forget about that when
760 * trying to determine the time taken for single instructions.
761 */
762 emit(ADD(diff, diff, fs_reg(-2u)));
763
764 emit_shader_time_write(type, diff);
765 emit_shader_time_write(written_type, fs_reg(1u));
766 emit(BRW_OPCODE_ELSE);
767 emit_shader_time_write(reset_type, fs_reg(1u));
768 emit(BRW_OPCODE_ENDIF);
769 }
770
771 void
772 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
773 fs_reg value)
774 {
775 int shader_time_index =
776 brw_get_shader_time_index(brw, shader_prog, prog, type);
777 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
778
779 fs_reg payload;
780 if (dispatch_width == 8)
781 payload = fs_reg(this, glsl_type::uvec2_type);
782 else
783 payload = fs_reg(this, glsl_type::uint_type);
784
785 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
786 fs_reg(), payload, offset, value));
787 }
788
789 void
790 fs_visitor::vfail(const char *format, va_list va)
791 {
792 char *msg;
793
794 if (failed)
795 return;
796
797 failed = true;
798
799 msg = ralloc_vasprintf(mem_ctx, format, va);
800 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
801
802 this->fail_msg = msg;
803
804 if (INTEL_DEBUG & DEBUG_WM) {
805 fprintf(stderr, "%s", msg);
806 }
807 }
808
809 void
810 fs_visitor::fail(const char *format, ...)
811 {
812 va_list va;
813
814 va_start(va, format);
815 vfail(format, va);
816 va_end(va);
817 }
818
819 /**
820 * Mark this program as impossible to compile in SIMD16 mode.
821 *
822 * During the SIMD8 compile (which happens first), we can detect and flag
823 * things that are unsupported in SIMD16 mode, so the compiler can skip
824 * the SIMD16 compile altogether.
825 *
826 * During a SIMD16 compile (if one happens anyway), this just calls fail().
827 */
828 void
829 fs_visitor::no16(const char *format, ...)
830 {
831 va_list va;
832
833 va_start(va, format);
834
835 if (dispatch_width == 16) {
836 vfail(format, va);
837 } else {
838 simd16_unsupported = true;
839
840 if (brw->perf_debug) {
841 if (no16_msg)
842 ralloc_vasprintf_append(&no16_msg, format, va);
843 else
844 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
845 }
846 }
847
848 va_end(va);
849 }
850
851 fs_inst *
852 fs_visitor::emit(enum opcode opcode)
853 {
854 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
855 }
856
857 fs_inst *
858 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
859 {
860 return emit(new(mem_ctx) fs_inst(opcode, dst));
861 }
862
863 fs_inst *
864 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
865 {
866 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
867 }
868
869 fs_inst *
870 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
871 const fs_reg &src1)
872 {
873 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
874 }
875
876 fs_inst *
877 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
878 const fs_reg &src1, const fs_reg &src2)
879 {
880 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
881 }
882
883 fs_inst *
884 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
885 fs_reg src[], int sources)
886 {
887 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
888 }
889
890 /**
891 * Returns true if the instruction has a flag that means it won't
892 * update an entire destination register.
893 *
894 * For example, dead code elimination and live variable analysis want to know
895 * when a write to a variable screens off any preceding values that were in
896 * it.
897 */
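/* For example, a predicated MOV (other than SEL), a destination narrower than
 * a full register (e.g. an 8-wide write of W-typed data, 16 bytes), or a
 * destination with stride != 1 all count as partial writes here.
 */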
898 bool
899 fs_inst::is_partial_write() const
900 {
901 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
902 (this->dst.width * type_sz(this->dst.type)) < 32 ||
903 !this->dst.is_contiguous());
904 }
905
906 int
907 fs_inst::regs_read(fs_visitor *v, int arg) const
908 {
909 if (is_tex() && arg == 0 && src[0].file == GRF) {
910 return mlen;
911 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
912 return mlen;
913 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
914 return mlen;
915 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
916 return mlen;
917 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
918 return mlen;
919 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
920 return mlen;
921 }
922
923 switch (src[arg].file) {
924 case BAD_FILE:
925 case UNIFORM:
926 case IMM:
927 return 1;
928 case GRF:
929 case HW_REG:
930 if (src[arg].stride == 0) {
931 return 1;
932 } else {
933 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
934 return (size + 31) / 32;
935 }
936 case MRF:
937 unreachable("MRF registers are not allowed as sources");
938 default:
939 unreachable("Invalid register file");
940 }
941 }
942
943 bool
944 fs_inst::reads_flag() const
945 {
946 return predicate;
947 }
948
949 bool
950 fs_inst::writes_flag() const
951 {
952 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
953 opcode != BRW_OPCODE_IF &&
954 opcode != BRW_OPCODE_WHILE)) ||
955 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
956 }
957
958 /**
959 * Returns how many MRFs an FS opcode will write over.
960 *
961 * Note that this is not the 0 or 1 implied writes in an actual gen
962 * instruction -- the FS opcodes often generate MOVs in addition.
963 */
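/* For example, SHADER_OPCODE_POW takes two operands, so in SIMD16 the switch
 * below reports 2 * 16 / 8 = 4 MRF registers written.
 */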
964 int
965 fs_visitor::implied_mrf_writes(fs_inst *inst)
966 {
967 if (inst->mlen == 0)
968 return 0;
969
970 if (inst->base_mrf == -1)
971 return 0;
972
973 switch (inst->opcode) {
974 case SHADER_OPCODE_RCP:
975 case SHADER_OPCODE_RSQ:
976 case SHADER_OPCODE_SQRT:
977 case SHADER_OPCODE_EXP2:
978 case SHADER_OPCODE_LOG2:
979 case SHADER_OPCODE_SIN:
980 case SHADER_OPCODE_COS:
981 return 1 * dispatch_width / 8;
982 case SHADER_OPCODE_POW:
983 case SHADER_OPCODE_INT_QUOTIENT:
984 case SHADER_OPCODE_INT_REMAINDER:
985 return 2 * dispatch_width / 8;
986 case SHADER_OPCODE_TEX:
987 case FS_OPCODE_TXB:
988 case SHADER_OPCODE_TXD:
989 case SHADER_OPCODE_TXF:
990 case SHADER_OPCODE_TXF_CMS:
991 case SHADER_OPCODE_TXF_MCS:
992 case SHADER_OPCODE_TG4:
993 case SHADER_OPCODE_TG4_OFFSET:
994 case SHADER_OPCODE_TXL:
995 case SHADER_OPCODE_TXS:
996 case SHADER_OPCODE_LOD:
997 return 1;
998 case FS_OPCODE_FB_WRITE:
999 return 2;
1000 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1001 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1002 return 1;
1003 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1004 return inst->mlen;
1005 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1006 return 2;
1007 case SHADER_OPCODE_UNTYPED_ATOMIC:
1008 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1009 case SHADER_OPCODE_URB_WRITE_SIMD8:
1010 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1011 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1012 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1013 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1014 return 0;
1015 default:
1016 unreachable("not reached");
1017 }
1018 }
1019
1020 int
1021 fs_visitor::virtual_grf_alloc(int size)
1022 {
1023 if (virtual_grf_array_size <= virtual_grf_count) {
1024 if (virtual_grf_array_size == 0)
1025 virtual_grf_array_size = 16;
1026 else
1027 virtual_grf_array_size *= 2;
1028 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
1029 virtual_grf_array_size);
1030 }
1031 virtual_grf_sizes[virtual_grf_count] = size;
1032 return virtual_grf_count++;
1033 }
1034
1035 /** Fixed HW reg constructor. */
1036 fs_reg::fs_reg(enum register_file file, int reg)
1037 {
1038 init();
1039 this->file = file;
1040 this->reg = reg;
1041 this->type = BRW_REGISTER_TYPE_F;
1042
1043 switch (file) {
1044 case UNIFORM:
1045 this->width = 1;
1046 break;
1047 default:
1048 this->width = 8;
1049 }
1050 }
1051
1052 /** Fixed HW reg constructor. */
1053 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1054 {
1055 init();
1056 this->file = file;
1057 this->reg = reg;
1058 this->type = type;
1059
1060 switch (file) {
1061 case UNIFORM:
1062 this->width = 1;
1063 break;
1064 default:
1065 this->width = 8;
1066 }
1067 }
1068
1069 /** Fixed HW reg constructor. */
1070 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1071 uint8_t width)
1072 {
1073 init();
1074 this->file = file;
1075 this->reg = reg;
1076 this->type = type;
1077 this->width = width;
1078 }
1079
1080 /** Automatic reg constructor. */
1081 fs_reg::fs_reg(fs_visitor *v, const struct glsl_type *type)
1082 {
1083 init();
1084 int reg_width = v->dispatch_width / 8;
1085
1086 this->file = GRF;
1087 this->reg = v->virtual_grf_alloc(v->type_size(type) * reg_width);
1088 this->reg_offset = 0;
1089 this->type = brw_type_for_base_type(type);
1090 this->width = v->dispatch_width;
1091 assert(this->width == 8 || this->width == 16);
1092 }
1093
1094 fs_reg *
1095 fs_visitor::variable_storage(ir_variable *var)
1096 {
1097 return (fs_reg *)hash_table_find(this->variable_ht, var);
1098 }
1099
1100 void
1101 import_uniforms_callback(const void *key,
1102 void *data,
1103 void *closure)
1104 {
1105 struct hash_table *dst_ht = (struct hash_table *)closure;
1106 const fs_reg *reg = (const fs_reg *)data;
1107
1108 if (reg->file != UNIFORM)
1109 return;
1110
1111 hash_table_insert(dst_ht, data, key);
1112 }
1113
1114 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1115  * This brings in those uniform definitions.
1116 */
1117 void
1118 fs_visitor::import_uniforms(fs_visitor *v)
1119 {
1120 hash_table_call_foreach(v->variable_ht,
1121 import_uniforms_callback,
1122 variable_ht);
1123 this->push_constant_loc = v->push_constant_loc;
1124 this->pull_constant_loc = v->pull_constant_loc;
1125 this->uniforms = v->uniforms;
1126 this->param_size = v->param_size;
1127 }
1128
1129 /* Our support for uniforms is piggy-backed on the struct
1130 * gl_fragment_program, because that's where the values actually
1131 * get stored, rather than in some global gl_shader_program uniform
1132 * store.
1133 */
1134 void
1135 fs_visitor::setup_uniform_values(ir_variable *ir)
1136 {
1137 int namelen = strlen(ir->name);
1138
1139 /* The data for our (non-builtin) uniforms is stored in a series of
1140 * gl_uniform_driver_storage structs for each subcomponent that
1141 * glGetUniformLocation() could name. We know it's been set up in the same
1142 * order we'd walk the type, so walk the list of storage and find anything
1143 * with our name, or the prefix of a component that starts with our name.
1144 */
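   /* As a hypothetical illustration of the matching below: for an ir->name of
    * "lights", storage names "lights", "lights[1]" or "lights.position" all
    * match, while "lightscale" is rejected, because the character after the
    * common prefix must be NUL, '.' or '['.
    */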
1145 unsigned params_before = uniforms;
1146 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1147 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1148
1149 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1150 (storage->name[namelen] != 0 &&
1151 storage->name[namelen] != '.' &&
1152 storage->name[namelen] != '[')) {
1153 continue;
1154 }
1155
1156 unsigned slots = storage->type->component_slots();
1157 if (storage->array_elements)
1158 slots *= storage->array_elements;
1159
1160 for (unsigned i = 0; i < slots; i++) {
1161 stage_prog_data->param[uniforms++] = &storage->storage[i];
1162 }
1163 }
1164
1165 /* Make sure we actually initialized the right amount of stuff here. */
1166 assert(params_before + ir->type->component_slots() == uniforms);
1167 (void)params_before;
1168 }
1169
1170
1171 /* Our support for builtin uniforms is even scarier than non-builtin.
1172 * It sits on top of the PROG_STATE_VAR parameters that are
1173 * automatically updated from GL context state.
1174 */
1175 void
1176 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1177 {
1178 const ir_state_slot *const slots = ir->get_state_slots();
1179 assert(slots != NULL);
1180
1181 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1182 /* This state reference has already been setup by ir_to_mesa, but we'll
1183 * get the same index back here.
1184 */
1185 int index = _mesa_add_state_reference(this->prog->Parameters,
1186 (gl_state_index *)slots[i].tokens);
1187
1188 /* Add each of the unique swizzles of the element as a parameter.
1189 * This'll end up matching the expected layout of the
1190 * array/matrix/structure we're trying to fill in.
1191 */
1192 int last_swiz = -1;
1193 for (unsigned int j = 0; j < 4; j++) {
1194 int swiz = GET_SWZ(slots[i].swizzle, j);
1195 if (swiz == last_swiz)
1196 break;
1197 last_swiz = swiz;
1198
1199 stage_prog_data->param[uniforms++] =
1200 &prog->Parameters->ParameterValues[index][swiz];
1201 }
1202 }
1203 }
1204
1205 fs_reg *
1206 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1207 bool origin_upper_left)
1208 {
1209 assert(stage == MESA_SHADER_FRAGMENT);
1210 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1211 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::vec4_type);
1212 fs_reg wpos = *reg;
1213 bool flip = !origin_upper_left ^ key->render_to_fbo;
1214
1215 /* gl_FragCoord.x */
1216 if (pixel_center_integer) {
1217 emit(MOV(wpos, this->pixel_x));
1218 } else {
1219 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1220 }
1221 wpos = offset(wpos, 1);
1222
1223 /* gl_FragCoord.y */
1224 if (!flip && pixel_center_integer) {
1225 emit(MOV(wpos, this->pixel_y));
1226 } else {
1227 fs_reg pixel_y = this->pixel_y;
1228 float offset = (pixel_center_integer ? 0.0 : 0.5);
1229
1230 if (flip) {
1231 pixel_y.negate = true;
1232 offset += key->drawable_height - 1.0;
1233 }
1234
1235 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1236 }
1237 wpos = offset(wpos, 1);
1238
1239 /* gl_FragCoord.z */
1240 if (brw->gen >= 6) {
1241 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1242 } else {
1243 emit(FS_OPCODE_LINTERP, wpos,
1244 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1245 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1246 interp_reg(VARYING_SLOT_POS, 2));
1247 }
1248 wpos = offset(wpos, 1);
1249
1250 /* gl_FragCoord.w: Already set up in emit_interpolation */
1251 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1252
1253 return reg;
1254 }
1255
1256 fs_inst *
1257 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1258 glsl_interp_qualifier interpolation_mode,
1259 bool is_centroid, bool is_sample)
1260 {
1261 brw_wm_barycentric_interp_mode barycoord_mode;
1262 if (brw->gen >= 6) {
1263 if (is_centroid) {
1264 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1265 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1266 else
1267 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1268 } else if (is_sample) {
1269 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1270 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1271 else
1272 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1273 } else {
1274 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1275 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1276 else
1277 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1278 }
1279 } else {
1280 /* On Ironlake and below, there is only one interpolation mode.
1281 * Centroid interpolation doesn't mean anything on this hardware --
1282 * there is no multisampling.
1283 */
1284 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1285 }
1286 return emit(FS_OPCODE_LINTERP, attr,
1287 this->delta_x[barycoord_mode],
1288 this->delta_y[barycoord_mode], interp);
1289 }
1290
1291 void
1292 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1293 const glsl_type *type,
1294 glsl_interp_qualifier interpolation_mode,
1295 int location, bool mod_centroid,
1296 bool mod_sample)
1297 {
1298 attr.type = brw_type_for_base_type(type->get_scalar_type());
1299
1300 assert(stage == MESA_SHADER_FRAGMENT);
1301 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1302 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1303
1304 unsigned int array_elements;
1305
1306 if (type->is_array()) {
1307 array_elements = type->length;
1308 if (array_elements == 0) {
1309 fail("dereferenced array '%s' has length 0\n", name);
1310 }
1311 type = type->fields.array;
1312 } else {
1313 array_elements = 1;
1314 }
1315
1316 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1317 bool is_gl_Color =
1318 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1319 if (key->flat_shade && is_gl_Color) {
1320 interpolation_mode = INTERP_QUALIFIER_FLAT;
1321 } else {
1322 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1323 }
1324 }
1325
1326 for (unsigned int i = 0; i < array_elements; i++) {
1327 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1328 if (prog_data->urb_setup[location] == -1) {
1329 /* If there's no incoming setup data for this slot, don't
1330 * emit interpolation for it.
1331 */
1332 attr = offset(attr, type->vector_elements);
1333 location++;
1334 continue;
1335 }
1336
1337 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1338 /* Constant interpolation (flat shading) case. The SF has
1339 * handed us defined values in only the constant offset
1340 * field of the setup reg.
1341 */
1342 for (unsigned int k = 0; k < type->vector_elements; k++) {
1343 struct brw_reg interp = interp_reg(location, k);
1344 interp = suboffset(interp, 3);
1345 interp.type = attr.type;
1346 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1347 attr = offset(attr, 1);
1348 }
1349 } else {
1350 /* Smooth/noperspective interpolation case. */
1351 for (unsigned int k = 0; k < type->vector_elements; k++) {
1352 struct brw_reg interp = interp_reg(location, k);
1353 if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1354 /* Get the pixel/sample mask into f0 so that we know
1355 * which pixels are lit. Then, for each channel that is
1356 * unlit, replace the centroid data with non-centroid
1357 * data.
1358 */
1359 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1360
1361 fs_inst *inst;
1362 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1363 false, false);
1364 inst->predicate = BRW_PREDICATE_NORMAL;
1365 inst->predicate_inverse = true;
1366 if (brw->has_pln)
1367 inst->no_dd_clear = true;
1368
1369 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1370 mod_centroid && !key->persample_shading,
1371 mod_sample || key->persample_shading);
1372 inst->predicate = BRW_PREDICATE_NORMAL;
1373 inst->predicate_inverse = false;
1374 if (brw->has_pln)
1375 inst->no_dd_check = true;
1376
1377 } else {
1378 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1379 mod_centroid && !key->persample_shading,
1380 mod_sample || key->persample_shading);
1381 }
1382 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1383 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1384 }
1385 attr = offset(attr, 1);
1386 }
1387
1388 }
1389 location++;
1390 }
1391 }
1392 }
1393
1394 fs_reg *
1395 fs_visitor::emit_frontfacing_interpolation()
1396 {
1397 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::bool_type);
1398
1399 if (brw->gen >= 6) {
1400 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1401 * a boolean result from this (~0/true or 0/false).
1402 *
1403 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1404 * this task in only one instruction:
1405 * - a negation source modifier will flip the bit; and
1406 * - a W -> D type conversion will sign extend the bit into the high
1407 * word of the destination.
1408 *
1409 * An ASR 15 fills the low word of the destination.
1410 */
1411 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1412 g0.negate = true;
1413
1414 emit(ASR(*reg, g0, fs_reg(15)));
1415 } else {
1416 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1417 * a boolean result from this (1/true or 0/false).
1418 *
1419 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1420 * the negation source modifier to flip it. Unfortunately the SHR
1421 * instruction only operates on UD (or D with an abs source modifier)
1422 * sources without negation.
1423 *
1424 * Instead, use ASR (which will give ~0/true or 0/false).
1425 */
1426 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1427 g1_6.negate = true;
1428
1429 emit(ASR(*reg, g1_6, fs_reg(31)));
1430 }
1431
1432 return reg;
1433 }
1434
1435 void
1436 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1437 {
1438 assert(stage == MESA_SHADER_FRAGMENT);
1439 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1440 assert(dst.type == BRW_REGISTER_TYPE_F);
1441
1442 if (key->compute_pos_offset) {
1443 /* Convert int_sample_pos to floating point */
1444 emit(MOV(dst, int_sample_pos));
1445 /* Scale to the range [0, 1] */
1446 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1447 }
1448 else {
1449 /* From ARB_sample_shading specification:
1450 * "When rendering to a non-multisample buffer, or if multisample
1451 * rasterization is disabled, gl_SamplePosition will always be
1452 * (0.5, 0.5).
1453 */
1454 emit(MOV(dst, fs_reg(0.5f)));
1455 }
1456 }
1457
1458 fs_reg *
1459 fs_visitor::emit_samplepos_setup()
1460 {
1461 assert(brw->gen >= 6);
1462
1463 this->current_annotation = "compute sample position";
1464 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::vec2_type);
1465 fs_reg pos = *reg;
1466 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1467 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1468
1469 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1470 * mode will be enabled.
1471 *
1472 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1473 * R31.1:0 Position Offset X/Y for Slot[3:0]
1474 * R31.3:2 Position Offset X/Y for Slot[7:4]
1475 * .....
1476 *
1477 * The X, Y sample positions come in as bytes in thread payload. So, read
1478 * the positions using vstride=16, width=8, hstride=2.
1479 */
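   /* Reading of the layout above (an interpretation, not a PRM quote): the X
    * offsets for channels 0..7 sit at even byte offsets 0, 2, ..., 14, which
    * the <16;8,2>:B region below walks directly; the Y offsets sit at the odd
    * bytes (hence the suboffset of 1), and the SIMD16 second half starts 16
    * bytes further on (suboffsets 16 and 17).
    */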
1480 struct brw_reg sample_pos_reg =
1481 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1482 BRW_REGISTER_TYPE_B), 16, 8, 2);
1483
1484 if (dispatch_width == 8) {
1485 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1486 } else {
1487 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1488 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1489 ->force_sechalf = true;
1490 }
1491 /* Compute gl_SamplePosition.x */
1492 compute_sample_position(pos, int_sample_x);
1493 pos = offset(pos, 1);
1494 if (dispatch_width == 8) {
1495 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1496 } else {
1497 emit(MOV(half(int_sample_y, 0),
1498 fs_reg(suboffset(sample_pos_reg, 1))));
1499 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1500 ->force_sechalf = true;
1501 }
1502 /* Compute gl_SamplePosition.y */
1503 compute_sample_position(pos, int_sample_y);
1504 return reg;
1505 }
1506
1507 fs_reg *
1508 fs_visitor::emit_sampleid_setup()
1509 {
1510 assert(stage == MESA_SHADER_FRAGMENT);
1511 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1512 assert(brw->gen >= 6);
1513
1514 this->current_annotation = "compute sample id";
1515 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::int_type);
1516
1517 if (key->compute_sample_id) {
1518 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1519 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1520 t2.type = BRW_REGISTER_TYPE_UW;
1521
1522 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1523 * 8x multisampling, subspan 0 will represent sample N (where N
1524 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1525 * 7. We can find the value of N by looking at R0.0 bits 7:6
1526 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1527 * (since samples are always delivered in pairs). That is, we
1528 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1529 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1530 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1531 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1532 * populating a temporary variable with the sequence (0, 1, 2, 3),
1533 * and then reading from it using vstride=1, width=4, hstride=0.
1534 * These computations hold good for 4x multisampling as well.
1535 *
1536 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1537 * the first four slots are sample 0 of subspan 0; the next four
1538 * are sample 1 of subspan 0; the third group is sample 0 of
1539 * subspan 1, and finally sample 1 of subspan 1.
1540 */
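      /* Worked example (illustrative): if R0.0 bits 7:6 read 2 (SSPI == 2),
       * then (R0.0 & 0xc0) >> 5 == 4, and adding the SIMD8 sequence
       * (0, 0, 0, 0, 1, 1, 1, 1) yields sample IDs (4, 4, 4, 4, 5, 5, 5, 5)
       * for the two subspans.
       */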
1541 fs_inst *inst;
1542 inst = emit(BRW_OPCODE_AND, t1,
1543 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1544 fs_reg(0xc0));
1545 inst->force_writemask_all = true;
1546 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1547 inst->force_writemask_all = true;
1548 /* This works for both SIMD8 and SIMD16 */
1549 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1550 inst->force_writemask_all = true;
1551 /* This special instruction takes care of setting vstride=1,
1552 * width=4, hstride=0 of t2 during an ADD instruction.
1553 */
1554 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1555 } else {
1556 /* As per GL_ARB_sample_shading specification:
1557 * "When rendering to a non-multisample buffer, or if multisample
1558 * rasterization is disabled, gl_SampleID will always be zero."
1559 */
1560 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1561 }
1562
1563 return reg;
1564 }
1565
1566 fs_reg
1567 fs_visitor::fix_math_operand(fs_reg src)
1568 {
1569 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1570 * might be able to do better by doing execsize = 1 math and then
1571 * expanding that result out, but we would need to be careful with
1572 * masking.
1573 *
1574 * The hardware ignores source modifiers (negate and abs) on math
1575 * instructions, so we also move to a temp to set those up.
1576 */
1577 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1578 !src.abs && !src.negate)
1579 return src;
1580
1581 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1582 * operands to math
1583 */
1584 if (brw->gen >= 7 && src.file != IMM)
1585 return src;
1586
1587 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1588 expanded.type = src.type;
1589 emit(BRW_OPCODE_MOV, expanded, src);
1590 return expanded;
1591 }
1592
1593 fs_inst *
1594 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1595 {
1596 switch (opcode) {
1597 case SHADER_OPCODE_RCP:
1598 case SHADER_OPCODE_RSQ:
1599 case SHADER_OPCODE_SQRT:
1600 case SHADER_OPCODE_EXP2:
1601 case SHADER_OPCODE_LOG2:
1602 case SHADER_OPCODE_SIN:
1603 case SHADER_OPCODE_COS:
1604 break;
1605 default:
1606 unreachable("not reached: bad math opcode");
1607 }
1608
1609 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1610 * might be able to do better by doing execsize = 1 math and then
1611 * expanding that result out, but we would need to be careful with
1612 * masking.
1613 *
1614 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1615 * instructions, so we also move to a temp to set those up.
1616 */
1617 if (brw->gen == 6 || brw->gen == 7)
1618 src = fix_math_operand(src);
1619
1620 fs_inst *inst = emit(opcode, dst, src);
1621
1622 if (brw->gen < 6) {
1623 inst->base_mrf = 2;
1624 inst->mlen = dispatch_width / 8;
1625 }
1626
1627 return inst;
1628 }
1629
1630 fs_inst *
1631 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1632 {
1633 int base_mrf = 2;
1634 fs_inst *inst;
1635
1636 if (brw->gen >= 8) {
1637 inst = emit(opcode, dst, src0, src1);
1638 } else if (brw->gen >= 6) {
1639 src0 = fix_math_operand(src0);
1640 src1 = fix_math_operand(src1);
1641
1642 inst = emit(opcode, dst, src0, src1);
1643 } else {
1644 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1645 * "Message Payload":
1646 *
1647 * "Operand0[7]. For the INT DIV functions, this operand is the
1648 * denominator."
1649 * ...
1650 * "Operand1[7]. For the INT DIV functions, this operand is the
1651 * numerator."
1652 */
1653 bool is_int_div = opcode != SHADER_OPCODE_POW;
1654 fs_reg &op0 = is_int_div ? src1 : src0;
1655 fs_reg &op1 = is_int_div ? src0 : src1;
1656
1657 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1658 inst = emit(opcode, dst, op0, reg_null_f);
1659
1660 inst->base_mrf = base_mrf;
1661 inst->mlen = 2 * dispatch_width / 8;
1662 }
1663 return inst;
1664 }
1665
1666 void
1667 fs_visitor::assign_curb_setup()
1668 {
1669 if (dispatch_width == 8) {
1670 prog_data->dispatch_grf_start_reg = payload.num_regs;
1671 } else {
1672 assert(stage == MESA_SHADER_FRAGMENT);
1673 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1674 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1675 }
1676
1677 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1678
1679 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1680 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1681 for (unsigned int i = 0; i < inst->sources; i++) {
1682 if (inst->src[i].file == UNIFORM) {
1683 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1684 int constant_nr;
1685 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1686 constant_nr = push_constant_loc[uniform_nr];
1687 } else {
1688 /* Section 5.11 of the OpenGL 4.1 spec says:
1689 * "Out-of-bounds reads return undefined values, which include
1690 * values from other variables of the active program or zero."
1691 * Just return the first push constant.
1692 */
1693 constant_nr = 0;
1694 }
1695
1696 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1697 constant_nr / 8,
1698 constant_nr % 8);
1699
1700 inst->src[i].file = HW_REG;
1701 inst->src[i].fixed_hw_reg = byte_offset(
1702 retype(brw_reg, inst->src[i].type),
1703 inst->src[i].subreg_offset);
1704 }
1705 }
1706 }
1707 }
1708
1709 void
1710 fs_visitor::calculate_urb_setup()
1711 {
1712 assert(stage == MESA_SHADER_FRAGMENT);
1713 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1714 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1715
1716 memset(prog_data->urb_setup, -1,
1717 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1718
1719 int urb_next = 0;
1720 /* Figure out where each of the incoming setup attributes lands. */
1721 if (brw->gen >= 6) {
1722 if (_mesa_bitcount_64(prog->InputsRead &
1723 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1724 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1725 * first 16 varying inputs, so we can put them wherever we want.
1726 * Just put them in order.
1727 *
1728 * This is useful because it means that (a) inputs not used by the
1729 * fragment shader won't take up valuable register space, and (b) we
1730 * won't have to recompile the fragment shader if it gets paired with
1731 * a different vertex (or geometry) shader.
1732 */
1733 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1734 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1735 BITFIELD64_BIT(i)) {
1736 prog_data->urb_setup[i] = urb_next++;
1737 }
1738 }
1739 } else {
1740 /* We have enough input varyings that the SF/SBE pipeline stage can't
1741 * arbitrarily rearrange them to suit our whim; we have to put them
1742 * in an order that matches the output of the previous pipeline stage
1743 * (geometry or vertex shader).
1744 */
1745 struct brw_vue_map prev_stage_vue_map;
1746 brw_compute_vue_map(brw, &prev_stage_vue_map,
1747 key->input_slots_valid);
1748 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1749 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1750 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1751 slot++) {
1752 int varying = prev_stage_vue_map.slot_to_varying[slot];
1753 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1754 * unused.
1755 */
1756 if (varying != BRW_VARYING_SLOT_COUNT &&
1757 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1758 BITFIELD64_BIT(varying))) {
1759 prog_data->urb_setup[varying] = slot - first_slot;
1760 }
1761 }
1762 urb_next = prev_stage_vue_map.num_slots - first_slot;
1763 }
1764 } else {
1765 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1766 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1767 /* Point size is packed into the header, not as a general attribute */
1768 if (i == VARYING_SLOT_PSIZ)
1769 continue;
1770
1771 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1772 /* The back color slot is skipped when the front color is
1773 * also written to. In addition, some slots can be
1774 * written in the vertex shader and not read in the
1775 * fragment shader. So the register number must always be
1776 * incremented, mapped or not.
1777 */
1778 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1779 prog_data->urb_setup[i] = urb_next;
1780 urb_next++;
1781 }
1782 }
1783
1784 /*
1785     * This is an FS-only attribute, and we did the interpolation for it
1786     * in the SF thread. So, count it here, too.
1787 *
1788 * See compile_sf_prog() for more info.
1789 */
1790 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1791 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1792 }
1793
1794 prog_data->num_varying_inputs = urb_next;
1795 }
1796
1797 void
1798 fs_visitor::assign_urb_setup()
1799 {
1800 assert(stage == MESA_SHADER_FRAGMENT);
1801 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1802
1803 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1804
1805 /* Offset all the urb_setup[] index by the actual position of the
1806 * setup regs, now that the location of the constants has been chosen.
1807 */
1808 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1809 if (inst->opcode == FS_OPCODE_LINTERP) {
1810 assert(inst->src[2].file == HW_REG);
1811 inst->src[2].fixed_hw_reg.nr += urb_start;
1812 }
1813
1814 if (inst->opcode == FS_OPCODE_CINTERP) {
1815 assert(inst->src[0].file == HW_REG);
1816 inst->src[0].fixed_hw_reg.nr += urb_start;
1817 }
1818 }
1819
1820 /* Each attribute is 4 setup channels, each of which is half a reg. */
1821 this->first_non_payload_grf =
1822 urb_start + prog_data->num_varying_inputs * 2;
1823 }
1824
1825 void
1826 fs_visitor::assign_vs_urb_setup()
1827 {
1828 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1829 int grf, count, slot, channel, attr;
1830
1831 assert(stage == MESA_SHADER_VERTEX);
1832 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1833 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1834 count++;
1835
1836 /* Each attribute is 4 regs. */
1837 this->first_non_payload_grf =
1838 payload.num_regs + prog_data->curb_read_length + count * 4;
1839
1840 unsigned vue_entries =
1841 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1842
1843 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1844 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1845
1846 assert(vs_prog_data->base.urb_read_length <= 15);
1847
1848 /* Rewrite all ATTR file references to the hw grf that they land in. */
1849 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1850 for (int i = 0; i < inst->sources; i++) {
1851 if (inst->src[i].file == ATTR) {
1852
1853 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1854 slot = count - 1;
1855 } else {
1856              * Attributes come in a contiguous block, ordered by their
1857 * gl_vert_attrib value. That means we can compute the slot
1858 * number for an attribute by masking out the enabled
1859 * attributes before it and counting the bits.
1860 */
1861 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1862 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1863 BITFIELD64_MASK(attr));
1864 }
1865
1866 channel = inst->src[i].reg_offset & 3;
1867
1868 grf = payload.num_regs +
1869 prog_data->curb_read_length +
1870 slot * 4 + channel;
1871
1872 inst->src[i].file = HW_REG;
1873 inst->src[i].fixed_hw_reg =
1874 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1875 }
1876 }
1877 }
1878 }
1879
1880 /**
1881 * Split large virtual GRFs into separate components if we can.
1882 *
1883 * This is mostly duplicated with what brw_fs_vector_splitting does,
1884 * but that's really conservative because it's afraid of doing
1885 * splitting that doesn't result in real progress after the rest of
1886 * the optimization phases, which would cause infinite looping in
1887 * optimization. We can do it once here, safely. This also has the
1888 * opportunity to split interpolated values, or maybe even uniforms,
1889 * which we don't have at the IR level.
1890 *
1891 * We want to split, because virtual GRFs are what we register
1892 * allocate and spill (due to contiguousness requirements for some
1893 * instructions), and they're what we naturally generate in the
1894 * codegen process, but most virtual GRFs don't actually need to be
1895 * contiguous sets of GRFs. If we split, we'll end up with reduced
1896 * live intervals and better dead code elimination and coalescing.
1897 */
1898 void
1899 fs_visitor::split_virtual_grfs()
1900 {
1901 int num_vars = this->virtual_grf_count;
1902
1903 /* Count the total number of registers */
1904 int reg_count = 0;
1905 int vgrf_to_reg[num_vars];
1906 for (int i = 0; i < num_vars; i++) {
1907 vgrf_to_reg[i] = reg_count;
1908 reg_count += virtual_grf_sizes[i];
1909 }
1910
1911 /* An array of "split points". For each register slot, this indicates
1912 * if this slot can be separated from the previous slot. Every time an
1913 * instruction uses multiple elements of a register (as a source or
1914 * destination), we mark the used slots as inseparable. Then we go
1915 * through and split the registers into the smallest pieces we can.
1916 */
1917 bool split_points[reg_count];
1918 memset(split_points, 0, sizeof(split_points));
1919
1920 /* Mark all used registers as fully splittable */
1921 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1922 if (inst->dst.file == GRF) {
1923 int reg = vgrf_to_reg[inst->dst.reg];
1924 for (int j = 1; j < this->virtual_grf_sizes[inst->dst.reg]; j++)
1925 split_points[reg + j] = true;
1926 }
1927
1928 for (int i = 0; i < inst->sources; i++) {
1929 if (inst->src[i].file == GRF) {
1930 int reg = vgrf_to_reg[inst->src[i].reg];
1931 for (int j = 1; j < this->virtual_grf_sizes[inst->src[i].reg]; j++)
1932 split_points[reg + j] = true;
1933 }
1934 }
1935 }
1936
1937 if (brw->has_pln &&
1938 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1939 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1940 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1941 * Gen6, that was the only supported interpolation mode, and since Gen6,
1942 * delta_x and delta_y are in fixed hardware registers.
1943 */
1944 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1945 split_points[vgrf_to_reg[vgrf] + 1] = false;
1946 }
1947
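/* Instructions that read or write multiple registers in one go need those
 * registers to stay together, so clear the split points inside each such
 * access.
 */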
1948 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1949 if (inst->dst.file == GRF) {
1950 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1951 for (int j = 1; j < inst->regs_written; j++)
1952 split_points[reg + j] = false;
1953 }
1954 for (int i = 0; i < inst->sources; i++) {
1955 if (inst->src[i].file == GRF) {
1956 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1957 for (int j = 1; j < inst->regs_read(this, i); j++)
1958 split_points[reg + j] = false;
1959 }
1960 }
1961 }
1962
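/* Build a map from each old register slot to its new VGRF number and
 * offset, allocating a fresh VGRF every time a split point is crossed.
 */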
1963 int new_virtual_grf[reg_count];
1964 int new_reg_offset[reg_count];
1965
1966 int reg = 0;
1967 for (int i = 0; i < num_vars; i++) {
1968 /* The first slot of a VGRF is never a split point; assert that as a quick sanity check. */
1969 assert(split_points[reg] == false);
1970
1971 /* j = 0 case */
1972 new_reg_offset[reg] = 0;
1973 reg++;
1974 int offset = 1;
1975
1976 /* j > 0 case */
1977 for (int j = 1; j < virtual_grf_sizes[i]; j++) {
1978 /* If this is a split point, allocate a new virtual GRF covering the
1979 * previous `offset` registers, and reset the offset to 0.
1980 */
1981 if (split_points[reg]) {
1982 assert(offset <= MAX_VGRF_SIZE);
1983 int grf = virtual_grf_alloc(offset);
1984 for (int k = reg - offset; k < reg; k++)
1985 new_virtual_grf[k] = grf;
1986 offset = 0;
1987 }
1988 new_reg_offset[reg] = offset;
1989 offset++;
1990 reg++;
1991 }
1992
1993 /* The last one gets the original register number */
1994 assert(offset <= MAX_VGRF_SIZE);
1995 virtual_grf_sizes[i] = offset;
1996 for (int k = reg - offset; k < reg; k++)
1997 new_virtual_grf[k] = i;
1998 }
1999 assert(reg == reg_count);
2000
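/* Rewrite every GRF destination and source to its new VGRF and offset. */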
2001 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2002 if (inst->dst.file == GRF) {
2003 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2004 inst->dst.reg = new_virtual_grf[reg];
2005 inst->dst.reg_offset = new_reg_offset[reg];
2006 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
2007 }
2008 for (int i = 0; i < inst->sources; i++) {
2009 if (inst->src[i].file == GRF) {
2010 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2011 inst->src[i].reg = new_virtual_grf[reg];
2012 inst->src[i].reg_offset = new_reg_offset[reg];
2013 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
2014 }
2015 }
2016 }
2017 invalidate_live_intervals();
2018 }
2019
2020 /**
2021 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2022 *
2023 * During code generation, we create tons of temporary variables, many of
2024 * which get immediately killed and are never used again. Yet, in later
2025 * optimization and analysis passes, such as compute_live_intervals, we need
2026 * to loop over all the virtual GRFs. Compacting them can save a lot of
2027 * overhead.
2028 */
2029 bool
2030 fs_visitor::compact_virtual_grfs()
2031 {
2032 bool progress = false;
2033 int remap_table[this->virtual_grf_count];
2034 memset(remap_table, -1, sizeof(remap_table));
2035
2036 /* Mark which virtual GRFs are used. */
2037 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2038 if (inst->dst.file == GRF)
2039 remap_table[inst->dst.reg] = 0;
2040
2041 for (int i = 0; i < inst->sources; i++) {
2042 if (inst->src[i].file == GRF)
2043 remap_table[inst->src[i].reg] = 0;
2044 }
2045 }
2046
2047 /* Compact the GRF arrays. */
2048 int new_index = 0;
2049 for (int i = 0; i < this->virtual_grf_count; i++) {
2050 if (remap_table[i] == -1) {
2051 /* We just found an unused register. This means that we are
2052 * actually going to compact something.
2053 */
2054 progress = true;
2055 } else {
2056 remap_table[i] = new_index;
2057 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
2058 invalidate_live_intervals();
2059 ++new_index;
2060 }
2061 }
2062
2063 this->virtual_grf_count = new_index;
2064
2065 /* Patch all the instructions to use the newly renumbered registers */
2066 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2067 if (inst->dst.file == GRF)
2068 inst->dst.reg = remap_table[inst->dst.reg];
2069
2070 for (int i = 0; i < inst->sources; i++) {
2071 if (inst->src[i].file == GRF)
2072 inst->src[i].reg = remap_table[inst->src[i].reg];
2073 }
2074 }
2075
2076 /* Patch all the references to delta_x/delta_y, since they're used in
2077 * register allocation. If they're unused, switch them to BAD_FILE so
2078 * we don't think some random VGRF is delta_x/delta_y.
2079 */
2080 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2081 if (delta_x[i].file == GRF) {
2082 if (remap_table[delta_x[i].reg] != -1) {
2083 delta_x[i].reg = remap_table[delta_x[i].reg];
2084 } else {
2085 delta_x[i].file = BAD_FILE;
2086 }
2087 }
2088 }
2089 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2090 if (delta_y[i].file == GRF) {
2091 if (remap_table[delta_y[i].reg] != -1) {
2092 delta_y[i].reg = remap_table[delta_y[i].reg];
2093 } else {
2094 delta_y[i].file = BAD_FILE;
2095 }
2096 }
2097 }
2098
2099 return progress;
2100 }
2101
2102 /*
2103 * Implements array access of uniforms by inserting a
2104 * PULL_CONSTANT_LOAD instruction.
2105 *
2106 * Unlike temporary GRF array access (which we don't support, due to
2107 * the difficulty of doing relative addressing on instruction
2108 * destinations), we could potentially do array access of uniforms
2109 * that were loaded in GRF space as push constants. In real-world
2110 * usage we've seen, though, the arrays being used are always larger
2111 * than we could load as push constants, so just always move all
2112 * uniform array access out to a pull constant buffer.
2113 */
2114 void
2115 fs_visitor::move_uniform_array_access_to_pull_constants()
2116 {
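/* As in assign_constant_locations(), only the first (SIMD8) compile gets
 * to decide where uniforms live; the SIMD16 compile reuses its decisions.
 */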
2117 if (dispatch_width != 8)
2118 return;
2119
2120 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2121 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2122
2123 /* Walk through and find array access of uniforms. Put a copy of that
2124 * uniform in the pull constant buffer.
2125 *
2126 * Note that we don't move constant-indexed accesses to arrays. No
2127 * testing has been done of the performance impact of this choice.
2128 */
2129 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2130 for (int i = 0; i < inst->sources; i++) {
2131 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2132 continue;
2133
2134 int uniform = inst->src[i].reg;
2135
2136 /* If this array isn't already present in the pull constant buffer,
2137 * add it.
2138 */
2139 if (pull_constant_loc[uniform] == -1) {
2140 const gl_constant_value **values = &stage_prog_data->param[uniform];
2141
2142 assert(param_size[uniform]);
2143
2144 for (int j = 0; j < param_size[uniform]; j++) {
2145 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2146
2147 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2148 values[j];
2149 }
2150 }
2151 }
2152 }
2153 }
2154
2155 /**
2156 * Assign UNIFORM file registers to either push constants or pull constants.
2157 *
2158 * We allow a fragment shader to have more than the specified minimum
2159 * maximum number of fragment shader uniform components (64). If
2160 * there are too many of these, they'd fill up all of the register space.
2161 * So, this will push some of them out to the pull constant buffer and
2162 * update the program to load them.
2163 */
2164 void
2165 fs_visitor::assign_constant_locations()
2166 {
2167 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2168 if (dispatch_width != 8)
2169 return;
2170
2171 /* Find which UNIFORM registers are still in use. */
2172 bool is_live[uniforms];
2173 for (unsigned int i = 0; i < uniforms; i++) {
2174 is_live[i] = false;
2175 }
2176
2177 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2178 for (int i = 0; i < inst->sources; i++) {
2179 if (inst->src[i].file != UNIFORM)
2180 continue;
2181
2182 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2183 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2184 is_live[constant_nr] = true;
2185 }
2186 }
2187
2188 /* Only allow 16 registers (128 uniform components) as push constants.
2189 *
2190 * Just demote the end of the list. We could probably do better
2191 * here, demoting things that are rarely used in the program first.
2192 *
2193 * If changing this value, note the limitation about total_regs in
2194 * brw_curbe.c.
2195 */
2196 unsigned int max_push_components = 16 * 8;
2197 unsigned int num_push_constants = 0;
2198
2199 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2200
2201 for (unsigned int i = 0; i < uniforms; i++) {
2202 if (!is_live[i] || pull_constant_loc[i] != -1) {
2203 /* This UNIFORM register is either dead, or has already been demoted
2204 * to a pull const. Mark it as no longer living in the param[] array.
2205 */
2206 push_constant_loc[i] = -1;
2207 continue;
2208 }
2209
2210 if (num_push_constants < max_push_components) {
2211 /* Retain as a push constant. Record the location in the params[]
2212 * array.
2213 */
2214 push_constant_loc[i] = num_push_constants++;
2215 } else {
2216 /* Demote to a pull constant. */
2217 push_constant_loc[i] = -1;
2218
2219 int pull_index = stage_prog_data->nr_pull_params++;
2220 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2221 pull_constant_loc[i] = pull_index;
2222 }
2223 }
2224
2225 stage_prog_data->nr_params = num_push_constants;
2226
2227 /* Up until now, the param[] array has been indexed by reg + reg_offset
2228 * of UNIFORM registers. Condense it to only contain the uniforms we
2229 * chose to upload as push constants.
2230 */
2231 for (unsigned int i = 0; i < uniforms; i++) {
2232 int remapped = push_constant_loc[i];
2233
2234 if (remapped == -1)
2235 continue;
2236
2237 assert(remapped <= (int)i);
2238 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2239 }
2240 }
2241
2242 /**
2243 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2244 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2245 */
2246 void
2247 fs_visitor::demote_pull_constants()
2248 {
2249 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2250 for (int i = 0; i < inst->sources; i++) {
2251 if (inst->src[i].file != UNIFORM)
2252 continue;
2253
2254 int pull_index = pull_constant_loc[inst->src[i].reg +
2255 inst->src[i].reg_offset];
2256 if (pull_index == -1)
2257 continue;
2258
2259 /* Set up the annotation tracking for newly generated instructions. */
2260 base_ir = inst->ir;
2261 current_annotation = inst->annotation;
2262
2263 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2264 fs_reg dst = fs_reg(this, glsl_type::float_type);
2265
2266 /* Generate a pull load into dst. */
2267 if (inst->src[i].reladdr) {
2268 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2269 surf_index,
2270 *inst->src[i].reladdr,
2271 pull_index);
2272 inst->insert_before(block, &list);
2273 inst->src[i].reladdr = NULL;
2274 } else {
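/* Pull loads fetch an aligned vec4, so round the byte offset down to 16
 * bytes and use a smear to select the component we actually want.
 */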
2275 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2276 fs_inst *pull =
2277 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2278 dst, surf_index, offset);
2279 inst->insert_before(block, pull);
2280 inst->src[i].set_smear(pull_index & 3);
2281 }
2282
2283 /* Rewrite the instruction to use the temporary VGRF. */
2284 inst->src[i].file = GRF;
2285 inst->src[i].reg = dst.reg;
2286 inst->src[i].reg_offset = 0;
2287 inst->src[i].width = dispatch_width;
2288 }
2289 }
2290 invalidate_live_intervals();
2291 }
2292
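/**
 * Apply simple algebraic simplifications: fold saturates into immediate
 * MOVs, turn MUL/ADD with an identity or zero immediate into a MOV,
 * simplify trivial OR/LRP/SEL cases, and combine SQRT followed by RCP
 * into RSQ.
 */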
2293 bool
2294 fs_visitor::opt_algebraic()
2295 {
2296 bool progress = false;
2297
2298 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2299 switch (inst->opcode) {
2300 case BRW_OPCODE_MOV:
2301 if (inst->src[0].file != IMM)
2302 break;
2303
2304 if (inst->saturate) {
2305 if (inst->dst.type != inst->src[0].type)
2306 assert(!"unimplemented: saturate mixed types");
2307
2308 if (brw_saturate_immediate(inst->dst.type,
2309 &inst->src[0].fixed_hw_reg)) {
2310 inst->saturate = false;
2311 progress = true;
2312 }
2313 }
2314 break;
2315
2316 case BRW_OPCODE_MUL:
2317 if (inst->src[1].file != IMM)
2318 continue;
2319
2320 /* a * 1.0 = a */
2321 if (inst->src[1].is_one()) {
2322 inst->opcode = BRW_OPCODE_MOV;
2323 inst->src[1] = reg_undef;
2324 progress = true;
2325 break;
2326 }
2327
2328 /* a * 0.0 = 0.0 */
2329 if (inst->src[1].is_zero()) {
2330 inst->opcode = BRW_OPCODE_MOV;
2331 inst->src[0] = inst->src[1];
2332 inst->src[1] = reg_undef;
2333 progress = true;
2334 break;
2335 }
2336
2337 break;
2338 case BRW_OPCODE_ADD:
2339 if (inst->src[1].file != IMM)
2340 continue;
2341
2342 /* a + 0.0 = a */
2343 if (inst->src[1].is_zero()) {
2344 inst->opcode = BRW_OPCODE_MOV;
2345 inst->src[1] = reg_undef;
2346 progress = true;
2347 break;
2348 }
2349 break;
2350 case BRW_OPCODE_OR:
2351 if (inst->src[0].equals(inst->src[1])) {
2352 inst->opcode = BRW_OPCODE_MOV;
2353 inst->src[1] = reg_undef;
2354 progress = true;
2355 break;
2356 }
2357 break;
2358 case BRW_OPCODE_LRP:
2359 if (inst->src[1].equals(inst->src[2])) {
2360 inst->opcode = BRW_OPCODE_MOV;
2361 inst->src[0] = inst->src[1];
2362 inst->src[1] = reg_undef;
2363 inst->src[2] = reg_undef;
2364 progress = true;
2365 break;
2366 }
2367 break;
2368 case BRW_OPCODE_SEL:
2369 if (inst->src[0].equals(inst->src[1])) {
2370 inst->opcode = BRW_OPCODE_MOV;
2371 inst->src[1] = reg_undef;
2372 inst->predicate = BRW_PREDICATE_NONE;
2373 inst->predicate_inverse = false;
2374 progress = true;
2375 } else if (inst->saturate && inst->src[1].file == IMM) {
2376 switch (inst->conditional_mod) {
2377 case BRW_CONDITIONAL_LE:
2378 case BRW_CONDITIONAL_L:
2379 switch (inst->src[1].type) {
2380 case BRW_REGISTER_TYPE_F:
2381 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2382 inst->opcode = BRW_OPCODE_MOV;
2383 inst->src[1] = reg_undef;
2384 progress = true;
2385 }
2386 break;
2387 default:
2388 break;
2389 }
2390 break;
2391 case BRW_CONDITIONAL_GE:
2392 case BRW_CONDITIONAL_G:
2393 switch (inst->src[1].type) {
2394 case BRW_REGISTER_TYPE_F:
2395 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2396 inst->opcode = BRW_OPCODE_MOV;
2397 inst->src[1] = reg_undef;
2398 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2399 progress = true;
2400 }
2401 break;
2402 default:
2403 break;
2404 }
2405 default:
2406 break;
2407 }
2408 }
2409 break;
2410 case SHADER_OPCODE_RCP: {
2411 fs_inst *prev = (fs_inst *)inst->prev;
2412 if (prev->opcode == SHADER_OPCODE_SQRT) {
2413 if (inst->src[0].equals(prev->dst)) {
2414 inst->opcode = SHADER_OPCODE_RSQ;
2415 inst->src[0] = prev->src[0];
2416 progress = true;
2417 }
2418 }
2419 break;
2420 }
2421 default:
2422 break;
2423 }
2424 }
2425
2426 return progress;
2427 }
2428
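/**
 * When an instruction outside of control flow completely overwrites a
 * virtual GRF that already had a definition, rename its destination to a
 * fresh VGRF and patch up the later uses.  Splitting reuses of temporaries
 * like this shortens live ranges for the passes that follow.
 */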
2429 bool
2430 fs_visitor::opt_register_renaming()
2431 {
2432 bool progress = false;
2433 int depth = 0;
2434
2435 int remap[virtual_grf_count];
2436 memset(remap, -1, sizeof(int) * virtual_grf_count);
2437
2438 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2439 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2440 depth++;
2441 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2442 inst->opcode == BRW_OPCODE_WHILE) {
2443 depth--;
2444 }
2445
2446 /* Rewrite instruction sources. */
2447 for (int i = 0; i < inst->sources; i++) {
2448 if (inst->src[i].file == GRF &&
2449 remap[inst->src[i].reg] != -1 &&
2450 remap[inst->src[i].reg] != inst->src[i].reg) {
2451 inst->src[i].reg = remap[inst->src[i].reg];
2452 progress = true;
2453 }
2454 }
2455
2456 const int dst = inst->dst.reg;
2457
2458 if (depth == 0 &&
2459 inst->dst.file == GRF &&
2460 virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 &&
2461 !inst->is_partial_write()) {
2462 if (remap[dst] == -1) {
2463 remap[dst] = dst;
2464 } else {
2465 remap[dst] = virtual_grf_alloc(inst->dst.width / 8);
2466 inst->dst.reg = remap[dst];
2467 progress = true;
2468 }
2469 } else if (inst->dst.file == GRF &&
2470 remap[dst] != -1 &&
2471 remap[dst] != dst) {
2472 inst->dst.reg = remap[dst];
2473 progress = true;
2474 }
2475 }
2476
2477 if (progress) {
2478 invalidate_live_intervals();
2479
2480 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2481 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2482 delta_x[i].reg = remap[delta_x[i].reg];
2483 }
2484 }
2485 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2486 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2487 delta_y[i].reg = remap[delta_y[i].reg];
2488 }
2489 }
2490 }
2491
2492 return progress;
2493 }
2494
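/**
 * Look for MOVs from a GRF into an MRF and try to make the instruction
 * that computed the GRF value write into the MRF directly instead,
 * eliminating the copy.  Only applies before Gen7, where messages are
 * still built in the MRF file.
 */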
2495 bool
2496 fs_visitor::compute_to_mrf()
2497 {
2498 bool progress = false;
2499 int next_ip = 0;
2500
2501 /* No MRFs on Gen >= 7. */
2502 if (brw->gen >= 7)
2503 return false;
2504
2505 calculate_live_intervals();
2506
2507 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2508 int ip = next_ip;
2509 next_ip++;
2510
2511 if (inst->opcode != BRW_OPCODE_MOV ||
2512 inst->is_partial_write() ||
2513 inst->dst.file != MRF || inst->src[0].file != GRF ||
2514 inst->dst.type != inst->src[0].type ||
2515 inst->src[0].abs || inst->src[0].negate ||
2516 !inst->src[0].is_contiguous() ||
2517 inst->src[0].subreg_offset)
2518 continue;
2519
2520 /* Work out which hardware MRF registers are written by this
2521 * instruction.
2522 */
2523 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2524 int mrf_high;
2525 if (inst->dst.reg & BRW_MRF_COMPR4) {
2526 mrf_high = mrf_low + 4;
2527 } else if (inst->exec_size == 16) {
2528 mrf_high = mrf_low + 1;
2529 } else {
2530 mrf_high = mrf_low;
2531 }
2532
2533 /* Can't compute-to-MRF this GRF if someone else was going to
2534 * read it later.
2535 */
2536 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2537 continue;
2538
2539 /* Found a move of a GRF to a MRF. Let's see if we can go
2540 * rewrite the thing that made this GRF to write into the MRF.
2541 */
2542 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2543 if (scan_inst->dst.file == GRF &&
2544 scan_inst->dst.reg == inst->src[0].reg) {
2545 /* Found the last thing to write our reg we want to turn
2546 * into a compute-to-MRF.
2547 */
2548
2549 /* If this one instruction didn't populate all the
2550 * channels, bail. We might be able to rewrite everything
2551 * that writes that reg, but it would require smarter
2552 * tracking to delay the rewriting until complete success.
2553 */
2554 if (scan_inst->is_partial_write())
2555 break;
2556
2557 /* Instructions that write more than one register would require us to
2558 * understand how to coalesce out more than one MOV at a time.
2559 */
2560 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2561 break;
2562
2563 /* SEND instructions can't have MRF as a destination. */
2564 if (scan_inst->mlen)
2565 break;
2566
2567 if (brw->gen == 6) {
2568 /* gen6 math instructions must have the destination be
2569 * GRF, so no compute-to-MRF for them.
2570 */
2571 if (scan_inst->is_math()) {
2572 break;
2573 }
2574 }
2575
2576 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2577 /* Found the creator of our MRF's source value. */
2578 scan_inst->dst.file = MRF;
2579 scan_inst->dst.reg = inst->dst.reg;
2580 scan_inst->saturate |= inst->saturate;
2581 inst->remove(block);
2582 progress = true;
2583 }
2584 break;
2585 }
2586
2587 /* We don't handle control flow here. Most computation of
2588 * values that end up in MRFs are shortly before the MRF
2589 * write anyway.
2590 */
2591 if (block->start() == scan_inst)
2592 break;
2593
2594 /* You can't read from an MRF, so if someone else reads our
2595 * MRF's source GRF that we wanted to rewrite, that stops us.
2596 */
2597 bool interfered = false;
2598 for (int i = 0; i < scan_inst->sources; i++) {
2599 if (scan_inst->src[i].file == GRF &&
2600 scan_inst->src[i].reg == inst->src[0].reg &&
2601 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2602 interfered = true;
2603 }
2604 }
2605 if (interfered)
2606 break;
2607
2608 if (scan_inst->dst.file == MRF) {
2609 /* If somebody else writes our MRF here, we can't
2610 * compute-to-MRF before that.
2611 */
2612 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2613 int scan_mrf_high;
2614
2615 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2616 scan_mrf_high = scan_mrf_low + 4;
2617 } else if (scan_inst->exec_size == 16) {
2618 scan_mrf_high = scan_mrf_low + 1;
2619 } else {
2620 scan_mrf_high = scan_mrf_low;
2621 }
2622
2623 if (mrf_low == scan_mrf_low ||
2624 mrf_low == scan_mrf_high ||
2625 mrf_high == scan_mrf_low ||
2626 mrf_high == scan_mrf_high) {
2627 break;
2628 }
2629 }
2630
2631 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2632 /* Found a SEND instruction, which means that there are
2633 * live values in MRFs from base_mrf to base_mrf +
2634 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2635 * above it.
2636 */
2637 if (mrf_low >= scan_inst->base_mrf &&
2638 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2639 break;
2640 }
2641 if (mrf_high >= scan_inst->base_mrf &&
2642 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2643 break;
2644 }
2645 }
2646 }
2647 }
2648
2649 if (progress)
2650 invalidate_live_intervals();
2651
2652 return progress;
2653 }
2654
2655 /**
2656 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2657 * instructions to FS_OPCODE_REP_FB_WRITE.
2658 */
2659 void
2660 fs_visitor::emit_repclear_shader()
2661 {
2662 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2663 int base_mrf = 1;
2664 int color_mrf = base_mrf + 2;
2665
2666 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2667 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2668 mov->force_writemask_all = true;
2669
2670 fs_inst *write;
2671 if (key->nr_color_regions == 1) {
2672 write = emit(FS_OPCODE_REP_FB_WRITE);
2673 write->saturate = key->clamp_fragment_color;
2674 write->base_mrf = color_mrf;
2675 write->target = 0;
2676 write->header_present = false;
2677 write->mlen = 1;
2678 } else {
2679 assume(key->nr_color_regions > 0);
2680 for (int i = 0; i < key->nr_color_regions; ++i) {
2681 write = emit(FS_OPCODE_REP_FB_WRITE);
2682 write->saturate = key->clamp_fragment_color;
2683 write->base_mrf = base_mrf;
2684 write->target = i;
2685 write->header_present = true;
2686 write->mlen = 3;
2687 }
2688 }
2689 write->eot = true;
2690
2691 calculate_cfg();
2692
2693 assign_constant_locations();
2694 assign_curb_setup();
2695
2696 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2697 assert(mov->src[0].file == HW_REG);
2698 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2699 }
2700
2701 /**
2702 * Walks through basic blocks, looking for repeated MRF writes and
2703 * removing the later ones.
2704 */
2705 bool
2706 fs_visitor::remove_duplicate_mrf_writes()
2707 {
2708 fs_inst *last_mrf_move[16];
2709 bool progress = false;
2710
2711 /* The MRF tracking below doesn't understand compressed (SIMD16) instructions yet, so bail in that case. */
2712 if (dispatch_width == 16)
2713 return false;
2714
2715 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2716
2717 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2718 if (inst->is_control_flow()) {
2719 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2720 }
2721
2722 if (inst->opcode == BRW_OPCODE_MOV &&
2723 inst->dst.file == MRF) {
2724 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2725 if (prev_inst && inst->equals(prev_inst)) {
2726 inst->remove(block);
2727 progress = true;
2728 continue;
2729 }
2730 }
2731
2732 /* Clear out the last-write records for MRFs that were overwritten. */
2733 if (inst->dst.file == MRF) {
2734 last_mrf_move[inst->dst.reg] = NULL;
2735 }
2736
2737 if (inst->mlen > 0 && inst->base_mrf != -1) {
2738 /* Found a SEND instruction, which will include two or fewer
2739 * implied MRF writes. We could do better here.
2740 */
2741 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2742 last_mrf_move[inst->base_mrf + i] = NULL;
2743 }
2744 }
2745
2746 /* Clear out any MRF move records whose sources got overwritten. */
2747 if (inst->dst.file == GRF) {
2748 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2749 if (last_mrf_move[i] &&
2750 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2751 last_mrf_move[i] = NULL;
2752 }
2753 }
2754 }
2755
2756 if (inst->opcode == BRW_OPCODE_MOV &&
2757 inst->dst.file == MRF &&
2758 inst->src[0].file == GRF &&
2759 !inst->is_partial_write()) {
2760 last_mrf_move[inst->dst.reg] = inst;
2761 }
2762 }
2763
2764 if (progress)
2765 invalidate_live_intervals();
2766
2767 return progress;
2768 }
2769
2770 static void
2771 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2772 int first_grf, int grf_len)
2773 {
2774 /* Clear the flag for registers that actually got read (as expected). */
2775 for (int i = 0; i < inst->sources; i++) {
2776 int grf;
2777 if (inst->src[i].file == GRF) {
2778 grf = inst->src[i].reg;
2779 } else if (inst->src[i].file == HW_REG &&
2780 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2781 grf = inst->src[i].fixed_hw_reg.nr;
2782 } else {
2783 continue;
2784 }
2785
2786 if (grf >= first_grf &&
2787 grf < first_grf + grf_len) {
2788 deps[grf - first_grf] = false;
2789 if (inst->exec_size == 16)
2790 deps[grf - first_grf + 1] = false;
2791 }
2792 }
2793 }
2794
2795 /**
2796 * Implements this workaround for the original 965:
2797 *
2798 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2799 * check for post destination dependencies on this instruction, software
2800 * must ensure that there is no destination hazard for the case of ‘write
2801 * followed by a posted write’ shown in the following example.
2802 *
2803 * 1. mov r3 0
2804 * 2. send r3.xy <rest of send instruction>
2805 * 3. mov r2 r3
2806 *
2807 * Due to no post-destination dependency check on the ‘send’, the above
2808 * code sequence could have two instructions (1 and 2) in flight at the
2809 * same time that both consider ‘r3’ as the target of their final writes.
2810 */
2811 void
2812 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2813 fs_inst *inst)
2814 {
2815 int write_len = inst->regs_written;
2816 int first_write_grf = inst->dst.reg;
2817 bool needs_dep[BRW_MAX_MRF];
2818 assert(write_len < (int)sizeof(needs_dep) - 1);
2819
2820 memset(needs_dep, false, sizeof(needs_dep));
2821 memset(needs_dep, true, write_len);
2822
2823 clear_deps_for_inst_src(inst, dispatch_width,
2824 needs_dep, first_write_grf, write_len);
2825
2826 /* Walk backwards looking for writes to registers we're writing which
2827 * aren't read since being written. If we hit the start of the program,
2828 * we assume that there are no outstanding dependencies on entry to the
2829 * program.
2830 */
2831 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2832 /* If we hit control flow, assume that there *are* outstanding
2833 * dependencies, and force their cleanup before our instruction.
2834 */
2835 if (block->start() == scan_inst) {
2836 for (int i = 0; i < write_len; i++) {
2837 if (needs_dep[i]) {
2838 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2839 }
2840 }
2841 return;
2842 }
2843
2844 /* We insert our reads as late as possible, on the assumption that any
2845 * non-MOV instruction that might have left us an outstanding
2846 * dependency has more latency than a MOV.
2847 */
2848 if (scan_inst->dst.file == GRF) {
2849 for (int i = 0; i < scan_inst->regs_written; i++) {
2850 int reg = scan_inst->dst.reg + i;
2851
2852 if (reg >= first_write_grf &&
2853 reg < first_write_grf + write_len &&
2854 needs_dep[reg - first_write_grf]) {
2855 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2856 needs_dep[reg - first_write_grf] = false;
2857 if (scan_inst->exec_size == 16)
2858 needs_dep[reg - first_write_grf + 1] = false;
2859 }
2860 }
2861 }
2862
2863 /* Clear the flag for registers that actually got read (as expected). */
2864 clear_deps_for_inst_src(scan_inst, dispatch_width,
2865 needs_dep, first_write_grf, write_len);
2866
2867 /* Continue the loop only if we haven't resolved all the dependencies */
2868 int i;
2869 for (i = 0; i < write_len; i++) {
2870 if (needs_dep[i])
2871 break;
2872 }
2873 if (i == write_len)
2874 return;
2875 }
2876 }
2877
2878 /**
2879 * Implements this workaround for the original 965:
2880 *
2881 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2882 * used as a destination register until after it has been sourced by an
2883 * instruction with a different destination register.
2884 */
2885 void
2886 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2887 {
2888 int write_len = inst->regs_written;
2889 int first_write_grf = inst->dst.reg;
2890 bool needs_dep[BRW_MAX_MRF];
2891 assert(write_len < (int)sizeof(needs_dep) - 1);
2892
2893 memset(needs_dep, false, sizeof(needs_dep));
2894 memset(needs_dep, true, write_len);
2895 /* Walk forwards looking for writes to registers we're writing which aren't
2896 * read before being written.
2897 */
2898 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2899 /* If we hit control flow, force resolve all remaining dependencies. */
2900 if (block->end() == scan_inst) {
2901 for (int i = 0; i < write_len; i++) {
2902 if (needs_dep[i])
2903 scan_inst->insert_before(block,
2904 DEP_RESOLVE_MOV(first_write_grf + i));
2905 }
2906 return;
2907 }
2908
2909 /* Clear the flag for registers that actually got read (as expected). */
2910 clear_deps_for_inst_src(scan_inst, dispatch_width,
2911 needs_dep, first_write_grf, write_len);
2912
2913 /* We insert our reads as late as possible since they're reading the
2914 * result of a SEND, which has massive latency.
2915 */
2916 if (scan_inst->dst.file == GRF &&
2917 scan_inst->dst.reg >= first_write_grf &&
2918 scan_inst->dst.reg < first_write_grf + write_len &&
2919 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2920 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2921 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2922 }
2923
2924 /* Continue the loop only if we haven't resolved all the dependencies */
2925 int i;
2926 for (i = 0; i < write_len; i++) {
2927 if (needs_dep[i])
2928 break;
2929 }
2930 if (i == write_len)
2931 return;
2932 }
2933
2934 /* If we hit the end of the program, resolve all remaining dependencies out
2935 * of paranoia.
2936 */
2937 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2938 assert(last_inst->eot);
2939 for (int i = 0; i < write_len; i++) {
2940 if (needs_dep[i])
2941 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2942 }
2943 }
2944
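/**
 * Walk the program looking for send-like instructions that write the GRF
 * file and apply both of the original-965 dependency workarounds above to
 * each of them.
 */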
2945 void
2946 fs_visitor::insert_gen4_send_dependency_workarounds()
2947 {
2948 if (brw->gen != 4 || brw->is_g4x)
2949 return;
2950
2951 bool progress = false;
2952
2953 /* Note that we're done with register allocation, so GRF fs_regs always
2954 * have a .reg_offset of 0.
2955 */
2956
2957 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2958 if (inst->mlen != 0 && inst->dst.file == GRF) {
2959 insert_gen4_pre_send_dependency_workarounds(block, inst);
2960 insert_gen4_post_send_dependency_workarounds(block, inst);
2961 progress = true;
2962 }
2963 }
2964
2965 if (progress)
2966 invalidate_live_intervals();
2967 }
2968
2969 /**
2970 * Turns the generic expression-style uniform pull constant load instruction
2971 * into a hardware-specific series of instructions for loading a pull
2972 * constant.
2973 *
2974 * The expression style allows the CSE pass before this to optimize out
2975 * repeated loads from the same offset, and gives the pre-register-allocation
2976 * scheduling full flexibility, while the conversion to native instructions
2977 * allows the post-register-allocation scheduler the best information
2978 * possible.
2979 *
2980 * Note that execution masking for setting up pull constant loads is special:
2981 * the channels that need to be written are unrelated to the current execution
2982 * mask, since a later instruction will use one of the result channels as a
2983 * source operand for all 8 or 16 of its channels.
2984 */
2985 void
2986 fs_visitor::lower_uniform_pull_constant_loads()
2987 {
2988 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2989 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2990 continue;
2991
2992 if (brw->gen >= 7) {
2993 /* The offset arg before was a vec4-aligned byte offset. We need to
2994 * turn it into a dword offset.
2995 */
2996 fs_reg const_offset_reg = inst->src[1];
2997 assert(const_offset_reg.file == IMM &&
2998 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2999 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3000 fs_reg payload = fs_reg(this, glsl_type::uint_type);
3001
3002 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3003 * Reserve space for the register.
3004 */
3005 if (brw->gen >= 9) {
3006 payload.reg_offset++;
3007 virtual_grf_sizes[payload.reg] = 2;
3008 }
3009
3010 /* This is actually going to be a MOV, but since only the first dword
3011 * is accessed, we have a special opcode to do just that one. Note
3012 * that this needs to be an operation that will be considered a def
3013 * by live variable analysis, or register allocation will explode.
3014 */
3015 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3016 8, payload, const_offset_reg);
3017 setup->force_writemask_all = true;
3018
3019 setup->ir = inst->ir;
3020 setup->annotation = inst->annotation;
3021 inst->insert_before(block, setup);
3022
3023 /* Similarly, this will only populate the first 4 channels of the
3024 * result register (since we only use smear values from 0-3), but we
3025 * don't tell the optimizer.
3026 */
3027 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3028 inst->src[1] = payload;
3029
3030 invalidate_live_intervals();
3031 } else {
3032 /* Before register allocation, we didn't tell the scheduler about the
3033 * MRF we use. We know it's safe to use this MRF because nothing
3034 * else does except for register spill/unspill, which generates and
3035 * uses its MRF within a single IR instruction.
3036 */
3037 inst->base_mrf = 14;
3038 inst->mlen = 1;
3039 }
3040 }
3041 }
3042
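/**
 * Lower SHADER_OPCODE_LOAD_PAYLOAD into the per-register MOVs that actually
 * build the payload.  Per-register metadata about earlier writes is tracked
 * so each MOV inherits the right force_writemask_all/force_sechalf flags,
 * and adjacent SIMD8 halves headed for the MRF file are combined into a
 * single COMPR4 write where the hardware supports it.
 */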
3043 bool
3044 fs_visitor::lower_load_payload()
3045 {
3046 bool progress = false;
3047
3048 int vgrf_to_reg[virtual_grf_count];
3049 int reg_count = 16; /* Leave room for MRF */
3050 for (int i = 0; i < virtual_grf_count; ++i) {
3051 vgrf_to_reg[i] = reg_count;
3052 reg_count += virtual_grf_sizes[i];
3053 }
3054
3055 struct {
3056 bool written:1; /* Whether this register has ever been written */
3057 bool force_writemask_all:1;
3058 bool force_sechalf:1;
3059 } metadata[reg_count];
3060 memset(metadata, 0, sizeof(metadata));
3061
3062 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3063 int dst_reg;
3064 if (inst->dst.file == GRF) {
3065 dst_reg = vgrf_to_reg[inst->dst.reg];
3066 } else {
3067 /* MRF */
3068 dst_reg = inst->dst.reg;
3069 }
3070
3071 if (inst->dst.file == MRF || inst->dst.file == GRF) {
3072 bool force_sechalf = inst->force_sechalf;
3073 bool toggle_sechalf = inst->dst.width == 16 &&
3074 type_sz(inst->dst.type) == 4;
3075 for (int i = 0; i < inst->regs_written; ++i) {
3076 metadata[dst_reg + i].written = true;
3077 metadata[dst_reg + i].force_sechalf = force_sechalf;
3078 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3079 force_sechalf = (toggle_sechalf != force_sechalf);
3080 }
3081 }
3082
3083 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3084 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3085 fs_reg dst = inst->dst;
3086
3087 for (int i = 0; i < inst->sources; i++) {
3088 dst.width = inst->src[i].effective_width;
3089 dst.type = inst->src[i].type;
3090
3091 if (inst->src[i].file == BAD_FILE) {
3092 /* Do nothing but otherwise increment as normal */
3093 } else if (dst.file == MRF &&
3094 dst.width == 8 &&
3095 brw->has_compr4 &&
3096 i + 4 < inst->sources &&
3097 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3098 fs_reg compr4_dst = dst;
3099 compr4_dst.reg += BRW_MRF_COMPR4;
3100 compr4_dst.width = 16;
3101 fs_reg compr4_src = inst->src[i];
3102 compr4_src.width = 16;
3103 fs_inst *mov = MOV(compr4_dst, compr4_src);
3104 mov->force_writemask_all = true;
3105 inst->insert_before(block, mov);
3106 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3107 inst->src[i + 4].file = BAD_FILE;
3108 } else {
3109 fs_inst *mov = MOV(dst, inst->src[i]);
3110 if (inst->src[i].file == GRF) {
3111 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3112 inst->src[i].reg_offset;
3113 mov->force_sechalf = metadata[src_reg].force_sechalf;
3114 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3115 metadata[dst_reg] = metadata[src_reg];
3116 if (dst.width * type_sz(dst.type) > 32) {
3117 assert((!metadata[src_reg].written ||
3118 !metadata[src_reg].force_sechalf) &&
3119 (!metadata[src_reg + 1].written ||
3120 metadata[src_reg + 1].force_sechalf));
3121 metadata[dst_reg + 1] = metadata[src_reg + 1];
3122 }
3123 } else {
3124 metadata[dst_reg].force_writemask_all = false;
3125 metadata[dst_reg].force_sechalf = false;
3126 if (dst.width == 16) {
3127 metadata[dst_reg + 1].force_writemask_all = false;
3128 metadata[dst_reg + 1].force_sechalf = true;
3129 }
3130 }
3131 inst->insert_before(block, mov);
3132 }
3133
3134 dst = offset(dst, 1);
3135 }
3136
3137 inst->remove(block);
3138 progress = true;
3139 }
3140 }
3141
3142 if (progress)
3143 invalidate_live_intervals();
3144
3145 return progress;
3146 }
3147
3148 void
3149 fs_visitor::dump_instructions()
3150 {
3151 dump_instructions(NULL);
3152 }
3153
3154 void
3155 fs_visitor::dump_instructions(const char *name)
3156 {
3157 calculate_register_pressure();
3158 FILE *file = stderr;
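/* Only write the dump to a named file when not running as root; otherwise
 * fall back to stderr.
 */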
3159 if (name && geteuid() != 0) {
3160 file = fopen(name, "w");
3161 if (!file)
3162 file = stderr;
3163 }
3164
3165 int ip = 0, max_pressure = 0;
3166 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3167 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3168 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3169 dump_instruction(inst, file);
3170 ++ip;
3171 }
3172 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3173
3174 if (file != stderr) {
3175 fclose(file);
3176 }
3177 }
3178
3179 void
3180 fs_visitor::dump_instruction(backend_instruction *be_inst)
3181 {
3182 dump_instruction(be_inst, stderr);
3183 }
3184
3185 void
3186 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3187 {
3188 fs_inst *inst = (fs_inst *)be_inst;
3189
3190 if (inst->predicate) {
3191 fprintf(file, "(%cf0.%d) ",
3192 inst->predicate_inverse ? '-' : '+',
3193 inst->flag_subreg);
3194 }
3195
3196 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3197 if (inst->saturate)
3198 fprintf(file, ".sat");
3199 if (inst->conditional_mod) {
3200 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3201 if (!inst->predicate &&
3202 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3203 inst->opcode != BRW_OPCODE_IF &&
3204 inst->opcode != BRW_OPCODE_WHILE))) {
3205 fprintf(file, ".f0.%d", inst->flag_subreg);
3206 }
3207 }
3208 fprintf(file, "(%d) ", inst->exec_size);
3209
3210
3211 switch (inst->dst.file) {
3212 case GRF:
3213 fprintf(file, "vgrf%d", inst->dst.reg);
3214 if (inst->dst.width != dispatch_width)
3215 fprintf(file, "@%d", inst->dst.width);
3216 if (virtual_grf_sizes[inst->dst.reg] != inst->dst.width / 8 ||
3217 inst->dst.subreg_offset)
3218 fprintf(file, "+%d.%d",
3219 inst->dst.reg_offset, inst->dst.subreg_offset);
3220 break;
3221 case MRF:
3222 fprintf(file, "m%d", inst->dst.reg);
3223 break;
3224 case BAD_FILE:
3225 fprintf(file, "(null)");
3226 break;
3227 case UNIFORM:
3228 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3229 break;
3230 case ATTR:
3231 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3232 break;
3233 case HW_REG:
3234 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3235 switch (inst->dst.fixed_hw_reg.nr) {
3236 case BRW_ARF_NULL:
3237 fprintf(file, "null");
3238 break;
3239 case BRW_ARF_ADDRESS:
3240 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3241 break;
3242 case BRW_ARF_ACCUMULATOR:
3243 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3244 break;
3245 case BRW_ARF_FLAG:
3246 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3247 inst->dst.fixed_hw_reg.subnr);
3248 break;
3249 default:
3250 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3251 inst->dst.fixed_hw_reg.subnr);
3252 break;
3253 }
3254 } else {
3255 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3256 }
3257 if (inst->dst.fixed_hw_reg.subnr)
3258 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3259 break;
3260 default:
3261 fprintf(file, "???");
3262 break;
3263 }
3264 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3265
3266 for (int i = 0; i < inst->sources; i++) {
3267 if (inst->src[i].negate)
3268 fprintf(file, "-");
3269 if (inst->src[i].abs)
3270 fprintf(file, "|");
3271 switch (inst->src[i].file) {
3272 case GRF:
3273 fprintf(file, "vgrf%d", inst->src[i].reg);
3274 if (inst->src[i].width != dispatch_width)
3275 fprintf(file, "@%d", inst->src[i].width);
3276 if (virtual_grf_sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3277 inst->src[i].subreg_offset)
3278 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3279 inst->src[i].subreg_offset);
3280 break;
3281 case MRF:
3282 fprintf(file, "***m%d***", inst->src[i].reg);
3283 break;
3284 case ATTR:
3285 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3286 break;
3287 case UNIFORM:
3288 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3289 if (inst->src[i].reladdr) {
3290 fprintf(file, "+reladdr");
3291 } else if (inst->src[i].subreg_offset) {
3292 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3293 inst->src[i].subreg_offset);
3294 }
3295 break;
3296 case BAD_FILE:
3297 fprintf(file, "(null)");
3298 break;
3299 case IMM:
3300 switch (inst->src[i].type) {
3301 case BRW_REGISTER_TYPE_F:
3302 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3303 break;
3304 case BRW_REGISTER_TYPE_D:
3305 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3306 break;
3307 case BRW_REGISTER_TYPE_UD:
3308 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3309 break;
3310 case BRW_REGISTER_TYPE_VF:
3311 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3312 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3313 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3314 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3315 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3316 break;
3317 default:
3318 fprintf(file, "???");
3319 break;
3320 }
3321 break;
3322 case HW_REG:
3323 if (inst->src[i].fixed_hw_reg.negate)
3324 fprintf(file, "-");
3325 if (inst->src[i].fixed_hw_reg.abs)
3326 fprintf(file, "|");
3327 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3328 switch (inst->src[i].fixed_hw_reg.nr) {
3329 case BRW_ARF_NULL:
3330 fprintf(file, "null");
3331 break;
3332 case BRW_ARF_ADDRESS:
3333 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3334 break;
3335 case BRW_ARF_ACCUMULATOR:
3336 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3337 break;
3338 case BRW_ARF_FLAG:
3339 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3340 inst->src[i].fixed_hw_reg.subnr);
3341 break;
3342 default:
3343 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3344 inst->src[i].fixed_hw_reg.subnr);
3345 break;
3346 }
3347 } else {
3348 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3349 }
3350 if (inst->src[i].fixed_hw_reg.subnr)
3351 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3352 if (inst->src[i].fixed_hw_reg.abs)
3353 fprintf(file, "|");
3354 break;
3355 default:
3356 fprintf(file, "???");
3357 break;
3358 }
3359 if (inst->src[i].abs)
3360 fprintf(file, "|");
3361
3362 if (inst->src[i].file != IMM) {
3363 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3364 }
3365
3366 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3367 fprintf(file, ", ");
3368 }
3369
3370 fprintf(file, " ");
3371
3372 if (dispatch_width == 16 && inst->exec_size == 8) {
3373 if (inst->force_sechalf)
3374 fprintf(file, "2ndhalf ");
3375 else
3376 fprintf(file, "1sthalf ");
3377 }
3378
3379 fprintf(file, "\n");
3380 }
3381
3382 /**
3383 * Possibly returns an instruction that set up @param reg.
3384 *
3385 * Sometimes we want to take the result of some expression/variable
3386 * dereference tree and rewrite the instruction generating the result
3387 * of the tree. When processing the tree, we know that the
3388 * instructions generated are all writing temporaries that are dead
3389 * outside of this tree. So, if we have some instructions that write
3390 * a temporary, we're free to point that temp write somewhere else.
3391 *
3392 * Note that this doesn't guarantee that the returned instruction wrote
3393 * only reg -- it might be the size=4 destination of a texture instruction.
3394 */
3395 fs_inst *
3396 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3397 fs_inst *end,
3398 const fs_reg &reg)
3399 {
3400 if (end == start ||
3401 end->is_partial_write() ||
3402 reg.reladdr ||
3403 !reg.equals(end->dst)) {
3404 return NULL;
3405 } else {
3406 return end;
3407 }
3408 }
3409
3410 void
3411 fs_visitor::setup_payload_gen6()
3412 {
3413 bool uses_depth =
3414 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3415 unsigned barycentric_interp_modes =
3416 (stage == MESA_SHADER_FRAGMENT) ?
3417 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3418
3419 assert(brw->gen >= 6);
3420
3421 /* R0-1: masks, pixel X/Y coordinates. */
3422 payload.num_regs = 2;
3423 /* R2: only for 32-pixel dispatch. */
3424
3425 /* R3-26: barycentric interpolation coordinates. These appear in the
3426 * same order that they appear in the brw_wm_barycentric_interp_mode
3427 * enum. Each set of coordinates occupies 2 registers if dispatch width
3428 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3429 * appear if they were enabled using the "Barycentric Interpolation
3430 * Mode" bits in WM_STATE.
3431 */
3432 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3433 if (barycentric_interp_modes & (1 << i)) {
3434 payload.barycentric_coord_reg[i] = payload.num_regs;
3435 payload.num_regs += 2;
3436 if (dispatch_width == 16) {
3437 payload.num_regs += 2;
3438 }
3439 }
3440 }
3441
3442 /* R27: interpolated depth if uses source depth */
3443 if (uses_depth) {
3444 payload.source_depth_reg = payload.num_regs;
3445 payload.num_regs++;
3446 if (dispatch_width == 16) {
3447 /* R28: interpolated depth if not SIMD8. */
3448 payload.num_regs++;
3449 }
3450 }
3451 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3452 if (uses_depth) {
3453 payload.source_w_reg = payload.num_regs;
3454 payload.num_regs++;
3455 if (dispatch_width == 16) {
3456 /* R30: interpolated W if not SIMD8. */
3457 payload.num_regs++;
3458 }
3459 }
3460
3461 if (stage == MESA_SHADER_FRAGMENT) {
3462 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3463 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3464 prog_data->uses_pos_offset = key->compute_pos_offset;
3465 /* R31: MSAA position offsets. */
3466 if (prog_data->uses_pos_offset) {
3467 payload.sample_pos_reg = payload.num_regs;
3468 payload.num_regs++;
3469 }
3470 }
3471
3472 /* R32: MSAA input coverage mask */
3473 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3474 assert(brw->gen >= 7);
3475 payload.sample_mask_in_reg = payload.num_regs;
3476 payload.num_regs++;
3477 if (dispatch_width == 16) {
3478 /* R33: input coverage mask if not SIMD8. */
3479 payload.num_regs++;
3480 }
3481 }
3482
3483 /* R34-: bary for 32-pixel. */
3484 /* R58-59: interp W for 32-pixel. */
3485
3486 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3487 source_depth_to_render_target = true;
3488 }
3489 }
3490
3491 void
3492 fs_visitor::setup_vs_payload()
3493 {
3494 /* R0: thread header, R1: urb handles */
3495 payload.num_regs = 2;
3496 }
3497
3498 void
3499 fs_visitor::assign_binding_table_offsets()
3500 {
3501 assert(stage == MESA_SHADER_FRAGMENT);
3502 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3503 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3504 uint32_t next_binding_table_offset = 0;
3505
3506 /* If there are no color regions, we still perform an FB write to a null
3507 * renderbuffer, which we place at surface index 0.
3508 */
3509 prog_data->binding_table.render_target_start = next_binding_table_offset;
3510 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3511
3512 assign_common_binding_table_offsets(next_binding_table_offset);
3513 }
3514
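/**
 * Compute, for each instruction, the total size of the virtual GRFs live
 * at that point.  dump_instructions() prints this next to every
 * instruction.
 */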
3515 void
3516 fs_visitor::calculate_register_pressure()
3517 {
3518 invalidate_live_intervals();
3519 calculate_live_intervals();
3520
3521 unsigned num_instructions = 0;
3522 foreach_block(block, cfg)
3523 num_instructions += block->instructions.length();
3524
3525 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3526
3527 for (int reg = 0; reg < virtual_grf_count; reg++) {
3528 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3529 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3530 }
3531 }
3532
3533 void
3534 fs_visitor::optimize()
3535 {
3536 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3537
3538 calculate_cfg();
3539
3540 split_virtual_grfs();
3541
3542 move_uniform_array_access_to_pull_constants();
3543 assign_constant_locations();
3544 demote_pull_constants();
3545
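/* OPT() runs a pass, accumulates whether it made progress, and, when
 * INTEL_DEBUG=optimizer is set, dumps the instruction list after any pass
 * that changed something so the effect of each pass can be inspected.
 */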
3546 #define OPT(pass, args...) ({ \
3547 pass_num++; \
3548 bool this_progress = pass(args); \
3549 \
3550 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3551 char filename[64]; \
3552 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3553 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3554 \
3555 backend_visitor::dump_instructions(filename); \
3556 } \
3557 \
3558 progress = progress || this_progress; \
3559 this_progress; \
3560 })
3561
3562 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3563 char filename[64];
3564 snprintf(filename, 64, "%s%d-%04d-00-start",
3565 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3566
3567 backend_visitor::dump_instructions(filename);
3568 }
3569
3570 bool progress;
3571 int iteration = 0;
3572 int pass_num = 0;
3573 do {
3574 progress = false;
3575 pass_num = 0;
3576 iteration++;
3577
3578 OPT(remove_duplicate_mrf_writes);
3579
3580 OPT(opt_algebraic);
3581 OPT(opt_cse);
3582 OPT(opt_copy_propagate);
3583 OPT(opt_peephole_predicated_break);
3584 OPT(dead_code_eliminate);
3585 OPT(opt_peephole_sel);
3586 OPT(dead_control_flow_eliminate, this);
3587 OPT(opt_register_renaming);
3588 OPT(opt_saturate_propagation);
3589 OPT(register_coalesce);
3590 OPT(compute_to_mrf);
3591
3592 OPT(compact_virtual_grfs);
3593 } while (progress);
3594
3595 pass_num = 0;
3596
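/* lower_load_payload() emits a stream of copies; re-run split_virtual_grfs()
 * and a round of coalescing, compute-to-MRF, and dead code elimination to
 * clean up after it.
 */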
3597 if (OPT(lower_load_payload)) {
3598 split_virtual_grfs();
3599 OPT(register_coalesce);
3600 OPT(compute_to_mrf);
3601 OPT(dead_code_eliminate);
3602 }
3603
3604 lower_uniform_pull_constant_loads();
3605 }
3606
3607 void
3608 fs_visitor::allocate_registers()
3609 {
3610 bool allocated_without_spills;
3611
3612 static const enum instruction_scheduler_mode pre_modes[] = {
3613 SCHEDULE_PRE,
3614 SCHEDULE_PRE_NON_LIFO,
3615 SCHEDULE_PRE_LIFO,
3616 };
3617
3618 /* Try each scheduling heuristic to see if it can successfully register
3619 * allocate without spilling. They should be ordered by decreasing
3620 * performance but increasing likelihood of allocating.
3621 */
3622 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3623 schedule_instructions(pre_modes[i]);
3624
3625 if (0) {
3626 assign_regs_trivial();
3627 allocated_without_spills = true;
3628 } else {
3629 allocated_without_spills = assign_regs(false);
3630 }
3631 if (allocated_without_spills)
3632 break;
3633 }
3634
3635 if (!allocated_without_spills) {
3636 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3637 "Vertex" : "Fragment";
3638
3639 /* We assume that any spilling is worse than just dropping back to
3640 * SIMD8. There's probably actually some intermediate point where
3641 * SIMD16 with a couple of spills is still better.
3642 */
3643 if (dispatch_width == 16) {
3644 fail("Failure to register allocate. Reduce number of "
3645 "live scalar values to avoid this.");
3646 } else {
3647 perf_debug("%s shader triggered register spilling. "
3648 "Try reducing the number of live scalar values to "
3649 "improve performance.\n", stage_name);
3650 }
3651
3652 /* Since we're out of heuristics, just go spill registers until we
3653 * get an allocation.
3654 */
3655 while (!assign_regs(true)) {
3656 if (failed)
3657 break;
3658 }
3659 }
3660
3661 /* This must come after all optimization and register allocation, since
3662 * it inserts dead code that happens to have side effects, and it does
3663 * so based on the actual physical registers in use.
3664 */
3665 insert_gen4_send_dependency_workarounds();
3666
3667 if (failed)
3668 return;
3669
3670 if (!allocated_without_spills)
3671 schedule_instructions(SCHEDULE_POST);
3672
3673 if (last_scratch > 0)
3674 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3675 }
3676
3677 bool
3678 fs_visitor::run_vs()
3679 {
3680 assert(stage == MESA_SHADER_VERTEX);
3681
3682 assign_common_binding_table_offsets(0);
3683 setup_vs_payload();
3684
3685 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3686 emit_shader_time_begin();
3687
3688 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3689 base_ir = ir;
3690 this->result = reg_undef;
3691 ir->accept(this);
3692 }
3693 base_ir = NULL;
3694 if (failed)
3695 return false;
3696
3697 emit_urb_writes();
3698
3699 optimize();
3700
3701 assign_curb_setup();
3702 assign_vs_urb_setup();
3703
3704 allocate_registers();
3705
3706 return !failed;
3707 }
3708
3709 bool
3710 fs_visitor::run_fs()
3711 {
3712 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3713 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3714
3715 assert(stage == MESA_SHADER_FRAGMENT);
3716
3717 sanity_param_count = prog->Parameters->NumParameters;
3718
3719 assign_binding_table_offsets();
3720
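/* The thread payload layout differs between gen4/5 and gen6+, so set it up
 * per generation.
 */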
3721 if (brw->gen >= 6)
3722 setup_payload_gen6();
3723 else
3724 setup_payload_gen4();
3725
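/* The if (0) path emits a trivial debug shader. The SIMD16 replicated-data
 * path emits a specialized constant-color clear shader; everything else is
 * generated by visiting the IR below.
 */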
3726 if (0) {
3727 emit_dummy_fs();
3728 } else if (brw->use_rep_send && dispatch_width == 16) {
3729 emit_repclear_shader();
3730 } else {
3731 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3732 emit_shader_time_begin();
3733
3734 calculate_urb_setup();
3735 if (prog->InputsRead > 0) {
3736 if (brw->gen < 6)
3737 emit_interpolation_setup_gen4();
3738 else
3739 emit_interpolation_setup_gen6();
3740 }
3741
3742 /* We handle discards by keeping track of the still-live pixels in f0.1.
3743 * Initialize it with the dispatched pixels.
3744 */
3745 if (wm_prog_data->uses_kill) {
3746 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3747 discard_init->flag_subreg = 1;
3748 }
3749
3750 /* Generate FS IR for main(). (The visitor only descends into
3751 * functions called "main".)
3752 */
3753 if (shader) {
3754 if (getenv("INTEL_USE_NIR") != NULL && !brw->use_rep_send) {
3755 no16("Cannot do 16-wide in NIR yet");
3756 emit_nir_code();
3757 } else {
3758 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3759 base_ir = ir;
3760 this->result = reg_undef;
3761 ir->accept(this);
3762 }
3763 }
3764 } else {
3765 emit_fragment_program_code();
3766 }
3767 base_ir = NULL;
3768 if (failed)
3769 return false;
3770
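/* The HALT instructions emitted for discards get patched to jump to this
 * placeholder, just before the alpha test and framebuffer writes.
 */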
3771 emit(FS_OPCODE_PLACEHOLDER_HALT);
3772
3773 if (wm_key->alpha_test_func)
3774 emit_alpha_test();
3775
3776 emit_fb_writes();
3777
3778 optimize();
3779
3780 assign_curb_setup();
3781 assign_urb_setup();
3782
3783 allocate_registers();
3784
3785 if (failed)
3786 return false;
3787 }
3788
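/* Record how many register blocks this dispatch width ended up using. */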
3789 if (dispatch_width == 8)
3790 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3791 else
3792 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3793
3794 /* If any state parameters were appended, then ParameterValues could have
3795 * been realloced, in which case the driver uniform storage set up by
3796 * _mesa_associate_uniform_storage() would point to freed memory. Make
3797 * sure that didn't happen.
3798 */
3799 assert(sanity_param_count == prog->Parameters->NumParameters);
3800
3801 return !failed;
3802 }
3803
3804 const unsigned *
3805 brw_wm_fs_emit(struct brw_context *brw,
3806 void *mem_ctx,
3807 const struct brw_wm_prog_key *key,
3808 struct brw_wm_prog_data *prog_data,
3809 struct gl_fragment_program *fp,
3810 struct gl_shader_program *prog,
3811 unsigned *final_assembly_size)
3812 {
3813 bool start_busy = false;
3814 double start_time = 0;
3815
3816 if (unlikely(brw->perf_debug)) {
3817 start_busy = (brw->batch.last_bo &&
3818 drm_intel_bo_busy(brw->batch.last_bo));
3819 start_time = get_time();
3820 }
3821
3822 struct brw_shader *shader = NULL;
3823 if (prog)
3824 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3825
3826 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3827 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3828
3829 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3830 */
3831 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3832 if (!v.run_fs()) {
3833 if (prog) {
3834 prog->LinkStatus = false;
3835 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3836 }
3837
3838 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3839 v.fail_msg);
3840
3841 return NULL;
3842 }
3843
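/* On gen5+ also try a SIMD16 compile, unless INTEL_DEBUG=no16 disabled it
 * (the replicated-data clear path needs SIMD16 regardless).
 */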
3844 cfg_t *simd16_cfg = NULL;
3845 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3846 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3847 brw->use_rep_send)) {
3848 if (!v.simd16_unsupported) {
3849 /* Try a SIMD16 compile */
3850 v2.import_uniforms(&v);
3851 if (!v2.run_fs()) {
3852 perf_debug("SIMD16 shader failed to compile, falling back to "
3853 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3854 } else {
3855 simd16_cfg = v2.cfg;
3856 }
3857 } else {
3858 perf_debug("SIMD16 shader unsupported, falling back to "
3859 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3860 }
3861 }
3862
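/* Decide which programs to keep: SIMD8 can only be dropped (INTEL_DEBUG=no8
 * or brw->no_simd8) when a SIMD16 program was successfully compiled.
 */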
3863 cfg_t *simd8_cfg;
3864 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3865 if (no_simd8 && simd16_cfg) {
3866 simd8_cfg = NULL;
3867 prog_data->no_8 = true;
3868 } else {
3869 simd8_cfg = v.cfg;
3870 prog_data->no_8 = false;
3871 }
3872
3873 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
3874 &fp->Base, v.runtime_check_aads_emit, "FS");
3875
3876 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3877 char *name;
3878 if (prog)
3879 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
3880 prog->Label ? prog->Label : "unnamed",
3881 prog->Name);
3882 else
3883 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
3884
3885 g.enable_debug(name);
3886 }
3887
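/* Generate native code for whichever widths we kept. Both programs share
 * one assembly buffer; prog_offset_16 records where the SIMD16 code starts.
 */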
3888 if (simd8_cfg)
3889 g.generate_code(simd8_cfg, 8);
3890 if (simd16_cfg)
3891 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
3892
3893 if (unlikely(brw->perf_debug) && shader) {
3894 if (shader->compiled_once)
3895 brw_wm_debug_recompile(brw, prog, key);
3896 shader->compiled_once = true;
3897
3898 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3899 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3900 (get_time() - start_time) * 1000);
3901 }
3902 }
3903
3904 return g.get_assembly(final_assembly_size);
3905 }
3906
3907 extern "C" bool
3908 brw_fs_precompile(struct gl_context *ctx,
3909 struct gl_shader_program *shader_prog,
3910 struct gl_program *prog)
3911 {
3912 struct brw_context *brw = brw_context(ctx);
3913 struct brw_wm_prog_key key;
3914
3915 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
3916 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3917 bool program_uses_dfdy = fp->UsesDFdy;
3918
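/* The precompile builds a guess at the program key from the shader alone.
 * If the guess turns out to be wrong at draw time we simply recompile, and
 * brw_wm_debug_recompile() reports why when perf_debug is enabled.
 */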
3919 memset(&key, 0, sizeof(key));
3920
3921 if (brw->gen < 6) {
3922 if (fp->UsesKill)
3923 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3924
3925 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3926 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3927
3928 /* Just assume depth testing. */
3929 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3930 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3931 }
3932
3933 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3934 BRW_FS_VARYING_INPUT_MASK) > 16)
3935 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3936
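/* Hardware with shader channel select (Haswell and gen8+) can handle the
 * shadow-sampler swizzle itself, so only older parts need it baked into
 * the key.
 */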
3937 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
3938 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3939 for (unsigned i = 0; i < sampler_count; i++) {
3940 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
3941 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3942 key.tex.swizzles[i] =
3943 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3944 } else {
3945 /* Color sampler: assume no swizzling. */
3946 key.tex.swizzles[i] = SWIZZLE_XYZW;
3947 }
3948 }
3949
3950 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3951 key.drawable_height = ctx->DrawBuffer->Height;
3952 }
3953
3954 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3955 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3956 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3957
3958 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3959 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3960 key.nr_color_regions > 1;
3961 }
3962
3963 key.program_string_id = bfp->id;
3964
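/* Compile now, then restore the previously bound program state so the
 * precompile does not disturb it.
 */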
3965 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3966 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3967
3968 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
3969
3970 brw->wm.base.prog_offset = old_prog_offset;
3971 brw->wm.prog_data = old_prog_data;
3972
3973 return success;
3974 }