i965/skl: Use 1 register for uniform pull constant payload
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/hash_table.h"
43 #include "brw_context.h"
44 #include "brw_eu.h"
45 #include "brw_wm.h"
46 }
47 #include "brw_fs.h"
48 #include "brw_cfg.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53 #include "program/sampler.h"
54
55 void
56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
57 const fs_reg *src, unsigned sources)
58 {
59 memset(this, 0, sizeof(*this));
60
61 this->src = new fs_reg[MAX2(sources, 3)];
62 for (unsigned i = 0; i < sources; i++)
63 this->src[i] = src[i];
64
65 this->opcode = opcode;
66 this->dst = dst;
67 this->sources = sources;
68 this->exec_size = exec_size;
69
70 assert(dst.file != IMM && dst.file != UNIFORM);
71
72 /* If exec_size == 0, try to guess it from the registers. Since all
73 * manner of things may use hardware registers, we first try to guess
74 * based on GRF registers. If this fails, we will go ahead and take the
75 * width from the destination register.
76 */
77 if (this->exec_size == 0) {
78 if (dst.file == GRF) {
79 this->exec_size = dst.width;
80 } else {
81 for (unsigned i = 0; i < sources; ++i) {
82 if (src[i].file != GRF && src[i].file != ATTR)
83 continue;
84
85 if (this->exec_size <= 1)
86 this->exec_size = src[i].width;
87 assert(src[i].width == 1 || src[i].width == this->exec_size);
88 }
89 }
90
91 if (this->exec_size == 0 && dst.file != BAD_FILE)
92 this->exec_size = dst.width;
93 }
94 assert(this->exec_size != 0);
95
96 for (unsigned i = 0; i < sources; ++i) {
97 switch (this->src[i].file) {
98 case BAD_FILE:
99 this->src[i].effective_width = 8;
100 break;
101 case GRF:
102 case HW_REG:
103 case ATTR:
104 assert(this->src[i].width > 0);
105 if (this->src[i].width == 1) {
106 this->src[i].effective_width = this->exec_size;
107 } else {
108 this->src[i].effective_width = this->src[i].width;
109 }
110 break;
111 case IMM:
112 case UNIFORM:
113 this->src[i].effective_width = this->exec_size;
114 break;
115 default:
116 unreachable("Invalid source register file");
117 }
118 }
119 this->dst.effective_width = this->exec_size;
120
121 this->conditional_mod = BRW_CONDITIONAL_NONE;
122
123 /* This will be the case for almost all instructions. */
124 switch (dst.file) {
125 case GRF:
126 case HW_REG:
127 case MRF:
128 case ATTR:
129 this->regs_written =
130 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
131 break;
132 case BAD_FILE:
133 this->regs_written = 0;
134 break;
135 case IMM:
136 case UNIFORM:
137 unreachable("Invalid destination register file");
138 default:
139 unreachable("Invalid register file");
140 }
141
142 this->writes_accumulator = false;
143 }
144
145 fs_inst::fs_inst()
146 {
147 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
148 }
149
150 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
151 {
152 init(opcode, exec_size, reg_undef, NULL, 0);
153 }
154
155 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
156 {
157 init(opcode, 0, dst, NULL, 0);
158 }
159
160 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
161 const fs_reg &src0)
162 {
163 const fs_reg src[1] = { src0 };
164 init(opcode, exec_size, dst, src, 1);
165 }
166
167 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
168 {
169 const fs_reg src[1] = { src0 };
170 init(opcode, 0, dst, src, 1);
171 }
172
173 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
174 const fs_reg &src0, const fs_reg &src1)
175 {
176 const fs_reg src[2] = { src0, src1 };
177 init(opcode, exec_size, dst, src, 2);
178 }
179
180 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
181 const fs_reg &src1)
182 {
183 const fs_reg src[2] = { src0, src1 };
184 init(opcode, 0, dst, src, 2);
185 }
186
187 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
188 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
189 {
190 const fs_reg src[3] = { src0, src1, src2 };
191 init(opcode, exec_size, dst, src, 3);
192 }
193
194 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
195 const fs_reg &src1, const fs_reg &src2)
196 {
197 const fs_reg src[3] = { src0, src1, src2 };
198 init(opcode, 0, dst, src, 3);
199 }
200
201 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
202 const fs_reg src[], unsigned sources)
203 {
204 init(opcode, 0, dst, src, sources);
205 }
206
207 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
208 const fs_reg src[], unsigned sources)
209 {
210 init(opcode, exec_width, dst, src, sources);
211 }
212
213 fs_inst::fs_inst(const fs_inst &that)
214 {
215 memcpy(this, &that, sizeof(that));
216
217 this->src = new fs_reg[MAX2(that.sources, 3)];
218
219 for (unsigned i = 0; i < that.sources; i++)
220 this->src[i] = that.src[i];
221 }
222
223 fs_inst::~fs_inst()
224 {
225 delete[] this->src;
226 }
227
228 void
229 fs_inst::resize_sources(uint8_t num_sources)
230 {
231 if (this->sources != num_sources) {
232 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
233
234 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
235 src[i] = this->src[i];
236
237 delete[] this->src;
238 this->src = src;
239 this->sources = num_sources;
240 }
241 }
242
243 #define ALU1(op) \
244 fs_inst * \
245 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
246 { \
247 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
248 }
249
250 #define ALU2(op) \
251 fs_inst * \
252 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
253 const fs_reg &src1) \
254 { \
255 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
256 }
257
258 #define ALU2_ACC(op) \
259 fs_inst * \
260 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
261 const fs_reg &src1) \
262 { \
263 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
264 inst->writes_accumulator = true; \
265 return inst; \
266 }
267
268 #define ALU3(op) \
269 fs_inst * \
270 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
271 const fs_reg &src1, const fs_reg &src2) \
272 { \
273 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
274 }
275
276 ALU1(NOT)
277 ALU1(MOV)
278 ALU1(FRC)
279 ALU1(RNDD)
280 ALU1(RNDE)
281 ALU1(RNDZ)
282 ALU2(ADD)
283 ALU2(MUL)
284 ALU2_ACC(MACH)
285 ALU2(AND)
286 ALU2(OR)
287 ALU2(XOR)
288 ALU2(SHL)
289 ALU2(SHR)
290 ALU2(ASR)
291 ALU3(LRP)
292 ALU1(BFREV)
293 ALU3(BFE)
294 ALU2(BFI1)
295 ALU3(BFI2)
296 ALU1(FBH)
297 ALU1(FBL)
298 ALU1(CBIT)
299 ALU3(MAD)
300 ALU2_ACC(ADDC)
301 ALU2_ACC(SUBB)
302 ALU2(SEL)
303 ALU2(MAC)
304
305 /** Gen4 predicated IF. */
306 fs_inst *
307 fs_visitor::IF(enum brw_predicate predicate)
308 {
309 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
310 inst->predicate = predicate;
311 return inst;
312 }
313
314 /** Gen6 IF with embedded comparison. */
315 fs_inst *
316 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
317 enum brw_conditional_mod condition)
318 {
319 assert(brw->gen == 6);
320 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
321 reg_null_d, src0, src1);
322 inst->conditional_mod = condition;
323 return inst;
324 }
325
326 /**
327 * CMP: Sets the low bit of the destination channels with the result
328 * of the comparison, while the upper bits are undefined, and updates
329 * the flag register with the packed 16 bits of the result.
330 */
331 fs_inst *
332 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
333 enum brw_conditional_mod condition)
334 {
335 fs_inst *inst;
336
337 /* Take the instruction:
338 *
339 * CMP null<d> src0<f> src1<f>
340 *
341 * Original gen4 does type conversion to the destination type before
342 * comparison, producing garbage results for floating point comparisons.
343 *
344 * The destination type doesn't matter on newer generations, so we set the
345 * type to match src0 so we can compact the instruction.
346 */
347 dst.type = src0.type;
348 if (dst.file == HW_REG)
349 dst.fixed_hw_reg.type = dst.type;
350
351 resolve_ud_negate(&src0);
352 resolve_ud_negate(&src1);
353
354 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
355 inst->conditional_mod = condition;
356
357 return inst;
358 }
359
360 fs_inst *
361 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
362 {
363 uint8_t exec_size = dst.width;
364 for (int i = 0; i < sources; ++i) {
365 assert(src[i].width % dst.width == 0);
366 if (src[i].width > exec_size)
367 exec_size = src[i].width;
368 }
369
370 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
371 dst, src, sources);
372 inst->regs_written = 0;
373 for (int i = 0; i < sources; ++i) {
374 /* The LOAD_PAYLOAD instruction only really makes sense if we are
375 * dealing with whole registers. If this ever changes, we can deal
376 * with it later.
377 */
378 int size = inst->src[i].effective_width * type_sz(src[i].type);
379 assert(size % 32 == 0);
380 inst->regs_written += (size + 31) / 32;
381 }
382
383 return inst;
384 }
385
386 exec_list
387 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
388 const fs_reg &surf_index,
389 const fs_reg &varying_offset,
390 uint32_t const_offset)
391 {
392 exec_list instructions;
393 fs_inst *inst;
394
395 /* We have our constant surface use a pitch of 4 bytes, so our index can
396 * be any component of a vector, and then we load 4 contiguous
397 * components starting from that.
398 *
399 * We break down the const_offset to a portion added to the variable
400 * offset and a portion done using reg_offset, which means that if you
401 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
402 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
403 * CSE can later notice that those loads are all the same and eliminate
404 * the redundant ones.
405 */
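/* As a rough worked example (illustrative values only): with
 * const_offset = 7 and scale = 1, vec4_offset becomes varying_offset + 4,
 * the message loads the vec4 starting there, and the final MOV below
 * copies component (7 & 3) = 3 of the result into dst.
 */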
406 fs_reg vec4_offset = vgrf(glsl_type::int_type);
407 instructions.push_tail(ADD(vec4_offset,
408 varying_offset, fs_reg(const_offset & ~3)));
409
410 int scale = 1;
411 if (brw->gen == 4 && dst.width == 8) {
412 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
413 * u, v, r) as parameters, or we can just use the SIMD16 message
414 * consisting of (header, u). We choose the second, at the cost of a
415 * longer return length.
416 */
417 scale = 2;
418 }
419
420 enum opcode op;
421 if (brw->gen >= 7)
422 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
423 else
424 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
425
426 assert(dst.width % 8 == 0);
427 int regs_written = 4 * (dst.width / 8) * scale;
428 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
429 dst.type, dst.width);
430 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
431 inst->regs_written = regs_written;
432 instructions.push_tail(inst);
433
434 if (brw->gen < 7) {
435 inst->base_mrf = 13;
436 inst->header_present = true;
437 if (brw->gen == 4)
438 inst->mlen = 3;
439 else
440 inst->mlen = 1 + dispatch_width / 8;
441 }
442
443 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
444 instructions.push_tail(MOV(dst, result));
445
446 return instructions;
447 }
448
449 /**
450 * A helper for MOV generation for fixing up broken hardware SEND dependency
451 * handling.
452 */
453 fs_inst *
454 fs_visitor::DEP_RESOLVE_MOV(int grf)
455 {
456 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
457
458 inst->ir = NULL;
459 inst->annotation = "send dependency resolve";
460
461 /* The caller always wants this uncompressed (SIMD8), to emit the minimal
462 * extra dependencies and to avoid having to deal with aligning its regs to 2.
463 */
464 inst->exec_size = 8;
465
466 return inst;
467 }
468
469 bool
470 fs_inst::equals(fs_inst *inst) const
471 {
472 return (opcode == inst->opcode &&
473 dst.equals(inst->dst) &&
474 src[0].equals(inst->src[0]) &&
475 src[1].equals(inst->src[1]) &&
476 src[2].equals(inst->src[2]) &&
477 saturate == inst->saturate &&
478 predicate == inst->predicate &&
479 conditional_mod == inst->conditional_mod &&
480 mlen == inst->mlen &&
481 base_mrf == inst->base_mrf &&
482 target == inst->target &&
483 eot == inst->eot &&
484 header_present == inst->header_present &&
485 shadow_compare == inst->shadow_compare &&
486 exec_size == inst->exec_size &&
487 offset == inst->offset);
488 }
489
490 bool
491 fs_inst::overwrites_reg(const fs_reg &reg) const
492 {
493 return (reg.file == dst.file &&
494 reg.reg == dst.reg &&
495 reg.reg_offset >= dst.reg_offset &&
496 reg.reg_offset < dst.reg_offset + regs_written);
497 }
498
499 bool
500 fs_inst::is_send_from_grf() const
501 {
502 switch (opcode) {
503 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
504 case SHADER_OPCODE_SHADER_TIME_ADD:
505 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
506 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
507 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
508 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
509 case SHADER_OPCODE_UNTYPED_ATOMIC:
510 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
511 case SHADER_OPCODE_URB_WRITE_SIMD8:
512 return true;
513 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
514 return src[1].file == GRF;
515 case FS_OPCODE_FB_WRITE:
516 return src[0].file == GRF;
517 default:
518 if (is_tex())
519 return src[0].file == GRF;
520
521 return false;
522 }
523 }
524
525 bool
526 fs_inst::can_do_source_mods(struct brw_context *brw)
527 {
528 if (brw->gen == 6 && is_math())
529 return false;
530
531 if (is_send_from_grf())
532 return false;
533
534 if (!backend_instruction::can_do_source_mods())
535 return false;
536
537 return true;
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Vector float immediate value constructor. */
585 fs_reg::fs_reg(uint8_t vf[4])
586 {
587 init();
588 this->file = IMM;
589 this->type = BRW_REGISTER_TYPE_VF;
590 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
591 }
592
593 /** Vector float immediate value constructor. */
594 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
595 {
596 init();
597 this->file = IMM;
598 this->type = BRW_REGISTER_TYPE_VF;
599 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
600 (vf1 << 8) |
601 (vf2 << 16) |
602 (vf3 << 24);
603 }
604
605 /** Fixed brw_reg. */
606 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
607 {
608 init();
609 this->file = HW_REG;
610 this->fixed_hw_reg = fixed_hw_reg;
611 this->type = fixed_hw_reg.type;
612 this->width = 1 << fixed_hw_reg.width;
613 }
614
615 bool
616 fs_reg::equals(const fs_reg &r) const
617 {
618 return (file == r.file &&
619 reg == r.reg &&
620 reg_offset == r.reg_offset &&
621 subreg_offset == r.subreg_offset &&
622 type == r.type &&
623 negate == r.negate &&
624 abs == r.abs &&
625 !reladdr && !r.reladdr &&
626 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
627 width == r.width &&
628 stride == r.stride);
629 }
630
631 fs_reg &
632 fs_reg::set_smear(unsigned subreg)
633 {
634 assert(file != HW_REG && file != IMM);
635 subreg_offset = subreg * type_sz(type);
636 stride = 0;
637 return *this;
638 }
639
640 bool
641 fs_reg::is_contiguous() const
642 {
643 return stride == 1;
644 }
645
646 int
647 fs_visitor::type_size(const struct glsl_type *type)
648 {
649 unsigned int size, i;
650
651 switch (type->base_type) {
652 case GLSL_TYPE_UINT:
653 case GLSL_TYPE_INT:
654 case GLSL_TYPE_FLOAT:
655 case GLSL_TYPE_BOOL:
656 return type->components();
657 case GLSL_TYPE_ARRAY:
658 return type_size(type->fields.array) * type->length;
659 case GLSL_TYPE_STRUCT:
660 size = 0;
661 for (i = 0; i < type->length; i++) {
662 size += type_size(type->fields.structure[i].type);
663 }
664 return size;
665 case GLSL_TYPE_SAMPLER:
666 /* Samplers take up no register space, since they're baked in at
667 * link time.
668 */
669 return 0;
670 case GLSL_TYPE_ATOMIC_UINT:
671 return 0;
672 case GLSL_TYPE_IMAGE:
673 case GLSL_TYPE_VOID:
674 case GLSL_TYPE_ERROR:
675 case GLSL_TYPE_INTERFACE:
676 case GLSL_TYPE_DOUBLE:
677 unreachable("not reached");
678 }
679
680 return 0;
681 }
682
683 fs_reg
684 fs_visitor::get_timestamp()
685 {
686 assert(brw->gen >= 7);
687
688 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
689 BRW_ARF_TIMESTAMP,
690 0),
691 BRW_REGISTER_TYPE_UD));
692
693 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
694
695 fs_inst *mov = emit(MOV(dst, ts));
696 /* We want to read the 3 fields we care about even if it's not enabled in
697 * the dispatch.
698 */
699 mov->force_writemask_all = true;
700
701 /* The caller wants the low 32 bits of the timestamp. Since it's running
702 * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
703 * which is plenty of time for our purposes. It is identical across the
704 * EUs, but since it's tracking GPU core speed it will increment at a
705 * varying rate as render P-states change.
706 *
707 * The caller could also check if render P-states have changed (or anything
708 * else that might disrupt timing) by setting smear to 2 and checking if
709 * that field is != 0.
710 */
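/* A back-of-the-envelope check of the rollover estimate above, assuming a
 * ~1.2GHz clock: 2^32 cycles / 1.2e9 cycles per second is roughly 3.6 s.
 */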
711 dst.set_smear(0);
712
713 return dst;
714 }
715
716 void
717 fs_visitor::emit_shader_time_begin()
718 {
719 current_annotation = "shader time start";
720 shader_start_time = get_timestamp();
721 }
722
723 void
724 fs_visitor::emit_shader_time_end()
725 {
726 current_annotation = "shader time end";
727
728 enum shader_time_shader_type type, written_type, reset_type;
729 switch (stage) {
730 case MESA_SHADER_VERTEX:
731 type = ST_VS;
732 written_type = ST_VS_WRITTEN;
733 reset_type = ST_VS_RESET;
734 break;
735 case MESA_SHADER_GEOMETRY:
736 type = ST_GS;
737 written_type = ST_GS_WRITTEN;
738 reset_type = ST_GS_RESET;
739 break;
740 case MESA_SHADER_FRAGMENT:
741 if (dispatch_width == 8) {
742 type = ST_FS8;
743 written_type = ST_FS8_WRITTEN;
744 reset_type = ST_FS8_RESET;
745 } else {
746 assert(dispatch_width == 16);
747 type = ST_FS16;
748 written_type = ST_FS16_WRITTEN;
749 reset_type = ST_FS16_RESET;
750 }
751 break;
752 default:
753 unreachable("fs_visitor::emit_shader_time_end missing code");
754 }
755
756 fs_reg shader_end_time = get_timestamp();
757
758 /* Check that there weren't any timestamp reset events (assuming these
759 * were the only two timestamp reads that happened).
760 */
761 fs_reg reset = shader_end_time;
762 reset.set_smear(2);
763 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
764 test->conditional_mod = BRW_CONDITIONAL_Z;
765 emit(IF(BRW_PREDICATE_NORMAL));
766
767 fs_reg start = shader_start_time;
768 start.negate = true;
769 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
770 emit(ADD(diff, start, shader_end_time));
771
772 /* If there were no instructions between the two timestamp gets, the diff
773 * is 2 cycles. Remove that overhead, so I can forget about that when
774 * trying to determine the time taken for single instructions.
775 */
776 emit(ADD(diff, diff, fs_reg(-2u)));
777
778 emit_shader_time_write(type, diff);
779 emit_shader_time_write(written_type, fs_reg(1u));
780 emit(BRW_OPCODE_ELSE);
781 emit_shader_time_write(reset_type, fs_reg(1u));
782 emit(BRW_OPCODE_ENDIF);
783 }
784
785 void
786 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
787 fs_reg value)
788 {
789 int shader_time_index =
790 brw_get_shader_time_index(brw, shader_prog, prog, type);
791 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
792
793 fs_reg payload;
794 if (dispatch_width == 8)
795 payload = vgrf(glsl_type::uvec2_type);
796 else
797 payload = vgrf(glsl_type::uint_type);
798
799 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
800 fs_reg(), payload, offset, value));
801 }
802
803 void
804 fs_visitor::vfail(const char *format, va_list va)
805 {
806 char *msg;
807
808 if (failed)
809 return;
810
811 failed = true;
812
813 msg = ralloc_vasprintf(mem_ctx, format, va);
814 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
815
816 this->fail_msg = msg;
817
818 if (debug_enabled) {
819 fprintf(stderr, "%s", msg);
820 }
821 }
822
823 void
824 fs_visitor::fail(const char *format, ...)
825 {
826 va_list va;
827
828 va_start(va, format);
829 vfail(format, va);
830 va_end(va);
831 }
832
833 /**
834 * Mark this program as impossible to compile in SIMD16 mode.
835 *
836 * During the SIMD8 compile (which happens first), we can detect and flag
837 * things that are unsupported in SIMD16 mode, so the compiler can skip
838 * the SIMD16 compile altogether.
839 *
840 * During a SIMD16 compile (if one happens anyway), this just calls fail().
841 */
842 void
843 fs_visitor::no16(const char *format, ...)
844 {
845 va_list va;
846
847 va_start(va, format);
848
849 if (dispatch_width == 16) {
850 vfail(format, va);
851 } else {
852 simd16_unsupported = true;
853
854 if (brw->perf_debug) {
855 if (no16_msg)
856 ralloc_vasprintf_append(&no16_msg, format, va);
857 else
858 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
859 }
860 }
861
862 va_end(va);
863 }
864
865 fs_inst *
866 fs_visitor::emit(enum opcode opcode)
867 {
868 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
869 }
870
871 fs_inst *
872 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
873 {
874 return emit(new(mem_ctx) fs_inst(opcode, dst));
875 }
876
877 fs_inst *
878 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
879 {
880 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
881 }
882
883 fs_inst *
884 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
885 const fs_reg &src1)
886 {
887 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
888 }
889
890 fs_inst *
891 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
892 const fs_reg &src1, const fs_reg &src2)
893 {
894 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
895 }
896
897 fs_inst *
898 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
899 fs_reg src[], int sources)
900 {
901 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
902 }
903
904 /**
905 * Returns true if the instruction has a flag that means it won't
906 * update an entire destination register.
907 *
908 * For example, dead code elimination and live variable analysis want to know
909 * when a write to a variable screens off any preceding values that were in
910 * it.
911 */
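/* For example (illustration only): a predicated MOV that is not a SEL, an
 * 8-wide write to a 16-bit (W-typed) destination (8 * 2 = 16 bytes < 32),
 * or a destination whose stride is not 1 all count as partial writes.
 */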
912 bool
913 fs_inst::is_partial_write() const
914 {
915 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
916 (this->dst.width * type_sz(this->dst.type)) < 32 ||
917 !this->dst.is_contiguous());
918 }
919
920 int
921 fs_inst::regs_read(int arg) const
922 {
923 if (is_tex() && arg == 0 && src[0].file == GRF) {
924 return mlen;
925 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
926 return mlen;
927 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
928 return mlen;
929 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
930 return mlen;
931 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
932 return mlen;
933 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
934 return mlen;
935 }
936
937 switch (src[arg].file) {
938 case BAD_FILE:
939 case UNIFORM:
940 case IMM:
941 return 1;
942 case GRF:
943 case HW_REG:
944 if (src[arg].stride == 0) {
945 return 1;
946 } else {
947 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
948 return (size + 31) / 32;
949 }
950 case MRF:
951 unreachable("MRF registers are not allowed as sources");
952 default:
953 unreachable("Invalid register file");
954 }
955 }
956
957 bool
958 fs_inst::reads_flag() const
959 {
960 return predicate;
961 }
962
963 bool
964 fs_inst::writes_flag() const
965 {
966 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
967 opcode != BRW_OPCODE_IF &&
968 opcode != BRW_OPCODE_WHILE)) ||
969 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
970 }
971
972 /**
973 * Returns how many MRFs an FS opcode will write over.
974 *
975 * Note that this is not the 0 or 1 implied writes in an actual gen
976 * instruction -- the FS opcodes often generate MOVs in addition.
977 */
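/* For example, a SIMD16 POW implies 2 * 16 / 8 = 4 MRF writes below, while
 * a SIMD8 RCP implies just one.
 */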
978 int
979 fs_visitor::implied_mrf_writes(fs_inst *inst)
980 {
981 if (inst->mlen == 0)
982 return 0;
983
984 if (inst->base_mrf == -1)
985 return 0;
986
987 switch (inst->opcode) {
988 case SHADER_OPCODE_RCP:
989 case SHADER_OPCODE_RSQ:
990 case SHADER_OPCODE_SQRT:
991 case SHADER_OPCODE_EXP2:
992 case SHADER_OPCODE_LOG2:
993 case SHADER_OPCODE_SIN:
994 case SHADER_OPCODE_COS:
995 return 1 * dispatch_width / 8;
996 case SHADER_OPCODE_POW:
997 case SHADER_OPCODE_INT_QUOTIENT:
998 case SHADER_OPCODE_INT_REMAINDER:
999 return 2 * dispatch_width / 8;
1000 case SHADER_OPCODE_TEX:
1001 case FS_OPCODE_TXB:
1002 case SHADER_OPCODE_TXD:
1003 case SHADER_OPCODE_TXF:
1004 case SHADER_OPCODE_TXF_CMS:
1005 case SHADER_OPCODE_TXF_MCS:
1006 case SHADER_OPCODE_TG4:
1007 case SHADER_OPCODE_TG4_OFFSET:
1008 case SHADER_OPCODE_TXL:
1009 case SHADER_OPCODE_TXS:
1010 case SHADER_OPCODE_LOD:
1011 return 1;
1012 case FS_OPCODE_FB_WRITE:
1013 return 2;
1014 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1015 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1016 return 1;
1017 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1018 return inst->mlen;
1019 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1020 return 2;
1021 case SHADER_OPCODE_UNTYPED_ATOMIC:
1022 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1023 case SHADER_OPCODE_URB_WRITE_SIMD8:
1024 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1025 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1026 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1027 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1028 return 0;
1029 default:
1030 unreachable("not reached");
1031 }
1032 }
1033
1034 fs_reg
1035 fs_visitor::vgrf(const glsl_type *const type)
1036 {
1037 int reg_width = dispatch_width / 8;
1038 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1039 brw_type_for_base_type(type), dispatch_width);
1040 }
1041
1042 fs_reg
1043 fs_visitor::vgrf(int num_components)
1044 {
1045 int reg_width = dispatch_width / 8;
1046 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1047 BRW_REGISTER_TYPE_F, dispatch_width);
1048 }
1049
1050 /** Fixed HW reg constructor. */
1051 fs_reg::fs_reg(enum register_file file, int reg)
1052 {
1053 init();
1054 this->file = file;
1055 this->reg = reg;
1056 this->type = BRW_REGISTER_TYPE_F;
1057
1058 switch (file) {
1059 case UNIFORM:
1060 this->width = 1;
1061 break;
1062 default:
1063 this->width = 8;
1064 }
1065 }
1066
1067 /** Fixed HW reg constructor. */
1068 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1069 {
1070 init();
1071 this->file = file;
1072 this->reg = reg;
1073 this->type = type;
1074
1075 switch (file) {
1076 case UNIFORM:
1077 this->width = 1;
1078 break;
1079 default:
1080 this->width = 8;
1081 }
1082 }
1083
1084 /** Fixed HW reg constructor. */
1085 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1086 uint8_t width)
1087 {
1088 init();
1089 this->file = file;
1090 this->reg = reg;
1091 this->type = type;
1092 this->width = width;
1093 }
1094
1095 fs_reg *
1096 fs_visitor::variable_storage(ir_variable *var)
1097 {
1098 return (fs_reg *)hash_table_find(this->variable_ht, var);
1099 }
1100
1101 void
1102 import_uniforms_callback(const void *key,
1103 void *data,
1104 void *closure)
1105 {
1106 struct hash_table *dst_ht = (struct hash_table *)closure;
1107 const fs_reg *reg = (const fs_reg *)data;
1108
1109 if (reg->file != UNIFORM)
1110 return;
1111
1112 hash_table_insert(dst_ht, data, key);
1113 }
1114
1115 /* For SIMD16, we need to reuse the uniform setup from the SIMD8 dispatch.
1116 * This brings in those uniform definitions.
1117 */
1118 void
1119 fs_visitor::import_uniforms(fs_visitor *v)
1120 {
1121 hash_table_call_foreach(v->variable_ht,
1122 import_uniforms_callback,
1123 variable_ht);
1124 this->push_constant_loc = v->push_constant_loc;
1125 this->pull_constant_loc = v->pull_constant_loc;
1126 this->uniforms = v->uniforms;
1127 this->param_size = v->param_size;
1128 }
1129
1130 /* Our support for uniforms is piggy-backed on the struct
1131 * gl_fragment_program, because that's where the values actually
1132 * get stored, rather than in some global gl_shader_program uniform
1133 * store.
1134 */
1135 void
1136 fs_visitor::setup_uniform_values(ir_variable *ir)
1137 {
1138 int namelen = strlen(ir->name);
1139
1140 /* The data for our (non-builtin) uniforms is stored in a series of
1141 * gl_uniform_driver_storage structs for each subcomponent that
1142 * glGetUniformLocation() could name. We know it's been set up in the same
1143 * order we'd walk the type, so walk the list of storage and find anything
1144 * with our name, or any component whose name starts with our name.
1145 */
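/* For example (hypothetical names): a uniform "s" matches storage entries
 * "s", "s.pos" and "s[2]", but not "speed", since the character right
 * after the prefix must be '\0', '.' or '['.
 */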
1146 unsigned params_before = uniforms;
1147 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1148 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1149
1150 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1151 (storage->name[namelen] != 0 &&
1152 storage->name[namelen] != '.' &&
1153 storage->name[namelen] != '[')) {
1154 continue;
1155 }
1156
1157 unsigned slots = storage->type->component_slots();
1158 if (storage->array_elements)
1159 slots *= storage->array_elements;
1160
1161 for (unsigned i = 0; i < slots; i++) {
1162 stage_prog_data->param[uniforms++] = &storage->storage[i];
1163 }
1164 }
1165
1166 /* Make sure we actually initialized the right amount of stuff here. */
1167 assert(params_before + ir->type->component_slots() == uniforms);
1168 (void)params_before;
1169 }
1170
1171
1172 /* Our support for builtin uniforms is even scarier than non-builtin.
1173 * It sits on top of the PROG_STATE_VAR parameters that are
1174 * automatically updated from GL context state.
1175 */
1176 void
1177 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1178 {
1179 const ir_state_slot *const slots = ir->get_state_slots();
1180 assert(slots != NULL);
1181
1182 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1183 /* This state reference has already been setup by ir_to_mesa, but we'll
1184 * get the same index back here.
1185 */
1186 int index = _mesa_add_state_reference(this->prog->Parameters,
1187 (gl_state_index *)slots[i].tokens);
1188
1189 /* Add each of the unique swizzles of the element as a parameter.
1190 * This'll end up matching the expected layout of the
1191 * array/matrix/structure we're trying to fill in.
1192 */
1193 int last_swiz = -1;
1194 for (unsigned int j = 0; j < 4; j++) {
1195 int swiz = GET_SWZ(slots[i].swizzle, j);
1196 if (swiz == last_swiz)
1197 break;
1198 last_swiz = swiz;
1199
1200 stage_prog_data->param[uniforms++] =
1201 &prog->Parameters->ParameterValues[index][swiz];
1202 }
1203 }
1204 }
1205
1206 fs_reg *
1207 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1208 bool origin_upper_left)
1209 {
1210 assert(stage == MESA_SHADER_FRAGMENT);
1211 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1212 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1213 fs_reg wpos = *reg;
1214 bool flip = !origin_upper_left ^ key->render_to_fbo;
1215
1216 /* gl_FragCoord.x */
1217 if (pixel_center_integer) {
1218 emit(MOV(wpos, this->pixel_x));
1219 } else {
1220 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1221 }
1222 wpos = offset(wpos, 1);
1223
1224 /* gl_FragCoord.y */
1225 if (!flip && pixel_center_integer) {
1226 emit(MOV(wpos, this->pixel_y));
1227 } else {
1228 fs_reg pixel_y = this->pixel_y;
1229 float offset = (pixel_center_integer ? 0.0 : 0.5);
1230
1231 if (flip) {
1232 pixel_y.negate = true;
1233 offset += key->drawable_height - 1.0;
1234 }
1235
1236 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1237 }
1238 wpos = offset(wpos, 1);
1239
1240 /* gl_FragCoord.z */
1241 if (brw->gen >= 6) {
1242 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1243 } else {
1244 emit(FS_OPCODE_LINTERP, wpos,
1245 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1246 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1247 interp_reg(VARYING_SLOT_POS, 2));
1248 }
1249 wpos = offset(wpos, 1);
1250
1251 /* gl_FragCoord.w: Already set up in emit_interpolation */
1252 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1253
1254 return reg;
1255 }
1256
1257 fs_inst *
1258 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1259 glsl_interp_qualifier interpolation_mode,
1260 bool is_centroid, bool is_sample)
1261 {
1262 brw_wm_barycentric_interp_mode barycoord_mode;
1263 if (brw->gen >= 6) {
1264 if (is_centroid) {
1265 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1266 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1267 else
1268 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1269 } else if (is_sample) {
1270 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1271 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1272 else
1273 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1274 } else {
1275 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1276 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1277 else
1278 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1279 }
1280 } else {
1281 /* On Ironlake and below, there is only one interpolation mode.
1282 * Centroid interpolation doesn't mean anything on this hardware --
1283 * there is no multisampling.
1284 */
1285 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1286 }
1287 return emit(FS_OPCODE_LINTERP, attr,
1288 this->delta_x[barycoord_mode],
1289 this->delta_y[barycoord_mode], interp);
1290 }
1291
1292 void
1293 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1294 const glsl_type *type,
1295 glsl_interp_qualifier interpolation_mode,
1296 int location, bool mod_centroid,
1297 bool mod_sample)
1298 {
1299 attr.type = brw_type_for_base_type(type->get_scalar_type());
1300
1301 assert(stage == MESA_SHADER_FRAGMENT);
1302 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1303 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1304
1305 unsigned int array_elements;
1306
1307 if (type->is_array()) {
1308 array_elements = type->length;
1309 if (array_elements == 0) {
1310 fail("dereferenced array '%s' has length 0\n", name);
1311 }
1312 type = type->fields.array;
1313 } else {
1314 array_elements = 1;
1315 }
1316
1317 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1318 bool is_gl_Color =
1319 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1320 if (key->flat_shade && is_gl_Color) {
1321 interpolation_mode = INTERP_QUALIFIER_FLAT;
1322 } else {
1323 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1324 }
1325 }
1326
1327 for (unsigned int i = 0; i < array_elements; i++) {
1328 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1329 if (prog_data->urb_setup[location] == -1) {
1330 /* If there's no incoming setup data for this slot, don't
1331 * emit interpolation for it.
1332 */
1333 attr = offset(attr, type->vector_elements);
1334 location++;
1335 continue;
1336 }
1337
1338 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1339 /* Constant interpolation (flat shading) case. The SF has
1340 * handed us defined values in only the constant offset
1341 * field of the setup reg.
1342 */
1343 for (unsigned int k = 0; k < type->vector_elements; k++) {
1344 struct brw_reg interp = interp_reg(location, k);
1345 interp = suboffset(interp, 3);
1346 interp.type = attr.type;
1347 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1348 attr = offset(attr, 1);
1349 }
1350 } else {
1351 /* Smooth/noperspective interpolation case. */
1352 for (unsigned int k = 0; k < type->vector_elements; k++) {
1353 struct brw_reg interp = interp_reg(location, k);
1354 if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1355 /* Get the pixel/sample mask into f0 so that we know
1356 * which pixels are lit. Then, for each channel that is
1357 * unlit, replace the centroid data with non-centroid
1358 * data.
1359 */
1360 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1361
1362 fs_inst *inst;
1363 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1364 false, false);
1365 inst->predicate = BRW_PREDICATE_NORMAL;
1366 inst->predicate_inverse = true;
1367 if (brw->has_pln)
1368 inst->no_dd_clear = true;
1369
1370 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1371 mod_centroid && !key->persample_shading,
1372 mod_sample || key->persample_shading);
1373 inst->predicate = BRW_PREDICATE_NORMAL;
1374 inst->predicate_inverse = false;
1375 if (brw->has_pln)
1376 inst->no_dd_check = true;
1377
1378 } else {
1379 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1380 mod_centroid && !key->persample_shading,
1381 mod_sample || key->persample_shading);
1382 }
1383 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1384 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1385 }
1386 attr = offset(attr, 1);
1387 }
1388
1389 }
1390 location++;
1391 }
1392 }
1393 }
1394
1395 fs_reg *
1396 fs_visitor::emit_frontfacing_interpolation()
1397 {
1398 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1399
1400 if (brw->gen >= 6) {
1401 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1402 * a boolean result from this (~0/true or 0/false).
1403 *
1404 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1405 * this task in only one instruction:
1406 * - a negation source modifier will flip the bit; and
1407 * - a W -> D type conversion will sign extend the bit into the high
1408 * word of the destination.
1409 *
1410 * An ASR 15 fills the low word of the destination.
1411 */
1412 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1413 g0.negate = true;
1414
1415 emit(ASR(*reg, g0, fs_reg(15)));
1416 } else {
1417 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1418 * a boolean result from this (1/true or 0/false).
1419 *
1420 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1421 * the negation source modifier to flip it. Unfortunately the SHR
1422 * instruction only operates on UD (or D with an abs source modifier)
1423 * sources without negation.
1424 *
1425 * Instead, use ASR (which will give ~0/true or 0/false).
1426 */
1427 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1428 g1_6.negate = true;
1429
1430 emit(ASR(*reg, g1_6, fs_reg(31)));
1431 }
1432
1433 return reg;
1434 }
1435
1436 void
1437 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1438 {
1439 assert(stage == MESA_SHADER_FRAGMENT);
1440 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1441 assert(dst.type == BRW_REGISTER_TYPE_F);
1442
1443 if (key->compute_pos_offset) {
1444 /* Convert int_sample_pos to floating point */
1445 emit(MOV(dst, int_sample_pos));
1446 /* Scale to the range [0, 1] */
1447 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1448 }
1449 else {
1450 /* From ARB_sample_shading specification:
1451 * "When rendering to a non-multisample buffer, or if multisample
1452 * rasterization is disabled, gl_SamplePosition will always be
1453 * (0.5, 0.5)."
1454 */
1455 emit(MOV(dst, fs_reg(0.5f)));
1456 }
1457 }
1458
1459 fs_reg *
1460 fs_visitor::emit_samplepos_setup()
1461 {
1462 assert(brw->gen >= 6);
1463
1464 this->current_annotation = "compute sample position";
1465 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1466 fs_reg pos = *reg;
1467 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1468 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1469
1470 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1471 * mode will be enabled.
1472 *
1473 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1474 * R31.1:0 Position Offset X/Y for Slot[3:0]
1475 * R31.3:2 Position Offset X/Y for Slot[7:4]
1476 * .....
1477 *
1478 * The X, Y sample positions come in as bytes in the thread payload. So, read
1479 * the positions using vstride=16, width=8, hstride=2.
1480 */
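/* In other words, assuming the packed X0 Y0 X1 Y1 ... byte layout described
 * above, that region starting at byte 0 picks the eight X bytes and the
 * same region starting at byte 1 picks the eight Y bytes for eight slots.
 */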
1481 struct brw_reg sample_pos_reg =
1482 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1483 BRW_REGISTER_TYPE_B), 16, 8, 2);
1484
1485 if (dispatch_width == 8) {
1486 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1487 } else {
1488 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1489 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1490 ->force_sechalf = true;
1491 }
1492 /* Compute gl_SamplePosition.x */
1493 compute_sample_position(pos, int_sample_x);
1494 pos = offset(pos, 1);
1495 if (dispatch_width == 8) {
1496 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1497 } else {
1498 emit(MOV(half(int_sample_y, 0),
1499 fs_reg(suboffset(sample_pos_reg, 1))));
1500 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1501 ->force_sechalf = true;
1502 }
1503 /* Compute gl_SamplePosition.y */
1504 compute_sample_position(pos, int_sample_y);
1505 return reg;
1506 }
1507
1508 fs_reg *
1509 fs_visitor::emit_sampleid_setup()
1510 {
1511 assert(stage == MESA_SHADER_FRAGMENT);
1512 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1513 assert(brw->gen >= 6);
1514
1515 this->current_annotation = "compute sample id";
1516 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1517
1518 if (key->compute_sample_id) {
1519 fs_reg t1 = vgrf(glsl_type::int_type);
1520 fs_reg t2 = vgrf(glsl_type::int_type);
1521 t2.type = BRW_REGISTER_TYPE_UW;
1522
1523 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1524 * 8x multisampling, subspan 0 will represent sample N (where N
1525 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1526 * 7. We can find the value of N by looking at R0.0 bits 7:6
1527 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1528 * (since samples are always delivered in pairs). That is, we
1529 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1530 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1531 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1532 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1533 * populating a temporary variable with the sequence (0, 1, 2, 3),
1534 * and then reading from it using vstride=1, width=4, hstride=0.
1535 * These computations also hold for 4x multisampling.
1536 *
1537 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1538 * the first four slots are sample 0 of subspan 0; the next four
1539 * are sample 1 of subspan 0; the third group is sample 0 of
1540 * subspan 1, and finally sample 1 of subspan 1.
1541 */
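/* A quick illustrative check of the math above: if R0.0 bits 7:6 are 0b10,
 * then (R0.0 & 0xc0) >> 5 = 0x80 >> 5 = 4, so this subspan pair starts at
 * sample 4, and adding (0, 0, 0, 0, 1, 1, 1, 1) yields sample IDs 4 and 5
 * across the SIMD8 channels.
 */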
1542 fs_inst *inst;
1543 inst = emit(BRW_OPCODE_AND, t1,
1544 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1545 fs_reg(0xc0));
1546 inst->force_writemask_all = true;
1547 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1548 inst->force_writemask_all = true;
1549 /* This works for both SIMD8 and SIMD16 */
1550 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1551 inst->force_writemask_all = true;
1552 /* This special instruction takes care of setting vstride=1,
1553 * width=4, hstride=0 of t2 during an ADD instruction.
1554 */
1555 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1556 } else {
1557 /* As per GL_ARB_sample_shading specification:
1558 * "When rendering to a non-multisample buffer, or if multisample
1559 * rasterization is disabled, gl_SampleID will always be zero."
1560 */
1561 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1562 }
1563
1564 return reg;
1565 }
1566
1567 fs_reg
1568 fs_visitor::fix_math_operand(fs_reg src)
1569 {
1570 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1571 * might be able to do better by doing execsize = 1 math and then
1572 * expanding that result out, but we would need to be careful with
1573 * masking.
1574 *
1575 * The hardware ignores source modifiers (negate and abs) on math
1576 * instructions, so we also move to a temp to set those up.
1577 */
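/* On gen6, for instance, a UNIFORM source or one with a negate modifier
 * falls through to the copy below; on gen7+ only immediates do.
 */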
1578 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1579 !src.abs && !src.negate)
1580 return src;
1581
1582 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1583 * operands to math
1584 */
1585 if (brw->gen >= 7 && src.file != IMM)
1586 return src;
1587
1588 fs_reg expanded = vgrf(glsl_type::float_type);
1589 expanded.type = src.type;
1590 emit(BRW_OPCODE_MOV, expanded, src);
1591 return expanded;
1592 }
1593
1594 fs_inst *
1595 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1596 {
1597 switch (opcode) {
1598 case SHADER_OPCODE_RCP:
1599 case SHADER_OPCODE_RSQ:
1600 case SHADER_OPCODE_SQRT:
1601 case SHADER_OPCODE_EXP2:
1602 case SHADER_OPCODE_LOG2:
1603 case SHADER_OPCODE_SIN:
1604 case SHADER_OPCODE_COS:
1605 break;
1606 default:
1607 unreachable("not reached: bad math opcode");
1608 }
1609
1610 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1611 * might be able to do better by doing execsize = 1 math and then
1612 * expanding that result out, but we would need to be careful with
1613 * masking.
1614 *
1615 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1616 * instructions, so we also move to a temp to set those up.
1617 */
1618 if (brw->gen == 6 || brw->gen == 7)
1619 src = fix_math_operand(src);
1620
1621 fs_inst *inst = emit(opcode, dst, src);
1622
1623 if (brw->gen < 6) {
1624 inst->base_mrf = 2;
1625 inst->mlen = dispatch_width / 8;
1626 }
1627
1628 return inst;
1629 }
1630
1631 fs_inst *
1632 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1633 {
1634 int base_mrf = 2;
1635 fs_inst *inst;
1636
1637 if (brw->gen >= 8) {
1638 inst = emit(opcode, dst, src0, src1);
1639 } else if (brw->gen >= 6) {
1640 src0 = fix_math_operand(src0);
1641 src1 = fix_math_operand(src1);
1642
1643 inst = emit(opcode, dst, src0, src1);
1644 } else {
1645 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1646 * "Message Payload":
1647 *
1648 * "Operand0[7]. For the INT DIV functions, this operand is the
1649 * denominator."
1650 * ...
1651 * "Operand1[7]. For the INT DIV functions, this operand is the
1652 * numerator."
1653 */
1654 bool is_int_div = opcode != SHADER_OPCODE_POW;
1655 fs_reg &op0 = is_int_div ? src1 : src0;
1656 fs_reg &op1 = is_int_div ? src0 : src1;
1657
1658 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1659 inst = emit(opcode, dst, op0, reg_null_f);
1660
1661 inst->base_mrf = base_mrf;
1662 inst->mlen = 2 * dispatch_width / 8;
1663 }
1664 return inst;
1665 }
1666
1667 void
1668 fs_visitor::assign_curb_setup()
1669 {
1670 if (dispatch_width == 8) {
1671 prog_data->dispatch_grf_start_reg = payload.num_regs;
1672 } else {
1673 assert(stage == MESA_SHADER_FRAGMENT);
1674 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1675 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1676 }
1677
1678 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1679
1680 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1681 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1682 for (unsigned int i = 0; i < inst->sources; i++) {
1683 if (inst->src[i].file == UNIFORM) {
1684 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1685 int constant_nr;
1686 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1687 constant_nr = push_constant_loc[uniform_nr];
1688 } else {
1689 /* Section 5.11 of the OpenGL 4.1 spec says:
1690 * "Out-of-bounds reads return undefined values, which include
1691 * values from other variables of the active program or zero."
1692 * Just return the first push constant.
1693 */
1694 constant_nr = 0;
1695 }
1696
1697 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1698 constant_nr / 8,
1699 constant_nr % 8);
1700
1701 inst->src[i].file = HW_REG;
1702 inst->src[i].fixed_hw_reg = byte_offset(
1703 retype(brw_reg, inst->src[i].type),
1704 inst->src[i].subreg_offset);
1705 }
1706 }
1707 }
1708 }
1709
1710 void
1711 fs_visitor::calculate_urb_setup()
1712 {
1713 assert(stage == MESA_SHADER_FRAGMENT);
1714 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1715 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1716
1717 memset(prog_data->urb_setup, -1,
1718 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1719
1720 int urb_next = 0;
1721 /* Figure out where each of the incoming setup attributes lands. */
1722 if (brw->gen >= 6) {
1723 if (_mesa_bitcount_64(prog->InputsRead &
1724 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1725 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1726 * first 16 varying inputs, so we can put them wherever we want.
1727 * Just put them in order.
1728 *
1729 * This is useful because it means that (a) inputs not used by the
1730 * fragment shader won't take up valuable register space, and (b) we
1731 * won't have to recompile the fragment shader if it gets paired with
1732 * a different vertex (or geometry) shader.
1733 */
1734 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1735 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1736 BITFIELD64_BIT(i)) {
1737 prog_data->urb_setup[i] = urb_next++;
1738 }
1739 }
1740 } else {
1741 /* We have enough input varyings that the SF/SBE pipeline stage can't
1742 * arbitrarily rearrange them to suit our whim; we have to put them
1743 * in an order that matches the output of the previous pipeline stage
1744 * (geometry or vertex shader).
1745 */
1746 struct brw_vue_map prev_stage_vue_map;
1747 brw_compute_vue_map(brw, &prev_stage_vue_map,
1748 key->input_slots_valid);
1749 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1750 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1751 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1752 slot++) {
1753 int varying = prev_stage_vue_map.slot_to_varying[slot];
1754 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1755 * unused.
1756 */
1757 if (varying != BRW_VARYING_SLOT_COUNT &&
1758 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1759 BITFIELD64_BIT(varying))) {
1760 prog_data->urb_setup[varying] = slot - first_slot;
1761 }
1762 }
1763 urb_next = prev_stage_vue_map.num_slots - first_slot;
1764 }
1765 } else {
1766 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1767 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1768 /* Point size is packed into the header, not as a general attribute */
1769 if (i == VARYING_SLOT_PSIZ)
1770 continue;
1771
1772 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1773 /* The back color slot is skipped when the front color is
1774 * also written to. In addition, some slots can be
1775 * written in the vertex shader and not read in the
1776 * fragment shader. So the register number must always be
1777 * incremented, mapped or not.
1778 */
1779 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1780 prog_data->urb_setup[i] = urb_next;
1781 urb_next++;
1782 }
1783 }
1784
1785 /*
1786 * It's an FS-only attribute, and we did interpolation for this attribute
1787 * in the SF thread. So, count it here, too.
1788 *
1789 * See compile_sf_prog() for more info.
1790 */
1791 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1792 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1793 }
1794
1795 prog_data->num_varying_inputs = urb_next;
1796 }
1797
1798 void
1799 fs_visitor::assign_urb_setup()
1800 {
1801 assert(stage == MESA_SHADER_FRAGMENT);
1802 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1803
1804 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1805
1806 /* Offset all the urb_setup[] index by the actual position of the
1807 * setup regs, now that the location of the constants has been chosen.
1808 */
1809 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1810 if (inst->opcode == FS_OPCODE_LINTERP) {
1811 assert(inst->src[2].file == HW_REG);
1812 inst->src[2].fixed_hw_reg.nr += urb_start;
1813 }
1814
1815 if (inst->opcode == FS_OPCODE_CINTERP) {
1816 assert(inst->src[0].file == HW_REG);
1817 inst->src[0].fixed_hw_reg.nr += urb_start;
1818 }
1819 }
1820
1821 /* Each attribute is 4 setup channels, each of which is half a reg. */
1822 this->first_non_payload_grf =
1823 urb_start + prog_data->num_varying_inputs * 2;
1824 }
1825
1826 void
1827 fs_visitor::assign_vs_urb_setup()
1828 {
1829 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1830 int grf, count, slot, channel, attr;
1831
1832 assert(stage == MESA_SHADER_VERTEX);
1833 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1834 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1835 count++;
1836
1837 /* Each attribute is 4 regs. */
1838 this->first_non_payload_grf =
1839 payload.num_regs + prog_data->curb_read_length + count * 4;
1840
1841 unsigned vue_entries =
1842 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1843
1844 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1845 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1846
1847 assert(vs_prog_data->base.urb_read_length <= 15);
1848
1849 /* Rewrite all ATTR file references to the hw grf that they land in. */
1850 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1851 for (int i = 0; i < inst->sources; i++) {
1852 if (inst->src[i].file == ATTR) {
1853
1854 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1855 slot = count - 1;
1856 } else {
1857 /* Attributes come in a contiguous block, ordered by their
1858 * gl_vert_attrib value. That means we can compute the slot
1859 * number for an attribute by masking out the enabled
1860 * attributes before it and counting the bits.
1861 */
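/* Illustration with made-up values: if inputs_read has bits 0, 1 and 3 set
 * and attr == 3, then BITFIELD64_MASK(3) == 0b0111, the masked value is
 * 0b0011, and _mesa_bitcount_64() gives slot 2.
 */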
1862 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1863 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1864 BITFIELD64_MASK(attr));
1865 }
1866
1867 channel = inst->src[i].reg_offset & 3;
1868
1869 grf = payload.num_regs +
1870 prog_data->curb_read_length +
1871 slot * 4 + channel;
1872
1873 inst->src[i].file = HW_REG;
1874 inst->src[i].fixed_hw_reg =
1875 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1876 }
1877 }
1878 }
1879 }
1880
1881 /**
1882 * Split large virtual GRFs into separate components if we can.
1883 *
1884 * This is mostly duplicated with what brw_fs_vector_splitting does,
1885 * but that's really conservative because it's afraid of doing
1886 * splitting that doesn't result in real progress after the rest of
1887 * the optimization phases, which would cause infinite looping in
1888 * optimization. We can do it once here, safely. This also has the
1889 * opportunity to split interpolated values, or maybe even uniforms,
1890 * which we don't have at the IR level.
1891 *
1892 * We want to split, because virtual GRFs are what we register
1893 * allocate and spill (due to contiguousness requirements for some
1894 * instructions), and they're what we naturally generate in the
1895 * codegen process, but most virtual GRFs don't actually need to be
1896 * contiguous sets of GRFs. If we split, we'll end up with reduced
1897 * live intervals and better dead code elimination and coalescing.
1898 */
1899 void
1900 fs_visitor::split_virtual_grfs()
1901 {
1902 int num_vars = this->alloc.count;
1903
1904 /* Count the total number of registers */
1905 int reg_count = 0;
1906 int vgrf_to_reg[num_vars];
1907 for (int i = 0; i < num_vars; i++) {
1908 vgrf_to_reg[i] = reg_count;
1909 reg_count += alloc.sizes[i];
1910 }
1911
1912 /* An array of "split points". For each register slot, this indicates
1913 * if this slot can be separated from the previous slot. Every time an
1914 * instruction uses multiple elements of a register (as a source or
1915 * destination), we mark the used slots as inseparable. Then we go
1916 * through and split the registers into the smallest pieces we can.
1917 */
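/* For example (hypothetical sizes): if a 4-slot VGRF is only ever accessed
 * as slots 0-1 together and slots 2-3 together, slots 1 and 3 get marked
 * inseparable while slot 2 stays a split point, so the VGRF ends up split
 * into two 2-slot VGRFs.
 */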
1918 bool split_points[reg_count];
1919 memset(split_points, 0, sizeof(split_points));
1920
1921 /* Mark all used registers as fully splittable */
1922 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1923 if (inst->dst.file == GRF) {
1924 int reg = vgrf_to_reg[inst->dst.reg];
1925 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1926 split_points[reg + j] = true;
1927 }
1928
1929 for (int i = 0; i < inst->sources; i++) {
1930 if (inst->src[i].file == GRF) {
1931 int reg = vgrf_to_reg[inst->src[i].reg];
1932 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1933 split_points[reg + j] = true;
1934 }
1935 }
1936 }
1937
1938 if (brw->has_pln &&
1939 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1940 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1941 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1942 * Gen6, that was the only supported interpolation mode, and since Gen6,
1943 * delta_x and delta_y are in fixed hardware registers.
1944 */
1945 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1946 split_points[vgrf_to_reg[vgrf] + 1] = false;
1947 }
1948
1949 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1950 if (inst->dst.file == GRF) {
1951 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1952 for (int j = 1; j < inst->regs_written; j++)
1953 split_points[reg + j] = false;
1954 }
1955 for (int i = 0; i < inst->sources; i++) {
1956 if (inst->src[i].file == GRF) {
1957 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1958 for (int j = 1; j < inst->regs_read(i); j++)
1959 split_points[reg + j] = false;
1960 }
1961 }
1962 }
1963
1964 int new_virtual_grf[reg_count];
1965 int new_reg_offset[reg_count];
1966
1967 int reg = 0;
1968 for (int i = 0; i < num_vars; i++) {
1969 /* The first one should always be 0 as a quick sanity check. */
1970 assert(split_points[reg] == false);
1971
1972 /* j = 0 case */
1973 new_reg_offset[reg] = 0;
1974 reg++;
1975 int offset = 1;
1976
1977 /* j > 0 case */
1978 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1979 /* If this is a split point, reset the offset to 0 and allocate a
1980 * new virtual GRF covering the previous `offset` registers.
1981 */
1982 if (split_points[reg]) {
1983 assert(offset <= MAX_VGRF_SIZE);
1984 int grf = alloc.allocate(offset);
1985 for (int k = reg - offset; k < reg; k++)
1986 new_virtual_grf[k] = grf;
1987 offset = 0;
1988 }
1989 new_reg_offset[reg] = offset;
1990 offset++;
1991 reg++;
1992 }
1993
1994 /* The last one gets the original register number */
1995 assert(offset <= MAX_VGRF_SIZE);
1996 alloc.sizes[i] = offset;
1997 for (int k = reg - offset; k < reg; k++)
1998 new_virtual_grf[k] = i;
1999 }
2000 assert(reg == reg_count);
2001
2002 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2003 if (inst->dst.file == GRF) {
2004 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2005 inst->dst.reg = new_virtual_grf[reg];
2006 inst->dst.reg_offset = new_reg_offset[reg];
2007 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2008 }
2009 for (int i = 0; i < inst->sources; i++) {
2010 if (inst->src[i].file == GRF) {
2011 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2012 inst->src[i].reg = new_virtual_grf[reg];
2013 inst->src[i].reg_offset = new_reg_offset[reg];
2014 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2015 }
2016 }
2017 }
2018 invalidate_live_intervals();
2019 }
2020
2021 /**
2022 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2023 *
2024 * During code generation, we create tons of temporary variables, many of
2025 * which get immediately killed and are never used again. Yet, in later
2026 * optimization and analysis passes, such as compute_live_intervals, we need
2027 * to loop over all the virtual GRFs. Compacting them can save a lot of
2028 * overhead.
2029 */
2030 bool
2031 fs_visitor::compact_virtual_grfs()
2032 {
2033 bool progress = false;
2034 int remap_table[this->alloc.count];
2035 memset(remap_table, -1, sizeof(remap_table));
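/* remap_table[i] == -1 means VGRF i is never referenced. Used entries are
 * first marked with 0 below and later overwritten with their compacted index.
 */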
2036
2037 /* Mark which virtual GRFs are used. */
2038 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2039 if (inst->dst.file == GRF)
2040 remap_table[inst->dst.reg] = 0;
2041
2042 for (int i = 0; i < inst->sources; i++) {
2043 if (inst->src[i].file == GRF)
2044 remap_table[inst->src[i].reg] = 0;
2045 }
2046 }
2047
2048 /* Compact the GRF arrays. */
2049 int new_index = 0;
2050 for (unsigned i = 0; i < this->alloc.count; i++) {
2051 if (remap_table[i] == -1) {
2052 /* We just found an unused register. This means that we are
2053 * actually going to compact something.
2054 */
2055 progress = true;
2056 } else {
2057 remap_table[i] = new_index;
2058 alloc.sizes[new_index] = alloc.sizes[i];
2059 invalidate_live_intervals();
2060 ++new_index;
2061 }
2062 }
2063
2064 this->alloc.count = new_index;
2065
2066 /* Patch all the instructions to use the newly renumbered registers */
2067 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2068 if (inst->dst.file == GRF)
2069 inst->dst.reg = remap_table[inst->dst.reg];
2070
2071 for (int i = 0; i < inst->sources; i++) {
2072 if (inst->src[i].file == GRF)
2073 inst->src[i].reg = remap_table[inst->src[i].reg];
2074 }
2075 }
2076
2077 /* Patch all the references to delta_x/delta_y, since they're used in
2078 * register allocation. If they're unused, switch them to BAD_FILE so
2079 * we don't think some random VGRF is delta_x/delta_y.
2080 */
2081 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2082 if (delta_x[i].file == GRF) {
2083 if (remap_table[delta_x[i].reg] != -1) {
2084 delta_x[i].reg = remap_table[delta_x[i].reg];
2085 } else {
2086 delta_x[i].file = BAD_FILE;
2087 }
2088 }
2089 }
2090 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2091 if (delta_y[i].file == GRF) {
2092 if (remap_table[delta_y[i].reg] != -1) {
2093 delta_y[i].reg = remap_table[delta_y[i].reg];
2094 } else {
2095 delta_y[i].file = BAD_FILE;
2096 }
2097 }
2098 }
2099
2100 return progress;
2101 }
2102
2103 /*
2104 * Implements array access of uniforms by inserting a
2105 * PULL_CONSTANT_LOAD instruction.
2106 *
2107 * Unlike temporary GRF array access (where we don't support it due to
2108 * the difficulty of doing relative addressing on instruction
2109 * destinations), we could potentially do array access of uniforms
2110 * that were loaded in GRF space as push constants. In real-world
2111 * usage we've seen, though, the arrays being used are always larger
2112 * than we could load as push constants, so just always move all
2113 * uniform array access out to a pull constant buffer.
2114 */
2115 void
2116 fs_visitor::move_uniform_array_access_to_pull_constants()
2117 {
2118 if (dispatch_width != 8)
2119 return;
2120
2121 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2122 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
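/* pull_constant_loc[u] == -1 means uniform u is not (yet) in the pull
 * constant buffer; otherwise it holds the index of its first pull_param[]
 * entry.
 */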
2123
2124 /* Walk through and find array access of uniforms. Put a copy of that
2125 * uniform in the pull constant buffer.
2126 *
2127 * Note that we don't move constant-indexed accesses to arrays. No
2128 * testing has been done of the performance impact of this choice.
2129 */
2130 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2131 for (int i = 0 ; i < inst->sources; i++) {
2132 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2133 continue;
2134
2135 int uniform = inst->src[i].reg;
2136
2137 /* If this array isn't already present in the pull constant buffer,
2138 * add it.
2139 */
2140 if (pull_constant_loc[uniform] == -1) {
2141 const gl_constant_value **values = &stage_prog_data->param[uniform];
2142
2143 assert(param_size[uniform]);
2144
2145 for (int j = 0; j < param_size[uniform]; j++) {
2146 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2147
2148 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2149 values[j];
2150 }
2151 }
2152 }
2153 }
2154 }
2155
2156 /**
2157 * Assign UNIFORM file registers to either push constants or pull constants.
2158 *
2159 * We allow a fragment shader to have more than the GL-specified minimum
2160 * for the maximum number of fragment shader uniform components (64). If
2161 * there are too many of these, they'd fill up all of register space.
2162 * So, this will push some of them out to the pull constant buffer and
2163 * update the program to load them.
2164 */
2165 void
2166 fs_visitor::assign_constant_locations()
2167 {
2168 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2169 if (dispatch_width != 8)
2170 return;
2171
2172 /* Find which UNIFORM registers are still in use. */
2173 bool is_live[uniforms];
2174 for (unsigned int i = 0; i < uniforms; i++) {
2175 is_live[i] = false;
2176 }
2177
2178 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2179 for (int i = 0; i < inst->sources; i++) {
2180 if (inst->src[i].file != UNIFORM)
2181 continue;
2182
2183 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2184 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2185 is_live[constant_nr] = true;
2186 }
2187 }
2188
2189 /* Only allow 16 registers (128 uniform components) as push constants.
2190 *
2191 * Just demote the end of the list. We could probably do better
2192 * here, demoting things that are rarely used in the program first.
2193 *
2194 * If changing this value, note the limitation about total_regs in
2195 * brw_curbe.c.
2196 */
2197 unsigned int max_push_components = 16 * 8;
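/* A GRF holds 8 dwords, so 16 registers * 8 = 128 push constant components. */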
2198 unsigned int num_push_constants = 0;
2199
2200 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2201
2202 for (unsigned int i = 0; i < uniforms; i++) {
2203 if (!is_live[i] || pull_constant_loc[i] != -1) {
2204 /* This UNIFORM register is either dead, or has already been demoted
2205 * to a pull const. Mark it as no longer living in the param[] array.
2206 */
2207 push_constant_loc[i] = -1;
2208 continue;
2209 }
2210
2211 if (num_push_constants < max_push_components) {
2212 /* Retain as a push constant. Record the location in the param[]
2213 * array.
2214 */
2215 push_constant_loc[i] = num_push_constants++;
2216 } else {
2217 /* Demote to a pull constant. */
2218 push_constant_loc[i] = -1;
2219
2220 int pull_index = stage_prog_data->nr_pull_params++;
2221 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2222 pull_constant_loc[i] = pull_index;
2223 }
2224 }
2225
2226 stage_prog_data->nr_params = num_push_constants;
2227
2228 /* Up until now, the param[] array has been indexed by reg + reg_offset
2229 * of UNIFORM registers. Condense it to only contain the uniforms we
2230 * chose to upload as push constants.
2231 */
2232 for (unsigned int i = 0; i < uniforms; i++) {
2233 int remapped = push_constant_loc[i];
2234
2235 if (remapped == -1)
2236 continue;
2237
2238 assert(remapped <= (int)i);
2239 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2240 }
2241 }
2242
2243 /**
2244 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2245 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2246 */
2247 void
2248 fs_visitor::demote_pull_constants()
2249 {
2250 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2251 for (int i = 0; i < inst->sources; i++) {
2252 if (inst->src[i].file != UNIFORM)
2253 continue;
2254
2255 int pull_index = pull_constant_loc[inst->src[i].reg +
2256 inst->src[i].reg_offset];
2257 if (pull_index == -1)
2258 continue;
2259
2260 /* Set up the annotation tracking for newly generated instructions. */
2261 base_ir = inst->ir;
2262 current_annotation = inst->annotation;
2263
2264 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2265 fs_reg dst = vgrf(glsl_type::float_type);
2266
2267 /* Generate a pull load into dst. */
2268 if (inst->src[i].reladdr) {
2269 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2270 surf_index,
2271 *inst->src[i].reladdr,
2272 pull_index);
2273 inst->insert_before(block, &list);
2274 inst->src[i].reladdr = NULL;
2275 } else {
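/* pull_index * 4 is the constant's byte offset (4 bytes per component).
 * Masking with ~15 rounds down to the containing 16-byte vec4; the low two
 * bits of pull_index then select the channel via set_smear() below.
 */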
2276 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2277 fs_inst *pull =
2278 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2279 dst, surf_index, offset);
2280 inst->insert_before(block, pull);
2281 inst->src[i].set_smear(pull_index & 3);
2282 }
2283
2284 /* Rewrite the instruction to use the temporary VGRF. */
2285 inst->src[i].file = GRF;
2286 inst->src[i].reg = dst.reg;
2287 inst->src[i].reg_offset = 0;
2288 inst->src[i].width = dispatch_width;
2289 }
2290 }
2291 invalidate_live_intervals();
2292 }
2293
2294 bool
2295 fs_visitor::opt_algebraic()
2296 {
2297 bool progress = false;
2298
2299 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2300 switch (inst->opcode) {
2301 case BRW_OPCODE_MOV:
2302 if (inst->src[0].file != IMM)
2303 break;
2304
2305 if (inst->saturate) {
2306 if (inst->dst.type != inst->src[0].type)
2307 assert(!"unimplemented: saturate mixed types");
2308
2309 if (brw_saturate_immediate(inst->dst.type,
2310 &inst->src[0].fixed_hw_reg)) {
2311 inst->saturate = false;
2312 progress = true;
2313 }
2314 }
2315 break;
2316
2317 case BRW_OPCODE_MUL:
2318 if (inst->src[1].file != IMM)
2319 continue;
2320
2321 /* a * 1.0 = a */
2322 if (inst->src[1].is_one()) {
2323 inst->opcode = BRW_OPCODE_MOV;
2324 inst->src[1] = reg_undef;
2325 progress = true;
2326 break;
2327 }
2328
2329 /* a * -1.0 = -a */
2330 if (inst->src[1].is_negative_one()) {
2331 inst->opcode = BRW_OPCODE_MOV;
2332 inst->src[0].negate = !inst->src[0].negate;
2333 inst->src[1] = reg_undef;
2334 progress = true;
2335 break;
2336 }
2337
2338 /* a * 0.0 = 0.0 */
2339 if (inst->src[1].is_zero()) {
2340 inst->opcode = BRW_OPCODE_MOV;
2341 inst->src[0] = inst->src[1];
2342 inst->src[1] = reg_undef;
2343 progress = true;
2344 break;
2345 }
2346
2347 if (inst->src[0].file == IMM) {
2348 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2349 inst->opcode = BRW_OPCODE_MOV;
2350 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2351 inst->src[1] = reg_undef;
2352 progress = true;
2353 break;
2354 }
2355 break;
2356 case BRW_OPCODE_ADD:
2357 if (inst->src[1].file != IMM)
2358 continue;
2359
2360 /* a + 0.0 = a */
2361 if (inst->src[1].is_zero()) {
2362 inst->opcode = BRW_OPCODE_MOV;
2363 inst->src[1] = reg_undef;
2364 progress = true;
2365 break;
2366 }
2367
2368 if (inst->src[0].file == IMM) {
2369 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2370 inst->opcode = BRW_OPCODE_MOV;
2371 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2372 inst->src[1] = reg_undef;
2373 progress = true;
2374 break;
2375 }
2376 break;
2377 case BRW_OPCODE_OR:
2378 if (inst->src[0].equals(inst->src[1])) {
2379 inst->opcode = BRW_OPCODE_MOV;
2380 inst->src[1] = reg_undef;
2381 progress = true;
2382 break;
2383 }
2384 break;
2385 case BRW_OPCODE_LRP:
2386 if (inst->src[1].equals(inst->src[2])) {
2387 inst->opcode = BRW_OPCODE_MOV;
2388 inst->src[0] = inst->src[1];
2389 inst->src[1] = reg_undef;
2390 inst->src[2] = reg_undef;
2391 progress = true;
2392 break;
2393 }
2394 break;
2395 case BRW_OPCODE_CMP:
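/* cmp.ge -|a|, 0 can only pass when a == 0, so it is equivalent to
 * cmp.z a, 0 with the abs/negate modifiers dropped.
 */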
2396 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2397 inst->src[0].abs &&
2398 inst->src[0].negate &&
2399 inst->src[1].is_zero()) {
2400 inst->src[0].abs = false;
2401 inst->src[0].negate = false;
2402 inst->conditional_mod = BRW_CONDITIONAL_Z;
2403 progress = true;
2404 break;
2405 }
2406 break;
2407 case BRW_OPCODE_SEL:
2408 if (inst->src[0].equals(inst->src[1])) {
2409 inst->opcode = BRW_OPCODE_MOV;
2410 inst->src[1] = reg_undef;
2411 inst->predicate = BRW_PREDICATE_NONE;
2412 inst->predicate_inverse = false;
2413 progress = true;
2414 } else if (inst->saturate && inst->src[1].file == IMM) {
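/* sel.l/sel.le picks min(src0, src1) and sel.ge/sel.g picks max. With
 * saturate, an immediate >= 1.0 in a min, or <= 0.0 in a max, cannot
 * affect the clamped result, so the SEL reduces to a saturating MOV of
 * src0.
 */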
2415 switch (inst->conditional_mod) {
2416 case BRW_CONDITIONAL_LE:
2417 case BRW_CONDITIONAL_L:
2418 switch (inst->src[1].type) {
2419 case BRW_REGISTER_TYPE_F:
2420 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2421 inst->opcode = BRW_OPCODE_MOV;
2422 inst->src[1] = reg_undef;
2423 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2424 progress = true;
2425 }
2426 break;
2427 default:
2428 break;
2429 }
2430 break;
2431 case BRW_CONDITIONAL_GE:
2432 case BRW_CONDITIONAL_G:
2433 switch (inst->src[1].type) {
2434 case BRW_REGISTER_TYPE_F:
2435 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2436 inst->opcode = BRW_OPCODE_MOV;
2437 inst->src[1] = reg_undef;
2438 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2439 progress = true;
2440 }
2441 break;
2442 default:
2443 break;
2444 }
2445 default:
2446 break;
2447 }
2448 }
2449 break;
2450 case BRW_OPCODE_MAD:
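/* MAD computes src1 * src2 + src0, so a zero in src1 or src2 leaves just
 * src0, and a zero src0 leaves a plain multiply.
 */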
2451 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2452 inst->opcode = BRW_OPCODE_MOV;
2453 inst->src[1] = reg_undef;
2454 inst->src[2] = reg_undef;
2455 progress = true;
2456 } else if (inst->src[0].is_zero()) {
2457 inst->opcode = BRW_OPCODE_MUL;
2458 inst->src[0] = inst->src[2];
2459 inst->src[2] = reg_undef;
progress = true;
2460 } else if (inst->src[1].is_one()) {
2461 inst->opcode = BRW_OPCODE_ADD;
2462 inst->src[1] = inst->src[2];
2463 inst->src[2] = reg_undef;
2464 progress = true;
2465 } else if (inst->src[2].is_one()) {
2466 inst->opcode = BRW_OPCODE_ADD;
2467 inst->src[2] = reg_undef;
2468 progress = true;
2469 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2470 inst->opcode = BRW_OPCODE_ADD;
2471 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2472 inst->src[2] = reg_undef;
2473 progress = true;
2474 }
2475 break;
2476 case SHADER_OPCODE_RCP: {
2477 fs_inst *prev = (fs_inst *)inst->prev;
2478 if (prev->opcode == SHADER_OPCODE_SQRT) {
2479 if (inst->src[0].equals(prev->dst)) {
2480 inst->opcode = SHADER_OPCODE_RSQ;
2481 inst->src[0] = prev->src[0];
2482 progress = true;
2483 }
2484 }
2485 break;
2486 }
2487 default:
2488 break;
2489 }
2490 }
2491
2492 return progress;
2493 }
2494
2495 bool
2496 fs_visitor::opt_register_renaming()
2497 {
2498 bool progress = false;
2499 int depth = 0;
2500
2501 int remap[alloc.count];
2502 memset(remap, -1, sizeof(int) * alloc.count);
2503
2504 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2505 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2506 depth++;
2507 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2508 inst->opcode == BRW_OPCODE_WHILE) {
2509 depth--;
2510 }
2511
2512 /* Rewrite instruction sources. */
2513 for (int i = 0; i < inst->sources; i++) {
2514 if (inst->src[i].file == GRF &&
2515 remap[inst->src[i].reg] != -1 &&
2516 remap[inst->src[i].reg] != inst->src[i].reg) {
2517 inst->src[i].reg = remap[inst->src[i].reg];
2518 progress = true;
2519 }
2520 }
2521
2522 const int dst = inst->dst.reg;
2523
2524 if (depth == 0 &&
2525 inst->dst.file == GRF &&
2526 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2527 !inst->is_partial_write()) {
2528 if (remap[dst] == -1) {
2529 remap[dst] = dst;
2530 } else {
2531 remap[dst] = alloc.allocate(inst->dst.width / 8);
2532 inst->dst.reg = remap[dst];
2533 progress = true;
2534 }
2535 } else if (inst->dst.file == GRF &&
2536 remap[dst] != -1 &&
2537 remap[dst] != dst) {
2538 inst->dst.reg = remap[dst];
2539 progress = true;
2540 }
2541 }
2542
2543 if (progress) {
2544 invalidate_live_intervals();
2545
2546 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2547 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2548 delta_x[i].reg = remap[delta_x[i].reg];
2549 }
2550 }
2551 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2552 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2553 delta_y[i].reg = remap[delta_y[i].reg];
2554 }
2555 }
2556 }
2557
2558 return progress;
2559 }
2560
2561 bool
2562 fs_visitor::compute_to_mrf()
2563 {
2564 bool progress = false;
2565 int next_ip = 0;
2566
2567 /* No MRFs on Gen >= 7. */
2568 if (brw->gen >= 7)
2569 return false;
2570
2571 calculate_live_intervals();
2572
2573 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2574 int ip = next_ip;
2575 next_ip++;
2576
2577 if (inst->opcode != BRW_OPCODE_MOV ||
2578 inst->is_partial_write() ||
2579 inst->dst.file != MRF || inst->src[0].file != GRF ||
2580 inst->dst.type != inst->src[0].type ||
2581 inst->src[0].abs || inst->src[0].negate ||
2582 !inst->src[0].is_contiguous() ||
2583 inst->src[0].subreg_offset)
2584 continue;
2585
2586 /* Work out which hardware MRF registers are written by this
2587 * instruction.
2588 */
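/* A SIMD16 write covers two adjacent MRFs; with COMPR4 addressing it
 * writes mN and mN+4 instead.
 */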
2589 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2590 int mrf_high;
2591 if (inst->dst.reg & BRW_MRF_COMPR4) {
2592 mrf_high = mrf_low + 4;
2593 } else if (inst->exec_size == 16) {
2594 mrf_high = mrf_low + 1;
2595 } else {
2596 mrf_high = mrf_low;
2597 }
2598
2599 /* Can't compute-to-MRF this GRF if someone else was going to
2600 * read it later.
2601 */
2602 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2603 continue;
2604
2605 /* Found a move of a GRF to a MRF. Let's see if we can go
2606 * rewrite the thing that made this GRF to write into the MRF.
2607 */
2608 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2609 if (scan_inst->dst.file == GRF &&
2610 scan_inst->dst.reg == inst->src[0].reg) {
2611 /* Found the last thing to write our reg we want to turn
2612 * into a compute-to-MRF.
2613 */
2614
2615 /* If this one instruction didn't populate all the
2616 * channels, bail. We might be able to rewrite everything
2617 * that writes that reg, but it would require smarter
2618 * tracking to delay the rewriting until complete success.
2619 */
2620 if (scan_inst->is_partial_write())
2621 break;
2622
2623 /* Things returning more than one register would need us to
2624 * understand coalescing out more than one MOV at a time.
2625 */
2626 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2627 break;
2628
2629 /* SEND instructions can't have MRF as a destination. */
2630 if (scan_inst->mlen)
2631 break;
2632
2633 if (brw->gen == 6) {
2634 /* gen6 math instructions must have the destination be
2635 * GRF, so no compute-to-MRF for them.
2636 */
2637 if (scan_inst->is_math()) {
2638 break;
2639 }
2640 }
2641
2642 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2643 /* Found the creator of our MRF's source value. */
2644 scan_inst->dst.file = MRF;
2645 scan_inst->dst.reg = inst->dst.reg;
2646 scan_inst->saturate |= inst->saturate;
2647 inst->remove(block);
2648 progress = true;
2649 }
2650 break;
2651 }
2652
2653 /* We don't handle control flow here. Most computation of
2654 * values that end up in MRFs happens shortly before the MRF
2655 * write anyway.
2656 */
2657 if (block->start() == scan_inst)
2658 break;
2659
2660 /* You can't read from an MRF, so if someone else reads our
2661 * MRF's source GRF that we wanted to rewrite, that stops us.
2662 */
2663 bool interfered = false;
2664 for (int i = 0; i < scan_inst->sources; i++) {
2665 if (scan_inst->src[i].file == GRF &&
2666 scan_inst->src[i].reg == inst->src[0].reg &&
2667 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2668 interfered = true;
2669 }
2670 }
2671 if (interfered)
2672 break;
2673
2674 if (scan_inst->dst.file == MRF) {
2675 /* If somebody else writes our MRF here, we can't
2676 * compute-to-MRF before that.
2677 */
2678 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2679 int scan_mrf_high;
2680
2681 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2682 scan_mrf_high = scan_mrf_low + 4;
2683 } else if (scan_inst->exec_size == 16) {
2684 scan_mrf_high = scan_mrf_low + 1;
2685 } else {
2686 scan_mrf_high = scan_mrf_low;
2687 }
2688
2689 if (mrf_low == scan_mrf_low ||
2690 mrf_low == scan_mrf_high ||
2691 mrf_high == scan_mrf_low ||
2692 mrf_high == scan_mrf_high) {
2693 break;
2694 }
2695 }
2696
2697 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2698 /* Found a SEND instruction, which means that there are
2699 * live values in MRFs from base_mrf to base_mrf +
2700 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2701 * above it.
2702 */
2703 if (mrf_low >= scan_inst->base_mrf &&
2704 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2705 break;
2706 }
2707 if (mrf_high >= scan_inst->base_mrf &&
2708 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2709 break;
2710 }
2711 }
2712 }
2713 }
2714
2715 if (progress)
2716 invalidate_live_intervals();
2717
2718 return progress;
2719 }
2720
2721 /**
2722 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2723 * instructions to FS_OPCODE_REP_FB_WRITE.
2724 */
2725 void
2726 fs_visitor::emit_repclear_shader()
2727 {
2728 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2729 int base_mrf = 1;
2730 int color_mrf = base_mrf + 2;
2731
2732 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2733 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2734 mov->force_writemask_all = true;
2735
2736 fs_inst *write;
2737 if (key->nr_color_regions == 1) {
2738 write = emit(FS_OPCODE_REP_FB_WRITE);
2739 write->saturate = key->clamp_fragment_color;
2740 write->base_mrf = color_mrf;
2741 write->target = 0;
2742 write->header_present = false;
2743 write->mlen = 1;
2744 } else {
2745 assume(key->nr_color_regions > 0);
2746 for (int i = 0; i < key->nr_color_regions; ++i) {
2747 write = emit(FS_OPCODE_REP_FB_WRITE);
2748 write->saturate = key->clamp_fragment_color;
2749 write->base_mrf = base_mrf;
2750 write->target = i;
2751 write->header_present = true;
2752 write->mlen = 3;
2753 }
2754 }
2755 write->eot = true;
2756
2757 calculate_cfg();
2758
2759 assign_constant_locations();
2760 assign_curb_setup();
2761
2762 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2763 assert(mov->src[0].file == HW_REG);
2764 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2765 }
2766
2767 /**
2768 * Walks through basic blocks, looking for repeated MRF writes and
2769 * removing the later ones.
2770 */
2771 bool
2772 fs_visitor::remove_duplicate_mrf_writes()
2773 {
2774 fs_inst *last_mrf_move[16];
2775 bool progress = false;
2776
2777 /* We'd need to update the MRF tracking for compressed (SIMD16) instructions, so skip this pass for them. */
2778 if (dispatch_width == 16)
2779 return false;
2780
2781 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2782
2783 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2784 if (inst->is_control_flow()) {
2785 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2786 }
2787
2788 if (inst->opcode == BRW_OPCODE_MOV &&
2789 inst->dst.file == MRF) {
2790 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2791 if (prev_inst && inst->equals(prev_inst)) {
2792 inst->remove(block);
2793 progress = true;
2794 continue;
2795 }
2796 }
2797
2798 /* Clear out the last-write records for MRFs that were overwritten. */
2799 if (inst->dst.file == MRF) {
2800 last_mrf_move[inst->dst.reg] = NULL;
2801 }
2802
2803 if (inst->mlen > 0 && inst->base_mrf != -1) {
2804 /* Found a SEND instruction. Clear the tracking for each of its
2805 * implied MRF writes. We could do better here.
2806 */
2807 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2808 last_mrf_move[inst->base_mrf + i] = NULL;
2809 }
2810 }
2811
2812 /* Clear out any MRF move records whose sources got overwritten. */
2813 if (inst->dst.file == GRF) {
2814 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2815 if (last_mrf_move[i] &&
2816 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2817 last_mrf_move[i] = NULL;
2818 }
2819 }
2820 }
2821
2822 if (inst->opcode == BRW_OPCODE_MOV &&
2823 inst->dst.file == MRF &&
2824 inst->src[0].file == GRF &&
2825 !inst->is_partial_write()) {
2826 last_mrf_move[inst->dst.reg] = inst;
2827 }
2828 }
2829
2830 if (progress)
2831 invalidate_live_intervals();
2832
2833 return progress;
2834 }
2835
2836 static void
2837 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2838 int first_grf, int grf_len)
2839 {
2840 /* Clear the flag for registers that actually got read (as expected). */
2841 for (int i = 0; i < inst->sources; i++) {
2842 int grf;
2843 if (inst->src[i].file == GRF) {
2844 grf = inst->src[i].reg;
2845 } else if (inst->src[i].file == HW_REG &&
2846 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2847 grf = inst->src[i].fixed_hw_reg.nr;
2848 } else {
2849 continue;
2850 }
2851
2852 if (grf >= first_grf &&
2853 grf < first_grf + grf_len) {
2854 deps[grf - first_grf] = false;
2855 if (inst->exec_size == 16)
2856 deps[grf - first_grf + 1] = false;
2857 }
2858 }
2859 }
2860
2861 /**
2862 * Implements this workaround for the original 965:
2863 *
2864 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2865 * check for post destination dependencies on this instruction, software
2866 * must ensure that there is no destination hazard for the case of ‘write
2867 * followed by a posted write’ shown in the following example.
2868 *
2869 * 1. mov r3 0
2870 * 2. send r3.xy <rest of send instruction>
2871 * 3. mov r2 r3
2872 *
2873 * Due to no post-destination dependency check on the ‘send’, the above
2874 * code sequence could have two instructions (1 and 2) in flight at the
2875 * same time that both consider ‘r3’ as the target of their final writes.
2876 */
2877 void
2878 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2879 fs_inst *inst)
2880 {
2881 int write_len = inst->regs_written;
2882 int first_write_grf = inst->dst.reg;
2883 bool needs_dep[BRW_MAX_MRF];
2884 assert(write_len < (int)sizeof(needs_dep) - 1);
2885
2886 memset(needs_dep, false, sizeof(needs_dep));
2887 memset(needs_dep, true, write_len);
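/* needs_dep[i] stays true while GRF (first_write_grf + i) still needs a
 * dependency-resolving MOV inserted before `inst`.
 */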
2888
2889 clear_deps_for_inst_src(inst, dispatch_width,
2890 needs_dep, first_write_grf, write_len);
2891
2892 /* Walk backwards looking for writes to registers we're writing which
2893 * aren't read since being written. If we hit the start of the program,
2894 * we assume that there are no outstanding dependencies on entry to the
2895 * program.
2896 */
2897 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2898 /* If we hit control flow, assume that there *are* outstanding
2899 * dependencies, and force their cleanup before our instruction.
2900 */
2901 if (block->start() == scan_inst) {
2902 for (int i = 0; i < write_len; i++) {
2903 if (needs_dep[i]) {
2904 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2905 }
2906 }
2907 return;
2908 }
2909
2910 /* We insert our reads as late as possible on the assumption that any
2911 * instruction other than a MOV that might have left us an outstanding
2912 * dependency has more latency than a MOV.
2913 */
2914 if (scan_inst->dst.file == GRF) {
2915 for (int i = 0; i < scan_inst->regs_written; i++) {
2916 int reg = scan_inst->dst.reg + i;
2917
2918 if (reg >= first_write_grf &&
2919 reg < first_write_grf + write_len &&
2920 needs_dep[reg - first_write_grf]) {
2921 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2922 needs_dep[reg - first_write_grf] = false;
2923 if (scan_inst->exec_size == 16)
2924 needs_dep[reg - first_write_grf + 1] = false;
2925 }
2926 }
2927 }
2928
2929 /* Clear the flag for registers that actually got read (as expected). */
2930 clear_deps_for_inst_src(scan_inst, dispatch_width,
2931 needs_dep, first_write_grf, write_len);
2932
2933 /* Continue the loop only if we haven't resolved all the dependencies */
2934 int i;
2935 for (i = 0; i < write_len; i++) {
2936 if (needs_dep[i])
2937 break;
2938 }
2939 if (i == write_len)
2940 return;
2941 }
2942 }
2943
2944 /**
2945 * Implements this workaround for the original 965:
2946 *
2947 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2948 * used as a destination register until after it has been sourced by an
2949 * instruction with a different destination register.
2950 */
2951 void
2952 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2953 {
2954 int write_len = inst->regs_written;
2955 int first_write_grf = inst->dst.reg;
2956 bool needs_dep[BRW_MAX_MRF];
2957 assert(write_len < (int)sizeof(needs_dep) - 1);
2958
2959 memset(needs_dep, false, sizeof(needs_dep));
2960 memset(needs_dep, true, write_len);
2961 /* Walk forwards looking for writes to registers we're writing which aren't
2962 * read before being written.
2963 */
2964 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2965 /* If we hit control flow, force resolve all remaining dependencies. */
2966 if (block->end() == scan_inst) {
2967 for (int i = 0; i < write_len; i++) {
2968 if (needs_dep[i])
2969 scan_inst->insert_before(block,
2970 DEP_RESOLVE_MOV(first_write_grf + i));
2971 }
2972 return;
2973 }
2974
2975 /* Clear the flag for registers that actually got read (as expected). */
2976 clear_deps_for_inst_src(scan_inst, dispatch_width,
2977 needs_dep, first_write_grf, write_len);
2978
2979 /* We insert our reads as late as possible since they're reading the
2980 * result of a SEND, which has massive latency.
2981 */
2982 if (scan_inst->dst.file == GRF &&
2983 scan_inst->dst.reg >= first_write_grf &&
2984 scan_inst->dst.reg < first_write_grf + write_len &&
2985 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2986 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2987 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2988 }
2989
2990 /* Continue the loop only if we haven't resolved all the dependencies */
2991 int i;
2992 for (i = 0; i < write_len; i++) {
2993 if (needs_dep[i])
2994 break;
2995 }
2996 if (i == write_len)
2997 return;
2998 }
2999
3000 /* If we hit the end of the program, resolve all remaining dependencies out
3001 * of paranoia.
3002 */
3003 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
3004 assert(last_inst->eot);
3005 for (int i = 0; i < write_len; i++) {
3006 if (needs_dep[i])
3007 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3008 }
3009 }
3010
3011 void
3012 fs_visitor::insert_gen4_send_dependency_workarounds()
3013 {
3014 if (brw->gen != 4 || brw->is_g4x)
3015 return;
3016
3017 bool progress = false;
3018
3019 /* Note that we're done with register allocation, so GRF fs_regs always
3020 * have a .reg_offset of 0.
3021 */
3022
3023 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3024 if (inst->mlen != 0 && inst->dst.file == GRF) {
3025 insert_gen4_pre_send_dependency_workarounds(block, inst);
3026 insert_gen4_post_send_dependency_workarounds(block, inst);
3027 progress = true;
3028 }
3029 }
3030
3031 if (progress)
3032 invalidate_live_intervals();
3033 }
3034
3035 /**
3036 * Turns the generic expression-style uniform pull constant load instruction
3037 * into a hardware-specific series of instructions for loading a pull
3038 * constant.
3039 *
3040 * The expression style allows the CSE pass before this to optimize out
3041 * repeated loads from the same offset, and gives the pre-register-allocation
3042 * scheduling full flexibility, while the conversion to native instructions
3043 * allows the post-register-allocation scheduler the best information
3044 * possible.
3045 *
3046 * Note that execution masking for setting up pull constant loads is special:
3047 * the channels that need to be written are unrelated to the current execution
3048 * mask, since a later instruction will use one of the result channels as a
3049 * source operand for all 8 or 16 of its channels.
3050 */
3051 void
3052 fs_visitor::lower_uniform_pull_constant_loads()
3053 {
3054 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3055 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3056 continue;
3057
3058 if (brw->gen >= 7) {
3059 /* The offset arg before was a vec4-aligned byte offset. We need to
3060 * turn it into a dword offset.
3061 */
3062 fs_reg const_offset_reg = inst->src[1];
3063 assert(const_offset_reg.file == IMM &&
3064 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3065 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3066 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3067
3068 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3069 * Reserve space for the register.
3070 */
3071 if (brw->gen >= 9) {
3072 payload.reg_offset++;
3073 alloc.sizes[payload.reg] = 2;
3074 }
3075
3076 /* This is actually going to be a MOV, but since only the first dword
3077 * is accessed, we have a special opcode to do just that one. Note
3078 * that this needs to be an operation that will be considered a def
3079 * by live variable analysis, or register allocation will explode.
3080 */
3081 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3082 8, payload, const_offset_reg);
3083 setup->force_writemask_all = true;
3084
3085 setup->ir = inst->ir;
3086 setup->annotation = inst->annotation;
3087 inst->insert_before(block, setup);
3088
3089 /* Similarly, this will only populate the first 4 channels of the
3090 * result register (since we only use smear values from 0-3), but we
3091 * don't tell the optimizer.
3092 */
3093 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3094 inst->src[1] = payload;
3095
3096 invalidate_live_intervals();
3097 } else {
3098 /* Before register allocation, we didn't tell the scheduler about the
3099 * MRF we use. We know it's safe to use this MRF because nothing
3100 * else does except for register spill/unspill, which generates and
3101 * uses its MRF within a single IR instruction.
3102 */
3103 inst->base_mrf = 14;
3104 inst->mlen = 1;
3105 }
3106 }
3107 }
3108
3109 bool
3110 fs_visitor::lower_load_payload()
3111 {
3112 bool progress = false;
3113
3114 int vgrf_to_reg[alloc.count];
3115 int reg_count = 16; /* Leave room for MRF */
3116 for (unsigned i = 0; i < alloc.count; ++i) {
3117 vgrf_to_reg[i] = reg_count;
3118 reg_count += alloc.sizes[i];
3119 }
3120
3121 struct {
3122 bool written:1; /* Whether this register has ever been written */
3123 bool force_writemask_all:1;
3124 bool force_sechalf:1;
3125 } metadata[reg_count];
3126 memset(metadata, 0, sizeof(metadata));
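/* metadata[] tracks, per flattened register, whether it has been written
 * and with which execution controls, so the MOVs emitted below can inherit
 * the right force_sechalf/force_writemask_all flags.
 */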
3127
3128 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3129 int dst_reg;
3130 if (inst->dst.file == GRF) {
3131 dst_reg = vgrf_to_reg[inst->dst.reg];
3132 } else {
3133 /* MRF */
3134 dst_reg = inst->dst.reg;
3135 }
3136
3137 if (inst->dst.file == MRF || inst->dst.file == GRF) {
3138 bool force_sechalf = inst->force_sechalf;
3139 bool toggle_sechalf = inst->dst.width == 16 &&
3140 type_sz(inst->dst.type) == 4;
3141 for (int i = 0; i < inst->regs_written; ++i) {
3142 metadata[dst_reg + i].written = true;
3143 metadata[dst_reg + i].force_sechalf = force_sechalf;
3144 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3145 force_sechalf = (toggle_sechalf != force_sechalf);
3146 }
3147 }
3148
3149 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3150 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3151 fs_reg dst = inst->dst;
3152
3153 for (int i = 0; i < inst->sources; i++) {
3154 dst.width = inst->src[i].effective_width;
3155 dst.type = inst->src[i].type;
3156
3157 if (inst->src[i].file == BAD_FILE) {
3158 /* Do nothing but otherwise increment as normal */
3159 } else if (dst.file == MRF &&
3160 dst.width == 8 &&
3161 brw->has_compr4 &&
3162 i + 4 < inst->sources &&
3163 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3164 fs_reg compr4_dst = dst;
3165 compr4_dst.reg += BRW_MRF_COMPR4;
3166 compr4_dst.width = 16;
3167 fs_reg compr4_src = inst->src[i];
3168 compr4_src.width = 16;
3169 fs_inst *mov = MOV(compr4_dst, compr4_src);
3170 mov->force_writemask_all = true;
3171 inst->insert_before(block, mov);
3172 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3173 inst->src[i + 4].file = BAD_FILE;
3174 } else {
3175 fs_inst *mov = MOV(dst, inst->src[i]);
3176 if (inst->src[i].file == GRF) {
3177 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3178 inst->src[i].reg_offset;
3179 mov->force_sechalf = metadata[src_reg].force_sechalf;
3180 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3181 metadata[dst_reg] = metadata[src_reg];
3182 if (dst.width * type_sz(dst.type) > 32) {
3183 assert((!metadata[src_reg].written ||
3184 !metadata[src_reg].force_sechalf) &&
3185 (!metadata[src_reg + 1].written ||
3186 metadata[src_reg + 1].force_sechalf));
3187 metadata[dst_reg + 1] = metadata[src_reg + 1];
3188 }
3189 } else {
3190 metadata[dst_reg].force_writemask_all = false;
3191 metadata[dst_reg].force_sechalf = false;
3192 if (dst.width == 16) {
3193 metadata[dst_reg + 1].force_writemask_all = false;
3194 metadata[dst_reg + 1].force_sechalf = true;
3195 }
3196 }
3197 inst->insert_before(block, mov);
3198 }
3199
3200 dst = offset(dst, 1);
3201 }
3202
3203 inst->remove(block);
3204 progress = true;
3205 }
3206 }
3207
3208 if (progress)
3209 invalidate_live_intervals();
3210
3211 return progress;
3212 }
3213
3214 void
3215 fs_visitor::dump_instructions()
3216 {
3217 dump_instructions(NULL);
3218 }
3219
3220 void
3221 fs_visitor::dump_instructions(const char *name)
3222 {
3223 FILE *file = stderr;
3224 if (name && geteuid() != 0) {
3225 file = fopen(name, "w");
3226 if (!file)
3227 file = stderr;
3228 }
3229
3230 if (cfg) {
3231 calculate_register_pressure();
3232 int ip = 0, max_pressure = 0;
3233 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3234 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3235 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3236 dump_instruction(inst, file);
3237 ip++;
3238 }
3239 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3240 } else {
3241 int ip = 0;
3242 foreach_in_list(backend_instruction, inst, &instructions) {
3243 fprintf(file, "%4d: ", ip++);
3244 dump_instruction(inst, file);
3245 }
3246 }
3247
3248 if (file != stderr) {
3249 fclose(file);
3250 }
3251 }
3252
3253 void
3254 fs_visitor::dump_instruction(backend_instruction *be_inst)
3255 {
3256 dump_instruction(be_inst, stderr);
3257 }
3258
3259 void
3260 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3261 {
3262 fs_inst *inst = (fs_inst *)be_inst;
3263
3264 if (inst->predicate) {
3265 fprintf(file, "(%cf0.%d) ",
3266 inst->predicate_inverse ? '-' : '+',
3267 inst->flag_subreg);
3268 }
3269
3270 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3271 if (inst->saturate)
3272 fprintf(file, ".sat");
3273 if (inst->conditional_mod) {
3274 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3275 if (!inst->predicate &&
3276 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3277 inst->opcode != BRW_OPCODE_IF &&
3278 inst->opcode != BRW_OPCODE_WHILE))) {
3279 fprintf(file, ".f0.%d", inst->flag_subreg);
3280 }
3281 }
3282 fprintf(file, "(%d) ", inst->exec_size);
3283
3284
3285 switch (inst->dst.file) {
3286 case GRF:
3287 fprintf(file, "vgrf%d", inst->dst.reg);
3288 if (inst->dst.width != dispatch_width)
3289 fprintf(file, "@%d", inst->dst.width);
3290 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3291 inst->dst.subreg_offset)
3292 fprintf(file, "+%d.%d",
3293 inst->dst.reg_offset, inst->dst.subreg_offset);
3294 break;
3295 case MRF:
3296 fprintf(file, "m%d", inst->dst.reg);
3297 break;
3298 case BAD_FILE:
3299 fprintf(file, "(null)");
3300 break;
3301 case UNIFORM:
3302 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3303 break;
3304 case ATTR:
3305 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3306 break;
3307 case HW_REG:
3308 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3309 switch (inst->dst.fixed_hw_reg.nr) {
3310 case BRW_ARF_NULL:
3311 fprintf(file, "null");
3312 break;
3313 case BRW_ARF_ADDRESS:
3314 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3315 break;
3316 case BRW_ARF_ACCUMULATOR:
3317 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3318 break;
3319 case BRW_ARF_FLAG:
3320 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3321 inst->dst.fixed_hw_reg.subnr);
3322 break;
3323 default:
3324 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3325 inst->dst.fixed_hw_reg.subnr);
3326 break;
3327 }
3328 } else {
3329 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3330 }
3331 if (inst->dst.fixed_hw_reg.subnr)
3332 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3333 break;
3334 default:
3335 fprintf(file, "???");
3336 break;
3337 }
3338 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3339
3340 for (int i = 0; i < inst->sources; i++) {
3341 if (inst->src[i].negate)
3342 fprintf(file, "-");
3343 if (inst->src[i].abs)
3344 fprintf(file, "|");
3345 switch (inst->src[i].file) {
3346 case GRF:
3347 fprintf(file, "vgrf%d", inst->src[i].reg);
3348 if (inst->src[i].width != dispatch_width)
3349 fprintf(file, "@%d", inst->src[i].width);
3350 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3351 inst->src[i].subreg_offset)
3352 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3353 inst->src[i].subreg_offset);
3354 break;
3355 case MRF:
3356 fprintf(file, "***m%d***", inst->src[i].reg);
3357 break;
3358 case ATTR:
3359 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3360 break;
3361 case UNIFORM:
3362 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3363 if (inst->src[i].reladdr) {
3364 fprintf(file, "+reladdr");
3365 } else if (inst->src[i].subreg_offset) {
3366 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3367 inst->src[i].subreg_offset);
3368 }
3369 break;
3370 case BAD_FILE:
3371 fprintf(file, "(null)");
3372 break;
3373 case IMM:
3374 switch (inst->src[i].type) {
3375 case BRW_REGISTER_TYPE_F:
3376 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3377 break;
3378 case BRW_REGISTER_TYPE_W:
3379 case BRW_REGISTER_TYPE_D:
3380 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3381 break;
3382 case BRW_REGISTER_TYPE_UW:
3383 case BRW_REGISTER_TYPE_UD:
3384 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3385 break;
3386 case BRW_REGISTER_TYPE_VF:
3387 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3388 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3389 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3390 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3391 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3392 break;
3393 default:
3394 fprintf(file, "???");
3395 break;
3396 }
3397 break;
3398 case HW_REG:
3399 if (inst->src[i].fixed_hw_reg.negate)
3400 fprintf(file, "-");
3401 if (inst->src[i].fixed_hw_reg.abs)
3402 fprintf(file, "|");
3403 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3404 switch (inst->src[i].fixed_hw_reg.nr) {
3405 case BRW_ARF_NULL:
3406 fprintf(file, "null");
3407 break;
3408 case BRW_ARF_ADDRESS:
3409 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3410 break;
3411 case BRW_ARF_ACCUMULATOR:
3412 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3413 break;
3414 case BRW_ARF_FLAG:
3415 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3416 inst->src[i].fixed_hw_reg.subnr);
3417 break;
3418 default:
3419 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3420 inst->src[i].fixed_hw_reg.subnr);
3421 break;
3422 }
3423 } else {
3424 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3425 }
3426 if (inst->src[i].fixed_hw_reg.subnr)
3427 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3428 if (inst->src[i].fixed_hw_reg.abs)
3429 fprintf(file, "|");
3430 break;
3431 default:
3432 fprintf(file, "???");
3433 break;
3434 }
3435 if (inst->src[i].abs)
3436 fprintf(file, "|");
3437
3438 if (inst->src[i].file != IMM) {
3439 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3440 }
3441
3442 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3443 fprintf(file, ", ");
3444 }
3445
3446 fprintf(file, " ");
3447
3448 if (dispatch_width == 16 && inst->exec_size == 8) {
3449 if (inst->force_sechalf)
3450 fprintf(file, "2ndhalf ");
3451 else
3452 fprintf(file, "1sthalf ");
3453 }
3454
3455 fprintf(file, "\n");
3456 }
3457
3458 /**
3459 * Possibly returns an instruction that set up @param reg.
3460 *
3461 * Sometimes we want to take the result of some expression/variable
3462 * dereference tree and rewrite the instruction generating the result
3463 * of the tree. When processing the tree, we know that the
3464 * instructions generated are all writing temporaries that are dead
3465 * outside of this tree. So, if we have some instructions that write
3466 * a temporary, we're free to point that temp write somewhere else.
3467 *
3468 * Note that this doesn't guarantee that the returned instruction wrote
3469 * only reg -- it might be the size=4 destination of a texture instruction.
3470 */
3471 fs_inst *
3472 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3473 fs_inst *end,
3474 const fs_reg &reg)
3475 {
3476 if (end == start ||
3477 end->is_partial_write() ||
3478 reg.reladdr ||
3479 !reg.equals(end->dst)) {
3480 return NULL;
3481 } else {
3482 return end;
3483 }
3484 }
3485
3486 void
3487 fs_visitor::setup_payload_gen6()
3488 {
3489 bool uses_depth =
3490 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3491 unsigned barycentric_interp_modes =
3492 (stage == MESA_SHADER_FRAGMENT) ?
3493 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3494
3495 assert(brw->gen >= 6);
3496
3497 /* R0-1: masks, pixel X/Y coordinates. */
3498 payload.num_regs = 2;
3499 /* R2: only for 32-pixel dispatch. */
3500
3501 /* R3-26: barycentric interpolation coordinates. These appear in the
3502 * same order that they appear in the brw_wm_barycentric_interp_mode
3503 * enum. Each set of coordinates occupies 2 registers if dispatch width
3504 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3505 * appear if they were enabled using the "Barycentric Interpolation
3506 * Mode" bits in WM_STATE.
3507 */
3508 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3509 if (barycentric_interp_modes & (1 << i)) {
3510 payload.barycentric_coord_reg[i] = payload.num_regs;
3511 payload.num_regs += 2;
3512 if (dispatch_width == 16) {
3513 payload.num_regs += 2;
3514 }
3515 }
3516 }
3517
3518 /* R27: interpolated depth if uses source depth */
3519 if (uses_depth) {
3520 payload.source_depth_reg = payload.num_regs;
3521 payload.num_regs++;
3522 if (dispatch_width == 16) {
3523 /* R28: interpolated depth if not SIMD8. */
3524 payload.num_regs++;
3525 }
3526 }
3527 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3528 if (uses_depth) {
3529 payload.source_w_reg = payload.num_regs;
3530 payload.num_regs++;
3531 if (dispatch_width == 16) {
3532 /* R30: interpolated W if not SIMD8. */
3533 payload.num_regs++;
3534 }
3535 }
3536
3537 if (stage == MESA_SHADER_FRAGMENT) {
3538 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3539 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3540 prog_data->uses_pos_offset = key->compute_pos_offset;
3541 /* R31: MSAA position offsets. */
3542 if (prog_data->uses_pos_offset) {
3543 payload.sample_pos_reg = payload.num_regs;
3544 payload.num_regs++;
3545 }
3546 }
3547
3548 /* R32: MSAA input coverage mask */
3549 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3550 assert(brw->gen >= 7);
3551 payload.sample_mask_in_reg = payload.num_regs;
3552 payload.num_regs++;
3553 if (dispatch_width == 16) {
3554 /* R33: input coverage mask if not SIMD8. */
3555 payload.num_regs++;
3556 }
3557 }
3558
3559 /* R34-: bary for 32-pixel. */
3560 /* R58-59: interp W for 32-pixel. */
3561
3562 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3563 source_depth_to_render_target = true;
3564 }
3565 }
3566
3567 void
3568 fs_visitor::setup_vs_payload()
3569 {
3570 /* R0: thread header, R1: urb handles */
3571 payload.num_regs = 2;
3572 }
3573
3574 void
3575 fs_visitor::assign_binding_table_offsets()
3576 {
3577 assert(stage == MESA_SHADER_FRAGMENT);
3578 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3579 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3580 uint32_t next_binding_table_offset = 0;
3581
3582 /* If there are no color regions, we still perform an FB write to a null
3583 * renderbuffer, which we place at surface index 0.
3584 */
3585 prog_data->binding_table.render_target_start = next_binding_table_offset;
3586 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3587
3588 assign_common_binding_table_offsets(next_binding_table_offset);
3589 }
3590
3591 void
3592 fs_visitor::calculate_register_pressure()
3593 {
3594 invalidate_live_intervals();
3595 calculate_live_intervals();
3596
3597 unsigned num_instructions = 0;
3598 foreach_block(block, cfg)
3599 num_instructions += block->instructions.length();
3600
3601 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3602
3603 for (unsigned reg = 0; reg < alloc.count; reg++) {
3604 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3605 regs_live_at_ip[ip] += alloc.sizes[reg];
3606 }
3607 }
3608
3609 void
3610 fs_visitor::optimize()
3611 {
3612 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3613
3614 split_virtual_grfs();
3615
3616 move_uniform_array_access_to_pull_constants();
3617 assign_constant_locations();
3618 demote_pull_constants();
3619
3620 #define OPT(pass, args...) ({ \
3621 pass_num++; \
3622 bool this_progress = pass(args); \
3623 \
3624 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3625 char filename[64]; \
3626 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3627 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3628 \
3629 backend_visitor::dump_instructions(filename); \
3630 } \
3631 \
3632 progress = progress || this_progress; \
3633 this_progress; \
3634 })
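/* OPT() runs a pass, dumps the IR afterwards when DEBUG_OPTIMIZER is set
 * and the pass made progress, and accumulates the result into `progress`.
 */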
3635
3636 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3637 char filename[64];
3638 snprintf(filename, 64, "%s%d-%04d-00-start",
3639 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3640
3641 backend_visitor::dump_instructions(filename);
3642 }
3643
3644 bool progress;
3645 int iteration = 0;
3646 int pass_num = 0;
3647 do {
3648 progress = false;
3649 pass_num = 0;
3650 iteration++;
3651
3652 OPT(remove_duplicate_mrf_writes);
3653
3654 OPT(opt_algebraic);
3655 OPT(opt_cse);
3656 OPT(opt_copy_propagate);
3657 OPT(opt_peephole_predicated_break);
3658 OPT(opt_cmod_propagation);
3659 OPT(dead_code_eliminate);
3660 OPT(opt_peephole_sel);
3661 OPT(dead_control_flow_eliminate, this);
3662 OPT(opt_register_renaming);
3663 OPT(opt_saturate_propagation);
3664 OPT(register_coalesce);
3665 OPT(compute_to_mrf);
3666
3667 OPT(compact_virtual_grfs);
3668 } while (progress);
3669
3670 pass_num = 0;
3671
3672 if (OPT(lower_load_payload)) {
3673 split_virtual_grfs();
3674 OPT(register_coalesce);
3675 OPT(compute_to_mrf);
3676 OPT(dead_code_eliminate);
3677 }
3678
3679 OPT(opt_combine_constants);
3680
3681 lower_uniform_pull_constant_loads();
3682 }
3683
3684 /**
3685 * Three-source instructions must have a GRF/MRF destination register.
3686 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3687 */
3688 void
3689 fs_visitor::fixup_3src_null_dest()
3690 {
3691 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3692 if (inst->is_3src() && inst->dst.is_null()) {
3693 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3694 inst->dst.type);
3695 }
3696 }
3697 }
3698
3699 void
3700 fs_visitor::allocate_registers()
3701 {
3702 bool allocated_without_spills;
3703
3704 static const enum instruction_scheduler_mode pre_modes[] = {
3705 SCHEDULE_PRE,
3706 SCHEDULE_PRE_NON_LIFO,
3707 SCHEDULE_PRE_LIFO,
3708 };
3709
3710 /* Try each scheduling heuristic to see if it can successfully register
3711 * allocate without spilling. They should be ordered by decreasing
3712 * performance but increasing likelihood of allocating.
3713 */
3714 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3715 schedule_instructions(pre_modes[i]);
3716
3717 if (0) {
3718 assign_regs_trivial();
3719 allocated_without_spills = true;
3720 } else {
3721 allocated_without_spills = assign_regs(false);
3722 }
3723 if (allocated_without_spills)
3724 break;
3725 }
3726
3727 if (!allocated_without_spills) {
3728 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3729 "Vertex" : "Fragment";
3730
3731 /* We assume that any spilling is worse than just dropping back to
3732 * SIMD8. There's probably some intermediate point where
3733 * SIMD16 with a couple of spills is still better.
3734 */
3735 if (dispatch_width == 16) {
3736 fail("Failure to register allocate. Reduce number of "
3737 "live scalar values to avoid this.");
3738 } else {
3739 perf_debug("%s shader triggered register spilling. "
3740 "Try reducing the number of live scalar values to "
3741 "improve performance.\n", stage_name);
3742 }
3743
3744 /* Since we're out of heuristics, just go spill registers until we
3745 * get an allocation.
3746 */
3747 while (!assign_regs(true)) {
3748 if (failed)
3749 break;
3750 }
3751 }
3752
3753 /* This must come after all optimization and register allocation, since
3754 * it inserts dead code that happens to have side effects, and it does
3755 * so based on the actual physical registers in use.
3756 */
3757 insert_gen4_send_dependency_workarounds();
3758
3759 if (failed)
3760 return;
3761
3762 if (!allocated_without_spills)
3763 schedule_instructions(SCHEDULE_POST);
3764
3765 if (last_scratch > 0)
3766 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3767 }
3768
3769 bool
3770 fs_visitor::run_vs()
3771 {
3772 assert(stage == MESA_SHADER_VERTEX);
3773
3774 assign_common_binding_table_offsets(0);
3775 setup_vs_payload();
3776
3777 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3778 emit_shader_time_begin();
3779
3780 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3781 base_ir = ir;
3782 this->result = reg_undef;
3783 ir->accept(this);
3784 }
3785 base_ir = NULL;
3786 if (failed)
3787 return false;
3788
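/* Write the shader's vertex outputs to the URB for the next pipeline
 * stage to consume.
 */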
3789 emit_urb_writes();
3790
3791 calculate_cfg();
3792
3793 optimize();
3794
3795 assign_curb_setup();
3796 assign_vs_urb_setup();
3797
3798 fixup_3src_null_dest();
3799 allocate_registers();
3800
3801 return !failed;
3802 }
3803
3804 bool
3805 fs_visitor::run_fs()
3806 {
3807 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3808 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3809
3810 assert(stage == MESA_SHADER_FRAGMENT);
3811
3812 sanity_param_count = prog->Parameters->NumParameters;
3813
3814 assign_binding_table_offsets();
3815
3816 if (brw->gen >= 6)
3817 setup_payload_gen6();
3818 else
3819 setup_payload_gen4();
3820
3821 if (0) {
3822 emit_dummy_fs();
3823 } else if (brw->use_rep_send && dispatch_width == 16) {
3824 emit_repclear_shader();
3825 } else {
3826 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3827 emit_shader_time_begin();
3828
3829 calculate_urb_setup();
3830 if (prog->InputsRead > 0) {
3831 if (brw->gen < 6)
3832 emit_interpolation_setup_gen4();
3833 else
3834 emit_interpolation_setup_gen6();
3835 }
3836
3837 /* We handle discards by keeping track of the still-live pixels in f0.1.
3838 * Initialize it with the dispatched pixels.
3839 */
3840 if (wm_prog_data->uses_kill) {
3841 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3842 discard_init->flag_subreg = 1;
3843 }
3844
3845 /* Generate FS IR for main(). (The visitor only descends into
3846 * functions called "main".)
3847 */
3848 if (shader) {
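/* Setting INTEL_USE_NIR opts into the NIR-based path; otherwise walk the
 * linked GLSL IR directly.
 */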
3849 if (getenv("INTEL_USE_NIR") != NULL) {
3850 emit_nir_code();
3851 } else {
3852 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3853 base_ir = ir;
3854 this->result = reg_undef;
3855 ir->accept(this);
3856 }
3857 }
3858 } else {
3859 emit_fragment_program_code();
3860 }
3861 base_ir = NULL;
3862 if (failed)
3863 return false;
3864
3865 emit(FS_OPCODE_PLACEHOLDER_HALT);
3866
3867 if (wm_key->alpha_test_func)
3868 emit_alpha_test();
3869
3870 emit_fb_writes();
3871
3872 calculate_cfg();
3873
3874 optimize();
3875
3876 assign_curb_setup();
3877 assign_urb_setup();
3878
3879 fixup_3src_null_dest();
3880 allocate_registers();
3881
3882 if (failed)
3883 return false;
3884 }
3885
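/* Record how many register blocks the program compiled for this dispatch
 * width uses.
 */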
3886 if (dispatch_width == 8)
3887 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3888 else
3889 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3890
3891 /* If any state parameters were appended, then ParameterValues could have
3892 * been realloced, in which case the driver uniform storage set up by
3893 * _mesa_associate_uniform_storage() would point to freed memory. Make
3894 * sure that didn't happen.
3895 */
3896 assert(sanity_param_count == prog->Parameters->NumParameters);
3897
3898 return !failed;
3899 }
3900
3901 const unsigned *
3902 brw_wm_fs_emit(struct brw_context *brw,
3903 void *mem_ctx,
3904 const struct brw_wm_prog_key *key,
3905 struct brw_wm_prog_data *prog_data,
3906 struct gl_fragment_program *fp,
3907 struct gl_shader_program *prog,
3908 unsigned *final_assembly_size)
3909 {
3910 bool start_busy = false;
3911 double start_time = 0;
3912
3913 if (unlikely(brw->perf_debug)) {
3914 start_busy = (brw->batch.last_bo &&
3915 drm_intel_bo_busy(brw->batch.last_bo));
3916 start_time = get_time();
3917 }
3918
3919 struct brw_shader *shader = NULL;
3920 if (prog)
3921 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3922
3923 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3924 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3925
3926 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3927 */
3928 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3929 if (!v.run_fs()) {
3930 if (prog) {
3931 prog->LinkStatus = false;
3932 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3933 }
3934
3935 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3936 v.fail_msg);
3937
3938 return NULL;
3939 }
3940
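/* On Gen5+, also try a SIMD16 compile; we fall back to SIMD8 alone if
 * SIMD16 is disabled, unsupported for this shader, or fails to compile.
 */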
3941 cfg_t *simd16_cfg = NULL;
3942 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3943 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3944 brw->use_rep_send)) {
3945 if (!v.simd16_unsupported) {
3946 /* Try a SIMD16 compile */
3947 v2.import_uniforms(&v);
3948 if (!v2.run_fs()) {
3949 perf_debug("SIMD16 shader failed to compile, falling back to "
3950 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3951 } else {
3952 simd16_cfg = v2.cfg;
3953 }
3954 } else {
3955 perf_debug("SIMD16 shader unsupported, falling back to "
3956 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3957 }
3958 }
3959
3960 cfg_t *simd8_cfg;
3961 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3962 if (no_simd8 && simd16_cfg) {
3963 simd8_cfg = NULL;
3964 prog_data->no_8 = true;
3965 } else {
3966 simd8_cfg = v.cfg;
3967 prog_data->no_8 = false;
3968 }
3969
3970 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
3971 &fp->Base, v.runtime_check_aads_emit, "FS");
3972
3973 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3974 char *name;
3975 if (prog)
3976 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
3977 prog->Label ? prog->Label : "unnamed",
3978 prog->Name);
3979 else
3980 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
3981
3982 g.enable_debug(name);
3983 }
3984
3985 if (simd8_cfg)
3986 g.generate_code(simd8_cfg, 8);
3987 if (simd16_cfg)
3988 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
3989
3990 if (unlikely(brw->perf_debug) && shader) {
3991 if (shader->compiled_once)
3992 brw_wm_debug_recompile(brw, prog, key);
3993 shader->compiled_once = true;
3994
3995 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3996 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3997 (get_time() - start_time) * 1000);
3998 }
3999 }
4000
4001 return g.get_assembly(final_assembly_size);
4002 }
4003
4004 extern "C" bool
4005 brw_fs_precompile(struct gl_context *ctx,
4006 struct gl_shader_program *shader_prog,
4007 struct gl_program *prog)
4008 {
4009 struct brw_context *brw = brw_context(ctx);
4010 struct brw_wm_prog_key key;
4011
4012 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4013 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4014 bool program_uses_dfdy = fp->UsesDFdy;
4015
4016 memset(&key, 0, sizeof(key));
4017
4018 if (brw->gen < 6) {
4019 if (fp->UsesKill)
4020 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4021
4022 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4023 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4024
4025 /* Just assume depth testing. */
4026 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4027 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4028 }
4029
4030 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4031 BRW_FS_VARYING_INPUT_MASK) > 16)
4032 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4033
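/* Guess the sampler swizzles for the precompile key.  Without shader
 * channel select, shadow samplers need the DEPTH_TEXTURE_MODE swizzle
 * applied in the shader.
 */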
4034 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4035 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4036 for (unsigned i = 0; i < sampler_count; i++) {
4037 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4038 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4039 key.tex.swizzles[i] =
4040 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4041 } else {
4042 /* Color sampler: assume no swizzling. */
4043 key.tex.swizzles[i] = SWIZZLE_XYZW;
4044 }
4045 }
4046
4047 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4048 key.drawable_height = ctx->DrawBuffer->Height;
4049 }
4050
4051 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4052 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4053 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4054
4055 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4056 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4057 key.nr_color_regions > 1;
4058 }
4059
4060 key.program_string_id = bfp->id;
4061
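/* do_wm_prog() updates the context's WM program state; save it around the
 * call so the precompile leaves the bound program untouched.
 */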
4062 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4063 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4064
4065 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
4066
4067 brw->wm.base.prog_offset = old_prog_offset;
4068 brw->wm.prog_data = old_prog_data;
4069
4070 return success;
4071 }