mesa.git: src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/hash_table.h"
43 #include "brw_context.h"
44 #include "brw_eu.h"
45 #include "brw_wm.h"
46 }
47 #include "brw_fs.h"
48 #include "brw_cfg.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53 #include "program/sampler.h"
54
55 void
56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
57 const fs_reg *src, unsigned sources)
58 {
59 memset(this, 0, sizeof(*this));
60
61 this->src = new fs_reg[MAX2(sources, 3)];
62 for (unsigned i = 0; i < sources; i++)
63 this->src[i] = src[i];
64
65 this->opcode = opcode;
66 this->dst = dst;
67 this->sources = sources;
68 this->exec_size = exec_size;
69
70 assert(dst.file != IMM && dst.file != UNIFORM);
71
72 /* If exec_size == 0, try to guess it from the registers. Since all
73 * manner of things may use hardware registers, we first try to guess
74 * based on GRF registers. If this fails, we will go ahead and take the
75 * width from the destination register.
76 */
77 if (this->exec_size == 0) {
78 if (dst.file == GRF) {
79 this->exec_size = dst.width;
80 } else {
81 for (unsigned i = 0; i < sources; ++i) {
82 if (src[i].file != GRF && src[i].file != ATTR)
83 continue;
84
85 if (this->exec_size <= 1)
86 this->exec_size = src[i].width;
87 assert(src[i].width == 1 || src[i].width == this->exec_size);
88 }
89 }
90
91 if (this->exec_size == 0 && dst.file != BAD_FILE)
92 this->exec_size = dst.width;
93 }
94 assert(this->exec_size != 0);
95
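   /* Record each operand's effective width: a width-1 (scalar) GRF source,
    * an immediate, or a uniform is effectively broadcast to the full
    * execution width of the instruction.
    */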
96 for (unsigned i = 0; i < sources; ++i) {
97 switch (this->src[i].file) {
98 case BAD_FILE:
99 this->src[i].effective_width = 8;
100 break;
101 case GRF:
102 case HW_REG:
103 case ATTR:
104 assert(this->src[i].width > 0);
105 if (this->src[i].width == 1) {
106 this->src[i].effective_width = this->exec_size;
107 } else {
108 this->src[i].effective_width = this->src[i].width;
109 }
110 break;
111 case IMM:
112 case UNIFORM:
113 this->src[i].effective_width = this->exec_size;
114 break;
115 default:
116 unreachable("Invalid source register file");
117 }
118 }
119 this->dst.effective_width = this->exec_size;
120
121 this->conditional_mod = BRW_CONDITIONAL_NONE;
122
123 /* This will be the case for almost all instructions. */
124 switch (dst.file) {
125 case GRF:
126 case HW_REG:
127 case MRF:
128 case ATTR:
129 this->regs_written =
130 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
131 break;
132 case BAD_FILE:
133 this->regs_written = 0;
134 break;
135 case IMM:
136 case UNIFORM:
137 unreachable("Invalid destination register file");
138 default:
139 unreachable("Invalid register file");
140 }
141
142 this->writes_accumulator = false;
143 }
144
145 fs_inst::fs_inst()
146 {
147 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
148 }
149
150 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
151 {
152 init(opcode, exec_size, reg_undef, NULL, 0);
153 }
154
155 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
156 {
157 init(opcode, 0, dst, NULL, 0);
158 }
159
160 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
161 const fs_reg &src0)
162 {
163 const fs_reg src[1] = { src0 };
164 init(opcode, exec_size, dst, src, 1);
165 }
166
167 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
168 {
169 const fs_reg src[1] = { src0 };
170 init(opcode, 0, dst, src, 1);
171 }
172
173 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
174 const fs_reg &src0, const fs_reg &src1)
175 {
176 const fs_reg src[2] = { src0, src1 };
177 init(opcode, exec_size, dst, src, 2);
178 }
179
180 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
181 const fs_reg &src1)
182 {
183 const fs_reg src[2] = { src0, src1 };
184 init(opcode, 0, dst, src, 2);
185 }
186
187 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
188 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
189 {
190 const fs_reg src[3] = { src0, src1, src2 };
191 init(opcode, exec_size, dst, src, 3);
192 }
193
194 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
195 const fs_reg &src1, const fs_reg &src2)
196 {
197 const fs_reg src[3] = { src0, src1, src2 };
198 init(opcode, 0, dst, src, 3);
199 }
200
201 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
202 const fs_reg src[], unsigned sources)
203 {
204 init(opcode, 0, dst, src, sources);
205 }
206
207 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
208 const fs_reg src[], unsigned sources)
209 {
210 init(opcode, exec_width, dst, src, sources);
211 }
212
213 fs_inst::fs_inst(const fs_inst &that)
214 {
215 memcpy(this, &that, sizeof(that));
216
217 this->src = new fs_reg[MAX2(that.sources, 3)];
218
219 for (unsigned i = 0; i < that.sources; i++)
220 this->src[i] = that.src[i];
221 }
222
223 fs_inst::~fs_inst()
224 {
225 delete[] this->src;
226 }
227
228 void
229 fs_inst::resize_sources(uint8_t num_sources)
230 {
231 if (this->sources != num_sources) {
232 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
233
234 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
235 src[i] = this->src[i];
236
237 delete[] this->src;
238 this->src = src;
239 this->sources = num_sources;
240 }
241 }
242
243 #define ALU1(op) \
244 fs_inst * \
245 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
246 { \
247 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
248 }
249
250 #define ALU2(op) \
251 fs_inst * \
252 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
253 const fs_reg &src1) \
254 { \
255 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
256 }
257
258 #define ALU2_ACC(op) \
259 fs_inst * \
260 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
261 const fs_reg &src1) \
262 { \
263 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
264 inst->writes_accumulator = true; \
265 return inst; \
266 }
267
268 #define ALU3(op) \
269 fs_inst * \
270 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
271 const fs_reg &src1, const fs_reg &src2) \
272 { \
273 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
274 }
275
276 ALU1(NOT)
277 ALU1(MOV)
278 ALU1(FRC)
279 ALU1(RNDD)
280 ALU1(RNDE)
281 ALU1(RNDZ)
282 ALU2(ADD)
283 ALU2(MUL)
284 ALU2_ACC(MACH)
285 ALU2(AND)
286 ALU2(OR)
287 ALU2(XOR)
288 ALU2(SHL)
289 ALU2(SHR)
290 ALU2(ASR)
291 ALU3(LRP)
292 ALU1(BFREV)
293 ALU3(BFE)
294 ALU2(BFI1)
295 ALU3(BFI2)
296 ALU1(FBH)
297 ALU1(FBL)
298 ALU1(CBIT)
299 ALU3(MAD)
300 ALU2_ACC(ADDC)
301 ALU2_ACC(SUBB)
302 ALU2(SEL)
303 ALU2(MAC)
304
305 /** Gen4 predicated IF. */
306 fs_inst *
307 fs_visitor::IF(enum brw_predicate predicate)
308 {
309 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
310 inst->predicate = predicate;
311 return inst;
312 }
313
314 /** Gen6 IF with embedded comparison. */
315 fs_inst *
316 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
317 enum brw_conditional_mod condition)
318 {
319 assert(brw->gen == 6);
320 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
321 reg_null_d, src0, src1);
322 inst->conditional_mod = condition;
323 return inst;
324 }
325
326 /**
327 * CMP: Sets the low bit of the destination channels with the result
328 * of the comparison, while the upper bits are undefined, and updates
329 * the flag register with the packed 16 bits of the result.
330 */
331 fs_inst *
332 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
333 enum brw_conditional_mod condition)
334 {
335 fs_inst *inst;
336
337 /* Take the instruction:
338 *
339 * CMP null<d> src0<f> src1<f>
340 *
341 * Original gen4 does type conversion to the destination type before
342 * comparison, producing garbage results for floating point comparisons.
343 *
344 * The destination type doesn't matter on newer generations, so we set the
345 * type to match src0 so we can compact the instruction.
346 */
347 dst.type = src0.type;
348 if (dst.file == HW_REG)
349 dst.fixed_hw_reg.type = dst.type;
350
351 resolve_ud_negate(&src0);
352 resolve_ud_negate(&src1);
353
354 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
355 inst->conditional_mod = condition;
356
357 return inst;
358 }
359
360 fs_inst *
361 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
362 {
363 uint8_t exec_size = dst.width;
364 for (int i = 0; i < sources; ++i) {
365 assert(src[i].width % dst.width == 0);
366 if (src[i].width > exec_size)
367 exec_size = src[i].width;
368 }
369
370 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
371 dst, src, sources);
372 inst->regs_written = 0;
373 for (int i = 0; i < sources; ++i) {
374 /* The LOAD_PAYLOAD instruction only really makes sense if we are
375 * dealing with whole registers. If this ever changes, we can deal
376 * with it later.
377 */
378 int size = inst->src[i].effective_width * type_sz(src[i].type);
379 assert(size % 32 == 0);
380 inst->regs_written += (size + 31) / 32;
381 }
382
383 return inst;
384 }
385
386 exec_list
387 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
388 const fs_reg &surf_index,
389 const fs_reg &varying_offset,
390 uint32_t const_offset)
391 {
392 exec_list instructions;
393 fs_inst *inst;
394
395 /* We have our constant surface use a pitch of 4 bytes, so our index can
396 * be any component of a vector, and then we load 4 contiguous
397 * components starting from that.
398 *
399 * We break down the const_offset to a portion added to the variable
400 * offset and a portion done using reg_offset, which means that if you
401 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
402 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
403 * CSE can later notice that those loads are all the same and eliminate
404 * the redundant ones.
405 */
406 fs_reg vec4_offset = vgrf(glsl_type::int_type);
407 instructions.push_tail(ADD(vec4_offset,
408 varying_offset, fs_reg(const_offset & ~3)));
409
410 int scale = 1;
411 if (brw->gen == 4 && dst.width == 8) {
412 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
413 * u, v, r) as parameters, or we can just use the SIMD16 message
414 * consisting of (header, u). We choose the second, at the cost of a
415 * longer return length.
416 */
417 scale = 2;
418 }
419
420 enum opcode op;
421 if (brw->gen >= 7)
422 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
423 else
424 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
425
426 assert(dst.width % 8 == 0);
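   /* The load returns a full vec4: four components, each dst.width / 8
    * registers wide, times 'scale' for the gen4 SIMD16-message case above.
    */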
427 int regs_written = 4 * (dst.width / 8) * scale;
428 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
429 dst.type, dst.width);
430 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
431 inst->regs_written = regs_written;
432 instructions.push_tail(inst);
433
434 if (brw->gen < 7) {
435 inst->base_mrf = 13;
436 inst->header_present = true;
437 if (brw->gen == 4)
438 inst->mlen = 3;
439 else
440 inst->mlen = 1 + dispatch_width / 8;
441 }
442
443 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
444 instructions.push_tail(MOV(dst, result));
445
446 return instructions;
447 }
448
449 /**
450 * A helper for MOV generation for fixing up broken hardware SEND dependency
451 * handling.
452 */
453 fs_inst *
454 fs_visitor::DEP_RESOLVE_MOV(int grf)
455 {
456 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
457
458 inst->ir = NULL;
459 inst->annotation = "send dependency resolve";
460
461 /* The caller always wants uncompressed to emit the minimal extra
462 * dependencies, and to avoid having to deal with aligning its regs to 2.
463 */
464 inst->exec_size = 8;
465
466 return inst;
467 }
468
469 bool
470 fs_inst::equals(fs_inst *inst) const
471 {
472 return (opcode == inst->opcode &&
473 dst.equals(inst->dst) &&
474 src[0].equals(inst->src[0]) &&
475 src[1].equals(inst->src[1]) &&
476 src[2].equals(inst->src[2]) &&
477 saturate == inst->saturate &&
478 predicate == inst->predicate &&
479 conditional_mod == inst->conditional_mod &&
480 mlen == inst->mlen &&
481 base_mrf == inst->base_mrf &&
482 target == inst->target &&
483 eot == inst->eot &&
484 header_present == inst->header_present &&
485 shadow_compare == inst->shadow_compare &&
486 exec_size == inst->exec_size &&
487 offset == inst->offset);
488 }
489
490 bool
491 fs_inst::overwrites_reg(const fs_reg &reg) const
492 {
493 return (reg.file == dst.file &&
494 reg.reg == dst.reg &&
495 reg.reg_offset >= dst.reg_offset &&
496 reg.reg_offset < dst.reg_offset + regs_written);
497 }
498
499 bool
500 fs_inst::is_send_from_grf() const
501 {
502 switch (opcode) {
503 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
504 case SHADER_OPCODE_SHADER_TIME_ADD:
505 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
506 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
507 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
508 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
509 case SHADER_OPCODE_UNTYPED_ATOMIC:
510 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
511 case SHADER_OPCODE_URB_WRITE_SIMD8:
512 return true;
513 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
514 return src[1].file == GRF;
515 case FS_OPCODE_FB_WRITE:
516 return src[0].file == GRF;
517 default:
518 if (is_tex())
519 return src[0].file == GRF;
520
521 return false;
522 }
523 }
524
525 bool
526 fs_inst::can_do_source_mods(struct brw_context *brw)
527 {
528 if (brw->gen == 6 && is_math())
529 return false;
530
531 if (is_send_from_grf())
532 return false;
533
534 if (!backend_instruction::can_do_source_mods())
535 return false;
536
537 return true;
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Vector float immediate value constructor. */
585 fs_reg::fs_reg(uint8_t vf[4])
586 {
587 init();
588 this->file = IMM;
589 this->type = BRW_REGISTER_TYPE_VF;
590 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
591 }
592
593 /** Vector float immediate value constructor. */
594 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
595 {
596 init();
597 this->file = IMM;
598 this->type = BRW_REGISTER_TYPE_VF;
599 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
600 (vf1 << 8) |
601 (vf2 << 16) |
602 (vf3 << 24);
603 }
604
605 /** Fixed brw_reg. */
606 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
607 {
608 init();
609 this->file = HW_REG;
610 this->fixed_hw_reg = fixed_hw_reg;
611 this->type = fixed_hw_reg.type;
612 this->width = 1 << fixed_hw_reg.width;
613 }
614
615 bool
616 fs_reg::equals(const fs_reg &r) const
617 {
618 return (file == r.file &&
619 reg == r.reg &&
620 reg_offset == r.reg_offset &&
621 subreg_offset == r.subreg_offset &&
622 type == r.type &&
623 negate == r.negate &&
624 abs == r.abs &&
625 !reladdr && !r.reladdr &&
626 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
627 width == r.width &&
628 stride == r.stride);
629 }
630
631 fs_reg &
632 fs_reg::set_smear(unsigned subreg)
633 {
634 assert(file != HW_REG && file != IMM);
635 subreg_offset = subreg * type_sz(type);
636 stride = 0;
637 return *this;
638 }
639
640 bool
641 fs_reg::is_contiguous() const
642 {
643 return stride == 1;
644 }
645
646 int
647 fs_visitor::type_size(const struct glsl_type *type)
648 {
649 unsigned int size, i;
650
651 switch (type->base_type) {
652 case GLSL_TYPE_UINT:
653 case GLSL_TYPE_INT:
654 case GLSL_TYPE_FLOAT:
655 case GLSL_TYPE_BOOL:
656 return type->components();
657 case GLSL_TYPE_ARRAY:
658 return type_size(type->fields.array) * type->length;
659 case GLSL_TYPE_STRUCT:
660 size = 0;
661 for (i = 0; i < type->length; i++) {
662 size += type_size(type->fields.structure[i].type);
663 }
664 return size;
665 case GLSL_TYPE_SAMPLER:
666 /* Samplers take up no register space, since they're baked in at
667 * link time.
668 */
669 return 0;
670 case GLSL_TYPE_ATOMIC_UINT:
671 return 0;
672 case GLSL_TYPE_IMAGE:
673 case GLSL_TYPE_VOID:
674 case GLSL_TYPE_ERROR:
675 case GLSL_TYPE_INTERFACE:
676 unreachable("not reached");
677 }
678
679 return 0;
680 }
681
682 fs_reg
683 fs_visitor::get_timestamp()
684 {
685 assert(brw->gen >= 7);
686
687 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
688 BRW_ARF_TIMESTAMP,
689 0),
690 BRW_REGISTER_TYPE_UD));
691
692 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
693
694 fs_inst *mov = emit(MOV(dst, ts));
695 /* We want to read the 3 fields we care about even if it's not enabled in
696 * the dispatch.
697 */
698 mov->force_writemask_all = true;
699
700 /* The caller wants the low 32 bits of the timestamp. Since it's running
701     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
702 * which is plenty of time for our purposes. It is identical across the
703 * EUs, but since it's tracking GPU core speed it will increment at a
704 * varying rate as render P-states change.
705 *
706 * The caller could also check if render P-states have changed (or anything
707 * else that might disrupt timing) by setting smear to 2 and checking if
708 * that field is != 0.
709 */
710 dst.set_smear(0);
711
712 return dst;
713 }
714
715 void
716 fs_visitor::emit_shader_time_begin()
717 {
718 current_annotation = "shader time start";
719 shader_start_time = get_timestamp();
720 }
721
722 void
723 fs_visitor::emit_shader_time_end()
724 {
725 current_annotation = "shader time end";
726
727 enum shader_time_shader_type type, written_type, reset_type;
728 switch (stage) {
729 case MESA_SHADER_VERTEX:
730 type = ST_VS;
731 written_type = ST_VS_WRITTEN;
732 reset_type = ST_VS_RESET;
733 break;
734 case MESA_SHADER_GEOMETRY:
735 type = ST_GS;
736 written_type = ST_GS_WRITTEN;
737 reset_type = ST_GS_RESET;
738 break;
739 case MESA_SHADER_FRAGMENT:
740 if (dispatch_width == 8) {
741 type = ST_FS8;
742 written_type = ST_FS8_WRITTEN;
743 reset_type = ST_FS8_RESET;
744 } else {
745 assert(dispatch_width == 16);
746 type = ST_FS16;
747 written_type = ST_FS16_WRITTEN;
748 reset_type = ST_FS16_RESET;
749 }
750 break;
751 default:
752 unreachable("fs_visitor::emit_shader_time_end missing code");
753 }
754
755 fs_reg shader_end_time = get_timestamp();
756
757 /* Check that there weren't any timestamp reset events (assuming these
758 * were the only two timestamp reads that happened).
759 */
760 fs_reg reset = shader_end_time;
761 reset.set_smear(2);
762 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
763 test->conditional_mod = BRW_CONDITIONAL_Z;
764 emit(IF(BRW_PREDICATE_NORMAL));
765
766 fs_reg start = shader_start_time;
767 start.negate = true;
768 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
769 emit(ADD(diff, start, shader_end_time));
770
771 /* If there were no instructions between the two timestamp gets, the diff
772 * is 2 cycles. Remove that overhead, so I can forget about that when
773 * trying to determine the time taken for single instructions.
774 */
775 emit(ADD(diff, diff, fs_reg(-2u)));
776
777 emit_shader_time_write(type, diff);
778 emit_shader_time_write(written_type, fs_reg(1u));
779 emit(BRW_OPCODE_ELSE);
780 emit_shader_time_write(reset_type, fs_reg(1u));
781 emit(BRW_OPCODE_ENDIF);
782 }
783
784 void
785 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
786 fs_reg value)
787 {
788 int shader_time_index =
789 brw_get_shader_time_index(brw, shader_prog, prog, type);
790 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
791
792 fs_reg payload;
793 if (dispatch_width == 8)
794 payload = vgrf(glsl_type::uvec2_type);
795 else
796 payload = vgrf(glsl_type::uint_type);
797
798 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
799 fs_reg(), payload, offset, value));
800 }
801
802 void
803 fs_visitor::vfail(const char *format, va_list va)
804 {
805 char *msg;
806
807 if (failed)
808 return;
809
810 failed = true;
811
812 msg = ralloc_vasprintf(mem_ctx, format, va);
813 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
814
815 this->fail_msg = msg;
816
817 if (debug_enabled) {
818 fprintf(stderr, "%s", msg);
819 }
820 }
821
822 void
823 fs_visitor::fail(const char *format, ...)
824 {
825 va_list va;
826
827 va_start(va, format);
828 vfail(format, va);
829 va_end(va);
830 }
831
832 /**
833 * Mark this program as impossible to compile in SIMD16 mode.
834 *
835 * During the SIMD8 compile (which happens first), we can detect and flag
836 * things that are unsupported in SIMD16 mode, so the compiler can skip
837 * the SIMD16 compile altogether.
838 *
839 * During a SIMD16 compile (if one happens anyway), this just calls fail().
840 */
841 void
842 fs_visitor::no16(const char *format, ...)
843 {
844 va_list va;
845
846 va_start(va, format);
847
848 if (dispatch_width == 16) {
849 vfail(format, va);
850 } else {
851 simd16_unsupported = true;
852
853 if (brw->perf_debug) {
854 if (no16_msg)
855 ralloc_vasprintf_append(&no16_msg, format, va);
856 else
857 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
858 }
859 }
860
861 va_end(va);
862 }
863
864 fs_inst *
865 fs_visitor::emit(enum opcode opcode)
866 {
867 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
868 }
869
870 fs_inst *
871 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
872 {
873 return emit(new(mem_ctx) fs_inst(opcode, dst));
874 }
875
876 fs_inst *
877 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
878 {
879 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
880 }
881
882 fs_inst *
883 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
884 const fs_reg &src1)
885 {
886 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
887 }
888
889 fs_inst *
890 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
891 const fs_reg &src1, const fs_reg &src2)
892 {
893 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
894 }
895
896 fs_inst *
897 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
898 fs_reg src[], int sources)
899 {
900 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
901 }
902
903 /**
904 * Returns true if the instruction has a flag that means it won't
905 * update an entire destination register.
906 *
907 * For example, dead code elimination and live variable analysis want to know
908 * when a write to a variable screens off any preceding values that were in
909 * it.
910 */
911 bool
912 fs_inst::is_partial_write() const
913 {
914 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
915 (this->dst.width * type_sz(this->dst.type)) < 32 ||
916 !this->dst.is_contiguous());
917 }
918
919 int
920 fs_inst::regs_read(int arg) const
921 {
922 if (is_tex() && arg == 0 && src[0].file == GRF) {
923 return mlen;
924 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
925 return mlen;
926 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
927 return mlen;
928 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
929 return mlen;
930 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
931 return mlen;
932 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
933 return mlen;
934 }
935
936 switch (src[arg].file) {
937 case BAD_FILE:
938 case UNIFORM:
939 case IMM:
940 return 1;
941 case GRF:
942 case HW_REG:
943 if (src[arg].stride == 0) {
944 return 1;
945 } else {
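      /* For example, a width-16 float source with stride 1 covers
       * 16 * 4 = 64 bytes, i.e. two 32-byte GRFs.
       */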
946 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
947 return (size + 31) / 32;
948 }
949 case MRF:
950 unreachable("MRF registers are not allowed as sources");
951 default:
952 unreachable("Invalid register file");
953 }
954 }
955
956 bool
957 fs_inst::reads_flag() const
958 {
959 return predicate;
960 }
961
962 bool
963 fs_inst::writes_flag() const
964 {
965 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
966 opcode != BRW_OPCODE_IF &&
967 opcode != BRW_OPCODE_WHILE)) ||
968 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
969 }
970
971 /**
972 * Returns how many MRFs an FS opcode will write over.
973 *
974 * Note that this is not the 0 or 1 implied writes in an actual gen
975 * instruction -- the FS opcodes often generate MOVs in addition.
976 */
977 int
978 fs_visitor::implied_mrf_writes(fs_inst *inst)
979 {
980 if (inst->mlen == 0)
981 return 0;
982
983 if (inst->base_mrf == -1)
984 return 0;
985
986 switch (inst->opcode) {
987 case SHADER_OPCODE_RCP:
988 case SHADER_OPCODE_RSQ:
989 case SHADER_OPCODE_SQRT:
990 case SHADER_OPCODE_EXP2:
991 case SHADER_OPCODE_LOG2:
992 case SHADER_OPCODE_SIN:
993 case SHADER_OPCODE_COS:
994 return 1 * dispatch_width / 8;
995 case SHADER_OPCODE_POW:
996 case SHADER_OPCODE_INT_QUOTIENT:
997 case SHADER_OPCODE_INT_REMAINDER:
998 return 2 * dispatch_width / 8;
999 case SHADER_OPCODE_TEX:
1000 case FS_OPCODE_TXB:
1001 case SHADER_OPCODE_TXD:
1002 case SHADER_OPCODE_TXF:
1003 case SHADER_OPCODE_TXF_CMS:
1004 case SHADER_OPCODE_TXF_MCS:
1005 case SHADER_OPCODE_TG4:
1006 case SHADER_OPCODE_TG4_OFFSET:
1007 case SHADER_OPCODE_TXL:
1008 case SHADER_OPCODE_TXS:
1009 case SHADER_OPCODE_LOD:
1010 return 1;
1011 case FS_OPCODE_FB_WRITE:
1012 return 2;
1013 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1014 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1015 return 1;
1016 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1017 return inst->mlen;
1018 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1019 return 2;
1020 case SHADER_OPCODE_UNTYPED_ATOMIC:
1021 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1022 case SHADER_OPCODE_URB_WRITE_SIMD8:
1023 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1024 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1025 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1026 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1027 return 0;
1028 default:
1029 unreachable("not reached");
1030 }
1031 }
1032
1033 fs_reg
1034 fs_visitor::vgrf(const glsl_type *const type)
1035 {
1036 int reg_width = dispatch_width / 8;
1037 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1038 brw_type_for_base_type(type), dispatch_width);
1039 }
1040
1041 fs_reg
1042 fs_visitor::vgrf(int num_components)
1043 {
1044 int reg_width = dispatch_width / 8;
1045 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1046 BRW_REGISTER_TYPE_F, dispatch_width);
1047 }
1048
1049 /** Fixed HW reg constructor. */
1050 fs_reg::fs_reg(enum register_file file, int reg)
1051 {
1052 init();
1053 this->file = file;
1054 this->reg = reg;
1055 this->type = BRW_REGISTER_TYPE_F;
1056
1057 switch (file) {
1058 case UNIFORM:
1059 this->width = 1;
1060 break;
1061 default:
1062 this->width = 8;
1063 }
1064 }
1065
1066 /** Fixed HW reg constructor. */
1067 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1068 {
1069 init();
1070 this->file = file;
1071 this->reg = reg;
1072 this->type = type;
1073
1074 switch (file) {
1075 case UNIFORM:
1076 this->width = 1;
1077 break;
1078 default:
1079 this->width = 8;
1080 }
1081 }
1082
1083 /** Fixed HW reg constructor. */
1084 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1085 uint8_t width)
1086 {
1087 init();
1088 this->file = file;
1089 this->reg = reg;
1090 this->type = type;
1091 this->width = width;
1092 }
1093
1094 fs_reg *
1095 fs_visitor::variable_storage(ir_variable *var)
1096 {
1097 return (fs_reg *)hash_table_find(this->variable_ht, var);
1098 }
1099
1100 void
1101 import_uniforms_callback(const void *key,
1102 void *data,
1103 void *closure)
1104 {
1105 struct hash_table *dst_ht = (struct hash_table *)closure;
1106 const fs_reg *reg = (const fs_reg *)data;
1107
1108 if (reg->file != UNIFORM)
1109 return;
1110
1111 hash_table_insert(dst_ht, data, key);
1112 }
1113
1114 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1115  * This brings in those uniform definitions.
1116 */
1117 void
1118 fs_visitor::import_uniforms(fs_visitor *v)
1119 {
1120 hash_table_call_foreach(v->variable_ht,
1121 import_uniforms_callback,
1122 variable_ht);
1123 this->push_constant_loc = v->push_constant_loc;
1124 this->pull_constant_loc = v->pull_constant_loc;
1125 this->uniforms = v->uniforms;
1126 this->param_size = v->param_size;
1127 }
1128
1129 /* Our support for uniforms is piggy-backed on the struct
1130 * gl_fragment_program, because that's where the values actually
1131 * get stored, rather than in some global gl_shader_program uniform
1132 * store.
1133 */
1134 void
1135 fs_visitor::setup_uniform_values(ir_variable *ir)
1136 {
1137 int namelen = strlen(ir->name);
1138
1139 /* The data for our (non-builtin) uniforms is stored in a series of
1140 * gl_uniform_driver_storage structs for each subcomponent that
1141 * glGetUniformLocation() could name. We know it's been set up in the same
1142 * order we'd walk the type, so walk the list of storage and find anything
1143 * with our name, or the prefix of a component that starts with our name.
1144 */
1145 unsigned params_before = uniforms;
1146 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1147 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1148
1149 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1150 (storage->name[namelen] != 0 &&
1151 storage->name[namelen] != '.' &&
1152 storage->name[namelen] != '[')) {
1153 continue;
1154 }
1155
1156 unsigned slots = storage->type->component_slots();
1157 if (storage->array_elements)
1158 slots *= storage->array_elements;
1159
1160 for (unsigned i = 0; i < slots; i++) {
1161 stage_prog_data->param[uniforms++] = &storage->storage[i];
1162 }
1163 }
1164
1165 /* Make sure we actually initialized the right amount of stuff here. */
1166 assert(params_before + ir->type->component_slots() == uniforms);
1167 (void)params_before;
1168 }
1169
1170
1171 /* Our support for builtin uniforms is even scarier than non-builtin.
1172 * It sits on top of the PROG_STATE_VAR parameters that are
1173 * automatically updated from GL context state.
1174 */
1175 void
1176 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1177 {
1178 const ir_state_slot *const slots = ir->get_state_slots();
1179 assert(slots != NULL);
1180
1181 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1182 /* This state reference has already been setup by ir_to_mesa, but we'll
1183 * get the same index back here.
1184 */
1185 int index = _mesa_add_state_reference(this->prog->Parameters,
1186 (gl_state_index *)slots[i].tokens);
1187
1188 /* Add each of the unique swizzles of the element as a parameter.
1189 * This'll end up matching the expected layout of the
1190 * array/matrix/structure we're trying to fill in.
1191 */
1192 int last_swiz = -1;
1193 for (unsigned int j = 0; j < 4; j++) {
1194 int swiz = GET_SWZ(slots[i].swizzle, j);
1195 if (swiz == last_swiz)
1196 break;
1197 last_swiz = swiz;
1198
1199 stage_prog_data->param[uniforms++] =
1200 &prog->Parameters->ParameterValues[index][swiz];
1201 }
1202 }
1203 }
1204
1205 fs_reg *
1206 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1207 bool origin_upper_left)
1208 {
1209 assert(stage == MESA_SHADER_FRAGMENT);
1210 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1211 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1212 fs_reg wpos = *reg;
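   /* Flip gl_FragCoord.y when the shader's requested origin convention
    * disagrees with the orientation of the current render target
    * (window-system framebuffer vs. user FBO).
    */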
1213 bool flip = !origin_upper_left ^ key->render_to_fbo;
1214
1215 /* gl_FragCoord.x */
1216 if (pixel_center_integer) {
1217 emit(MOV(wpos, this->pixel_x));
1218 } else {
1219 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1220 }
1221 wpos = offset(wpos, 1);
1222
1223 /* gl_FragCoord.y */
1224 if (!flip && pixel_center_integer) {
1225 emit(MOV(wpos, this->pixel_y));
1226 } else {
1227 fs_reg pixel_y = this->pixel_y;
1228 float offset = (pixel_center_integer ? 0.0 : 0.5);
1229
1230 if (flip) {
1231 pixel_y.negate = true;
1232 offset += key->drawable_height - 1.0;
1233 }
1234
1235 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1236 }
1237 wpos = offset(wpos, 1);
1238
1239 /* gl_FragCoord.z */
1240 if (brw->gen >= 6) {
1241 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1242 } else {
1243 emit(FS_OPCODE_LINTERP, wpos,
1244 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1245 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1246 interp_reg(VARYING_SLOT_POS, 2));
1247 }
1248 wpos = offset(wpos, 1);
1249
1250 /* gl_FragCoord.w: Already set up in emit_interpolation */
1251 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1252
1253 return reg;
1254 }
1255
1256 fs_inst *
1257 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1258 glsl_interp_qualifier interpolation_mode,
1259 bool is_centroid, bool is_sample)
1260 {
1261 brw_wm_barycentric_interp_mode barycoord_mode;
1262 if (brw->gen >= 6) {
1263 if (is_centroid) {
1264 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1265 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1266 else
1267 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1268 } else if (is_sample) {
1269 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1270 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1271 else
1272 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1273 } else {
1274 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1275 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1276 else
1277 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1278 }
1279 } else {
1280 /* On Ironlake and below, there is only one interpolation mode.
1281 * Centroid interpolation doesn't mean anything on this hardware --
1282 * there is no multisampling.
1283 */
1284 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1285 }
1286 return emit(FS_OPCODE_LINTERP, attr,
1287 this->delta_x[barycoord_mode],
1288 this->delta_y[barycoord_mode], interp);
1289 }
1290
1291 void
1292 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1293 const glsl_type *type,
1294 glsl_interp_qualifier interpolation_mode,
1295 int location, bool mod_centroid,
1296 bool mod_sample)
1297 {
1298 attr.type = brw_type_for_base_type(type->get_scalar_type());
1299
1300 assert(stage == MESA_SHADER_FRAGMENT);
1301 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1302 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1303
1304 unsigned int array_elements;
1305
1306 if (type->is_array()) {
1307 array_elements = type->length;
1308 if (array_elements == 0) {
1309 fail("dereferenced array '%s' has length 0\n", name);
1310 }
1311 type = type->fields.array;
1312 } else {
1313 array_elements = 1;
1314 }
1315
1316 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1317 bool is_gl_Color =
1318 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1319 if (key->flat_shade && is_gl_Color) {
1320 interpolation_mode = INTERP_QUALIFIER_FLAT;
1321 } else {
1322 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1323 }
1324 }
1325
1326 for (unsigned int i = 0; i < array_elements; i++) {
1327 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1328 if (prog_data->urb_setup[location] == -1) {
1329 /* If there's no incoming setup data for this slot, don't
1330 * emit interpolation for it.
1331 */
1332 attr = offset(attr, type->vector_elements);
1333 location++;
1334 continue;
1335 }
1336
1337 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1338 /* Constant interpolation (flat shading) case. The SF has
1339 * handed us defined values in only the constant offset
1340 * field of the setup reg.
1341 */
1342 for (unsigned int k = 0; k < type->vector_elements; k++) {
1343 struct brw_reg interp = interp_reg(location, k);
1344 interp = suboffset(interp, 3);
1345 interp.type = attr.type;
1346 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1347 attr = offset(attr, 1);
1348 }
1349 } else {
1350 /* Smooth/noperspective interpolation case. */
1351 for (unsigned int k = 0; k < type->vector_elements; k++) {
1352 struct brw_reg interp = interp_reg(location, k);
1353 if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1354 /* Get the pixel/sample mask into f0 so that we know
1355 * which pixels are lit. Then, for each channel that is
1356 * unlit, replace the centroid data with non-centroid
1357 * data.
1358 */
1359 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1360
1361 fs_inst *inst;
1362 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1363 false, false);
1364 inst->predicate = BRW_PREDICATE_NORMAL;
1365 inst->predicate_inverse = true;
1366 if (brw->has_pln)
1367 inst->no_dd_clear = true;
1368
1369 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1370 mod_centroid && !key->persample_shading,
1371 mod_sample || key->persample_shading);
1372 inst->predicate = BRW_PREDICATE_NORMAL;
1373 inst->predicate_inverse = false;
1374 if (brw->has_pln)
1375 inst->no_dd_check = true;
1376
1377 } else {
1378 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1379 mod_centroid && !key->persample_shading,
1380 mod_sample || key->persample_shading);
1381 }
1382 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1383 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1384 }
1385 attr = offset(attr, 1);
1386 }
1387
1388 }
1389 location++;
1390 }
1391 }
1392 }
1393
1394 fs_reg *
1395 fs_visitor::emit_frontfacing_interpolation()
1396 {
1397 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1398
1399 if (brw->gen >= 6) {
1400 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1401 * a boolean result from this (~0/true or 0/false).
1402 *
1403 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1404 * this task in only one instruction:
1405 * - a negation source modifier will flip the bit; and
1406 * - a W -> D type conversion will sign extend the bit into the high
1407 * word of the destination.
1408 *
1409 * An ASR 15 fills the low word of the destination.
1410 */
1411 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1412 g0.negate = true;
1413
1414 emit(ASR(*reg, g0, fs_reg(15)));
1415 } else {
1416 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1417 * a boolean result from this (1/true or 0/false).
1418 *
1419 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1420 * the negation source modifier to flip it. Unfortunately the SHR
1421 * instruction only operates on UD (or D with an abs source modifier)
1422 * sources without negation.
1423 *
1424 * Instead, use ASR (which will give ~0/true or 0/false).
1425 */
1426 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1427 g1_6.negate = true;
1428
1429 emit(ASR(*reg, g1_6, fs_reg(31)));
1430 }
1431
1432 return reg;
1433 }
1434
1435 void
1436 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1437 {
1438 assert(stage == MESA_SHADER_FRAGMENT);
1439 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1440 assert(dst.type == BRW_REGISTER_TYPE_F);
1441
1442 if (key->compute_pos_offset) {
1443 /* Convert int_sample_pos to floating point */
1444 emit(MOV(dst, int_sample_pos));
1445 /* Scale to the range [0, 1] */
1446 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1447 }
1448 else {
1449 /* From ARB_sample_shading specification:
1450 * "When rendering to a non-multisample buffer, or if multisample
1451 * rasterization is disabled, gl_SamplePosition will always be
1452 * (0.5, 0.5).
1453 */
1454 emit(MOV(dst, fs_reg(0.5f)));
1455 }
1456 }
1457
1458 fs_reg *
1459 fs_visitor::emit_samplepos_setup()
1460 {
1461 assert(brw->gen >= 6);
1462
1463 this->current_annotation = "compute sample position";
1464 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1465 fs_reg pos = *reg;
1466 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1467 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1468
1469 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1470 * mode will be enabled.
1471 *
1472 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1473 * R31.1:0 Position Offset X/Y for Slot[3:0]
1474 * R31.3:2 Position Offset X/Y for Slot[7:4]
1475 * .....
1476 *
1477 * The X, Y sample positions come in as bytes in thread payload. So, read
1478 * the positions using vstride=16, width=8, hstride=2.
1479 */
1480 struct brw_reg sample_pos_reg =
1481 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1482 BRW_REGISTER_TYPE_B), 16, 8, 2);
1483
1484 if (dispatch_width == 8) {
1485 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1486 } else {
1487 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1488 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1489 ->force_sechalf = true;
1490 }
1491 /* Compute gl_SamplePosition.x */
1492 compute_sample_position(pos, int_sample_x);
1493 pos = offset(pos, 1);
1494 if (dispatch_width == 8) {
1495 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1496 } else {
1497 emit(MOV(half(int_sample_y, 0),
1498 fs_reg(suboffset(sample_pos_reg, 1))));
1499 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1500 ->force_sechalf = true;
1501 }
1502 /* Compute gl_SamplePosition.y */
1503 compute_sample_position(pos, int_sample_y);
1504 return reg;
1505 }
1506
1507 fs_reg *
1508 fs_visitor::emit_sampleid_setup()
1509 {
1510 assert(stage == MESA_SHADER_FRAGMENT);
1511 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1512 assert(brw->gen >= 6);
1513
1514 this->current_annotation = "compute sample id";
1515 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1516
1517 if (key->compute_sample_id) {
1518 fs_reg t1 = vgrf(glsl_type::int_type);
1519 fs_reg t2 = vgrf(glsl_type::int_type);
1520 t2.type = BRW_REGISTER_TYPE_UW;
1521
1522 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1523 * 8x multisampling, subspan 0 will represent sample N (where N
1524 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1525 * 7. We can find the value of N by looking at R0.0 bits 7:6
1526 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1527 * (since samples are always delivered in pairs). That is, we
1528 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1529 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1530 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1531 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1532 * populating a temporary variable with the sequence (0, 1, 2, 3),
1533 * and then reading from it using vstride=1, width=4, hstride=0.
1534 * These computations hold good for 4x multisampling as well.
1535 *
1536 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1537 * the first four slots are sample 0 of subspan 0; the next four
1538 * are sample 1 of subspan 0; the third group is sample 0 of
1539 * subspan 1, and finally sample 1 of subspan 1.
1540 */
1541 fs_inst *inst;
1542 inst = emit(BRW_OPCODE_AND, t1,
1543 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1544 fs_reg(0xc0));
1545 inst->force_writemask_all = true;
1546 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1547 inst->force_writemask_all = true;
1548 /* This works for both SIMD8 and SIMD16 */
1549 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1550 inst->force_writemask_all = true;
1551 /* This special instruction takes care of setting vstride=1,
1552 * width=4, hstride=0 of t2 during an ADD instruction.
1553 */
1554 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1555 } else {
1556 /* As per GL_ARB_sample_shading specification:
1557 * "When rendering to a non-multisample buffer, or if multisample
1558 * rasterization is disabled, gl_SampleID will always be zero."
1559 */
1560 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1561 }
1562
1563 return reg;
1564 }
1565
1566 fs_reg
1567 fs_visitor::fix_math_operand(fs_reg src)
1568 {
1569 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1570 * might be able to do better by doing execsize = 1 math and then
1571 * expanding that result out, but we would need to be careful with
1572 * masking.
1573 *
1574 * The hardware ignores source modifiers (negate and abs) on math
1575 * instructions, so we also move to a temp to set those up.
1576 */
1577 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1578 !src.abs && !src.negate)
1579 return src;
1580
1581 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1582 * operands to math
1583 */
1584 if (brw->gen >= 7 && src.file != IMM)
1585 return src;
1586
1587 fs_reg expanded = vgrf(glsl_type::float_type);
1588 expanded.type = src.type;
1589 emit(BRW_OPCODE_MOV, expanded, src);
1590 return expanded;
1591 }
1592
1593 fs_inst *
1594 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1595 {
1596 switch (opcode) {
1597 case SHADER_OPCODE_RCP:
1598 case SHADER_OPCODE_RSQ:
1599 case SHADER_OPCODE_SQRT:
1600 case SHADER_OPCODE_EXP2:
1601 case SHADER_OPCODE_LOG2:
1602 case SHADER_OPCODE_SIN:
1603 case SHADER_OPCODE_COS:
1604 break;
1605 default:
1606 unreachable("not reached: bad math opcode");
1607 }
1608
1609 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1610 * might be able to do better by doing execsize = 1 math and then
1611 * expanding that result out, but we would need to be careful with
1612 * masking.
1613 *
1614 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1615 * instructions, so we also move to a temp to set those up.
1616 */
1617 if (brw->gen == 6 || brw->gen == 7)
1618 src = fix_math_operand(src);
1619
1620 fs_inst *inst = emit(opcode, dst, src);
1621
1622 if (brw->gen < 6) {
1623 inst->base_mrf = 2;
1624 inst->mlen = dispatch_width / 8;
1625 }
1626
1627 return inst;
1628 }
1629
1630 fs_inst *
1631 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1632 {
1633 int base_mrf = 2;
1634 fs_inst *inst;
1635
1636 if (brw->gen >= 8) {
1637 inst = emit(opcode, dst, src0, src1);
1638 } else if (brw->gen >= 6) {
1639 src0 = fix_math_operand(src0);
1640 src1 = fix_math_operand(src1);
1641
1642 inst = emit(opcode, dst, src0, src1);
1643 } else {
1644 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1645 * "Message Payload":
1646 *
1647 * "Operand0[7]. For the INT DIV functions, this operand is the
1648 * denominator."
1649 * ...
1650 * "Operand1[7]. For the INT DIV functions, this operand is the
1651 * numerator."
1652 */
1653 bool is_int_div = opcode != SHADER_OPCODE_POW;
1654 fs_reg &op0 = is_int_div ? src1 : src0;
1655 fs_reg &op1 = is_int_div ? src0 : src1;
1656
1657 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1658 inst = emit(opcode, dst, op0, reg_null_f);
1659
1660 inst->base_mrf = base_mrf;
1661 inst->mlen = 2 * dispatch_width / 8;
1662 }
1663 return inst;
1664 }
1665
1666 void
1667 fs_visitor::assign_curb_setup()
1668 {
1669 if (dispatch_width == 8) {
1670 prog_data->dispatch_grf_start_reg = payload.num_regs;
1671 } else {
1672 assert(stage == MESA_SHADER_FRAGMENT);
1673 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1674 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1675 }
1676
1677 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1678
1679 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1680 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1681 for (unsigned int i = 0; i < inst->sources; i++) {
1682 if (inst->src[i].file == UNIFORM) {
1683 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1684 int constant_nr;
1685 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1686 constant_nr = push_constant_loc[uniform_nr];
1687 } else {
1688 /* Section 5.11 of the OpenGL 4.1 spec says:
1689 * "Out-of-bounds reads return undefined values, which include
1690 * values from other variables of the active program or zero."
1691 * Just return the first push constant.
1692 */
1693 constant_nr = 0;
1694 }
1695
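            /* Each GRF holds eight 32-bit push constants, so constant_nr / 8
             * selects the register and constant_nr % 8 the channel within it.
             */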
1696 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1697 constant_nr / 8,
1698 constant_nr % 8);
1699
1700 inst->src[i].file = HW_REG;
1701 inst->src[i].fixed_hw_reg = byte_offset(
1702 retype(brw_reg, inst->src[i].type),
1703 inst->src[i].subreg_offset);
1704 }
1705 }
1706 }
1707 }
1708
1709 void
1710 fs_visitor::calculate_urb_setup()
1711 {
1712 assert(stage == MESA_SHADER_FRAGMENT);
1713 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1714 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1715
1716 memset(prog_data->urb_setup, -1,
1717 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1718
1719 int urb_next = 0;
1720 /* Figure out where each of the incoming setup attributes lands. */
1721 if (brw->gen >= 6) {
1722 if (_mesa_bitcount_64(prog->InputsRead &
1723 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1724 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1725 * first 16 varying inputs, so we can put them wherever we want.
1726 * Just put them in order.
1727 *
1728 * This is useful because it means that (a) inputs not used by the
1729 * fragment shader won't take up valuable register space, and (b) we
1730 * won't have to recompile the fragment shader if it gets paired with
1731 * a different vertex (or geometry) shader.
1732 */
1733 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1734 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1735 BITFIELD64_BIT(i)) {
1736 prog_data->urb_setup[i] = urb_next++;
1737 }
1738 }
1739 } else {
1740 /* We have enough input varyings that the SF/SBE pipeline stage can't
1741 * arbitrarily rearrange them to suit our whim; we have to put them
1742 * in an order that matches the output of the previous pipeline stage
1743 * (geometry or vertex shader).
1744 */
1745 struct brw_vue_map prev_stage_vue_map;
1746 brw_compute_vue_map(brw, &prev_stage_vue_map,
1747 key->input_slots_valid);
1748 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1749 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1750 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1751 slot++) {
1752 int varying = prev_stage_vue_map.slot_to_varying[slot];
1753 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1754 * unused.
1755 */
1756 if (varying != BRW_VARYING_SLOT_COUNT &&
1757 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1758 BITFIELD64_BIT(varying))) {
1759 prog_data->urb_setup[varying] = slot - first_slot;
1760 }
1761 }
1762 urb_next = prev_stage_vue_map.num_slots - first_slot;
1763 }
1764 } else {
1765 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1766 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1767 /* Point size is packed into the header, not as a general attribute */
1768 if (i == VARYING_SLOT_PSIZ)
1769 continue;
1770
1771 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1772 /* The back color slot is skipped when the front color is
1773 * also written to. In addition, some slots can be
1774 * written in the vertex shader and not read in the
1775 * fragment shader. So the register number must always be
1776 * incremented, mapped or not.
1777 */
1778 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1779 prog_data->urb_setup[i] = urb_next;
1780 urb_next++;
1781 }
1782 }
1783
1784 /*
1785  * It's an FS-only attribute, and we did interpolation for this attribute
1786  * in the SF thread. So, count it here, too.
1787 *
1788 * See compile_sf_prog() for more info.
1789 */
1790 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1791 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1792 }
1793
1794 prog_data->num_varying_inputs = urb_next;
1795 }
1796
1797 void
1798 fs_visitor::assign_urb_setup()
1799 {
1800 assert(stage == MESA_SHADER_FRAGMENT);
1801 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1802
1803 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1804
1805 /* Offset all the urb_setup[] index by the actual position of the
1806 * setup regs, now that the location of the constants has been chosen.
1807 */
1808 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1809 if (inst->opcode == FS_OPCODE_LINTERP) {
1810 assert(inst->src[2].file == HW_REG);
1811 inst->src[2].fixed_hw_reg.nr += urb_start;
1812 }
1813
1814 if (inst->opcode == FS_OPCODE_CINTERP) {
1815 assert(inst->src[0].file == HW_REG);
1816 inst->src[0].fixed_hw_reg.nr += urb_start;
1817 }
1818 }
1819
1820 /* Each attribute is 4 setup channels, each of which is half a reg. */
1821 this->first_non_payload_grf =
1822 urb_start + prog_data->num_varying_inputs * 2;
1823 }
1824
1825 void
1826 fs_visitor::assign_vs_urb_setup()
1827 {
1828 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1829 int grf, count, slot, channel, attr;
1830
1831 assert(stage == MESA_SHADER_VERTEX);
1832 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1833 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1834 count++;
1835
1836 /* Each attribute is 4 regs. */
1837 this->first_non_payload_grf =
1838 payload.num_regs + prog_data->curb_read_length + count * 4;
1839
1840 unsigned vue_entries =
1841 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1842
1843 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1844 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1845
1846 assert(vs_prog_data->base.urb_read_length <= 15);
1847
1848 /* Rewrite all ATTR file references to the hw grf that they land in. */
1849 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1850 for (int i = 0; i < inst->sources; i++) {
1851 if (inst->src[i].file == ATTR) {
1852
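         /* An ATTR register number of VERT_ATTRIB_MAX refers to the
          * gl_VertexID/gl_InstanceID payload element, which lands in the
          * last slot counted above.
          */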
1853 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1854 slot = count - 1;
1855 } else {
1856             /* Attributes arrive in a contiguous block, ordered by their
1857 * gl_vert_attrib value. That means we can compute the slot
1858 * number for an attribute by masking out the enabled
1859 * attributes before it and counting the bits.
1860 */
1861 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1862 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1863 BITFIELD64_MASK(attr));
1864 }
1865
1866 channel = inst->src[i].reg_offset & 3;
1867
1868 grf = payload.num_regs +
1869 prog_data->curb_read_length +
1870 slot * 4 + channel;
1871
1872 inst->src[i].file = HW_REG;
1873 inst->src[i].fixed_hw_reg =
1874 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1875 }
1876 }
1877 }
1878 }
1879
1880 /**
1881 * Split large virtual GRFs into separate components if we can.
1882 *
1883 * This is mostly duplicated with what brw_fs_vector_splitting does,
1884 * but that's really conservative because it's afraid of doing
1885 * splitting that doesn't result in real progress after the rest of
1886 * the optimization phases, which would cause infinite looping in
1887 * optimization. We can do it once here, safely. This also has the
1888 * opportunity to split interpolated values, or maybe even uniforms,
1889 * which we don't have at the IR level.
1890 *
1891 * We want to split, because virtual GRFs are what we register
1892 * allocate and spill (due to contiguousness requirements for some
1893 * instructions), and they're what we naturally generate in the
1894 * codegen process, but most virtual GRFs don't actually need to be
1895 * contiguous sets of GRFs. If we split, we'll end up with reduced
1896 * live intervals and better dead code elimination and coalescing.
1897 */
1898 void
1899 fs_visitor::split_virtual_grfs()
1900 {
1901 int num_vars = this->alloc.count;
1902
1903 /* Count the total number of registers */
1904 int reg_count = 0;
1905 int vgrf_to_reg[num_vars];
1906 for (int i = 0; i < num_vars; i++) {
1907 vgrf_to_reg[i] = reg_count;
1908 reg_count += alloc.sizes[i];
1909 }
1910
1911 /* An array of "split points". For each register slot, this indicates
1912 * if this slot can be separated from the previous slot. Every time an
1913 * instruction uses multiple elements of a register (as a source or
1914 * destination), we mark the used slots as inseparable. Then we go
1915 * through and split the registers into the smallest pieces we can.
1916 */
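   /* A hypothetical example: a virtual GRF of size 4 that every instruction
    * accesses as two independent 2-register pieces (reg_offsets 0 and 2) ends
    * up with split_points = { -, false, true, false } for its slots, so it is
    * split into two virtual GRFs of size 2.
    */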
1917 bool split_points[reg_count];
1918 memset(split_points, 0, sizeof(split_points));
1919
1920 /* Mark all used registers as fully splittable */
1921 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1922 if (inst->dst.file == GRF) {
1923 int reg = vgrf_to_reg[inst->dst.reg];
1924 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1925 split_points[reg + j] = true;
1926 }
1927
1928 for (int i = 0; i < inst->sources; i++) {
1929 if (inst->src[i].file == GRF) {
1930 int reg = vgrf_to_reg[inst->src[i].reg];
1931 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1932 split_points[reg + j] = true;
1933 }
1934 }
1935 }
1936
1937 if (brw->has_pln &&
1938 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1939 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1940 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1941 * Gen6, that was the only supported interpolation mode, and since Gen6,
1942 * delta_x and delta_y are in fixed hardware registers.
1943 */
1944 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1945 split_points[vgrf_to_reg[vgrf] + 1] = false;
1946 }
1947
1948 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1949 if (inst->dst.file == GRF) {
1950 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1951 for (int j = 1; j < inst->regs_written; j++)
1952 split_points[reg + j] = false;
1953 }
1954 for (int i = 0; i < inst->sources; i++) {
1955 if (inst->src[i].file == GRF) {
1956 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1957 for (int j = 1; j < inst->regs_read(i); j++)
1958 split_points[reg + j] = false;
1959 }
1960 }
1961 }
1962
1963 int new_virtual_grf[reg_count];
1964 int new_reg_offset[reg_count];
1965
1966 int reg = 0;
1967 for (int i = 0; i < num_vars; i++) {
1968 /* As a quick sanity check, the first slot of a register should never be a split point. */
1969 assert(split_points[reg] == false);
1970
1971 /* j = 0 case */
1972 new_reg_offset[reg] = 0;
1973 reg++;
1974 int offset = 1;
1975
1976 /* j > 0 case */
1977 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1978 /* If this is a split point, reset the offset to 0 and allocate a
1979 * new virtual GRF for the previous offset many registers
1980 */
1981 if (split_points[reg]) {
1982 assert(offset <= MAX_VGRF_SIZE);
1983 int grf = alloc.allocate(offset);
1984 for (int k = reg - offset; k < reg; k++)
1985 new_virtual_grf[k] = grf;
1986 offset = 0;
1987 }
1988 new_reg_offset[reg] = offset;
1989 offset++;
1990 reg++;
1991 }
1992
1993 /* The last one gets the original register number */
1994 assert(offset <= MAX_VGRF_SIZE);
1995 alloc.sizes[i] = offset;
1996 for (int k = reg - offset; k < reg; k++)
1997 new_virtual_grf[k] = i;
1998 }
1999 assert(reg == reg_count);
2000
2001 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2002 if (inst->dst.file == GRF) {
2003 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2004 inst->dst.reg = new_virtual_grf[reg];
2005 inst->dst.reg_offset = new_reg_offset[reg];
2006 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2007 }
2008 for (int i = 0; i < inst->sources; i++) {
2009 if (inst->src[i].file == GRF) {
2010 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2011 inst->src[i].reg = new_virtual_grf[reg];
2012 inst->src[i].reg_offset = new_reg_offset[reg];
2013 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2014 }
2015 }
2016 }
2017 invalidate_live_intervals();
2018 }
2019
2020 /**
2021 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2022 *
2023 * During code generation, we create tons of temporary variables, many of
2024 * which get immediately killed and are never used again. Yet, in later
2025 * optimization and analysis passes, such as compute_live_intervals, we need
2026 * to loop over all the virtual GRFs. Compacting them can save a lot of
2027 * overhead.
2028 */
2029 bool
2030 fs_visitor::compact_virtual_grfs()
2031 {
2032 bool progress = false;
2033 int remap_table[this->alloc.count];
2034 memset(remap_table, -1, sizeof(remap_table));
2035
2036 /* Mark which virtual GRFs are used. */
2037 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2038 if (inst->dst.file == GRF)
2039 remap_table[inst->dst.reg] = 0;
2040
2041 for (int i = 0; i < inst->sources; i++) {
2042 if (inst->src[i].file == GRF)
2043 remap_table[inst->src[i].reg] = 0;
2044 }
2045 }
2046
2047 /* Compact the GRF arrays. */
2048 int new_index = 0;
2049 for (unsigned i = 0; i < this->alloc.count; i++) {
2050 if (remap_table[i] == -1) {
2051 /* We just found an unused register. This means that we are
2052 * actually going to compact something.
2053 */
2054 progress = true;
2055 } else {
2056 remap_table[i] = new_index;
2057 alloc.sizes[new_index] = alloc.sizes[i];
2058 invalidate_live_intervals();
2059 ++new_index;
2060 }
2061 }
2062
2063 this->alloc.count = new_index;
2064
2065 /* Patch all the instructions to use the newly renumbered registers */
2066 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2067 if (inst->dst.file == GRF)
2068 inst->dst.reg = remap_table[inst->dst.reg];
2069
2070 for (int i = 0; i < inst->sources; i++) {
2071 if (inst->src[i].file == GRF)
2072 inst->src[i].reg = remap_table[inst->src[i].reg];
2073 }
2074 }
2075
2076 /* Patch all the references to delta_x/delta_y, since they're used in
2077 * register allocation. If they're unused, switch them to BAD_FILE so
2078 * we don't think some random VGRF is delta_x/delta_y.
2079 */
2080 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2081 if (delta_x[i].file == GRF) {
2082 if (remap_table[delta_x[i].reg] != -1) {
2083 delta_x[i].reg = remap_table[delta_x[i].reg];
2084 } else {
2085 delta_x[i].file = BAD_FILE;
2086 }
2087 }
2088 }
2089 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2090 if (delta_y[i].file == GRF) {
2091 if (remap_table[delta_y[i].reg] != -1) {
2092 delta_y[i].reg = remap_table[delta_y[i].reg];
2093 } else {
2094 delta_y[i].file = BAD_FILE;
2095 }
2096 }
2097 }
2098
2099 return progress;
2100 }
2101
2102 /*
2103 * Implements array access of uniforms by inserting a
2104 * PULL_CONSTANT_LOAD instruction.
2105 *
2106 * Unlike temporary GRF array access (which we don't support, due to
2107 * the difficulty of doing relative addressing on instruction
2108 * destinations), we could potentially do array access of uniforms
2109 * that were loaded in GRF space as push constants. In real-world
2110 * usage we've seen, though, the arrays being used are always larger
2111 * than we could load as push constants, so just always move all
2112 * uniform array access out to a pull constant buffer.
2113 */
2114 void
2115 fs_visitor::move_uniform_array_access_to_pull_constants()
2116 {
2117 if (dispatch_width != 8)
2118 return;
2119
2120 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2121 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2122
2123 /* Walk through and find array access of uniforms. Put a copy of that
2124 * uniform in the pull constant buffer.
2125 *
2126 * Note that we don't move constant-indexed accesses to arrays. No
2127 * testing has been done of the performance impact of this choice.
2128 */
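   /* As a hypothetical GLSL example, "uniform vec4 colors[8];" indexed as
    * colors[i] with a dynamic i shows up as a UNIFORM source with reladdr
    * set; the loop below then copies every component of the array
    * (param_size[uniform] entries) into pull_param.
    */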
2129 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2130 for (int i = 0 ; i < inst->sources; i++) {
2131 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2132 continue;
2133
2134 int uniform = inst->src[i].reg;
2135
2136 /* If this array isn't already present in the pull constant buffer,
2137 * add it.
2138 */
2139 if (pull_constant_loc[uniform] == -1) {
2140 const gl_constant_value **values = &stage_prog_data->param[uniform];
2141
2142 assert(param_size[uniform]);
2143
2144 for (int j = 0; j < param_size[uniform]; j++) {
2145 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2146
2147 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2148 values[j];
2149 }
2150 }
2151 }
2152 }
2153 }
2154
2155 /**
2156 * Assign UNIFORM file registers to either push constants or pull constants.
2157 *
2158 * We allow a fragment shader to have more than the specified minimum-
2159 * maximum number of fragment shader uniform components (64). If
2160 * there are too many of these, they'd fill up all of the register space.
2161 * So, this will push some of them out to the pull constant buffer and
2162 * update the program to load them.
2163 */
2164 void
2165 fs_visitor::assign_constant_locations()
2166 {
2167 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2168 if (dispatch_width != 8)
2169 return;
2170
2171 /* Find which UNIFORM registers are still in use. */
2172 bool is_live[uniforms];
2173 for (unsigned int i = 0; i < uniforms; i++) {
2174 is_live[i] = false;
2175 }
2176
2177 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2178 for (int i = 0; i < inst->sources; i++) {
2179 if (inst->src[i].file != UNIFORM)
2180 continue;
2181
2182 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2183 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2184 is_live[constant_nr] = true;
2185 }
2186 }
2187
2188 /* Only allow 16 registers (128 uniform components) as push constants.
2189 *
2190 * Just demote the end of the list. We could probably do better
2191 * here, demoting things that are rarely used in the program first.
2192 *
2193 * If changing this value, note the limitation about total_regs in
2194 * brw_curbe.c.
2195 */
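   /* For example, a shader with 200 live uniform components (and none
    * already pulled) keeps components 0..127 as push constants and demotes
    * components 128..199 to the pull constant buffer below.
    */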
2196 unsigned int max_push_components = 16 * 8;
2197 unsigned int num_push_constants = 0;
2198
2199 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2200
2201 for (unsigned int i = 0; i < uniforms; i++) {
2202 if (!is_live[i] || pull_constant_loc[i] != -1) {
2203 /* This UNIFORM register is either dead, or has already been demoted
2204 * to a pull const. Mark it as no longer living in the param[] array.
2205 */
2206 push_constant_loc[i] = -1;
2207 continue;
2208 }
2209
2210 if (num_push_constants < max_push_components) {
2211 /* Retain as a push constant. Record the location in the params[]
2212 * array.
2213 */
2214 push_constant_loc[i] = num_push_constants++;
2215 } else {
2216 /* Demote to a pull constant. */
2217 push_constant_loc[i] = -1;
2218
2219 int pull_index = stage_prog_data->nr_pull_params++;
2220 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2221 pull_constant_loc[i] = pull_index;
2222 }
2223 }
2224
2225 stage_prog_data->nr_params = num_push_constants;
2226
2227 /* Up until now, the param[] array has been indexed by reg + reg_offset
2228 * of UNIFORM registers. Condense it to only contain the uniforms we
2229 * chose to upload as push constants.
2230 */
2231 for (unsigned int i = 0; i < uniforms; i++) {
2232 int remapped = push_constant_loc[i];
2233
2234 if (remapped == -1)
2235 continue;
2236
2237 assert(remapped <= (int)i);
2238 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2239 }
2240 }
2241
2242 /**
2243 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2244 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2245 */
2246 void
2247 fs_visitor::demote_pull_constants()
2248 {
2249 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2250 for (int i = 0; i < inst->sources; i++) {
2251 if (inst->src[i].file != UNIFORM)
2252 continue;
2253
2254 int pull_index = pull_constant_loc[inst->src[i].reg +
2255 inst->src[i].reg_offset];
2256 if (pull_index == -1)
2257 continue;
2258
2259 /* Set up the annotation tracking for new generated instructions. */
2260 base_ir = inst->ir;
2261 current_annotation = inst->annotation;
2262
2263 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2264 fs_reg dst = vgrf(glsl_type::float_type);
2265
2266 /* Generate a pull load into dst. */
2267 if (inst->src[i].reladdr) {
2268 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2269 surf_index,
2270 *inst->src[i].reladdr,
2271 pull_index);
2272 inst->insert_before(block, &list);
2273 inst->src[i].reladdr = NULL;
2274 } else {
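            /* Example arithmetic: pull_index 5 gives a byte offset of 20,
             * rounded down to the 16-byte-aligned offset 16; set_smear(5 & 3)
             * below then selects channel 1 of the loaded vec4.
             */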
2275 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2276 fs_inst *pull =
2277 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2278 dst, surf_index, offset);
2279 inst->insert_before(block, pull);
2280 inst->src[i].set_smear(pull_index & 3);
2281 }
2282
2283 /* Rewrite the instruction to use the temporary VGRF. */
2284 inst->src[i].file = GRF;
2285 inst->src[i].reg = dst.reg;
2286 inst->src[i].reg_offset = 0;
2287 inst->src[i].width = dispatch_width;
2288 }
2289 }
2290 invalidate_live_intervals();
2291 }
2292
2293 bool
2294 fs_visitor::opt_algebraic()
2295 {
2296 bool progress = false;
2297
2298 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2299 switch (inst->opcode) {
2300 case BRW_OPCODE_MOV:
2301 if (inst->src[0].file != IMM)
2302 break;
2303
2304 if (inst->saturate) {
2305 if (inst->dst.type != inst->src[0].type)
2306 assert(!"unimplemented: saturate mixed types");
2307
2308 if (brw_saturate_immediate(inst->dst.type,
2309 &inst->src[0].fixed_hw_reg)) {
2310 inst->saturate = false;
2311 progress = true;
2312 }
2313 }
2314 break;
2315
2316 case BRW_OPCODE_MUL:
2317 if (inst->src[1].file != IMM)
2318 continue;
2319
2320 /* a * 1.0 = a */
2321 if (inst->src[1].is_one()) {
2322 inst->opcode = BRW_OPCODE_MOV;
2323 inst->src[1] = reg_undef;
2324 progress = true;
2325 break;
2326 }
2327
2328 /* a * -1.0 = -a */
2329 if (inst->src[1].is_negative_one()) {
2330 inst->opcode = BRW_OPCODE_MOV;
2331 inst->src[0].negate = !inst->src[0].negate;
2332 inst->src[1] = reg_undef;
2333 progress = true;
2334 break;
2335 }
2336
2337 /* a * 0.0 = 0.0 */
2338 if (inst->src[1].is_zero()) {
2339 inst->opcode = BRW_OPCODE_MOV;
2340 inst->src[0] = inst->src[1];
2341 inst->src[1] = reg_undef;
2342 progress = true;
2343 break;
2344 }
2345
2346 if (inst->src[0].file == IMM) {
2347 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2348 inst->opcode = BRW_OPCODE_MOV;
2349 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2350 inst->src[1] = reg_undef;
2351 progress = true;
2352 break;
2353 }
2354 break;
2355 case BRW_OPCODE_ADD:
2356 if (inst->src[1].file != IMM)
2357 continue;
2358
2359 /* a + 0.0 = a */
2360 if (inst->src[1].is_zero()) {
2361 inst->opcode = BRW_OPCODE_MOV;
2362 inst->src[1] = reg_undef;
2363 progress = true;
2364 break;
2365 }
2366
2367 if (inst->src[0].file == IMM) {
2368 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2369 inst->opcode = BRW_OPCODE_MOV;
2370 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2371 inst->src[1] = reg_undef;
2372 progress = true;
2373 break;
2374 }
2375 break;
2376 case BRW_OPCODE_OR:
2377 if (inst->src[0].equals(inst->src[1])) {
2378 inst->opcode = BRW_OPCODE_MOV;
2379 inst->src[1] = reg_undef;
2380 progress = true;
2381 break;
2382 }
2383 break;
2384 case BRW_OPCODE_LRP:
2385 if (inst->src[1].equals(inst->src[2])) {
2386 inst->opcode = BRW_OPCODE_MOV;
2387 inst->src[0] = inst->src[1];
2388 inst->src[1] = reg_undef;
2389 inst->src[2] = reg_undef;
2390 progress = true;
2391 break;
2392 }
2393 break;
2394 case BRW_OPCODE_CMP:
2395 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2396 inst->src[0].abs &&
2397 inst->src[0].negate &&
2398 inst->src[1].is_zero()) {
2399 inst->src[0].abs = false;
2400 inst->src[0].negate = false;
2401 inst->conditional_mod = BRW_CONDITIONAL_Z;
2402 progress = true;
2403 break;
2404 }
2405 break;
2406 case BRW_OPCODE_SEL:
2407 if (inst->src[0].equals(inst->src[1])) {
2408 inst->opcode = BRW_OPCODE_MOV;
2409 inst->src[1] = reg_undef;
2410 inst->predicate = BRW_PREDICATE_NONE;
2411 inst->predicate_inverse = false;
2412 progress = true;
2413 } else if (inst->saturate && inst->src[1].file == IMM) {
2414 switch (inst->conditional_mod) {
2415 case BRW_CONDITIONAL_LE:
2416 case BRW_CONDITIONAL_L:
2417 switch (inst->src[1].type) {
2418 case BRW_REGISTER_TYPE_F:
2419 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2420 inst->opcode = BRW_OPCODE_MOV;
2421 inst->src[1] = reg_undef;
2422 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2423 progress = true;
2424 }
2425 break;
2426 default:
2427 break;
2428 }
2429 break;
2430 case BRW_CONDITIONAL_GE:
2431 case BRW_CONDITIONAL_G:
2432 switch (inst->src[1].type) {
2433 case BRW_REGISTER_TYPE_F:
2434 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2435 inst->opcode = BRW_OPCODE_MOV;
2436 inst->src[1] = reg_undef;
2437 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2438 progress = true;
2439 }
2440 break;
2441 default:
2442 break;
2443 }
2444 default:
2445 break;
2446 }
2447 }
2448 break;
2449 case BRW_OPCODE_MAD:
2450 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2451 inst->opcode = BRW_OPCODE_MOV;
2452 inst->src[1] = reg_undef;
2453 inst->src[2] = reg_undef;
2454 progress = true;
2455 } else if (inst->src[0].is_zero()) {
2456 inst->opcode = BRW_OPCODE_MUL;
2457 inst->src[0] = inst->src[2];
2458 inst->src[2] = reg_undef;
         progress = true;
2459 } else if (inst->src[1].is_one()) {
2460 inst->opcode = BRW_OPCODE_ADD;
2461 inst->src[1] = inst->src[2];
2462 inst->src[2] = reg_undef;
2463 progress = true;
2464 } else if (inst->src[2].is_one()) {
2465 inst->opcode = BRW_OPCODE_ADD;
2466 inst->src[2] = reg_undef;
2467 progress = true;
2468 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2469 inst->opcode = BRW_OPCODE_ADD;
2470 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2471 inst->src[2] = reg_undef;
2472 progress = true;
2473 }
2474 break;
2475 case SHADER_OPCODE_RCP: {
2476 fs_inst *prev = (fs_inst *)inst->prev;
2477 if (prev->opcode == SHADER_OPCODE_SQRT) {
2478 if (inst->src[0].equals(prev->dst)) {
2479 inst->opcode = SHADER_OPCODE_RSQ;
2480 inst->src[0] = prev->src[0];
2481 progress = true;
2482 }
2483 }
2484 break;
2485 }
2486 default:
2487 break;
2488 }
2489 }
2490
2491 return progress;
2492 }
2493
2494 bool
2495 fs_visitor::opt_register_renaming()
2496 {
2497 bool progress = false;
2498 int depth = 0;
2499
2500 int remap[alloc.count];
2501 memset(remap, -1, sizeof(int) * alloc.count);
2502
2503 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2504 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2505 depth++;
2506 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2507 inst->opcode == BRW_OPCODE_WHILE) {
2508 depth--;
2509 }
2510
2511 /* Rewrite instruction sources. */
2512 for (int i = 0; i < inst->sources; i++) {
2513 if (inst->src[i].file == GRF &&
2514 remap[inst->src[i].reg] != -1 &&
2515 remap[inst->src[i].reg] != inst->src[i].reg) {
2516 inst->src[i].reg = remap[inst->src[i].reg];
2517 progress = true;
2518 }
2519 }
2520
2521 const int dst = inst->dst.reg;
2522
2523 if (depth == 0 &&
2524 inst->dst.file == GRF &&
2525 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2526 !inst->is_partial_write()) {
2527 if (remap[dst] == -1) {
2528 remap[dst] = dst;
2529 } else {
2530 remap[dst] = alloc.allocate(inst->dst.width / 8);
2531 inst->dst.reg = remap[dst];
2532 progress = true;
2533 }
2534 } else if (inst->dst.file == GRF &&
2535 remap[dst] != -1 &&
2536 remap[dst] != dst) {
2537 inst->dst.reg = remap[dst];
2538 progress = true;
2539 }
2540 }
2541
2542 if (progress) {
2543 invalidate_live_intervals();
2544
2545 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2546 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2547 delta_x[i].reg = remap[delta_x[i].reg];
2548 }
2549 }
2550 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2551 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2552 delta_y[i].reg = remap[delta_y[i].reg];
2553 }
2554 }
2555 }
2556
2557 return progress;
2558 }
2559
2560 bool
2561 fs_visitor::compute_to_mrf()
2562 {
2563 bool progress = false;
2564 int next_ip = 0;
2565
2566 /* No MRFs on Gen >= 7. */
2567 if (brw->gen >= 7)
2568 return false;
2569
2570 calculate_live_intervals();
2571
2572 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2573 int ip = next_ip;
2574 next_ip++;
2575
2576 if (inst->opcode != BRW_OPCODE_MOV ||
2577 inst->is_partial_write() ||
2578 inst->dst.file != MRF || inst->src[0].file != GRF ||
2579 inst->dst.type != inst->src[0].type ||
2580 inst->src[0].abs || inst->src[0].negate ||
2581 !inst->src[0].is_contiguous() ||
2582 inst->src[0].subreg_offset)
2583 continue;
2584
2585 /* Work out which hardware MRF registers are written by this
2586 * instruction.
2587 */
2588 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2589 int mrf_high;
2590 if (inst->dst.reg & BRW_MRF_COMPR4) {
2591 mrf_high = mrf_low + 4;
2592 } else if (inst->exec_size == 16) {
2593 mrf_high = mrf_low + 1;
2594 } else {
2595 mrf_high = mrf_low;
2596 }
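      /* For instance, a SIMD16 MOV to m2 covers m2..m3 (mrf_low = 2,
       * mrf_high = 3), while the same write with BRW_MRF_COMPR4 set lands in
       * m2 and m6 (mrf_high = mrf_low + 4).
       */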
2597
2598 /* Can't compute-to-MRF this GRF if someone else was going to
2599 * read it later.
2600 */
2601 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2602 continue;
2603
2604 /* Found a move of a GRF to a MRF. Let's see if we can go
2605 * rewrite the thing that made this GRF to write into the MRF.
2606 */
2607 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2608 if (scan_inst->dst.file == GRF &&
2609 scan_inst->dst.reg == inst->src[0].reg) {
2610 /* Found the last thing to write our reg we want to turn
2611 * into a compute-to-MRF.
2612 */
2613
2614 /* If this one instruction didn't populate all the
2615 * channels, bail. We might be able to rewrite everything
2616 * that writes that reg, but it would require smarter
2617 * tracking to delay the rewriting until complete success.
2618 */
2619 if (scan_inst->is_partial_write())
2620 break;
2621
2622 /* Things returning more than one register would need us to
2623 * understand coalescing out more than one MOV at a time.
2624 */
2625 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2626 break;
2627
2628 /* SEND instructions can't have MRF as a destination. */
2629 if (scan_inst->mlen)
2630 break;
2631
2632 if (brw->gen == 6) {
2633 /* gen6 math instructions must have the destination be
2634 * GRF, so no compute-to-MRF for them.
2635 */
2636 if (scan_inst->is_math()) {
2637 break;
2638 }
2639 }
2640
2641 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2642 /* Found the creator of our MRF's source value. */
2643 scan_inst->dst.file = MRF;
2644 scan_inst->dst.reg = inst->dst.reg;
2645 scan_inst->saturate |= inst->saturate;
2646 inst->remove(block);
2647 progress = true;
2648 }
2649 break;
2650 }
2651
2652 /* We don't handle control flow here. Most computation of
2653 * values that end up in MRFs happens shortly before the MRF
2654 * write anyway.
2655 */
2656 if (block->start() == scan_inst)
2657 break;
2658
2659 /* You can't read from an MRF, so if someone else reads our
2660 * MRF's source GRF that we wanted to rewrite, that stops us.
2661 */
2662 bool interfered = false;
2663 for (int i = 0; i < scan_inst->sources; i++) {
2664 if (scan_inst->src[i].file == GRF &&
2665 scan_inst->src[i].reg == inst->src[0].reg &&
2666 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2667 interfered = true;
2668 }
2669 }
2670 if (interfered)
2671 break;
2672
2673 if (scan_inst->dst.file == MRF) {
2674 /* If somebody else writes our MRF here, we can't
2675 * compute-to-MRF before that.
2676 */
2677 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2678 int scan_mrf_high;
2679
2680 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2681 scan_mrf_high = scan_mrf_low + 4;
2682 } else if (scan_inst->exec_size == 16) {
2683 scan_mrf_high = scan_mrf_low + 1;
2684 } else {
2685 scan_mrf_high = scan_mrf_low;
2686 }
2687
2688 if (mrf_low == scan_mrf_low ||
2689 mrf_low == scan_mrf_high ||
2690 mrf_high == scan_mrf_low ||
2691 mrf_high == scan_mrf_high) {
2692 break;
2693 }
2694 }
2695
2696 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2697 /* Found a SEND instruction, which means that there are
2698 * live values in MRFs from base_mrf to base_mrf +
2699 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2700 * above it.
2701 */
2702 if (mrf_low >= scan_inst->base_mrf &&
2703 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2704 break;
2705 }
2706 if (mrf_high >= scan_inst->base_mrf &&
2707 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2708 break;
2709 }
2710 }
2711 }
2712 }
2713
2714 if (progress)
2715 invalidate_live_intervals();
2716
2717 return progress;
2718 }
2719
2720 /**
2721 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2722 * instructions to FS_OPCODE_REP_FB_WRITE.
2723 */
2724 void
2725 fs_visitor::emit_repclear_shader()
2726 {
2727 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2728 int base_mrf = 1;
2729 int color_mrf = base_mrf + 2;
2730
2731 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2732 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2733 mov->force_writemask_all = true;
2734
2735 fs_inst *write;
2736 if (key->nr_color_regions == 1) {
2737 write = emit(FS_OPCODE_REP_FB_WRITE);
2738 write->saturate = key->clamp_fragment_color;
2739 write->base_mrf = color_mrf;
2740 write->target = 0;
2741 write->header_present = false;
2742 write->mlen = 1;
2743 } else {
2744 assume(key->nr_color_regions > 0);
2745 for (int i = 0; i < key->nr_color_regions; ++i) {
2746 write = emit(FS_OPCODE_REP_FB_WRITE);
2747 write->saturate = key->clamp_fragment_color;
2748 write->base_mrf = base_mrf;
2749 write->target = i;
2750 write->header_present = true;
2751 write->mlen = 3;
2752 }
2753 }
2754 write->eot = true;
2755
2756 calculate_cfg();
2757
2758 assign_constant_locations();
2759 assign_curb_setup();
2760
2761 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2762 assert(mov->src[0].file == HW_REG);
2763 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2764 }
2765
2766 /**
2767 * Walks through basic blocks, looking for repeated MRF writes and
2768 * removing the later ones.
2769 */
2770 bool
2771 fs_visitor::remove_duplicate_mrf_writes()
2772 {
2773 fs_inst *last_mrf_move[16];
2774 bool progress = false;
2775
2776 /* Need to update the MRF tracking for compressed instructions. */
2777 if (dispatch_width == 16)
2778 return false;
2779
2780 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2781
2782 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2783 if (inst->is_control_flow()) {
2784 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2785 }
2786
2787 if (inst->opcode == BRW_OPCODE_MOV &&
2788 inst->dst.file == MRF) {
2789 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2790 if (prev_inst && inst->equals(prev_inst)) {
2791 inst->remove(block);
2792 progress = true;
2793 continue;
2794 }
2795 }
2796
2797 /* Clear out the last-write records for MRFs that were overwritten. */
2798 if (inst->dst.file == MRF) {
2799 last_mrf_move[inst->dst.reg] = NULL;
2800 }
2801
2802 if (inst->mlen > 0 && inst->base_mrf != -1) {
2803 /* Found a SEND instruction, which will include two or fewer
2804 * implied MRF writes. We could do better here.
2805 */
2806 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2807 last_mrf_move[inst->base_mrf + i] = NULL;
2808 }
2809 }
2810
2811 /* Clear out any MRF move records whose sources got overwritten. */
2812 if (inst->dst.file == GRF) {
2813 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2814 if (last_mrf_move[i] &&
2815 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2816 last_mrf_move[i] = NULL;
2817 }
2818 }
2819 }
2820
2821 if (inst->opcode == BRW_OPCODE_MOV &&
2822 inst->dst.file == MRF &&
2823 inst->src[0].file == GRF &&
2824 !inst->is_partial_write()) {
2825 last_mrf_move[inst->dst.reg] = inst;
2826 }
2827 }
2828
2829 if (progress)
2830 invalidate_live_intervals();
2831
2832 return progress;
2833 }
2834
2835 static void
2836 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2837 int first_grf, int grf_len)
2838 {
2839 /* Clear the flag for registers that actually got read (as expected). */
2840 for (int i = 0; i < inst->sources; i++) {
2841 int grf;
2842 if (inst->src[i].file == GRF) {
2843 grf = inst->src[i].reg;
2844 } else if (inst->src[i].file == HW_REG &&
2845 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2846 grf = inst->src[i].fixed_hw_reg.nr;
2847 } else {
2848 continue;
2849 }
2850
2851 if (grf >= first_grf &&
2852 grf < first_grf + grf_len) {
2853 deps[grf - first_grf] = false;
2854 if (inst->exec_size == 16)
2855 deps[grf - first_grf + 1] = false;
2856 }
2857 }
2858 }
2859
2860 /**
2861 * Implements this workaround for the original 965:
2862 *
2863 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2864 * check for post destination dependencies on this instruction, software
2865 * must ensure that there is no destination hazard for the case of ‘write
2866 * followed by a posted write’ shown in the following example.
2867 *
2868 * 1. mov r3 0
2869 * 2. send r3.xy <rest of send instruction>
2870 * 3. mov r2 r3
2871 *
2872 * Due to no post-destination dependency check on the ‘send’, the above
2873 * code sequence could have two instructions (1 and 2) in flight at the
2874 * same time that both consider ‘r3’ as the target of their final writes.
2875 */
2876 void
2877 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2878 fs_inst *inst)
2879 {
2880 int write_len = inst->regs_written;
2881 int first_write_grf = inst->dst.reg;
2882 bool needs_dep[BRW_MAX_MRF];
2883 assert(write_len < (int)sizeof(needs_dep) - 1);
2884
2885 memset(needs_dep, false, sizeof(needs_dep));
2886 memset(needs_dep, true, write_len);
2887
2888 clear_deps_for_inst_src(inst, dispatch_width,
2889 needs_dep, first_write_grf, write_len);
2890
2891 /* Walk backwards looking for writes to registers we're writing which
2892 * aren't read since being written. If we hit the start of the program,
2893 * we assume that there are no outstanding dependencies on entry to the
2894 * program.
2895 */
2896 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2897 /* If we hit control flow, assume that there *are* outstanding
2898 * dependencies, and force their cleanup before our instruction.
2899 */
2900 if (block->start() == scan_inst) {
2901 for (int i = 0; i < write_len; i++) {
2902 if (needs_dep[i]) {
2903 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2904 }
2905 }
2906 return;
2907 }
2908
2909 /* We insert our reads as late as possible, on the assumption that any
2910 * instruction other than a MOV that might have left us an outstanding
2911 * dependency has more latency than a MOV.
2912 */
2913 if (scan_inst->dst.file == GRF) {
2914 for (int i = 0; i < scan_inst->regs_written; i++) {
2915 int reg = scan_inst->dst.reg + i;
2916
2917 if (reg >= first_write_grf &&
2918 reg < first_write_grf + write_len &&
2919 needs_dep[reg - first_write_grf]) {
2920 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2921 needs_dep[reg - first_write_grf] = false;
2922 if (scan_inst->exec_size == 16)
2923 needs_dep[reg - first_write_grf + 1] = false;
2924 }
2925 }
2926 }
2927
2928 /* Clear the flag for registers that actually got read (as expected). */
2929 clear_deps_for_inst_src(scan_inst, dispatch_width,
2930 needs_dep, first_write_grf, write_len);
2931
2932 /* Continue the loop only if we haven't resolved all the dependencies */
2933 int i;
2934 for (i = 0; i < write_len; i++) {
2935 if (needs_dep[i])
2936 break;
2937 }
2938 if (i == write_len)
2939 return;
2940 }
2941 }
2942
2943 /**
2944 * Implements this workaround for the original 965:
2945 *
2946 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2947 * used as a destination register until after it has been sourced by an
2948 * instruction with a different destination register.
2949 */
2950 void
2951 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2952 {
2953 int write_len = inst->regs_written;
2954 int first_write_grf = inst->dst.reg;
2955 bool needs_dep[BRW_MAX_MRF];
2956 assert(write_len < (int)sizeof(needs_dep) - 1);
2957
2958 memset(needs_dep, false, sizeof(needs_dep));
2959 memset(needs_dep, true, write_len);
2960 /* Walk forwards looking for writes to registers we're writing which aren't
2961 * read before being written.
2962 */
2963 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2964 /* If we hit control flow, force resolve all remaining dependencies. */
2965 if (block->end() == scan_inst) {
2966 for (int i = 0; i < write_len; i++) {
2967 if (needs_dep[i])
2968 scan_inst->insert_before(block,
2969 DEP_RESOLVE_MOV(first_write_grf + i));
2970 }
2971 return;
2972 }
2973
2974 /* Clear the flag for registers that actually got read (as expected). */
2975 clear_deps_for_inst_src(scan_inst, dispatch_width,
2976 needs_dep, first_write_grf, write_len);
2977
2978 /* We insert our reads as late as possible since they're reading the
2979 * result of a SEND, which has massive latency.
2980 */
2981 if (scan_inst->dst.file == GRF &&
2982 scan_inst->dst.reg >= first_write_grf &&
2983 scan_inst->dst.reg < first_write_grf + write_len &&
2984 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2985 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2986 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2987 }
2988
2989 /* Continue the loop only if we haven't resolved all the dependencies */
2990 int i;
2991 for (i = 0; i < write_len; i++) {
2992 if (needs_dep[i])
2993 break;
2994 }
2995 if (i == write_len)
2996 return;
2997 }
2998
2999 /* If we hit the end of the program, resolve all remaining dependencies out
3000 * of paranoia.
3001 */
3002 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
3003 assert(last_inst->eot);
3004 for (int i = 0; i < write_len; i++) {
3005 if (needs_dep[i])
3006 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3007 }
3008 }
3009
3010 void
3011 fs_visitor::insert_gen4_send_dependency_workarounds()
3012 {
3013 if (brw->gen != 4 || brw->is_g4x)
3014 return;
3015
3016 bool progress = false;
3017
3018 /* Note that we're done with register allocation, so GRF fs_regs always
3019 * have a .reg_offset of 0.
3020 */
3021
3022 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3023 if (inst->mlen != 0 && inst->dst.file == GRF) {
3024 insert_gen4_pre_send_dependency_workarounds(block, inst);
3025 insert_gen4_post_send_dependency_workarounds(block, inst);
3026 progress = true;
3027 }
3028 }
3029
3030 if (progress)
3031 invalidate_live_intervals();
3032 }
3033
3034 /**
3035 * Turns the generic expression-style uniform pull constant load instruction
3036 * into a hardware-specific series of instructions for loading a pull
3037 * constant.
3038 *
3039 * The expression style allows the CSE pass before this to optimize out
3040 * repeated loads from the same offset, and gives the pre-register-allocation
3041 * scheduling full flexibility, while the conversion to native instructions
3042 * allows the post-register-allocation scheduler the best information
3043 * possible.
3044 *
3045 * Note that execution masking for setting up pull constant loads is special:
3046 * the channels that need to be written are unrelated to the current execution
3047 * mask, since a later instruction will use one of the result channels as a
3048 * source operand for all 8 or 16 of its channels.
3049 */
3050 void
3051 fs_visitor::lower_uniform_pull_constant_loads()
3052 {
3053 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3054 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3055 continue;
3056
3057 if (brw->gen >= 7) {
3058 /* The offset arg before was a vec4-aligned byte offset. We need to
3059 * turn it into a dword offset.
3060 */
3061 fs_reg const_offset_reg = inst->src[1];
3062 assert(const_offset_reg.file == IMM &&
3063 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3064 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3065 fs_reg payload = vgrf(glsl_type::uint_type);
3066
3067 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3068 * Reserve space for the register.
3069 */
3070 if (brw->gen >= 9) {
3071 payload.reg_offset++;
3072 alloc.sizes[payload.reg] = 2;
3073 }
3074
3075 /* This is actually going to be a MOV, but since only the first dword
3076 * is accessed, we have a special opcode to do just that one. Note
3077 * that this needs to be an operation that will be considered a def
3078 * by live variable analysis, or register allocation will explode.
3079 */
3080 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3081 8, payload, const_offset_reg);
3082 setup->force_writemask_all = true;
3083
3084 setup->ir = inst->ir;
3085 setup->annotation = inst->annotation;
3086 inst->insert_before(block, setup);
3087
3088 /* Similarly, this will only populate the first 4 channels of the
3089 * result register (since we only use smear values from 0-3), but we
3090 * don't tell the optimizer.
3091 */
3092 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3093 inst->src[1] = payload;
3094
3095 invalidate_live_intervals();
3096 } else {
3097 /* Before register allocation, we didn't tell the scheduler about the
3098 * MRF we use. We know it's safe to use this MRF because nothing
3099 * else does except for register spill/unspill, which generates and
3100 * uses its MRF within a single IR instruction.
3101 */
3102 inst->base_mrf = 14;
3103 inst->mlen = 1;
3104 }
3105 }
3106 }
3107
3108 bool
3109 fs_visitor::lower_load_payload()
3110 {
3111 bool progress = false;
3112
3113 int vgrf_to_reg[alloc.count];
3114 int reg_count = 16; /* Leave room for MRF */
3115 for (unsigned i = 0; i < alloc.count; ++i) {
3116 vgrf_to_reg[i] = reg_count;
3117 reg_count += alloc.sizes[i];
3118 }
3119
3120 struct {
3121 bool written:1; /* Whether this register has ever been written */
3122 bool force_writemask_all:1;
3123 bool force_sechalf:1;
3124 } metadata[reg_count];
3125 memset(metadata, 0, sizeof(metadata));
3126
3127 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3128 int dst_reg;
3129 if (inst->dst.file == GRF) {
3130 dst_reg = vgrf_to_reg[inst->dst.reg];
3131 } else {
3132 /* MRF */
3133 dst_reg = inst->dst.reg;
3134 }
3135
3136 if (inst->dst.file == MRF || inst->dst.file == GRF) {
3137 bool force_sechalf = inst->force_sechalf;
3138 bool toggle_sechalf = inst->dst.width == 16 &&
3139 type_sz(inst->dst.type) == 4;
3140 for (int i = 0; i < inst->regs_written; ++i) {
3141 metadata[dst_reg + i].written = true;
3142 metadata[dst_reg + i].force_sechalf = force_sechalf;
3143 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3144 force_sechalf = (toggle_sechalf != force_sechalf);
3145 }
3146 }
3147
3148 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3149 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3150 fs_reg dst = inst->dst;
3151
3152 for (int i = 0; i < inst->sources; i++) {
3153 dst.width = inst->src[i].effective_width;
3154 dst.type = inst->src[i].type;
3155
3156 if (inst->src[i].file == BAD_FILE) {
3157 /* Do nothing but otherwise increment as normal */
3158 } else if (dst.file == MRF &&
3159 dst.width == 8 &&
3160 brw->has_compr4 &&
3161 i + 4 < inst->sources &&
3162 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3163 fs_reg compr4_dst = dst;
3164 compr4_dst.reg += BRW_MRF_COMPR4;
3165 compr4_dst.width = 16;
3166 fs_reg compr4_src = inst->src[i];
3167 compr4_src.width = 16;
3168 fs_inst *mov = MOV(compr4_dst, compr4_src);
3169 mov->force_writemask_all = true;
3170 inst->insert_before(block, mov);
3171 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3172 inst->src[i + 4].file = BAD_FILE;
3173 } else {
3174 fs_inst *mov = MOV(dst, inst->src[i]);
3175 if (inst->src[i].file == GRF) {
3176 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3177 inst->src[i].reg_offset;
3178 mov->force_sechalf = metadata[src_reg].force_sechalf;
3179 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3180 metadata[dst_reg] = metadata[src_reg];
3181 if (dst.width * type_sz(dst.type) > 32) {
3182 assert((!metadata[src_reg].written ||
3183 !metadata[src_reg].force_sechalf) &&
3184 (!metadata[src_reg + 1].written ||
3185 metadata[src_reg + 1].force_sechalf));
3186 metadata[dst_reg + 1] = metadata[src_reg + 1];
3187 }
3188 } else {
3189 metadata[dst_reg].force_writemask_all = false;
3190 metadata[dst_reg].force_sechalf = false;
3191 if (dst.width == 16) {
3192 metadata[dst_reg + 1].force_writemask_all = false;
3193 metadata[dst_reg + 1].force_sechalf = true;
3194 }
3195 }
3196 inst->insert_before(block, mov);
3197 }
3198
3199 dst = offset(dst, 1);
3200 }
3201
3202 inst->remove(block);
3203 progress = true;
3204 }
3205 }
3206
3207 if (progress)
3208 invalidate_live_intervals();
3209
3210 return progress;
3211 }
3212
3213 void
3214 fs_visitor::dump_instructions()
3215 {
3216 dump_instructions(NULL);
3217 }
3218
3219 void
3220 fs_visitor::dump_instructions(const char *name)
3221 {
3222 FILE *file = stderr;
3223 if (name && geteuid() != 0) {
3224 file = fopen(name, "w");
3225 if (!file)
3226 file = stderr;
3227 }
3228
3229 if (cfg) {
3230 calculate_register_pressure();
3231 int ip = 0, max_pressure = 0;
3232 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3233 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3234 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3235 dump_instruction(inst, file);
3236 ip++;
3237 }
3238 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3239 } else {
3240 int ip = 0;
3241 foreach_in_list(backend_instruction, inst, &instructions) {
3242 fprintf(file, "%4d: ", ip++);
3243 dump_instruction(inst, file);
3244 }
3245 }
3246
3247 if (file != stderr) {
3248 fclose(file);
3249 }
3250 }
3251
3252 void
3253 fs_visitor::dump_instruction(backend_instruction *be_inst)
3254 {
3255 dump_instruction(be_inst, stderr);
3256 }
3257
3258 void
3259 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3260 {
3261 fs_inst *inst = (fs_inst *)be_inst;
3262
3263 if (inst->predicate) {
3264 fprintf(file, "(%cf0.%d) ",
3265 inst->predicate_inverse ? '-' : '+',
3266 inst->flag_subreg);
3267 }
3268
3269 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3270 if (inst->saturate)
3271 fprintf(file, ".sat");
3272 if (inst->conditional_mod) {
3273 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3274 if (!inst->predicate &&
3275 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3276 inst->opcode != BRW_OPCODE_IF &&
3277 inst->opcode != BRW_OPCODE_WHILE))) {
3278 fprintf(file, ".f0.%d", inst->flag_subreg);
3279 }
3280 }
3281 fprintf(file, "(%d) ", inst->exec_size);
3282
3283
3284 switch (inst->dst.file) {
3285 case GRF:
3286 fprintf(file, "vgrf%d", inst->dst.reg);
3287 if (inst->dst.width != dispatch_width)
3288 fprintf(file, "@%d", inst->dst.width);
3289 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3290 inst->dst.subreg_offset)
3291 fprintf(file, "+%d.%d",
3292 inst->dst.reg_offset, inst->dst.subreg_offset);
3293 break;
3294 case MRF:
3295 fprintf(file, "m%d", inst->dst.reg);
3296 break;
3297 case BAD_FILE:
3298 fprintf(file, "(null)");
3299 break;
3300 case UNIFORM:
3301 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3302 break;
3303 case ATTR:
3304 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3305 break;
3306 case HW_REG:
3307 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3308 switch (inst->dst.fixed_hw_reg.nr) {
3309 case BRW_ARF_NULL:
3310 fprintf(file, "null");
3311 break;
3312 case BRW_ARF_ADDRESS:
3313 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3314 break;
3315 case BRW_ARF_ACCUMULATOR:
3316 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3317 break;
3318 case BRW_ARF_FLAG:
3319 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3320 inst->dst.fixed_hw_reg.subnr);
3321 break;
3322 default:
3323 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3324 inst->dst.fixed_hw_reg.subnr);
3325 break;
3326 }
3327 } else {
3328 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3329 }
3330 if (inst->dst.fixed_hw_reg.subnr)
3331 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3332 break;
3333 default:
3334 fprintf(file, "???");
3335 break;
3336 }
3337 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3338
3339 for (int i = 0; i < inst->sources; i++) {
3340 if (inst->src[i].negate)
3341 fprintf(file, "-");
3342 if (inst->src[i].abs)
3343 fprintf(file, "|");
3344 switch (inst->src[i].file) {
3345 case GRF:
3346 fprintf(file, "vgrf%d", inst->src[i].reg);
3347 if (inst->src[i].width != dispatch_width)
3348 fprintf(file, "@%d", inst->src[i].width);
3349 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3350 inst->src[i].subreg_offset)
3351 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3352 inst->src[i].subreg_offset);
3353 break;
3354 case MRF:
3355 fprintf(file, "***m%d***", inst->src[i].reg);
3356 break;
3357 case ATTR:
3358 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3359 break;
3360 case UNIFORM:
3361 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3362 if (inst->src[i].reladdr) {
3363 fprintf(file, "+reladdr");
3364 } else if (inst->src[i].subreg_offset) {
3365 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3366 inst->src[i].subreg_offset);
3367 }
3368 break;
3369 case BAD_FILE:
3370 fprintf(file, "(null)");
3371 break;
3372 case IMM:
3373 switch (inst->src[i].type) {
3374 case BRW_REGISTER_TYPE_F:
3375 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3376 break;
3377 case BRW_REGISTER_TYPE_W:
3378 case BRW_REGISTER_TYPE_D:
3379 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3380 break;
3381 case BRW_REGISTER_TYPE_UW:
3382 case BRW_REGISTER_TYPE_UD:
3383 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3384 break;
3385 case BRW_REGISTER_TYPE_VF:
3386 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3387 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3388 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3389 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3390 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3391 break;
3392 default:
3393 fprintf(file, "???");
3394 break;
3395 }
3396 break;
3397 case HW_REG:
3398 if (inst->src[i].fixed_hw_reg.negate)
3399 fprintf(file, "-");
3400 if (inst->src[i].fixed_hw_reg.abs)
3401 fprintf(file, "|");
3402 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3403 switch (inst->src[i].fixed_hw_reg.nr) {
3404 case BRW_ARF_NULL:
3405 fprintf(file, "null");
3406 break;
3407 case BRW_ARF_ADDRESS:
3408 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3409 break;
3410 case BRW_ARF_ACCUMULATOR:
3411 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3412 break;
3413 case BRW_ARF_FLAG:
3414 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3415 inst->src[i].fixed_hw_reg.subnr);
3416 break;
3417 default:
3418 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3419 inst->src[i].fixed_hw_reg.subnr);
3420 break;
3421 }
3422 } else {
3423 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3424 }
3425 if (inst->src[i].fixed_hw_reg.subnr)
3426 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3427 if (inst->src[i].fixed_hw_reg.abs)
3428 fprintf(file, "|");
3429 break;
3430 default:
3431 fprintf(file, "???");
3432 break;
3433 }
3434 if (inst->src[i].abs)
3435 fprintf(file, "|");
3436
3437 if (inst->src[i].file != IMM) {
3438 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3439 }
3440
3441 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3442 fprintf(file, ", ");
3443 }
3444
3445 fprintf(file, " ");
3446
3447 if (dispatch_width == 16 && inst->exec_size == 8) {
3448 if (inst->force_sechalf)
3449 fprintf(file, "2ndhalf ");
3450 else
3451 fprintf(file, "1sthalf ");
3452 }
3453
3454 fprintf(file, "\n");
3455 }
3456
3457 /**
3458 * Possibly returns an instruction that set up @param reg.
3459 *
3460 * Sometimes we want to take the result of some expression/variable
3461 * dereference tree and rewrite the instruction generating the result
3462 * of the tree. When processing the tree, we know that the
3463 * instructions generated are all writing temporaries that are dead
3464 * outside of this tree. So, if we have some instructions that write
3465 * a temporary, we're free to point that temp write somewhere else.
3466 *
3467 * Note that this doesn't guarantee that the returned instruction wrote
3468 * only reg -- it might be the size=4 destination of a texture instruction.
3469 */
3470 fs_inst *
3471 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3472 fs_inst *end,
3473 const fs_reg &reg)
3474 {
3475 if (end == start ||
3476 end->is_partial_write() ||
3477 reg.reladdr ||
3478 !reg.equals(end->dst)) {
3479 return NULL;
3480 } else {
3481 return end;
3482 }
3483 }
3484
3485 void
3486 fs_visitor::setup_payload_gen6()
3487 {
3488 bool uses_depth =
3489 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3490 unsigned barycentric_interp_modes =
3491 (stage == MESA_SHADER_FRAGMENT) ?
3492 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3493
3494 assert(brw->gen >= 6);
3495
3496 /* R0-1: masks, pixel X/Y coordinates. */
3497 payload.num_regs = 2;
3498 /* R2: only for 32-pixel dispatch. */
3499
3500 /* R3-26: barycentric interpolation coordinates. These appear in the
3501 * same order that they appear in the brw_wm_barycentric_interp_mode
3502 * enum. Each set of coordinates occupies 2 registers if dispatch width
3503 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3504 * appear if they were enabled using the "Barycentric Interpolation
3505 * Mode" bits in WM_STATE.
3506 */
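   /* For example, a SIMD16 shader with two barycentric modes enabled
    * reserves 2 * 4 = 8 payload registers in this loop.
    */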
3507 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3508 if (barycentric_interp_modes & (1 << i)) {
3509 payload.barycentric_coord_reg[i] = payload.num_regs;
3510 payload.num_regs += 2;
3511 if (dispatch_width == 16) {
3512 payload.num_regs += 2;
3513 }
3514 }
3515 }
3516
3517 /* R27: interpolated depth if uses source depth */
3518 if (uses_depth) {
3519 payload.source_depth_reg = payload.num_regs;
3520 payload.num_regs++;
3521 if (dispatch_width == 16) {
3522 /* R28: interpolated depth if not SIMD8. */
3523 payload.num_regs++;
3524 }
3525 }
3526 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3527 if (uses_depth) {
3528 payload.source_w_reg = payload.num_regs;
3529 payload.num_regs++;
3530 if (dispatch_width == 16) {
3531 /* R30: interpolated W if not SIMD8. */
3532 payload.num_regs++;
3533 }
3534 }
3535
3536 if (stage == MESA_SHADER_FRAGMENT) {
3537 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3538 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3539 prog_data->uses_pos_offset = key->compute_pos_offset;
3540 /* R31: MSAA position offsets. */
3541 if (prog_data->uses_pos_offset) {
3542 payload.sample_pos_reg = payload.num_regs;
3543 payload.num_regs++;
3544 }
3545 }
3546
3547 /* R32: MSAA input coverage mask */
3548 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3549 assert(brw->gen >= 7);
3550 payload.sample_mask_in_reg = payload.num_regs;
3551 payload.num_regs++;
3552 if (dispatch_width == 16) {
3553 /* R33: input coverage mask if not SIMD8. */
3554 payload.num_regs++;
3555 }
3556 }
3557
3558 /* R34-: bary for 32-pixel. */
3559 /* R58-59: interp W for 32-pixel. */
3560
3561 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3562 source_depth_to_render_target = true;
3563 }
3564 }
3565
3566 void
3567 fs_visitor::setup_vs_payload()
3568 {
3569 /* R0: thread header, R1: urb handles */
3570 payload.num_regs = 2;
3571 }
3572
3573 void
3574 fs_visitor::assign_binding_table_offsets()
3575 {
3576 assert(stage == MESA_SHADER_FRAGMENT);
3577 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3578 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3579 uint32_t next_binding_table_offset = 0;
3580
3581 /* If there are no color regions, we still perform an FB write to a null
3582 * renderbuffer, which we place at surface index 0.
3583 */
3584 prog_data->binding_table.render_target_start = next_binding_table_offset;
3585 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3586
3587 assign_common_binding_table_offsets(next_binding_table_offset);
3588 }
3589
3590 void
3591 fs_visitor::calculate_register_pressure()
3592 {
3593 invalidate_live_intervals();
3594 calculate_live_intervals();
3595
3596 unsigned num_instructions = 0;
3597 foreach_block(block, cfg)
3598 num_instructions += block->instructions.length();
3599
3600 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3601
3602 for (unsigned reg = 0; reg < alloc.count; reg++) {
3603 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3604 regs_live_at_ip[ip] += alloc.sizes[reg];
3605 }
3606 }
3607
3608 void
3609 fs_visitor::optimize()
3610 {
3611 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3612
3613 split_virtual_grfs();
3614
3615 move_uniform_array_access_to_pull_constants();
3616 assign_constant_locations();
3617 demote_pull_constants();
3618
3619 #define OPT(pass, args...) ({ \
3620 pass_num++; \
3621 bool this_progress = pass(args); \
3622 \
3623 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3624 char filename[64]; \
3625 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3626 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3627 \
3628 backend_visitor::dump_instructions(filename); \
3629 } \
3630 \
3631 progress = progress || this_progress; \
3632 this_progress; \
3633 })
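   /* With the format above, a dump produced by, say, the opt_cse pass on a
    * hypothetical SIMD8 fragment shader named 3 during iteration 1, pass 2
    * would be written to "fs8-0003-01-02-opt_cse".
    */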
3634
3635 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3636 char filename[64];
3637 snprintf(filename, 64, "%s%d-%04d-00-start",
3638 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3639
3640 backend_visitor::dump_instructions(filename);
3641 }
3642
3643 bool progress;
3644 int iteration = 0;
3645 int pass_num = 0;
3646 do {
3647 progress = false;
3648 pass_num = 0;
3649 iteration++;
3650
3651 OPT(remove_duplicate_mrf_writes);
3652
3653 OPT(opt_algebraic);
3654 OPT(opt_cse);
3655 OPT(opt_copy_propagate);
3656 OPT(opt_peephole_predicated_break);
3657 OPT(opt_cmod_propagation);
3658 OPT(dead_code_eliminate);
3659 OPT(opt_peephole_sel);
3660 OPT(dead_control_flow_eliminate, this);
3661 OPT(opt_register_renaming);
3662 OPT(opt_saturate_propagation);
3663 OPT(register_coalesce);
3664 OPT(compute_to_mrf);
3665
3666 OPT(compact_virtual_grfs);
3667 } while (progress);
3668
3669 pass_num = 0;
3670
3671 if (OPT(lower_load_payload)) {
3672 split_virtual_grfs();
3673 OPT(register_coalesce);
3674 OPT(compute_to_mrf);
3675 OPT(dead_code_eliminate);
3676 }
3677
3678 OPT(opt_combine_constants);
3679
3680 lower_uniform_pull_constant_loads();
3681 }
3682
3683 /**
3684 * Three source instruction must have a GRF/MRF destination register.
3685 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3686 */
3687 void
3688 fs_visitor::fixup_3src_null_dest()
3689 {
3690 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3691 if (inst->is_3src() && inst->dst.is_null()) {
3692 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3693 inst->dst.type);
3694 }
3695 }
3696 }
3697
3698 void
3699 fs_visitor::allocate_registers()
3700 {
3701 bool allocated_without_spills;
3702
3703 static const enum instruction_scheduler_mode pre_modes[] = {
3704 SCHEDULE_PRE,
3705 SCHEDULE_PRE_NON_LIFO,
3706 SCHEDULE_PRE_LIFO,
3707 };
3708
3709 /* Try each scheduling heuristic to see if it can successfully register
3710 * allocate without spilling. They should be ordered by decreasing
3711 * performance but increasing likelihood of allocating.
3712 */
3713 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3714 schedule_instructions(pre_modes[i]);
3715
3716 if (0) {
3717 assign_regs_trivial();
3718 allocated_without_spills = true;
3719 } else {
3720 allocated_without_spills = assign_regs(false);
3721 }
3722 if (allocated_without_spills)
3723 break;
3724 }
3725
3726 if (!allocated_without_spills) {
3727 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3728 "Vertex" : "Fragment";
3729
3730 /* We assume that any spilling is worse than just dropping back to
3731        * SIMD8.  There is probably some intermediate point where SIMD16
3732        * with a couple of spills is still better.
3733 */
3734 if (dispatch_width == 16) {
3735 fail("Failure to register allocate. Reduce number of "
3736 "live scalar values to avoid this.");
3737 } else {
3738 perf_debug("%s shader triggered register spilling. "
3739 "Try reducing the number of live scalar values to "
3740 "improve performance.\n", stage_name);
3741 }
3742
3743 /* Since we're out of heuristics, just go spill registers until we
3744 * get an allocation.
3745 */
3746 while (!assign_regs(true)) {
3747 if (failed)
3748 break;
3749 }
3750 }
3751
3752 /* This must come after all optimization and register allocation, since
3753 * it inserts dead code that happens to have side effects, and it does
3754 * so based on the actual physical registers in use.
3755 */
3756 insert_gen4_send_dependency_workarounds();
3757
3758 if (failed)
3759 return;
3760
3761 if (!allocated_without_spills)
3762 schedule_instructions(SCHEDULE_POST);
3763
3764 if (last_scratch > 0)
3765 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3766 }
3767
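/**
 * Generate, optimize, and register-allocate code for a vertex shader.
 * Returns false on failure.
 */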
3768 bool
3769 fs_visitor::run_vs()
3770 {
3771 assert(stage == MESA_SHADER_VERTEX);
3772
3773 assign_common_binding_table_offsets(0);
3774 setup_vs_payload();
3775
3776 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3777 emit_shader_time_begin();
3778
3779 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3780 base_ir = ir;
3781 this->result = reg_undef;
3782 ir->accept(this);
3783 }
3784 base_ir = NULL;
3785 if (failed)
3786 return false;
3787
3788 emit_urb_writes();
3789
3790 calculate_cfg();
3791
3792 optimize();
3793
3794 assign_curb_setup();
3795 assign_vs_urb_setup();
3796
3797 fixup_3src_null_dest();
3798 allocate_registers();
3799
3800 return !failed;
3801 }
3802
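/**
 * Generate, optimize, and register-allocate code for a fragment shader at
 * the current dispatch width.  Returns false on failure.
 */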
3803 bool
3804 fs_visitor::run_fs()
3805 {
3806 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3807 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3808
3809 assert(stage == MESA_SHADER_FRAGMENT);
3810
3811 sanity_param_count = prog->Parameters->NumParameters;
3812
3813 assign_binding_table_offsets();
3814
3815 if (brw->gen >= 6)
3816 setup_payload_gen6();
3817 else
3818 setup_payload_gen4();
3819
3820 if (0) {
3821 emit_dummy_fs();
3822 } else if (brw->use_rep_send && dispatch_width == 16) {
3823 emit_repclear_shader();
3824 } else {
3825 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3826 emit_shader_time_begin();
3827
3828 calculate_urb_setup();
3829 if (prog->InputsRead > 0) {
3830 if (brw->gen < 6)
3831 emit_interpolation_setup_gen4();
3832 else
3833 emit_interpolation_setup_gen6();
3834 }
3835
3836 /* We handle discards by keeping track of the still-live pixels in f0.1.
3837 * Initialize it with the dispatched pixels.
3838 */
3839 if (wm_prog_data->uses_kill) {
3840 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3841 discard_init->flag_subreg = 1;
3842 }
3843
3844       /* Generate FS IR for main().  (The visitor only descends into
3845        * functions called "main".)
3846 */
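      /* When the INTEL_USE_NIR environment variable is set, go through the
       * NIR path instead of visiting the GLSL IR directly.
       */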
3847 if (shader) {
3848 if (getenv("INTEL_USE_NIR") != NULL) {
3849 emit_nir_code();
3850 } else {
3851 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3852 base_ir = ir;
3853 this->result = reg_undef;
3854 ir->accept(this);
3855 }
3856 }
3857 } else {
3858 emit_fragment_program_code();
3859 }
3860 base_ir = NULL;
3861 if (failed)
3862 return false;
3863
3864 emit(FS_OPCODE_PLACEHOLDER_HALT);
3865
3866 if (wm_key->alpha_test_func)
3867 emit_alpha_test();
3868
3869 emit_fb_writes();
3870
3871 calculate_cfg();
3872
3873 optimize();
3874
3875 assign_curb_setup();
3876 assign_urb_setup();
3877
3878 fixup_3src_null_dest();
3879 allocate_registers();
3880
3881 if (failed)
3882 return false;
3883 }
3884
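   /* Record how many register blocks this program uses, separately for the
    * SIMD8 and SIMD16 variants.
    */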
3885 if (dispatch_width == 8)
3886 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3887 else
3888 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3889
3890 /* If any state parameters were appended, then ParameterValues could have
3891 * been realloced, in which case the driver uniform storage set up by
3892 * _mesa_associate_uniform_storage() would point to freed memory. Make
3893 * sure that didn't happen.
3894 */
3895 assert(sanity_param_count == prog->Parameters->NumParameters);
3896
3897 return !failed;
3898 }
3899
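/**
 * Compile a GLSL fragment shader or fragment program: run the SIMD8
 * compile, optionally attempt a SIMD16 compile as well, and generate
 * native code for whichever variants succeeded.
 */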
3900 const unsigned *
3901 brw_wm_fs_emit(struct brw_context *brw,
3902 void *mem_ctx,
3903 const struct brw_wm_prog_key *key,
3904 struct brw_wm_prog_data *prog_data,
3905 struct gl_fragment_program *fp,
3906 struct gl_shader_program *prog,
3907 unsigned *final_assembly_size)
3908 {
3909 bool start_busy = false;
3910 double start_time = 0;
3911
3912 if (unlikely(brw->perf_debug)) {
3913 start_busy = (brw->batch.last_bo &&
3914 drm_intel_bo_busy(brw->batch.last_bo));
3915 start_time = get_time();
3916 }
3917
3918 struct brw_shader *shader = NULL;
3919 if (prog)
3920 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3921
3922 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3923 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3924
3925 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3926 */
3927 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3928 if (!v.run_fs()) {
3929 if (prog) {
3930 prog->LinkStatus = false;
3931 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3932 }
3933
3934 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3935 v.fail_msg);
3936
3937 return NULL;
3938 }
3939
3940 cfg_t *simd16_cfg = NULL;
3941 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3942 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3943 brw->use_rep_send)) {
3944 if (!v.simd16_unsupported) {
3945 /* Try a SIMD16 compile */
3946 v2.import_uniforms(&v);
3947 if (!v2.run_fs()) {
3948 perf_debug("SIMD16 shader failed to compile, falling back to "
3949 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3950 } else {
3951 simd16_cfg = v2.cfg;
3952 }
3953 } else {
3954 perf_debug("SIMD16 shader unsupported, falling back to "
3955 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3956 }
3957 }
3958
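   /* Drop the SIMD8 program entirely when SIMD8 has been disabled (via
    * DEBUG_NO8 or brw->no_simd8) and the SIMD16 compile succeeded.
    */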
3959 cfg_t *simd8_cfg;
3960 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3961 if (no_simd8 && simd16_cfg) {
3962 simd8_cfg = NULL;
3963 prog_data->no_8 = true;
3964 } else {
3965 simd8_cfg = v.cfg;
3966 prog_data->no_8 = false;
3967 }
3968
3969 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
3970 &fp->Base, v.runtime_check_aads_emit, "FS");
3971
3972 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3973 char *name;
3974 if (prog)
3975 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
3976 prog->Label ? prog->Label : "unnamed",
3977 prog->Name);
3978 else
3979 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
3980
3981 g.enable_debug(name);
3982 }
3983
3984 if (simd8_cfg)
3985 g.generate_code(simd8_cfg, 8);
3986 if (simd16_cfg)
3987 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
3988
3989 if (unlikely(brw->perf_debug) && shader) {
3990 if (shader->compiled_once)
3991 brw_wm_debug_recompile(brw, prog, key);
3992 shader->compiled_once = true;
3993
3994 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3995 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3996 (get_time() - start_time) * 1000);
3997 }
3998 }
3999
4000 return g.get_assembly(final_assembly_size);
4001 }
4002
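/**
 * Precompile the fragment program with a best-guess program key built from
 * the program alone, so a likely variant is compiled before the first draw.
 * The currently bound WM program state is saved and restored around the call.
 */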
4003 extern "C" bool
4004 brw_fs_precompile(struct gl_context *ctx,
4005 struct gl_shader_program *shader_prog,
4006 struct gl_program *prog)
4007 {
4008 struct brw_context *brw = brw_context(ctx);
4009 struct brw_wm_prog_key key;
4010
4011 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4012 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4013 bool program_uses_dfdy = fp->UsesDFdy;
4014
4015 memset(&key, 0, sizeof(key));
4016
4017 if (brw->gen < 6) {
4018 if (fp->UsesKill)
4019 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4020
4021 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4022 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4023
4024 /* Just assume depth testing. */
4025 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4026 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4027 }
4028
4029 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4030 BRW_FS_VARYING_INPUT_MASK) > 16)
4031 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4032
4033 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4034 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4035 for (unsigned i = 0; i < sampler_count; i++) {
4036 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4037 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4038 key.tex.swizzles[i] =
4039 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4040 } else {
4041 /* Color sampler: assume no swizzling. */
4042 key.tex.swizzles[i] = SWIZZLE_XYZW;
4043 }
4044 }
4045
4046 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4047 key.drawable_height = ctx->DrawBuffer->Height;
4048 }
4049
4050 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4051 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4052 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4053
4054 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4055 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4056 key.nr_color_regions > 1;
4057 }
4058
4059 key.program_string_id = bfp->id;
4060
4061 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4062 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4063
4064 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
4065
4066 brw->wm.base.prog_offset = old_prog_offset;
4067 brw->wm.prog_data = old_prog_data;
4068
4069 return success;
4070 }