i965/fs: Strip trailing constant zeroes in sample messages
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
138
139 this->writes_accumulator = false;
140 }
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(devinfo->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
356
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
359 {
360 uint8_t exec_size = dst.width;
361 for (int i = 0; i < sources; ++i) {
362 assert(src[i].width % dst.width == 0);
363 if (src[i].width > exec_size)
364 exec_size = src[i].width;
365 }
366
367 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
368 dst, src, sources);
369 inst->regs_written = 0;
370 for (int i = 0; i < sources; ++i) {
371 /* The LOAD_PAYLOAD instruction only really makes sense if we are
372 * dealing with whole registers. If this ever changes, we can deal
373 * with it later.
374 */
375 int size = inst->src[i].effective_width * type_sz(src[i].type);
376 assert(size % 32 == 0);
377 inst->regs_written += (size + 31) / 32;
378 }
379
380 return inst;
381 }
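/* Illustrative sketch of the regs_written computation above (hypothetical
 * sources, not taken from the original code): with two SIMD8 float sources,
 * each contributes effective_width * type_sz = 8 * 4 = 32 bytes, i.e. one
 * register apiece, so regs_written = 2; a SIMD16 float source would span
 * 64 bytes and therefore contribute two registers.
 */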
382
383 exec_list
384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
385 const fs_reg &surf_index,
386 const fs_reg &varying_offset,
387 uint32_t const_offset)
388 {
389 exec_list instructions;
390 fs_inst *inst;
391
392 /* We have our constant surface use a pitch of 4 bytes, so our index can
393 * be any component of a vector, and then we load 4 contiguous
394 * components starting from that.
395 *
396 * We break down the const_offset to a portion added to the variable
397 * offset and a portion done using reg_offset, which means that if you
398 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
399 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
400 * CSE can later notice that those loads are all the same and eliminate
401 * the redundant ones.
402 */
403 fs_reg vec4_offset = vgrf(glsl_type::int_type);
404 instructions.push_tail(ADD(vec4_offset,
405 varying_offset, fs_reg(const_offset & ~3)));
406
407 int scale = 1;
408 if (devinfo->gen == 4 && dst.width == 8) {
409 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
410 * u, v, r) as parameters, or we can just use the SIMD16 message
411 * consisting of (header, u). We choose the second, at the cost of a
412 * longer return length.
413 */
414 scale = 2;
415 }
416
417 enum opcode op;
418 if (devinfo->gen >= 7)
419 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
420 else
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
422
423 assert(dst.width % 8 == 0);
424 int regs_written = 4 * (dst.width / 8) * scale;
425 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
426 dst.type, dst.width);
427 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
428 inst->regs_written = regs_written;
429 instructions.push_tail(inst);
430
431 if (devinfo->gen < 7) {
432 inst->base_mrf = 13;
433 inst->header_present = true;
434 if (devinfo->gen == 4)
435 inst->mlen = 3;
436 else
437 inst->mlen = 1 + dispatch_width / 8;
438 }
439
440 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
441 instructions.push_tail(MOV(dst, result));
442
443 return instructions;
444 }
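/* A worked example of the offset split above, with hypothetical values: for
 * "uniform vec4 a[20]; gl_FragColor = a[i]" and const_offset = 7 with
 * scale = 1, the ADD computes vec4_offset = varying_offset + (7 & ~3) =
 * varying_offset + 4, the pull load fetches 4 contiguous components from
 * that base, and the final MOV selects component (7 & 3) * scale = 3 of
 * vec4_result. On gen4 SIMD8 (scale = 2) the selected component index is
 * doubled to account for the SIMD16 return layout.
 */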
445
446 /**
447 * A helper for MOV generation for fixing up broken hardware SEND dependency
448 * handling.
449 */
450 fs_inst *
451 fs_visitor::DEP_RESOLVE_MOV(int grf)
452 {
453 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
454
455 inst->ir = NULL;
456 inst->annotation = "send dependency resolve";
457
458 /* The caller always wants an uncompressed instruction, to emit the minimal
459 * extra dependencies and to avoid having to deal with aligning its regs to 2.
460 */
461 inst->exec_size = 8;
462
463 return inst;
464 }
465
466 bool
467 fs_inst::equals(fs_inst *inst) const
468 {
469 return (opcode == inst->opcode &&
470 dst.equals(inst->dst) &&
471 src[0].equals(inst->src[0]) &&
472 src[1].equals(inst->src[1]) &&
473 src[2].equals(inst->src[2]) &&
474 saturate == inst->saturate &&
475 predicate == inst->predicate &&
476 conditional_mod == inst->conditional_mod &&
477 mlen == inst->mlen &&
478 base_mrf == inst->base_mrf &&
479 target == inst->target &&
480 eot == inst->eot &&
481 header_present == inst->header_present &&
482 shadow_compare == inst->shadow_compare &&
483 exec_size == inst->exec_size &&
484 offset == inst->offset);
485 }
486
487 bool
488 fs_inst::overwrites_reg(const fs_reg &reg) const
489 {
490 return reg.in_range(dst, regs_written);
491 }
492
493 bool
494 fs_inst::is_send_from_grf() const
495 {
496 switch (opcode) {
497 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
498 case SHADER_OPCODE_SHADER_TIME_ADD:
499 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
500 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
501 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
502 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
503 case SHADER_OPCODE_UNTYPED_ATOMIC:
504 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
505 case SHADER_OPCODE_URB_WRITE_SIMD8:
506 return true;
507 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
508 return src[1].file == GRF;
509 case FS_OPCODE_FB_WRITE:
510 return src[0].file == GRF;
511 default:
512 if (is_tex())
513 return src[0].file == GRF;
514
515 return false;
516 }
517 }
518
519 bool
520 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
521 {
522 if (devinfo->gen == 6 && is_math())
523 return false;
524
525 if (is_send_from_grf())
526 return false;
527
528 if (!backend_instruction::can_do_source_mods())
529 return false;
530
531 return true;
532 }
533
534 bool
535 fs_inst::has_side_effects() const
536 {
537 return this->eot || backend_instruction::has_side_effects();
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Vector float immediate value constructor. */
585 fs_reg::fs_reg(uint8_t vf[4])
586 {
587 init();
588 this->file = IMM;
589 this->type = BRW_REGISTER_TYPE_VF;
590 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
591 }
592
593 /** Vector float immediate value constructor. */
594 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
595 {
596 init();
597 this->file = IMM;
598 this->type = BRW_REGISTER_TYPE_VF;
599 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
600 (vf1 << 8) |
601 (vf2 << 16) |
602 (vf3 << 24);
603 }
604
605 /** Fixed brw_reg. */
606 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
607 {
608 init();
609 this->file = HW_REG;
610 this->fixed_hw_reg = fixed_hw_reg;
611 this->type = fixed_hw_reg.type;
612 this->width = 1 << fixed_hw_reg.width;
613 }
614
615 bool
616 fs_reg::equals(const fs_reg &r) const
617 {
618 return (file == r.file &&
619 reg == r.reg &&
620 reg_offset == r.reg_offset &&
621 subreg_offset == r.subreg_offset &&
622 type == r.type &&
623 negate == r.negate &&
624 abs == r.abs &&
625 !reladdr && !r.reladdr &&
626 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
627 width == r.width &&
628 stride == r.stride);
629 }
630
631 fs_reg &
632 fs_reg::set_smear(unsigned subreg)
633 {
634 assert(file != HW_REG && file != IMM);
635 subreg_offset = subreg * type_sz(type);
636 stride = 0;
637 return *this;
638 }
639
640 bool
641 fs_reg::is_contiguous() const
642 {
643 return stride == 1;
644 }
645
646 int
647 fs_visitor::type_size(const struct glsl_type *type)
648 {
649 unsigned int size, i;
650
651 switch (type->base_type) {
652 case GLSL_TYPE_UINT:
653 case GLSL_TYPE_INT:
654 case GLSL_TYPE_FLOAT:
655 case GLSL_TYPE_BOOL:
656 return type->components();
657 case GLSL_TYPE_ARRAY:
658 return type_size(type->fields.array) * type->length;
659 case GLSL_TYPE_STRUCT:
660 size = 0;
661 for (i = 0; i < type->length; i++) {
662 size += type_size(type->fields.structure[i].type);
663 }
664 return size;
665 case GLSL_TYPE_SAMPLER:
666 /* Samplers take up no register space, since they're baked in at
667 * link time.
668 */
669 return 0;
670 case GLSL_TYPE_ATOMIC_UINT:
671 return 0;
672 case GLSL_TYPE_IMAGE:
673 case GLSL_TYPE_VOID:
674 case GLSL_TYPE_ERROR:
675 case GLSL_TYPE_INTERFACE:
676 case GLSL_TYPE_DOUBLE:
677 unreachable("not reached");
678 }
679
680 return 0;
681 }
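/* A few illustrative sizes as counted by type_size() (example types only):
 * float -> 1, vec4 -> 4, mat3 -> 9, float[10] -> 10,
 * struct { vec3 v; float f[2]; } -> 3 + 2 = 5,
 * while samplers and atomic counters contribute 0 slots.
 */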
682
683 /**
684 * Create a MOV to read the timestamp register.
685 *
686 * The caller is responsible for emitting the MOV. The return value is
687 * the destination of the MOV, with extra parameters set.
688 */
689 fs_reg
690 fs_visitor::get_timestamp(fs_inst **out_mov)
691 {
692 assert(devinfo->gen >= 7);
693
694 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
695 BRW_ARF_TIMESTAMP,
696 0),
697 BRW_REGISTER_TYPE_UD));
698
699 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
700
701 fs_inst *mov = MOV(dst, ts);
702 /* We want to read the 3 fields we care about even if they're not enabled in
703 * the dispatch.
704 */
705 mov->force_writemask_all = true;
706
707 /* The caller wants the low 32 bits of the timestamp. Since it's running
708 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
709 * which is plenty of time for our purposes. It is identical across the
710 * EUs, but since it's tracking GPU core speed it will increment at a
711 * varying rate as render P-states change.
712 *
713 * The caller could also check if render P-states have changed (or anything
714 * else that might disrupt timing) by setting smear to 2 and checking if
715 * that field is != 0.
716 */
717 dst.set_smear(0);
718
719 *out_mov = mov;
720 return dst;
721 }
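/* Sanity check on the rollover estimate above: a 32-bit counter ticking at
 * roughly 1.2 GHz wraps after about 2^32 / 1.2e9 = ~3.6 seconds, which is
 * where the "~3 seconds" figure in the comment comes from.
 */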
722
723 void
724 fs_visitor::emit_shader_time_begin()
725 {
726 current_annotation = "shader time start";
727 fs_inst *mov;
728 shader_start_time = get_timestamp(&mov);
729 emit(mov);
730 }
731
732 void
733 fs_visitor::emit_shader_time_end()
734 {
735 current_annotation = "shader time end";
736
737 enum shader_time_shader_type type, written_type, reset_type;
738 switch (stage) {
739 case MESA_SHADER_VERTEX:
740 type = ST_VS;
741 written_type = ST_VS_WRITTEN;
742 reset_type = ST_VS_RESET;
743 break;
744 case MESA_SHADER_GEOMETRY:
745 type = ST_GS;
746 written_type = ST_GS_WRITTEN;
747 reset_type = ST_GS_RESET;
748 break;
749 case MESA_SHADER_FRAGMENT:
750 if (dispatch_width == 8) {
751 type = ST_FS8;
752 written_type = ST_FS8_WRITTEN;
753 reset_type = ST_FS8_RESET;
754 } else {
755 assert(dispatch_width == 16);
756 type = ST_FS16;
757 written_type = ST_FS16_WRITTEN;
758 reset_type = ST_FS16_RESET;
759 }
760 break;
761 default:
762 unreachable("fs_visitor::emit_shader_time_end missing code");
763 }
764
765 /* Insert our code just before the final SEND with EOT. */
766 exec_node *end = this->instructions.get_tail();
767 assert(end && ((fs_inst *) end)->eot);
768
769 fs_inst *tm_read;
770 fs_reg shader_end_time = get_timestamp(&tm_read);
771 end->insert_before(tm_read);
772
773 /* Check that there weren't any timestamp reset events (assuming these
774 * were the only two timestamp reads that happened).
775 */
776 fs_reg reset = shader_end_time;
777 reset.set_smear(2);
778 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
779 test->conditional_mod = BRW_CONDITIONAL_Z;
780 test->force_writemask_all = true;
781 end->insert_before(test);
782 end->insert_before(IF(BRW_PREDICATE_NORMAL));
783
784 fs_reg start = shader_start_time;
785 start.negate = true;
786 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
787 diff.set_smear(0);
788 fs_inst *add = ADD(diff, start, shader_end_time);
789 add->force_writemask_all = true;
790 end->insert_before(add);
791
792 /* If there were no instructions between the two timestamp gets, the diff
793 * is 2 cycles. Remove that overhead, so I can forget about that when
794 * trying to determine the time taken for single instructions.
795 */
796 add = ADD(diff, diff, fs_reg(-2u));
797 add->force_writemask_all = true;
798 end->insert_before(add);
799
800 end->insert_before(SHADER_TIME_ADD(type, diff));
801 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
802 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
803 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
804 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
805 }
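/* Rough shape of the sequence inserted before the final EOT send by the code
 * above (illustrative pseudo-assembly, not emitted verbatim):
 *
 *   MOV   end, timestamp
 *   AND.z null, end.2, 1            // any timestamp reset since start?
 *   (+f0) IF
 *     ADD diff, -start, end
 *     ADD diff, diff, -2            // subtract the two-cycle read overhead
 *     SHADER_TIME_ADD(type, diff)
 *     SHADER_TIME_ADD(written_type, 1)
 *   ELSE
 *     SHADER_TIME_ADD(reset_type, 1)
 *   ENDIF
 */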
806
807 fs_inst *
808 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
809 {
810 int shader_time_index =
811 brw_get_shader_time_index(brw, shader_prog, prog, type);
812 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
813
814 fs_reg payload;
815 if (dispatch_width == 8)
816 payload = vgrf(glsl_type::uvec2_type);
817 else
818 payload = vgrf(glsl_type::uint_type);
819
820 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
821 fs_reg(), payload, offset, value);
822 }
823
824 void
825 fs_visitor::vfail(const char *format, va_list va)
826 {
827 char *msg;
828
829 if (failed)
830 return;
831
832 failed = true;
833
834 msg = ralloc_vasprintf(mem_ctx, format, va);
835 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
836
837 this->fail_msg = msg;
838
839 if (debug_enabled) {
840 fprintf(stderr, "%s", msg);
841 }
842 }
843
844 void
845 fs_visitor::fail(const char *format, ...)
846 {
847 va_list va;
848
849 va_start(va, format);
850 vfail(format, va);
851 va_end(va);
852 }
853
854 /**
855 * Mark this program as impossible to compile in SIMD16 mode.
856 *
857 * During the SIMD8 compile (which happens first), we can detect and flag
858 * things that are unsupported in SIMD16 mode, so the compiler can skip
859 * the SIMD16 compile altogether.
860 *
861 * During a SIMD16 compile (if one happens anyway), this just calls fail().
862 */
863 void
864 fs_visitor::no16(const char *format, ...)
865 {
866 va_list va;
867
868 va_start(va, format);
869
870 if (dispatch_width == 16) {
871 vfail(format, va);
872 } else {
873 simd16_unsupported = true;
874
875 if (brw->perf_debug) {
876 if (no16_msg)
877 ralloc_vasprintf_append(&no16_msg, format, va);
878 else
879 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
880 }
881 }
882
883 va_end(va);
884 }
885
886 fs_inst *
887 fs_visitor::emit(enum opcode opcode)
888 {
889 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
890 }
891
892 fs_inst *
893 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
894 {
895 return emit(new(mem_ctx) fs_inst(opcode, dst));
896 }
897
898 fs_inst *
899 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
900 {
901 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
902 }
903
904 fs_inst *
905 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
906 const fs_reg &src1)
907 {
908 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
909 }
910
911 fs_inst *
912 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
913 const fs_reg &src1, const fs_reg &src2)
914 {
915 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
916 }
917
918 fs_inst *
919 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
920 fs_reg src[], int sources)
921 {
922 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
923 }
924
925 /**
926 * Returns true if the instruction has a flag that means it won't
927 * update an entire destination register.
928 *
929 * For example, dead code elimination and live variable analysis want to know
930 * when a write to a variable screens off any preceding values that were in
931 * it.
932 */
933 bool
934 fs_inst::is_partial_write() const
935 {
936 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
937 (this->dst.width * type_sz(this->dst.type)) < 32 ||
938 !this->dst.is_contiguous());
939 }
940
941 int
942 fs_inst::regs_read(int arg) const
943 {
944 if (is_tex() && arg == 0 && src[0].file == GRF) {
945 return mlen;
946 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
947 return mlen;
948 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
949 return mlen;
950 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
951 return mlen;
952 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
953 return mlen;
954 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
955 return mlen;
956 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
957 return exec_size / 4;
958 }
959
960 switch (src[arg].file) {
961 case BAD_FILE:
962 case UNIFORM:
963 case IMM:
964 return 1;
965 case GRF:
966 case HW_REG:
967 if (src[arg].stride == 0) {
968 return 1;
969 } else {
970 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
971 return (size + 31) / 32;
972 }
973 case MRF:
974 unreachable("MRF registers are not allowed as sources");
975 default:
976 unreachable("Invalid register file");
977 }
978 }
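/* Example of the GRF/HW_REG sizing above (hypothetical operands): a source
 * with width = 16, stride = 1 and a 4-byte type spans 64 bytes and so reads
 * two registers, width = 8 with the same type reads one, and stride == 0
 * (a scalar/smeared source) always counts as a single register.
 */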
979
980 bool
981 fs_inst::reads_flag() const
982 {
983 return predicate;
984 }
985
986 bool
987 fs_inst::writes_flag() const
988 {
989 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
990 opcode != BRW_OPCODE_IF &&
991 opcode != BRW_OPCODE_WHILE)) ||
992 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
993 }
994
995 /**
996 * Returns how many MRFs an FS opcode will write over.
997 *
998 * Note that this is not the 0 or 1 implied writes in an actual gen
999 * instruction -- the FS opcodes often generate MOVs in addition.
1000 */
1001 int
1002 fs_visitor::implied_mrf_writes(fs_inst *inst)
1003 {
1004 if (inst->mlen == 0)
1005 return 0;
1006
1007 if (inst->base_mrf == -1)
1008 return 0;
1009
1010 switch (inst->opcode) {
1011 case SHADER_OPCODE_RCP:
1012 case SHADER_OPCODE_RSQ:
1013 case SHADER_OPCODE_SQRT:
1014 case SHADER_OPCODE_EXP2:
1015 case SHADER_OPCODE_LOG2:
1016 case SHADER_OPCODE_SIN:
1017 case SHADER_OPCODE_COS:
1018 return 1 * dispatch_width / 8;
1019 case SHADER_OPCODE_POW:
1020 case SHADER_OPCODE_INT_QUOTIENT:
1021 case SHADER_OPCODE_INT_REMAINDER:
1022 return 2 * dispatch_width / 8;
1023 case SHADER_OPCODE_TEX:
1024 case FS_OPCODE_TXB:
1025 case SHADER_OPCODE_TXD:
1026 case SHADER_OPCODE_TXF:
1027 case SHADER_OPCODE_TXF_CMS:
1028 case SHADER_OPCODE_TXF_MCS:
1029 case SHADER_OPCODE_TG4:
1030 case SHADER_OPCODE_TG4_OFFSET:
1031 case SHADER_OPCODE_TXL:
1032 case SHADER_OPCODE_TXS:
1033 case SHADER_OPCODE_LOD:
1034 return 1;
1035 case FS_OPCODE_FB_WRITE:
1036 return 2;
1037 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1038 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1039 return 1;
1040 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1041 return inst->mlen;
1042 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1043 return 2;
1044 case SHADER_OPCODE_UNTYPED_ATOMIC:
1045 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1046 case SHADER_OPCODE_URB_WRITE_SIMD8:
1047 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1048 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1049 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1050 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1051 return 0;
1052 default:
1053 unreachable("not reached");
1054 }
1055 }
1056
1057 fs_reg
1058 fs_visitor::vgrf(const glsl_type *const type)
1059 {
1060 int reg_width = dispatch_width / 8;
1061 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1062 brw_type_for_base_type(type), dispatch_width);
1063 }
1064
1065 fs_reg
1066 fs_visitor::vgrf(int num_components)
1067 {
1068 int reg_width = dispatch_width / 8;
1069 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1070 BRW_REGISTER_TYPE_F, dispatch_width);
1071 }
1072
1073 /** Fixed HW reg constructor. */
1074 fs_reg::fs_reg(enum register_file file, int reg)
1075 {
1076 init();
1077 this->file = file;
1078 this->reg = reg;
1079 this->type = BRW_REGISTER_TYPE_F;
1080
1081 switch (file) {
1082 case UNIFORM:
1083 this->width = 1;
1084 break;
1085 default:
1086 this->width = 8;
1087 }
1088 }
1089
1090 /** Fixed HW reg constructor. */
1091 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1092 {
1093 init();
1094 this->file = file;
1095 this->reg = reg;
1096 this->type = type;
1097
1098 switch (file) {
1099 case UNIFORM:
1100 this->width = 1;
1101 break;
1102 default:
1103 this->width = 8;
1104 }
1105 }
1106
1107 /** Fixed HW reg constructor. */
1108 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1109 uint8_t width)
1110 {
1111 init();
1112 this->file = file;
1113 this->reg = reg;
1114 this->type = type;
1115 this->width = width;
1116 }
1117
1118 fs_reg *
1119 fs_visitor::variable_storage(ir_variable *var)
1120 {
1121 return (fs_reg *)hash_table_find(this->variable_ht, var);
1122 }
1123
1124 void
1125 import_uniforms_callback(const void *key,
1126 void *data,
1127 void *closure)
1128 {
1129 struct hash_table *dst_ht = (struct hash_table *)closure;
1130 const fs_reg *reg = (const fs_reg *)data;
1131
1132 if (reg->file != UNIFORM)
1133 return;
1134
1135 hash_table_insert(dst_ht, data, key);
1136 }
1137
1138 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1139 * This brings in those uniform definitions.
1140 */
1141 void
1142 fs_visitor::import_uniforms(fs_visitor *v)
1143 {
1144 hash_table_call_foreach(v->variable_ht,
1145 import_uniforms_callback,
1146 variable_ht);
1147 this->push_constant_loc = v->push_constant_loc;
1148 this->pull_constant_loc = v->pull_constant_loc;
1149 this->uniforms = v->uniforms;
1150 this->param_size = v->param_size;
1151 }
1152
1153 /* Our support for uniforms is piggy-backed on the struct
1154 * gl_fragment_program, because that's where the values actually
1155 * get stored, rather than in some global gl_shader_program uniform
1156 * store.
1157 */
1158 void
1159 fs_visitor::setup_uniform_values(ir_variable *ir)
1160 {
1161 int namelen = strlen(ir->name);
1162
1163 /* The data for our (non-builtin) uniforms is stored in a series of
1164 * gl_uniform_driver_storage structs for each subcomponent that
1165 * glGetUniformLocation() could name. We know it's been set up in the same
1166 * order we'd walk the type, so walk the list of storage and find anything
1167 * with our name, or the prefix of a component that starts with our name.
1168 */
1169 unsigned params_before = uniforms;
1170 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1171 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1172
1173 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1174 (storage->name[namelen] != 0 &&
1175 storage->name[namelen] != '.' &&
1176 storage->name[namelen] != '[')) {
1177 continue;
1178 }
1179
1180 unsigned slots = storage->type->component_slots();
1181 if (storage->array_elements)
1182 slots *= storage->array_elements;
1183
1184 for (unsigned i = 0; i < slots; i++) {
1185 stage_prog_data->param[uniforms++] = &storage->storage[i];
1186 }
1187 }
1188
1189 /* Make sure we actually initialized the right amount of stuff here. */
1190 assert(params_before + ir->type->component_slots() == uniforms);
1191 (void)params_before;
1192 }
1193
1194
1195 /* Our support for builtin uniforms is even scarier than non-builtin.
1196 * It sits on top of the PROG_STATE_VAR parameters that are
1197 * automatically updated from GL context state.
1198 */
1199 void
1200 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1201 {
1202 const ir_state_slot *const slots = ir->get_state_slots();
1203 assert(slots != NULL);
1204
1205 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1206 /* This state reference has already been set up by ir_to_mesa, but we'll
1207 * get the same index back here.
1208 */
1209 int index = _mesa_add_state_reference(this->prog->Parameters,
1210 (gl_state_index *)slots[i].tokens);
1211
1212 /* Add each of the unique swizzles of the element as a parameter.
1213 * This'll end up matching the expected layout of the
1214 * array/matrix/structure we're trying to fill in.
1215 */
1216 int last_swiz = -1;
1217 for (unsigned int j = 0; j < 4; j++) {
1218 int swiz = GET_SWZ(slots[i].swizzle, j);
1219 if (swiz == last_swiz)
1220 break;
1221 last_swiz = swiz;
1222
1223 stage_prog_data->param[uniforms++] =
1224 &prog->Parameters->ParameterValues[index][swiz];
1225 }
1226 }
1227 }
1228
1229 fs_reg *
1230 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1231 bool origin_upper_left)
1232 {
1233 assert(stage == MESA_SHADER_FRAGMENT);
1234 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1235 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1236 fs_reg wpos = *reg;
1237 bool flip = !origin_upper_left ^ key->render_to_fbo;
1238
1239 /* gl_FragCoord.x */
1240 if (pixel_center_integer) {
1241 emit(MOV(wpos, this->pixel_x));
1242 } else {
1243 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1244 }
1245 wpos = offset(wpos, 1);
1246
1247 /* gl_FragCoord.y */
1248 if (!flip && pixel_center_integer) {
1249 emit(MOV(wpos, this->pixel_y));
1250 } else {
1251 fs_reg pixel_y = this->pixel_y;
1252 float offset = (pixel_center_integer ? 0.0 : 0.5);
1253
1254 if (flip) {
1255 pixel_y.negate = true;
1256 offset += key->drawable_height - 1.0;
1257 }
1258
1259 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1260 }
1261 wpos = offset(wpos, 1);
1262
1263 /* gl_FragCoord.z */
1264 if (devinfo->gen >= 6) {
1265 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1266 } else {
1267 emit(FS_OPCODE_LINTERP, wpos,
1268 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1269 interp_reg(VARYING_SLOT_POS, 2));
1270 }
1271 wpos = offset(wpos, 1);
1272
1273 /* gl_FragCoord.w: Already set up in emit_interpolation */
1274 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1275
1276 return reg;
1277 }
1278
1279 fs_inst *
1280 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1281 glsl_interp_qualifier interpolation_mode,
1282 bool is_centroid, bool is_sample)
1283 {
1284 brw_wm_barycentric_interp_mode barycoord_mode;
1285 if (devinfo->gen >= 6) {
1286 if (is_centroid) {
1287 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1288 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1289 else
1290 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1291 } else if (is_sample) {
1292 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1293 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1294 else
1295 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1296 } else {
1297 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1298 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1299 else
1300 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1301 }
1302 } else {
1303 /* On Ironlake and below, there is only one interpolation mode.
1304 * Centroid interpolation doesn't mean anything on this hardware --
1305 * there is no multisampling.
1306 */
1307 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1308 }
1309 return emit(FS_OPCODE_LINTERP, attr,
1310 this->delta_xy[barycoord_mode], interp);
1311 }
1312
1313 void
1314 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1315 const glsl_type *type,
1316 glsl_interp_qualifier interpolation_mode,
1317 int location, bool mod_centroid,
1318 bool mod_sample)
1319 {
1320 attr.type = brw_type_for_base_type(type->get_scalar_type());
1321
1322 assert(stage == MESA_SHADER_FRAGMENT);
1323 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1324 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1325
1326 unsigned int array_elements;
1327
1328 if (type->is_array()) {
1329 array_elements = type->length;
1330 if (array_elements == 0) {
1331 fail("dereferenced array '%s' has length 0\n", name);
1332 }
1333 type = type->fields.array;
1334 } else {
1335 array_elements = 1;
1336 }
1337
1338 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1339 bool is_gl_Color =
1340 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1341 if (key->flat_shade && is_gl_Color) {
1342 interpolation_mode = INTERP_QUALIFIER_FLAT;
1343 } else {
1344 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1345 }
1346 }
1347
1348 for (unsigned int i = 0; i < array_elements; i++) {
1349 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1350 if (prog_data->urb_setup[location] == -1) {
1351 /* If there's no incoming setup data for this slot, don't
1352 * emit interpolation for it.
1353 */
1354 attr = offset(attr, type->vector_elements);
1355 location++;
1356 continue;
1357 }
1358
1359 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1360 /* Constant interpolation (flat shading) case. The SF has
1361 * handed us defined values in only the constant offset
1362 * field of the setup reg.
1363 */
1364 for (unsigned int k = 0; k < type->vector_elements; k++) {
1365 struct brw_reg interp = interp_reg(location, k);
1366 interp = suboffset(interp, 3);
1367 interp.type = attr.type;
1368 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1369 attr = offset(attr, 1);
1370 }
1371 } else {
1372 /* Smooth/noperspective interpolation case. */
1373 for (unsigned int k = 0; k < type->vector_elements; k++) {
1374 struct brw_reg interp = interp_reg(location, k);
1375 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1376 /* Get the pixel/sample mask into f0 so that we know
1377 * which pixels are lit. Then, for each channel that is
1378 * unlit, replace the centroid data with non-centroid
1379 * data.
1380 */
1381 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1382
1383 fs_inst *inst;
1384 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1385 false, false);
1386 inst->predicate = BRW_PREDICATE_NORMAL;
1387 inst->predicate_inverse = true;
1388 if (devinfo->has_pln)
1389 inst->no_dd_clear = true;
1390
1391 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1392 mod_centroid && !key->persample_shading,
1393 mod_sample || key->persample_shading);
1394 inst->predicate = BRW_PREDICATE_NORMAL;
1395 inst->predicate_inverse = false;
1396 if (devinfo->has_pln)
1397 inst->no_dd_check = true;
1398
1399 } else {
1400 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1401 mod_centroid && !key->persample_shading,
1402 mod_sample || key->persample_shading);
1403 }
1404 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1405 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1406 }
1407 attr = offset(attr, 1);
1408 }
1409
1410 }
1411 location++;
1412 }
1413 }
1414 }
1415
1416 fs_reg *
1417 fs_visitor::emit_frontfacing_interpolation()
1418 {
1419 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1420
1421 if (devinfo->gen >= 6) {
1422 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1423 * a boolean result from this (~0/true or 0/false).
1424 *
1425 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1426 * this task in only one instruction:
1427 * - a negation source modifier will flip the bit; and
1428 * - a W -> D type conversion will sign extend the bit into the high
1429 * word of the destination.
1430 *
1431 * An ASR 15 fills the low word of the destination.
1432 */
1433 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1434 g0.negate = true;
1435
1436 emit(ASR(*reg, g0, fs_reg(15)));
1437 } else {
1438 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1439 * a boolean result from this (1/true or 0/false).
1440 *
1441 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1442 * the negation source modifier to flip it. Unfortunately the SHR
1443 * instruction only operates on UD (or D with an abs source modifier)
1444 * sources without negation.
1445 *
1446 * Instead, use ASR (which will give ~0/true or 0/false).
1447 */
1448 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1449 g1_6.negate = true;
1450
1451 emit(ASR(*reg, g1_6, fs_reg(31)));
1452 }
1453
1454 return reg;
1455 }
1456
1457 void
1458 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1459 {
1460 assert(stage == MESA_SHADER_FRAGMENT);
1461 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1462 assert(dst.type == BRW_REGISTER_TYPE_F);
1463
1464 if (key->compute_pos_offset) {
1465 /* Convert int_sample_pos to floating point */
1466 emit(MOV(dst, int_sample_pos));
1467 /* Scale to the range [0, 1] */
1468 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1469 }
1470 else {
1471 /* From ARB_sample_shading specification:
1472 * "When rendering to a non-multisample buffer, or if multisample
1473 * rasterization is disabled, gl_SamplePosition will always be
1474 * (0.5, 0.5)."
1475 */
1476 emit(MOV(dst, fs_reg(0.5f)));
1477 }
1478 }
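/* The integer sample positions arrive as 1/16th-of-a-pixel offsets in the
 * thread payload, so the MUL by 1/16 above maps e.g. a payload byte of 8 to
 * gl_SamplePosition = 0.5 and a byte of 4 to 0.25 (illustrative values).
 */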
1479
1480 fs_reg *
1481 fs_visitor::emit_samplepos_setup()
1482 {
1483 assert(devinfo->gen >= 6);
1484
1485 this->current_annotation = "compute sample position";
1486 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1487 fs_reg pos = *reg;
1488 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1489 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1490
1491 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1492 * mode will be enabled.
1493 *
1494 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1495 * R31.1:0 Position Offset X/Y for Slot[3:0]
1496 * R31.3:2 Position Offset X/Y for Slot[7:4]
1497 * .....
1498 *
1499 * The X, Y sample positions come in as bytes in the thread payload. So, read
1500 * the positions using vstride=16, width=8, hstride=2.
1501 */
1502 struct brw_reg sample_pos_reg =
1503 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1504 BRW_REGISTER_TYPE_B), 16, 8, 2);
1505
1506 if (dispatch_width == 8) {
1507 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1508 } else {
1509 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1510 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1511 ->force_sechalf = true;
1512 }
1513 /* Compute gl_SamplePosition.x */
1514 compute_sample_position(pos, int_sample_x);
1515 pos = offset(pos, 1);
1516 if (dispatch_width == 8) {
1517 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1518 } else {
1519 emit(MOV(half(int_sample_y, 0),
1520 fs_reg(suboffset(sample_pos_reg, 1))));
1521 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1522 ->force_sechalf = true;
1523 }
1524 /* Compute gl_SamplePosition.y */
1525 compute_sample_position(pos, int_sample_y);
1526 return reg;
1527 }
1528
1529 fs_reg *
1530 fs_visitor::emit_sampleid_setup()
1531 {
1532 assert(stage == MESA_SHADER_FRAGMENT);
1533 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1534 assert(devinfo->gen >= 6);
1535
1536 this->current_annotation = "compute sample id";
1537 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1538
1539 if (key->compute_sample_id) {
1540 fs_reg t1 = vgrf(glsl_type::int_type);
1541 fs_reg t2 = vgrf(glsl_type::int_type);
1542 t2.type = BRW_REGISTER_TYPE_UW;
1543
1544 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1545 * 8x multisampling, subspan 0 will represent sample N (where N
1546 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1547 * 7. We can find the value of N by looking at R0.0 bits 7:6
1548 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1549 * (since samples are always delivered in pairs). That is, we
1550 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1551 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1552 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1553 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1554 * populating a temporary variable with the sequence (0, 1, 2, 3),
1555 * and then reading from it using vstride=1, width=4, hstride=0.
1556 * These computations hold good for 4x multisampling as well.
1557 *
1558 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1559 * the first four slots are sample 0 of subspan 0; the next four
1560 * are sample 1 of subspan 0; the third group is sample 0 of
1561 * subspan 1, and finally sample 1 of subspan 1.
1562 */
1563 fs_inst *inst;
1564 inst = emit(BRW_OPCODE_AND, t1,
1565 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1566 fs_reg(0xc0));
1567 inst->force_writemask_all = true;
1568 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1569 inst->force_writemask_all = true;
1570 /* This works for both SIMD8 and SIMD16 */
1571 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1572 inst->force_writemask_all = true;
1573 /* This special instruction takes care of setting vstride=1,
1574 * width=4, hstride=0 of t2 during an ADD instruction.
1575 */
1576 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1577 } else {
1578 /* As per GL_ARB_sample_shading specification:
1579 * "When rendering to a non-multisample buffer, or if multisample
1580 * rasterization is disabled, gl_SampleID will always be zero."
1581 */
1582 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1583 }
1584
1585 return reg;
1586 }
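/* A worked example of the SSPI math above (hypothetical payload contents):
 * if R0.0 bits 7:6 read 0b10, then (R0.0 & 0xc0) >> 5 = 4 = 2 * SSPI, so the
 * first sample handled by this thread is sample 4; adding the SIMD8 sequence
 * (0, 0, 0, 0, 1, 1, 1, 1) produces per-channel sample IDs 4,4,4,4,5,5,5,5.
 */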
1587
1588 void
1589 fs_visitor::resolve_source_modifiers(fs_reg *src)
1590 {
1591 if (!src->abs && !src->negate)
1592 return;
1593
1594 fs_reg temp = retype(vgrf(1), src->type);
1595 emit(MOV(temp, *src));
1596 *src = temp;
1597 }
1598
1599 fs_reg
1600 fs_visitor::fix_math_operand(fs_reg src)
1601 {
1602 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1603 * might be able to do better by doing execsize = 1 math and then
1604 * expanding that result out, but we would need to be careful with
1605 * masking.
1606 *
1607 * The hardware ignores source modifiers (negate and abs) on math
1608 * instructions, so we also move to a temp to set those up.
1609 */
1610 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1611 !src.abs && !src.negate)
1612 return src;
1613
1614 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1615 * operands to math
1616 */
1617 if (devinfo->gen >= 7 && src.file != IMM)
1618 return src;
1619
1620 fs_reg expanded = vgrf(glsl_type::float_type);
1621 expanded.type = src.type;
1622 emit(BRW_OPCODE_MOV, expanded, src);
1623 return expanded;
1624 }
1625
1626 fs_inst *
1627 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1628 {
1629 switch (opcode) {
1630 case SHADER_OPCODE_RCP:
1631 case SHADER_OPCODE_RSQ:
1632 case SHADER_OPCODE_SQRT:
1633 case SHADER_OPCODE_EXP2:
1634 case SHADER_OPCODE_LOG2:
1635 case SHADER_OPCODE_SIN:
1636 case SHADER_OPCODE_COS:
1637 break;
1638 default:
1639 unreachable("not reached: bad math opcode");
1640 }
1641
1642 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1643 * might be able to do better by doing execsize = 1 math and then
1644 * expanding that result out, but we would need to be careful with
1645 * masking.
1646 *
1647 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1648 * instructions, so we also move to a temp to set those up.
1649 */
1650 if (devinfo->gen == 6 || devinfo->gen == 7)
1651 src = fix_math_operand(src);
1652
1653 fs_inst *inst = emit(opcode, dst, src);
1654
1655 if (devinfo->gen < 6) {
1656 inst->base_mrf = 2;
1657 inst->mlen = dispatch_width / 8;
1658 }
1659
1660 return inst;
1661 }
1662
1663 fs_inst *
1664 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1665 {
1666 int base_mrf = 2;
1667 fs_inst *inst;
1668
1669 if (devinfo->gen >= 8) {
1670 inst = emit(opcode, dst, src0, src1);
1671 } else if (devinfo->gen >= 6) {
1672 src0 = fix_math_operand(src0);
1673 src1 = fix_math_operand(src1);
1674
1675 inst = emit(opcode, dst, src0, src1);
1676 } else {
1677 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1678 * "Message Payload":
1679 *
1680 * "Operand0[7]. For the INT DIV functions, this operand is the
1681 * denominator."
1682 * ...
1683 * "Operand1[7]. For the INT DIV functions, this operand is the
1684 * numerator."
1685 */
1686 bool is_int_div = opcode != SHADER_OPCODE_POW;
1687 fs_reg &op0 = is_int_div ? src1 : src0;
1688 fs_reg &op1 = is_int_div ? src0 : src1;
1689
1690 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1691 inst = emit(opcode, dst, op0, reg_null_f);
1692
1693 inst->base_mrf = base_mrf;
1694 inst->mlen = 2 * dispatch_width / 8;
1695 }
1696 return inst;
1697 }
1698
1699 void
1700 fs_visitor::emit_discard_jump()
1701 {
1702 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1703
1704 /* For performance, after a discard, jump to the end of the
1705 * shader if all relevant channels have been discarded.
1706 */
1707 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1708 discard_jump->flag_subreg = 1;
1709
1710 discard_jump->predicate = (dispatch_width == 8)
1711 ? BRW_PREDICATE_ALIGN1_ANY8H
1712 : BRW_PREDICATE_ALIGN1_ANY16H;
1713 discard_jump->predicate_inverse = true;
1714 }
1715
1716 void
1717 fs_visitor::assign_curb_setup()
1718 {
1719 if (dispatch_width == 8) {
1720 prog_data->dispatch_grf_start_reg = payload.num_regs;
1721 } else {
1722 assert(stage == MESA_SHADER_FRAGMENT);
1723 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1724 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1725 }
1726
1727 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1728
1729 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1730 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1731 for (unsigned int i = 0; i < inst->sources; i++) {
1732 if (inst->src[i].file == UNIFORM) {
1733 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1734 int constant_nr;
1735 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1736 constant_nr = push_constant_loc[uniform_nr];
1737 } else {
1738 /* Section 5.11 of the OpenGL 4.1 spec says:
1739 * "Out-of-bounds reads return undefined values, which include
1740 * values from other variables of the active program or zero."
1741 * Just return the first push constant.
1742 */
1743 constant_nr = 0;
1744 }
1745
1746 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1747 constant_nr / 8,
1748 constant_nr % 8);
1749
1750 inst->src[i].file = HW_REG;
1751 inst->src[i].fixed_hw_reg = byte_offset(
1752 retype(brw_reg, inst->src[i].type),
1753 inst->src[i].subreg_offset);
1754 }
1755 }
1756 }
1757 }
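/* Example of the push-constant addressing above (hypothetical numbers): with
 * payload.num_regs = 2 and constant_nr = 13, the uniform lands in GRF
 * 2 + 13 / 8 = 3, element 13 % 8 = 5, i.e. brw_vec1_grf(3, 5).
 */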
1758
1759 void
1760 fs_visitor::calculate_urb_setup()
1761 {
1762 assert(stage == MESA_SHADER_FRAGMENT);
1763 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1764 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1765
1766 memset(prog_data->urb_setup, -1,
1767 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1768
1769 int urb_next = 0;
1770 /* Figure out where each of the incoming setup attributes lands. */
1771 if (devinfo->gen >= 6) {
1772 if (_mesa_bitcount_64(prog->InputsRead &
1773 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1774 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1775 * first 16 varying inputs, so we can put them wherever we want.
1776 * Just put them in order.
1777 *
1778 * This is useful because it means that (a) inputs not used by the
1779 * fragment shader won't take up valuable register space, and (b) we
1780 * won't have to recompile the fragment shader if it gets paired with
1781 * a different vertex (or geometry) shader.
1782 */
1783 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1784 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1785 BITFIELD64_BIT(i)) {
1786 prog_data->urb_setup[i] = urb_next++;
1787 }
1788 }
1789 } else {
1790 /* We have enough input varyings that the SF/SBE pipeline stage can't
1791 * arbitrarily rearrange them to suit our whim; we have to put them
1792 * in an order that matches the output of the previous pipeline stage
1793 * (geometry or vertex shader).
1794 */
1795 struct brw_vue_map prev_stage_vue_map;
1796 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1797 key->input_slots_valid);
1798 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1799 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1800 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1801 slot++) {
1802 int varying = prev_stage_vue_map.slot_to_varying[slot];
1803 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1804 * unused.
1805 */
1806 if (varying != BRW_VARYING_SLOT_COUNT &&
1807 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1808 BITFIELD64_BIT(varying))) {
1809 prog_data->urb_setup[varying] = slot - first_slot;
1810 }
1811 }
1812 urb_next = prev_stage_vue_map.num_slots - first_slot;
1813 }
1814 } else {
1815 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1816 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1817 /* Point size is packed into the header, not as a general attribute */
1818 if (i == VARYING_SLOT_PSIZ)
1819 continue;
1820
1821 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1822 /* The back color slot is skipped when the front color is
1823 * also written to. In addition, some slots can be
1824 * written in the vertex shader and not read in the
1825 * fragment shader. So the register number must always be
1826 * incremented, mapped or not.
1827 */
1828 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1829 prog_data->urb_setup[i] = urb_next;
1830 urb_next++;
1831 }
1832 }
1833
1834 /*
1835 * It's an FS-only attribute, and we did interpolation for this attribute
1836 * in the SF thread. So, count it here, too.
1837 *
1838 * See compile_sf_prog() for more info.
1839 */
1840 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1841 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1842 }
1843
1844 prog_data->num_varying_inputs = urb_next;
1845 }
1846
1847 void
1848 fs_visitor::assign_urb_setup()
1849 {
1850 assert(stage == MESA_SHADER_FRAGMENT);
1851 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1852
1853 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1854
1855 /* Offset all the urb_setup[] index by the actual position of the
1856 * setup regs, now that the location of the constants has been chosen.
1857 */
1858 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1859 if (inst->opcode == FS_OPCODE_LINTERP) {
1860 assert(inst->src[1].file == HW_REG);
1861 inst->src[1].fixed_hw_reg.nr += urb_start;
1862 }
1863
1864 if (inst->opcode == FS_OPCODE_CINTERP) {
1865 assert(inst->src[0].file == HW_REG);
1866 inst->src[0].fixed_hw_reg.nr += urb_start;
1867 }
1868 }
1869
1870 /* Each attribute is 4 setup channels, each of which is half a reg. */
1871 this->first_non_payload_grf =
1872 urb_start + prog_data->num_varying_inputs * 2;
1873 }
1874
1875 void
1876 fs_visitor::assign_vs_urb_setup()
1877 {
1878 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1879 int grf, count, slot, channel, attr;
1880
1881 assert(stage == MESA_SHADER_VERTEX);
1882 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1883 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1884 count++;
1885
1886 /* Each attribute is 4 regs. */
1887 this->first_non_payload_grf =
1888 payload.num_regs + prog_data->curb_read_length + count * 4;
1889
1890 unsigned vue_entries =
1891 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1892
1893 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1894 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1895
1896 assert(vs_prog_data->base.urb_read_length <= 15);
1897
1898 /* Rewrite all ATTR file references to the hw grf that they land in. */
1899 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1900 for (int i = 0; i < inst->sources; i++) {
1901 if (inst->src[i].file == ATTR) {
1902
1903 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1904 slot = count - 1;
1905 } else {
1906               /* Attributes arrive in a contiguous block, ordered by their
1907 * gl_vert_attrib value. That means we can compute the slot
1908 * number for an attribute by masking out the enabled
1909 * attributes before it and counting the bits.
1910 */
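            /* Worked example (illustrative, not from the original source): if
             * inputs_read == 0b1011 (attributes 0, 1 and 3 enabled) and
             * attr == 3, BITFIELD64_MASK(3) keeps bits 0..2, leaving 0b011
             * with a popcount of 2, so attribute 3 lands in slot 2.
             */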
1911 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1912 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1913 BITFIELD64_MASK(attr));
1914 }
1915
1916 channel = inst->src[i].reg_offset & 3;
1917
1918 grf = payload.num_regs +
1919 prog_data->curb_read_length +
1920 slot * 4 + channel;
1921
1922 inst->src[i].file = HW_REG;
1923 inst->src[i].fixed_hw_reg =
1924 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1925 }
1926 }
1927 }
1928 }
1929
1930 /**
1931 * Split large virtual GRFs into separate components if we can.
1932 *
1933 * This is mostly duplicated with what brw_fs_vector_splitting does,
1934 * but that's really conservative because it's afraid of doing
1935 * splitting that doesn't result in real progress after the rest of
1936 * the optimization phases, which would cause infinite looping in
1937 * optimization. We can do it once here, safely. This also has the
1938 * opportunity to split interpolated values, or maybe even uniforms,
1939 * which we don't have at the IR level.
1940 *
1941 * We want to split, because virtual GRFs are what we register
1942 * allocate and spill (due to contiguousness requirements for some
1943 * instructions), and they're what we naturally generate in the
1944 * codegen process, but most virtual GRFs don't actually need to be
1945 * contiguous sets of GRFs. If we split, we'll end up with reduced
1946 * live intervals and better dead code elimination and coalescing.
1947 */
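/* Worked example (illustrative): suppose a size-4 VGRF is written one
 * register at a time by four SIMD8 MOVs but read by a single send that
 * consumes registers 2..3 as a unit.  The writes initially mark slots 1, 2
 * and 3 as split points; the two-register read then clears the split point
 * at slot 3, so the VGRF ends up split into three pieces: {0}, {1} and
 * {2,3}.
 */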
1948 void
1949 fs_visitor::split_virtual_grfs()
1950 {
1951 int num_vars = this->alloc.count;
1952
1953 /* Count the total number of registers */
1954 int reg_count = 0;
1955 int vgrf_to_reg[num_vars];
1956 for (int i = 0; i < num_vars; i++) {
1957 vgrf_to_reg[i] = reg_count;
1958 reg_count += alloc.sizes[i];
1959 }
1960
1961 /* An array of "split points". For each register slot, this indicates
1962 * if this slot can be separated from the previous slot. Every time an
1963 * instruction uses multiple elements of a register (as a source or
1964 * destination), we mark the used slots as inseparable. Then we go
1965 * through and split the registers into the smallest pieces we can.
1966 */
1967 bool split_points[reg_count];
1968 memset(split_points, 0, sizeof(split_points));
1969
1970 /* Mark all used registers as fully splittable */
1971 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1972 if (inst->dst.file == GRF) {
1973 int reg = vgrf_to_reg[inst->dst.reg];
1974 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1975 split_points[reg + j] = true;
1976 }
1977
1978 for (int i = 0; i < inst->sources; i++) {
1979 if (inst->src[i].file == GRF) {
1980 int reg = vgrf_to_reg[inst->src[i].reg];
1981 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1982 split_points[reg + j] = true;
1983 }
1984 }
1985 }
1986
1987 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1988 if (inst->dst.file == GRF) {
1989 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1990 for (int j = 1; j < inst->regs_written; j++)
1991 split_points[reg + j] = false;
1992 }
1993 for (int i = 0; i < inst->sources; i++) {
1994 if (inst->src[i].file == GRF) {
1995 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1996 for (int j = 1; j < inst->regs_read(i); j++)
1997 split_points[reg + j] = false;
1998 }
1999 }
2000 }
2001
2002 int new_virtual_grf[reg_count];
2003 int new_reg_offset[reg_count];
2004
2005 int reg = 0;
2006 for (int i = 0; i < num_vars; i++) {
2007       /* As a quick sanity check, the first slot of a VGRF should never be a split point. */
2008 assert(split_points[reg] == false);
2009
2010 /* j = 0 case */
2011 new_reg_offset[reg] = 0;
2012 reg++;
2013 int offset = 1;
2014
2015 /* j > 0 case */
2016 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2017          /* If this is a split point, allocate a new virtual GRF for the
2018           * preceding "offset" registers and reset the offset to 0.
2019           */
2020 if (split_points[reg]) {
2021 assert(offset <= MAX_VGRF_SIZE);
2022 int grf = alloc.allocate(offset);
2023 for (int k = reg - offset; k < reg; k++)
2024 new_virtual_grf[k] = grf;
2025 offset = 0;
2026 }
2027 new_reg_offset[reg] = offset;
2028 offset++;
2029 reg++;
2030 }
2031
2032 /* The last one gets the original register number */
2033 assert(offset <= MAX_VGRF_SIZE);
2034 alloc.sizes[i] = offset;
2035 for (int k = reg - offset; k < reg; k++)
2036 new_virtual_grf[k] = i;
2037 }
2038 assert(reg == reg_count);
2039
2040 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2041 if (inst->dst.file == GRF) {
2042 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2043 inst->dst.reg = new_virtual_grf[reg];
2044 inst->dst.reg_offset = new_reg_offset[reg];
2045 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2046 }
2047 for (int i = 0; i < inst->sources; i++) {
2048 if (inst->src[i].file == GRF) {
2049 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2050 inst->src[i].reg = new_virtual_grf[reg];
2051 inst->src[i].reg_offset = new_reg_offset[reg];
2052 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2053 }
2054 }
2055 }
2056 invalidate_live_intervals();
2057 }
2058
2059 /**
2060 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2061 *
2062 * During code generation, we create tons of temporary variables, many of
2063 * which get immediately killed and are never used again. Yet, in later
2064 * optimization and analysis passes, such as compute_live_intervals, we need
2065 * to loop over all the virtual GRFs. Compacting them can save a lot of
2066 * overhead.
2067 */
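/* Worked example (illustrative): with three VGRFs of sizes {2, 1, 3} where
 * VGRF 1 is never referenced by any instruction, the remap table becomes
 * {0, -1, 1}, VGRF 2 is renumbered to 1, and alloc.count drops from 3 to 2.
 */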
2068 bool
2069 fs_visitor::compact_virtual_grfs()
2070 {
2071 bool progress = false;
2072 int remap_table[this->alloc.count];
2073 memset(remap_table, -1, sizeof(remap_table));
2074
2075 /* Mark which virtual GRFs are used. */
2076 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2077 if (inst->dst.file == GRF)
2078 remap_table[inst->dst.reg] = 0;
2079
2080 for (int i = 0; i < inst->sources; i++) {
2081 if (inst->src[i].file == GRF)
2082 remap_table[inst->src[i].reg] = 0;
2083 }
2084 }
2085
2086 /* Compact the GRF arrays. */
2087 int new_index = 0;
2088 for (unsigned i = 0; i < this->alloc.count; i++) {
2089 if (remap_table[i] == -1) {
2090 /* We just found an unused register. This means that we are
2091 * actually going to compact something.
2092 */
2093 progress = true;
2094 } else {
2095 remap_table[i] = new_index;
2096 alloc.sizes[new_index] = alloc.sizes[i];
2097 invalidate_live_intervals();
2098 ++new_index;
2099 }
2100 }
2101
2102 this->alloc.count = new_index;
2103
2104 /* Patch all the instructions to use the newly renumbered registers */
2105 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2106 if (inst->dst.file == GRF)
2107 inst->dst.reg = remap_table[inst->dst.reg];
2108
2109 for (int i = 0; i < inst->sources; i++) {
2110 if (inst->src[i].file == GRF)
2111 inst->src[i].reg = remap_table[inst->src[i].reg];
2112 }
2113 }
2114
2115 /* Patch all the references to delta_xy, since they're used in register
2116 * allocation. If they're unused, switch them to BAD_FILE so we don't
2117 * think some random VGRF is delta_xy.
2118 */
2119 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2120 if (delta_xy[i].file == GRF) {
2121 if (remap_table[delta_xy[i].reg] != -1) {
2122 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2123 } else {
2124 delta_xy[i].file = BAD_FILE;
2125 }
2126 }
2127 }
2128
2129 return progress;
2130 }
2131
2132 /*
2133 * Implements array access of uniforms by inserting a
2134 * PULL_CONSTANT_LOAD instruction.
2135 *
2136 * Unlike temporary GRF array access (where we don't support it due to
2137 * the difficulty of doing relative addressing on instruction
2138 * destinations), we could potentially do array access of uniforms
2139 * that were loaded in GRF space as push constants. In real-world
2140 * usage we've seen, though, the arrays being used are always larger
2141 * than we could load as push constants, so just always move all
2142 * uniform array access out to a pull constant buffer.
2143 */
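/* For illustration only (hypothetical shader): an access such as
 *   uniform float u[128];  ...  x = u[i];
 * reaches this pass as a UNIFORM source carrying a reladdr, so every element
 * of u is appended to pull_param[] and the access is later turned into a
 * VARYING_PULL_CONSTANT_LOAD by demote_pull_constants().
 */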
2144 void
2145 fs_visitor::move_uniform_array_access_to_pull_constants()
2146 {
2147 if (dispatch_width != 8)
2148 return;
2149
2150 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2151 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2152
2153 /* Walk through and find array access of uniforms. Put a copy of that
2154 * uniform in the pull constant buffer.
2155 *
2156 * Note that we don't move constant-indexed accesses to arrays. No
2157 * testing has been done of the performance impact of this choice.
2158 */
2159 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2160 for (int i = 0 ; i < inst->sources; i++) {
2161 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2162 continue;
2163
2164 int uniform = inst->src[i].reg;
2165
2166 /* If this array isn't already present in the pull constant buffer,
2167 * add it.
2168 */
2169 if (pull_constant_loc[uniform] == -1) {
2170 const gl_constant_value **values = &stage_prog_data->param[uniform];
2171
2172 assert(param_size[uniform]);
2173
2174 for (int j = 0; j < param_size[uniform]; j++) {
2175 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2176
2177 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2178 values[j];
2179 }
2180 }
2181 }
2182 }
2183 }
2184
2185 /**
2186 * Assign UNIFORM file registers to either push constants or pull constants.
2187 *
2188  * We allow a fragment shader to have more than the GL-required minimum for
2189  * the maximum number of fragment shader uniform components (64).  If there
2190  * are too many of these, they'd fill up all of the register space.
2191 * So, this will push some of them out to the pull constant buffer and
2192 * update the program to load them.
2193 */
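/* Worked example (illustrative): with 200 live uniform components, none of
 * which have already been demoted, components 0..127 are retained as push
 * constants (max_push_components is 16 registers * 8 components) and
 * components 128..199 are demoted to the pull constant buffer.
 */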
2194 void
2195 fs_visitor::assign_constant_locations()
2196 {
2197 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2198 if (dispatch_width != 8)
2199 return;
2200
2201 /* Find which UNIFORM registers are still in use. */
2202 bool is_live[uniforms];
2203 for (unsigned int i = 0; i < uniforms; i++) {
2204 is_live[i] = false;
2205 }
2206
2207 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2208 for (int i = 0; i < inst->sources; i++) {
2209 if (inst->src[i].file != UNIFORM)
2210 continue;
2211
2212 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2213 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2214 is_live[constant_nr] = true;
2215 }
2216 }
2217
2218 /* Only allow 16 registers (128 uniform components) as push constants.
2219 *
2220 * Just demote the end of the list. We could probably do better
2221 * here, demoting things that are rarely used in the program first.
2222 *
2223 * If changing this value, note the limitation about total_regs in
2224 * brw_curbe.c.
2225 */
2226 unsigned int max_push_components = 16 * 8;
2227 unsigned int num_push_constants = 0;
2228
2229 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2230
2231 for (unsigned int i = 0; i < uniforms; i++) {
2232 if (!is_live[i] || pull_constant_loc[i] != -1) {
2233 /* This UNIFORM register is either dead, or has already been demoted
2234 * to a pull const. Mark it as no longer living in the param[] array.
2235 */
2236 push_constant_loc[i] = -1;
2237 continue;
2238 }
2239
2240 if (num_push_constants < max_push_components) {
2241 /* Retain as a push constant. Record the location in the params[]
2242 * array.
2243 */
2244 push_constant_loc[i] = num_push_constants++;
2245 } else {
2246 /* Demote to a pull constant. */
2247 push_constant_loc[i] = -1;
2248
2249 int pull_index = stage_prog_data->nr_pull_params++;
2250 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2251 pull_constant_loc[i] = pull_index;
2252 }
2253 }
2254
2255 stage_prog_data->nr_params = num_push_constants;
2256
2257 /* Up until now, the param[] array has been indexed by reg + reg_offset
2258 * of UNIFORM registers. Condense it to only contain the uniforms we
2259 * chose to upload as push constants.
2260 */
2261 for (unsigned int i = 0; i < uniforms; i++) {
2262 int remapped = push_constant_loc[i];
2263
2264 if (remapped == -1)
2265 continue;
2266
2267 assert(remapped <= (int)i);
2268 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2269 }
2270 }
2271
2272 /**
2273 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2274 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2275 */
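/* Worked example (illustrative), for the non-reladdr path below: a uniform
 * with pull_constant_loc == 6 is fetched with a UNIFORM_PULL_CONSTANT_LOAD
 * from byte offset (6 * 4) & ~15 == 16, and set_smear(6 & 3) == 2 then picks
 * the third dword out of the vec4 that was read.
 */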
2276 void
2277 fs_visitor::demote_pull_constants()
2278 {
2279 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2280 for (int i = 0; i < inst->sources; i++) {
2281 if (inst->src[i].file != UNIFORM)
2282 continue;
2283
2284 int pull_index;
2285 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2286 if (location >= uniforms) /* Out of bounds access */
2287 pull_index = -1;
2288 else
2289 pull_index = pull_constant_loc[location];
2290
2291 if (pull_index == -1)
2292 continue;
2293
2294          /* Set up the annotation tracking for newly generated instructions. */
2295 base_ir = inst->ir;
2296 current_annotation = inst->annotation;
2297
2298 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2299 fs_reg dst = vgrf(glsl_type::float_type);
2300
2301 /* Generate a pull load into dst. */
2302 if (inst->src[i].reladdr) {
2303 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2304 surf_index,
2305 *inst->src[i].reladdr,
2306 pull_index);
2307 inst->insert_before(block, &list);
2308 inst->src[i].reladdr = NULL;
2309 } else {
2310 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2311 fs_inst *pull =
2312 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2313 dst, surf_index, offset);
2314 inst->insert_before(block, pull);
2315 inst->src[i].set_smear(pull_index & 3);
2316 }
2317
2318 /* Rewrite the instruction to use the temporary VGRF. */
2319 inst->src[i].file = GRF;
2320 inst->src[i].reg = dst.reg;
2321 inst->src[i].reg_offset = 0;
2322 inst->src[i].width = dispatch_width;
2323 }
2324 }
2325 invalidate_live_intervals();
2326 }
2327
2328 bool
2329 fs_visitor::opt_algebraic()
2330 {
2331 bool progress = false;
2332
2333 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2334 switch (inst->opcode) {
2335 case BRW_OPCODE_MOV:
2336 if (inst->src[0].file != IMM)
2337 break;
2338
2339 if (inst->saturate) {
2340 if (inst->dst.type != inst->src[0].type)
2341 assert(!"unimplemented: saturate mixed types");
2342
2343 if (brw_saturate_immediate(inst->dst.type,
2344 &inst->src[0].fixed_hw_reg)) {
2345 inst->saturate = false;
2346 progress = true;
2347 }
2348 }
2349 break;
2350
2351 case BRW_OPCODE_MUL:
2352 if (inst->src[1].file != IMM)
2353 continue;
2354
2355 /* a * 1.0 = a */
2356 if (inst->src[1].is_one()) {
2357 inst->opcode = BRW_OPCODE_MOV;
2358 inst->src[1] = reg_undef;
2359 progress = true;
2360 break;
2361 }
2362
2363 /* a * -1.0 = -a */
2364 if (inst->src[1].is_negative_one()) {
2365 inst->opcode = BRW_OPCODE_MOV;
2366 inst->src[0].negate = !inst->src[0].negate;
2367 inst->src[1] = reg_undef;
2368 progress = true;
2369 break;
2370 }
2371
2372 /* a * 0.0 = 0.0 */
2373 if (inst->src[1].is_zero()) {
2374 inst->opcode = BRW_OPCODE_MOV;
2375 inst->src[0] = inst->src[1];
2376 inst->src[1] = reg_undef;
2377 progress = true;
2378 break;
2379 }
2380
2381 if (inst->src[0].file == IMM) {
2382 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2383 inst->opcode = BRW_OPCODE_MOV;
2384 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2385 inst->src[1] = reg_undef;
2386 progress = true;
2387 break;
2388 }
2389 break;
2390 case BRW_OPCODE_ADD:
2391 if (inst->src[1].file != IMM)
2392 continue;
2393
2394 /* a + 0.0 = a */
2395 if (inst->src[1].is_zero()) {
2396 inst->opcode = BRW_OPCODE_MOV;
2397 inst->src[1] = reg_undef;
2398 progress = true;
2399 break;
2400 }
2401
2402 if (inst->src[0].file == IMM) {
2403 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2404 inst->opcode = BRW_OPCODE_MOV;
2405 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2406 inst->src[1] = reg_undef;
2407 progress = true;
2408 break;
2409 }
2410 break;
2411 case BRW_OPCODE_OR:
2412 if (inst->src[0].equals(inst->src[1])) {
2413 inst->opcode = BRW_OPCODE_MOV;
2414 inst->src[1] = reg_undef;
2415 progress = true;
2416 break;
2417 }
2418 break;
2419 case BRW_OPCODE_LRP:
2420 if (inst->src[1].equals(inst->src[2])) {
2421 inst->opcode = BRW_OPCODE_MOV;
2422 inst->src[0] = inst->src[1];
2423 inst->src[1] = reg_undef;
2424 inst->src[2] = reg_undef;
2425 progress = true;
2426 break;
2427 }
2428 break;
2429 case BRW_OPCODE_CMP:
2430 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2431 inst->src[0].abs &&
2432 inst->src[0].negate &&
2433 inst->src[1].is_zero()) {
2434 inst->src[0].abs = false;
2435 inst->src[0].negate = false;
2436 inst->conditional_mod = BRW_CONDITIONAL_Z;
2437 progress = true;
2438 break;
2439 }
2440 break;
2441 case BRW_OPCODE_SEL:
2442 if (inst->src[0].equals(inst->src[1])) {
2443 inst->opcode = BRW_OPCODE_MOV;
2444 inst->src[1] = reg_undef;
2445 inst->predicate = BRW_PREDICATE_NONE;
2446 inst->predicate_inverse = false;
2447 progress = true;
2448 } else if (inst->saturate && inst->src[1].file == IMM) {
2449 switch (inst->conditional_mod) {
2450 case BRW_CONDITIONAL_LE:
2451 case BRW_CONDITIONAL_L:
2452 switch (inst->src[1].type) {
2453 case BRW_REGISTER_TYPE_F:
2454 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2455 inst->opcode = BRW_OPCODE_MOV;
2456 inst->src[1] = reg_undef;
2457 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2458 progress = true;
2459 }
2460 break;
2461 default:
2462 break;
2463 }
2464 break;
2465 case BRW_CONDITIONAL_GE:
2466 case BRW_CONDITIONAL_G:
2467 switch (inst->src[1].type) {
2468 case BRW_REGISTER_TYPE_F:
2469 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2470 inst->opcode = BRW_OPCODE_MOV;
2471 inst->src[1] = reg_undef;
2472 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2473 progress = true;
2474 }
2475 break;
2476 default:
2477 break;
2478 }
2479 default:
2480 break;
2481 }
2482 }
2483 break;
2484 case BRW_OPCODE_MAD:
2485 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2486 inst->opcode = BRW_OPCODE_MOV;
2487 inst->src[1] = reg_undef;
2488 inst->src[2] = reg_undef;
2489 progress = true;
2490 } else if (inst->src[0].is_zero()) {
2491 inst->opcode = BRW_OPCODE_MUL;
2492 inst->src[0] = inst->src[2];
2493 inst->src[2] = reg_undef;
2494 progress = true;
2495 } else if (inst->src[1].is_one()) {
2496 inst->opcode = BRW_OPCODE_ADD;
2497 inst->src[1] = inst->src[2];
2498 inst->src[2] = reg_undef;
2499 progress = true;
2500 } else if (inst->src[2].is_one()) {
2501 inst->opcode = BRW_OPCODE_ADD;
2502 inst->src[2] = reg_undef;
2503 progress = true;
2504 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2505 inst->opcode = BRW_OPCODE_ADD;
2506 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2507 inst->src[2] = reg_undef;
2508 progress = true;
2509 }
2510 break;
2511 case SHADER_OPCODE_RCP: {
2512 fs_inst *prev = (fs_inst *)inst->prev;
2513 if (prev->opcode == SHADER_OPCODE_SQRT) {
2514 if (inst->src[0].equals(prev->dst)) {
2515 inst->opcode = SHADER_OPCODE_RSQ;
2516 inst->src[0] = prev->src[0];
2517 progress = true;
2518 }
2519 }
2520 break;
2521 }
2522 default:
2523 break;
2524 }
2525
2526 /* Swap if src[0] is immediate. */
2527 if (progress && inst->is_commutative()) {
2528 if (inst->src[0].file == IMM) {
2529 fs_reg tmp = inst->src[1];
2530 inst->src[1] = inst->src[0];
2531 inst->src[0] = tmp;
2532 }
2533 }
2534 }
2535 return progress;
2536 }
2537
2538 /**
2539 * Optimize sample messages that have constant zero values for the trailing
2540 * texture coordinates. We can just reduce the message length for these
2541 * instructions instead of reserving a register for it. Trailing parameters
2542 * that aren't sent default to zero anyway. This will cause the dead code
2543 * eliminator to remove the MOV instruction that would otherwise be emitted to
2544 * set up the zero value.
2545 */
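/* For illustration: a SIMD8 sample message with a header and four parameter
 * registers (mlen == 5) whose last parameter (say, an explicit LOD of 0.0)
 * comes from a constant-zero LOAD_PAYLOAD source gets its mlen trimmed to 4
 * here, and dead code elimination then removes the MOV that built the zero.
 */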
2546 bool
2547 fs_visitor::opt_zero_samples()
2548 {
2549 /* Gen4 infers the texturing opcode based on the message length so we can't
2550 * change it.
2551 */
2552 if (devinfo->gen < 5)
2553 return false;
2554
2555 bool progress = false;
2556
2557 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2558 if (!inst->is_tex())
2559 continue;
2560
2561 fs_inst *load_payload = (fs_inst *) inst->prev;
2562
2563 if (load_payload->is_head_sentinel() ||
2564 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2565 continue;
2566
2567 /* We don't want to remove the message header. Removing all of the
2568       * parameters is avoided because it seems to cause a GPU hang, though I
2569       * can't find any documentation indicating that this is expected.
2570 */
2571 while (inst->mlen > inst->header_present + dispatch_width / 8 &&
2572 load_payload->src[(inst->mlen - inst->header_present) /
2573 (dispatch_width / 8) +
2574 inst->header_present - 1].is_zero()) {
2575 inst->mlen -= dispatch_width / 8;
2576 progress = true;
2577 }
2578 }
2579
2580 if (progress)
2581 invalidate_live_intervals();
2582
2583 return progress;
2584 }
2585
2586 /**
2587 * Optimize sample messages which are followed by the final RT write.
2588 *
2589  * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2590 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2591 * final texturing results copied to the framebuffer write payload and modify
2592 * them to write to the framebuffer directly.
2593 */
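/* Roughly, the pattern being matched looks like this (illustrative):
 *
 *   LOAD_PAYLOAD vgrf_payload, ...
 *   TEX          vgrf_result, vgrf_payload, ...
 *   FB_WRITE     (EOT) ..., vgrf_result
 *
 * The FB_WRITE is removed, its render target index is packed into bits
 * 24..31 of the texture instruction's offset, and the texture instruction
 * itself is marked EOT (first growing a message header if it lacked one).
 */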
2594 bool
2595 fs_visitor::opt_sampler_eot()
2596 {
2597 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2598
2599 if (stage != MESA_SHADER_FRAGMENT)
2600 return false;
2601
2602 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2603 return false;
2604
2605 /* FINISHME: It should be possible to implement this optimization when there
2606 * are multiple drawbuffers.
2607 */
2608 if (key->nr_color_regions != 1)
2609 return false;
2610
2611 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2612 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2613 assert(fb_write->eot);
2614 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2615
2616 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2617
2618 /* There wasn't one; nothing to do. */
2619 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2620 return false;
2621
2622 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2623 * It's very likely to be the previous instruction.
2624 */
2625 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2626 if (load_payload->is_head_sentinel() ||
2627 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2628 return false;
2629
2630 assert(!tex_inst->eot); /* We can't get here twice */
2631 assert((tex_inst->offset & (0xff << 24)) == 0);
2632
2633 tex_inst->offset |= fb_write->target << 24;
2634 tex_inst->eot = true;
2635 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2636
2637 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2638 * to create a new LOAD_PAYLOAD command with the same sources and a space
2639  * saved for the header.  Using a new destination register not only makes
2640  * sure we have enough space, but also lets the dead code eliminator kill
2641  * the instruction that this one replaces.
2642 */
2643 if (tex_inst->header_present)
2644 return true;
2645
2646 fs_reg send_header = vgrf(load_payload->sources + 1);
2647 fs_reg *new_sources =
2648 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2649
2650 new_sources[0] = fs_reg();
2651 for (int i = 0; i < load_payload->sources; i++)
2652 new_sources[i+1] = load_payload->src[i];
2653
2654    /* The LOAD_PAYLOAD helper seems like the obvious choice here.  However,
2655     * it requires a lot of information about the sources in order to figure
2656     * out how many registers it needs to use.  At this stage of optimization
2657     * (after copy propagation) we may not have the appropriate GRFs that the
2658     * LOAD_PAYLOAD helper requires, so we need to construct and emit the
2659     * LOAD_PAYLOAD instruction manually.
2660 */
2661 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2662 load_payload->exec_size,
2663 send_header,
2664 new_sources,
2665 load_payload->sources + 1);
2666
2667 new_load_payload->regs_written = load_payload->regs_written + 1;
2668 tex_inst->mlen++;
2669 tex_inst->header_present = true;
2670 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2671 tex_inst->src[0] = send_header;
2672 tex_inst->dst = reg_null_ud;
2673
2674 return true;
2675 }
2676
2677 bool
2678 fs_visitor::opt_register_renaming()
2679 {
2680 bool progress = false;
2681 int depth = 0;
2682
2683 int remap[alloc.count];
2684 memset(remap, -1, sizeof(int) * alloc.count);
2685
2686 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2687 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2688 depth++;
2689 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2690 inst->opcode == BRW_OPCODE_WHILE) {
2691 depth--;
2692 }
2693
2694 /* Rewrite instruction sources. */
2695 for (int i = 0; i < inst->sources; i++) {
2696 if (inst->src[i].file == GRF &&
2697 remap[inst->src[i].reg] != -1 &&
2698 remap[inst->src[i].reg] != inst->src[i].reg) {
2699 inst->src[i].reg = remap[inst->src[i].reg];
2700 progress = true;
2701 }
2702 }
2703
2704 const int dst = inst->dst.reg;
2705
2706 if (depth == 0 &&
2707 inst->dst.file == GRF &&
2708 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2709 !inst->is_partial_write()) {
2710 if (remap[dst] == -1) {
2711 remap[dst] = dst;
2712 } else {
2713 remap[dst] = alloc.allocate(inst->dst.width / 8);
2714 inst->dst.reg = remap[dst];
2715 progress = true;
2716 }
2717 } else if (inst->dst.file == GRF &&
2718 remap[dst] != -1 &&
2719 remap[dst] != dst) {
2720 inst->dst.reg = remap[dst];
2721 progress = true;
2722 }
2723 }
2724
2725 if (progress) {
2726 invalidate_live_intervals();
2727
2728 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2729 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2730 delta_xy[i].reg = remap[delta_xy[i].reg];
2731 }
2732 }
2733 }
2734
2735 return progress;
2736 }
2737
2738 /**
2739 * Remove redundant or useless discard jumps.
2740 *
2741 * For example, we can eliminate jumps in the following sequence:
2742 *
2743 * discard-jump (redundant with the next jump)
2744 * discard-jump (useless; jumps to the next instruction)
2745 * placeholder-halt
2746 */
2747 bool
2748 fs_visitor::opt_redundant_discard_jumps()
2749 {
2750 bool progress = false;
2751
2752 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2753
2754 fs_inst *placeholder_halt = NULL;
2755 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2756 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2757 placeholder_halt = inst;
2758 break;
2759 }
2760 }
2761
2762 if (!placeholder_halt)
2763 return false;
2764
2765 /* Delete any HALTs immediately before the placeholder halt. */
2766 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2767 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2768 prev = (fs_inst *) placeholder_halt->prev) {
2769 prev->remove(last_bblock);
2770 progress = true;
2771 }
2772
2773 if (progress)
2774 invalidate_live_intervals();
2775
2776 return progress;
2777 }
2778
2779 bool
2780 fs_visitor::compute_to_mrf()
2781 {
2782 bool progress = false;
2783 int next_ip = 0;
2784
2785 /* No MRFs on Gen >= 7. */
2786 if (devinfo->gen >= 7)
2787 return false;
2788
2789 calculate_live_intervals();
2790
2791 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2792 int ip = next_ip;
2793 next_ip++;
2794
2795 if (inst->opcode != BRW_OPCODE_MOV ||
2796 inst->is_partial_write() ||
2797 inst->dst.file != MRF || inst->src[0].file != GRF ||
2798 inst->dst.type != inst->src[0].type ||
2799 inst->src[0].abs || inst->src[0].negate ||
2800 !inst->src[0].is_contiguous() ||
2801 inst->src[0].subreg_offset)
2802 continue;
2803
2804 /* Work out which hardware MRF registers are written by this
2805 * instruction.
2806 */
2807 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2808 int mrf_high;
2809 if (inst->dst.reg & BRW_MRF_COMPR4) {
2810 mrf_high = mrf_low + 4;
2811 } else if (inst->exec_size == 16) {
2812 mrf_high = mrf_low + 1;
2813 } else {
2814 mrf_high = mrf_low;
2815 }
2816
2817 /* Can't compute-to-MRF this GRF if someone else was going to
2818 * read it later.
2819 */
2820 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2821 continue;
2822
2823 /* Found a move of a GRF to a MRF. Let's see if we can go
2824 * rewrite the thing that made this GRF to write into the MRF.
2825 */
2826 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2827 if (scan_inst->dst.file == GRF &&
2828 scan_inst->dst.reg == inst->src[0].reg) {
2829 /* Found the last thing to write our reg we want to turn
2830 * into a compute-to-MRF.
2831 */
2832
2833 /* If this one instruction didn't populate all the
2834 * channels, bail. We might be able to rewrite everything
2835 * that writes that reg, but it would require smarter
2836 * tracking to delay the rewriting until complete success.
2837 */
2838 if (scan_inst->is_partial_write())
2839 break;
2840
2841 /* Things returning more than one register would need us to
2842 * understand coalescing out more than one MOV at a time.
2843 */
2844 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2845 break;
2846
2847 /* SEND instructions can't have MRF as a destination. */
2848 if (scan_inst->mlen)
2849 break;
2850
2851 if (devinfo->gen == 6) {
2852 /* gen6 math instructions must have the destination be
2853 * GRF, so no compute-to-MRF for them.
2854 */
2855 if (scan_inst->is_math()) {
2856 break;
2857 }
2858 }
2859
2860 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2861 /* Found the creator of our MRF's source value. */
2862 scan_inst->dst.file = MRF;
2863 scan_inst->dst.reg = inst->dst.reg;
2864 scan_inst->saturate |= inst->saturate;
2865 inst->remove(block);
2866 progress = true;
2867 }
2868 break;
2869 }
2870
2871 /* We don't handle control flow here. Most computation of
2872          * values that end up in MRFs happens shortly before the MRF
2873 * write anyway.
2874 */
2875 if (block->start() == scan_inst)
2876 break;
2877
2878 /* You can't read from an MRF, so if someone else reads our
2879 * MRF's source GRF that we wanted to rewrite, that stops us.
2880 */
2881 bool interfered = false;
2882 for (int i = 0; i < scan_inst->sources; i++) {
2883 if (scan_inst->src[i].file == GRF &&
2884 scan_inst->src[i].reg == inst->src[0].reg &&
2885 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2886 interfered = true;
2887 }
2888 }
2889 if (interfered)
2890 break;
2891
2892 if (scan_inst->dst.file == MRF) {
2893 /* If somebody else writes our MRF here, we can't
2894 * compute-to-MRF before that.
2895 */
2896 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2897 int scan_mrf_high;
2898
2899 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2900 scan_mrf_high = scan_mrf_low + 4;
2901 } else if (scan_inst->exec_size == 16) {
2902 scan_mrf_high = scan_mrf_low + 1;
2903 } else {
2904 scan_mrf_high = scan_mrf_low;
2905 }
2906
2907 if (mrf_low == scan_mrf_low ||
2908 mrf_low == scan_mrf_high ||
2909 mrf_high == scan_mrf_low ||
2910 mrf_high == scan_mrf_high) {
2911 break;
2912 }
2913 }
2914
2915 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2916 /* Found a SEND instruction, which means that there are
2917 * live values in MRFs from base_mrf to base_mrf +
2918 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2919 * above it.
2920 */
2921 if (mrf_low >= scan_inst->base_mrf &&
2922 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2923 break;
2924 }
2925 if (mrf_high >= scan_inst->base_mrf &&
2926 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2927 break;
2928 }
2929 }
2930 }
2931 }
2932
2933 if (progress)
2934 invalidate_live_intervals();
2935
2936 return progress;
2937 }
2938
2939 /**
2940 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2941 * instructions to FS_OPCODE_REP_FB_WRITE.
2942 */
2943 void
2944 fs_visitor::emit_repclear_shader()
2945 {
2946 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2947 int base_mrf = 1;
2948 int color_mrf = base_mrf + 2;
2949
2950 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2951 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2952 mov->force_writemask_all = true;
2953
2954 fs_inst *write;
2955 if (key->nr_color_regions == 1) {
2956 write = emit(FS_OPCODE_REP_FB_WRITE);
2957 write->saturate = key->clamp_fragment_color;
2958 write->base_mrf = color_mrf;
2959 write->target = 0;
2960 write->header_present = false;
2961 write->mlen = 1;
2962 } else {
2963 assume(key->nr_color_regions > 0);
2964 for (int i = 0; i < key->nr_color_regions; ++i) {
2965 write = emit(FS_OPCODE_REP_FB_WRITE);
2966 write->saturate = key->clamp_fragment_color;
2967 write->base_mrf = base_mrf;
2968 write->target = i;
2969 write->header_present = true;
2970 write->mlen = 3;
2971 }
2972 }
2973 write->eot = true;
2974
2975 calculate_cfg();
2976
2977 assign_constant_locations();
2978 assign_curb_setup();
2979
2980 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2981 assert(mov->src[0].file == HW_REG);
2982 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2983 }
2984
2985 /**
2986 * Walks through basic blocks, looking for repeated MRF writes and
2987 * removing the later ones.
2988 */
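/* For illustration: two identical "mov mN, vgrfM" instructions with no
 * intervening write to mN (including implied MRF writes from a send) and no
 * intervening write to vgrfM cause the second MOV to be removed.
 */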
2989 bool
2990 fs_visitor::remove_duplicate_mrf_writes()
2991 {
2992 fs_inst *last_mrf_move[16];
2993 bool progress = false;
2994
2995    /* We would need to update the MRF tracking for compressed instructions, so bail in SIMD16 mode. */
2996 if (dispatch_width == 16)
2997 return false;
2998
2999 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3000
3001 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3002 if (inst->is_control_flow()) {
3003 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3004 }
3005
3006 if (inst->opcode == BRW_OPCODE_MOV &&
3007 inst->dst.file == MRF) {
3008 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3009 if (prev_inst && inst->equals(prev_inst)) {
3010 inst->remove(block);
3011 progress = true;
3012 continue;
3013 }
3014 }
3015
3016 /* Clear out the last-write records for MRFs that were overwritten. */
3017 if (inst->dst.file == MRF) {
3018 last_mrf_move[inst->dst.reg] = NULL;
3019 }
3020
3021 if (inst->mlen > 0 && inst->base_mrf != -1) {
3022 /* Found a SEND instruction, which will include two or fewer
3023 * implied MRF writes. We could do better here.
3024 */
3025 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3026 last_mrf_move[inst->base_mrf + i] = NULL;
3027 }
3028 }
3029
3030 /* Clear out any MRF move records whose sources got overwritten. */
3031 if (inst->dst.file == GRF) {
3032 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3033 if (last_mrf_move[i] &&
3034 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3035 last_mrf_move[i] = NULL;
3036 }
3037 }
3038 }
3039
3040 if (inst->opcode == BRW_OPCODE_MOV &&
3041 inst->dst.file == MRF &&
3042 inst->src[0].file == GRF &&
3043 !inst->is_partial_write()) {
3044 last_mrf_move[inst->dst.reg] = inst;
3045 }
3046 }
3047
3048 if (progress)
3049 invalidate_live_intervals();
3050
3051 return progress;
3052 }
3053
3054 static void
3055 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3056 {
3057 /* Clear the flag for registers that actually got read (as expected). */
3058 for (int i = 0; i < inst->sources; i++) {
3059 int grf;
3060 if (inst->src[i].file == GRF) {
3061 grf = inst->src[i].reg;
3062 } else if (inst->src[i].file == HW_REG &&
3063 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3064 grf = inst->src[i].fixed_hw_reg.nr;
3065 } else {
3066 continue;
3067 }
3068
3069 if (grf >= first_grf &&
3070 grf < first_grf + grf_len) {
3071 deps[grf - first_grf] = false;
3072 if (inst->exec_size == 16)
3073 deps[grf - first_grf + 1] = false;
3074 }
3075 }
3076 }
3077
3078 /**
3079 * Implements this workaround for the original 965:
3080 *
3081 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3082 * check for post destination dependencies on this instruction, software
3083 * must ensure that there is no destination hazard for the case of ‘write
3084 * followed by a posted write’ shown in the following example.
3085 *
3086 * 1. mov r3 0
3087 * 2. send r3.xy <rest of send instruction>
3088 * 3. mov r2 r3
3089 *
3090 * Due to no post-destination dependency check on the ‘send’, the above
3091 * code sequence could have two instructions (1 and 2) in flight at the
3092 * same time that both consider ‘r3’ as the target of their final writes.
3093 */
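/* The pass below walks backwards from the send: any destination register of
 * the send that was written earlier in the block without an intervening read
 * gets a dependency-resolving MOV (DEP_RESOLVE_MOV) inserted immediately
 * before the send, so the outstanding write is resolved before the send's
 * posted write.
 */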
3094 void
3095 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3096 fs_inst *inst)
3097 {
3098 int write_len = inst->regs_written;
3099 int first_write_grf = inst->dst.reg;
3100 bool needs_dep[BRW_MAX_MRF];
3101 assert(write_len < (int)sizeof(needs_dep) - 1);
3102
3103 memset(needs_dep, false, sizeof(needs_dep));
3104 memset(needs_dep, true, write_len);
3105
3106 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3107
3108 /* Walk backwards looking for writes to registers we're writing which
3109 * aren't read since being written. If we hit the start of the program,
3110 * we assume that there are no outstanding dependencies on entry to the
3111 * program.
3112 */
3113 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3114 /* If we hit control flow, assume that there *are* outstanding
3115 * dependencies, and force their cleanup before our instruction.
3116 */
3117 if (block->start() == scan_inst) {
3118 for (int i = 0; i < write_len; i++) {
3119 if (needs_dep[i]) {
3120 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3121 }
3122 }
3123 return;
3124 }
3125
3126 /* We insert our reads as late as possible on the assumption that any
3127       * instruction other than a MOV that might have left us an outstanding
3128 * dependency has more latency than a MOV.
3129 */
3130 if (scan_inst->dst.file == GRF) {
3131 for (int i = 0; i < scan_inst->regs_written; i++) {
3132 int reg = scan_inst->dst.reg + i;
3133
3134 if (reg >= first_write_grf &&
3135 reg < first_write_grf + write_len &&
3136 needs_dep[reg - first_write_grf]) {
3137 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3138 needs_dep[reg - first_write_grf] = false;
3139 if (scan_inst->exec_size == 16)
3140 needs_dep[reg - first_write_grf + 1] = false;
3141 }
3142 }
3143 }
3144
3145 /* Clear the flag for registers that actually got read (as expected). */
3146 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3147
3148 /* Continue the loop only if we haven't resolved all the dependencies */
3149 int i;
3150 for (i = 0; i < write_len; i++) {
3151 if (needs_dep[i])
3152 break;
3153 }
3154 if (i == write_len)
3155 return;
3156 }
3157 }
3158
3159 /**
3160 * Implements this workaround for the original 965:
3161 *
3162 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3163 * used as a destination register until after it has been sourced by an
3164 * instruction with a different destination register.
3165 */
3166 void
3167 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3168 {
3169 int write_len = inst->regs_written;
3170 int first_write_grf = inst->dst.reg;
3171 bool needs_dep[BRW_MAX_MRF];
3172 assert(write_len < (int)sizeof(needs_dep) - 1);
3173
3174 memset(needs_dep, false, sizeof(needs_dep));
3175 memset(needs_dep, true, write_len);
3176 /* Walk forwards looking for writes to registers we're writing which aren't
3177 * read before being written.
3178 */
3179 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3180 /* If we hit control flow, force resolve all remaining dependencies. */
3181 if (block->end() == scan_inst) {
3182 for (int i = 0; i < write_len; i++) {
3183 if (needs_dep[i])
3184 scan_inst->insert_before(block,
3185 DEP_RESOLVE_MOV(first_write_grf + i));
3186 }
3187 return;
3188 }
3189
3190 /* Clear the flag for registers that actually got read (as expected). */
3191 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3192
3193 /* We insert our reads as late as possible since they're reading the
3194 * result of a SEND, which has massive latency.
3195 */
3196 if (scan_inst->dst.file == GRF &&
3197 scan_inst->dst.reg >= first_write_grf &&
3198 scan_inst->dst.reg < first_write_grf + write_len &&
3199 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3200 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3201 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3202 }
3203
3204 /* Continue the loop only if we haven't resolved all the dependencies */
3205 int i;
3206 for (i = 0; i < write_len; i++) {
3207 if (needs_dep[i])
3208 break;
3209 }
3210 if (i == write_len)
3211 return;
3212 }
3213 }
3214
3215 void
3216 fs_visitor::insert_gen4_send_dependency_workarounds()
3217 {
3218 if (devinfo->gen != 4 || devinfo->is_g4x)
3219 return;
3220
3221 bool progress = false;
3222
3223 /* Note that we're done with register allocation, so GRF fs_regs always
3224 * have a .reg_offset of 0.
3225 */
3226
3227 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3228 if (inst->mlen != 0 && inst->dst.file == GRF) {
3229 insert_gen4_pre_send_dependency_workarounds(block, inst);
3230 insert_gen4_post_send_dependency_workarounds(block, inst);
3231 progress = true;
3232 }
3233 }
3234
3235 if (progress)
3236 invalidate_live_intervals();
3237 }
3238
3239 /**
3240 * Turns the generic expression-style uniform pull constant load instruction
3241 * into a hardware-specific series of instructions for loading a pull
3242 * constant.
3243 *
3244 * The expression style allows the CSE pass before this to optimize out
3245 * repeated loads from the same offset, and gives the pre-register-allocation
3246 * scheduling full flexibility, while the conversion to native instructions
3247 * allows the post-register-allocation scheduler the best information
3248 * possible.
3249 *
3250 * Note that execution masking for setting up pull constant loads is special:
3251 * the channels that need to be written are unrelated to the current execution
3252 * mask, since a later instruction will use one of the result channels as a
3253 * source operand for all 8 or 16 of its channels.
3254 */
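/* For illustration, on Gen7+: a load with a vec4-aligned byte offset of 32
 * becomes a dword offset of 8, which is written into a freshly allocated
 * payload GRF by FS_OPCODE_SET_SIMD4X2_OFFSET before the send; on Gen9 an
 * extra register is reserved in that payload for the message header.
 */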
3255 void
3256 fs_visitor::lower_uniform_pull_constant_loads()
3257 {
3258 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3259 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3260 continue;
3261
3262 if (devinfo->gen >= 7) {
3263 /* The offset arg before was a vec4-aligned byte offset. We need to
3264 * turn it into a dword offset.
3265 */
3266 fs_reg const_offset_reg = inst->src[1];
3267 assert(const_offset_reg.file == IMM &&
3268 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3269 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3270 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3271
3272 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3273 * Reserve space for the register.
3274 */
3275 if (devinfo->gen >= 9) {
3276 payload.reg_offset++;
3277 alloc.sizes[payload.reg] = 2;
3278 }
3279
3280 /* This is actually going to be a MOV, but since only the first dword
3281 * is accessed, we have a special opcode to do just that one. Note
3282 * that this needs to be an operation that will be considered a def
3283 * by live variable analysis, or register allocation will explode.
3284 */
3285 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3286 8, payload, const_offset_reg);
3287 setup->force_writemask_all = true;
3288
3289 setup->ir = inst->ir;
3290 setup->annotation = inst->annotation;
3291 inst->insert_before(block, setup);
3292
3293 /* Similarly, this will only populate the first 4 channels of the
3294 * result register (since we only use smear values from 0-3), but we
3295 * don't tell the optimizer.
3296 */
3297 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3298 inst->src[1] = payload;
3299
3300 invalidate_live_intervals();
3301 } else {
3302 /* Before register allocation, we didn't tell the scheduler about the
3303 * MRF we use. We know it's safe to use this MRF because nothing
3304 * else does except for register spill/unspill, which generates and
3305 * uses its MRF within a single IR instruction.
3306 */
3307 inst->base_mrf = 14;
3308 inst->mlen = 1;
3309 }
3310 }
3311 }
3312
3313 bool
3314 fs_visitor::lower_load_payload()
3315 {
3316 bool progress = false;
3317
3318 int vgrf_to_reg[alloc.count];
3319 int reg_count = 0;
3320 for (unsigned i = 0; i < alloc.count; ++i) {
3321 vgrf_to_reg[i] = reg_count;
3322 reg_count += alloc.sizes[i];
3323 }
3324
3325 struct {
3326 bool written:1; /* Whether this register has ever been written */
3327 bool force_writemask_all:1;
3328 bool force_sechalf:1;
3329 } metadata[reg_count];
3330 memset(metadata, 0, sizeof(metadata));
3331
3332 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3333 if (inst->dst.file == GRF) {
3334 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3335 bool force_sechalf = inst->force_sechalf &&
3336 !inst->force_writemask_all;
3337 bool toggle_sechalf = inst->dst.width == 16 &&
3338 type_sz(inst->dst.type) == 4 &&
3339 !inst->force_writemask_all;
3340 for (int i = 0; i < inst->regs_written; ++i) {
3341 metadata[dst_reg + i].written = true;
3342 metadata[dst_reg + i].force_sechalf = force_sechalf;
3343 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3344 force_sechalf = (toggle_sechalf != force_sechalf);
3345 }
3346 }
3347
3348 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3349 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3350 fs_reg dst = inst->dst;
3351
3352 for (int i = 0; i < inst->sources; i++) {
3353 dst.width = inst->src[i].effective_width;
3354 dst.type = inst->src[i].type;
3355
3356 if (inst->src[i].file == BAD_FILE) {
3357            /* Emit nothing for this source; the destination offset still advances below. */
3358 } else if (dst.file == MRF &&
3359 dst.width == 8 &&
3360 devinfo->has_compr4 &&
3361 i + 4 < inst->sources &&
3362 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3363 fs_reg compr4_dst = dst;
3364 compr4_dst.reg += BRW_MRF_COMPR4;
3365 compr4_dst.width = 16;
3366 fs_reg compr4_src = inst->src[i];
3367 compr4_src.width = 16;
3368 fs_inst *mov = MOV(compr4_dst, compr4_src);
3369 mov->force_writemask_all = true;
3370 inst->insert_before(block, mov);
3371 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3372 inst->src[i + 4].file = BAD_FILE;
3373 } else {
3374 fs_inst *mov = MOV(dst, inst->src[i]);
3375 if (inst->src[i].file == GRF) {
3376 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3377 inst->src[i].reg_offset;
3378 mov->force_sechalf = metadata[src_reg].force_sechalf;
3379 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3380 } else {
3381 /* We don't have any useful metadata for immediates or
3382 * uniforms. Assume that any of the channels of the
3383 * destination may be used.
3384 */
3385 assert(inst->src[i].file == IMM ||
3386 inst->src[i].file == UNIFORM);
3387 mov->force_writemask_all = true;
3388 }
3389
3390 if (dst.file == GRF) {
3391 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3392 const bool force_writemask = mov->force_writemask_all;
3393 metadata[dst_reg].force_writemask_all = force_writemask;
3394 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3395 if (dst.width * type_sz(dst.type) > 32) {
3396 assert(!mov->force_sechalf);
3397 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3398 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3399 }
3400 }
3401
3402 inst->insert_before(block, mov);
3403 }
3404
3405 dst = offset(dst, 1);
3406 }
3407
3408 inst->remove(block);
3409 progress = true;
3410 }
3411 }
3412
3413 if (progress)
3414 invalidate_live_intervals();
3415
3416 return progress;
3417 }
3418
3419 void
3420 fs_visitor::dump_instructions()
3421 {
3422 dump_instructions(NULL);
3423 }
3424
3425 void
3426 fs_visitor::dump_instructions(const char *name)
3427 {
3428 FILE *file = stderr;
3429 if (name && geteuid() != 0) {
3430 file = fopen(name, "w");
3431 if (!file)
3432 file = stderr;
3433 }
3434
3435 if (cfg) {
3436 calculate_register_pressure();
3437 int ip = 0, max_pressure = 0;
3438 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3439 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3440 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3441 dump_instruction(inst, file);
3442 ip++;
3443 }
3444 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3445 } else {
3446 int ip = 0;
3447 foreach_in_list(backend_instruction, inst, &instructions) {
3448 fprintf(file, "%4d: ", ip++);
3449 dump_instruction(inst, file);
3450 }
3451 }
3452
3453 if (file != stderr) {
3454 fclose(file);
3455 }
3456 }
3457
3458 void
3459 fs_visitor::dump_instruction(backend_instruction *be_inst)
3460 {
3461 dump_instruction(be_inst, stderr);
3462 }
3463
3464 void
3465 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3466 {
3467 fs_inst *inst = (fs_inst *)be_inst;
3468
3469 if (inst->predicate) {
3470 fprintf(file, "(%cf0.%d) ",
3471 inst->predicate_inverse ? '-' : '+',
3472 inst->flag_subreg);
3473 }
3474
3475 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3476 if (inst->saturate)
3477 fprintf(file, ".sat");
3478 if (inst->conditional_mod) {
3479 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3480 if (!inst->predicate &&
3481 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3482 inst->opcode != BRW_OPCODE_IF &&
3483 inst->opcode != BRW_OPCODE_WHILE))) {
3484 fprintf(file, ".f0.%d", inst->flag_subreg);
3485 }
3486 }
3487 fprintf(file, "(%d) ", inst->exec_size);
3488
3489
3490 switch (inst->dst.file) {
3491 case GRF:
3492 fprintf(file, "vgrf%d", inst->dst.reg);
3493 if (inst->dst.width != dispatch_width)
3494 fprintf(file, "@%d", inst->dst.width);
3495 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3496 inst->dst.subreg_offset)
3497 fprintf(file, "+%d.%d",
3498 inst->dst.reg_offset, inst->dst.subreg_offset);
3499 break;
3500 case MRF:
3501 fprintf(file, "m%d", inst->dst.reg);
3502 break;
3503 case BAD_FILE:
3504 fprintf(file, "(null)");
3505 break;
3506 case UNIFORM:
3507 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3508 break;
3509 case ATTR:
3510 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3511 break;
3512 case HW_REG:
3513 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3514 switch (inst->dst.fixed_hw_reg.nr) {
3515 case BRW_ARF_NULL:
3516 fprintf(file, "null");
3517 break;
3518 case BRW_ARF_ADDRESS:
3519 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3520 break;
3521 case BRW_ARF_ACCUMULATOR:
3522 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3523 break;
3524 case BRW_ARF_FLAG:
3525 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3526 inst->dst.fixed_hw_reg.subnr);
3527 break;
3528 default:
3529 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3530 inst->dst.fixed_hw_reg.subnr);
3531 break;
3532 }
3533 } else {
3534 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3535 }
3536 if (inst->dst.fixed_hw_reg.subnr)
3537 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3538 break;
3539 default:
3540 fprintf(file, "???");
3541 break;
3542 }
3543 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3544
3545 for (int i = 0; i < inst->sources; i++) {
3546 if (inst->src[i].negate)
3547 fprintf(file, "-");
3548 if (inst->src[i].abs)
3549 fprintf(file, "|");
3550 switch (inst->src[i].file) {
3551 case GRF:
3552 fprintf(file, "vgrf%d", inst->src[i].reg);
3553 if (inst->src[i].width != dispatch_width)
3554 fprintf(file, "@%d", inst->src[i].width);
3555 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3556 inst->src[i].subreg_offset)
3557 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3558 inst->src[i].subreg_offset);
3559 break;
3560 case MRF:
3561 fprintf(file, "***m%d***", inst->src[i].reg);
3562 break;
3563 case ATTR:
3564 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3565 break;
3566 case UNIFORM:
3567 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3568 if (inst->src[i].reladdr) {
3569 fprintf(file, "+reladdr");
3570 } else if (inst->src[i].subreg_offset) {
3571 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3572 inst->src[i].subreg_offset);
3573 }
3574 break;
3575 case BAD_FILE:
3576 fprintf(file, "(null)");
3577 break;
3578 case IMM:
3579 switch (inst->src[i].type) {
3580 case BRW_REGISTER_TYPE_F:
3581 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3582 break;
3583 case BRW_REGISTER_TYPE_W:
3584 case BRW_REGISTER_TYPE_D:
3585 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3586 break;
3587 case BRW_REGISTER_TYPE_UW:
3588 case BRW_REGISTER_TYPE_UD:
3589 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3590 break;
3591 case BRW_REGISTER_TYPE_VF:
3592 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3593 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3594 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3595 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3596 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3597 break;
3598 default:
3599 fprintf(file, "???");
3600 break;
3601 }
3602 break;
3603 case HW_REG:
3604 if (inst->src[i].fixed_hw_reg.negate)
3605 fprintf(file, "-");
3606 if (inst->src[i].fixed_hw_reg.abs)
3607 fprintf(file, "|");
3608 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3609 switch (inst->src[i].fixed_hw_reg.nr) {
3610 case BRW_ARF_NULL:
3611 fprintf(file, "null");
3612 break;
3613 case BRW_ARF_ADDRESS:
3614 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3615 break;
3616 case BRW_ARF_ACCUMULATOR:
3617 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3618 break;
3619 case BRW_ARF_FLAG:
3620 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3621 inst->src[i].fixed_hw_reg.subnr);
3622 break;
3623 default:
3624 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3625 inst->src[i].fixed_hw_reg.subnr);
3626 break;
3627 }
3628 } else {
3629 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3630 }
3631 if (inst->src[i].fixed_hw_reg.subnr)
3632 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3633 if (inst->src[i].fixed_hw_reg.abs)
3634 fprintf(file, "|");
3635 break;
3636 default:
3637 fprintf(file, "???");
3638 break;
3639 }
3640 if (inst->src[i].abs)
3641 fprintf(file, "|");
3642
3643 if (inst->src[i].file != IMM) {
3644 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3645 }
3646
3647 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3648 fprintf(file, ", ");
3649 }
3650
3651 fprintf(file, " ");
3652
3653 if (dispatch_width == 16 && inst->exec_size == 8) {
3654 if (inst->force_sechalf)
3655 fprintf(file, "2ndhalf ");
3656 else
3657 fprintf(file, "1sthalf ");
3658 }
3659
3660 fprintf(file, "\n");
3661 }
3662
3663 /**
3664 * Possibly returns an instruction that set up @param reg.
3665 *
3666 * Sometimes we want to take the result of some expression/variable
3667 * dereference tree and rewrite the instruction generating the result
3668 * of the tree. When processing the tree, we know that the
3669 * instructions generated are all writing temporaries that are dead
3670 * outside of this tree. So, if we have some instructions that write
3671 * a temporary, we're free to point that temp write somewhere else.
3672 *
3673 * Note that this doesn't guarantee that the instruction generated
3674 * only reg -- it might be the size=4 destination of a texture instruction.
3675 */
3676 fs_inst *
3677 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3678 fs_inst *end,
3679 const fs_reg &reg)
3680 {
3681 if (end == start ||
3682 end->is_partial_write() ||
3683 reg.reladdr ||
3684 !reg.equals(end->dst)) {
3685 return NULL;
3686 } else {
3687 return end;
3688 }
3689 }
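/* A rough usage sketch: a caller that wants to fold away a copy can retarget
 * the producing instruction when one is returned, e.g.
 *
 *    fs_inst *modify = get_instruction_generating_reg(first, last, src);
 *    if (modify)
 *       modify->dst = dst;         // rewrite the producer's destination
 *    else
 *       emit(MOV(dst, src));       // otherwise fall back to a plain copy
 *
 * where first/last/src/dst are placeholder names for this illustration, not
 * identifiers defined in this file.
 */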
3690
3691 void
3692 fs_visitor::setup_payload_gen6()
3693 {
3694 bool uses_depth =
3695 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3696 unsigned barycentric_interp_modes =
3697 (stage == MESA_SHADER_FRAGMENT) ?
3698 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3699
3700 assert(devinfo->gen >= 6);
3701
3702 /* R0-1: masks, pixel X/Y coordinates. */
3703 payload.num_regs = 2;
3704    /* R2: only for 32-pixel dispatch. */
3705
3706 /* R3-26: barycentric interpolation coordinates. These appear in the
3707 * same order that they appear in the brw_wm_barycentric_interp_mode
3708 * enum. Each set of coordinates occupies 2 registers if dispatch width
3709 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3710 * appear if they were enabled using the "Barycentric Interpolation
3711 * Mode" bits in WM_STATE.
3712 */
3713 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3714 if (barycentric_interp_modes & (1 << i)) {
3715 payload.barycentric_coord_reg[i] = payload.num_regs;
3716 payload.num_regs += 2;
3717 if (dispatch_width == 16) {
3718 payload.num_regs += 2;
3719 }
3720 }
3721 }
3722
3723    /* R27: interpolated depth, if the shader uses source depth. */
3724 if (uses_depth) {
3725 payload.source_depth_reg = payload.num_regs;
3726 payload.num_regs++;
3727 if (dispatch_width == 16) {
3728 /* R28: interpolated depth if not SIMD8. */
3729 payload.num_regs++;
3730 }
3731 }
3732 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3733 if (uses_depth) {
3734 payload.source_w_reg = payload.num_regs;
3735 payload.num_regs++;
3736 if (dispatch_width == 16) {
3737 /* R30: interpolated W if not SIMD8. */
3738 payload.num_regs++;
3739 }
3740 }
3741
3742 if (stage == MESA_SHADER_FRAGMENT) {
3743 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3744 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3745 prog_data->uses_pos_offset = key->compute_pos_offset;
3746 /* R31: MSAA position offsets. */
3747 if (prog_data->uses_pos_offset) {
3748 payload.sample_pos_reg = payload.num_regs;
3749 payload.num_regs++;
3750 }
3751 }
3752
3753 /* R32: MSAA input coverage mask */
3754 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3755 assert(devinfo->gen >= 7);
3756 payload.sample_mask_in_reg = payload.num_regs;
3757 payload.num_regs++;
3758 if (dispatch_width == 16) {
3759 /* R33: input coverage mask if not SIMD8. */
3760 payload.num_regs++;
3761 }
3762 }
3763
3764 /* R34-: bary for 32-pixel. */
3765 /* R58-59: interp W for 32-pixel. */
3766
3767 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3768 source_depth_to_render_target = true;
3769 }
3770 }
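/* For illustration, consider a SIMD16 shader with a single barycentric mode
 * enabled and source depth in use; the accounting above then advances as
 *
 *    start                            num_regs = 2
 *    barycentric_coord_reg[i] = 2     num_regs = 6   (2 regs, plus 2 for SIMD16)
 *    source_depth_reg         = 6     num_regs = 8
 *    source_w_reg             = 8     num_regs = 10
 *
 * with one further register each for sample_pos_reg and sample_mask_in_reg
 * when MSAA position offsets or the input coverage mask are also needed
 * (two for the coverage mask in SIMD16).
 */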
3771
3772 void
3773 fs_visitor::setup_vs_payload()
3774 {
3775 /* R0: thread header, R1: urb handles */
3776 payload.num_regs = 2;
3777 }
3778
3779 void
3780 fs_visitor::assign_binding_table_offsets()
3781 {
3782 assert(stage == MESA_SHADER_FRAGMENT);
3783 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3784 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3785 uint32_t next_binding_table_offset = 0;
3786
3787 /* If there are no color regions, we still perform an FB write to a null
3788 * renderbuffer, which we place at surface index 0.
3789 */
3790 prog_data->binding_table.render_target_start = next_binding_table_offset;
3791 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3792
3793 assign_common_binding_table_offsets(next_binding_table_offset);
3794 }
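/* For example, with two color attachments bound, the render targets occupy
 * binding table entries 0-1 and assign_common_binding_table_offsets() starts
 * laying out textures, pull constants, and the like at entry 2; with no
 * color attachments at all, MAX2() still reserves entry 0 for the
 * null-renderbuffer write.
 */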
3795
3796 void
3797 fs_visitor::calculate_register_pressure()
3798 {
3799 invalidate_live_intervals();
3800 calculate_live_intervals();
3801
3802 unsigned num_instructions = 0;
3803 foreach_block(block, cfg)
3804 num_instructions += block->instructions.length();
3805
3806 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3807
3808 for (unsigned reg = 0; reg < alloc.count; reg++) {
3809 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3810 regs_live_at_ip[ip] += alloc.sizes[reg];
3811 }
3812 }
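/* After this runs, regs_live_at_ip[ip] holds the summed sizes of all virtual
 * GRFs whose live interval covers instruction ip, i.e. an estimate of
 * register pressure at each point in the program.
 */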
3813
3814 void
3815 fs_visitor::optimize()
3816 {
3817 split_virtual_grfs();
3818
3819 move_uniform_array_access_to_pull_constants();
3820 assign_constant_locations();
3821 demote_pull_constants();
3822
3823 #define OPT(pass, args...) ({ \
3824 pass_num++; \
3825 bool this_progress = pass(args); \
3826 \
3827 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3828 char filename[64]; \
3829 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3830 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3831 \
3832 backend_visitor::dump_instructions(filename); \
3833 } \
3834 \
3835 progress = progress || this_progress; \
3836 this_progress; \
3837 })
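   /* OPT() is a GNU statement expression, so it evaluates to this_progress.
    * That both accumulates into progress for the fixed-point loop below and
    * lets individual call sites, such as if (OPT(lower_load_payload)) further
    * down, branch on whether a single pass made progress.
    */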
3838
3839 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3840 char filename[64];
3841 snprintf(filename, 64, "%s%d-%04d-00-start",
3842 stage_abbrev, dispatch_width,
3843 shader_prog ? shader_prog->Name : 0);
3844
3845 backend_visitor::dump_instructions(filename);
3846 }
3847
3848 bool progress;
3849 int iteration = 0;
3850 int pass_num = 0;
3851 do {
3852 progress = false;
3853 pass_num = 0;
3854 iteration++;
3855
3856 OPT(remove_duplicate_mrf_writes);
3857
3858 OPT(opt_algebraic);
3859 OPT(opt_cse);
3860 OPT(opt_copy_propagate);
3861 OPT(opt_peephole_predicated_break);
3862 OPT(opt_cmod_propagation);
3863 OPT(dead_code_eliminate);
3864 OPT(opt_peephole_sel);
3865 OPT(dead_control_flow_eliminate, this);
3866 OPT(opt_register_renaming);
3867 OPT(opt_redundant_discard_jumps);
3868 OPT(opt_saturate_propagation);
3869 OPT(opt_zero_samples);
3870 OPT(register_coalesce);
3871 OPT(compute_to_mrf);
3872
3873 OPT(compact_virtual_grfs);
3874 } while (progress);
3875
3876 pass_num = 0;
3877
3878 OPT(opt_sampler_eot);
3879
3880 if (OPT(lower_load_payload)) {
3881 split_virtual_grfs();
3882 OPT(register_coalesce);
3883 OPT(compute_to_mrf);
3884 OPT(dead_code_eliminate);
3885 }
3886
3887 OPT(opt_combine_constants);
3888
3889 lower_uniform_pull_constant_loads();
3890 }
3891
3892 /**
3893  * Three-source instructions must have a GRF/MRF destination register.
3894 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3895 */
3896 void
3897 fs_visitor::fixup_3src_null_dest()
3898 {
3899 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3900 if (inst->is_3src() && inst->dst.is_null()) {
3901 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3902 inst->dst.type);
3903 }
3904 }
3905 }
3906
3907 void
3908 fs_visitor::allocate_registers()
3909 {
3910 bool allocated_without_spills;
3911
3912 static const enum instruction_scheduler_mode pre_modes[] = {
3913 SCHEDULE_PRE,
3914 SCHEDULE_PRE_NON_LIFO,
3915 SCHEDULE_PRE_LIFO,
3916 };
3917
3918 /* Try each scheduling heuristic to see if it can successfully register
3919 * allocate without spilling. They should be ordered by decreasing
3920 * performance but increasing likelihood of allocating.
3921 */
3922 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3923 schedule_instructions(pre_modes[i]);
3924
3925 if (0) {
3926 assign_regs_trivial();
3927 allocated_without_spills = true;
3928 } else {
3929 allocated_without_spills = assign_regs(false);
3930 }
3931 if (allocated_without_spills)
3932 break;
3933 }
3934
3935 if (!allocated_without_spills) {
3936 /* We assume that any spilling is worse than just dropping back to
3937 * SIMD8. There's probably actually some intermediate point where
3938 * SIMD16 with a couple of spills is still better.
3939 */
3940 if (dispatch_width == 16) {
3941 fail("Failure to register allocate. Reduce number of "
3942 "live scalar values to avoid this.");
3943 } else {
3944 perf_debug("%s shader triggered register spilling. "
3945 "Try reducing the number of live scalar values to "
3946 "improve performance.\n", stage_name);
3947 }
3948
3949 /* Since we're out of heuristics, just go spill registers until we
3950 * get an allocation.
3951 */
3952 while (!assign_regs(true)) {
3953 if (failed)
3954 break;
3955 }
3956 }
3957
3958 /* This must come after all optimization and register allocation, since
3959 * it inserts dead code that happens to have side effects, and it does
3960 * so based on the actual physical registers in use.
3961 */
3962 insert_gen4_send_dependency_workarounds();
3963
3964 if (failed)
3965 return;
3966
3967 if (!allocated_without_spills)
3968 schedule_instructions(SCHEDULE_POST);
3969
3970 if (last_scratch > 0)
3971 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3972 }
3973
3974 bool
3975 fs_visitor::run_vs()
3976 {
3977 assert(stage == MESA_SHADER_VERTEX);
3978
3979 assign_common_binding_table_offsets(0);
3980 setup_vs_payload();
3981
3982 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3983 emit_shader_time_begin();
3984
3985 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
3986 emit_nir_code();
3987 } else {
3988 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3989 base_ir = ir;
3990 this->result = reg_undef;
3991 ir->accept(this);
3992 }
3993 base_ir = NULL;
3994 }
3995
3996 if (failed)
3997 return false;
3998
3999 emit_urb_writes();
4000
4001 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4002 emit_shader_time_end();
4003
4004 calculate_cfg();
4005
4006 optimize();
4007
4008 assign_curb_setup();
4009 assign_vs_urb_setup();
4010
4011 fixup_3src_null_dest();
4012 allocate_registers();
4013
4014 return !failed;
4015 }
4016
4017 bool
4018 fs_visitor::run_fs()
4019 {
4020 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4021 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4022
4023 assert(stage == MESA_SHADER_FRAGMENT);
4024
4025 sanity_param_count = prog->Parameters->NumParameters;
4026
4027 assign_binding_table_offsets();
4028
4029 if (devinfo->gen >= 6)
4030 setup_payload_gen6();
4031 else
4032 setup_payload_gen4();
4033
4034 if (0) {
4035 emit_dummy_fs();
4036 } else if (brw->use_rep_send && dispatch_width == 16) {
4037 emit_repclear_shader();
4038 } else {
4039 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4040 emit_shader_time_begin();
4041
4042 calculate_urb_setup();
4043 if (prog->InputsRead > 0) {
4044 if (devinfo->gen < 6)
4045 emit_interpolation_setup_gen4();
4046 else
4047 emit_interpolation_setup_gen6();
4048 }
4049
4050 /* We handle discards by keeping track of the still-live pixels in f0.1.
4051 * Initialize it with the dispatched pixels.
4052 */
4053 if (wm_prog_data->uses_kill) {
4054 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4055 discard_init->flag_subreg = 1;
4056 }
4057
4058       /* Generate FS IR for main(). (The visitor only descends into
4059        * functions called "main".)
4060 */
4061 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
4062 emit_nir_code();
4063 } else if (shader) {
4064 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4065 base_ir = ir;
4066 this->result = reg_undef;
4067 ir->accept(this);
4068 }
4069 } else {
4070 emit_fragment_program_code();
4071 }
4072 base_ir = NULL;
4073 if (failed)
4074 return false;
4075
4076 if (wm_prog_data->uses_kill)
4077 emit(FS_OPCODE_PLACEHOLDER_HALT);
4078
4079 if (wm_key->alpha_test_func)
4080 emit_alpha_test();
4081
4082 emit_fb_writes();
4083
4084 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4085 emit_shader_time_end();
4086
4087 calculate_cfg();
4088
4089 optimize();
4090
4091 assign_curb_setup();
4092 assign_urb_setup();
4093
4094 fixup_3src_null_dest();
4095 allocate_registers();
4096
4097 if (failed)
4098 return false;
4099 }
4100
4101 if (dispatch_width == 8)
4102 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4103 else
4104 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4105
4106 /* If any state parameters were appended, then ParameterValues could have
4107 * been realloced, in which case the driver uniform storage set up by
4108 * _mesa_associate_uniform_storage() would point to freed memory. Make
4109 * sure that didn't happen.
4110 */
4111 assert(sanity_param_count == prog->Parameters->NumParameters);
4112
4113 return !failed;
4114 }
4115
4116 const unsigned *
4117 brw_wm_fs_emit(struct brw_context *brw,
4118 void *mem_ctx,
4119 const struct brw_wm_prog_key *key,
4120 struct brw_wm_prog_data *prog_data,
4121 struct gl_fragment_program *fp,
4122 struct gl_shader_program *prog,
4123 unsigned *final_assembly_size)
4124 {
4125 bool start_busy = false;
4126 double start_time = 0;
4127
4128 if (unlikely(brw->perf_debug)) {
4129 start_busy = (brw->batch.last_bo &&
4130 drm_intel_bo_busy(brw->batch.last_bo));
4131 start_time = get_time();
4132 }
4133
4134 struct brw_shader *shader = NULL;
4135 if (prog)
4136 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4137
4138 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4139 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4140
4141 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4142 */
4143 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4144 if (!v.run_fs()) {
4145 if (prog) {
4146 prog->LinkStatus = false;
4147 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4148 }
4149
4150 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4151 v.fail_msg);
4152
4153 return NULL;
4154 }
4155
4156 cfg_t *simd16_cfg = NULL;
4157 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4158 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4159 if (!v.simd16_unsupported) {
4160 /* Try a SIMD16 compile */
4161 v2.import_uniforms(&v);
4162 if (!v2.run_fs()) {
4163 perf_debug("SIMD16 shader failed to compile, falling back to "
4164 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4165 } else {
4166 simd16_cfg = v2.cfg;
4167 }
4168 } else {
4169 perf_debug("SIMD16 shader unsupported, falling back to "
4170 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4171 }
4172 }
4173
4174 cfg_t *simd8_cfg;
4175 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4176 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4177 simd8_cfg = NULL;
4178 prog_data->no_8 = true;
4179 } else {
4180 simd8_cfg = v.cfg;
4181 prog_data->no_8 = false;
4182 }
4183
4184 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4185 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4186
4187 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4188 char *name;
4189 if (prog)
4190 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4191 prog->Label ? prog->Label : "unnamed",
4192 prog->Name);
4193 else
4194 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4195
4196 g.enable_debug(name);
4197 }
4198
4199 if (simd8_cfg)
4200 g.generate_code(simd8_cfg, 8);
4201 if (simd16_cfg)
4202 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4203
4204 if (unlikely(brw->perf_debug) && shader) {
4205 if (shader->compiled_once)
4206 brw_wm_debug_recompile(brw, prog, key);
4207 shader->compiled_once = true;
4208
4209 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4210 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4211 (get_time() - start_time) * 1000);
4212 }
4213 }
4214
4215 return g.get_assembly(final_assembly_size);
4216 }
4217
4218 extern "C" bool
4219 brw_fs_precompile(struct gl_context *ctx,
4220 struct gl_shader_program *shader_prog,
4221 struct gl_program *prog)
4222 {
4223 struct brw_context *brw = brw_context(ctx);
4224 struct brw_wm_prog_key key;
4225
4226 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4227 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4228 bool program_uses_dfdy = fp->UsesDFdy;
4229
4230 memset(&key, 0, sizeof(key));
4231
4232 if (brw->gen < 6) {
4233 if (fp->UsesKill)
4234 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4235
4236 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4237 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4238
4239 /* Just assume depth testing. */
4240 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4241 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4242 }
4243
4244 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4245 BRW_FS_VARYING_INPUT_MASK) > 16)
4246 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4247
4248 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4249 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4250 for (unsigned i = 0; i < sampler_count; i++) {
4251 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4252 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4253 key.tex.swizzles[i] =
4254 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4255 } else {
4256 /* Color sampler: assume no swizzling. */
4257 key.tex.swizzles[i] = SWIZZLE_XYZW;
4258 }
4259 }
4260
4261 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4262 key.drawable_height = ctx->DrawBuffer->Height;
4263 }
4264
4265 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4266 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4267 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4268
4269 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4270 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4271 key.nr_color_regions > 1;
4272 }
4273
4274 key.program_string_id = bfp->id;
4275
4276 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4277 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4278
4279 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4280
4281 brw->wm.base.prog_offset = old_prog_offset;
4282 brw->wm.prog_data = old_prog_data;
4283
4284 return success;
4285 }