1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
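/**
 * Common initializer for all fs_inst constructors.  Copies the sources,
 * then fills in derived state: a zero exec_size is inferred from the GRF
 * destination width or from the GRF/ATTR source widths, each source's
 * effective_width is resolved against that exec_size, and regs_written is
 * computed from the destination's width, stride and type size.
 */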
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
138
139 this->writes_accumulator = false;
140 }
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
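/**
 * Convenience emitters for simple ALU instructions: each ALUn macro defines
 * an fs_visitor helper that allocates an n-source instruction with the
 * matching BRW opcode out of mem_ctx.  ALU2_ACC additionally marks the
 * instruction as writing the accumulator.
 */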
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(devinfo->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
356
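/**
 * Emit a LOAD_PAYLOAD that gathers the given sources into a contiguous
 * block of registers at dst.  The execution size is the widest source
 * width (every source width must be a multiple of dst.width), and
 * regs_written is the total number of whole registers the sources cover.
 */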
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
359 {
360 uint8_t exec_size = dst.width;
361 for (int i = 0; i < sources; ++i) {
362 assert(src[i].width % dst.width == 0);
363 if (src[i].width > exec_size)
364 exec_size = src[i].width;
365 }
366
367 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
368 dst, src, sources);
369 inst->regs_written = 0;
370 for (int i = 0; i < sources; ++i) {
371 /* The LOAD_PAYLOAD instruction only really makes sense if we are
372 * dealing with whole registers. If this ever changes, we can deal
373 * with it later.
374 */
375 int size = inst->src[i].effective_width * type_sz(src[i].type);
376 assert(size % 32 == 0);
377 inst->regs_written += (size + 31) / 32;
378 }
379
380 return inst;
381 }
382
383 exec_list
384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
385 const fs_reg &surf_index,
386 const fs_reg &varying_offset,
387 uint32_t const_offset)
388 {
389 exec_list instructions;
390 fs_inst *inst;
391
392 /* We have our constant surface use a pitch of 4 bytes, so our index can
393 * be any component of a vector, and then we load 4 contiguous
394 * components starting from that.
395 *
396 * We break down the const_offset to a portion added to the variable
397 * offset and a portion done using reg_offset, which means that if you
398 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
399 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
400 * CSE can later notice that those loads are all the same and eliminate
401 * the redundant ones.
402 */
403 fs_reg vec4_offset = vgrf(glsl_type::int_type);
404 instructions.push_tail(ADD(vec4_offset,
405 varying_offset, fs_reg(const_offset & ~3)));
406
407 int scale = 1;
408 if (devinfo->gen == 4 && dst.width == 8) {
409 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
410 * u, v, r) as parameters, or we can just use the SIMD16 message
411 * consisting of (header, u). We choose the second, at the cost of a
412 * longer return length.
413 */
414 scale = 2;
415 }
416
417 enum opcode op;
418 if (devinfo->gen >= 7)
419 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
420 else
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
422
423 assert(dst.width % 8 == 0);
424 int regs_written = 4 * (dst.width / 8) * scale;
425 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
426 dst.type, dst.width);
427 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
428 inst->regs_written = regs_written;
429 instructions.push_tail(inst);
430
431 if (devinfo->gen < 7) {
432 inst->base_mrf = 13;
433 inst->header_present = true;
434 if (devinfo->gen == 4)
435 inst->mlen = 3;
436 else
437 inst->mlen = 1 + dispatch_width / 8;
438 }
439
440 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
441 instructions.push_tail(MOV(dst, result));
442
443 return instructions;
444 }
445
446 /**
447 * A helper for MOV generation for fixing up broken hardware SEND dependency
448 * handling.
449 */
450 fs_inst *
451 fs_visitor::DEP_RESOLVE_MOV(int grf)
452 {
453 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
454
455 inst->ir = NULL;
456 inst->annotation = "send dependency resolve";
457
458 /* The caller always wants uncompressed to emit the minimal extra
459 * dependencies, and to avoid having to deal with aligning its regs to 2.
460 */
461 inst->exec_size = 8;
462
463 return inst;
464 }
465
466 bool
467 fs_inst::equals(fs_inst *inst) const
468 {
469 return (opcode == inst->opcode &&
470 dst.equals(inst->dst) &&
471 src[0].equals(inst->src[0]) &&
472 src[1].equals(inst->src[1]) &&
473 src[2].equals(inst->src[2]) &&
474 saturate == inst->saturate &&
475 predicate == inst->predicate &&
476 conditional_mod == inst->conditional_mod &&
477 mlen == inst->mlen &&
478 base_mrf == inst->base_mrf &&
479 target == inst->target &&
480 eot == inst->eot &&
481 header_present == inst->header_present &&
482 shadow_compare == inst->shadow_compare &&
483 exec_size == inst->exec_size &&
484 offset == inst->offset);
485 }
486
487 bool
488 fs_inst::overwrites_reg(const fs_reg &reg) const
489 {
490 return reg.in_range(dst, regs_written);
491 }
492
493 bool
494 fs_inst::is_send_from_grf() const
495 {
496 switch (opcode) {
497 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
498 case SHADER_OPCODE_SHADER_TIME_ADD:
499 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
500 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
501 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
502 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
503 case SHADER_OPCODE_UNTYPED_ATOMIC:
504 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
505 case SHADER_OPCODE_URB_WRITE_SIMD8:
506 return true;
507 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
508 return src[1].file == GRF;
509 case FS_OPCODE_FB_WRITE:
510 return src[0].file == GRF;
511 default:
512 if (is_tex())
513 return src[0].file == GRF;
514
515 return false;
516 }
517 }
518
519 bool
520 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
521 {
522 if (devinfo->gen == 6 && is_math())
523 return false;
524
525 if (is_send_from_grf())
526 return false;
527
528 if (!backend_instruction::can_do_source_mods())
529 return false;
530
531 return true;
532 }
533
534 bool
535 fs_inst::has_side_effects() const
536 {
537 return this->eot || backend_instruction::has_side_effects();
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Vector float immediate value constructor. */
585 fs_reg::fs_reg(uint8_t vf[4])
586 {
587 init();
588 this->file = IMM;
589 this->type = BRW_REGISTER_TYPE_VF;
590 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
591 }
592
593 /** Vector float immediate value constructor. */
594 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
595 {
596 init();
597 this->file = IMM;
598 this->type = BRW_REGISTER_TYPE_VF;
599 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
600 (vf1 << 8) |
601 (vf2 << 16) |
602 (vf3 << 24);
603 }
604
605 /** Fixed brw_reg. */
606 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
607 {
608 init();
609 this->file = HW_REG;
610 this->fixed_hw_reg = fixed_hw_reg;
611 this->type = fixed_hw_reg.type;
612 this->width = 1 << fixed_hw_reg.width;
613 }
614
615 bool
616 fs_reg::equals(const fs_reg &r) const
617 {
618 return (file == r.file &&
619 reg == r.reg &&
620 reg_offset == r.reg_offset &&
621 subreg_offset == r.subreg_offset &&
622 type == r.type &&
623 negate == r.negate &&
624 abs == r.abs &&
625 !reladdr && !r.reladdr &&
626 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
627 width == r.width &&
628 stride == r.stride);
629 }
630
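/**
 * Restrict this register to a single sub-component: offset to the requested
 * element and set the stride to 0 so every channel reads the same value.
 */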
631 fs_reg &
632 fs_reg::set_smear(unsigned subreg)
633 {
634 assert(file != HW_REG && file != IMM);
635 subreg_offset = subreg * type_sz(type);
636 stride = 0;
637 return *this;
638 }
639
640 bool
641 fs_reg::is_contiguous() const
642 {
643 return stride == 1;
644 }
645
646 int
647 fs_visitor::type_size(const struct glsl_type *type)
648 {
649 unsigned int size, i;
650
651 switch (type->base_type) {
652 case GLSL_TYPE_UINT:
653 case GLSL_TYPE_INT:
654 case GLSL_TYPE_FLOAT:
655 case GLSL_TYPE_BOOL:
656 return type->components();
657 case GLSL_TYPE_ARRAY:
658 return type_size(type->fields.array) * type->length;
659 case GLSL_TYPE_STRUCT:
660 size = 0;
661 for (i = 0; i < type->length; i++) {
662 size += type_size(type->fields.structure[i].type);
663 }
664 return size;
665 case GLSL_TYPE_SAMPLER:
666 /* Samplers take up no register space, since they're baked in at
667 * link time.
668 */
669 return 0;
670 case GLSL_TYPE_ATOMIC_UINT:
671 return 0;
672 case GLSL_TYPE_IMAGE:
673 case GLSL_TYPE_VOID:
674 case GLSL_TYPE_ERROR:
675 case GLSL_TYPE_INTERFACE:
676 case GLSL_TYPE_DOUBLE:
677 unreachable("not reached");
678 }
679
680 return 0;
681 }
682
683 /**
684 * Create a MOV to read the timestamp register.
685 *
686 * The caller is responsible for emitting the MOV. The return value is
687 * the destination of the MOV, with extra parameters set.
688 */
689 fs_reg
690 fs_visitor::get_timestamp(fs_inst **out_mov)
691 {
692 assert(devinfo->gen >= 7);
693
694 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
695 BRW_ARF_TIMESTAMP,
696 0),
697 BRW_REGISTER_TYPE_UD));
698
699 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
700
701 fs_inst *mov = MOV(dst, ts);
702 /* We want to read the 3 fields we care about even if it's not enabled in
703 * the dispatch.
704 */
705 mov->force_writemask_all = true;
706
707 /* The caller wants the low 32 bits of the timestamp. Since it's running
708     * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
709 * which is plenty of time for our purposes. It is identical across the
710 * EUs, but since it's tracking GPU core speed it will increment at a
711 * varying rate as render P-states change.
712 *
713 * The caller could also check if render P-states have changed (or anything
714 * else that might disrupt timing) by setting smear to 2 and checking if
715 * that field is != 0.
716 */
717 dst.set_smear(0);
718
719 *out_mov = mov;
720 return dst;
721 }
722
723 void
724 fs_visitor::emit_shader_time_begin()
725 {
726 current_annotation = "shader time start";
727 fs_inst *mov;
728 shader_start_time = get_timestamp(&mov);
729 emit(mov);
730 }
731
732 void
733 fs_visitor::emit_shader_time_end()
734 {
735 current_annotation = "shader time end";
736
737 enum shader_time_shader_type type, written_type, reset_type;
738 switch (stage) {
739 case MESA_SHADER_VERTEX:
740 type = ST_VS;
741 written_type = ST_VS_WRITTEN;
742 reset_type = ST_VS_RESET;
743 break;
744 case MESA_SHADER_GEOMETRY:
745 type = ST_GS;
746 written_type = ST_GS_WRITTEN;
747 reset_type = ST_GS_RESET;
748 break;
749 case MESA_SHADER_FRAGMENT:
750 if (dispatch_width == 8) {
751 type = ST_FS8;
752 written_type = ST_FS8_WRITTEN;
753 reset_type = ST_FS8_RESET;
754 } else {
755 assert(dispatch_width == 16);
756 type = ST_FS16;
757 written_type = ST_FS16_WRITTEN;
758 reset_type = ST_FS16_RESET;
759 }
760 break;
761 default:
762 unreachable("fs_visitor::emit_shader_time_end missing code");
763 }
764
765 /* Insert our code just before the final SEND with EOT. */
766 exec_node *end = this->instructions.get_tail();
767 assert(end && ((fs_inst *) end)->eot);
768
769 fs_inst *tm_read;
770 fs_reg shader_end_time = get_timestamp(&tm_read);
771 end->insert_before(tm_read);
772
773 /* Check that there weren't any timestamp reset events (assuming these
774 * were the only two timestamp reads that happened).
775 */
776 fs_reg reset = shader_end_time;
777 reset.set_smear(2);
778 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
779 test->conditional_mod = BRW_CONDITIONAL_Z;
780 test->force_writemask_all = true;
781 end->insert_before(test);
782 end->insert_before(IF(BRW_PREDICATE_NORMAL));
783
784 fs_reg start = shader_start_time;
785 start.negate = true;
786 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
787 diff.set_smear(0);
788 fs_inst *add = ADD(diff, start, shader_end_time);
789 add->force_writemask_all = true;
790 end->insert_before(add);
791
792 /* If there were no instructions between the two timestamp gets, the diff
793     * is 2 cycles.  Remove that overhead, so we can ignore it when trying
794     * to determine the time taken by individual instructions.
795 */
796 add = ADD(diff, diff, fs_reg(-2u));
797 add->force_writemask_all = true;
798 end->insert_before(add);
799
800 end->insert_before(SHADER_TIME_ADD(type, diff));
801 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
802 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
803 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
804 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
805 }
806
807 fs_inst *
808 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
809 {
810 int shader_time_index =
811 brw_get_shader_time_index(brw, shader_prog, prog, type);
812 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
813
814 fs_reg payload;
815 if (dispatch_width == 8)
816 payload = vgrf(glsl_type::uvec2_type);
817 else
818 payload = vgrf(glsl_type::uint_type);
819
820 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
821 fs_reg(), payload, offset, value);
822 }
823
824 void
825 fs_visitor::vfail(const char *format, va_list va)
826 {
827 char *msg;
828
829 if (failed)
830 return;
831
832 failed = true;
833
834 msg = ralloc_vasprintf(mem_ctx, format, va);
835 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
836
837 this->fail_msg = msg;
838
839 if (debug_enabled) {
840 fprintf(stderr, "%s", msg);
841 }
842 }
843
844 void
845 fs_visitor::fail(const char *format, ...)
846 {
847 va_list va;
848
849 va_start(va, format);
850 vfail(format, va);
851 va_end(va);
852 }
853
854 /**
855 * Mark this program as impossible to compile in SIMD16 mode.
856 *
857 * During the SIMD8 compile (which happens first), we can detect and flag
858 * things that are unsupported in SIMD16 mode, so the compiler can skip
859 * the SIMD16 compile altogether.
860 *
861 * During a SIMD16 compile (if one happens anyway), this just calls fail().
862 */
863 void
864 fs_visitor::no16(const char *format, ...)
865 {
866 va_list va;
867
868 va_start(va, format);
869
870 if (dispatch_width == 16) {
871 vfail(format, va);
872 } else {
873 simd16_unsupported = true;
874
875 if (brw->perf_debug) {
876 if (no16_msg)
877 ralloc_vasprintf_append(&no16_msg, format, va);
878 else
879 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
880 }
881 }
882
883 va_end(va);
884 }
885
886 fs_inst *
887 fs_visitor::emit(enum opcode opcode)
888 {
889 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
890 }
891
892 fs_inst *
893 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
894 {
895 return emit(new(mem_ctx) fs_inst(opcode, dst));
896 }
897
898 fs_inst *
899 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
900 {
901 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
902 }
903
904 fs_inst *
905 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
906 const fs_reg &src1)
907 {
908 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
909 }
910
911 fs_inst *
912 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
913 const fs_reg &src1, const fs_reg &src2)
914 {
915 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
916 }
917
918 fs_inst *
919 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
920 fs_reg src[], int sources)
921 {
922 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
923 }
924
925 /**
926 * Returns true if the instruction has a flag that means it won't
927 * update an entire destination register.
928 *
929 * For example, dead code elimination and live variable analysis want to know
930 * when a write to a variable screens off any preceding values that were in
931 * it.
932 */
933 bool
934 fs_inst::is_partial_write() const
935 {
936 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
937 (this->dst.width * type_sz(this->dst.type)) < 32 ||
938 !this->dst.is_contiguous());
939 }
940
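/**
 * Return the number of hardware registers read by source 'arg'.  Send-like
 * opcodes that take their payload in source 0 report the message length,
 * FS_OPCODE_LINTERP reads exec_size / 4 registers of barycentric deltas,
 * and everything else is derived from the source's width, stride and type
 * size (a stride of 0 only ever reads a single register).
 */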
941 int
942 fs_inst::regs_read(int arg) const
943 {
944 if (is_tex() && arg == 0 && src[0].file == GRF) {
945 return mlen;
946 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
947 return mlen;
948 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
949 return mlen;
950 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
951 return mlen;
952 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
953 return mlen;
954 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
955 return mlen;
956 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
957 return exec_size / 4;
958 }
959
960 switch (src[arg].file) {
961 case BAD_FILE:
962 case UNIFORM:
963 case IMM:
964 return 1;
965 case GRF:
966 case HW_REG:
967 if (src[arg].stride == 0) {
968 return 1;
969 } else {
970 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
971 return (size + 31) / 32;
972 }
973 case MRF:
974 unreachable("MRF registers are not allowed as sources");
975 default:
976 unreachable("Invalid register file");
977 }
978 }
979
980 bool
981 fs_inst::reads_flag() const
982 {
983 return predicate;
984 }
985
986 bool
987 fs_inst::writes_flag() const
988 {
989 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
990 opcode != BRW_OPCODE_IF &&
991 opcode != BRW_OPCODE_WHILE)) ||
992 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
993 }
994
995 /**
996 * Returns how many MRFs an FS opcode will write over.
997 *
998 * Note that this is not the 0 or 1 implied writes in an actual gen
999 * instruction -- the FS opcodes often generate MOVs in addition.
1000 */
1001 int
1002 fs_visitor::implied_mrf_writes(fs_inst *inst)
1003 {
1004 if (inst->mlen == 0)
1005 return 0;
1006
1007 if (inst->base_mrf == -1)
1008 return 0;
1009
1010 switch (inst->opcode) {
1011 case SHADER_OPCODE_RCP:
1012 case SHADER_OPCODE_RSQ:
1013 case SHADER_OPCODE_SQRT:
1014 case SHADER_OPCODE_EXP2:
1015 case SHADER_OPCODE_LOG2:
1016 case SHADER_OPCODE_SIN:
1017 case SHADER_OPCODE_COS:
1018 return 1 * dispatch_width / 8;
1019 case SHADER_OPCODE_POW:
1020 case SHADER_OPCODE_INT_QUOTIENT:
1021 case SHADER_OPCODE_INT_REMAINDER:
1022 return 2 * dispatch_width / 8;
1023 case SHADER_OPCODE_TEX:
1024 case FS_OPCODE_TXB:
1025 case SHADER_OPCODE_TXD:
1026 case SHADER_OPCODE_TXF:
1027 case SHADER_OPCODE_TXF_CMS:
1028 case SHADER_OPCODE_TXF_MCS:
1029 case SHADER_OPCODE_TG4:
1030 case SHADER_OPCODE_TG4_OFFSET:
1031 case SHADER_OPCODE_TXL:
1032 case SHADER_OPCODE_TXS:
1033 case SHADER_OPCODE_LOD:
1034 return 1;
1035 case FS_OPCODE_FB_WRITE:
1036 return 2;
1037 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1038 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1039 return 1;
1040 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1041 return inst->mlen;
1042 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1043 return 2;
1044 case SHADER_OPCODE_UNTYPED_ATOMIC:
1045 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1046 case SHADER_OPCODE_URB_WRITE_SIMD8:
1047 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1048 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1049 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1050 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1051 return 0;
1052 default:
1053 unreachable("not reached");
1054 }
1055 }
1056
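/**
 * Allocate a new virtual GRF sized to hold the given GLSL type at the
 * current dispatch width.
 */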
1057 fs_reg
1058 fs_visitor::vgrf(const glsl_type *const type)
1059 {
1060 int reg_width = dispatch_width / 8;
1061 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1062 brw_type_for_base_type(type), dispatch_width);
1063 }
1064
1065 fs_reg
1066 fs_visitor::vgrf(int num_components)
1067 {
1068 int reg_width = dispatch_width / 8;
1069 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1070 BRW_REGISTER_TYPE_F, dispatch_width);
1071 }
1072
1073 /** Fixed HW reg constructor. */
1074 fs_reg::fs_reg(enum register_file file, int reg)
1075 {
1076 init();
1077 this->file = file;
1078 this->reg = reg;
1079 this->type = BRW_REGISTER_TYPE_F;
1080
1081 switch (file) {
1082 case UNIFORM:
1083 this->width = 1;
1084 break;
1085 default:
1086 this->width = 8;
1087 }
1088 }
1089
1090 /** Fixed HW reg constructor. */
1091 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1092 {
1093 init();
1094 this->file = file;
1095 this->reg = reg;
1096 this->type = type;
1097
1098 switch (file) {
1099 case UNIFORM:
1100 this->width = 1;
1101 break;
1102 default:
1103 this->width = 8;
1104 }
1105 }
1106
1107 /** Fixed HW reg constructor. */
1108 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1109 uint8_t width)
1110 {
1111 init();
1112 this->file = file;
1113 this->reg = reg;
1114 this->type = type;
1115 this->width = width;
1116 }
1117
1118 fs_reg *
1119 fs_visitor::variable_storage(ir_variable *var)
1120 {
1121 return (fs_reg *)hash_table_find(this->variable_ht, var);
1122 }
1123
1124 void
1125 import_uniforms_callback(const void *key,
1126 void *data,
1127 void *closure)
1128 {
1129 struct hash_table *dst_ht = (struct hash_table *)closure;
1130 const fs_reg *reg = (const fs_reg *)data;
1131
1132 if (reg->file != UNIFORM)
1133 return;
1134
1135 hash_table_insert(dst_ht, data, key);
1136 }
1137
1138 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1139  * This brings in those uniform definitions.
1140  */
1141 void
1142 fs_visitor::import_uniforms(fs_visitor *v)
1143 {
1144 hash_table_call_foreach(v->variable_ht,
1145 import_uniforms_callback,
1146 variable_ht);
1147 this->push_constant_loc = v->push_constant_loc;
1148 this->pull_constant_loc = v->pull_constant_loc;
1149 this->uniforms = v->uniforms;
1150 this->param_size = v->param_size;
1151 }
1152
1153 /* Our support for uniforms is piggy-backed on the struct
1154 * gl_fragment_program, because that's where the values actually
1155 * get stored, rather than in some global gl_shader_program uniform
1156 * store.
1157 */
1158 void
1159 fs_visitor::setup_uniform_values(ir_variable *ir)
1160 {
1161 int namelen = strlen(ir->name);
1162
1163 /* The data for our (non-builtin) uniforms is stored in a series of
1164 * gl_uniform_driver_storage structs for each subcomponent that
1165 * glGetUniformLocation() could name. We know it's been set up in the same
1166 * order we'd walk the type, so walk the list of storage and find anything
1167 * with our name, or the prefix of a component that starts with our name.
1168 */
1169 unsigned params_before = uniforms;
1170 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1171 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1172
1173 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1174 (storage->name[namelen] != 0 &&
1175 storage->name[namelen] != '.' &&
1176 storage->name[namelen] != '[')) {
1177 continue;
1178 }
1179
1180 unsigned slots = storage->type->component_slots();
1181 if (storage->array_elements)
1182 slots *= storage->array_elements;
1183
1184 for (unsigned i = 0; i < slots; i++) {
1185 stage_prog_data->param[uniforms++] = &storage->storage[i];
1186 }
1187 }
1188
1189 /* Make sure we actually initialized the right amount of stuff here. */
1190 assert(params_before + ir->type->component_slots() == uniforms);
1191 (void)params_before;
1192 }
1193
1194
1195 /* Our support for builtin uniforms is even scarier than non-builtin.
1196 * It sits on top of the PROG_STATE_VAR parameters that are
1197 * automatically updated from GL context state.
1198 */
1199 void
1200 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1201 {
1202 const ir_state_slot *const slots = ir->get_state_slots();
1203 assert(slots != NULL);
1204
1205 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1206 /* This state reference has already been setup by ir_to_mesa, but we'll
1207 * get the same index back here.
1208 */
1209 int index = _mesa_add_state_reference(this->prog->Parameters,
1210 (gl_state_index *)slots[i].tokens);
1211
1212 /* Add each of the unique swizzles of the element as a parameter.
1213 * This'll end up matching the expected layout of the
1214 * array/matrix/structure we're trying to fill in.
1215 */
1216 int last_swiz = -1;
1217 for (unsigned int j = 0; j < 4; j++) {
1218 int swiz = GET_SWZ(slots[i].swizzle, j);
1219 if (swiz == last_swiz)
1220 break;
1221 last_swiz = swiz;
1222
1223 stage_prog_data->param[uniforms++] =
1224 &prog->Parameters->ParameterValues[index][swiz];
1225 }
1226 }
1227 }
1228
1229 fs_reg *
1230 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1231 bool origin_upper_left)
1232 {
1233 assert(stage == MESA_SHADER_FRAGMENT);
1234 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1235 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1236 fs_reg wpos = *reg;
1237 bool flip = !origin_upper_left ^ key->render_to_fbo;
1238
1239 /* gl_FragCoord.x */
1240 if (pixel_center_integer) {
1241 emit(MOV(wpos, this->pixel_x));
1242 } else {
1243 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1244 }
1245 wpos = offset(wpos, 1);
1246
1247 /* gl_FragCoord.y */
1248 if (!flip && pixel_center_integer) {
1249 emit(MOV(wpos, this->pixel_y));
1250 } else {
1251 fs_reg pixel_y = this->pixel_y;
1252 float offset = (pixel_center_integer ? 0.0 : 0.5);
1253
1254 if (flip) {
1255 pixel_y.negate = true;
1256 offset += key->drawable_height - 1.0;
1257 }
1258
1259 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1260 }
1261 wpos = offset(wpos, 1);
1262
1263 /* gl_FragCoord.z */
1264 if (devinfo->gen >= 6) {
1265 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1266 } else {
1267 emit(FS_OPCODE_LINTERP, wpos,
1268 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1269 interp_reg(VARYING_SLOT_POS, 2));
1270 }
1271 wpos = offset(wpos, 1);
1272
1273 /* gl_FragCoord.w: Already set up in emit_interpolation */
1274 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1275
1276 return reg;
1277 }
1278
1279 fs_inst *
1280 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1281 glsl_interp_qualifier interpolation_mode,
1282 bool is_centroid, bool is_sample)
1283 {
1284 brw_wm_barycentric_interp_mode barycoord_mode;
1285 if (devinfo->gen >= 6) {
1286 if (is_centroid) {
1287 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1288 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1289 else
1290 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1291 } else if (is_sample) {
1292 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1293 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1294 else
1295 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1296 } else {
1297 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1298 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1299 else
1300 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1301 }
1302 } else {
1303 /* On Ironlake and below, there is only one interpolation mode.
1304 * Centroid interpolation doesn't mean anything on this hardware --
1305 * there is no multisampling.
1306 */
1307 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1308 }
1309 return emit(FS_OPCODE_LINTERP, attr,
1310 this->delta_xy[barycoord_mode], interp);
1311 }
1312
1313 void
1314 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1315 const glsl_type *type,
1316 glsl_interp_qualifier interpolation_mode,
1317 int location, bool mod_centroid,
1318 bool mod_sample)
1319 {
1320 attr.type = brw_type_for_base_type(type->get_scalar_type());
1321
1322 assert(stage == MESA_SHADER_FRAGMENT);
1323 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1324 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1325
1326 unsigned int array_elements;
1327
1328 if (type->is_array()) {
1329 array_elements = type->length;
1330 if (array_elements == 0) {
1331 fail("dereferenced array '%s' has length 0\n", name);
1332 }
1333 type = type->fields.array;
1334 } else {
1335 array_elements = 1;
1336 }
1337
1338 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1339 bool is_gl_Color =
1340 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1341 if (key->flat_shade && is_gl_Color) {
1342 interpolation_mode = INTERP_QUALIFIER_FLAT;
1343 } else {
1344 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1345 }
1346 }
1347
1348 for (unsigned int i = 0; i < array_elements; i++) {
1349 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1350 if (prog_data->urb_setup[location] == -1) {
1351 /* If there's no incoming setup data for this slot, don't
1352 * emit interpolation for it.
1353 */
1354 attr = offset(attr, type->vector_elements);
1355 location++;
1356 continue;
1357 }
1358
1359 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1360 /* Constant interpolation (flat shading) case. The SF has
1361 * handed us defined values in only the constant offset
1362 * field of the setup reg.
1363 */
1364 for (unsigned int k = 0; k < type->vector_elements; k++) {
1365 struct brw_reg interp = interp_reg(location, k);
1366 interp = suboffset(interp, 3);
1367 interp.type = attr.type;
1368 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1369 attr = offset(attr, 1);
1370 }
1371 } else {
1372 /* Smooth/noperspective interpolation case. */
1373 for (unsigned int k = 0; k < type->vector_elements; k++) {
1374 struct brw_reg interp = interp_reg(location, k);
1375 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1376 /* Get the pixel/sample mask into f0 so that we know
1377 * which pixels are lit. Then, for each channel that is
1378 * unlit, replace the centroid data with non-centroid
1379 * data.
1380 */
1381 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1382
1383 fs_inst *inst;
1384 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1385 false, false);
1386 inst->predicate = BRW_PREDICATE_NORMAL;
1387 inst->predicate_inverse = true;
1388 if (devinfo->has_pln)
1389 inst->no_dd_clear = true;
1390
1391 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1392 mod_centroid && !key->persample_shading,
1393 mod_sample || key->persample_shading);
1394 inst->predicate = BRW_PREDICATE_NORMAL;
1395 inst->predicate_inverse = false;
1396 if (devinfo->has_pln)
1397 inst->no_dd_check = true;
1398
1399 } else {
1400 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1401 mod_centroid && !key->persample_shading,
1402 mod_sample || key->persample_shading);
1403 }
1404 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1405 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1406 }
1407 attr = offset(attr, 1);
1408 }
1409
1410 }
1411 location++;
1412 }
1413 }
1414 }
1415
1416 fs_reg *
1417 fs_visitor::emit_frontfacing_interpolation()
1418 {
1419 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1420
1421 if (devinfo->gen >= 6) {
1422 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1423 * a boolean result from this (~0/true or 0/false).
1424 *
1425 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1426 * this task in only one instruction:
1427 * - a negation source modifier will flip the bit; and
1428 * - a W -> D type conversion will sign extend the bit into the high
1429 * word of the destination.
1430 *
1431 * An ASR 15 fills the low word of the destination.
1432 */
1433 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1434 g0.negate = true;
1435
1436 emit(ASR(*reg, g0, fs_reg(15)));
1437 } else {
1438 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1439 * a boolean result from this (1/true or 0/false).
1440 *
1441 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1442 * the negation source modifier to flip it. Unfortunately the SHR
1443 * instruction only operates on UD (or D with an abs source modifier)
1444 * sources without negation.
1445 *
1446 * Instead, use ASR (which will give ~0/true or 0/false).
1447 */
1448 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1449 g1_6.negate = true;
1450
1451 emit(ASR(*reg, g1_6, fs_reg(31)));
1452 }
1453
1454 return reg;
1455 }
1456
1457 void
1458 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1459 {
1460 assert(stage == MESA_SHADER_FRAGMENT);
1461 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1462 assert(dst.type == BRW_REGISTER_TYPE_F);
1463
1464 if (key->compute_pos_offset) {
1465 /* Convert int_sample_pos to floating point */
1466 emit(MOV(dst, int_sample_pos));
1467 /* Scale to the range [0, 1] */
1468 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1469 }
1470 else {
1471 /* From ARB_sample_shading specification:
1472 * "When rendering to a non-multisample buffer, or if multisample
1473 * rasterization is disabled, gl_SamplePosition will always be
1474 * (0.5, 0.5).
1475       *  (0.5, 0.5)."
1476 emit(MOV(dst, fs_reg(0.5f)));
1477 }
1478 }
1479
1480 fs_reg *
1481 fs_visitor::emit_samplepos_setup()
1482 {
1483 assert(devinfo->gen >= 6);
1484
1485 this->current_annotation = "compute sample position";
1486 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1487 fs_reg pos = *reg;
1488 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1489 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1490
1491 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1492 * mode will be enabled.
1493 *
1494 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1495 * R31.1:0 Position Offset X/Y for Slot[3:0]
1496 * R31.3:2 Position Offset X/Y for Slot[7:4]
1497 * .....
1498 *
1499 * The X, Y sample positions come in as bytes in thread payload. So, read
1500 * the positions using vstride=16, width=8, hstride=2.
1501 */
1502 struct brw_reg sample_pos_reg =
1503 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1504 BRW_REGISTER_TYPE_B), 16, 8, 2);
1505
1506 if (dispatch_width == 8) {
1507 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1508 } else {
1509 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1510 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1511 ->force_sechalf = true;
1512 }
1513 /* Compute gl_SamplePosition.x */
1514 compute_sample_position(pos, int_sample_x);
1515 pos = offset(pos, 1);
1516 if (dispatch_width == 8) {
1517 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1518 } else {
1519 emit(MOV(half(int_sample_y, 0),
1520 fs_reg(suboffset(sample_pos_reg, 1))));
1521 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1522 ->force_sechalf = true;
1523 }
1524 /* Compute gl_SamplePosition.y */
1525 compute_sample_position(pos, int_sample_y);
1526 return reg;
1527 }
1528
1529 fs_reg *
1530 fs_visitor::emit_sampleid_setup()
1531 {
1532 assert(stage == MESA_SHADER_FRAGMENT);
1533 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1534 assert(devinfo->gen >= 6);
1535
1536 this->current_annotation = "compute sample id";
1537 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1538
1539 if (key->compute_sample_id) {
1540 fs_reg t1 = vgrf(glsl_type::int_type);
1541 fs_reg t2 = vgrf(glsl_type::int_type);
1542 t2.type = BRW_REGISTER_TYPE_UW;
1543
1544 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1545 * 8x multisampling, subspan 0 will represent sample N (where N
1546 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1547 * 7. We can find the value of N by looking at R0.0 bits 7:6
1548 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1549 * (since samples are always delivered in pairs). That is, we
1550 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1551 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1552 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1553 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1554 * populating a temporary variable with the sequence (0, 1, 2, 3),
1555 * and then reading from it using vstride=1, width=4, hstride=0.
1556 * These computations hold good for 4x multisampling as well.
1557 *
1558 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1559 * the first four slots are sample 0 of subspan 0; the next four
1560 * are sample 1 of subspan 0; the third group is sample 0 of
1561 * subspan 1, and finally sample 1 of subspan 1.
1562 */
1563 fs_inst *inst;
1564 inst = emit(BRW_OPCODE_AND, t1,
1565 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1566 fs_reg(0xc0));
1567 inst->force_writemask_all = true;
1568 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1569 inst->force_writemask_all = true;
1570 /* This works for both SIMD8 and SIMD16 */
1571 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1572 inst->force_writemask_all = true;
1573 /* This special instruction takes care of setting vstride=1,
1574 * width=4, hstride=0 of t2 during an ADD instruction.
1575 */
1576 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1577 } else {
1578 /* As per GL_ARB_sample_shading specification:
1579 * "When rendering to a non-multisample buffer, or if multisample
1580 * rasterization is disabled, gl_SampleID will always be zero."
1581 */
1582 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1583 }
1584
1585 return reg;
1586 }
1587
1588 void
1589 fs_visitor::resolve_source_modifiers(fs_reg *src)
1590 {
1591 if (!src->abs && !src->negate)
1592 return;
1593
1594 fs_reg temp = retype(vgrf(1), src->type);
1595 emit(MOV(temp, *src));
1596 *src = temp;
1597 }
1598
1599 fs_reg
1600 fs_visitor::fix_math_operand(fs_reg src)
1601 {
1602 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1603 * might be able to do better by doing execsize = 1 math and then
1604 * expanding that result out, but we would need to be careful with
1605 * masking.
1606 *
1607 * The hardware ignores source modifiers (negate and abs) on math
1608 * instructions, so we also move to a temp to set those up.
1609 */
1610 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1611 !src.abs && !src.negate)
1612 return src;
1613
1614 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1615 * operands to math
1616 */
1617 if (devinfo->gen >= 7 && src.file != IMM)
1618 return src;
1619
1620 fs_reg expanded = vgrf(glsl_type::float_type);
1621 expanded.type = src.type;
1622 emit(BRW_OPCODE_MOV, expanded, src);
1623 return expanded;
1624 }
1625
1626 fs_inst *
1627 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1628 {
1629 switch (opcode) {
1630 case SHADER_OPCODE_RCP:
1631 case SHADER_OPCODE_RSQ:
1632 case SHADER_OPCODE_SQRT:
1633 case SHADER_OPCODE_EXP2:
1634 case SHADER_OPCODE_LOG2:
1635 case SHADER_OPCODE_SIN:
1636 case SHADER_OPCODE_COS:
1637 break;
1638 default:
1639 unreachable("not reached: bad math opcode");
1640 }
1641
1642 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1643 * might be able to do better by doing execsize = 1 math and then
1644 * expanding that result out, but we would need to be careful with
1645 * masking.
1646 *
1647 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1648 * instructions, so we also move to a temp to set those up.
1649 */
1650 if (devinfo->gen == 6 || devinfo->gen == 7)
1651 src = fix_math_operand(src);
1652
1653 fs_inst *inst = emit(opcode, dst, src);
1654
1655 if (devinfo->gen < 6) {
1656 inst->base_mrf = 2;
1657 inst->mlen = dispatch_width / 8;
1658 }
1659
1660 return inst;
1661 }
1662
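/**
 * Two-source math (POW and the INT DIV opcodes).  Gen8+ can take the
 * operands directly, Gen6-7 need fix_math_operand() to legalize them, and
 * Gen4-5 send the second operand through an MRF message payload (note that
 * the INT DIV message takes its operands in the opposite order).
 */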
1663 fs_inst *
1664 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1665 {
1666 int base_mrf = 2;
1667 fs_inst *inst;
1668
1669 if (devinfo->gen >= 8) {
1670 inst = emit(opcode, dst, src0, src1);
1671 } else if (devinfo->gen >= 6) {
1672 src0 = fix_math_operand(src0);
1673 src1 = fix_math_operand(src1);
1674
1675 inst = emit(opcode, dst, src0, src1);
1676 } else {
1677 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1678 * "Message Payload":
1679 *
1680 * "Operand0[7]. For the INT DIV functions, this operand is the
1681 * denominator."
1682 * ...
1683 * "Operand1[7]. For the INT DIV functions, this operand is the
1684 * numerator."
1685 */
1686 bool is_int_div = opcode != SHADER_OPCODE_POW;
1687 fs_reg &op0 = is_int_div ? src1 : src0;
1688 fs_reg &op1 = is_int_div ? src0 : src1;
1689
1690 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1691 inst = emit(opcode, dst, op0, reg_null_f);
1692
1693 inst->base_mrf = base_mrf;
1694 inst->mlen = 2 * dispatch_width / 8;
1695 }
1696 return inst;
1697 }
1698
1699 void
1700 fs_visitor::emit_discard_jump()
1701 {
1702 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1703
1704 /* For performance, after a discard, jump to the end of the
1705 * shader if all relevant channels have been discarded.
1706 */
1707 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1708 discard_jump->flag_subreg = 1;
1709
1710 discard_jump->predicate = (dispatch_width == 8)
1711 ? BRW_PREDICATE_ALIGN1_ANY8H
1712 : BRW_PREDICATE_ALIGN1_ANY16H;
1713 discard_jump->predicate_inverse = true;
1714 }
1715
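/**
 * Map push constants (the CURB) to hardware registers.  Records the GRF
 * where dispatched constant data starts for this dispatch width, sizes
 * curb_read_length from the number of pushed params (eight floats per
 * register), and rewrites every UNIFORM-file source to the fixed GRF
 * location assigned to it in push_constant_loc; out-of-range uniform
 * reads just use the first push constant.
 */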
1716 void
1717 fs_visitor::assign_curb_setup()
1718 {
1719 if (dispatch_width == 8) {
1720 prog_data->dispatch_grf_start_reg = payload.num_regs;
1721 } else {
1722 if (stage == MESA_SHADER_FRAGMENT) {
1723 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1724 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1725 } else if (stage == MESA_SHADER_COMPUTE) {
1726 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1727 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1728 } else {
1729 unreachable("Unsupported shader type!");
1730 }
1731 }
1732
1733 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1734
1735 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1736 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1737 for (unsigned int i = 0; i < inst->sources; i++) {
1738 if (inst->src[i].file == UNIFORM) {
1739 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1740 int constant_nr;
1741 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1742 constant_nr = push_constant_loc[uniform_nr];
1743 } else {
1744 /* Section 5.11 of the OpenGL 4.1 spec says:
1745 * "Out-of-bounds reads return undefined values, which include
1746 * values from other variables of the active program or zero."
1747 * Just return the first push constant.
1748 */
1749 constant_nr = 0;
1750 }
1751
1752 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1753 constant_nr / 8,
1754 constant_nr % 8);
1755
1756 inst->src[i].file = HW_REG;
1757 inst->src[i].fixed_hw_reg = byte_offset(
1758 retype(brw_reg, inst->src[i].type),
1759 inst->src[i].subreg_offset);
1760 }
1761 }
1762 }
1763 }
1764
1765 void
1766 fs_visitor::calculate_urb_setup()
1767 {
1768 assert(stage == MESA_SHADER_FRAGMENT);
1769 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1770 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1771
1772 memset(prog_data->urb_setup, -1,
1773 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1774
1775 int urb_next = 0;
1776 /* Figure out where each of the incoming setup attributes lands. */
1777 if (devinfo->gen >= 6) {
1778 if (_mesa_bitcount_64(prog->InputsRead &
1779 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1780 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1781 * first 16 varying inputs, so we can put them wherever we want.
1782 * Just put them in order.
1783 *
1784 * This is useful because it means that (a) inputs not used by the
1785 * fragment shader won't take up valuable register space, and (b) we
1786 * won't have to recompile the fragment shader if it gets paired with
1787 * a different vertex (or geometry) shader.
1788 */
1789 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1790 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1791 BITFIELD64_BIT(i)) {
1792 prog_data->urb_setup[i] = urb_next++;
1793 }
1794 }
1795 } else {
1796 /* We have enough input varyings that the SF/SBE pipeline stage can't
1797 * arbitrarily rearrange them to suit our whim; we have to put them
1798 * in an order that matches the output of the previous pipeline stage
1799 * (geometry or vertex shader).
1800 */
1801 struct brw_vue_map prev_stage_vue_map;
1802 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1803 key->input_slots_valid);
1804 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1805 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1806 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1807 slot++) {
1808 int varying = prev_stage_vue_map.slot_to_varying[slot];
1809 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1810 * unused.
1811 */
1812 if (varying != BRW_VARYING_SLOT_COUNT &&
1813 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1814 BITFIELD64_BIT(varying))) {
1815 prog_data->urb_setup[varying] = slot - first_slot;
1816 }
1817 }
1818 urb_next = prev_stage_vue_map.num_slots - first_slot;
1819 }
1820 } else {
1821 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1822 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1823 /* Point size is packed into the header, not as a general attribute */
1824 if (i == VARYING_SLOT_PSIZ)
1825 continue;
1826
1827 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1828 /* The back color slot is skipped when the front color is
1829 * also written to. In addition, some slots can be
1830 * written in the vertex shader and not read in the
1831 * fragment shader. So the register number must always be
1832 * incremented, mapped or not.
1833 */
1834 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1835 prog_data->urb_setup[i] = urb_next;
1836 urb_next++;
1837 }
1838 }
1839
1840       /*
1841        * It's an FS-only attribute (VARYING_SLOT_PNTC), and the SF thread did
1842        * the interpolation for it.  So count it here, too.
1843        *
1844        * See compile_sf_prog() for more info.
1845        */
1846 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1847 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1848 }
1849
1850 prog_data->num_varying_inputs = urb_next;
1851 }
1852
1853 void
1854 fs_visitor::assign_urb_setup()
1855 {
1856 assert(stage == MESA_SHADER_FRAGMENT);
1857 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1858
1859 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1860
1861 /* Offset all the urb_setup[] index by the actual position of the
1862 * setup regs, now that the location of the constants has been chosen.
1863 */
1864 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1865 if (inst->opcode == FS_OPCODE_LINTERP) {
1866 assert(inst->src[1].file == HW_REG);
1867 inst->src[1].fixed_hw_reg.nr += urb_start;
1868 }
1869
1870 if (inst->opcode == FS_OPCODE_CINTERP) {
1871 assert(inst->src[0].file == HW_REG);
1872 inst->src[0].fixed_hw_reg.nr += urb_start;
1873 }
1874 }
1875
1876 /* Each attribute is 4 setup channels, each of which is half a reg. */
1877 this->first_non_payload_grf =
1878 urb_start + prog_data->num_varying_inputs * 2;
1879 }
1880
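/**
 * Map VS inputs to hardware registers.  Vertex attributes arrive right
 * after the payload and push constants, four GRFs per enabled attribute
 * (plus one extra slot at the end when gl_VertexID/gl_InstanceID are
 * used), so every ATTR-file source is rewritten to the fixed GRF computed
 * from its slot and channel, and the URB entry size and read length are
 * derived from the attribute count.
 */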
1881 void
1882 fs_visitor::assign_vs_urb_setup()
1883 {
1884 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1885 int grf, count, slot, channel, attr;
1886
1887 assert(stage == MESA_SHADER_VERTEX);
1888 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1889 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1890 count++;
1891
1892 /* Each attribute is 4 regs. */
1893 this->first_non_payload_grf =
1894 payload.num_regs + prog_data->curb_read_length + count * 4;
1895
1896 unsigned vue_entries =
1897 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1898
1899 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1900 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1901
1902 assert(vs_prog_data->base.urb_read_length <= 15);
1903
1904 /* Rewrite all ATTR file references to the hw grf that they land in. */
1905 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1906 for (int i = 0; i < inst->sources; i++) {
1907 if (inst->src[i].file == ATTR) {
1908
1909 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1910 slot = count - 1;
1911 } else {
1912 /* Attributes come in a contiguous block, ordered by their
1913 * gl_vert_attrib value. That means we can compute the slot
1914 * number for an attribute by masking out the enabled
1915 * attributes before it and counting the bits.
1916 */
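/* For example, with a hypothetical inputs_read having bits 0, 3 and 4 set,
 * attribute 4 lands in slot 2, since two enabled attributes precede it.
 */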
1917 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1918 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1919 BITFIELD64_MASK(attr));
1920 }
1921
1922 channel = inst->src[i].reg_offset & 3;
1923
1924 grf = payload.num_regs +
1925 prog_data->curb_read_length +
1926 slot * 4 + channel;
1927
1928 inst->src[i].file = HW_REG;
1929 inst->src[i].fixed_hw_reg =
1930 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1931 }
1932 }
1933 }
1934 }
1935
1936 /**
1937 * Split large virtual GRFs into separate components if we can.
1938 *
1939 * This is mostly duplicated with what brw_fs_vector_splitting does,
1940 * but that's really conservative because it's afraid of doing
1941 * splitting that doesn't result in real progress after the rest of
1942 * the optimization phases, which would cause infinite looping in
1943 * optimization. We can do it once here, safely. This also has the
1944 * opportunity to split interpolated values, or maybe even uniforms,
1945 * which we don't have at the IR level.
1946 *
1947 * We want to split, because virtual GRFs are what we register
1948 * allocate and spill (due to contiguousness requirements for some
1949 * instructions), and they're what we naturally generate in the
1950 * codegen process, but most virtual GRFs don't actually need to be
1951 * contiguous sets of GRFs. If we split, we'll end up with reduced
1952 * live intervals and better dead code elimination and coalescing.
1953 */
1954 void
1955 fs_visitor::split_virtual_grfs()
1956 {
1957 int num_vars = this->alloc.count;
1958
1959 /* Count the total number of registers */
1960 int reg_count = 0;
1961 int vgrf_to_reg[num_vars];
1962 for (int i = 0; i < num_vars; i++) {
1963 vgrf_to_reg[i] = reg_count;
1964 reg_count += alloc.sizes[i];
1965 }
1966
1967 /* An array of "split points". For each register slot, this indicates
1968 * if this slot can be separated from the previous slot. Every time an
1969 * instruction uses multiple elements of a register (as a source or
1970 * destination), we mark the used slots as inseparable. Then we go
1971 * through and split the registers into the smallest pieces we can.
1972 */
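/* A sketch of the outcome (not from a real shader): for a 4-register VGRF
 * whose first two registers are only ever accessed together and whose last
 * two are accessed individually, split_points ends up { false, false, true,
 * true }, so the VGRF is split into one 2-register and two 1-register VGRFs.
 */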
1973 bool split_points[reg_count];
1974 memset(split_points, 0, sizeof(split_points));
1975
1976 /* Mark all used registers as fully splittable */
1977 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1978 if (inst->dst.file == GRF) {
1979 int reg = vgrf_to_reg[inst->dst.reg];
1980 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1981 split_points[reg + j] = true;
1982 }
1983
1984 for (int i = 0; i < inst->sources; i++) {
1985 if (inst->src[i].file == GRF) {
1986 int reg = vgrf_to_reg[inst->src[i].reg];
1987 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1988 split_points[reg + j] = true;
1989 }
1990 }
1991 }
1992
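/* Mark any slot that is accessed together with the previous slot, as part
 * of a multi-register read or write, as inseparable.
 */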
1993 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1994 if (inst->dst.file == GRF) {
1995 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1996 for (int j = 1; j < inst->regs_written; j++)
1997 split_points[reg + j] = false;
1998 }
1999 for (int i = 0; i < inst->sources; i++) {
2000 if (inst->src[i].file == GRF) {
2001 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2002 for (int j = 1; j < inst->regs_read(i); j++)
2003 split_points[reg + j] = false;
2004 }
2005 }
2006 }
2007
2008 int new_virtual_grf[reg_count];
2009 int new_reg_offset[reg_count];
2010
2011 int reg = 0;
2012 for (int i = 0; i < num_vars; i++) {
2013 /* The first one should always be 0 as a quick sanity check. */
2014 assert(split_points[reg] == false);
2015
2016 /* j = 0 case */
2017 new_reg_offset[reg] = 0;
2018 reg++;
2019 int offset = 1;
2020
2021 /* j > 0 case */
2022 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2023 /* If this is a split point, reset the offset to 0 and allocate a
2024 * new virtual GRF covering the previous "offset" registers.
2025 */
2026 if (split_points[reg]) {
2027 assert(offset <= MAX_VGRF_SIZE);
2028 int grf = alloc.allocate(offset);
2029 for (int k = reg - offset; k < reg; k++)
2030 new_virtual_grf[k] = grf;
2031 offset = 0;
2032 }
2033 new_reg_offset[reg] = offset;
2034 offset++;
2035 reg++;
2036 }
2037
2038 /* The last one gets the original register number */
2039 assert(offset <= MAX_VGRF_SIZE);
2040 alloc.sizes[i] = offset;
2041 for (int k = reg - offset; k < reg; k++)
2042 new_virtual_grf[k] = i;
2043 }
2044 assert(reg == reg_count);
2045
2046 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2047 if (inst->dst.file == GRF) {
2048 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2049 inst->dst.reg = new_virtual_grf[reg];
2050 inst->dst.reg_offset = new_reg_offset[reg];
2051 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2052 }
2053 for (int i = 0; i < inst->sources; i++) {
2054 if (inst->src[i].file == GRF) {
2055 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2056 inst->src[i].reg = new_virtual_grf[reg];
2057 inst->src[i].reg_offset = new_reg_offset[reg];
2058 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2059 }
2060 }
2061 }
2062 invalidate_live_intervals();
2063 }
2064
2065 /**
2066 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2067 *
2068 * During code generation, we create tons of temporary variables, many of
2069 * which get immediately killed and are never used again. Yet, in later
2070 * optimization and analysis passes, such as compute_live_intervals, we need
2071 * to loop over all the virtual GRFs. Compacting them can save a lot of
2072 * overhead.
2073 */
2074 bool
2075 fs_visitor::compact_virtual_grfs()
2076 {
2077 bool progress = false;
2078 int remap_table[this->alloc.count];
2079 memset(remap_table, -1, sizeof(remap_table));
2080
2081 /* Mark which virtual GRFs are used. */
2082 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2083 if (inst->dst.file == GRF)
2084 remap_table[inst->dst.reg] = 0;
2085
2086 for (int i = 0; i < inst->sources; i++) {
2087 if (inst->src[i].file == GRF)
2088 remap_table[inst->src[i].reg] = 0;
2089 }
2090 }
2091
2092 /* Compact the GRF arrays. */
2093 int new_index = 0;
2094 for (unsigned i = 0; i < this->alloc.count; i++) {
2095 if (remap_table[i] == -1) {
2096 /* We just found an unused register. This means that we are
2097 * actually going to compact something.
2098 */
2099 progress = true;
2100 } else {
2101 remap_table[i] = new_index;
2102 alloc.sizes[new_index] = alloc.sizes[i];
2103 invalidate_live_intervals();
2104 ++new_index;
2105 }
2106 }
2107
2108 this->alloc.count = new_index;
2109
2110 /* Patch all the instructions to use the newly renumbered registers */
2111 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2112 if (inst->dst.file == GRF)
2113 inst->dst.reg = remap_table[inst->dst.reg];
2114
2115 for (int i = 0; i < inst->sources; i++) {
2116 if (inst->src[i].file == GRF)
2117 inst->src[i].reg = remap_table[inst->src[i].reg];
2118 }
2119 }
2120
2121 /* Patch all the references to delta_xy, since they're used in register
2122 * allocation. If they're unused, switch them to BAD_FILE so we don't
2123 * think some random VGRF is delta_xy.
2124 */
2125 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2126 if (delta_xy[i].file == GRF) {
2127 if (remap_table[delta_xy[i].reg] != -1) {
2128 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2129 } else {
2130 delta_xy[i].file = BAD_FILE;
2131 }
2132 }
2133 }
2134
2135 return progress;
2136 }
2137
2138 /**
2139 * Implements array access of uniforms by inserting a
2140 * PULL_CONSTANT_LOAD instruction.
2141 *
2142 * Unlike temporary GRF array access (which we don't support, due to
2143 * the difficulty of doing relative addressing on instruction
2144 * destinations), we could potentially do array access of uniforms
2145 * that were loaded in GRF space as push constants. In real-world
2146 * usage we've seen, though, the arrays being used are always larger
2147 * than we could load as push constants, so just always move all
2148 * uniform array access out to a pull constant buffer.
2149 */
2150 void
2151 fs_visitor::move_uniform_array_access_to_pull_constants()
2152 {
2153 if (dispatch_width != 8)
2154 return;
2155
2156 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2157 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2158
2159 /* Walk through and find array access of uniforms. Put a copy of that
2160 * uniform in the pull constant buffer.
2161 *
2162 * Note that we don't move constant-indexed accesses to arrays. No
2163 * testing has been done of the performance impact of this choice.
2164 */
2165 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2166 for (int i = 0 ; i < inst->sources; i++) {
2167 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2168 continue;
2169
2170 int uniform = inst->src[i].reg;
2171
2172 /* If this array isn't already present in the pull constant buffer,
2173 * add it.
2174 */
2175 if (pull_constant_loc[uniform] == -1) {
2176 const gl_constant_value **values = &stage_prog_data->param[uniform];
2177
2178 assert(param_size[uniform]);
2179
2180 for (int j = 0; j < param_size[uniform]; j++) {
2181 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2182
2183 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2184 values[j];
2185 }
2186 }
2187 }
2188 }
2189 }
2190
2191 /**
2192 * Assign UNIFORM file registers to either push constants or pull constants.
2193 *
2194 * We allow a fragment shader to have more than the GL-specified minimum
2195 * value of the maximum number of fragment shader uniform components (64).
2196 * If there are too many of these, they'd fill up all of the register space.
2197 * So, this will push some of them out to the pull constant buffer and
2198 * update the program to load them.
2199 */
2200 void
2201 fs_visitor::assign_constant_locations()
2202 {
2203 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2204 if (dispatch_width != 8)
2205 return;
2206
2207 /* Find which UNIFORM registers are still in use. */
2208 bool is_live[uniforms];
2209 for (unsigned int i = 0; i < uniforms; i++) {
2210 is_live[i] = false;
2211 }
2212
2213 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2214 for (int i = 0; i < inst->sources; i++) {
2215 if (inst->src[i].file != UNIFORM)
2216 continue;
2217
2218 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2219 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2220 is_live[constant_nr] = true;
2221 }
2222 }
2223
2224 /* Only allow 16 registers (128 uniform components) as push constants.
2225 *
2226 * Just demote the end of the list. We could probably do better
2227 * here, demoting things that are rarely used in the program first.
2228 *
2229 * If changing this value, note the limitation about total_regs in
2230 * brw_curbe.c.
2231 */
2232 unsigned int max_push_components = 16 * 8;
2233 unsigned int num_push_constants = 0;
2234
2235 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2236
2237 for (unsigned int i = 0; i < uniforms; i++) {
2238 if (!is_live[i] || pull_constant_loc[i] != -1) {
2239 /* This UNIFORM register is either dead, or has already been demoted
2240 * to a pull const. Mark it as no longer living in the param[] array.
2241 */
2242 push_constant_loc[i] = -1;
2243 continue;
2244 }
2245
2246 if (num_push_constants < max_push_components) {
2247 /* Retain as a push constant. Record the location in the params[]
2248 * array.
2249 */
2250 push_constant_loc[i] = num_push_constants++;
2251 } else {
2252 /* Demote to a pull constant. */
2253 push_constant_loc[i] = -1;
2254
2255 int pull_index = stage_prog_data->nr_pull_params++;
2256 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2257 pull_constant_loc[i] = pull_index;
2258 }
2259 }
2260
2261 stage_prog_data->nr_params = num_push_constants;
2262
2263 /* Up until now, the param[] array has been indexed by reg + reg_offset
2264 * of UNIFORM registers. Condense it to only contain the uniforms we
2265 * chose to upload as push constants.
2266 */
2267 for (unsigned int i = 0; i < uniforms; i++) {
2268 int remapped = push_constant_loc[i];
2269
2270 if (remapped == -1)
2271 continue;
2272
2273 assert(remapped <= (int)i);
2274 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2275 }
2276 }
2277
2278 /**
2279 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2280 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2281 */
2282 void
2283 fs_visitor::demote_pull_constants()
2284 {
2285 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2286 for (int i = 0; i < inst->sources; i++) {
2287 if (inst->src[i].file != UNIFORM)
2288 continue;
2289
2290 int pull_index;
2291 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2292 if (location >= uniforms) /* Out of bounds access */
2293 pull_index = -1;
2294 else
2295 pull_index = pull_constant_loc[location];
2296
2297 if (pull_index == -1)
2298 continue;
2299
2300 /* Set up the annotation tracking for newly generated instructions. */
2301 base_ir = inst->ir;
2302 current_annotation = inst->annotation;
2303
2304 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2305 fs_reg dst = vgrf(glsl_type::float_type);
2306
2307 /* Generate a pull load into dst. */
2308 if (inst->src[i].reladdr) {
2309 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2310 surf_index,
2311 *inst->src[i].reladdr,
2312 pull_index);
2313 inst->insert_before(block, &list);
2314 inst->src[i].reladdr = NULL;
2315 } else {
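/* pull_index * 4 is the byte offset of the scalar uniform; align it
 * down to the containing vec4 for the load, then pick out the right
 * component with the smear below.
 */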
2316 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2317 fs_inst *pull =
2318 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2319 dst, surf_index, offset);
2320 inst->insert_before(block, pull);
2321 inst->src[i].set_smear(pull_index & 3);
2322 }
2323
2324 /* Rewrite the instruction to use the temporary VGRF. */
2325 inst->src[i].file = GRF;
2326 inst->src[i].reg = dst.reg;
2327 inst->src[i].reg_offset = 0;
2328 inst->src[i].width = dispatch_width;
2329 }
2330 }
2331 invalidate_live_intervals();
2332 }
2333
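/**
 * Perform simple, local algebraic simplifications: fold immediate operands,
 * turn multiplies by 0.0/1.0/-1.0 and adds of 0.0 into MOVs, recognize
 * rcp(sqrt(x)) as rsq(x), and similar rewrites handled case by case below.
 */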
2334 bool
2335 fs_visitor::opt_algebraic()
2336 {
2337 bool progress = false;
2338
2339 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2340 switch (inst->opcode) {
2341 case BRW_OPCODE_MOV:
2342 if (inst->src[0].file != IMM)
2343 break;
2344
2345 if (inst->saturate) {
2346 if (inst->dst.type != inst->src[0].type)
2347 assert(!"unimplemented: saturate mixed types");
2348
2349 if (brw_saturate_immediate(inst->dst.type,
2350 &inst->src[0].fixed_hw_reg)) {
2351 inst->saturate = false;
2352 progress = true;
2353 }
2354 }
2355 break;
2356
2357 case BRW_OPCODE_MUL:
2358 if (inst->src[1].file != IMM)
2359 continue;
2360
2361 /* a * 1.0 = a */
2362 if (inst->src[1].is_one()) {
2363 inst->opcode = BRW_OPCODE_MOV;
2364 inst->src[1] = reg_undef;
2365 progress = true;
2366 break;
2367 }
2368
2369 /* a * -1.0 = -a */
2370 if (inst->src[1].is_negative_one()) {
2371 inst->opcode = BRW_OPCODE_MOV;
2372 inst->src[0].negate = !inst->src[0].negate;
2373 inst->src[1] = reg_undef;
2374 progress = true;
2375 break;
2376 }
2377
2378 /* a * 0.0 = 0.0 */
2379 if (inst->src[1].is_zero()) {
2380 inst->opcode = BRW_OPCODE_MOV;
2381 inst->src[0] = inst->src[1];
2382 inst->src[1] = reg_undef;
2383 progress = true;
2384 break;
2385 }
2386
2387 if (inst->src[0].file == IMM) {
2388 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2389 inst->opcode = BRW_OPCODE_MOV;
2390 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2391 inst->src[1] = reg_undef;
2392 progress = true;
2393 break;
2394 }
2395 break;
2396 case BRW_OPCODE_ADD:
2397 if (inst->src[1].file != IMM)
2398 continue;
2399
2400 /* a + 0.0 = a */
2401 if (inst->src[1].is_zero()) {
2402 inst->opcode = BRW_OPCODE_MOV;
2403 inst->src[1] = reg_undef;
2404 progress = true;
2405 break;
2406 }
2407
2408 if (inst->src[0].file == IMM) {
2409 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2410 inst->opcode = BRW_OPCODE_MOV;
2411 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2412 inst->src[1] = reg_undef;
2413 progress = true;
2414 break;
2415 }
2416 break;
2417 case BRW_OPCODE_OR:
2418 if (inst->src[0].equals(inst->src[1])) {
2419 inst->opcode = BRW_OPCODE_MOV;
2420 inst->src[1] = reg_undef;
2421 progress = true;
2422 break;
2423 }
2424 break;
2425 case BRW_OPCODE_LRP:
2426 if (inst->src[1].equals(inst->src[2])) {
2427 inst->opcode = BRW_OPCODE_MOV;
2428 inst->src[0] = inst->src[1];
2429 inst->src[1] = reg_undef;
2430 inst->src[2] = reg_undef;
2431 progress = true;
2432 break;
2433 }
2434 break;
2435 case BRW_OPCODE_CMP:
2436 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2437 inst->src[0].abs &&
2438 inst->src[0].negate &&
2439 inst->src[1].is_zero()) {
2440 inst->src[0].abs = false;
2441 inst->src[0].negate = false;
2442 inst->conditional_mod = BRW_CONDITIONAL_Z;
2443 progress = true;
2444 break;
2445 }
2446 break;
2447 case BRW_OPCODE_SEL:
2448 if (inst->src[0].equals(inst->src[1])) {
2449 inst->opcode = BRW_OPCODE_MOV;
2450 inst->src[1] = reg_undef;
2451 inst->predicate = BRW_PREDICATE_NONE;
2452 inst->predicate_inverse = false;
2453 progress = true;
2454 } else if (inst->saturate && inst->src[1].file == IMM) {
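/* With saturate, clamping against an immediate that is already outside
 * [0.0, 1.0] on the relevant side is redundant; e.g. sel.l.sat dst, x, 1.0f
 * is just mov.sat dst, x.
 */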
2455 switch (inst->conditional_mod) {
2456 case BRW_CONDITIONAL_LE:
2457 case BRW_CONDITIONAL_L:
2458 switch (inst->src[1].type) {
2459 case BRW_REGISTER_TYPE_F:
2460 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2461 inst->opcode = BRW_OPCODE_MOV;
2462 inst->src[1] = reg_undef;
2463 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2464 progress = true;
2465 }
2466 break;
2467 default:
2468 break;
2469 }
2470 break;
2471 case BRW_CONDITIONAL_GE:
2472 case BRW_CONDITIONAL_G:
2473 switch (inst->src[1].type) {
2474 case BRW_REGISTER_TYPE_F:
2475 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2476 inst->opcode = BRW_OPCODE_MOV;
2477 inst->src[1] = reg_undef;
2478 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2479 progress = true;
2480 }
2481 break;
2482 default:
2483 break;
2484 }
2485 default:
2486 break;
2487 }
2488 }
2489 break;
2490 case BRW_OPCODE_MAD:
2491 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2492 inst->opcode = BRW_OPCODE_MOV;
2493 inst->src[1] = reg_undef;
2494 inst->src[2] = reg_undef;
2495 progress = true;
2496 } else if (inst->src[0].is_zero()) {
2497 inst->opcode = BRW_OPCODE_MUL;
2498 inst->src[0] = inst->src[2];
2499 inst->src[2] = reg_undef;
2500 progress = true;
2501 } else if (inst->src[1].is_one()) {
2502 inst->opcode = BRW_OPCODE_ADD;
2503 inst->src[1] = inst->src[2];
2504 inst->src[2] = reg_undef;
2505 progress = true;
2506 } else if (inst->src[2].is_one()) {
2507 inst->opcode = BRW_OPCODE_ADD;
2508 inst->src[2] = reg_undef;
2509 progress = true;
2510 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2511 inst->opcode = BRW_OPCODE_ADD;
2512 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2513 inst->src[2] = reg_undef;
2514 progress = true;
2515 }
2516 break;
2517 case SHADER_OPCODE_RCP: {
2518 fs_inst *prev = (fs_inst *)inst->prev;
2519 if (prev->opcode == SHADER_OPCODE_SQRT) {
2520 if (inst->src[0].equals(prev->dst)) {
2521 inst->opcode = SHADER_OPCODE_RSQ;
2522 inst->src[0] = prev->src[0];
2523 progress = true;
2524 }
2525 }
2526 break;
2527 }
2528 default:
2529 break;
2530 }
2531
2532 /* Swap if src[0] is immediate. */
2533 if (progress && inst->is_commutative()) {
2534 if (inst->src[0].file == IMM) {
2535 fs_reg tmp = inst->src[1];
2536 inst->src[1] = inst->src[0];
2537 inst->src[0] = tmp;
2538 }
2539 }
2540 }
2541 return progress;
2542 }
2543
2544 /**
2545 * Optimize sample messages that have constant zero values for the trailing
2546 * texture coordinates. We can just reduce the message length for these
2547 * instructions instead of reserving a register for it. Trailing parameters
2548 * that aren't sent default to zero anyway. This will cause the dead code
2549 * eliminator to remove the MOV instruction that would otherwise be emitted to
2550 * set up the zero value.
2551 */
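/* A typical case this catches (an illustrative example): a textureLod()
 * call with a constant 0.0 LOD, whose trailing parameter register is all
 * zeros and can be dropped from the message by shrinking mlen.
 */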
2552 bool
2553 fs_visitor::opt_zero_samples()
2554 {
2555 /* Gen4 infers the texturing opcode based on the message length so we can't
2556 * change it.
2557 */
2558 if (devinfo->gen < 5)
2559 return false;
2560
2561 bool progress = false;
2562
2563 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2564 if (!inst->is_tex())
2565 continue;
2566
2567 fs_inst *load_payload = (fs_inst *) inst->prev;
2568
2569 if (load_payload->is_head_sentinel() ||
2570 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2571 continue;
2572
2573 /* We don't want to remove the message header. Removing all of the
2574 * parameters is avoided because it seems to cause a GPU hang, but I
2575 * can't find any documentation indicating that this is expected.
2576 */
2577 while (inst->mlen > inst->header_present + dispatch_width / 8 &&
2578 load_payload->src[(inst->mlen - inst->header_present) /
2579 (dispatch_width / 8) +
2580 inst->header_present - 1].is_zero()) {
2581 inst->mlen -= dispatch_width / 8;
2582 progress = true;
2583 }
2584 }
2585
2586 if (progress)
2587 invalidate_live_intervals();
2588
2589 return progress;
2590 }
2591
2592 /**
2593 * Optimize sample messages that are followed by the final RT write.
2594 *
2595 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2596 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2597 * final texturing results copied to the framebuffer write payload and modify
2598 * them to write to the framebuffer directly.
2599 */
2600 bool
2601 fs_visitor::opt_sampler_eot()
2602 {
2603 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2604
2605 if (stage != MESA_SHADER_FRAGMENT)
2606 return false;
2607
2608 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2609 return false;
2610
2611 /* FINISHME: It should be possible to implement this optimization when there
2612 * are multiple drawbuffers.
2613 */
2614 if (key->nr_color_regions != 1)
2615 return false;
2616
2617 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2618 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2619 assert(fb_write->eot);
2620 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2621
2622 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2623
2624 /* There wasn't one; nothing to do. */
2625 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2626 return false;
2627
2628 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2629 * It's very likely to be the previous instruction.
2630 */
2631 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2632 if (load_payload->is_head_sentinel() ||
2633 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2634 return false;
2635
2636 assert(!tex_inst->eot); /* We can't get here twice */
2637 assert((tex_inst->offset & (0xff << 24)) == 0);
2638
2639 tex_inst->offset |= fb_write->target << 24;
2640 tex_inst->eot = true;
2641 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2642
2643 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2644 * to create a new LOAD_PAYLOAD command with the same sources and a space
2645 * saved for the header. Using a new destination register not only makes sure
2646 * we have enough space, but it will make sure the dead code eliminator kills
2647 * the instruction that this will replace.
2648 */
2649 if (tex_inst->header_present)
2650 return true;
2651
2652 fs_reg send_header = vgrf(load_payload->sources + 1);
2653 fs_reg *new_sources =
2654 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2655
2656 new_sources[0] = fs_reg();
2657 for (int i = 0; i < load_payload->sources; i++)
2658 new_sources[i+1] = load_payload->src[i];
2659
2660 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2661 * requires a lot of information about the sources to figure out how many
2662 * registers need to be used. Given this stage in our
2663 * optimization, we may not have the appropriate GRFs required by
2664 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2665 * manually emit the instruction.
2666 */
2667 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2668 load_payload->exec_size,
2669 send_header,
2670 new_sources,
2671 load_payload->sources + 1);
2672
2673 new_load_payload->regs_written = load_payload->regs_written + 1;
2674 tex_inst->mlen++;
2675 tex_inst->header_present = true;
2676 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2677 tex_inst->src[0] = send_header;
2678 tex_inst->dst = reg_null_ud;
2679
2680 return true;
2681 }
2682
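/**
 * Rename complete (non-partial) re-definitions of a virtual GRF that occur
 * outside of control flow to fresh register numbers, rewriting later uses to
 * match, so unrelated uses of the same temporary get independent live ranges.
 */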
2683 bool
2684 fs_visitor::opt_register_renaming()
2685 {
2686 bool progress = false;
2687 int depth = 0;
2688
2689 int remap[alloc.count];
2690 memset(remap, -1, sizeof(int) * alloc.count);
2691
2692 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2693 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2694 depth++;
2695 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2696 inst->opcode == BRW_OPCODE_WHILE) {
2697 depth--;
2698 }
2699
2700 /* Rewrite instruction sources. */
2701 for (int i = 0; i < inst->sources; i++) {
2702 if (inst->src[i].file == GRF &&
2703 remap[inst->src[i].reg] != -1 &&
2704 remap[inst->src[i].reg] != inst->src[i].reg) {
2705 inst->src[i].reg = remap[inst->src[i].reg];
2706 progress = true;
2707 }
2708 }
2709
2710 const int dst = inst->dst.reg;
2711
2712 if (depth == 0 &&
2713 inst->dst.file == GRF &&
2714 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2715 !inst->is_partial_write()) {
2716 if (remap[dst] == -1) {
2717 remap[dst] = dst;
2718 } else {
2719 remap[dst] = alloc.allocate(inst->dst.width / 8);
2720 inst->dst.reg = remap[dst];
2721 progress = true;
2722 }
2723 } else if (inst->dst.file == GRF &&
2724 remap[dst] != -1 &&
2725 remap[dst] != dst) {
2726 inst->dst.reg = remap[dst];
2727 progress = true;
2728 }
2729 }
2730
2731 if (progress) {
2732 invalidate_live_intervals();
2733
2734 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2735 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2736 delta_xy[i].reg = remap[delta_xy[i].reg];
2737 }
2738 }
2739 }
2740
2741 return progress;
2742 }
2743
2744 /**
2745 * Remove redundant or useless discard jumps.
2746 *
2747 * For example, we can eliminate jumps in the following sequence:
2748 *
2749 * discard-jump (redundant with the next jump)
2750 * discard-jump (useless; jumps to the next instruction)
2751 * placeholder-halt
2752 */
2753 bool
2754 fs_visitor::opt_redundant_discard_jumps()
2755 {
2756 bool progress = false;
2757
2758 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2759
2760 fs_inst *placeholder_halt = NULL;
2761 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2762 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2763 placeholder_halt = inst;
2764 break;
2765 }
2766 }
2767
2768 if (!placeholder_halt)
2769 return false;
2770
2771 /* Delete any HALTs immediately before the placeholder halt. */
2772 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2773 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2774 prev = (fs_inst *) placeholder_halt->prev) {
2775 prev->remove(last_bblock);
2776 progress = true;
2777 }
2778
2779 if (progress)
2780 invalidate_live_intervals();
2781
2782 return progress;
2783 }
2784
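/**
 * Look for a MOV of a GRF into an MRF and try to make the instruction that
 * computed the GRF write the MRF directly instead, so the intermediate GRF
 * and the MOV can be eliminated. Only relevant before Gen7, which has no MRFs.
 */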
2785 bool
2786 fs_visitor::compute_to_mrf()
2787 {
2788 bool progress = false;
2789 int next_ip = 0;
2790
2791 /* No MRFs on Gen >= 7. */
2792 if (devinfo->gen >= 7)
2793 return false;
2794
2795 calculate_live_intervals();
2796
2797 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2798 int ip = next_ip;
2799 next_ip++;
2800
2801 if (inst->opcode != BRW_OPCODE_MOV ||
2802 inst->is_partial_write() ||
2803 inst->dst.file != MRF || inst->src[0].file != GRF ||
2804 inst->dst.type != inst->src[0].type ||
2805 inst->src[0].abs || inst->src[0].negate ||
2806 !inst->src[0].is_contiguous() ||
2807 inst->src[0].subreg_offset)
2808 continue;
2809
2810 /* Work out which hardware MRF registers are written by this
2811 * instruction.
2812 */
2813 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2814 int mrf_high;
2815 if (inst->dst.reg & BRW_MRF_COMPR4) {
2816 mrf_high = mrf_low + 4;
2817 } else if (inst->exec_size == 16) {
2818 mrf_high = mrf_low + 1;
2819 } else {
2820 mrf_high = mrf_low;
2821 }
2822
2823 /* Can't compute-to-MRF this GRF if someone else was going to
2824 * read it later.
2825 */
2826 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2827 continue;
2828
2829 /* Found a move of a GRF to an MRF. Let's see if we can
2830 * rewrite the instruction that generated this GRF to write into the MRF.
2831 */
2832 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2833 if (scan_inst->dst.file == GRF &&
2834 scan_inst->dst.reg == inst->src[0].reg) {
2835 /* Found the last instruction to write the register we want to turn
2836 * into a compute-to-MRF.
2837 */
2838
2839 /* If this one instruction didn't populate all the
2840 * channels, bail. We might be able to rewrite everything
2841 * that writes that reg, but it would require smarter
2842 * tracking to delay the rewriting until complete success.
2843 */
2844 if (scan_inst->is_partial_write())
2845 break;
2846
2847 /* Things returning more than one register would need us to
2848 * understand coalescing out more than one MOV at a time.
2849 */
2850 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2851 break;
2852
2853 /* SEND instructions can't have MRF as a destination. */
2854 if (scan_inst->mlen)
2855 break;
2856
2857 if (devinfo->gen == 6) {
2858 /* gen6 math instructions must have the destination be
2859 * GRF, so no compute-to-MRF for them.
2860 */
2861 if (scan_inst->is_math()) {
2862 break;
2863 }
2864 }
2865
2866 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2867 /* Found the creator of our MRF's source value. */
2868 scan_inst->dst.file = MRF;
2869 scan_inst->dst.reg = inst->dst.reg;
2870 scan_inst->saturate |= inst->saturate;
2871 inst->remove(block);
2872 progress = true;
2873 }
2874 break;
2875 }
2876
2877 /* We don't handle control flow here. Most computation of
2878 * values that end up in MRFs happens shortly before the MRF
2879 * write anyway.
2880 */
2881 if (block->start() == scan_inst)
2882 break;
2883
2884 /* You can't read from an MRF, so if someone else reads our
2885 * MRF's source GRF that we wanted to rewrite, that stops us.
2886 */
2887 bool interfered = false;
2888 for (int i = 0; i < scan_inst->sources; i++) {
2889 if (scan_inst->src[i].file == GRF &&
2890 scan_inst->src[i].reg == inst->src[0].reg &&
2891 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2892 interfered = true;
2893 }
2894 }
2895 if (interfered)
2896 break;
2897
2898 if (scan_inst->dst.file == MRF) {
2899 /* If somebody else writes our MRF here, we can't
2900 * compute-to-MRF before that.
2901 */
2902 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2903 int scan_mrf_high;
2904
2905 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2906 scan_mrf_high = scan_mrf_low + 4;
2907 } else if (scan_inst->exec_size == 16) {
2908 scan_mrf_high = scan_mrf_low + 1;
2909 } else {
2910 scan_mrf_high = scan_mrf_low;
2911 }
2912
2913 if (mrf_low == scan_mrf_low ||
2914 mrf_low == scan_mrf_high ||
2915 mrf_high == scan_mrf_low ||
2916 mrf_high == scan_mrf_high) {
2917 break;
2918 }
2919 }
2920
2921 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2922 /* Found a SEND instruction, which means that there are
2923 * live values in MRFs from base_mrf to base_mrf +
2924 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2925 * above it.
2926 */
2927 if (mrf_low >= scan_inst->base_mrf &&
2928 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2929 break;
2930 }
2931 if (mrf_high >= scan_inst->base_mrf &&
2932 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2933 break;
2934 }
2935 }
2936 }
2937 }
2938
2939 if (progress)
2940 invalidate_live_intervals();
2941
2942 return progress;
2943 }
2944
2945 /**
2946 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2947 * instructions to FS_OPCODE_REP_FB_WRITE.
2948 */
2949 void
2950 fs_visitor::emit_repclear_shader()
2951 {
2952 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2953 int base_mrf = 1;
2954 int color_mrf = base_mrf + 2;
2955
2956 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2957 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2958 mov->force_writemask_all = true;
2959
2960 fs_inst *write;
2961 if (key->nr_color_regions == 1) {
2962 write = emit(FS_OPCODE_REP_FB_WRITE);
2963 write->saturate = key->clamp_fragment_color;
2964 write->base_mrf = color_mrf;
2965 write->target = 0;
2966 write->header_present = false;
2967 write->mlen = 1;
2968 } else {
2969 assume(key->nr_color_regions > 0);
2970 for (int i = 0; i < key->nr_color_regions; ++i) {
2971 write = emit(FS_OPCODE_REP_FB_WRITE);
2972 write->saturate = key->clamp_fragment_color;
2973 write->base_mrf = base_mrf;
2974 write->target = i;
2975 write->header_present = true;
2976 write->mlen = 3;
2977 }
2978 }
2979 write->eot = true;
2980
2981 calculate_cfg();
2982
2983 assign_constant_locations();
2984 assign_curb_setup();
2985
2986 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2987 assert(mov->src[0].file == HW_REG);
2988 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2989 }
2990
2991 /**
2992 * Walks through basic blocks, looking for repeated MRF writes and
2993 * removing the later ones.
2994 */
2995 bool
2996 fs_visitor::remove_duplicate_mrf_writes()
2997 {
2998 fs_inst *last_mrf_move[16];
2999 bool progress = false;
3000
3001 /* Need to update the MRF tracking for compressed instructions. */
3002 if (dispatch_width == 16)
3003 return false;
3004
3005 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3006
3007 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3008 if (inst->is_control_flow()) {
3009 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3010 }
3011
3012 if (inst->opcode == BRW_OPCODE_MOV &&
3013 inst->dst.file == MRF) {
3014 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3015 if (prev_inst && inst->equals(prev_inst)) {
3016 inst->remove(block);
3017 progress = true;
3018 continue;
3019 }
3020 }
3021
3022 /* Clear out the last-write records for MRFs that were overwritten. */
3023 if (inst->dst.file == MRF) {
3024 last_mrf_move[inst->dst.reg] = NULL;
3025 }
3026
3027 if (inst->mlen > 0 && inst->base_mrf != -1) {
3028 /* Found a SEND instruction, which will include two or fewer
3029 * implied MRF writes. We could do better here.
3030 */
3031 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3032 last_mrf_move[inst->base_mrf + i] = NULL;
3033 }
3034 }
3035
3036 /* Clear out any MRF move records whose sources got overwritten. */
3037 if (inst->dst.file == GRF) {
3038 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3039 if (last_mrf_move[i] &&
3040 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3041 last_mrf_move[i] = NULL;
3042 }
3043 }
3044 }
3045
3046 if (inst->opcode == BRW_OPCODE_MOV &&
3047 inst->dst.file == MRF &&
3048 inst->src[0].file == GRF &&
3049 !inst->is_partial_write()) {
3050 last_mrf_move[inst->dst.reg] = inst;
3051 }
3052 }
3053
3054 if (progress)
3055 invalidate_live_intervals();
3056
3057 return progress;
3058 }
3059
3060 static void
3061 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3062 {
3063 /* Clear the flag for registers that actually got read (as expected). */
3064 for (int i = 0; i < inst->sources; i++) {
3065 int grf;
3066 if (inst->src[i].file == GRF) {
3067 grf = inst->src[i].reg;
3068 } else if (inst->src[i].file == HW_REG &&
3069 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3070 grf = inst->src[i].fixed_hw_reg.nr;
3071 } else {
3072 continue;
3073 }
3074
3075 if (grf >= first_grf &&
3076 grf < first_grf + grf_len) {
3077 deps[grf - first_grf] = false;
3078 if (inst->exec_size == 16)
3079 deps[grf - first_grf + 1] = false;
3080 }
3081 }
3082 }
3083
3084 /**
3085 * Implements this workaround for the original 965:
3086 *
3087 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3088 * check for post destination dependencies on this instruction, software
3089 * must ensure that there is no destination hazard for the case of ‘write
3090 * followed by a posted write’ shown in the following example.
3091 *
3092 * 1. mov r3 0
3093 * 2. send r3.xy <rest of send instruction>
3094 * 3. mov r2 r3
3095 *
3096 * Due to no post-destination dependency check on the ‘send’, the above
3097 * code sequence could have two instructions (1 and 2) in flight at the
3098 * same time that both consider ‘r3’ as the target of their final writes.
3099 */
3100 void
3101 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3102 fs_inst *inst)
3103 {
3104 int write_len = inst->regs_written;
3105 int first_write_grf = inst->dst.reg;
3106 bool needs_dep[BRW_MAX_MRF];
3107 assert(write_len < (int)sizeof(needs_dep) - 1);
3108
3109 memset(needs_dep, false, sizeof(needs_dep));
3110 memset(needs_dep, true, write_len);
3111
3112 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3113
3114 /* Walk backwards looking for writes to registers we're writing which
3115 * aren't read since being written. If we hit the start of the program,
3116 * we assume that there are no outstanding dependencies on entry to the
3117 * program.
3118 */
3119 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3120 /* If we hit control flow, assume that there *are* outstanding
3121 * dependencies, and force their cleanup before our instruction.
3122 */
3123 if (block->start() == scan_inst) {
3124 for (int i = 0; i < write_len; i++) {
3125 if (needs_dep[i]) {
3126 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3127 }
3128 }
3129 return;
3130 }
3131
3132 /* We insert our reads as late as possible on the assumption that any
3133 * instruction but a MOV that might have left us an outstanding
3134 * dependency has more latency than a MOV.
3135 */
3136 if (scan_inst->dst.file == GRF) {
3137 for (int i = 0; i < scan_inst->regs_written; i++) {
3138 int reg = scan_inst->dst.reg + i;
3139
3140 if (reg >= first_write_grf &&
3141 reg < first_write_grf + write_len &&
3142 needs_dep[reg - first_write_grf]) {
3143 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3144 needs_dep[reg - first_write_grf] = false;
3145 if (scan_inst->exec_size == 16)
3146 needs_dep[reg - first_write_grf + 1] = false;
3147 }
3148 }
3149 }
3150
3151 /* Clear the flag for registers that actually got read (as expected). */
3152 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3153
3154 /* Continue the loop only if we haven't resolved all the dependencies */
3155 int i;
3156 for (i = 0; i < write_len; i++) {
3157 if (needs_dep[i])
3158 break;
3159 }
3160 if (i == write_len)
3161 return;
3162 }
3163 }
3164
3165 /**
3166 * Implements this workaround for the original 965:
3167 *
3168 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3169 * used as a destination register until after it has been sourced by an
3170 * instruction with a different destination register.
3171 */
3172 void
3173 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3174 {
3175 int write_len = inst->regs_written;
3176 int first_write_grf = inst->dst.reg;
3177 bool needs_dep[BRW_MAX_MRF];
3178 assert(write_len < (int)sizeof(needs_dep) - 1);
3179
3180 memset(needs_dep, false, sizeof(needs_dep));
3181 memset(needs_dep, true, write_len);
3182 /* Walk forwards looking for writes to registers we're writing which aren't
3183 * read before being written.
3184 */
3185 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3186 /* If we hit control flow, force resolve all remaining dependencies. */
3187 if (block->end() == scan_inst) {
3188 for (int i = 0; i < write_len; i++) {
3189 if (needs_dep[i])
3190 scan_inst->insert_before(block,
3191 DEP_RESOLVE_MOV(first_write_grf + i));
3192 }
3193 return;
3194 }
3195
3196 /* Clear the flag for registers that actually got read (as expected). */
3197 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3198
3199 /* We insert our reads as late as possible since they're reading the
3200 * result of a SEND, which has massive latency.
3201 */
3202 if (scan_inst->dst.file == GRF &&
3203 scan_inst->dst.reg >= first_write_grf &&
3204 scan_inst->dst.reg < first_write_grf + write_len &&
3205 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3206 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3207 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3208 }
3209
3210 /* Continue the loop only if we haven't resolved all the dependencies */
3211 int i;
3212 for (i = 0; i < write_len; i++) {
3213 if (needs_dep[i])
3214 break;
3215 }
3216 if (i == write_len)
3217 return;
3218 }
3219 }
3220
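/**
 * Insert the Gen4 pre- and post-SEND dependency workarounds (see the two
 * helpers above) around every message-sending instruction that writes a GRF
 * destination.
 */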
3221 void
3222 fs_visitor::insert_gen4_send_dependency_workarounds()
3223 {
3224 if (devinfo->gen != 4 || devinfo->is_g4x)
3225 return;
3226
3227 bool progress = false;
3228
3229 /* Note that we're done with register allocation, so GRF fs_regs always
3230 * have a .reg_offset of 0.
3231 */
3232
3233 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3234 if (inst->mlen != 0 && inst->dst.file == GRF) {
3235 insert_gen4_pre_send_dependency_workarounds(block, inst);
3236 insert_gen4_post_send_dependency_workarounds(block, inst);
3237 progress = true;
3238 }
3239 }
3240
3241 if (progress)
3242 invalidate_live_intervals();
3243 }
3244
3245 /**
3246 * Turns the generic expression-style uniform pull constant load instruction
3247 * into a hardware-specific series of instructions for loading a pull
3248 * constant.
3249 *
3250 * The expression style allows the CSE pass before this to optimize out
3251 * repeated loads from the same offset, and gives the pre-register-allocation
3252 * scheduling full flexibility, while the conversion to native instructions
3253 * allows the post-register-allocation scheduler the best information
3254 * possible.
3255 *
3256 * Note that execution masking for setting up pull constant loads is special:
3257 * the channels that need to be written are unrelated to the current execution
3258 * mask, since a later instruction will use one of the result channels as a
3259 * source operand for all 8 or 16 of its channels.
3260 */
3261 void
3262 fs_visitor::lower_uniform_pull_constant_loads()
3263 {
3264 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3265 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3266 continue;
3267
3268 if (devinfo->gen >= 7) {
3269 /* The offset arg before was a vec4-aligned byte offset. We need to
3270 * turn it into a dword offset.
3271 */
3272 fs_reg const_offset_reg = inst->src[1];
3273 assert(const_offset_reg.file == IMM &&
3274 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3275 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3276 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3277
3278 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3279 * Reserve space for the register.
3280 */
3281 if (devinfo->gen >= 9) {
3282 payload.reg_offset++;
3283 alloc.sizes[payload.reg] = 2;
3284 }
3285
3286 /* This is actually going to be a MOV, but since only the first dword
3287 * is accessed, we have a special opcode to do just that one. Note
3288 * that this needs to be an operation that will be considered a def
3289 * by live variable analysis, or register allocation will explode.
3290 */
3291 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3292 8, payload, const_offset_reg);
3293 setup->force_writemask_all = true;
3294
3295 setup->ir = inst->ir;
3296 setup->annotation = inst->annotation;
3297 inst->insert_before(block, setup);
3298
3299 /* Similarly, this will only populate the first 4 channels of the
3300 * result register (since we only use smear values from 0-3), but we
3301 * don't tell the optimizer.
3302 */
3303 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3304 inst->src[1] = payload;
3305
3306 invalidate_live_intervals();
3307 } else {
3308 /* Before register allocation, we didn't tell the scheduler about the
3309 * MRF we use. We know it's safe to use this MRF because nothing
3310 * else does except for register spill/unspill, which generates and
3311 * uses its MRF within a single IR instruction.
3312 */
3313 inst->base_mrf = 14;
3314 inst->mlen = 1;
3315 }
3316 }
3317 }
3318
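/**
 * Lower SHADER_OPCODE_LOAD_PAYLOAD into the individual MOVs that actually
 * assemble the payload, using a single COMPR4 MOV for suitable pairs of MRF
 * destinations and propagating writemask/sechalf metadata from the sources
 * so the copies execute on the correct channels.
 */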
3319 bool
3320 fs_visitor::lower_load_payload()
3321 {
3322 bool progress = false;
3323
3324 int vgrf_to_reg[alloc.count];
3325 int reg_count = 0;
3326 for (unsigned i = 0; i < alloc.count; ++i) {
3327 vgrf_to_reg[i] = reg_count;
3328 reg_count += alloc.sizes[i];
3329 }
3330
3331 struct {
3332 bool written:1; /* Whether this register has ever been written */
3333 bool force_writemask_all:1;
3334 bool force_sechalf:1;
3335 } metadata[reg_count];
3336 memset(metadata, 0, sizeof(metadata));
3337
3338 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3339 if (inst->dst.file == GRF) {
3340 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3341 bool force_sechalf = inst->force_sechalf &&
3342 !inst->force_writemask_all;
3343 bool toggle_sechalf = inst->dst.width == 16 &&
3344 type_sz(inst->dst.type) == 4 &&
3345 !inst->force_writemask_all;
3346 for (int i = 0; i < inst->regs_written; ++i) {
3347 metadata[dst_reg + i].written = true;
3348 metadata[dst_reg + i].force_sechalf = force_sechalf;
3349 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3350 force_sechalf = (toggle_sechalf != force_sechalf);
3351 }
3352 }
3353
3354 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3355 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3356 fs_reg dst = inst->dst;
3357
3358 for (int i = 0; i < inst->sources; i++) {
3359 dst.width = inst->src[i].effective_width;
3360 dst.type = inst->src[i].type;
3361
3362 if (inst->src[i].file == BAD_FILE) {
3363 /* Emit nothing for this source, but still advance dst as usual. */
3364 } else if (dst.file == MRF &&
3365 dst.width == 8 &&
3366 devinfo->has_compr4 &&
3367 i + 4 < inst->sources &&
3368 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3369 fs_reg compr4_dst = dst;
3370 compr4_dst.reg += BRW_MRF_COMPR4;
3371 compr4_dst.width = 16;
3372 fs_reg compr4_src = inst->src[i];
3373 compr4_src.width = 16;
3374 fs_inst *mov = MOV(compr4_dst, compr4_src);
3375 mov->force_writemask_all = true;
3376 inst->insert_before(block, mov);
3377 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3378 inst->src[i + 4].file = BAD_FILE;
3379 } else {
3380 fs_inst *mov = MOV(dst, inst->src[i]);
3381 if (inst->src[i].file == GRF) {
3382 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3383 inst->src[i].reg_offset;
3384 mov->force_sechalf = metadata[src_reg].force_sechalf;
3385 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3386 } else {
3387 /* We don't have any useful metadata for immediates or
3388 * uniforms. Assume that any of the channels of the
3389 * destination may be used.
3390 */
3391 assert(inst->src[i].file == IMM ||
3392 inst->src[i].file == UNIFORM);
3393 mov->force_writemask_all = true;
3394 }
3395
3396 if (dst.file == GRF) {
3397 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3398 const bool force_writemask = mov->force_writemask_all;
3399 metadata[dst_reg].force_writemask_all = force_writemask;
3400 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3401 if (dst.width * type_sz(dst.type) > 32) {
3402 assert(!mov->force_sechalf);
3403 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3404 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3405 }
3406 }
3407
3408 inst->insert_before(block, mov);
3409 }
3410
3411 dst = offset(dst, 1);
3412 }
3413
3414 inst->remove(block);
3415 progress = true;
3416 }
3417 }
3418
3419 if (progress)
3420 invalidate_live_intervals();
3421
3422 return progress;
3423 }
3424
3425 void
3426 fs_visitor::dump_instructions()
3427 {
3428 dump_instructions(NULL);
3429 }
3430
3431 void
3432 fs_visitor::dump_instructions(const char *name)
3433 {
3434 FILE *file = stderr;
3435 if (name && geteuid() != 0) {
3436 file = fopen(name, "w");
3437 if (!file)
3438 file = stderr;
3439 }
3440
3441 if (cfg) {
3442 calculate_register_pressure();
3443 int ip = 0, max_pressure = 0;
3444 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3445 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3446 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3447 dump_instruction(inst, file);
3448 ip++;
3449 }
3450 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3451 } else {
3452 int ip = 0;
3453 foreach_in_list(backend_instruction, inst, &instructions) {
3454 fprintf(file, "%4d: ", ip++);
3455 dump_instruction(inst, file);
3456 }
3457 }
3458
3459 if (file != stderr) {
3460 fclose(file);
3461 }
3462 }
3463
3464 void
3465 fs_visitor::dump_instruction(backend_instruction *be_inst)
3466 {
3467 dump_instruction(be_inst, stderr);
3468 }
3469
3470 void
3471 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3472 {
3473 fs_inst *inst = (fs_inst *)be_inst;
3474
3475 if (inst->predicate) {
3476 fprintf(file, "(%cf0.%d) ",
3477 inst->predicate_inverse ? '-' : '+',
3478 inst->flag_subreg);
3479 }
3480
3481 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3482 if (inst->saturate)
3483 fprintf(file, ".sat");
3484 if (inst->conditional_mod) {
3485 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3486 if (!inst->predicate &&
3487 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3488 inst->opcode != BRW_OPCODE_IF &&
3489 inst->opcode != BRW_OPCODE_WHILE))) {
3490 fprintf(file, ".f0.%d", inst->flag_subreg);
3491 }
3492 }
3493 fprintf(file, "(%d) ", inst->exec_size);
3494
3495
3496 switch (inst->dst.file) {
3497 case GRF:
3498 fprintf(file, "vgrf%d", inst->dst.reg);
3499 if (inst->dst.width != dispatch_width)
3500 fprintf(file, "@%d", inst->dst.width);
3501 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3502 inst->dst.subreg_offset)
3503 fprintf(file, "+%d.%d",
3504 inst->dst.reg_offset, inst->dst.subreg_offset);
3505 break;
3506 case MRF:
3507 fprintf(file, "m%d", inst->dst.reg);
3508 break;
3509 case BAD_FILE:
3510 fprintf(file, "(null)");
3511 break;
3512 case UNIFORM:
3513 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3514 break;
3515 case ATTR:
3516 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3517 break;
3518 case HW_REG:
3519 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3520 switch (inst->dst.fixed_hw_reg.nr) {
3521 case BRW_ARF_NULL:
3522 fprintf(file, "null");
3523 break;
3524 case BRW_ARF_ADDRESS:
3525 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3526 break;
3527 case BRW_ARF_ACCUMULATOR:
3528 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3529 break;
3530 case BRW_ARF_FLAG:
3531 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3532 inst->dst.fixed_hw_reg.subnr);
3533 break;
3534 default:
3535 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3536 inst->dst.fixed_hw_reg.subnr);
3537 break;
3538 }
3539 } else {
3540 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3541 }
3542 if (inst->dst.fixed_hw_reg.subnr)
3543 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3544 break;
3545 default:
3546 fprintf(file, "???");
3547 break;
3548 }
3549 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3550
3551 for (int i = 0; i < inst->sources; i++) {
3552 if (inst->src[i].negate)
3553 fprintf(file, "-");
3554 if (inst->src[i].abs)
3555 fprintf(file, "|");
3556 switch (inst->src[i].file) {
3557 case GRF:
3558 fprintf(file, "vgrf%d", inst->src[i].reg);
3559 if (inst->src[i].width != dispatch_width)
3560 fprintf(file, "@%d", inst->src[i].width);
3561 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3562 inst->src[i].subreg_offset)
3563 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3564 inst->src[i].subreg_offset);
3565 break;
3566 case MRF:
3567 fprintf(file, "***m%d***", inst->src[i].reg);
3568 break;
3569 case ATTR:
3570 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3571 break;
3572 case UNIFORM:
3573 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3574 if (inst->src[i].reladdr) {
3575 fprintf(file, "+reladdr");
3576 } else if (inst->src[i].subreg_offset) {
3577 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3578 inst->src[i].subreg_offset);
3579 }
3580 break;
3581 case BAD_FILE:
3582 fprintf(file, "(null)");
3583 break;
3584 case IMM:
3585 switch (inst->src[i].type) {
3586 case BRW_REGISTER_TYPE_F:
3587 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3588 break;
3589 case BRW_REGISTER_TYPE_W:
3590 case BRW_REGISTER_TYPE_D:
3591 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3592 break;
3593 case BRW_REGISTER_TYPE_UW:
3594 case BRW_REGISTER_TYPE_UD:
3595 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3596 break;
3597 case BRW_REGISTER_TYPE_VF:
3598 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3599 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3600 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3601 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3602 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3603 break;
3604 default:
3605 fprintf(file, "???");
3606 break;
3607 }
3608 break;
3609 case HW_REG:
3610 if (inst->src[i].fixed_hw_reg.negate)
3611 fprintf(file, "-");
3612 if (inst->src[i].fixed_hw_reg.abs)
3613 fprintf(file, "|");
3614 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3615 switch (inst->src[i].fixed_hw_reg.nr) {
3616 case BRW_ARF_NULL:
3617 fprintf(file, "null");
3618 break;
3619 case BRW_ARF_ADDRESS:
3620 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3621 break;
3622 case BRW_ARF_ACCUMULATOR:
3623 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3624 break;
3625 case BRW_ARF_FLAG:
3626 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3627 inst->src[i].fixed_hw_reg.subnr);
3628 break;
3629 default:
3630 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3631 inst->src[i].fixed_hw_reg.subnr);
3632 break;
3633 }
3634 } else {
3635 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3636 }
3637 if (inst->src[i].fixed_hw_reg.subnr)
3638 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3639 if (inst->src[i].fixed_hw_reg.abs)
3640 fprintf(file, "|");
3641 break;
3642 default:
3643 fprintf(file, "???");
3644 break;
3645 }
3646 if (inst->src[i].abs)
3647 fprintf(file, "|");
3648
3649 if (inst->src[i].file != IMM) {
3650 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3651 }
3652
3653 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3654 fprintf(file, ", ");
3655 }
3656
3657 fprintf(file, " ");
3658
3659 if (dispatch_width == 16 && inst->exec_size == 8) {
3660 if (inst->force_sechalf)
3661 fprintf(file, "2ndhalf ");
3662 else
3663 fprintf(file, "1sthalf ");
3664 }
3665
3666 fprintf(file, "\n");
3667 }
3668
3669 /**
3670 * Possibly returns an instruction that set up @param reg.
3671 *
3672 * Sometimes we want to take the result of some expression/variable
3673 * dereference tree and rewrite the instruction generating the result
3674 * of the tree. When processing the tree, we know that the
3675 * instructions generated are all writing temporaries that are dead
3676 * outside of this tree. So, if we have some instructions that write
3677 * a temporary, we're free to point that temp write somewhere else.
3678 *
3679  * Note that this doesn't guarantee that the returned instruction wrote only
3680  * @reg -- it might be the size=4 destination of a texture instruction.
3681 */
3682 fs_inst *
3683 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3684 fs_inst *end,
3685 const fs_reg &reg)
3686 {
3687 if (end == start ||
3688 end->is_partial_write() ||
3689 reg.reladdr ||
3690 !reg.equals(end->dst)) {
3691 return NULL;
3692 } else {
3693 return end;
3694 }
3695 }
3696
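/**
 * Set up the gen6+ fragment shader thread payload layout: the fixed header
 * registers, followed by the barycentric coordinate, source depth/W,
 * position-offset and coverage-mask registers that the hardware delivers
 * according to the enabled WM state bits.
 */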
3697 void
3698 fs_visitor::setup_payload_gen6()
3699 {
3700 bool uses_depth =
3701 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3702 unsigned barycentric_interp_modes =
3703 (stage == MESA_SHADER_FRAGMENT) ?
3704 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3705
3706 assert(devinfo->gen >= 6);
3707
3708 /* R0-1: masks, pixel X/Y coordinates. */
3709 payload.num_regs = 2;
3710    /* R2: only for 32-pixel dispatch. */
3711
3712 /* R3-26: barycentric interpolation coordinates. These appear in the
3713 * same order that they appear in the brw_wm_barycentric_interp_mode
3714 * enum. Each set of coordinates occupies 2 registers if dispatch width
3715 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3716 * appear if they were enabled using the "Barycentric Interpolation
3717 * Mode" bits in WM_STATE.
3718 */
3719 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3720 if (barycentric_interp_modes & (1 << i)) {
3721 payload.barycentric_coord_reg[i] = payload.num_regs;
3722 payload.num_regs += 2;
3723 if (dispatch_width == 16) {
3724 payload.num_regs += 2;
3725 }
3726 }
3727 }
3728
3729 /* R27: interpolated depth if uses source depth */
3730 if (uses_depth) {
3731 payload.source_depth_reg = payload.num_regs;
3732 payload.num_regs++;
3733 if (dispatch_width == 16) {
3734 /* R28: interpolated depth if not SIMD8. */
3735 payload.num_regs++;
3736 }
3737 }
3738 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3739 if (uses_depth) {
3740 payload.source_w_reg = payload.num_regs;
3741 payload.num_regs++;
3742 if (dispatch_width == 16) {
3743 /* R30: interpolated W if not SIMD8. */
3744 payload.num_regs++;
3745 }
3746 }
3747
3748 if (stage == MESA_SHADER_FRAGMENT) {
3749 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3750 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3751 prog_data->uses_pos_offset = key->compute_pos_offset;
3752 /* R31: MSAA position offsets. */
3753 if (prog_data->uses_pos_offset) {
3754 payload.sample_pos_reg = payload.num_regs;
3755 payload.num_regs++;
3756 }
3757 }
3758
3759 /* R32: MSAA input coverage mask */
3760 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3761 assert(devinfo->gen >= 7);
3762 payload.sample_mask_in_reg = payload.num_regs;
3763 payload.num_regs++;
3764 if (dispatch_width == 16) {
3765 /* R33: input coverage mask if not SIMD8. */
3766 payload.num_regs++;
3767 }
3768 }
3769
3770 /* R34-: bary for 32-pixel. */
3771 /* R58-59: interp W for 32-pixel. */
3772
3773 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3774 source_depth_to_render_target = true;
3775 }
3776 }
3777
3778 void
3779 fs_visitor::setup_vs_payload()
3780 {
3781 /* R0: thread header, R1: urb handles */
3782 payload.num_regs = 2;
3783 }
3784
3785 void
3786 fs_visitor::setup_cs_payload()
3787 {
3788 assert(brw->gen >= 7);
3789
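   /* R0: thread payload header. */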
3790 payload.num_regs = 1;
3791 }
3792
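/**
 * Assign binding table offsets for the fragment shader: render targets come
 * first, followed by the common (texture, UBO, etc.) surface entries.
 */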
3793 void
3794 fs_visitor::assign_binding_table_offsets()
3795 {
3796 assert(stage == MESA_SHADER_FRAGMENT);
3797 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3798 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3799 uint32_t next_binding_table_offset = 0;
3800
3801 /* If there are no color regions, we still perform an FB write to a null
3802 * renderbuffer, which we place at surface index 0.
3803 */
3804 prog_data->binding_table.render_target_start = next_binding_table_offset;
3805 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3806
3807 assign_common_binding_table_offsets(next_binding_table_offset);
3808 }
3809
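/**
 * Estimate register pressure by summing, at each instruction IP, the sizes
 * of all virtual GRFs whose live ranges cover that IP.
 */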
3810 void
3811 fs_visitor::calculate_register_pressure()
3812 {
3813 invalidate_live_intervals();
3814 calculate_live_intervals();
3815
3816 unsigned num_instructions = 0;
3817 foreach_block(block, cfg)
3818 num_instructions += block->instructions.length();
3819
3820 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3821
3822 for (unsigned reg = 0; reg < alloc.count; reg++) {
3823 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3824 regs_live_at_ip[ip] += alloc.sizes[reg];
3825 }
3826 }
3827
3828 void
3829 fs_visitor::optimize()
3830 {
3831 split_virtual_grfs();
3832
3833 move_uniform_array_access_to_pull_constants();
3834 assign_constant_locations();
3835 demote_pull_constants();
3836
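   /* OPT() runs a single pass, accumulates whether it made progress, and,
    * when INTEL_DEBUG=optimizer is set, dumps the instruction list to a file
    * named after the stage, dispatch width, program, iteration and pass.
    */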
3837 #define OPT(pass, args...) ({ \
3838 pass_num++; \
3839 bool this_progress = pass(args); \
3840 \
3841 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3842 char filename[64]; \
3843 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3844 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3845 \
3846 backend_visitor::dump_instructions(filename); \
3847 } \
3848 \
3849 progress = progress || this_progress; \
3850 this_progress; \
3851 })
3852
3853 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3854 char filename[64];
3855 snprintf(filename, 64, "%s%d-%04d-00-start",
3856 stage_abbrev, dispatch_width,
3857 shader_prog ? shader_prog->Name : 0);
3858
3859 backend_visitor::dump_instructions(filename);
3860 }
3861
3862 bool progress;
3863 int iteration = 0;
3864 int pass_num = 0;
3865 do {
3866 progress = false;
3867 pass_num = 0;
3868 iteration++;
3869
3870 OPT(remove_duplicate_mrf_writes);
3871
3872 OPT(opt_algebraic);
3873 OPT(opt_cse);
3874 OPT(opt_copy_propagate);
3875 OPT(opt_peephole_predicated_break);
3876 OPT(opt_cmod_propagation);
3877 OPT(dead_code_eliminate);
3878 OPT(opt_peephole_sel);
3879 OPT(dead_control_flow_eliminate, this);
3880 OPT(opt_register_renaming);
3881 OPT(opt_redundant_discard_jumps);
3882 OPT(opt_saturate_propagation);
3883 OPT(opt_zero_samples);
3884 OPT(register_coalesce);
3885 OPT(compute_to_mrf);
3886
3887 OPT(compact_virtual_grfs);
3888 } while (progress);
3889
3890 pass_num = 0;
3891
3892 OPT(opt_sampler_eot);
3893
3894 if (OPT(lower_load_payload)) {
3895 split_virtual_grfs();
3896 OPT(register_coalesce);
3897 OPT(compute_to_mrf);
3898 OPT(dead_code_eliminate);
3899 }
3900
3901 OPT(opt_combine_constants);
3902
3903 lower_uniform_pull_constant_loads();
3904 }
3905
3906 /**
3907  * Three-source instructions must have a GRF/MRF destination register.
3908 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3909 */
3910 void
3911 fs_visitor::fixup_3src_null_dest()
3912 {
3913 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3914 if (inst->is_3src() && inst->dst.is_null()) {
3915 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3916 inst->dst.type);
3917 }
3918 }
3919 }
3920
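/**
 * Schedule and register allocate, retrying with progressively more
 * allocation-friendly pre-RA scheduling modes; if none succeeds, fail in
 * SIMD16 or fall back to spilling in SIMD8.
 */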
3921 void
3922 fs_visitor::allocate_registers()
3923 {
3924 bool allocated_without_spills;
3925
3926 static const enum instruction_scheduler_mode pre_modes[] = {
3927 SCHEDULE_PRE,
3928 SCHEDULE_PRE_NON_LIFO,
3929 SCHEDULE_PRE_LIFO,
3930 };
3931
3932 /* Try each scheduling heuristic to see if it can successfully register
3933 * allocate without spilling. They should be ordered by decreasing
3934 * performance but increasing likelihood of allocating.
3935 */
3936 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3937 schedule_instructions(pre_modes[i]);
3938
3939 if (0) {
3940 assign_regs_trivial();
3941 allocated_without_spills = true;
3942 } else {
3943 allocated_without_spills = assign_regs(false);
3944 }
3945 if (allocated_without_spills)
3946 break;
3947 }
3948
3949 if (!allocated_without_spills) {
3950 /* We assume that any spilling is worse than just dropping back to
3951 * SIMD8. There's probably actually some intermediate point where
3952 * SIMD16 with a couple of spills is still better.
3953 */
3954 if (dispatch_width == 16) {
3955 fail("Failure to register allocate. Reduce number of "
3956 "live scalar values to avoid this.");
3957 } else {
3958 perf_debug("%s shader triggered register spilling. "
3959 "Try reducing the number of live scalar values to "
3960 "improve performance.\n", stage_name);
3961 }
3962
3963 /* Since we're out of heuristics, just go spill registers until we
3964 * get an allocation.
3965 */
3966 while (!assign_regs(true)) {
3967 if (failed)
3968 break;
3969 }
3970 }
3971
3972 /* This must come after all optimization and register allocation, since
3973 * it inserts dead code that happens to have side effects, and it does
3974 * so based on the actual physical registers in use.
3975 */
3976 insert_gen4_send_dependency_workarounds();
3977
3978 if (failed)
3979 return;
3980
3981 if (!allocated_without_spills)
3982 schedule_instructions(SCHEDULE_POST);
3983
3984 if (last_scratch > 0)
3985 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3986 }
3987
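/**
 * Compile a vertex shader through the scalar backend: emit IR (NIR or GLSL
 * IR), write the URB outputs, optimize, and register allocate.
 */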
3988 bool
3989 fs_visitor::run_vs()
3990 {
3991 assert(stage == MESA_SHADER_VERTEX);
3992
3993 assign_common_binding_table_offsets(0);
3994 setup_vs_payload();
3995
3996 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3997 emit_shader_time_begin();
3998
3999 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
4000 emit_nir_code();
4001 } else {
4002 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4003 base_ir = ir;
4004 this->result = reg_undef;
4005 ir->accept(this);
4006 }
4007 base_ir = NULL;
4008 }
4009
4010 if (failed)
4011 return false;
4012
4013 emit_urb_writes();
4014
4015 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4016 emit_shader_time_end();
4017
4018 calculate_cfg();
4019
4020 optimize();
4021
4022 assign_curb_setup();
4023 assign_vs_urb_setup();
4024
4025 fixup_3src_null_dest();
4026 allocate_registers();
4027
4028 return !failed;
4029 }
4030
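/**
 * Compile the fragment shader at the current dispatch width: set up the
 * thread payload, emit IR (or the repclear/dummy shader), optimize, and
 * register allocate.
 */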
4031 bool
4032 fs_visitor::run_fs()
4033 {
4034 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4035 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4036
4037 assert(stage == MESA_SHADER_FRAGMENT);
4038
4039 sanity_param_count = prog->Parameters->NumParameters;
4040
4041 assign_binding_table_offsets();
4042
4043 if (devinfo->gen >= 6)
4044 setup_payload_gen6();
4045 else
4046 setup_payload_gen4();
4047
4048 if (0) {
4049 emit_dummy_fs();
4050 } else if (brw->use_rep_send && dispatch_width == 16) {
4051 emit_repclear_shader();
4052 } else {
4053 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4054 emit_shader_time_begin();
4055
4056 calculate_urb_setup();
4057 if (prog->InputsRead > 0) {
4058 if (devinfo->gen < 6)
4059 emit_interpolation_setup_gen4();
4060 else
4061 emit_interpolation_setup_gen6();
4062 }
4063
4064 /* We handle discards by keeping track of the still-live pixels in f0.1.
4065 * Initialize it with the dispatched pixels.
4066 */
4067 if (wm_prog_data->uses_kill) {
4068 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4069 discard_init->flag_subreg = 1;
4070 }
4071
4072       /* Generate FS IR for main(). (The visitor only descends into
4073        * functions called "main".)
4074 */
4075 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
4076 emit_nir_code();
4077 } else if (shader) {
4078 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4079 base_ir = ir;
4080 this->result = reg_undef;
4081 ir->accept(this);
4082 }
4083 } else {
4084 emit_fragment_program_code();
4085 }
4086 base_ir = NULL;
4087 if (failed)
4088 return false;
4089
4090 if (wm_prog_data->uses_kill)
4091 emit(FS_OPCODE_PLACEHOLDER_HALT);
4092
4093 if (wm_key->alpha_test_func)
4094 emit_alpha_test();
4095
4096 emit_fb_writes();
4097
4098 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4099 emit_shader_time_end();
4100
4101 calculate_cfg();
4102
4103 optimize();
4104
4105 assign_curb_setup();
4106 assign_urb_setup();
4107
4108 fixup_3src_null_dest();
4109 allocate_registers();
4110
4111 if (failed)
4112 return false;
4113 }
4114
4115 if (dispatch_width == 8)
4116 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4117 else
4118 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4119
4120 /* If any state parameters were appended, then ParameterValues could have
4121 * been realloced, in which case the driver uniform storage set up by
4122 * _mesa_associate_uniform_storage() would point to freed memory. Make
4123 * sure that didn't happen.
4124 */
4125 assert(sanity_param_count == prog->Parameters->NumParameters);
4126
4127 return !failed;
4128 }
4129
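/**
 * Compile a compute shader.  The CS path is NIR-only, so emit_nir_code() is
 * used unconditionally.
 */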
4130 bool
4131 fs_visitor::run_cs()
4132 {
4133 assert(stage == MESA_SHADER_COMPUTE);
4134 assert(shader);
4135
4136 sanity_param_count = prog->Parameters->NumParameters;
4137
4138 assign_common_binding_table_offsets(0);
4139
4140 setup_cs_payload();
4141
4142 emit_nir_code();
4143
4144 if (failed)
4145 return false;
4146
4147 emit_cs_terminate();
4148
4149 calculate_cfg();
4150
4151 optimize();
4152
4153 assign_curb_setup();
4154
4155 fixup_3src_null_dest();
4156 allocate_registers();
4157
4158 if (failed)
4159 return false;
4160
4161 /* If any state parameters were appended, then ParameterValues could have
4162 * been realloced, in which case the driver uniform storage set up by
4163 * _mesa_associate_uniform_storage() would point to freed memory. Make
4164 * sure that didn't happen.
4165 */
4166 assert(sanity_param_count == prog->Parameters->NumParameters);
4167
4168 return !failed;
4169 }
4170
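/**
 * Compile a fragment shader: always build the SIMD8 program, additionally
 * try SIMD16 unless it is unsupported or disabled, and hand the resulting
 * CFGs to the generator.
 */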
4171 const unsigned *
4172 brw_wm_fs_emit(struct brw_context *brw,
4173 void *mem_ctx,
4174 const struct brw_wm_prog_key *key,
4175 struct brw_wm_prog_data *prog_data,
4176 struct gl_fragment_program *fp,
4177 struct gl_shader_program *prog,
4178 unsigned *final_assembly_size)
4179 {
4180 bool start_busy = false;
4181 double start_time = 0;
4182
4183 if (unlikely(brw->perf_debug)) {
4184 start_busy = (brw->batch.last_bo &&
4185 drm_intel_bo_busy(brw->batch.last_bo));
4186 start_time = get_time();
4187 }
4188
4189 struct brw_shader *shader = NULL;
4190 if (prog)
4191 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4192
4193 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4194 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4195
4196 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4197 */
4198 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4199 if (!v.run_fs()) {
4200 if (prog) {
4201 prog->LinkStatus = false;
4202 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4203 }
4204
4205 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4206 v.fail_msg);
4207
4208 return NULL;
4209 }
4210
4211 cfg_t *simd16_cfg = NULL;
4212 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4213 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4214 if (!v.simd16_unsupported) {
4215 /* Try a SIMD16 compile */
4216 v2.import_uniforms(&v);
4217 if (!v2.run_fs()) {
4218 perf_debug("SIMD16 shader failed to compile, falling back to "
4219 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4220 } else {
4221 simd16_cfg = v2.cfg;
4222 }
4223 } else {
4224 perf_debug("SIMD16 shader unsupported, falling back to "
4225 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4226 }
4227 }
4228
4229 cfg_t *simd8_cfg;
4230 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4231 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4232 simd8_cfg = NULL;
4233 prog_data->no_8 = true;
4234 } else {
4235 simd8_cfg = v.cfg;
4236 prog_data->no_8 = false;
4237 }
4238
4239 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4240 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4241
4242 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4243 char *name;
4244 if (prog)
4245 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4246 prog->Label ? prog->Label : "unnamed",
4247 prog->Name);
4248 else
4249 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4250
4251 g.enable_debug(name);
4252 }
4253
4254 if (simd8_cfg)
4255 g.generate_code(simd8_cfg, 8);
4256 if (simd16_cfg)
4257 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4258
4259 if (unlikely(brw->perf_debug) && shader) {
4260 if (shader->compiled_once)
4261 brw_wm_debug_recompile(brw, prog, key);
4262 shader->compiled_once = true;
4263
4264 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4265 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4266 (get_time() - start_time) * 1000);
4267 }
4268 }
4269
4270 return g.get_assembly(final_assembly_size);
4271 }
4272
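/**
 * Precompile the fragment shader at link time using a guessed program key,
 * then restore the previous WM program state so the precompile leaves no
 * lasting side effects on the context.
 */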
4273 extern "C" bool
4274 brw_fs_precompile(struct gl_context *ctx,
4275 struct gl_shader_program *shader_prog,
4276 struct gl_program *prog)
4277 {
4278 struct brw_context *brw = brw_context(ctx);
4279 struct brw_wm_prog_key key;
4280
4281 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4282 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4283 bool program_uses_dfdy = fp->UsesDFdy;
4284
4285 memset(&key, 0, sizeof(key));
4286
4287 if (brw->gen < 6) {
4288 if (fp->UsesKill)
4289 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4290
4291 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4292 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4293
4294 /* Just assume depth testing. */
4295 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4296 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4297 }
4298
4299 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4300 BRW_FS_VARYING_INPUT_MASK) > 16)
4301 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4302
4303 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4304
4305 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4306 key.drawable_height = ctx->DrawBuffer->Height;
4307 }
4308
4309 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4310 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4311 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4312
4313 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4314 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4315 key.nr_color_regions > 1;
4316 }
4317
4318 key.program_string_id = bfp->id;
4319
4320 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4321 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4322
4323 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4324
4325 brw->wm.base.prog_offset = old_prog_offset;
4326 brw->wm.prog_data = old_prog_data;
4327
4328 return success;
4329 }
4330
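/**
 * Fill in likely sampler key settings for a precompile: on hardware without
 * shader channel select, shadow samplers get the legacy DEPTH_TEXTURE_MODE
 * (X, X, X, 1) swizzle; all other samplers assume an identity swizzle.
 */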
4331 void
4332 brw_setup_tex_for_precompile(struct brw_context *brw,
4333 struct brw_sampler_prog_key_data *tex,
4334 struct gl_program *prog)
4335 {
4336 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4337 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4338 for (unsigned i = 0; i < sampler_count; i++) {
4339 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4340 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4341 tex->swizzles[i] =
4342 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4343 } else {
4344 /* Color sampler: assume no swizzling. */
4345 tex->swizzles[i] = SWIZZLE_XYZW;
4346 }
4347 }
4348 }