i965/fs: Add CS shader time support
src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
138
139 this->writes_accumulator = false;
140 }
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(devinfo->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
356
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
359 {
360 uint8_t exec_size = dst.width;
361 for (int i = 0; i < sources; ++i) {
362 assert(src[i].width % dst.width == 0);
363 if (src[i].width > exec_size)
364 exec_size = src[i].width;
365 }
366
367 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
368 dst, src, sources);
369 inst->regs_written = 0;
370 for (int i = 0; i < sources; ++i) {
371 /* The LOAD_PAYLOAD instruction only really makes sense if we are
372 * dealing with whole registers. If this ever changes, we can deal
373 * with it later.
374 */
375 int size = inst->src[i].effective_width * type_sz(src[i].type);
376 assert(size % 32 == 0);
377 inst->regs_written += (size + 31) / 32;
378 }
379
380 return inst;
381 }
382
383 exec_list
384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
385 const fs_reg &surf_index,
386 const fs_reg &varying_offset,
387 uint32_t const_offset)
388 {
389 exec_list instructions;
390 fs_inst *inst;
391
392 /* We have our constant surface use a pitch of 4 bytes, so our index can
393 * be any component of a vector, and then we load 4 contiguous
394 * components starting from that.
395 *
396 * We break down the const_offset to a portion added to the variable
397 * offset and a portion done using reg_offset, which means that if you
398 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
399 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
400 * CSE can later notice that those loads are all the same and eliminate
401 * the redundant ones.
402          * the redundant ones.
             */
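   /* For example, with const_offset == 13 the ADD below produces
    * vec4_offset == varying_offset + 12 (13 & ~3), and the MOV at the end of
    * this function reads component (13 & 3) == 1 (times "scale" on gen4) of
    * the loaded vec4.
    */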
403 fs_reg vec4_offset = vgrf(glsl_type::int_type);
404 instructions.push_tail(ADD(vec4_offset,
405 varying_offset, fs_reg(const_offset & ~3)));
406
407 int scale = 1;
408 if (devinfo->gen == 4 && dst.width == 8) {
409 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
410 * u, v, r) as parameters, or we can just use the SIMD16 message
411 * consisting of (header, u). We choose the second, at the cost of a
412 * longer return length.
413 */
414 scale = 2;
415 }
416
417 enum opcode op;
418 if (devinfo->gen >= 7)
419 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
420 else
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
422
423 assert(dst.width % 8 == 0);
424 int regs_written = 4 * (dst.width / 8) * scale;
425 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
426 dst.type, dst.width);
427 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
428 inst->regs_written = regs_written;
429 instructions.push_tail(inst);
430
431 if (devinfo->gen < 7) {
432 inst->base_mrf = 13;
433 inst->header_present = true;
434 if (devinfo->gen == 4)
435 inst->mlen = 3;
436 else
437 inst->mlen = 1 + dispatch_width / 8;
438 }
439
440 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
441 instructions.push_tail(MOV(dst, result));
442
443 return instructions;
444 }
445
446 /**
447 * A helper for MOV generation for fixing up broken hardware SEND dependency
448 * handling.
449 */
450 fs_inst *
451 fs_visitor::DEP_RESOLVE_MOV(int grf)
452 {
453 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
454
455 inst->ir = NULL;
456 inst->annotation = "send dependency resolve";
457
458 /* The caller always wants uncompressed to emit the minimal extra
459 * dependencies, and to avoid having to deal with aligning its regs to 2.
460 */
461 inst->exec_size = 8;
462
463 return inst;
464 }
465
466 bool
467 fs_inst::equals(fs_inst *inst) const
468 {
469 return (opcode == inst->opcode &&
470 dst.equals(inst->dst) &&
471 src[0].equals(inst->src[0]) &&
472 src[1].equals(inst->src[1]) &&
473 src[2].equals(inst->src[2]) &&
474 saturate == inst->saturate &&
475 predicate == inst->predicate &&
476 conditional_mod == inst->conditional_mod &&
477 mlen == inst->mlen &&
478 base_mrf == inst->base_mrf &&
479 target == inst->target &&
480 eot == inst->eot &&
481 header_present == inst->header_present &&
482 shadow_compare == inst->shadow_compare &&
483 exec_size == inst->exec_size &&
484 offset == inst->offset);
485 }
486
487 bool
488 fs_inst::overwrites_reg(const fs_reg &reg) const
489 {
490 return reg.in_range(dst, regs_written);
491 }
492
493 bool
494 fs_inst::is_send_from_grf() const
495 {
496 switch (opcode) {
497 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
498 case SHADER_OPCODE_SHADER_TIME_ADD:
499 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
500 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
501 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
502 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
503 case SHADER_OPCODE_UNTYPED_ATOMIC:
504 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
505 case SHADER_OPCODE_URB_WRITE_SIMD8:
506 return true;
507 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
508 return src[1].file == GRF;
509 case FS_OPCODE_FB_WRITE:
510 return src[0].file == GRF;
511 default:
512 if (is_tex())
513 return src[0].file == GRF;
514
515 return false;
516 }
517 }
518
519 bool
520 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
521 {
522 if (devinfo->gen == 6 && is_math())
523 return false;
524
525 if (is_send_from_grf())
526 return false;
527
528 if (!backend_instruction::can_do_source_mods())
529 return false;
530
531 return true;
532 }
533
534 bool
535 fs_inst::has_side_effects() const
536 {
537 return this->eot || backend_instruction::has_side_effects();
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Vector float immediate value constructor. */
585 fs_reg::fs_reg(uint8_t vf[4])
586 {
587 init();
588 this->file = IMM;
589 this->type = BRW_REGISTER_TYPE_VF;
590 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
591 }
592
593 /** Vector float immediate value constructor. */
594 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
595 {
596 init();
597 this->file = IMM;
598 this->type = BRW_REGISTER_TYPE_VF;
599 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
600 (vf1 << 8) |
601 (vf2 << 16) |
602 (vf3 << 24);
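   /* For example, fs_reg(0x00, 0x30, 0x40, 0x00) packs to 0x00403000; each
    * byte is the 8-bit restricted-float (VF) encoding of one component.
    */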
603 }
604
605 /** Fixed brw_reg. */
606 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
607 {
608 init();
609 this->file = HW_REG;
610 this->fixed_hw_reg = fixed_hw_reg;
611 this->type = fixed_hw_reg.type;
612 this->width = 1 << fixed_hw_reg.width;
613 }
614
615 bool
616 fs_reg::equals(const fs_reg &r) const
617 {
618 return (file == r.file &&
619 reg == r.reg &&
620 reg_offset == r.reg_offset &&
621 subreg_offset == r.subreg_offset &&
622 type == r.type &&
623 negate == r.negate &&
624 abs == r.abs &&
625 !reladdr && !r.reladdr &&
626 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
627 width == r.width &&
628 stride == r.stride);
629 }
630
631 fs_reg &
632 fs_reg::set_smear(unsigned subreg)
633 {
634 assert(file != HW_REG && file != IMM);
635 subreg_offset = subreg * type_sz(type);
636 stride = 0;
637 return *this;
638 }
639
640 bool
641 fs_reg::is_contiguous() const
642 {
643 return stride == 1;
644 }
645
646 int
647 fs_visitor::type_size(const struct glsl_type *type)
648 {
649 unsigned int size, i;
650
651 switch (type->base_type) {
652 case GLSL_TYPE_UINT:
653 case GLSL_TYPE_INT:
654 case GLSL_TYPE_FLOAT:
655 case GLSL_TYPE_BOOL:
656 return type->components();
657 case GLSL_TYPE_ARRAY:
658 return type_size(type->fields.array) * type->length;
659 case GLSL_TYPE_STRUCT:
660 size = 0;
661 for (i = 0; i < type->length; i++) {
662 size += type_size(type->fields.structure[i].type);
663 }
664 return size;
665 case GLSL_TYPE_SAMPLER:
666 /* Samplers take up no register space, since they're baked in at
667 * link time.
668 */
669 return 0;
670 case GLSL_TYPE_ATOMIC_UINT:
671 return 0;
672 case GLSL_TYPE_IMAGE:
673 case GLSL_TYPE_VOID:
674 case GLSL_TYPE_ERROR:
675 case GLSL_TYPE_INTERFACE:
676 case GLSL_TYPE_DOUBLE:
677 unreachable("not reached");
678 }
679
680 return 0;
681 }
682
683 /**
684 * Create a MOV to read the timestamp register.
685 *
686 * The caller is responsible for emitting the MOV. The return value is
687 * the destination of the MOV, with extra parameters set.
688 */
689 fs_reg
690 fs_visitor::get_timestamp(fs_inst **out_mov)
691 {
692 assert(devinfo->gen >= 7);
693
694 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
695 BRW_ARF_TIMESTAMP,
696 0),
697 BRW_REGISTER_TYPE_UD));
698
699 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
700
701 fs_inst *mov = MOV(dst, ts);
702 /* We want to read the 3 fields we care about even if it's not enabled in
703 * the dispatch.
704 */
705 mov->force_writemask_all = true;
706
707 /* The caller wants the low 32 bits of the timestamp. Since it's running
708          * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
709 * which is plenty of time for our purposes. It is identical across the
710 * EUs, but since it's tracking GPU core speed it will increment at a
711 * varying rate as render P-states change.
712 *
713 * The caller could also check if render P-states have changed (or anything
714 * else that might disrupt timing) by setting smear to 2 and checking if
715 * that field is != 0.
716 */
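   /* Roughly: 2^32 cycles / ~1.2e9 cycles per second is about 3.6 seconds
    * between rollovers of the low dword read here.
    */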
717 dst.set_smear(0);
718
719 *out_mov = mov;
720 return dst;
721 }
722
723 void
724 fs_visitor::emit_shader_time_begin()
725 {
726 current_annotation = "shader time start";
727 fs_inst *mov;
728 shader_start_time = get_timestamp(&mov);
729 emit(mov);
730 }
731
732 void
733 fs_visitor::emit_shader_time_end()
734 {
735 current_annotation = "shader time end";
736
737 enum shader_time_shader_type type, written_type, reset_type;
738 switch (stage) {
739 case MESA_SHADER_VERTEX:
740 type = ST_VS;
741 written_type = ST_VS_WRITTEN;
742 reset_type = ST_VS_RESET;
743 break;
744 case MESA_SHADER_GEOMETRY:
745 type = ST_GS;
746 written_type = ST_GS_WRITTEN;
747 reset_type = ST_GS_RESET;
748 break;
749 case MESA_SHADER_FRAGMENT:
750 if (dispatch_width == 8) {
751 type = ST_FS8;
752 written_type = ST_FS8_WRITTEN;
753 reset_type = ST_FS8_RESET;
754 } else {
755 assert(dispatch_width == 16);
756 type = ST_FS16;
757 written_type = ST_FS16_WRITTEN;
758 reset_type = ST_FS16_RESET;
759 }
760 break;
761 case MESA_SHADER_COMPUTE:
762 type = ST_CS;
763 written_type = ST_CS_WRITTEN;
764 reset_type = ST_CS_RESET;
765 break;
766 default:
767 unreachable("fs_visitor::emit_shader_time_end missing code");
768 }
769
770 /* Insert our code just before the final SEND with EOT. */
771 exec_node *end = this->instructions.get_tail();
772 assert(end && ((fs_inst *) end)->eot);
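   /* What follows is, in rough pseudocode (assuming the reset check below is
    * the only thing guarding against timestamp resets):
    *
    *    if ((shader_end_time.2 & 1) == 0) {
    *       shader_time[type] += shader_end_time - shader_start_time - 2;
    *       shader_time[written_type] += 1;
    *    } else {
    *       shader_time[reset_type] += 1;
    *    }
    */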
773
774 fs_inst *tm_read;
775 fs_reg shader_end_time = get_timestamp(&tm_read);
776 end->insert_before(tm_read);
777
778 /* Check that there weren't any timestamp reset events (assuming these
779 * were the only two timestamp reads that happened).
780 */
781 fs_reg reset = shader_end_time;
782 reset.set_smear(2);
783 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
784 test->conditional_mod = BRW_CONDITIONAL_Z;
785 test->force_writemask_all = true;
786 end->insert_before(test);
787 end->insert_before(IF(BRW_PREDICATE_NORMAL));
788
789 fs_reg start = shader_start_time;
790 start.negate = true;
791 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
792 diff.set_smear(0);
793 fs_inst *add = ADD(diff, start, shader_end_time);
794 add->force_writemask_all = true;
795 end->insert_before(add);
796
797 /* If there were no instructions between the two timestamp gets, the diff
798 * is 2 cycles. Remove that overhead, so I can forget about that when
799 * trying to determine the time taken for single instructions.
800 */
801 add = ADD(diff, diff, fs_reg(-2u));
802 add->force_writemask_all = true;
803 end->insert_before(add);
804
805 end->insert_before(SHADER_TIME_ADD(type, diff));
806 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
807 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
808 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
809 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
810 }
811
812 fs_inst *
813 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
814 {
815 int shader_time_index =
816 brw_get_shader_time_index(brw, shader_prog, prog, type);
817 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
818
819 fs_reg payload;
820 if (dispatch_width == 8)
821 payload = vgrf(glsl_type::uvec2_type);
822 else
823 payload = vgrf(glsl_type::uint_type);
824
825 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
826 fs_reg(), payload, offset, value);
827 }
828
829 void
830 fs_visitor::vfail(const char *format, va_list va)
831 {
832 char *msg;
833
834 if (failed)
835 return;
836
837 failed = true;
838
839 msg = ralloc_vasprintf(mem_ctx, format, va);
840 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
841
842 this->fail_msg = msg;
843
844 if (debug_enabled) {
845 fprintf(stderr, "%s", msg);
846 }
847 }
848
849 void
850 fs_visitor::fail(const char *format, ...)
851 {
852 va_list va;
853
854 va_start(va, format);
855 vfail(format, va);
856 va_end(va);
857 }
858
859 /**
860 * Mark this program as impossible to compile in SIMD16 mode.
861 *
862 * During the SIMD8 compile (which happens first), we can detect and flag
863 * things that are unsupported in SIMD16 mode, so the compiler can skip
864 * the SIMD16 compile altogether.
865 *
866 * During a SIMD16 compile (if one happens anyway), this just calls fail().
867 */
868 void
869 fs_visitor::no16(const char *format, ...)
870 {
871 va_list va;
872
873 va_start(va, format);
874
875 if (dispatch_width == 16) {
876 vfail(format, va);
877 } else {
878 simd16_unsupported = true;
879
880 if (brw->perf_debug) {
881 if (no16_msg)
882 ralloc_vasprintf_append(&no16_msg, format, va);
883 else
884 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
885 }
886 }
887
888 va_end(va);
889 }
890
891 fs_inst *
892 fs_visitor::emit(enum opcode opcode)
893 {
894 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
895 }
896
897 fs_inst *
898 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
899 {
900 return emit(new(mem_ctx) fs_inst(opcode, dst));
901 }
902
903 fs_inst *
904 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
905 {
906 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
907 }
908
909 fs_inst *
910 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
911 const fs_reg &src1)
912 {
913 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
914 }
915
916 fs_inst *
917 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
918 const fs_reg &src1, const fs_reg &src2)
919 {
920 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
921 }
922
923 fs_inst *
924 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
925 fs_reg src[], int sources)
926 {
927 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
928 }
929
930 /**
931 * Returns true if the instruction has a flag that means it won't
932 * update an entire destination register.
933 *
934 * For example, dead code elimination and live variable analysis want to know
935 * when a write to a variable screens off any preceding values that were in
936 * it.
937 */
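/* For example, a predicated (non-SEL) MOV, a destination with stride 2, or a
 * write covering less than a full 32-byte register all count as partial
 * writes here.
 */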
938 bool
939 fs_inst::is_partial_write() const
940 {
941 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
942 (this->dst.width * type_sz(this->dst.type)) < 32 ||
943 !this->dst.is_contiguous());
944 }
945
946 int
947 fs_inst::regs_read(int arg) const
948 {
949 if (is_tex() && arg == 0 && src[0].file == GRF) {
950 return mlen;
951 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
952 return mlen;
953 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
954 return mlen;
955 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
956 return mlen;
957 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
958 return mlen;
959 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
960 return mlen;
961 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
962 return exec_size / 4;
963 }
964
965 switch (src[arg].file) {
966 case BAD_FILE:
967 case UNIFORM:
968 case IMM:
969 return 1;
970 case GRF:
971 case HW_REG:
972 if (src[arg].stride == 0) {
973 return 1;
974 } else {
975 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
976 return (size + 31) / 32;
977 }
978 case MRF:
979 unreachable("MRF registers are not allowed as sources");
980 default:
981 unreachable("Invalid register file");
982 }
983 }
984
985 bool
986 fs_inst::reads_flag() const
987 {
988 return predicate;
989 }
990
991 bool
992 fs_inst::writes_flag() const
993 {
994 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
995 opcode != BRW_OPCODE_IF &&
996 opcode != BRW_OPCODE_WHILE)) ||
997 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
998 }
999
1000 /**
1001 * Returns how many MRFs an FS opcode will write over.
1002 *
1003 * Note that this is not the 0 or 1 implied writes in an actual gen
1004 * instruction -- the FS opcodes often generate MOVs in addition.
1005 */
1006 int
1007 fs_visitor::implied_mrf_writes(fs_inst *inst)
1008 {
1009 if (inst->mlen == 0)
1010 return 0;
1011
1012 if (inst->base_mrf == -1)
1013 return 0;
1014
1015 switch (inst->opcode) {
1016 case SHADER_OPCODE_RCP:
1017 case SHADER_OPCODE_RSQ:
1018 case SHADER_OPCODE_SQRT:
1019 case SHADER_OPCODE_EXP2:
1020 case SHADER_OPCODE_LOG2:
1021 case SHADER_OPCODE_SIN:
1022 case SHADER_OPCODE_COS:
1023 return 1 * dispatch_width / 8;
1024 case SHADER_OPCODE_POW:
1025 case SHADER_OPCODE_INT_QUOTIENT:
1026 case SHADER_OPCODE_INT_REMAINDER:
1027 return 2 * dispatch_width / 8;
1028 case SHADER_OPCODE_TEX:
1029 case FS_OPCODE_TXB:
1030 case SHADER_OPCODE_TXD:
1031 case SHADER_OPCODE_TXF:
1032 case SHADER_OPCODE_TXF_CMS:
1033 case SHADER_OPCODE_TXF_MCS:
1034 case SHADER_OPCODE_TG4:
1035 case SHADER_OPCODE_TG4_OFFSET:
1036 case SHADER_OPCODE_TXL:
1037 case SHADER_OPCODE_TXS:
1038 case SHADER_OPCODE_LOD:
1039 return 1;
1040 case FS_OPCODE_FB_WRITE:
1041 return 2;
1042 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1043 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1044 return 1;
1045 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1046 return inst->mlen;
1047 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1048 return 2;
1049 case SHADER_OPCODE_UNTYPED_ATOMIC:
1050 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1051 case SHADER_OPCODE_URB_WRITE_SIMD8:
1052 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1053 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1054 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1055 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1056 return 0;
1057 default:
1058 unreachable("not reached");
1059 }
1060 }
1061
1062 fs_reg
1063 fs_visitor::vgrf(const glsl_type *const type)
1064 {
1065 int reg_width = dispatch_width / 8;
1066 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1067 brw_type_for_base_type(type), dispatch_width);
1068 }
1069
1070 fs_reg
1071 fs_visitor::vgrf(int num_components)
1072 {
1073 int reg_width = dispatch_width / 8;
1074 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1075 BRW_REGISTER_TYPE_F, dispatch_width);
1076 }
1077
1078 /** Fixed HW reg constructor. */
1079 fs_reg::fs_reg(enum register_file file, int reg)
1080 {
1081 init();
1082 this->file = file;
1083 this->reg = reg;
1084 this->type = BRW_REGISTER_TYPE_F;
1085
1086 switch (file) {
1087 case UNIFORM:
1088 this->width = 1;
1089 break;
1090 default:
1091 this->width = 8;
1092 }
1093 }
1094
1095 /** Fixed HW reg constructor. */
1096 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1097 {
1098 init();
1099 this->file = file;
1100 this->reg = reg;
1101 this->type = type;
1102
1103 switch (file) {
1104 case UNIFORM:
1105 this->width = 1;
1106 break;
1107 default:
1108 this->width = 8;
1109 }
1110 }
1111
1112 /** Fixed HW reg constructor. */
1113 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1114 uint8_t width)
1115 {
1116 init();
1117 this->file = file;
1118 this->reg = reg;
1119 this->type = type;
1120 this->width = width;
1121 }
1122
1123 fs_reg *
1124 fs_visitor::variable_storage(ir_variable *var)
1125 {
1126 return (fs_reg *)hash_table_find(this->variable_ht, var);
1127 }
1128
1129 void
1130 import_uniforms_callback(const void *key,
1131 void *data,
1132 void *closure)
1133 {
1134 struct hash_table *dst_ht = (struct hash_table *)closure;
1135 const fs_reg *reg = (const fs_reg *)data;
1136
1137 if (reg->file != UNIFORM)
1138 return;
1139
1140 hash_table_insert(dst_ht, data, key);
1141 }
1142
1143 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1144  * This brings in those uniform definitions.
1145 */
1146 void
1147 fs_visitor::import_uniforms(fs_visitor *v)
1148 {
1149 hash_table_call_foreach(v->variable_ht,
1150 import_uniforms_callback,
1151 variable_ht);
1152 this->push_constant_loc = v->push_constant_loc;
1153 this->pull_constant_loc = v->pull_constant_loc;
1154 this->uniforms = v->uniforms;
1155 this->param_size = v->param_size;
1156 }
1157
1158 /* Our support for uniforms is piggy-backed on the struct
1159 * gl_fragment_program, because that's where the values actually
1160 * get stored, rather than in some global gl_shader_program uniform
1161 * store.
1162 */
1163 void
1164 fs_visitor::setup_uniform_values(ir_variable *ir)
1165 {
1166 int namelen = strlen(ir->name);
1167
1168 /* The data for our (non-builtin) uniforms is stored in a series of
1169 * gl_uniform_driver_storage structs for each subcomponent that
1170 * glGetUniformLocation() could name. We know it's been set up in the same
1171 * order we'd walk the type, so walk the list of storage and find anything
1172 * with our name, or the prefix of a component that starts with our name.
1173    * with our name, or the prefix of a component that starts with our name.
        */
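   /* For example (hypothetical uniform names): for an ir->name of "light",
    * storage entries named "light", "light[3]" or "light.position" all match
    * here, while "lights" does not, because the character following the
    * prefix must be '\0', '.' or '['.
    */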
1174 unsigned params_before = uniforms;
1175 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1176 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1177
1178 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1179 (storage->name[namelen] != 0 &&
1180 storage->name[namelen] != '.' &&
1181 storage->name[namelen] != '[')) {
1182 continue;
1183 }
1184
1185 unsigned slots = storage->type->component_slots();
1186 if (storage->array_elements)
1187 slots *= storage->array_elements;
1188
1189 for (unsigned i = 0; i < slots; i++) {
1190 stage_prog_data->param[uniforms++] = &storage->storage[i];
1191 }
1192 }
1193
1194 /* Make sure we actually initialized the right amount of stuff here. */
1195 assert(params_before + ir->type->component_slots() == uniforms);
1196 (void)params_before;
1197 }
1198
1199
1200 /* Our support for builtin uniforms is even scarier than non-builtin.
1201 * It sits on top of the PROG_STATE_VAR parameters that are
1202 * automatically updated from GL context state.
1203 */
1204 void
1205 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1206 {
1207 const ir_state_slot *const slots = ir->get_state_slots();
1208 assert(slots != NULL);
1209
1210 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1211 /* This state reference has already been setup by ir_to_mesa, but we'll
1212 * get the same index back here.
1213 */
1214 int index = _mesa_add_state_reference(this->prog->Parameters,
1215 (gl_state_index *)slots[i].tokens);
1216
1217 /* Add each of the unique swizzles of the element as a parameter.
1218 * This'll end up matching the expected layout of the
1219 * array/matrix/structure we're trying to fill in.
1220 */
1221 int last_swiz = -1;
1222 for (unsigned int j = 0; j < 4; j++) {
1223 int swiz = GET_SWZ(slots[i].swizzle, j);
1224 if (swiz == last_swiz)
1225 break;
1226 last_swiz = swiz;
1227
1228 stage_prog_data->param[uniforms++] =
1229 &prog->Parameters->ParameterValues[index][swiz];
1230 }
1231 }
1232 }
1233
1234 fs_reg *
1235 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1236 bool origin_upper_left)
1237 {
1238 assert(stage == MESA_SHADER_FRAGMENT);
1239 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1240 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1241 fs_reg wpos = *reg;
1242 bool flip = !origin_upper_left ^ key->render_to_fbo;
1243
1244 /* gl_FragCoord.x */
1245 if (pixel_center_integer) {
1246 emit(MOV(wpos, this->pixel_x));
1247 } else {
1248 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1249 }
1250 wpos = offset(wpos, 1);
1251
1252 /* gl_FragCoord.y */
1253 if (!flip && pixel_center_integer) {
1254 emit(MOV(wpos, this->pixel_y));
1255 } else {
1256 fs_reg pixel_y = this->pixel_y;
1257 float offset = (pixel_center_integer ? 0.0 : 0.5);
1258
1259 if (flip) {
1260 pixel_y.negate = true;
1261 offset += key->drawable_height - 1.0;
1262 }
1263
1264 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1265 }
1266 wpos = offset(wpos, 1);
1267
1268 /* gl_FragCoord.z */
1269 if (devinfo->gen >= 6) {
1270 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1271 } else {
1272 emit(FS_OPCODE_LINTERP, wpos,
1273 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1274 interp_reg(VARYING_SLOT_POS, 2));
1275 }
1276 wpos = offset(wpos, 1);
1277
1278 /* gl_FragCoord.w: Already set up in emit_interpolation */
1279 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1280
1281 return reg;
1282 }
1283
1284 fs_inst *
1285 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1286 glsl_interp_qualifier interpolation_mode,
1287 bool is_centroid, bool is_sample)
1288 {
1289 brw_wm_barycentric_interp_mode barycoord_mode;
1290 if (devinfo->gen >= 6) {
1291 if (is_centroid) {
1292 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1293 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1294 else
1295 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1296 } else if (is_sample) {
1297 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1298 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1299 else
1300 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1301 } else {
1302 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1303 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1304 else
1305 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1306 }
1307 } else {
1308 /* On Ironlake and below, there is only one interpolation mode.
1309 * Centroid interpolation doesn't mean anything on this hardware --
1310 * there is no multisampling.
1311 */
1312 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1313 }
1314 return emit(FS_OPCODE_LINTERP, attr,
1315 this->delta_xy[barycoord_mode], interp);
1316 }
1317
1318 void
1319 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1320 const glsl_type *type,
1321 glsl_interp_qualifier interpolation_mode,
1322 int location, bool mod_centroid,
1323 bool mod_sample)
1324 {
1325 attr.type = brw_type_for_base_type(type->get_scalar_type());
1326
1327 assert(stage == MESA_SHADER_FRAGMENT);
1328 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1329 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1330
1331 unsigned int array_elements;
1332
1333 if (type->is_array()) {
1334 array_elements = type->length;
1335 if (array_elements == 0) {
1336 fail("dereferenced array '%s' has length 0\n", name);
1337 }
1338 type = type->fields.array;
1339 } else {
1340 array_elements = 1;
1341 }
1342
1343 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1344 bool is_gl_Color =
1345 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1346 if (key->flat_shade && is_gl_Color) {
1347 interpolation_mode = INTERP_QUALIFIER_FLAT;
1348 } else {
1349 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1350 }
1351 }
1352
1353 for (unsigned int i = 0; i < array_elements; i++) {
1354 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1355 if (prog_data->urb_setup[location] == -1) {
1356 /* If there's no incoming setup data for this slot, don't
1357 * emit interpolation for it.
1358 */
1359 attr = offset(attr, type->vector_elements);
1360 location++;
1361 continue;
1362 }
1363
1364 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1365 /* Constant interpolation (flat shading) case. The SF has
1366 * handed us defined values in only the constant offset
1367 * field of the setup reg.
1368 */
1369 for (unsigned int k = 0; k < type->vector_elements; k++) {
1370 struct brw_reg interp = interp_reg(location, k);
1371 interp = suboffset(interp, 3);
1372 interp.type = attr.type;
1373 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1374 attr = offset(attr, 1);
1375 }
1376 } else {
1377 /* Smooth/noperspective interpolation case. */
1378 for (unsigned int k = 0; k < type->vector_elements; k++) {
1379 struct brw_reg interp = interp_reg(location, k);
1380 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1381 /* Get the pixel/sample mask into f0 so that we know
1382 * which pixels are lit. Then, for each channel that is
1383 * unlit, replace the centroid data with non-centroid
1384 * data.
1385 */
1386 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1387
1388 fs_inst *inst;
1389 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1390 false, false);
1391 inst->predicate = BRW_PREDICATE_NORMAL;
1392 inst->predicate_inverse = true;
1393 if (devinfo->has_pln)
1394 inst->no_dd_clear = true;
1395
1396 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1397 mod_centroid && !key->persample_shading,
1398 mod_sample || key->persample_shading);
1399 inst->predicate = BRW_PREDICATE_NORMAL;
1400 inst->predicate_inverse = false;
1401 if (devinfo->has_pln)
1402 inst->no_dd_check = true;
1403
1404 } else {
1405 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1406 mod_centroid && !key->persample_shading,
1407 mod_sample || key->persample_shading);
1408 }
1409 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1410 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1411 }
1412 attr = offset(attr, 1);
1413 }
1414
1415 }
1416 location++;
1417 }
1418 }
1419 }
1420
1421 fs_reg *
1422 fs_visitor::emit_frontfacing_interpolation()
1423 {
1424 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1425
1426 if (devinfo->gen >= 6) {
1427 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1428 * a boolean result from this (~0/true or 0/false).
1429 *
1430 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1431 * this task in only one instruction:
1432 * - a negation source modifier will flip the bit; and
1433 * - a W -> D type conversion will sign extend the bit into the high
1434 * word of the destination.
1435 *
1436 * An ASR 15 fills the low word of the destination.
1437 */
1438 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1439 g0.negate = true;
1440
1441 emit(ASR(*reg, g0, fs_reg(15)));
1442 } else {
1443 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1444 * a boolean result from this (1/true or 0/false).
1445 *
1446 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1447 * the negation source modifier to flip it. Unfortunately the SHR
1448 * instruction only operates on UD (or D with an abs source modifier)
1449 * sources without negation.
1450 *
1451 * Instead, use ASR (which will give ~0/true or 0/false).
1452 */
1453 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1454 g1_6.negate = true;
1455
1456 emit(ASR(*reg, g1_6, fs_reg(31)));
1457 }
1458
1459 return reg;
1460 }
1461
1462 void
1463 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1464 {
1465 assert(stage == MESA_SHADER_FRAGMENT);
1466 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1467 assert(dst.type == BRW_REGISTER_TYPE_F);
1468
1469 if (key->compute_pos_offset) {
1470 /* Convert int_sample_pos to floating point */
1471 emit(MOV(dst, int_sample_pos));
1472 /* Scale to the range [0, 1] */
1473 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1474 }
1475 else {
1476 /* From ARB_sample_shading specification:
1477 * "When rendering to a non-multisample buffer, or if multisample
1478 * rasterization is disabled, gl_SamplePosition will always be
1479       *  (0.5, 0.5)."
1480 */
1481 emit(MOV(dst, fs_reg(0.5f)));
1482 }
1483 }
1484
1485 fs_reg *
1486 fs_visitor::emit_samplepos_setup()
1487 {
1488 assert(devinfo->gen >= 6);
1489
1490 this->current_annotation = "compute sample position";
1491 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1492 fs_reg pos = *reg;
1493 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1494 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1495
1496 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1497 * mode will be enabled.
1498 *
1499 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1500 * R31.1:0 Position Offset X/Y for Slot[3:0]
1501 * R31.3:2 Position Offset X/Y for Slot[7:4]
1502 * .....
1503 *
1504 * The X, Y sample positions come in as bytes in thread payload. So, read
1505 * the positions using vstride=16, width=8, hstride=2.
1506 */
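   /* With the B-typed base register below, the <16;8,2> region reads bytes
    * 0, 2, 4, ..., 14 of the payload register -- the eight X offsets for
    * SIMD8 -- and suboffset(..., 1) picks up the interleaved Y bytes.
    */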
1507 struct brw_reg sample_pos_reg =
1508 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1509 BRW_REGISTER_TYPE_B), 16, 8, 2);
1510
1511 if (dispatch_width == 8) {
1512 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1513 } else {
1514 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1515 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1516 ->force_sechalf = true;
1517 }
1518 /* Compute gl_SamplePosition.x */
1519 compute_sample_position(pos, int_sample_x);
1520 pos = offset(pos, 1);
1521 if (dispatch_width == 8) {
1522 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1523 } else {
1524 emit(MOV(half(int_sample_y, 0),
1525 fs_reg(suboffset(sample_pos_reg, 1))));
1526 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1527 ->force_sechalf = true;
1528 }
1529 /* Compute gl_SamplePosition.y */
1530 compute_sample_position(pos, int_sample_y);
1531 return reg;
1532 }
1533
1534 fs_reg *
1535 fs_visitor::emit_sampleid_setup()
1536 {
1537 assert(stage == MESA_SHADER_FRAGMENT);
1538 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1539 assert(devinfo->gen >= 6);
1540
1541 this->current_annotation = "compute sample id";
1542 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1543
1544 if (key->compute_sample_id) {
1545 fs_reg t1 = vgrf(glsl_type::int_type);
1546 fs_reg t2 = vgrf(glsl_type::int_type);
1547 t2.type = BRW_REGISTER_TYPE_UW;
1548
1549 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1550 * 8x multisampling, subspan 0 will represent sample N (where N
1551 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1552 * 7. We can find the value of N by looking at R0.0 bits 7:6
1553 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1554 * (since samples are always delivered in pairs). That is, we
1555 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1556 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1557 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1558 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1559 * populating a temporary variable with the sequence (0, 1, 2, 3),
1560 * and then reading from it using vstride=1, width=4, hstride=0.
1561 * These computations hold good for 4x multisampling as well.
1562 *
1563 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1564 * the first four slots are sample 0 of subspan 0; the next four
1565 * are sample 1 of subspan 0; the third group is sample 0 of
1566 * subspan 1, and finally sample 1 of subspan 1.
1567 */
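      /* For example, with R0.0 bits 7:6 == 2 (SSPI == 2), (R0.0 & 0xc0) >> 5
       * yields 4, and adding the SIMD8 sequence (0, 0, 0, 0, 1, 1, 1, 1)
       * gives sample IDs 4, 4, 4, 4, 5, 5, 5, 5 for the two subspans.
       */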
1568 fs_inst *inst;
1569 inst = emit(BRW_OPCODE_AND, t1,
1570 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1571 fs_reg(0xc0));
1572 inst->force_writemask_all = true;
1573 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1574 inst->force_writemask_all = true;
1575 /* This works for both SIMD8 and SIMD16 */
1576 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1577 inst->force_writemask_all = true;
1578 /* This special instruction takes care of setting vstride=1,
1579 * width=4, hstride=0 of t2 during an ADD instruction.
1580 */
1581 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1582 } else {
1583 /* As per GL_ARB_sample_shading specification:
1584 * "When rendering to a non-multisample buffer, or if multisample
1585 * rasterization is disabled, gl_SampleID will always be zero."
1586 */
1587 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1588 }
1589
1590 return reg;
1591 }
1592
1593 void
1594 fs_visitor::resolve_source_modifiers(fs_reg *src)
1595 {
1596 if (!src->abs && !src->negate)
1597 return;
1598
1599 fs_reg temp = retype(vgrf(1), src->type);
1600 emit(MOV(temp, *src));
1601 *src = temp;
1602 }
1603
1604 fs_reg
1605 fs_visitor::fix_math_operand(fs_reg src)
1606 {
1607 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1608 * might be able to do better by doing execsize = 1 math and then
1609 * expanding that result out, but we would need to be careful with
1610 * masking.
1611 *
1612 * The hardware ignores source modifiers (negate and abs) on math
1613 * instructions, so we also move to a temp to set those up.
1614 */
1615 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1616 !src.abs && !src.negate)
1617 return src;
1618
1619 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1620 * operands to math
1621 */
1622 if (devinfo->gen >= 7 && src.file != IMM)
1623 return src;
1624
1625 fs_reg expanded = vgrf(glsl_type::float_type);
1626 expanded.type = src.type;
1627 emit(BRW_OPCODE_MOV, expanded, src);
1628 return expanded;
1629 }
1630
1631 fs_inst *
1632 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1633 {
1634 switch (opcode) {
1635 case SHADER_OPCODE_RCP:
1636 case SHADER_OPCODE_RSQ:
1637 case SHADER_OPCODE_SQRT:
1638 case SHADER_OPCODE_EXP2:
1639 case SHADER_OPCODE_LOG2:
1640 case SHADER_OPCODE_SIN:
1641 case SHADER_OPCODE_COS:
1642 break;
1643 default:
1644 unreachable("not reached: bad math opcode");
1645 }
1646
1647 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1648 * might be able to do better by doing execsize = 1 math and then
1649 * expanding that result out, but we would need to be careful with
1650 * masking.
1651 *
1652 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1653 * instructions, so we also move to a temp to set those up.
1654 */
1655 if (devinfo->gen == 6 || devinfo->gen == 7)
1656 src = fix_math_operand(src);
1657
1658 fs_inst *inst = emit(opcode, dst, src);
1659
1660 if (devinfo->gen < 6) {
1661 inst->base_mrf = 2;
1662 inst->mlen = dispatch_width / 8;
1663 }
1664
1665 return inst;
1666 }
1667
1668 fs_inst *
1669 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1670 {
1671 int base_mrf = 2;
1672 fs_inst *inst;
1673
1674 if (devinfo->gen >= 8) {
1675 inst = emit(opcode, dst, src0, src1);
1676 } else if (devinfo->gen >= 6) {
1677 src0 = fix_math_operand(src0);
1678 src1 = fix_math_operand(src1);
1679
1680 inst = emit(opcode, dst, src0, src1);
1681 } else {
1682 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1683 * "Message Payload":
1684 *
1685 * "Operand0[7]. For the INT DIV functions, this operand is the
1686 * denominator."
1687 * ...
1688 * "Operand1[7]. For the INT DIV functions, this operand is the
1689 * numerator."
1690 */
1691 bool is_int_div = opcode != SHADER_OPCODE_POW;
1692 fs_reg &op0 = is_int_div ? src1 : src0;
1693 fs_reg &op1 = is_int_div ? src0 : src1;
1694
1695 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1696 inst = emit(opcode, dst, op0, reg_null_f);
1697
1698 inst->base_mrf = base_mrf;
1699 inst->mlen = 2 * dispatch_width / 8;
1700 }
1701 return inst;
1702 }
1703
1704 void
1705 fs_visitor::emit_discard_jump()
1706 {
1707 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1708
1709 /* For performance, after a discard, jump to the end of the
1710 * shader if all relevant channels have been discarded.
1711 */
1712 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1713 discard_jump->flag_subreg = 1;
1714
1715 discard_jump->predicate = (dispatch_width == 8)
1716 ? BRW_PREDICATE_ALIGN1_ANY8H
1717 : BRW_PREDICATE_ALIGN1_ANY16H;
1718 discard_jump->predicate_inverse = true;
1719 }
1720
1721 void
1722 fs_visitor::assign_curb_setup()
1723 {
1724 if (dispatch_width == 8) {
1725 prog_data->dispatch_grf_start_reg = payload.num_regs;
1726 } else {
1727 if (stage == MESA_SHADER_FRAGMENT) {
1728 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1729 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1730 } else if (stage == MESA_SHADER_COMPUTE) {
1731 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1732 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1733 } else {
1734 unreachable("Unsupported shader type!");
1735 }
1736 }
1737
1738 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1739
1740 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1741 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1742 for (unsigned int i = 0; i < inst->sources; i++) {
1743 if (inst->src[i].file == UNIFORM) {
1744 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1745 int constant_nr;
1746 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1747 constant_nr = push_constant_loc[uniform_nr];
1748 } else {
1749 /* Section 5.11 of the OpenGL 4.1 spec says:
1750 * "Out-of-bounds reads return undefined values, which include
1751 * values from other variables of the active program or zero."
1752 * Just return the first push constant.
1753 */
1754 constant_nr = 0;
1755 }
1756
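            /* For example, constant_nr == 10 maps to channel 10 % 8 == 2 of
             * GRF (payload.num_regs + 10 / 8), i.e. one register past the
             * start of the push constants.
             */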
1757 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1758 constant_nr / 8,
1759 constant_nr % 8);
1760
1761 inst->src[i].file = HW_REG;
1762 inst->src[i].fixed_hw_reg = byte_offset(
1763 retype(brw_reg, inst->src[i].type),
1764 inst->src[i].subreg_offset);
1765 }
1766 }
1767 }
1768 }
1769
1770 void
1771 fs_visitor::calculate_urb_setup()
1772 {
1773 assert(stage == MESA_SHADER_FRAGMENT);
1774 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1775 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1776
1777 memset(prog_data->urb_setup, -1,
1778 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1779
1780 int urb_next = 0;
1781 /* Figure out where each of the incoming setup attributes lands. */
1782 if (devinfo->gen >= 6) {
1783 if (_mesa_bitcount_64(prog->InputsRead &
1784 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1785 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1786 * first 16 varying inputs, so we can put them wherever we want.
1787 * Just put them in order.
1788 *
1789 * This is useful because it means that (a) inputs not used by the
1790 * fragment shader won't take up valuable register space, and (b) we
1791 * won't have to recompile the fragment shader if it gets paired with
1792 * a different vertex (or geometry) shader.
1793 */
1794 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1795 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1796 BITFIELD64_BIT(i)) {
1797 prog_data->urb_setup[i] = urb_next++;
1798 }
1799 }
1800 } else {
1801 /* We have enough input varyings that the SF/SBE pipeline stage can't
1802 * arbitrarily rearrange them to suit our whim; we have to put them
1803 * in an order that matches the output of the previous pipeline stage
1804 * (geometry or vertex shader).
1805 */
1806 struct brw_vue_map prev_stage_vue_map;
1807 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1808 key->input_slots_valid);
1809 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1810 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1811 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1812 slot++) {
1813 int varying = prev_stage_vue_map.slot_to_varying[slot];
1814 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1815 * unused.
1816 */
1817 if (varying != BRW_VARYING_SLOT_COUNT &&
1818 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1819 BITFIELD64_BIT(varying))) {
1820 prog_data->urb_setup[varying] = slot - first_slot;
1821 }
1822 }
1823 urb_next = prev_stage_vue_map.num_slots - first_slot;
1824 }
1825 } else {
1826 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1827 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1828 /* Point size is packed into the header, not as a general attribute */
1829 if (i == VARYING_SLOT_PSIZ)
1830 continue;
1831
1832 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1833 /* The back color slot is skipped when the front color is
1834 * also written to. In addition, some slots can be
1835 * written in the vertex shader and not read in the
1836 * fragment shader. So the register number must always be
1837 * incremented, mapped or not.
1838 */
1839 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1840 prog_data->urb_setup[i] = urb_next;
1841 urb_next++;
1842 }
1843 }
1844
1845 /*
1846     * It's an FS-only attribute, and we did interpolation for this attribute
1847     * in the SF thread. So, count it here, too.
1848 *
1849 * See compile_sf_prog() for more info.
1850 */
1851 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1852 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1853 }
1854
1855 prog_data->num_varying_inputs = urb_next;
1856 }
1857
1858 void
1859 fs_visitor::assign_urb_setup()
1860 {
1861 assert(stage == MESA_SHADER_FRAGMENT);
1862 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1863
1864 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1865
1866 /* Offset all the urb_setup[] index by the actual position of the
1867 * setup regs, now that the location of the constants has been chosen.
1868 */
1869 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1870 if (inst->opcode == FS_OPCODE_LINTERP) {
1871 assert(inst->src[1].file == HW_REG);
1872 inst->src[1].fixed_hw_reg.nr += urb_start;
1873 }
1874
1875 if (inst->opcode == FS_OPCODE_CINTERP) {
1876 assert(inst->src[0].file == HW_REG);
1877 inst->src[0].fixed_hw_reg.nr += urb_start;
1878 }
1879 }
1880
1881 /* Each attribute is 4 setup channels, each of which is half a reg. */
1882 this->first_non_payload_grf =
1883 urb_start + prog_data->num_varying_inputs * 2;
1884 }
1885
1886 void
1887 fs_visitor::assign_vs_urb_setup()
1888 {
1889 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1890 int grf, count, slot, channel, attr;
1891
1892 assert(stage == MESA_SHADER_VERTEX);
1893 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1894 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1895 count++;
1896
1897 /* Each attribute is 4 regs. */
1898 this->first_non_payload_grf =
1899 payload.num_regs + prog_data->curb_read_length + count * 4;
1900
1901 unsigned vue_entries =
1902 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1903
1904 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1905 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1906
1907 assert(vs_prog_data->base.urb_read_length <= 15);
1908
1909 /* Rewrite all ATTR file references to the hw grf that they land in. */
1910 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1911 for (int i = 0; i < inst->sources; i++) {
1912 if (inst->src[i].file == ATTR) {
1913
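               /* The gl_VertexID/gl_InstanceID payload is delivered after the
                * regular vertex attributes, so it occupies the last slot (note
                * the extra slot added to `count' above).
                */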
1914 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1915 slot = count - 1;
1916 } else {
1917                  /* Attributes come in a contiguous block, ordered by their
1918 * gl_vert_attrib value. That means we can compute the slot
1919 * number for an attribute by masking out the enabled
1920 * attributes before it and counting the bits.
1921 */
1922 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1923 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1924 BITFIELD64_MASK(attr));
1925 }
1926
1927 channel = inst->src[i].reg_offset & 3;
1928
1929 grf = payload.num_regs +
1930 prog_data->curb_read_length +
1931 slot * 4 + channel;
1932
1933 inst->src[i].file = HW_REG;
1934 inst->src[i].fixed_hw_reg =
1935 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1936 }
1937 }
1938 }
1939 }
1940
1941 /**
1942 * Split large virtual GRFs into separate components if we can.
1943 *
1944 * This is mostly duplicated with what brw_fs_vector_splitting does,
1945 * but that's really conservative because it's afraid of doing
1946 * splitting that doesn't result in real progress after the rest of
1947 * the optimization phases, which would cause infinite looping in
1948 * optimization. We can do it once here, safely. This also has the
1949 * opportunity to split interpolated values, or maybe even uniforms,
1950 * which we don't have at the IR level.
1951 *
1952 * We want to split, because virtual GRFs are what we register
1953 * allocate and spill (due to contiguousness requirements for some
1954 * instructions), and they're what we naturally generate in the
1955 * codegen process, but most virtual GRFs don't actually need to be
1956 * contiguous sets of GRFs. If we split, we'll end up with reduced
1957 * live intervals and better dead code elimination and coalescing.
1958 */
1959 void
1960 fs_visitor::split_virtual_grfs()
1961 {
1962 int num_vars = this->alloc.count;
1963
1964 /* Count the total number of registers */
1965 int reg_count = 0;
1966 int vgrf_to_reg[num_vars];
1967 for (int i = 0; i < num_vars; i++) {
1968 vgrf_to_reg[i] = reg_count;
1969 reg_count += alloc.sizes[i];
1970 }
1971
1972 /* An array of "split points". For each register slot, this indicates
1973 * if this slot can be separated from the previous slot. Every time an
1974 * instruction uses multiple elements of a register (as a source or
1975 * destination), we mark the used slots as inseparable. Then we go
1976 * through and split the registers into the smallest pieces we can.
1977 */
1978 bool split_points[reg_count];
1979 memset(split_points, 0, sizeof(split_points));
1980
1981 /* Mark all used registers as fully splittable */
1982 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1983 if (inst->dst.file == GRF) {
1984 int reg = vgrf_to_reg[inst->dst.reg];
1985 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1986 split_points[reg + j] = true;
1987 }
1988
1989 for (int i = 0; i < inst->sources; i++) {
1990 if (inst->src[i].file == GRF) {
1991 int reg = vgrf_to_reg[inst->src[i].reg];
1992 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1993 split_points[reg + j] = true;
1994 }
1995 }
1996 }
1997
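   /* Then walk the instructions again and clear the split points inside any
    * multi-register read or write, since those slots have to stay contiguous.
    */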
1998 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1999 if (inst->dst.file == GRF) {
2000 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2001 for (int j = 1; j < inst->regs_written; j++)
2002 split_points[reg + j] = false;
2003 }
2004 for (int i = 0; i < inst->sources; i++) {
2005 if (inst->src[i].file == GRF) {
2006 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2007 for (int j = 1; j < inst->regs_read(i); j++)
2008 split_points[reg + j] = false;
2009 }
2010 }
2011 }
2012
2013 int new_virtual_grf[reg_count];
2014 int new_reg_offset[reg_count];
2015
2016 int reg = 0;
2017 for (int i = 0; i < num_vars; i++) {
2018 /* The first one should always be 0 as a quick sanity check. */
2019 assert(split_points[reg] == false);
2020
2021 /* j = 0 case */
2022 new_reg_offset[reg] = 0;
2023 reg++;
2024 int offset = 1;
2025
2026 /* j > 0 case */
2027 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2028            /* If this is a split point, reset the offset to 0 and allocate a
2029             * new virtual GRF covering the previous `offset' registers.
2030 */
2031 if (split_points[reg]) {
2032 assert(offset <= MAX_VGRF_SIZE);
2033 int grf = alloc.allocate(offset);
2034 for (int k = reg - offset; k < reg; k++)
2035 new_virtual_grf[k] = grf;
2036 offset = 0;
2037 }
2038 new_reg_offset[reg] = offset;
2039 offset++;
2040 reg++;
2041 }
2042
2043 /* The last one gets the original register number */
2044 assert(offset <= MAX_VGRF_SIZE);
2045 alloc.sizes[i] = offset;
2046 for (int k = reg - offset; k < reg; k++)
2047 new_virtual_grf[k] = i;
2048 }
2049 assert(reg == reg_count);
2050
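   /* Finally, rewrite every GRF reference to its new (possibly split) virtual
    * GRF number and register offset.
    */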
2051 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2052 if (inst->dst.file == GRF) {
2053 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2054 inst->dst.reg = new_virtual_grf[reg];
2055 inst->dst.reg_offset = new_reg_offset[reg];
2056 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2057 }
2058 for (int i = 0; i < inst->sources; i++) {
2059 if (inst->src[i].file == GRF) {
2060 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2061 inst->src[i].reg = new_virtual_grf[reg];
2062 inst->src[i].reg_offset = new_reg_offset[reg];
2063 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2064 }
2065 }
2066 }
2067 invalidate_live_intervals();
2068 }
2069
2070 /**
2071 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2072 *
2073 * During code generation, we create tons of temporary variables, many of
2074 * which get immediately killed and are never used again. Yet, in later
2075 * optimization and analysis passes, such as compute_live_intervals, we need
2076 * to loop over all the virtual GRFs. Compacting them can save a lot of
2077 * overhead.
2078 */
2079 bool
2080 fs_visitor::compact_virtual_grfs()
2081 {
2082 bool progress = false;
2083 int remap_table[this->alloc.count];
2084 memset(remap_table, -1, sizeof(remap_table));
2085
2086 /* Mark which virtual GRFs are used. */
2087 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2088 if (inst->dst.file == GRF)
2089 remap_table[inst->dst.reg] = 0;
2090
2091 for (int i = 0; i < inst->sources; i++) {
2092 if (inst->src[i].file == GRF)
2093 remap_table[inst->src[i].reg] = 0;
2094 }
2095 }
2096
2097 /* Compact the GRF arrays. */
2098 int new_index = 0;
2099 for (unsigned i = 0; i < this->alloc.count; i++) {
2100 if (remap_table[i] == -1) {
2101 /* We just found an unused register. This means that we are
2102 * actually going to compact something.
2103 */
2104 progress = true;
2105 } else {
2106 remap_table[i] = new_index;
2107 alloc.sizes[new_index] = alloc.sizes[i];
2108 invalidate_live_intervals();
2109 ++new_index;
2110 }
2111 }
2112
2113 this->alloc.count = new_index;
2114
2115 /* Patch all the instructions to use the newly renumbered registers */
2116 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2117 if (inst->dst.file == GRF)
2118 inst->dst.reg = remap_table[inst->dst.reg];
2119
2120 for (int i = 0; i < inst->sources; i++) {
2121 if (inst->src[i].file == GRF)
2122 inst->src[i].reg = remap_table[inst->src[i].reg];
2123 }
2124 }
2125
2126 /* Patch all the references to delta_xy, since they're used in register
2127 * allocation. If they're unused, switch them to BAD_FILE so we don't
2128 * think some random VGRF is delta_xy.
2129 */
2130 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2131 if (delta_xy[i].file == GRF) {
2132 if (remap_table[delta_xy[i].reg] != -1) {
2133 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2134 } else {
2135 delta_xy[i].file = BAD_FILE;
2136 }
2137 }
2138 }
2139
2140 return progress;
2141 }
2142
2143 /*
2144 * Implements array access of uniforms by inserting a
2145 * PULL_CONSTANT_LOAD instruction.
2146 *
2147  * Unlike temporary GRF array access (which we don't support, due to
2148 * the difficulty of doing relative addressing on instruction
2149 * destinations), we could potentially do array access of uniforms
2150 * that were loaded in GRF space as push constants. In real-world
2151 * usage we've seen, though, the arrays being used are always larger
2152 * than we could load as push constants, so just always move all
2153 * uniform array access out to a pull constant buffer.
2154 */
2155 void
2156 fs_visitor::move_uniform_array_access_to_pull_constants()
2157 {
2158 if (dispatch_width != 8)
2159 return;
2160
2161 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2162 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2163
2164 /* Walk through and find array access of uniforms. Put a copy of that
2165 * uniform in the pull constant buffer.
2166 *
2167 * Note that we don't move constant-indexed accesses to arrays. No
2168 * testing has been done of the performance impact of this choice.
2169 */
2170 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2171 for (int i = 0 ; i < inst->sources; i++) {
2172 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2173 continue;
2174
2175 int uniform = inst->src[i].reg;
2176
2177 /* If this array isn't already present in the pull constant buffer,
2178 * add it.
2179 */
2180 if (pull_constant_loc[uniform] == -1) {
2181 const gl_constant_value **values = &stage_prog_data->param[uniform];
2182
2183 assert(param_size[uniform]);
2184
2185 for (int j = 0; j < param_size[uniform]; j++) {
2186 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2187
2188 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2189 values[j];
2190 }
2191 }
2192 }
2193 }
2194 }
2195
2196 /**
2197 * Assign UNIFORM file registers to either push constants or pull constants.
2198 *
2199  * We allow a fragment shader to use more than the minimum required
2200  * maximum number of fragment shader uniform components (64). If
2201  * there are too many of these, they'd fill up all of the register space.
2202 * So, this will push some of them out to the pull constant buffer and
2203 * update the program to load them.
2204 */
2205 void
2206 fs_visitor::assign_constant_locations()
2207 {
2208 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2209 if (dispatch_width != 8)
2210 return;
2211
2212 /* Find which UNIFORM registers are still in use. */
2213 bool is_live[uniforms];
2214 for (unsigned int i = 0; i < uniforms; i++) {
2215 is_live[i] = false;
2216 }
2217
2218 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2219 for (int i = 0; i < inst->sources; i++) {
2220 if (inst->src[i].file != UNIFORM)
2221 continue;
2222
2223 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2224 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2225 is_live[constant_nr] = true;
2226 }
2227 }
2228
2229 /* Only allow 16 registers (128 uniform components) as push constants.
2230 *
2231 * Just demote the end of the list. We could probably do better
2232 * here, demoting things that are rarely used in the program first.
2233 *
2234 * If changing this value, note the limitation about total_regs in
2235 * brw_curbe.c.
2236 */
2237 unsigned int max_push_components = 16 * 8;
2238 unsigned int num_push_constants = 0;
2239
2240 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2241
2242 for (unsigned int i = 0; i < uniforms; i++) {
2243 if (!is_live[i] || pull_constant_loc[i] != -1) {
2244 /* This UNIFORM register is either dead, or has already been demoted
2245 * to a pull const. Mark it as no longer living in the param[] array.
2246 */
2247 push_constant_loc[i] = -1;
2248 continue;
2249 }
2250
2251 if (num_push_constants < max_push_components) {
2252 /* Retain as a push constant. Record the location in the params[]
2253 * array.
2254 */
2255 push_constant_loc[i] = num_push_constants++;
2256 } else {
2257 /* Demote to a pull constant. */
2258 push_constant_loc[i] = -1;
2259
2260 int pull_index = stage_prog_data->nr_pull_params++;
2261 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2262 pull_constant_loc[i] = pull_index;
2263 }
2264 }
2265
2266 stage_prog_data->nr_params = num_push_constants;
2267
2268 /* Up until now, the param[] array has been indexed by reg + reg_offset
2269 * of UNIFORM registers. Condense it to only contain the uniforms we
2270 * chose to upload as push constants.
2271 */
2272 for (unsigned int i = 0; i < uniforms; i++) {
2273 int remapped = push_constant_loc[i];
2274
2275 if (remapped == -1)
2276 continue;
2277
2278 assert(remapped <= (int)i);
2279 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2280 }
2281 }
2282
2283 /**
2284 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2285 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2286 */
2287 void
2288 fs_visitor::demote_pull_constants()
2289 {
2290 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2291 for (int i = 0; i < inst->sources; i++) {
2292 if (inst->src[i].file != UNIFORM)
2293 continue;
2294
2295 int pull_index;
2296 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2297 if (location >= uniforms) /* Out of bounds access */
2298 pull_index = -1;
2299 else
2300 pull_index = pull_constant_loc[location];
2301
2302 if (pull_index == -1)
2303 continue;
2304
2305            /* Set up the annotation tracking for newly generated instructions. */
2306 base_ir = inst->ir;
2307 current_annotation = inst->annotation;
2308
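            /* surf_index is the binding table entry for the pull constant
             * buffer; dst is a scratch VGRF that receives the loaded value.
             */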
2309 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2310 fs_reg dst = vgrf(glsl_type::float_type);
2311
2312 /* Generate a pull load into dst. */
2313 if (inst->src[i].reladdr) {
2314 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2315 surf_index,
2316 *inst->src[i].reladdr,
2317 pull_index);
2318 inst->insert_before(block, &list);
2319 inst->src[i].reladdr = NULL;
2320 } else {
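               /* A uniform pull load fetches an aligned vec4; pick out the
                * desired component with the smeared region set up below.
                */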
2321 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2322 fs_inst *pull =
2323 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2324 dst, surf_index, offset);
2325 inst->insert_before(block, pull);
2326 inst->src[i].set_smear(pull_index & 3);
2327 }
2328
2329 /* Rewrite the instruction to use the temporary VGRF. */
2330 inst->src[i].file = GRF;
2331 inst->src[i].reg = dst.reg;
2332 inst->src[i].reg_offset = 0;
2333 inst->src[i].width = dispatch_width;
2334 }
2335 }
2336 invalidate_live_intervals();
2337 }
2338
2339 bool
2340 fs_visitor::opt_algebraic()
2341 {
2342 bool progress = false;
2343
2344 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2345 switch (inst->opcode) {
2346 case BRW_OPCODE_MOV:
2347 if (inst->src[0].file != IMM)
2348 break;
2349
2350 if (inst->saturate) {
2351 if (inst->dst.type != inst->src[0].type)
2352 assert(!"unimplemented: saturate mixed types");
2353
2354 if (brw_saturate_immediate(inst->dst.type,
2355 &inst->src[0].fixed_hw_reg)) {
2356 inst->saturate = false;
2357 progress = true;
2358 }
2359 }
2360 break;
2361
2362 case BRW_OPCODE_MUL:
2363 if (inst->src[1].file != IMM)
2364 continue;
2365
2366 /* a * 1.0 = a */
2367 if (inst->src[1].is_one()) {
2368 inst->opcode = BRW_OPCODE_MOV;
2369 inst->src[1] = reg_undef;
2370 progress = true;
2371 break;
2372 }
2373
2374 /* a * -1.0 = -a */
2375 if (inst->src[1].is_negative_one()) {
2376 inst->opcode = BRW_OPCODE_MOV;
2377 inst->src[0].negate = !inst->src[0].negate;
2378 inst->src[1] = reg_undef;
2379 progress = true;
2380 break;
2381 }
2382
2383 /* a * 0.0 = 0.0 */
2384 if (inst->src[1].is_zero()) {
2385 inst->opcode = BRW_OPCODE_MOV;
2386 inst->src[0] = inst->src[1];
2387 inst->src[1] = reg_undef;
2388 progress = true;
2389 break;
2390 }
2391
2392 if (inst->src[0].file == IMM) {
2393 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2394 inst->opcode = BRW_OPCODE_MOV;
2395 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2396 inst->src[1] = reg_undef;
2397 progress = true;
2398 break;
2399 }
2400 break;
2401 case BRW_OPCODE_ADD:
2402 if (inst->src[1].file != IMM)
2403 continue;
2404
2405 /* a + 0.0 = a */
2406 if (inst->src[1].is_zero()) {
2407 inst->opcode = BRW_OPCODE_MOV;
2408 inst->src[1] = reg_undef;
2409 progress = true;
2410 break;
2411 }
2412
2413 if (inst->src[0].file == IMM) {
2414 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2415 inst->opcode = BRW_OPCODE_MOV;
2416 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2417 inst->src[1] = reg_undef;
2418 progress = true;
2419 break;
2420 }
2421 break;
2422 case BRW_OPCODE_OR:
2423 if (inst->src[0].equals(inst->src[1])) {
2424 inst->opcode = BRW_OPCODE_MOV;
2425 inst->src[1] = reg_undef;
2426 progress = true;
2427 break;
2428 }
2429 break;
2430 case BRW_OPCODE_LRP:
2431 if (inst->src[1].equals(inst->src[2])) {
2432 inst->opcode = BRW_OPCODE_MOV;
2433 inst->src[0] = inst->src[1];
2434 inst->src[1] = reg_undef;
2435 inst->src[2] = reg_undef;
2436 progress = true;
2437 break;
2438 }
2439 break;
2440 case BRW_OPCODE_CMP:
2441 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2442 inst->src[0].abs &&
2443 inst->src[0].negate &&
2444 inst->src[1].is_zero()) {
2445 inst->src[0].abs = false;
2446 inst->src[0].negate = false;
2447 inst->conditional_mod = BRW_CONDITIONAL_Z;
2448 progress = true;
2449 break;
2450 }
2451 break;
2452 case BRW_OPCODE_SEL:
2453 if (inst->src[0].equals(inst->src[1])) {
2454 inst->opcode = BRW_OPCODE_MOV;
2455 inst->src[1] = reg_undef;
2456 inst->predicate = BRW_PREDICATE_NONE;
2457 inst->predicate_inverse = false;
2458 progress = true;
2459 } else if (inst->saturate && inst->src[1].file == IMM) {
2460 switch (inst->conditional_mod) {
2461 case BRW_CONDITIONAL_LE:
2462 case BRW_CONDITIONAL_L:
2463 switch (inst->src[1].type) {
2464 case BRW_REGISTER_TYPE_F:
2465 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2466 inst->opcode = BRW_OPCODE_MOV;
2467 inst->src[1] = reg_undef;
2468 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2469 progress = true;
2470 }
2471 break;
2472 default:
2473 break;
2474 }
2475 break;
2476 case BRW_CONDITIONAL_GE:
2477 case BRW_CONDITIONAL_G:
2478 switch (inst->src[1].type) {
2479 case BRW_REGISTER_TYPE_F:
2480 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2481 inst->opcode = BRW_OPCODE_MOV;
2482 inst->src[1] = reg_undef;
2483 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2484 progress = true;
2485 }
2486 break;
2487 default:
2488 break;
2489 }
2490 default:
2491 break;
2492 }
2493 }
2494 break;
2495 case BRW_OPCODE_MAD:
2496 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2497 inst->opcode = BRW_OPCODE_MOV;
2498 inst->src[1] = reg_undef;
2499 inst->src[2] = reg_undef;
2500 progress = true;
2501 } else if (inst->src[0].is_zero()) {
2502 inst->opcode = BRW_OPCODE_MUL;
2503 inst->src[0] = inst->src[2];
2504 inst->src[2] = reg_undef;
2505 progress = true;
2506 } else if (inst->src[1].is_one()) {
2507 inst->opcode = BRW_OPCODE_ADD;
2508 inst->src[1] = inst->src[2];
2509 inst->src[2] = reg_undef;
2510 progress = true;
2511 } else if (inst->src[2].is_one()) {
2512 inst->opcode = BRW_OPCODE_ADD;
2513 inst->src[2] = reg_undef;
2514 progress = true;
2515 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
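            /* Both multiplicand sources are immediates: fold their product
             * into src[1] and turn the MAD into an ADD of src[0] and the
             * folded constant.
             */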
2516 inst->opcode = BRW_OPCODE_ADD;
2517 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2518 inst->src[2] = reg_undef;
2519 progress = true;
2520 }
2521 break;
2522 case SHADER_OPCODE_RCP: {
2523 fs_inst *prev = (fs_inst *)inst->prev;
2524 if (prev->opcode == SHADER_OPCODE_SQRT) {
2525 if (inst->src[0].equals(prev->dst)) {
2526 inst->opcode = SHADER_OPCODE_RSQ;
2527 inst->src[0] = prev->src[0];
2528 progress = true;
2529 }
2530 }
2531 break;
2532 }
2533 default:
2534 break;
2535 }
2536
2537 /* Swap if src[0] is immediate. */
2538 if (progress && inst->is_commutative()) {
2539 if (inst->src[0].file == IMM) {
2540 fs_reg tmp = inst->src[1];
2541 inst->src[1] = inst->src[0];
2542 inst->src[0] = tmp;
2543 }
2544 }
2545 }
2546 return progress;
2547 }
2548
2549 /**
2550 * Optimize sample messages that have constant zero values for the trailing
2551 * texture coordinates. We can just reduce the message length for these
2552 * instructions instead of reserving a register for it. Trailing parameters
2553  * that aren't sent default to zero anyway. This lets the dead code
2554  * eliminator remove the MOV instruction that was emitted to set up the
2555  * zero value.
2556 */
2557 bool
2558 fs_visitor::opt_zero_samples()
2559 {
2560 /* Gen4 infers the texturing opcode based on the message length so we can't
2561 * change it.
2562 */
2563 if (devinfo->gen < 5)
2564 return false;
2565
2566 bool progress = false;
2567
2568 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2569 if (!inst->is_tex())
2570 continue;
2571
2572 fs_inst *load_payload = (fs_inst *) inst->prev;
2573
2574 if (load_payload->is_head_sentinel() ||
2575 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2576 continue;
2577
2578          /* We don't want to remove the message header. Removing all of the
2579           * parameters is also avoided, because it seems to cause a GPU hang and
2580           * I can't find any documentation indicating that this is expected.
2581 */
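         /* Each trailing parameter occupies dispatch_width / 8 registers. Keep
          * trimming while the last parameter in the payload is a zero
          * immediate, but never shrink below the header plus one parameter.
          */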
2582 while (inst->mlen > inst->header_present + dispatch_width / 8 &&
2583 load_payload->src[(inst->mlen - inst->header_present) /
2584 (dispatch_width / 8) +
2585 inst->header_present - 1].is_zero()) {
2586 inst->mlen -= dispatch_width / 8;
2587 progress = true;
2588 }
2589 }
2590
2591 if (progress)
2592 invalidate_live_intervals();
2593
2594 return progress;
2595 }
2596
2597 /**
2598 * Optimize sample messages which are followed by the final RT write.
2599 *
2600 * CHV, and GEN9+ can mark a texturing SEND instruction with EOT to have its
2601 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2602 * final texturing results copied to the framebuffer write payload and modify
2603 * them to write to the framebuffer directly.
2604 */
2605 bool
2606 fs_visitor::opt_sampler_eot()
2607 {
2608 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2609
2610 if (stage != MESA_SHADER_FRAGMENT)
2611 return false;
2612
2613 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2614 return false;
2615
2616 /* FINISHME: It should be possible to implement this optimization when there
2617 * are multiple drawbuffers.
2618 */
2619 if (key->nr_color_regions != 1)
2620 return false;
2621
2622 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2623 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2624 assert(fb_write->eot);
2625 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2626
2627 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2628
2629 /* There wasn't one; nothing to do. */
2630 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2631 return false;
2632
2633 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2634 * It's very likely to be the previous instruction.
2635 */
2636 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2637 if (load_payload->is_head_sentinel() ||
2638 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2639 return false;
2640
2641 assert(!tex_inst->eot); /* We can't get here twice */
2642 assert((tex_inst->offset & (0xff << 24)) == 0);
2643
2644 tex_inst->offset |= fb_write->target << 24;
2645 tex_inst->eot = true;
2646 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2647
2648 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2649 * to create a new LOAD_PAYLOAD command with the same sources and a space
2650 * saved for the header. Using a new destination register not only makes sure
2651 * we have enough space, but it will make sure the dead code eliminator kills
2652 * the instruction that this will replace.
2653 */
2654 if (tex_inst->header_present)
2655 return true;
2656
2657 fs_reg send_header = vgrf(load_payload->sources + 1);
2658 fs_reg *new_sources =
2659 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2660
2661 new_sources[0] = fs_reg();
2662 for (int i = 0; i < load_payload->sources; i++)
2663 new_sources[i+1] = load_payload->src[i];
2664
2665 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2666 * requires a lot of information about the sources to appropriately figure
2667 * out the number of registers needed to be used. Given this stage in our
2668 * optimization, we may not have the appropriate GRFs required by
2669 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2670 * manually emit the instruction.
2671 */
2672 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2673 load_payload->exec_size,
2674 send_header,
2675 new_sources,
2676 load_payload->sources + 1);
2677
2678 new_load_payload->regs_written = load_payload->regs_written + 1;
2679 tex_inst->mlen++;
2680 tex_inst->header_present = true;
2681 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2682 tex_inst->src[0] = send_header;
2683 tex_inst->dst = reg_null_ud;
2684
2685 return true;
2686 }
2687
2688 bool
2689 fs_visitor::opt_register_renaming()
2690 {
2691 bool progress = false;
2692 int depth = 0;
2693
2694 int remap[alloc.count];
2695 memset(remap, -1, sizeof(int) * alloc.count);
2696
2697 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2698 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2699 depth++;
2700 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2701 inst->opcode == BRW_OPCODE_WHILE) {
2702 depth--;
2703 }
2704
2705 /* Rewrite instruction sources. */
2706 for (int i = 0; i < inst->sources; i++) {
2707 if (inst->src[i].file == GRF &&
2708 remap[inst->src[i].reg] != -1 &&
2709 remap[inst->src[i].reg] != inst->src[i].reg) {
2710 inst->src[i].reg = remap[inst->src[i].reg];
2711 progress = true;
2712 }
2713 }
2714
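      /* A complete write at the top level (depth == 0) of a VGRF whose
       * allocated size matches the instruction width starts a new name for
       * that register: the first such write keeps the original number, and
       * each later one gets a fresh VGRF so earlier readers keep their value.
       */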
2715 const int dst = inst->dst.reg;
2716
2717 if (depth == 0 &&
2718 inst->dst.file == GRF &&
2719 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2720 !inst->is_partial_write()) {
2721 if (remap[dst] == -1) {
2722 remap[dst] = dst;
2723 } else {
2724 remap[dst] = alloc.allocate(inst->dst.width / 8);
2725 inst->dst.reg = remap[dst];
2726 progress = true;
2727 }
2728 } else if (inst->dst.file == GRF &&
2729 remap[dst] != -1 &&
2730 remap[dst] != dst) {
2731 inst->dst.reg = remap[dst];
2732 progress = true;
2733 }
2734 }
2735
2736 if (progress) {
2737 invalidate_live_intervals();
2738
2739 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2740 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2741 delta_xy[i].reg = remap[delta_xy[i].reg];
2742 }
2743 }
2744 }
2745
2746 return progress;
2747 }
2748
2749 /**
2750 * Remove redundant or useless discard jumps.
2751 *
2752 * For example, we can eliminate jumps in the following sequence:
2753 *
2754 * discard-jump (redundant with the next jump)
2755 * discard-jump (useless; jumps to the next instruction)
2756 * placeholder-halt
2757 */
2758 bool
2759 fs_visitor::opt_redundant_discard_jumps()
2760 {
2761 bool progress = false;
2762
2763 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2764
2765 fs_inst *placeholder_halt = NULL;
2766 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2767 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2768 placeholder_halt = inst;
2769 break;
2770 }
2771 }
2772
2773 if (!placeholder_halt)
2774 return false;
2775
2776 /* Delete any HALTs immediately before the placeholder halt. */
2777 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2778 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2779 prev = (fs_inst *) placeholder_halt->prev) {
2780 prev->remove(last_bblock);
2781 progress = true;
2782 }
2783
2784 if (progress)
2785 invalidate_live_intervals();
2786
2787 return progress;
2788 }
2789
2790 bool
2791 fs_visitor::compute_to_mrf()
2792 {
2793 bool progress = false;
2794 int next_ip = 0;
2795
2796 /* No MRFs on Gen >= 7. */
2797 if (devinfo->gen >= 7)
2798 return false;
2799
2800 calculate_live_intervals();
2801
2802 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2803 int ip = next_ip;
2804 next_ip++;
2805
2806 if (inst->opcode != BRW_OPCODE_MOV ||
2807 inst->is_partial_write() ||
2808 inst->dst.file != MRF || inst->src[0].file != GRF ||
2809 inst->dst.type != inst->src[0].type ||
2810 inst->src[0].abs || inst->src[0].negate ||
2811 !inst->src[0].is_contiguous() ||
2812 inst->src[0].subreg_offset)
2813 continue;
2814
2815 /* Work out which hardware MRF registers are written by this
2816 * instruction.
2817 */
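      /* A COMPR4 write lands in mN and mN+4, a plain SIMD16 write covers two
       * adjacent MRFs, and a SIMD8 write touches a single one.
       */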
2818 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2819 int mrf_high;
2820 if (inst->dst.reg & BRW_MRF_COMPR4) {
2821 mrf_high = mrf_low + 4;
2822 } else if (inst->exec_size == 16) {
2823 mrf_high = mrf_low + 1;
2824 } else {
2825 mrf_high = mrf_low;
2826 }
2827
2828 /* Can't compute-to-MRF this GRF if someone else was going to
2829 * read it later.
2830 */
2831 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2832 continue;
2833
2834 /* Found a move of a GRF to a MRF. Let's see if we can go
2835 * rewrite the thing that made this GRF to write into the MRF.
2836 */
2837 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2838 if (scan_inst->dst.file == GRF &&
2839 scan_inst->dst.reg == inst->src[0].reg) {
2840 /* Found the last thing to write our reg we want to turn
2841 * into a compute-to-MRF.
2842 */
2843
2844 /* If this one instruction didn't populate all the
2845 * channels, bail. We might be able to rewrite everything
2846 * that writes that reg, but it would require smarter
2847 * tracking to delay the rewriting until complete success.
2848 */
2849 if (scan_inst->is_partial_write())
2850 break;
2851
2852 /* Things returning more than one register would need us to
2853 * understand coalescing out more than one MOV at a time.
2854 */
2855 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2856 break;
2857
2858 /* SEND instructions can't have MRF as a destination. */
2859 if (scan_inst->mlen)
2860 break;
2861
2862 if (devinfo->gen == 6) {
2863 /* gen6 math instructions must have the destination be
2864 * GRF, so no compute-to-MRF for them.
2865 */
2866 if (scan_inst->is_math()) {
2867 break;
2868 }
2869 }
2870
2871 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2872 /* Found the creator of our MRF's source value. */
2873 scan_inst->dst.file = MRF;
2874 scan_inst->dst.reg = inst->dst.reg;
2875 scan_inst->saturate |= inst->saturate;
2876 inst->remove(block);
2877 progress = true;
2878 }
2879 break;
2880 }
2881
2882          /* We don't handle control flow here.  Most computation of
2883           * values that end up in MRFs happens shortly before the MRF
2884           * write anyway.
2885 */
2886 if (block->start() == scan_inst)
2887 break;
2888
2889 /* You can't read from an MRF, so if someone else reads our
2890 * MRF's source GRF that we wanted to rewrite, that stops us.
2891 */
2892 bool interfered = false;
2893 for (int i = 0; i < scan_inst->sources; i++) {
2894 if (scan_inst->src[i].file == GRF &&
2895 scan_inst->src[i].reg == inst->src[0].reg &&
2896 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2897 interfered = true;
2898 }
2899 }
2900 if (interfered)
2901 break;
2902
2903 if (scan_inst->dst.file == MRF) {
2904 /* If somebody else writes our MRF here, we can't
2905 * compute-to-MRF before that.
2906 */
2907 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2908 int scan_mrf_high;
2909
2910 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2911 scan_mrf_high = scan_mrf_low + 4;
2912 } else if (scan_inst->exec_size == 16) {
2913 scan_mrf_high = scan_mrf_low + 1;
2914 } else {
2915 scan_mrf_high = scan_mrf_low;
2916 }
2917
2918 if (mrf_low == scan_mrf_low ||
2919 mrf_low == scan_mrf_high ||
2920 mrf_high == scan_mrf_low ||
2921 mrf_high == scan_mrf_high) {
2922 break;
2923 }
2924 }
2925
2926 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2927 /* Found a SEND instruction, which means that there are
2928 * live values in MRFs from base_mrf to base_mrf +
2929 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2930 * above it.
2931 */
2932 if (mrf_low >= scan_inst->base_mrf &&
2933 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2934 break;
2935 }
2936 if (mrf_high >= scan_inst->base_mrf &&
2937 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2938 break;
2939 }
2940 }
2941 }
2942 }
2943
2944 if (progress)
2945 invalidate_live_intervals();
2946
2947 return progress;
2948 }
2949
2950 /**
2951 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2952 * instructions to FS_OPCODE_REP_FB_WRITE.
2953 */
2954 void
2955 fs_visitor::emit_repclear_shader()
2956 {
2957 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2958 int base_mrf = 1;
2959 int color_mrf = base_mrf + 2;
2960
2961 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2962 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2963 mov->force_writemask_all = true;
2964
2965 fs_inst *write;
2966 if (key->nr_color_regions == 1) {
2967 write = emit(FS_OPCODE_REP_FB_WRITE);
2968 write->saturate = key->clamp_fragment_color;
2969 write->base_mrf = color_mrf;
2970 write->target = 0;
2971 write->header_present = false;
2972 write->mlen = 1;
2973 } else {
2974 assume(key->nr_color_regions > 0);
2975 for (int i = 0; i < key->nr_color_regions; ++i) {
2976 write = emit(FS_OPCODE_REP_FB_WRITE);
2977 write->saturate = key->clamp_fragment_color;
2978 write->base_mrf = base_mrf;
2979 write->target = i;
2980 write->header_present = true;
2981 write->mlen = 3;
2982 }
2983 }
2984 write->eot = true;
2985
2986 calculate_cfg();
2987
2988 assign_constant_locations();
2989 assign_curb_setup();
2990
2991 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2992 assert(mov->src[0].file == HW_REG);
2993 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2994 }
2995
2996 /**
2997 * Walks through basic blocks, looking for repeated MRF writes and
2998 * removing the later ones.
2999 */
3000 bool
3001 fs_visitor::remove_duplicate_mrf_writes()
3002 {
3003 fs_inst *last_mrf_move[16];
3004 bool progress = false;
3005
3006    /* We'd need to update the MRF tracking for compressed (SIMD16) instructions. */
3007 if (dispatch_width == 16)
3008 return false;
3009
3010 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3011
3012 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3013 if (inst->is_control_flow()) {
3014 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3015 }
3016
3017 if (inst->opcode == BRW_OPCODE_MOV &&
3018 inst->dst.file == MRF) {
3019 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3020 if (prev_inst && inst->equals(prev_inst)) {
3021 inst->remove(block);
3022 progress = true;
3023 continue;
3024 }
3025 }
3026
3027 /* Clear out the last-write records for MRFs that were overwritten. */
3028 if (inst->dst.file == MRF) {
3029 last_mrf_move[inst->dst.reg] = NULL;
3030 }
3031
3032 if (inst->mlen > 0 && inst->base_mrf != -1) {
3033 /* Found a SEND instruction, which will include two or fewer
3034 * implied MRF writes. We could do better here.
3035 */
3036 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3037 last_mrf_move[inst->base_mrf + i] = NULL;
3038 }
3039 }
3040
3041 /* Clear out any MRF move records whose sources got overwritten. */
3042 if (inst->dst.file == GRF) {
3043 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3044 if (last_mrf_move[i] &&
3045 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3046 last_mrf_move[i] = NULL;
3047 }
3048 }
3049 }
3050
3051 if (inst->opcode == BRW_OPCODE_MOV &&
3052 inst->dst.file == MRF &&
3053 inst->src[0].file == GRF &&
3054 !inst->is_partial_write()) {
3055 last_mrf_move[inst->dst.reg] = inst;
3056 }
3057 }
3058
3059 if (progress)
3060 invalidate_live_intervals();
3061
3062 return progress;
3063 }
3064
3065 static void
3066 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3067 {
3068 /* Clear the flag for registers that actually got read (as expected). */
3069 for (int i = 0; i < inst->sources; i++) {
3070 int grf;
3071 if (inst->src[i].file == GRF) {
3072 grf = inst->src[i].reg;
3073 } else if (inst->src[i].file == HW_REG &&
3074 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3075 grf = inst->src[i].fixed_hw_reg.nr;
3076 } else {
3077 continue;
3078 }
3079
3080 if (grf >= first_grf &&
3081 grf < first_grf + grf_len) {
3082 deps[grf - first_grf] = false;
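         /* A SIMD16 source spans two adjacent GRFs, so clear the dependency
          * on the second register as well.
          */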
3083 if (inst->exec_size == 16)
3084 deps[grf - first_grf + 1] = false;
3085 }
3086 }
3087 }
3088
3089 /**
3090 * Implements this workaround for the original 965:
3091 *
3092 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3093 * check for post destination dependencies on this instruction, software
3094 * must ensure that there is no destination hazard for the case of ‘write
3095 * followed by a posted write’ shown in the following example.
3096 *
3097 * 1. mov r3 0
3098 * 2. send r3.xy <rest of send instruction>
3099 * 3. mov r2 r3
3100 *
3101 * Due to no post-destination dependency check on the ‘send’, the above
3102 * code sequence could have two instructions (1 and 2) in flight at the
3103 * same time that both consider ‘r3’ as the target of their final writes.
3104 */
3105 void
3106 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3107 fs_inst *inst)
3108 {
3109 int write_len = inst->regs_written;
3110 int first_write_grf = inst->dst.reg;
3111 bool needs_dep[BRW_MAX_MRF];
3112 assert(write_len < (int)sizeof(needs_dep) - 1);
3113
3114 memset(needs_dep, false, sizeof(needs_dep));
3115 memset(needs_dep, true, write_len);
3116
3117 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3118
3119 /* Walk backwards looking for writes to registers we're writing which
3120 * aren't read since being written. If we hit the start of the program,
3121 * we assume that there are no outstanding dependencies on entry to the
3122 * program.
3123 */
3124 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3125 /* If we hit control flow, assume that there *are* outstanding
3126 * dependencies, and force their cleanup before our instruction.
3127 */
3128 if (block->start() == scan_inst) {
3129 for (int i = 0; i < write_len; i++) {
3130 if (needs_dep[i]) {
3131 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3132 }
3133 }
3134 return;
3135 }
3136
3137       /* We insert our reads as late as possible on the assumption that any
3138        * instruction that might have left us an outstanding dependency has
3139        * more latency than the MOV we use to resolve it.
3140 */
3141 if (scan_inst->dst.file == GRF) {
3142 for (int i = 0; i < scan_inst->regs_written; i++) {
3143 int reg = scan_inst->dst.reg + i;
3144
3145 if (reg >= first_write_grf &&
3146 reg < first_write_grf + write_len &&
3147 needs_dep[reg - first_write_grf]) {
3148 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3149 needs_dep[reg - first_write_grf] = false;
3150 if (scan_inst->exec_size == 16)
3151 needs_dep[reg - first_write_grf + 1] = false;
3152 }
3153 }
3154 }
3155
3156 /* Clear the flag for registers that actually got read (as expected). */
3157 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3158
3159 /* Continue the loop only if we haven't resolved all the dependencies */
3160 int i;
3161 for (i = 0; i < write_len; i++) {
3162 if (needs_dep[i])
3163 break;
3164 }
3165 if (i == write_len)
3166 return;
3167 }
3168 }
3169
3170 /**
3171 * Implements this workaround for the original 965:
3172 *
3173 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3174 * used as a destination register until after it has been sourced by an
3175 * instruction with a different destination register.
3176 */
3177 void
3178 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3179 {
3180 int write_len = inst->regs_written;
3181 int first_write_grf = inst->dst.reg;
3182 bool needs_dep[BRW_MAX_MRF];
3183 assert(write_len < (int)sizeof(needs_dep) - 1);
3184
3185 memset(needs_dep, false, sizeof(needs_dep));
3186 memset(needs_dep, true, write_len);
3187 /* Walk forwards looking for writes to registers we're writing which aren't
3188 * read before being written.
3189 */
3190 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3191 /* If we hit control flow, force resolve all remaining dependencies. */
3192 if (block->end() == scan_inst) {
3193 for (int i = 0; i < write_len; i++) {
3194 if (needs_dep[i])
3195 scan_inst->insert_before(block,
3196 DEP_RESOLVE_MOV(first_write_grf + i));
3197 }
3198 return;
3199 }
3200
3201 /* Clear the flag for registers that actually got read (as expected). */
3202 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3203
3204 /* We insert our reads as late as possible since they're reading the
3205 * result of a SEND, which has massive latency.
3206 */
3207 if (scan_inst->dst.file == GRF &&
3208 scan_inst->dst.reg >= first_write_grf &&
3209 scan_inst->dst.reg < first_write_grf + write_len &&
3210 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3211 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3212 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3213 }
3214
3215 /* Continue the loop only if we haven't resolved all the dependencies */
3216 int i;
3217 for (i = 0; i < write_len; i++) {
3218 if (needs_dep[i])
3219 break;
3220 }
3221 if (i == write_len)
3222 return;
3223 }
3224 }
3225
3226 void
3227 fs_visitor::insert_gen4_send_dependency_workarounds()
3228 {
3229 if (devinfo->gen != 4 || devinfo->is_g4x)
3230 return;
3231
3232 bool progress = false;
3233
3234 /* Note that we're done with register allocation, so GRF fs_regs always
3235 * have a .reg_offset of 0.
3236 */
3237
3238 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3239 if (inst->mlen != 0 && inst->dst.file == GRF) {
3240 insert_gen4_pre_send_dependency_workarounds(block, inst);
3241 insert_gen4_post_send_dependency_workarounds(block, inst);
3242 progress = true;
3243 }
3244 }
3245
3246 if (progress)
3247 invalidate_live_intervals();
3248 }
3249
3250 /**
3251 * Turns the generic expression-style uniform pull constant load instruction
3252 * into a hardware-specific series of instructions for loading a pull
3253 * constant.
3254 *
3255 * The expression style allows the CSE pass before this to optimize out
3256 * repeated loads from the same offset, and gives the pre-register-allocation
3257 * scheduling full flexibility, while the conversion to native instructions
3258 * allows the post-register-allocation scheduler the best information
3259 * possible.
3260 *
3261 * Note that execution masking for setting up pull constant loads is special:
3262 * the channels that need to be written are unrelated to the current execution
3263 * mask, since a later instruction will use one of the result channels as a
3264 * source operand for all 8 or 16 of its channels.
3265 */
3266 void
3267 fs_visitor::lower_uniform_pull_constant_loads()
3268 {
3269 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3270 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3271 continue;
3272
3273 if (devinfo->gen >= 7) {
3274 /* The offset arg before was a vec4-aligned byte offset. We need to
3275 * turn it into a dword offset.
3276 */
3277 fs_reg const_offset_reg = inst->src[1];
3278 assert(const_offset_reg.file == IMM &&
3279 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3280 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3281 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3282
3283 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3284 * Reserve space for the register.
3285 */
3286 if (devinfo->gen >= 9) {
3287 payload.reg_offset++;
3288 alloc.sizes[payload.reg] = 2;
3289 }
3290
3291 /* This is actually going to be a MOV, but since only the first dword
3292 * is accessed, we have a special opcode to do just that one. Note
3293 * that this needs to be an operation that will be considered a def
3294 * by live variable analysis, or register allocation will explode.
3295 */
3296 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3297 8, payload, const_offset_reg);
3298 setup->force_writemask_all = true;
3299
3300 setup->ir = inst->ir;
3301 setup->annotation = inst->annotation;
3302 inst->insert_before(block, setup);
3303
3304 /* Similarly, this will only populate the first 4 channels of the
3305 * result register (since we only use smear values from 0-3), but we
3306 * don't tell the optimizer.
3307 */
3308 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3309 inst->src[1] = payload;
3310
3311 invalidate_live_intervals();
3312 } else {
3313 /* Before register allocation, we didn't tell the scheduler about the
3314 * MRF we use. We know it's safe to use this MRF because nothing
3315 * else does except for register spill/unspill, which generates and
3316 * uses its MRF within a single IR instruction.
3317 */
3318 inst->base_mrf = 14;
3319 inst->mlen = 1;
3320 }
3321 }
3322 }
3323
3324 bool
3325 fs_visitor::lower_load_payload()
3326 {
3327 bool progress = false;
3328
3329 int vgrf_to_reg[alloc.count];
3330 int reg_count = 0;
3331 for (unsigned i = 0; i < alloc.count; ++i) {
3332 vgrf_to_reg[i] = reg_count;
3333 reg_count += alloc.sizes[i];
3334 }
3335
3336 struct {
3337 bool written:1; /* Whether this register has ever been written */
3338 bool force_writemask_all:1;
3339 bool force_sechalf:1;
3340 } metadata[reg_count];
3341 memset(metadata, 0, sizeof(metadata));
3342
3343 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3344 if (inst->dst.file == GRF) {
3345 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3346 bool force_sechalf = inst->force_sechalf &&
3347 !inst->force_writemask_all;
3348 bool toggle_sechalf = inst->dst.width == 16 &&
3349 type_sz(inst->dst.type) == 4 &&
3350 !inst->force_writemask_all;
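         /* A 16-wide write of a 32-bit type spans two GRFs whose channels
          * belong to alternating halves of the execution mask, so toggle
          * force_sechalf for each register written.
          */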
3351 for (int i = 0; i < inst->regs_written; ++i) {
3352 metadata[dst_reg + i].written = true;
3353 metadata[dst_reg + i].force_sechalf = force_sechalf;
3354 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3355 force_sechalf = (toggle_sechalf != force_sechalf);
3356 }
3357 }
3358
3359 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3360 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3361 fs_reg dst = inst->dst;
3362
3363 for (int i = 0; i < inst->sources; i++) {
3364 dst.width = inst->src[i].effective_width;
3365 dst.type = inst->src[i].type;
3366
3367 if (inst->src[i].file == BAD_FILE) {
3368               /* Do nothing; the destination offset still advances as normal below. */
3369 } else if (dst.file == MRF &&
3370 dst.width == 8 &&
3371 devinfo->has_compr4 &&
3372 i + 4 < inst->sources &&
3373 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3374 fs_reg compr4_dst = dst;
3375 compr4_dst.reg += BRW_MRF_COMPR4;
3376 compr4_dst.width = 16;
3377 fs_reg compr4_src = inst->src[i];
3378 compr4_src.width = 16;
3379 fs_inst *mov = MOV(compr4_dst, compr4_src);
3380 mov->force_writemask_all = true;
3381 inst->insert_before(block, mov);
3382 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3383 inst->src[i + 4].file = BAD_FILE;
3384 } else {
3385 fs_inst *mov = MOV(dst, inst->src[i]);
3386 if (inst->src[i].file == GRF) {
3387 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3388 inst->src[i].reg_offset;
3389 mov->force_sechalf = metadata[src_reg].force_sechalf;
3390 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3391 } else {
3392 /* We don't have any useful metadata for immediates or
3393 * uniforms. Assume that any of the channels of the
3394 * destination may be used.
3395 */
3396 assert(inst->src[i].file == IMM ||
3397 inst->src[i].file == UNIFORM);
3398 mov->force_writemask_all = true;
3399 }
3400
3401 if (dst.file == GRF) {
3402 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3403 const bool force_writemask = mov->force_writemask_all;
3404 metadata[dst_reg].force_writemask_all = force_writemask;
3405 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3406 if (dst.width * type_sz(dst.type) > 32) {
3407 assert(!mov->force_sechalf);
3408 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3409 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3410 }
3411 }
3412
3413 inst->insert_before(block, mov);
3414 }
3415
3416 dst = offset(dst, 1);
3417 }
3418
3419 inst->remove(block);
3420 progress = true;
3421 }
3422 }
3423
3424 if (progress)
3425 invalidate_live_intervals();
3426
3427 return progress;
3428 }
3429
3430 void
3431 fs_visitor::dump_instructions()
3432 {
3433 dump_instructions(NULL);
3434 }
3435
3436 void
3437 fs_visitor::dump_instructions(const char *name)
3438 {
3439 FILE *file = stderr;
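   /* Only dump to a named file when not running as root; presumably this
    * avoids having a privileged (e.g. setuid) process create files at an
    * attacker-controlled path. Otherwise fall back to stderr.
    */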
3440 if (name && geteuid() != 0) {
3441 file = fopen(name, "w");
3442 if (!file)
3443 file = stderr;
3444 }
3445
3446 if (cfg) {
3447 calculate_register_pressure();
3448 int ip = 0, max_pressure = 0;
3449 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3450 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3451 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3452 dump_instruction(inst, file);
3453 ip++;
3454 }
3455 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3456 } else {
3457 int ip = 0;
3458 foreach_in_list(backend_instruction, inst, &instructions) {
3459 fprintf(file, "%4d: ", ip++);
3460 dump_instruction(inst, file);
3461 }
3462 }
3463
3464 if (file != stderr) {
3465 fclose(file);
3466 }
3467 }
3468
3469 void
3470 fs_visitor::dump_instruction(backend_instruction *be_inst)
3471 {
3472 dump_instruction(be_inst, stderr);
3473 }
3474
3475 void
3476 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3477 {
3478 fs_inst *inst = (fs_inst *)be_inst;
3479
3480 if (inst->predicate) {
3481 fprintf(file, "(%cf0.%d) ",
3482 inst->predicate_inverse ? '-' : '+',
3483 inst->flag_subreg);
3484 }
3485
3486 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3487 if (inst->saturate)
3488 fprintf(file, ".sat");
3489 if (inst->conditional_mod) {
3490 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3491 if (!inst->predicate &&
3492 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3493 inst->opcode != BRW_OPCODE_IF &&
3494 inst->opcode != BRW_OPCODE_WHILE))) {
3495 fprintf(file, ".f0.%d", inst->flag_subreg);
3496 }
3497 }
3498 fprintf(file, "(%d) ", inst->exec_size);
3499
3500
3501 switch (inst->dst.file) {
3502 case GRF:
3503 fprintf(file, "vgrf%d", inst->dst.reg);
3504 if (inst->dst.width != dispatch_width)
3505 fprintf(file, "@%d", inst->dst.width);
3506 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3507 inst->dst.subreg_offset)
3508 fprintf(file, "+%d.%d",
3509 inst->dst.reg_offset, inst->dst.subreg_offset);
3510 break;
3511 case MRF:
3512 fprintf(file, "m%d", inst->dst.reg);
3513 break;
3514 case BAD_FILE:
3515 fprintf(file, "(null)");
3516 break;
3517 case UNIFORM:
3518 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3519 break;
3520 case ATTR:
3521 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3522 break;
3523 case HW_REG:
3524 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3525 switch (inst->dst.fixed_hw_reg.nr) {
3526 case BRW_ARF_NULL:
3527 fprintf(file, "null");
3528 break;
3529 case BRW_ARF_ADDRESS:
3530 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3531 break;
3532 case BRW_ARF_ACCUMULATOR:
3533 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3534 break;
3535 case BRW_ARF_FLAG:
3536 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3537 inst->dst.fixed_hw_reg.subnr);
3538 break;
3539 default:
3540 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3541 inst->dst.fixed_hw_reg.subnr);
3542 break;
3543 }
3544 } else {
3545 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3546 }
3547 if (inst->dst.fixed_hw_reg.subnr)
3548 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3549 break;
3550 default:
3551 fprintf(file, "???");
3552 break;
3553 }
3554 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3555
3556 for (int i = 0; i < inst->sources; i++) {
3557 if (inst->src[i].negate)
3558 fprintf(file, "-");
3559 if (inst->src[i].abs)
3560 fprintf(file, "|");
3561 switch (inst->src[i].file) {
3562 case GRF:
3563 fprintf(file, "vgrf%d", inst->src[i].reg);
3564 if (inst->src[i].width != dispatch_width)
3565 fprintf(file, "@%d", inst->src[i].width);
3566 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3567 inst->src[i].subreg_offset)
3568 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3569 inst->src[i].subreg_offset);
3570 break;
3571 case MRF:
3572 fprintf(file, "***m%d***", inst->src[i].reg);
3573 break;
3574 case ATTR:
3575 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3576 break;
3577 case UNIFORM:
3578 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3579 if (inst->src[i].reladdr) {
3580 fprintf(file, "+reladdr");
3581 } else if (inst->src[i].subreg_offset) {
3582 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3583 inst->src[i].subreg_offset);
3584 }
3585 break;
3586 case BAD_FILE:
3587 fprintf(file, "(null)");
3588 break;
3589 case IMM:
3590 switch (inst->src[i].type) {
3591 case BRW_REGISTER_TYPE_F:
3592 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3593 break;
3594 case BRW_REGISTER_TYPE_W:
3595 case BRW_REGISTER_TYPE_D:
3596 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3597 break;
3598 case BRW_REGISTER_TYPE_UW:
3599 case BRW_REGISTER_TYPE_UD:
3600 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3601 break;
3602 case BRW_REGISTER_TYPE_VF:
3603 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3604 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3605 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3606 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3607 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3608 break;
3609 default:
3610 fprintf(file, "???");
3611 break;
3612 }
3613 break;
3614 case HW_REG:
3615 if (inst->src[i].fixed_hw_reg.negate)
3616 fprintf(file, "-");
3617 if (inst->src[i].fixed_hw_reg.abs)
3618 fprintf(file, "|");
3619 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3620 switch (inst->src[i].fixed_hw_reg.nr) {
3621 case BRW_ARF_NULL:
3622 fprintf(file, "null");
3623 break;
3624 case BRW_ARF_ADDRESS:
3625 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3626 break;
3627 case BRW_ARF_ACCUMULATOR:
3628 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3629 break;
3630 case BRW_ARF_FLAG:
3631 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3632 inst->src[i].fixed_hw_reg.subnr);
3633 break;
3634 default:
3635 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3636 inst->src[i].fixed_hw_reg.subnr);
3637 break;
3638 }
3639 } else {
3640 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3641 }
3642 if (inst->src[i].fixed_hw_reg.subnr)
3643 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3644 if (inst->src[i].fixed_hw_reg.abs)
3645 fprintf(file, "|");
3646 break;
3647 default:
3648 fprintf(file, "???");
3649 break;
3650 }
3651 if (inst->src[i].abs)
3652 fprintf(file, "|");
3653
3654 if (inst->src[i].file != IMM) {
3655 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3656 }
3657
3658 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3659 fprintf(file, ", ");
3660 }
3661
3662 fprintf(file, " ");
3663
3664 if (dispatch_width == 16 && inst->exec_size == 8) {
3665 if (inst->force_sechalf)
3666 fprintf(file, "2ndhalf ");
3667 else
3668 fprintf(file, "1sthalf ");
3669 }
3670
3671 fprintf(file, "\n");
3672 }
3673
3674 /**
3675 * Possibly returns an instruction that set up @param reg.
3676 *
3677 * Sometimes we want to take the result of some expression/variable
3678 * dereference tree and rewrite the instruction generating the result
3679 * of the tree. When processing the tree, we know that the
3680 * instructions generated are all writing temporaries that are dead
3681 * outside of this tree. So, if we have some instructions that write
3682 * a temporary, we're free to point that temp write somewhere else.
3683 *
3684 * Note that this doesn't guarantee that the instruction found wrote
3685 * only reg -- reg might be the size=4 destination of a texture instruction.
3686 */
3687 fs_inst *
3688 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3689 fs_inst *end,
3690 const fs_reg &reg)
3691 {
3692 if (end == start ||
3693 end->is_partial_write() ||
3694 reg.reladdr ||
3695 !reg.equals(end->dst)) {
3696 return NULL;
3697 } else {
3698 return end;
3699 }
3700 }
3701
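/**
 * Lay out the gen6+ fragment shader thread payload: dispatch masks and
 * pixel X/Y in r0-1, then (as enabled) barycentric coordinates, source
 * depth and W, the MSAA position offsets and the input coverage mask, in
 * the order the hardware delivers them.
 */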
3702 void
3703 fs_visitor::setup_payload_gen6()
3704 {
3705 bool uses_depth =
3706 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3707 unsigned barycentric_interp_modes =
3708 (stage == MESA_SHADER_FRAGMENT) ?
3709 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3710
3711 assert(devinfo->gen >= 6);
3712
3713 /* R0-1: masks, pixel X/Y coordinates. */
3714 payload.num_regs = 2;
3715 /* R2: only for 32-pixel dispatch. */
3716
3717 /* R3-26: barycentric interpolation coordinates. These appear in the
3718 * same order that they appear in the brw_wm_barycentric_interp_mode
3719 * enum. Each set of coordinates occupies 2 registers if dispatch width
3720 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3721 * appear if they were enabled using the "Barycentric Interpolation
3722 * Mode" bits in WM_STATE.
3723 */
3724 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3725 if (barycentric_interp_modes & (1 << i)) {
3726 payload.barycentric_coord_reg[i] = payload.num_regs;
3727 payload.num_regs += 2;
3728 if (dispatch_width == 16) {
3729 payload.num_regs += 2;
3730 }
3731 }
3732 }
3733
3734 /* R27: interpolated depth if uses source depth */
3735 if (uses_depth) {
3736 payload.source_depth_reg = payload.num_regs;
3737 payload.num_regs++;
3738 if (dispatch_width == 16) {
3739 /* R28: interpolated depth if not SIMD8. */
3740 payload.num_regs++;
3741 }
3742 }
3743 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3744 if (uses_depth) {
3745 payload.source_w_reg = payload.num_regs;
3746 payload.num_regs++;
3747 if (dispatch_width == 16) {
3748 /* R30: interpolated W if not SIMD8. */
3749 payload.num_regs++;
3750 }
3751 }
3752
3753 if (stage == MESA_SHADER_FRAGMENT) {
3754 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3755 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3756 prog_data->uses_pos_offset = key->compute_pos_offset;
3757 /* R31: MSAA position offsets. */
3758 if (prog_data->uses_pos_offset) {
3759 payload.sample_pos_reg = payload.num_regs;
3760 payload.num_regs++;
3761 }
3762 }
3763
3764 /* R32: MSAA input coverage mask */
3765 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3766 assert(devinfo->gen >= 7);
3767 payload.sample_mask_in_reg = payload.num_regs;
3768 payload.num_regs++;
3769 if (dispatch_width == 16) {
3770 /* R33: input coverage mask if not SIMD8. */
3771 payload.num_regs++;
3772 }
3773 }
3774
3775 /* R34-: bary for 32-pixel. */
3776 /* R58-59: interp W for 32-pixel. */
3777
3778 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3779 source_depth_to_render_target = true;
3780 }
3781 }
3782
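/**
 * The vertex shader payload is only two registers: the thread header in r0
 * and the URB handles in r1.
 */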
3783 void
3784 fs_visitor::setup_vs_payload()
3785 {
3786 /* R0: thread header, R1: urb handles */
3787 payload.num_regs = 2;
3788 }
3789
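/**
 * The fixed compute shader payload is a single register, r0.
 */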
3790 void
3791 fs_visitor::setup_cs_payload()
3792 {
3793 assert(brw->gen >= 7);
3794
3795 payload.num_regs = 1;
3796 }
3797
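/**
 * Lay out the fragment shader binding table: render target surfaces come
 * first (at least one, since we still write to a null renderbuffer when
 * there are no color regions), followed by the entries shared with the
 * other stages.
 */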
3798 void
3799 fs_visitor::assign_binding_table_offsets()
3800 {
3801 assert(stage == MESA_SHADER_FRAGMENT);
3802 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3803 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3804 uint32_t next_binding_table_offset = 0;
3805
3806 /* If there are no color regions, we still perform an FB write to a null
3807 * renderbuffer, which we place at surface index 0.
3808 */
3809 prog_data->binding_table.render_target_start = next_binding_table_offset;
3810 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3811
3812 assign_common_binding_table_offsets(next_binding_table_offset);
3813 }
3814
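/**
 * Estimate register pressure: for every instruction IP, sum the sizes of
 * all virtual GRFs whose live interval covers that IP, and store the
 * result in regs_live_at_ip.
 */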
3815 void
3816 fs_visitor::calculate_register_pressure()
3817 {
3818 invalidate_live_intervals();
3819 calculate_live_intervals();
3820
3821 unsigned num_instructions = 0;
3822 foreach_block(block, cfg)
3823 num_instructions += block->instructions.length();
3824
3825 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3826
3827 for (unsigned reg = 0; reg < alloc.count; reg++) {
3828 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3829 regs_live_at_ip[ip] += alloc.sizes[reg];
3830 }
3831 }
3832
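/**
 * Split and set up constants, run the LIR optimization passes to a fixed
 * point, then run the lowering passes that must happen after the loop.
 */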
3833 void
3834 fs_visitor::optimize()
3835 {
3836 split_virtual_grfs();
3837
3838 move_uniform_array_access_to_pull_constants();
3839 assign_constant_locations();
3840 demote_pull_constants();
3841
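/* OPT() runs a single pass, dumps the instruction list to a per-pass file
 * when the DEBUG_OPTIMIZER flag is set and the pass made progress,
 * accumulates the result into 'progress', and evaluates to whether this
 * particular pass made progress.
 */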
3842 #define OPT(pass, args...) ({ \
3843 pass_num++; \
3844 bool this_progress = pass(args); \
3845 \
3846 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3847 char filename[64]; \
3848 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3849 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3850 \
3851 backend_visitor::dump_instructions(filename); \
3852 } \
3853 \
3854 progress = progress || this_progress; \
3855 this_progress; \
3856 })
3857
3858 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3859 char filename[64];
3860 snprintf(filename, 64, "%s%d-%04d-00-start",
3861 stage_abbrev, dispatch_width,
3862 shader_prog ? shader_prog->Name : 0);
3863
3864 backend_visitor::dump_instructions(filename);
3865 }
3866
3867 bool progress;
3868 int iteration = 0;
3869 int pass_num = 0;
3870 do {
3871 progress = false;
3872 pass_num = 0;
3873 iteration++;
3874
3875 OPT(remove_duplicate_mrf_writes);
3876
3877 OPT(opt_algebraic);
3878 OPT(opt_cse);
3879 OPT(opt_copy_propagate);
3880 OPT(opt_peephole_predicated_break);
3881 OPT(opt_cmod_propagation);
3882 OPT(dead_code_eliminate);
3883 OPT(opt_peephole_sel);
3884 OPT(dead_control_flow_eliminate, this);
3885 OPT(opt_register_renaming);
3886 OPT(opt_redundant_discard_jumps);
3887 OPT(opt_saturate_propagation);
3888 OPT(opt_zero_samples);
3889 OPT(register_coalesce);
3890 OPT(compute_to_mrf);
3891
3892 OPT(compact_virtual_grfs);
3893 } while (progress);
3894
3895 pass_num = 0;
3896
3897 OPT(opt_sampler_eot);
3898
3899 if (OPT(lower_load_payload)) {
3900 split_virtual_grfs();
3901 OPT(register_coalesce);
3902 OPT(compute_to_mrf);
3903 OPT(dead_code_eliminate);
3904 }
3905
3906 OPT(opt_combine_constants);
3907
3908 lower_uniform_pull_constant_loads();
3909 }
3910
3911 /**
3912 * Three-source instructions must have a GRF/MRF destination register.
3913 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3914 */
3915 void
3916 fs_visitor::fixup_3src_null_dest()
3917 {
3918 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3919 if (inst->is_3src() && inst->dst.is_null()) {
3920 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3921 inst->dst.type);
3922 }
3923 }
3924 }
3925
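/**
 * Schedule and register allocate. The pre-RA scheduling heuristics are
 * tried in order until one of them allocates without spilling; if none
 * does, we spill (or fail outright in SIMD16) and run a post-RA
 * scheduling pass.
 */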
3926 void
3927 fs_visitor::allocate_registers()
3928 {
3929 bool allocated_without_spills;
3930
3931 static const enum instruction_scheduler_mode pre_modes[] = {
3932 SCHEDULE_PRE,
3933 SCHEDULE_PRE_NON_LIFO,
3934 SCHEDULE_PRE_LIFO,
3935 };
3936
3937 /* Try each scheduling heuristic to see if it can successfully register
3938 * allocate without spilling. They should be ordered by decreasing
3939 * performance but increasing likelihood of allocating.
3940 */
3941 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3942 schedule_instructions(pre_modes[i]);
3943
3944 if (0) {
3945 assign_regs_trivial();
3946 allocated_without_spills = true;
3947 } else {
3948 allocated_without_spills = assign_regs(false);
3949 }
3950 if (allocated_without_spills)
3951 break;
3952 }
3953
3954 if (!allocated_without_spills) {
3955 /* We assume that any spilling is worse than just dropping back to
3956 * SIMD8. There's probably actually some intermediate point where
3957 * SIMD16 with a couple of spills is still better.
3958 */
3959 if (dispatch_width == 16) {
3960 fail("Failure to register allocate. Reduce number of "
3961 "live scalar values to avoid this.");
3962 } else {
3963 perf_debug("%s shader triggered register spilling. "
3964 "Try reducing the number of live scalar values to "
3965 "improve performance.\n", stage_name);
3966 }
3967
3968 /* Since we're out of heuristics, just go spill registers until we
3969 * get an allocation.
3970 */
3971 while (!assign_regs(true)) {
3972 if (failed)
3973 break;
3974 }
3975 }
3976
3977 /* This must come after all optimization and register allocation, since
3978 * it inserts dead code that happens to have side effects, and it does
3979 * so based on the actual physical registers in use.
3980 */
3981 insert_gen4_send_dependency_workarounds();
3982
3983 if (failed)
3984 return;
3985
3986 if (!allocated_without_spills)
3987 schedule_instructions(SCHEDULE_POST);
3988
3989 if (last_scratch > 0)
3990 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3991 }
3992
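/**
 * Build, optimize and register allocate the IR for a vertex shader.
 */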
3993 bool
3994 fs_visitor::run_vs()
3995 {
3996 assert(stage == MESA_SHADER_VERTEX);
3997
3998 assign_common_binding_table_offsets(0);
3999 setup_vs_payload();
4000
4001 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4002 emit_shader_time_begin();
4003
4004 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
4005 emit_nir_code();
4006 } else {
4007 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4008 base_ir = ir;
4009 this->result = reg_undef;
4010 ir->accept(this);
4011 }
4012 base_ir = NULL;
4013 }
4014
4015 if (failed)
4016 return false;
4017
4018 emit_urb_writes();
4019
4020 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4021 emit_shader_time_end();
4022
4023 calculate_cfg();
4024
4025 optimize();
4026
4027 assign_curb_setup();
4028 assign_vs_urb_setup();
4029
4030 fixup_3src_null_dest();
4031 allocate_registers();
4032
4033 return !failed;
4034 }
4035
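/**
 * Build, optimize and register allocate the IR for a fragment shader at
 * the current dispatch width.
 */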
4036 bool
4037 fs_visitor::run_fs()
4038 {
4039 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4040 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4041
4042 assert(stage == MESA_SHADER_FRAGMENT);
4043
4044 sanity_param_count = prog->Parameters->NumParameters;
4045
4046 assign_binding_table_offsets();
4047
4048 if (devinfo->gen >= 6)
4049 setup_payload_gen6();
4050 else
4051 setup_payload_gen4();
4052
4053 if (0) {
4054 emit_dummy_fs();
4055 } else if (brw->use_rep_send && dispatch_width == 16) {
4056 emit_repclear_shader();
4057 } else {
4058 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4059 emit_shader_time_begin();
4060
4061 calculate_urb_setup();
4062 if (prog->InputsRead > 0) {
4063 if (devinfo->gen < 6)
4064 emit_interpolation_setup_gen4();
4065 else
4066 emit_interpolation_setup_gen6();
4067 }
4068
4069 /* We handle discards by keeping track of the still-live pixels in f0.1.
4070 * Initialize it with the dispatched pixels.
4071 */
4072 if (wm_prog_data->uses_kill) {
4073 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4074 discard_init->flag_subreg = 1;
4075 }
4076
4077 /* Generate FS IR for main(). (The visitor only descends into
4078 * functions called "main".)
4079 */
4080 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
4081 emit_nir_code();
4082 } else if (shader) {
4083 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4084 base_ir = ir;
4085 this->result = reg_undef;
4086 ir->accept(this);
4087 }
4088 } else {
4089 emit_fragment_program_code();
4090 }
4091 base_ir = NULL;
4092 if (failed)
4093 return false;
4094
4095 if (wm_prog_data->uses_kill)
4096 emit(FS_OPCODE_PLACEHOLDER_HALT);
4097
4098 if (wm_key->alpha_test_func)
4099 emit_alpha_test();
4100
4101 emit_fb_writes();
4102
4103 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4104 emit_shader_time_end();
4105
4106 calculate_cfg();
4107
4108 optimize();
4109
4110 assign_curb_setup();
4111 assign_urb_setup();
4112
4113 fixup_3src_null_dest();
4114 allocate_registers();
4115
4116 if (failed)
4117 return false;
4118 }
4119
4120 if (dispatch_width == 8)
4121 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4122 else
4123 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4124
4125 /* If any state parameters were appended, then ParameterValues could have
4126 * been realloced, in which case the driver uniform storage set up by
4127 * _mesa_associate_uniform_storage() would point to freed memory. Make
4128 * sure that didn't happen.
4129 */
4130 assert(sanity_param_count == prog->Parameters->NumParameters);
4131
4132 return !failed;
4133 }
4134
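/**
 * Build, optimize and register allocate the IR for a compute shader.
 */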
4135 bool
4136 fs_visitor::run_cs()
4137 {
4138 assert(stage == MESA_SHADER_COMPUTE);
4139 assert(shader);
4140
4141 sanity_param_count = prog->Parameters->NumParameters;
4142
4143 assign_common_binding_table_offsets(0);
4144
4145 setup_cs_payload();
4146
4147 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4148 emit_shader_time_begin();
4149
4150 emit_nir_code();
4151
4152 if (failed)
4153 return false;
4154
4155 emit_cs_terminate();
4156
4157 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4158 emit_shader_time_end();
4159
4160 calculate_cfg();
4161
4162 optimize();
4163
4164 assign_curb_setup();
4165
4166 fixup_3src_null_dest();
4167 allocate_registers();
4168
4169 if (failed)
4170 return false;
4171
4172 /* If any state parameters were appended, then ParameterValues could have
4173 * been realloced, in which case the driver uniform storage set up by
4174 * _mesa_associate_uniform_storage() would point to freed memory. Make
4175 * sure that didn't happen.
4176 */
4177 assert(sanity_param_count == prog->Parameters->NumParameters);
4178
4179 return !failed;
4180 }
4181
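/**
 * Compile a fragment shader to native code. A SIMD8 variant is always
 * attempted, a SIMD16 variant is attempted when supported and not
 * disabled, and code is generated for whichever CFGs were produced.
 */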
4182 const unsigned *
4183 brw_wm_fs_emit(struct brw_context *brw,
4184 void *mem_ctx,
4185 const struct brw_wm_prog_key *key,
4186 struct brw_wm_prog_data *prog_data,
4187 struct gl_fragment_program *fp,
4188 struct gl_shader_program *prog,
4189 unsigned *final_assembly_size)
4190 {
4191 bool start_busy = false;
4192 double start_time = 0;
4193
4194 if (unlikely(brw->perf_debug)) {
4195 start_busy = (brw->batch.last_bo &&
4196 drm_intel_bo_busy(brw->batch.last_bo));
4197 start_time = get_time();
4198 }
4199
4200 struct brw_shader *shader = NULL;
4201 if (prog)
4202 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4203
4204 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4205 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4206
4207 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4208 */
4209 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4210 if (!v.run_fs()) {
4211 if (prog) {
4212 prog->LinkStatus = false;
4213 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4214 }
4215
4216 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4217 v.fail_msg);
4218
4219 return NULL;
4220 }
4221
4222 cfg_t *simd16_cfg = NULL;
4223 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4224 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4225 if (!v.simd16_unsupported) {
4226 /* Try a SIMD16 compile */
4227 v2.import_uniforms(&v);
4228 if (!v2.run_fs()) {
4229 perf_debug("SIMD16 shader failed to compile, falling back to "
4230 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4231 } else {
4232 simd16_cfg = v2.cfg;
4233 }
4234 } else {
4235 perf_debug("SIMD16 shader unsupported, falling back to "
4236 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4237 }
4238 }
4239
4240 cfg_t *simd8_cfg;
4241 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4242 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4243 simd8_cfg = NULL;
4244 prog_data->no_8 = true;
4245 } else {
4246 simd8_cfg = v.cfg;
4247 prog_data->no_8 = false;
4248 }
4249
4250 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4251 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4252
4253 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4254 char *name;
4255 if (prog)
4256 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4257 prog->Label ? prog->Label : "unnamed",
4258 prog->Name);
4259 else
4260 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4261
4262 g.enable_debug(name);
4263 }
4264
4265 if (simd8_cfg)
4266 g.generate_code(simd8_cfg, 8);
4267 if (simd16_cfg)
4268 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4269
4270 if (unlikely(brw->perf_debug) && shader) {
4271 if (shader->compiled_once)
4272 brw_wm_debug_recompile(brw, prog, key);
4273 shader->compiled_once = true;
4274
4275 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4276 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4277 (get_time() - start_time) * 1000);
4278 }
4279 }
4280
4281 return g.get_assembly(final_assembly_size);
4282 }
4283
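/**
 * Precompile the fragment program with a key guessed from the program and
 * current context state, saving and restoring the bound WM program state
 * around the compile.
 */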
4284 extern "C" bool
4285 brw_fs_precompile(struct gl_context *ctx,
4286 struct gl_shader_program *shader_prog,
4287 struct gl_program *prog)
4288 {
4289 struct brw_context *brw = brw_context(ctx);
4290 struct brw_wm_prog_key key;
4291
4292 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4293 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4294 bool program_uses_dfdy = fp->UsesDFdy;
4295
4296 memset(&key, 0, sizeof(key));
4297
4298 if (brw->gen < 6) {
4299 if (fp->UsesKill)
4300 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4301
4302 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4303 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4304
4305 /* Just assume depth testing. */
4306 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4307 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4308 }
4309
4310 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4311 BRW_FS_VARYING_INPUT_MASK) > 16)
4312 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4313
4314 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4315
4316 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4317 key.drawable_height = ctx->DrawBuffer->Height;
4318 }
4319
4320 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4321 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4322 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4323
4324 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4325 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4326 key.nr_color_regions > 1;
4327 }
4328
4329 key.program_string_id = bfp->id;
4330
4331 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4332 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4333
4334 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4335
4336 brw->wm.base.prog_offset = old_prog_offset;
4337 brw->wm.prog_data = old_prog_data;
4338
4339 return success;
4340 }
4341
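/**
 * Fill in the sampler key guesses for a precompile: shadow samplers get an
 * XXX1 swizzle on hardware without shader channel select, everything else
 * is assumed unswizzled (XYZW).
 */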
4342 void
4343 brw_setup_tex_for_precompile(struct brw_context *brw,
4344 struct brw_sampler_prog_key_data *tex,
4345 struct gl_program *prog)
4346 {
4347 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4348 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4349 for (unsigned i = 0; i < sampler_count; i++) {
4350 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4351 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4352 tex->swizzles[i] =
4353 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4354 } else {
4355 /* Color sampler: assume no swizzling. */
4356 tex->swizzles[i] = SWIZZLE_XYZW;
4357 }
4358 }
4359 }