i965/fs: Add LINTERP's src0 to fs_inst::regs_read().
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
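/* Illustrative examples of the computation above: a SIMD16 float
 * destination with stride 1 covers 16 * 1 * 4 = 64 bytes, i.e. two GRFs,
 * while a stride-0 destination still counts as one register thanks to
 * the MAX2.
 */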
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
138
139 this->writes_accumulator = false;
140 }
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(brw->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
356
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
359 {
360 uint8_t exec_size = dst.width;
361 for (int i = 0; i < sources; ++i) {
362 assert(src[i].width % dst.width == 0);
363 if (src[i].width > exec_size)
364 exec_size = src[i].width;
365 }
366
367 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
368 dst, src, sources);
369 inst->regs_written = 0;
370 for (int i = 0; i < sources; ++i) {
371 /* The LOAD_PAYLOAD instruction only really makes sense if we are
372 * dealing with whole registers. If this ever changes, we can deal
373 * with it later.
374 */
375 int size = inst->src[i].effective_width * type_sz(src[i].type);
376 assert(size % 32 == 0);
377 inst->regs_written += (size + 31) / 32;
378 }
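/* Illustrative example: a payload built from two SIMD16 float sources
 * contributes 64 bytes per source, so regs_written ends up as 4.
 */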
379
380 return inst;
381 }
382
383 exec_list
384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
385 const fs_reg &surf_index,
386 const fs_reg &varying_offset,
387 uint32_t const_offset)
388 {
389 exec_list instructions;
390 fs_inst *inst;
391
392 /* We have our constant surface use a pitch of 4 bytes, so our index can
393 * be any component of a vector, and then we load 4 contiguous
394 * components starting from that.
395 *
396      * We break down the const_offset into a portion added to the variable
397 * offset and a portion done using reg_offset, which means that if you
398 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
399 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
400 * CSE can later notice that those loads are all the same and eliminate
401 * the redundant ones.
402 */
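/* An illustrative (hypothetical) case: for "a[i]" above with
 * const_offset == 6 and varying_offset holding i * 4, the ADD below
 * produces vec4_offset == i * 4 + 4, and the final MOV reads component
 * (6 & 3) * scale == 2 * scale of the returned result.
 */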
403 fs_reg vec4_offset = vgrf(glsl_type::int_type);
404 instructions.push_tail(ADD(vec4_offset,
405 varying_offset, fs_reg(const_offset & ~3)));
406
407 int scale = 1;
408 if (brw->gen == 4 && dst.width == 8) {
409 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
410 * u, v, r) as parameters, or we can just use the SIMD16 message
411 * consisting of (header, u). We choose the second, at the cost of a
412 * longer return length.
413 */
414 scale = 2;
415 }
416
417 enum opcode op;
418 if (brw->gen >= 7)
419 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
420 else
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
422
423 assert(dst.width % 8 == 0);
424 int regs_written = 4 * (dst.width / 8) * scale;
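/* For example, a SIMD8 destination in the gen4 scale == 2 case above
 * allocates 4 * 1 * 2 == 8 GRFs for the vec4 result.
 */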
425 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
426 dst.type, dst.width);
427 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
428 inst->regs_written = regs_written;
429 instructions.push_tail(inst);
430
431 if (brw->gen < 7) {
432 inst->base_mrf = 13;
433 inst->header_present = true;
434 if (brw->gen == 4)
435 inst->mlen = 3;
436 else
437 inst->mlen = 1 + dispatch_width / 8;
438 }
439
440 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
441 instructions.push_tail(MOV(dst, result));
442
443 return instructions;
444 }
445
446 /**
447 * A helper for MOV generation for fixing up broken hardware SEND dependency
448 * handling.
449 */
450 fs_inst *
451 fs_visitor::DEP_RESOLVE_MOV(int grf)
452 {
453 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
454
455 inst->ir = NULL;
456 inst->annotation = "send dependency resolve";
457
458    /* The caller always wants this MOV uncompressed, to emit the minimal extra
459     * dependencies and to avoid having to deal with aligning its regs to 2.
460 */
461 inst->exec_size = 8;
462
463 return inst;
464 }
465
466 bool
467 fs_inst::equals(fs_inst *inst) const
468 {
469 return (opcode == inst->opcode &&
470 dst.equals(inst->dst) &&
471 src[0].equals(inst->src[0]) &&
472 src[1].equals(inst->src[1]) &&
473 src[2].equals(inst->src[2]) &&
474 saturate == inst->saturate &&
475 predicate == inst->predicate &&
476 conditional_mod == inst->conditional_mod &&
477 mlen == inst->mlen &&
478 base_mrf == inst->base_mrf &&
479 target == inst->target &&
480 eot == inst->eot &&
481 header_present == inst->header_present &&
482 shadow_compare == inst->shadow_compare &&
483 exec_size == inst->exec_size &&
484 offset == inst->offset);
485 }
486
487 bool
488 fs_inst::overwrites_reg(const fs_reg &reg) const
489 {
490 return reg.in_range(dst, regs_written);
491 }
492
493 bool
494 fs_inst::is_send_from_grf() const
495 {
496 switch (opcode) {
497 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
498 case SHADER_OPCODE_SHADER_TIME_ADD:
499 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
500 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
501 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
502 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
503 case SHADER_OPCODE_UNTYPED_ATOMIC:
504 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
505 case SHADER_OPCODE_URB_WRITE_SIMD8:
506 return true;
507 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
508 return src[1].file == GRF;
509 case FS_OPCODE_FB_WRITE:
510 return src[0].file == GRF;
511 default:
512 if (is_tex())
513 return src[0].file == GRF;
514
515 return false;
516 }
517 }
518
519 bool
520 fs_inst::can_do_source_mods(struct brw_context *brw)
521 {
522 if (brw->gen == 6 && is_math())
523 return false;
524
525 if (is_send_from_grf())
526 return false;
527
528 if (!backend_instruction::can_do_source_mods())
529 return false;
530
531 return true;
532 }
533
534 bool
535 fs_inst::has_side_effects() const
536 {
537 return this->eot || backend_instruction::has_side_effects();
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Vector float immediate value constructor. */
585 fs_reg::fs_reg(uint8_t vf[4])
586 {
587 init();
588 this->file = IMM;
589 this->type = BRW_REGISTER_TYPE_VF;
590 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
591 }
592
593 /** Vector float immediate value constructor. */
594 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
595 {
596 init();
597 this->file = IMM;
598 this->type = BRW_REGISTER_TYPE_VF;
599 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
600 (vf1 << 8) |
601 (vf2 << 16) |
602 (vf3 << 24);
603 }
604
605 /** Fixed brw_reg. */
606 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
607 {
608 init();
609 this->file = HW_REG;
610 this->fixed_hw_reg = fixed_hw_reg;
611 this->type = fixed_hw_reg.type;
612 this->width = 1 << fixed_hw_reg.width;
613 }
614
615 bool
616 fs_reg::equals(const fs_reg &r) const
617 {
618 return (file == r.file &&
619 reg == r.reg &&
620 reg_offset == r.reg_offset &&
621 subreg_offset == r.subreg_offset &&
622 type == r.type &&
623 negate == r.negate &&
624 abs == r.abs &&
625 !reladdr && !r.reladdr &&
626 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
627 width == r.width &&
628 stride == r.stride);
629 }
630
631 fs_reg &
632 fs_reg::set_smear(unsigned subreg)
633 {
634 assert(file != HW_REG && file != IMM);
635 subreg_offset = subreg * type_sz(type);
636 stride = 0;
637 return *this;
638 }
639
640 bool
641 fs_reg::is_contiguous() const
642 {
643 return stride == 1;
644 }
645
646 int
647 fs_visitor::type_size(const struct glsl_type *type)
648 {
649 unsigned int size, i;
650
651 switch (type->base_type) {
652 case GLSL_TYPE_UINT:
653 case GLSL_TYPE_INT:
654 case GLSL_TYPE_FLOAT:
655 case GLSL_TYPE_BOOL:
656 return type->components();
657 case GLSL_TYPE_ARRAY:
658 return type_size(type->fields.array) * type->length;
659 case GLSL_TYPE_STRUCT:
660 size = 0;
661 for (i = 0; i < type->length; i++) {
662 size += type_size(type->fields.structure[i].type);
663 }
664 return size;
665 case GLSL_TYPE_SAMPLER:
666 /* Samplers take up no register space, since they're baked in at
667 * link time.
668 */
669 return 0;
670 case GLSL_TYPE_ATOMIC_UINT:
671 return 0;
672 case GLSL_TYPE_IMAGE:
673 case GLSL_TYPE_VOID:
674 case GLSL_TYPE_ERROR:
675 case GLSL_TYPE_INTERFACE:
676 case GLSL_TYPE_DOUBLE:
677 unreachable("not reached");
678 }
679
680 return 0;
681 }
682
683 /**
684 * Create a MOV to read the timestamp register.
685 *
686 * The caller is responsible for emitting the MOV. The return value is
687 * the destination of the MOV, with extra parameters set.
688 */
689 fs_reg
690 fs_visitor::get_timestamp(fs_inst **out_mov)
691 {
692 assert(brw->gen >= 7);
693
694 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
695 BRW_ARF_TIMESTAMP,
696 0),
697 BRW_REGISTER_TYPE_UD));
698
699 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
700
701 fs_inst *mov = MOV(dst, ts);
702 /* We want to read the 3 fields we care about even if it's not enabled in
703 * the dispatch.
704 */
705 mov->force_writemask_all = true;
706
707 /* The caller wants the low 32 bits of the timestamp. Since it's running
708     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
709 * which is plenty of time for our purposes. It is identical across the
710 * EUs, but since it's tracking GPU core speed it will increment at a
711 * varying rate as render P-states change.
712 *
713 * The caller could also check if render P-states have changed (or anything
714 * else that might disrupt timing) by setting smear to 2 and checking if
715 * that field is != 0.
716 */
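/* set_smear(0) below gives the register a zero stride pointing at dword 0,
 * so every channel reads the low 32 bits described above.
 */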
717 dst.set_smear(0);
718
719 *out_mov = mov;
720 return dst;
721 }
722
723 void
724 fs_visitor::emit_shader_time_begin()
725 {
726 current_annotation = "shader time start";
727 fs_inst *mov;
728 shader_start_time = get_timestamp(&mov);
729 emit(mov);
730 }
731
732 void
733 fs_visitor::emit_shader_time_end()
734 {
735 current_annotation = "shader time end";
736
737 enum shader_time_shader_type type, written_type, reset_type;
738 switch (stage) {
739 case MESA_SHADER_VERTEX:
740 type = ST_VS;
741 written_type = ST_VS_WRITTEN;
742 reset_type = ST_VS_RESET;
743 break;
744 case MESA_SHADER_GEOMETRY:
745 type = ST_GS;
746 written_type = ST_GS_WRITTEN;
747 reset_type = ST_GS_RESET;
748 break;
749 case MESA_SHADER_FRAGMENT:
750 if (dispatch_width == 8) {
751 type = ST_FS8;
752 written_type = ST_FS8_WRITTEN;
753 reset_type = ST_FS8_RESET;
754 } else {
755 assert(dispatch_width == 16);
756 type = ST_FS16;
757 written_type = ST_FS16_WRITTEN;
758 reset_type = ST_FS16_RESET;
759 }
760 break;
761 default:
762 unreachable("fs_visitor::emit_shader_time_end missing code");
763 }
764
765 /* Insert our code just before the final SEND with EOT. */
766 exec_node *end = this->instructions.get_tail();
767 assert(end && ((fs_inst *) end)->eot);
768
769 fs_inst *tm_read;
770 fs_reg shader_end_time = get_timestamp(&tm_read);
771 end->insert_before(tm_read);
772
773 /* Check that there weren't any timestamp reset events (assuming these
774 * were the only two timestamp reads that happened).
775 */
776 fs_reg reset = shader_end_time;
777 reset.set_smear(2);
778 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
779 test->conditional_mod = BRW_CONDITIONAL_Z;
780 test->force_writemask_all = true;
781 end->insert_before(test);
782 end->insert_before(IF(BRW_PREDICATE_NORMAL));
783
784 fs_reg start = shader_start_time;
785 start.negate = true;
786 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
787 diff.set_smear(0);
788 fs_inst *add = ADD(diff, start, shader_end_time);
789 add->force_writemask_all = true;
790 end->insert_before(add);
791
792 /* If there were no instructions between the two timestamp gets, the diff
793 * is 2 cycles. Remove that overhead, so I can forget about that when
794 * trying to determine the time taken for single instructions.
795 */
796 add = ADD(diff, diff, fs_reg(-2u));
797 add->force_writemask_all = true;
798 end->insert_before(add);
799
800 end->insert_before(SHADER_TIME_ADD(type, diff));
801 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
802 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
803 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
804 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
805 }
806
807 fs_inst *
808 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
809 {
810 int shader_time_index =
811 brw_get_shader_time_index(brw, shader_prog, prog, type);
812 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
813
814 fs_reg payload;
815 if (dispatch_width == 8)
816 payload = vgrf(glsl_type::uvec2_type);
817 else
818 payload = vgrf(glsl_type::uint_type);
819
820 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
821 fs_reg(), payload, offset, value);
822 }
823
824 void
825 fs_visitor::vfail(const char *format, va_list va)
826 {
827 char *msg;
828
829 if (failed)
830 return;
831
832 failed = true;
833
834 msg = ralloc_vasprintf(mem_ctx, format, va);
835 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
836
837 this->fail_msg = msg;
838
839 if (debug_enabled) {
840 fprintf(stderr, "%s", msg);
841 }
842 }
843
844 void
845 fs_visitor::fail(const char *format, ...)
846 {
847 va_list va;
848
849 va_start(va, format);
850 vfail(format, va);
851 va_end(va);
852 }
853
854 /**
855 * Mark this program as impossible to compile in SIMD16 mode.
856 *
857 * During the SIMD8 compile (which happens first), we can detect and flag
858 * things that are unsupported in SIMD16 mode, so the compiler can skip
859 * the SIMD16 compile altogether.
860 *
861 * During a SIMD16 compile (if one happens anyway), this just calls fail().
862 */
863 void
864 fs_visitor::no16(const char *format, ...)
865 {
866 va_list va;
867
868 va_start(va, format);
869
870 if (dispatch_width == 16) {
871 vfail(format, va);
872 } else {
873 simd16_unsupported = true;
874
875 if (brw->perf_debug) {
876 if (no16_msg)
877 ralloc_vasprintf_append(&no16_msg, format, va);
878 else
879 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
880 }
881 }
882
883 va_end(va);
884 }
885
886 fs_inst *
887 fs_visitor::emit(enum opcode opcode)
888 {
889 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
890 }
891
892 fs_inst *
893 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
894 {
895 return emit(new(mem_ctx) fs_inst(opcode, dst));
896 }
897
898 fs_inst *
899 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
900 {
901 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
902 }
903
904 fs_inst *
905 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
906 const fs_reg &src1)
907 {
908 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
909 }
910
911 fs_inst *
912 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
913 const fs_reg &src1, const fs_reg &src2)
914 {
915 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
916 }
917
918 fs_inst *
919 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
920 fs_reg src[], int sources)
921 {
922 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
923 }
924
925 /**
926 * Returns true if the instruction has a flag that means it won't
927 * update an entire destination register.
928 *
929 * For example, dead code elimination and live variable analysis want to know
930 * when a write to a variable screens off any preceding values that were in
931 * it.
932 */
933 bool
934 fs_inst::is_partial_write() const
935 {
936 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
937 (this->dst.width * type_sz(this->dst.type)) < 32 ||
938 !this->dst.is_contiguous());
939 }
940
941 int
942 fs_inst::regs_read(int arg) const
943 {
944 if (is_tex() && arg == 0 && src[0].file == GRF) {
945 return mlen;
946 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
947 return mlen;
948 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
949 return mlen;
950 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
951 return mlen;
952 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
953 return mlen;
954 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
955 return mlen;
956 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
957 return exec_size / 4;
958 }
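/* The LINTERP case above accounts for the PLN instruction typically
 * generated for it: src[0] points at the barycentric delta_x register, but
 * the hardware also reads the adjacent delta_y register(s), so the source
 * effectively spans exec_size / 4 GRFs (2 in SIMD8, 4 in SIMD16).
 */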
959
960 switch (src[arg].file) {
961 case BAD_FILE:
962 case UNIFORM:
963 case IMM:
964 return 1;
965 case GRF:
966 case HW_REG:
967 if (src[arg].stride == 0) {
968 return 1;
969 } else {
970 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
971 return (size + 31) / 32;
972 }
973 case MRF:
974 unreachable("MRF registers are not allowed as sources");
975 default:
976 unreachable("Invalid register file");
977 }
978 }
979
980 bool
981 fs_inst::reads_flag() const
982 {
983 return predicate;
984 }
985
986 bool
987 fs_inst::writes_flag() const
988 {
989 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
990 opcode != BRW_OPCODE_IF &&
991 opcode != BRW_OPCODE_WHILE)) ||
992 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
993 }
994
995 /**
996 * Returns how many MRFs an FS opcode will write over.
997 *
998 * Note that this is not the 0 or 1 implied writes in an actual gen
999 * instruction -- the FS opcodes often generate MOVs in addition.
1000 */
1001 int
1002 fs_visitor::implied_mrf_writes(fs_inst *inst)
1003 {
1004 if (inst->mlen == 0)
1005 return 0;
1006
1007 if (inst->base_mrf == -1)
1008 return 0;
1009
1010 switch (inst->opcode) {
1011 case SHADER_OPCODE_RCP:
1012 case SHADER_OPCODE_RSQ:
1013 case SHADER_OPCODE_SQRT:
1014 case SHADER_OPCODE_EXP2:
1015 case SHADER_OPCODE_LOG2:
1016 case SHADER_OPCODE_SIN:
1017 case SHADER_OPCODE_COS:
1018 return 1 * dispatch_width / 8;
1019 case SHADER_OPCODE_POW:
1020 case SHADER_OPCODE_INT_QUOTIENT:
1021 case SHADER_OPCODE_INT_REMAINDER:
1022 return 2 * dispatch_width / 8;
1023 case SHADER_OPCODE_TEX:
1024 case FS_OPCODE_TXB:
1025 case SHADER_OPCODE_TXD:
1026 case SHADER_OPCODE_TXF:
1027 case SHADER_OPCODE_TXF_CMS:
1028 case SHADER_OPCODE_TXF_MCS:
1029 case SHADER_OPCODE_TG4:
1030 case SHADER_OPCODE_TG4_OFFSET:
1031 case SHADER_OPCODE_TXL:
1032 case SHADER_OPCODE_TXS:
1033 case SHADER_OPCODE_LOD:
1034 return 1;
1035 case FS_OPCODE_FB_WRITE:
1036 return 2;
1037 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1038 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1039 return 1;
1040 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1041 return inst->mlen;
1042 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1043 return 2;
1044 case SHADER_OPCODE_UNTYPED_ATOMIC:
1045 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1046 case SHADER_OPCODE_URB_WRITE_SIMD8:
1047 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1048 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1049 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1050 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1051 return 0;
1052 default:
1053 unreachable("not reached");
1054 }
1055 }
1056
1057 fs_reg
1058 fs_visitor::vgrf(const glsl_type *const type)
1059 {
1060 int reg_width = dispatch_width / 8;
1061 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1062 brw_type_for_base_type(type), dispatch_width);
1063 }
1064
1065 fs_reg
1066 fs_visitor::vgrf(int num_components)
1067 {
1068 int reg_width = dispatch_width / 8;
1069 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1070 BRW_REGISTER_TYPE_F, dispatch_width);
1071 }
1072
1073 /** Fixed HW reg constructor. */
1074 fs_reg::fs_reg(enum register_file file, int reg)
1075 {
1076 init();
1077 this->file = file;
1078 this->reg = reg;
1079 this->type = BRW_REGISTER_TYPE_F;
1080
1081 switch (file) {
1082 case UNIFORM:
1083 this->width = 1;
1084 break;
1085 default:
1086 this->width = 8;
1087 }
1088 }
1089
1090 /** Fixed HW reg constructor. */
1091 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1092 {
1093 init();
1094 this->file = file;
1095 this->reg = reg;
1096 this->type = type;
1097
1098 switch (file) {
1099 case UNIFORM:
1100 this->width = 1;
1101 break;
1102 default:
1103 this->width = 8;
1104 }
1105 }
1106
1107 /** Fixed HW reg constructor. */
1108 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1109 uint8_t width)
1110 {
1111 init();
1112 this->file = file;
1113 this->reg = reg;
1114 this->type = type;
1115 this->width = width;
1116 }
1117
1118 fs_reg *
1119 fs_visitor::variable_storage(ir_variable *var)
1120 {
1121 return (fs_reg *)hash_table_find(this->variable_ht, var);
1122 }
1123
1124 void
1125 import_uniforms_callback(const void *key,
1126 void *data,
1127 void *closure)
1128 {
1129 struct hash_table *dst_ht = (struct hash_table *)closure;
1130 const fs_reg *reg = (const fs_reg *)data;
1131
1132 if (reg->file != UNIFORM)
1133 return;
1134
1135 hash_table_insert(dst_ht, data, key);
1136 }
1137
1138 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1139  * This brings in those uniform definitions.
1140 */
1141 void
1142 fs_visitor::import_uniforms(fs_visitor *v)
1143 {
1144 hash_table_call_foreach(v->variable_ht,
1145 import_uniforms_callback,
1146 variable_ht);
1147 this->push_constant_loc = v->push_constant_loc;
1148 this->pull_constant_loc = v->pull_constant_loc;
1149 this->uniforms = v->uniforms;
1150 this->param_size = v->param_size;
1151 }
1152
1153 /* Our support for uniforms is piggy-backed on the struct
1154 * gl_fragment_program, because that's where the values actually
1155 * get stored, rather than in some global gl_shader_program uniform
1156 * store.
1157 */
1158 void
1159 fs_visitor::setup_uniform_values(ir_variable *ir)
1160 {
1161 int namelen = strlen(ir->name);
1162
1163 /* The data for our (non-builtin) uniforms is stored in a series of
1164 * gl_uniform_driver_storage structs for each subcomponent that
1165 * glGetUniformLocation() could name. We know it's been set up in the same
1166 * order we'd walk the type, so walk the list of storage and find anything
1167 * with our name, or the prefix of a component that starts with our name.
1168 */
1169 unsigned params_before = uniforms;
1170 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1171 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1172
1173 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1174 (storage->name[namelen] != 0 &&
1175 storage->name[namelen] != '.' &&
1176 storage->name[namelen] != '[')) {
1177 continue;
1178 }
1179
1180 unsigned slots = storage->type->component_slots();
1181 if (storage->array_elements)
1182 slots *= storage->array_elements;
1183
1184 for (unsigned i = 0; i < slots; i++) {
1185 stage_prog_data->param[uniforms++] = &storage->storage[i];
1186 }
1187 }
1188
1189 /* Make sure we actually initialized the right amount of stuff here. */
1190 assert(params_before + ir->type->component_slots() == uniforms);
1191 (void)params_before;
1192 }
1193
1194
1195 /* Our support for builtin uniforms is even scarier than non-builtin.
1196 * It sits on top of the PROG_STATE_VAR parameters that are
1197 * automatically updated from GL context state.
1198 */
1199 void
1200 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1201 {
1202 const ir_state_slot *const slots = ir->get_state_slots();
1203 assert(slots != NULL);
1204
1205 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1206 /* This state reference has already been setup by ir_to_mesa, but we'll
1207 * get the same index back here.
1208 */
1209 int index = _mesa_add_state_reference(this->prog->Parameters,
1210 (gl_state_index *)slots[i].tokens);
1211
1212 /* Add each of the unique swizzles of the element as a parameter.
1213 * This'll end up matching the expected layout of the
1214 * array/matrix/structure we're trying to fill in.
1215 */
1216 int last_swiz = -1;
1217 for (unsigned int j = 0; j < 4; j++) {
1218 int swiz = GET_SWZ(slots[i].swizzle, j);
1219 if (swiz == last_swiz)
1220 break;
1221 last_swiz = swiz;
1222
1223 stage_prog_data->param[uniforms++] =
1224 &prog->Parameters->ParameterValues[index][swiz];
1225 }
1226 }
1227 }
1228
1229 fs_reg *
1230 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1231 bool origin_upper_left)
1232 {
1233 assert(stage == MESA_SHADER_FRAGMENT);
1234 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1235 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1236 fs_reg wpos = *reg;
1237 bool flip = !origin_upper_left ^ key->render_to_fbo;
1238
1239 /* gl_FragCoord.x */
1240 if (pixel_center_integer) {
1241 emit(MOV(wpos, this->pixel_x));
1242 } else {
1243 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1244 }
1245 wpos = offset(wpos, 1);
1246
1247 /* gl_FragCoord.y */
1248 if (!flip && pixel_center_integer) {
1249 emit(MOV(wpos, this->pixel_y));
1250 } else {
1251 fs_reg pixel_y = this->pixel_y;
1252 float offset = (pixel_center_integer ? 0.0 : 0.5);
1253
1254 if (flip) {
1255 pixel_y.negate = true;
1256 offset += key->drawable_height - 1.0;
1257 }
1258
1259 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1260 }
1261 wpos = offset(wpos, 1);
1262
1263 /* gl_FragCoord.z */
1264 if (brw->gen >= 6) {
1265 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1266 } else {
1267 emit(FS_OPCODE_LINTERP, wpos,
1268 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1269 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1270 interp_reg(VARYING_SLOT_POS, 2));
1271 }
1272 wpos = offset(wpos, 1);
1273
1274 /* gl_FragCoord.w: Already set up in emit_interpolation */
1275 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1276
1277 return reg;
1278 }
1279
1280 fs_inst *
1281 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1282 glsl_interp_qualifier interpolation_mode,
1283 bool is_centroid, bool is_sample)
1284 {
1285 brw_wm_barycentric_interp_mode barycoord_mode;
1286 if (brw->gen >= 6) {
1287 if (is_centroid) {
1288 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1289 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1290 else
1291 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1292 } else if (is_sample) {
1293 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1294 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1295 else
1296 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1297 } else {
1298 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1299 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1300 else
1301 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1302 }
1303 } else {
1304 /* On Ironlake and below, there is only one interpolation mode.
1305 * Centroid interpolation doesn't mean anything on this hardware --
1306 * there is no multisampling.
1307 */
1308 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1309 }
1310 return emit(FS_OPCODE_LINTERP, attr,
1311 this->delta_x[barycoord_mode],
1312 this->delta_y[barycoord_mode], interp);
1313 }
1314
1315 void
1316 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1317 const glsl_type *type,
1318 glsl_interp_qualifier interpolation_mode,
1319 int location, bool mod_centroid,
1320 bool mod_sample)
1321 {
1322 attr.type = brw_type_for_base_type(type->get_scalar_type());
1323
1324 assert(stage == MESA_SHADER_FRAGMENT);
1325 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1326 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1327
1328 unsigned int array_elements;
1329
1330 if (type->is_array()) {
1331 array_elements = type->length;
1332 if (array_elements == 0) {
1333 fail("dereferenced array '%s' has length 0\n", name);
1334 }
1335 type = type->fields.array;
1336 } else {
1337 array_elements = 1;
1338 }
1339
1340 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1341 bool is_gl_Color =
1342 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1343 if (key->flat_shade && is_gl_Color) {
1344 interpolation_mode = INTERP_QUALIFIER_FLAT;
1345 } else {
1346 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1347 }
1348 }
1349
1350 for (unsigned int i = 0; i < array_elements; i++) {
1351 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1352 if (prog_data->urb_setup[location] == -1) {
1353 /* If there's no incoming setup data for this slot, don't
1354 * emit interpolation for it.
1355 */
1356 attr = offset(attr, type->vector_elements);
1357 location++;
1358 continue;
1359 }
1360
1361 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1362 /* Constant interpolation (flat shading) case. The SF has
1363 * handed us defined values in only the constant offset
1364 * field of the setup reg.
1365 */
1366 for (unsigned int k = 0; k < type->vector_elements; k++) {
1367 struct brw_reg interp = interp_reg(location, k);
1368 interp = suboffset(interp, 3);
1369 interp.type = attr.type;
1370 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1371 attr = offset(attr, 1);
1372 }
1373 } else {
1374 /* Smooth/noperspective interpolation case. */
1375 for (unsigned int k = 0; k < type->vector_elements; k++) {
1376 struct brw_reg interp = interp_reg(location, k);
1377 if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1378 /* Get the pixel/sample mask into f0 so that we know
1379 * which pixels are lit. Then, for each channel that is
1380 * unlit, replace the centroid data with non-centroid
1381 * data.
1382 */
1383 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1384
1385 fs_inst *inst;
1386 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1387 false, false);
1388 inst->predicate = BRW_PREDICATE_NORMAL;
1389 inst->predicate_inverse = true;
1390 if (brw->has_pln)
1391 inst->no_dd_clear = true;
1392
1393 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1394 mod_centroid && !key->persample_shading,
1395 mod_sample || key->persample_shading);
1396 inst->predicate = BRW_PREDICATE_NORMAL;
1397 inst->predicate_inverse = false;
1398 if (brw->has_pln)
1399 inst->no_dd_check = true;
1400
1401 } else {
1402 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1403 mod_centroid && !key->persample_shading,
1404 mod_sample || key->persample_shading);
1405 }
1406 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1407 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1408 }
1409 attr = offset(attr, 1);
1410 }
1411
1412 }
1413 location++;
1414 }
1415 }
1416 }
1417
1418 fs_reg *
1419 fs_visitor::emit_frontfacing_interpolation()
1420 {
1421 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1422
1423 if (brw->gen >= 6) {
1424 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1425 * a boolean result from this (~0/true or 0/false).
1426 *
1427 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1428 * this task in only one instruction:
1429 * - a negation source modifier will flip the bit; and
1430 * - a W -> D type conversion will sign extend the bit into the high
1431 * word of the destination.
1432 *
1433 * An ASR 15 fills the low word of the destination.
1434 */
1435 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1436 g0.negate = true;
1437
1438 emit(ASR(*reg, g0, fs_reg(15)));
1439 } else {
1440 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1441 * a boolean result from this (1/true or 0/false).
1442 *
1443 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1444 * the negation source modifier to flip it. Unfortunately the SHR
1445 * instruction only operates on UD (or D with an abs source modifier)
1446 * sources without negation.
1447 *
1448 * Instead, use ASR (which will give ~0/true or 0/false).
1449 */
1450 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1451 g1_6.negate = true;
1452
1453 emit(ASR(*reg, g1_6, fs_reg(31)));
1454 }
1455
1456 return reg;
1457 }
1458
1459 void
1460 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1461 {
1462 assert(stage == MESA_SHADER_FRAGMENT);
1463 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1464 assert(dst.type == BRW_REGISTER_TYPE_F);
1465
1466 if (key->compute_pos_offset) {
1467 /* Convert int_sample_pos to floating point */
1468 emit(MOV(dst, int_sample_pos));
1469 /* Scale to the range [0, 1] */
1470 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1471 }
1472 else {
1473 /* From ARB_sample_shading specification:
1474 * "When rendering to a non-multisample buffer, or if multisample
1475 * rasterization is disabled, gl_SamplePosition will always be
1476        *  (0.5, 0.5)."
1477 */
1478 emit(MOV(dst, fs_reg(0.5f)));
1479 }
1480 }
1481
1482 fs_reg *
1483 fs_visitor::emit_samplepos_setup()
1484 {
1485 assert(brw->gen >= 6);
1486
1487 this->current_annotation = "compute sample position";
1488 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1489 fs_reg pos = *reg;
1490 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1491 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1492
1493 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1494 * mode will be enabled.
1495 *
1496 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1497 * R31.1:0 Position Offset X/Y for Slot[3:0]
1498 * R31.3:2 Position Offset X/Y for Slot[7:4]
1499 * .....
1500 *
1501 * The X, Y sample positions come in as bytes in thread payload. So, read
1502 * the positions using vstride=16, width=8, hstride=2.
1503 */
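/* With that region each channel i reads the byte at offset 2 * i, so the
 * X offsets land in the even bytes and the Y offsets (read via suboffset 1
 * below) in the odd bytes.
 */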
1504 struct brw_reg sample_pos_reg =
1505 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1506 BRW_REGISTER_TYPE_B), 16, 8, 2);
1507
1508 if (dispatch_width == 8) {
1509 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1510 } else {
1511 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1512 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1513 ->force_sechalf = true;
1514 }
1515 /* Compute gl_SamplePosition.x */
1516 compute_sample_position(pos, int_sample_x);
1517 pos = offset(pos, 1);
1518 if (dispatch_width == 8) {
1519 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1520 } else {
1521 emit(MOV(half(int_sample_y, 0),
1522 fs_reg(suboffset(sample_pos_reg, 1))));
1523 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1524 ->force_sechalf = true;
1525 }
1526 /* Compute gl_SamplePosition.y */
1527 compute_sample_position(pos, int_sample_y);
1528 return reg;
1529 }
1530
1531 fs_reg *
1532 fs_visitor::emit_sampleid_setup()
1533 {
1534 assert(stage == MESA_SHADER_FRAGMENT);
1535 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1536 assert(brw->gen >= 6);
1537
1538 this->current_annotation = "compute sample id";
1539 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1540
1541 if (key->compute_sample_id) {
1542 fs_reg t1 = vgrf(glsl_type::int_type);
1543 fs_reg t2 = vgrf(glsl_type::int_type);
1544 t2.type = BRW_REGISTER_TYPE_UW;
1545
1546 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1547 * 8x multisampling, subspan 0 will represent sample N (where N
1548 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1549 * 7. We can find the value of N by looking at R0.0 bits 7:6
1550 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1551 * (since samples are always delivered in pairs). That is, we
1552 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1553 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1554 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1555 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1556 * populating a temporary variable with the sequence (0, 1, 2, 3),
1557 * and then reading from it using vstride=1, width=4, hstride=0.
1558 * These computations hold good for 4x multisampling as well.
1559 *
1560 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1561 * the first four slots are sample 0 of subspan 0; the next four
1562 * are sample 1 of subspan 0; the third group is sample 0 of
1563 * subspan 1, and finally sample 1 of subspan 1.
1564 */
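/* Worked example: if R0.0 bits 7:6 contain 2 (SSPI == 2), the AND/SHR
 * below leave t1 == 4, and adding the (0, 0, 0, 0, 1, 1, 1, 1) sequence
 * gives sample IDs (4, 4, 4, 4, 5, 5, 5, 5) for a SIMD8 8x MSAA dispatch.
 */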
1565 fs_inst *inst;
1566 inst = emit(BRW_OPCODE_AND, t1,
1567 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1568 fs_reg(0xc0));
1569 inst->force_writemask_all = true;
1570 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1571 inst->force_writemask_all = true;
1572 /* This works for both SIMD8 and SIMD16 */
1573 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1574 inst->force_writemask_all = true;
1575 /* This special instruction takes care of setting vstride=1,
1576 * width=4, hstride=0 of t2 during an ADD instruction.
1577 */
1578 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1579 } else {
1580 /* As per GL_ARB_sample_shading specification:
1581 * "When rendering to a non-multisample buffer, or if multisample
1582 * rasterization is disabled, gl_SampleID will always be zero."
1583 */
1584 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1585 }
1586
1587 return reg;
1588 }
1589
1590 void
1591 fs_visitor::resolve_source_modifiers(fs_reg *src)
1592 {
1593 if (!src->abs && !src->negate)
1594 return;
1595
1596 fs_reg temp = retype(vgrf(1), src->type);
1597 emit(MOV(temp, *src));
1598 *src = temp;
1599 }
1600
1601 fs_reg
1602 fs_visitor::fix_math_operand(fs_reg src)
1603 {
1604 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1605 * might be able to do better by doing execsize = 1 math and then
1606 * expanding that result out, but we would need to be careful with
1607 * masking.
1608 *
1609 * The hardware ignores source modifiers (negate and abs) on math
1610 * instructions, so we also move to a temp to set those up.
1611 */
1612 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1613 !src.abs && !src.negate)
1614 return src;
1615
1616 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1617 * operands to math
1618 */
1619 if (brw->gen >= 7 && src.file != IMM)
1620 return src;
1621
1622 fs_reg expanded = vgrf(glsl_type::float_type);
1623 expanded.type = src.type;
1624 emit(BRW_OPCODE_MOV, expanded, src);
1625 return expanded;
1626 }
1627
1628 fs_inst *
1629 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1630 {
1631 switch (opcode) {
1632 case SHADER_OPCODE_RCP:
1633 case SHADER_OPCODE_RSQ:
1634 case SHADER_OPCODE_SQRT:
1635 case SHADER_OPCODE_EXP2:
1636 case SHADER_OPCODE_LOG2:
1637 case SHADER_OPCODE_SIN:
1638 case SHADER_OPCODE_COS:
1639 break;
1640 default:
1641 unreachable("not reached: bad math opcode");
1642 }
1643
1644 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1645 * might be able to do better by doing execsize = 1 math and then
1646 * expanding that result out, but we would need to be careful with
1647 * masking.
1648 *
1649 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1650 * instructions, so we also move to a temp to set those up.
1651 */
1652 if (brw->gen == 6 || brw->gen == 7)
1653 src = fix_math_operand(src);
1654
1655 fs_inst *inst = emit(opcode, dst, src);
1656
1657 if (brw->gen < 6) {
1658 inst->base_mrf = 2;
1659 inst->mlen = dispatch_width / 8;
1660 }
1661
1662 return inst;
1663 }
1664
1665 fs_inst *
1666 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1667 {
1668 int base_mrf = 2;
1669 fs_inst *inst;
1670
1671 if (brw->gen >= 8) {
1672 inst = emit(opcode, dst, src0, src1);
1673 } else if (brw->gen >= 6) {
1674 src0 = fix_math_operand(src0);
1675 src1 = fix_math_operand(src1);
1676
1677 inst = emit(opcode, dst, src0, src1);
1678 } else {
1679 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1680 * "Message Payload":
1681 *
1682 * "Operand0[7]. For the INT DIV functions, this operand is the
1683 * denominator."
1684 * ...
1685 * "Operand1[7]. For the INT DIV functions, this operand is the
1686 * numerator."
1687 */
1688 bool is_int_div = opcode != SHADER_OPCODE_POW;
1689 fs_reg &op0 = is_int_div ? src1 : src0;
1690 fs_reg &op1 = is_int_div ? src0 : src1;
1691
1692 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1693 inst = emit(opcode, dst, op0, reg_null_f);
1694
1695 inst->base_mrf = base_mrf;
1696 inst->mlen = 2 * dispatch_width / 8;
1697 }
1698 return inst;
1699 }
1700
1701 void
1702 fs_visitor::emit_discard_jump()
1703 {
1704 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1705
1706 /* For performance, after a discard, jump to the end of the
1707 * shader if all relevant channels have been discarded.
1708 */
1709 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1710 discard_jump->flag_subreg = 1;
1711
1712 discard_jump->predicate = (dispatch_width == 8)
1713 ? BRW_PREDICATE_ALIGN1_ANY8H
1714 : BRW_PREDICATE_ALIGN1_ANY16H;
1715 discard_jump->predicate_inverse = true;
1716 }
1717
1718 void
1719 fs_visitor::assign_curb_setup()
1720 {
1721 if (dispatch_width == 8) {
1722 prog_data->dispatch_grf_start_reg = payload.num_regs;
1723 } else {
1724 assert(stage == MESA_SHADER_FRAGMENT);
1725 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1726 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1727 }
1728
1729 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1730
1731 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1732 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1733 for (unsigned int i = 0; i < inst->sources; i++) {
1734 if (inst->src[i].file == UNIFORM) {
1735 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1736 int constant_nr;
1737 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1738 constant_nr = push_constant_loc[uniform_nr];
1739 } else {
1740 /* Section 5.11 of the OpenGL 4.1 spec says:
1741 * "Out-of-bounds reads return undefined values, which include
1742 * values from other variables of the active program or zero."
1743 * Just return the first push constant.
1744 */
1745 constant_nr = 0;
1746 }
1747
1748 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1749 constant_nr / 8,
1750 constant_nr % 8);
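/* Illustrative mapping: constant_nr == 10 lands in channel 2 of
 * GRF (payload.num_regs + 1).
 */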
1751
1752 inst->src[i].file = HW_REG;
1753 inst->src[i].fixed_hw_reg = byte_offset(
1754 retype(brw_reg, inst->src[i].type),
1755 inst->src[i].subreg_offset);
1756 }
1757 }
1758 }
1759 }
1760
1761 void
1762 fs_visitor::calculate_urb_setup()
1763 {
1764 assert(stage == MESA_SHADER_FRAGMENT);
1765 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1766 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1767
1768 memset(prog_data->urb_setup, -1,
1769 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1770
1771 int urb_next = 0;
1772 /* Figure out where each of the incoming setup attributes lands. */
1773 if (brw->gen >= 6) {
1774 if (_mesa_bitcount_64(prog->InputsRead &
1775 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1776 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1777 * first 16 varying inputs, so we can put them wherever we want.
1778 * Just put them in order.
1779 *
1780 * This is useful because it means that (a) inputs not used by the
1781 * fragment shader won't take up valuable register space, and (b) we
1782 * won't have to recompile the fragment shader if it gets paired with
1783 * a different vertex (or geometry) shader.
1784 */
1785 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1786 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1787 BITFIELD64_BIT(i)) {
1788 prog_data->urb_setup[i] = urb_next++;
1789 }
1790 }
1791 } else {
1792 /* We have enough input varyings that the SF/SBE pipeline stage can't
1793 * arbitrarily rearrange them to suit our whim; we have to put them
1794 * in an order that matches the output of the previous pipeline stage
1795 * (geometry or vertex shader).
1796 */
1797 struct brw_vue_map prev_stage_vue_map;
1798 brw_compute_vue_map(brw, &prev_stage_vue_map,
1799 key->input_slots_valid);
1800 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1801 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1802 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1803 slot++) {
1804 int varying = prev_stage_vue_map.slot_to_varying[slot];
1805 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1806 * unused.
1807 */
1808 if (varying != BRW_VARYING_SLOT_COUNT &&
1809 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1810 BITFIELD64_BIT(varying))) {
1811 prog_data->urb_setup[varying] = slot - first_slot;
1812 }
1813 }
1814 urb_next = prev_stage_vue_map.num_slots - first_slot;
1815 }
1816 } else {
1817 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1818 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1819 /* Point size is packed into the header, not as a general attribute */
1820 if (i == VARYING_SLOT_PSIZ)
1821 continue;
1822
1823 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1824 /* The back color slot is skipped when the front color is
1825 * also written to. In addition, some slots can be
1826 * written in the vertex shader and not read in the
1827 * fragment shader. So the register number must always be
1828 * incremented, mapped or not.
1829 */
1830 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1831 prog_data->urb_setup[i] = urb_next;
1832 urb_next++;
1833 }
1834 }
1835
1836 /*
1837     * It's an FS-only attribute, and we did interpolation for this attribute
1838     * in the SF thread. So, count it here, too.
1839 *
1840 * See compile_sf_prog() for more info.
1841 */
1842 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1843 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1844 }
1845
1846 prog_data->num_varying_inputs = urb_next;
1847 }
1848
1849 void
1850 fs_visitor::assign_urb_setup()
1851 {
1852 assert(stage == MESA_SHADER_FRAGMENT);
1853 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1854
1855 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1856
1857 /* Offset all the urb_setup[] index by the actual position of the
1858 * setup regs, now that the location of the constants has been chosen.
1859 */
1860 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1861 if (inst->opcode == FS_OPCODE_LINTERP) {
1862 assert(inst->src[2].file == HW_REG);
1863 inst->src[2].fixed_hw_reg.nr += urb_start;
1864 }
1865
1866 if (inst->opcode == FS_OPCODE_CINTERP) {
1867 assert(inst->src[0].file == HW_REG);
1868 inst->src[0].fixed_hw_reg.nr += urb_start;
1869 }
1870 }
1871
1872 /* Each attribute is 4 setup channels, each of which is half a reg. */
1873 this->first_non_payload_grf =
1874 urb_start + prog_data->num_varying_inputs * 2;
1875 }
1876
1877 void
1878 fs_visitor::assign_vs_urb_setup()
1879 {
1880 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1881 int grf, count, slot, channel, attr;
1882
1883 assert(stage == MESA_SHADER_VERTEX);
1884 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1885 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1886 count++;
1887
1888 /* Each attribute is 4 regs. */
1889 this->first_non_payload_grf =
1890 payload.num_regs + prog_data->curb_read_length + count * 4;
1891
1892 unsigned vue_entries =
1893 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1894
1895 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1896 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1897
1898 assert(vs_prog_data->base.urb_read_length <= 15);
1899
1900 /* Rewrite all ATTR file references to the hw grf that they land in. */
1901 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1902 for (int i = 0; i < inst->sources; i++) {
1903 if (inst->src[i].file == ATTR) {
1904
1905 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1906 slot = count - 1;
1907 } else {
1908             /* Attributes come in a contiguous block, ordered by their
1909 * gl_vert_attrib value. That means we can compute the slot
1910 * number for an attribute by masking out the enabled
1911 * attributes before it and counting the bits.
1912 */
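             /* Illustrative example (hypothetical values): if inputs_read is
              * 0b1011 (attributes 0, 1 and 3 enabled) and attr is 3, then
              * _mesa_bitcount_64(0b1011 & BITFIELD64_MASK(3)) == 2, so
              * attribute 3 lands in slot 2.
              */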
1913 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1914 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1915 BITFIELD64_MASK(attr));
1916 }
1917
1918 channel = inst->src[i].reg_offset & 3;
1919
1920 grf = payload.num_regs +
1921 prog_data->curb_read_length +
1922 slot * 4 + channel;
1923
1924 inst->src[i].file = HW_REG;
1925 inst->src[i].fixed_hw_reg =
1926 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1927 }
1928 }
1929 }
1930 }
1931
1932 /**
1933 * Split large virtual GRFs into separate components if we can.
1934 *
1935 * This is mostly duplicated with what brw_fs_vector_splitting does,
1936 * but that's really conservative because it's afraid of doing
1937 * splitting that doesn't result in real progress after the rest of
1938 * the optimization phases, which would cause infinite looping in
1939 * optimization. We can do it once here, safely. This also has the
1940 * opportunity to split interpolated values, or maybe even uniforms,
1941 * which we don't have at the IR level.
1942 *
1943 * We want to split, because virtual GRFs are what we register
1944 * allocate and spill (due to contiguousness requirements for some
1945 * instructions), and they're what we naturally generate in the
1946 * codegen process, but most virtual GRFs don't actually need to be
1947 * contiguous sets of GRFs. If we split, we'll end up with reduced
1948 * live intervals and better dead code elimination and coalescing.
1949 */
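/* A small, hypothetical illustration: suppose a 4-register VGRF is written
 * one register at a time, but slots 1..2 are read together by a single
 * two-register read.  Every slot boundary starts out as a split point; the
 * two-register read clears the boundary between slots 1 and 2, so the VGRF
 * is split at the remaining boundaries into three new VGRFs of sizes 1, 2
 * and 1.
 */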
1950 void
1951 fs_visitor::split_virtual_grfs()
1952 {
1953 int num_vars = this->alloc.count;
1954
1955 /* Count the total number of registers */
1956 int reg_count = 0;
1957 int vgrf_to_reg[num_vars];
1958 for (int i = 0; i < num_vars; i++) {
1959 vgrf_to_reg[i] = reg_count;
1960 reg_count += alloc.sizes[i];
1961 }
1962
1963 /* An array of "split points". For each register slot, this indicates
1964 * if this slot can be separated from the previous slot. Every time an
1965 * instruction uses multiple elements of a register (as a source or
1966 * destination), we mark the used slots as inseparable. Then we go
1967 * through and split the registers into the smallest pieces we can.
1968 */
1969 bool split_points[reg_count];
1970 memset(split_points, 0, sizeof(split_points));
1971
1972 /* Mark all used registers as fully splittable */
1973 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1974 if (inst->dst.file == GRF) {
1975 int reg = vgrf_to_reg[inst->dst.reg];
1976 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1977 split_points[reg + j] = true;
1978 }
1979
1980 for (int i = 0; i < inst->sources; i++) {
1981 if (inst->src[i].file == GRF) {
1982 int reg = vgrf_to_reg[inst->src[i].reg];
1983 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1984 split_points[reg + j] = true;
1985 }
1986 }
1987 }
1988
1989 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1990 if (inst->dst.file == GRF) {
1991 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1992 for (int j = 1; j < inst->regs_written; j++)
1993 split_points[reg + j] = false;
1994 }
1995 for (int i = 0; i < inst->sources; i++) {
1996 if (inst->src[i].file == GRF) {
1997 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1998 for (int j = 1; j < inst->regs_read(i); j++)
1999 split_points[reg + j] = false;
2000 }
2001 }
2002 }
2003
2004 int new_virtual_grf[reg_count];
2005 int new_reg_offset[reg_count];
2006
2007 int reg = 0;
2008 for (int i = 0; i < num_vars; i++) {
2009 /* The first one should always be 0 as a quick sanity check. */
2010 assert(split_points[reg] == false);
2011
2012 /* j = 0 case */
2013 new_reg_offset[reg] = 0;
2014 reg++;
2015 int offset = 1;
2016
2017 /* j > 0 case */
2018 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2019 /* If this is a split point, reset the offset to 0 and allocate a
2020           * new virtual GRF covering the previous 'offset' registers
2021 */
2022 if (split_points[reg]) {
2023 assert(offset <= MAX_VGRF_SIZE);
2024 int grf = alloc.allocate(offset);
2025 for (int k = reg - offset; k < reg; k++)
2026 new_virtual_grf[k] = grf;
2027 offset = 0;
2028 }
2029 new_reg_offset[reg] = offset;
2030 offset++;
2031 reg++;
2032 }
2033
2034 /* The last one gets the original register number */
2035 assert(offset <= MAX_VGRF_SIZE);
2036 alloc.sizes[i] = offset;
2037 for (int k = reg - offset; k < reg; k++)
2038 new_virtual_grf[k] = i;
2039 }
2040 assert(reg == reg_count);
2041
2042 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2043 if (inst->dst.file == GRF) {
2044 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2045 inst->dst.reg = new_virtual_grf[reg];
2046 inst->dst.reg_offset = new_reg_offset[reg];
2047 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2048 }
2049 for (int i = 0; i < inst->sources; i++) {
2050 if (inst->src[i].file == GRF) {
2051 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2052 inst->src[i].reg = new_virtual_grf[reg];
2053 inst->src[i].reg_offset = new_reg_offset[reg];
2054 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2055 }
2056 }
2057 }
2058 invalidate_live_intervals();
2059 }
2060
2061 /**
2062 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2063 *
2064 * During code generation, we create tons of temporary variables, many of
2065 * which get immediately killed and are never used again. Yet, in later
2066 * optimization and analysis passes, such as compute_live_intervals, we need
2067 * to loop over all the virtual GRFs. Compacting them can save a lot of
2068 * overhead.
2069 */
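/* For example (hypothetical numbers): if VGRFs 0, 2 and 3 are referenced but
 * VGRF 1 is not, the remap table ends up {0, -1, 1, 2}, alloc.count drops
 * from 4 to 3, and every instruction is patched to the new numbering.
 */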
2070 bool
2071 fs_visitor::compact_virtual_grfs()
2072 {
2073 bool progress = false;
2074 int remap_table[this->alloc.count];
2075 memset(remap_table, -1, sizeof(remap_table));
2076
2077 /* Mark which virtual GRFs are used. */
2078 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2079 if (inst->dst.file == GRF)
2080 remap_table[inst->dst.reg] = 0;
2081
2082 for (int i = 0; i < inst->sources; i++) {
2083 if (inst->src[i].file == GRF)
2084 remap_table[inst->src[i].reg] = 0;
2085 }
2086 }
2087
2088 /* Compact the GRF arrays. */
2089 int new_index = 0;
2090 for (unsigned i = 0; i < this->alloc.count; i++) {
2091 if (remap_table[i] == -1) {
2092 /* We just found an unused register. This means that we are
2093 * actually going to compact something.
2094 */
2095 progress = true;
2096 } else {
2097 remap_table[i] = new_index;
2098 alloc.sizes[new_index] = alloc.sizes[i];
2099 invalidate_live_intervals();
2100 ++new_index;
2101 }
2102 }
2103
2104 this->alloc.count = new_index;
2105
2106 /* Patch all the instructions to use the newly renumbered registers */
2107 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2108 if (inst->dst.file == GRF)
2109 inst->dst.reg = remap_table[inst->dst.reg];
2110
2111 for (int i = 0; i < inst->sources; i++) {
2112 if (inst->src[i].file == GRF)
2113 inst->src[i].reg = remap_table[inst->src[i].reg];
2114 }
2115 }
2116
2117 /* Patch all the references to delta_x/delta_y, since they're used in
2118 * register allocation. If they're unused, switch them to BAD_FILE so
2119 * we don't think some random VGRF is delta_x/delta_y.
2120 */
2121 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2122 if (delta_x[i].file == GRF) {
2123 if (remap_table[delta_x[i].reg] != -1) {
2124 delta_x[i].reg = remap_table[delta_x[i].reg];
2125 } else {
2126 delta_x[i].file = BAD_FILE;
2127 }
2128 }
2129 }
2130 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2131 if (delta_y[i].file == GRF) {
2132 if (remap_table[delta_y[i].reg] != -1) {
2133 delta_y[i].reg = remap_table[delta_y[i].reg];
2134 } else {
2135 delta_y[i].file = BAD_FILE;
2136 }
2137 }
2138 }
2139
2140 return progress;
2141 }
2142
2143 /*
2144 * Implements array access of uniforms by inserting a
2145 * PULL_CONSTANT_LOAD instruction.
2146 *
2147  * Unlike temporary GRF array access (which we don't support, due to
2148 * the difficulty of doing relative addressing on instruction
2149 * destinations), we could potentially do array access of uniforms
2150 * that were loaded in GRF space as push constants. In real-world
2151 * usage we've seen, though, the arrays being used are always larger
2152 * than we could load as push constants, so just always move all
2153 * uniform array access out to a pull constant buffer.
2154 */
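/* As a sketch: for a declaration like "uniform float a[16]" accessed with a
 * non-constant index, all 16 components get slots in pull_param[], and
 * demote_pull_constants() later rewrites the access into a
 * VARYING_PULL_CONSTANT_LOAD from those slots.
 */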
2155 void
2156 fs_visitor::move_uniform_array_access_to_pull_constants()
2157 {
2158 if (dispatch_width != 8)
2159 return;
2160
2161 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2162 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2163
2164 /* Walk through and find array access of uniforms. Put a copy of that
2165 * uniform in the pull constant buffer.
2166 *
2167 * Note that we don't move constant-indexed accesses to arrays. No
2168 * testing has been done of the performance impact of this choice.
2169 */
2170 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2171 for (int i = 0 ; i < inst->sources; i++) {
2172 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2173 continue;
2174
2175 int uniform = inst->src[i].reg;
2176
2177 /* If this array isn't already present in the pull constant buffer,
2178 * add it.
2179 */
2180 if (pull_constant_loc[uniform] == -1) {
2181 const gl_constant_value **values = &stage_prog_data->param[uniform];
2182
2183 assert(param_size[uniform]);
2184
2185 for (int j = 0; j < param_size[uniform]; j++) {
2186 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2187
2188 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2189 values[j];
2190 }
2191 }
2192 }
2193 }
2194 }
2195
2196 /**
2197 * Assign UNIFORM file registers to either push constants or pull constants.
2198 *
2199  * We allow a fragment shader to have more than the GL-specified minimum
2200  * value of the maximum number of fragment shader uniform components (64).
2201  * If there are too many of these, they'd fill up all of the register space.
2202 * So, this will push some of them out to the pull constant buffer and
2203 * update the program to load them.
2204 */
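/* For instance (illustrative numbers): with max_push_components == 128, a
 * shader with 200 live uniform components keeps push-constant slots for the
 * first 128 of them and demotes the remaining 72 to the pull constant
 * buffer.
 */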
2205 void
2206 fs_visitor::assign_constant_locations()
2207 {
2208 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2209 if (dispatch_width != 8)
2210 return;
2211
2212 /* Find which UNIFORM registers are still in use. */
2213 bool is_live[uniforms];
2214 for (unsigned int i = 0; i < uniforms; i++) {
2215 is_live[i] = false;
2216 }
2217
2218 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2219 for (int i = 0; i < inst->sources; i++) {
2220 if (inst->src[i].file != UNIFORM)
2221 continue;
2222
2223 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2224 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2225 is_live[constant_nr] = true;
2226 }
2227 }
2228
2229 /* Only allow 16 registers (128 uniform components) as push constants.
2230 *
2231 * Just demote the end of the list. We could probably do better
2232 * here, demoting things that are rarely used in the program first.
2233 *
2234 * If changing this value, note the limitation about total_regs in
2235 * brw_curbe.c.
2236 */
2237 unsigned int max_push_components = 16 * 8;
2238 unsigned int num_push_constants = 0;
2239
2240 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2241
2242 for (unsigned int i = 0; i < uniforms; i++) {
2243 if (!is_live[i] || pull_constant_loc[i] != -1) {
2244 /* This UNIFORM register is either dead, or has already been demoted
2245 * to a pull const. Mark it as no longer living in the param[] array.
2246 */
2247 push_constant_loc[i] = -1;
2248 continue;
2249 }
2250
2251 if (num_push_constants < max_push_components) {
2252 /* Retain as a push constant. Record the location in the params[]
2253 * array.
2254 */
2255 push_constant_loc[i] = num_push_constants++;
2256 } else {
2257 /* Demote to a pull constant. */
2258 push_constant_loc[i] = -1;
2259
2260 int pull_index = stage_prog_data->nr_pull_params++;
2261 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2262 pull_constant_loc[i] = pull_index;
2263 }
2264 }
2265
2266 stage_prog_data->nr_params = num_push_constants;
2267
2268 /* Up until now, the param[] array has been indexed by reg + reg_offset
2269 * of UNIFORM registers. Condense it to only contain the uniforms we
2270 * chose to upload as push constants.
2271 */
2272 for (unsigned int i = 0; i < uniforms; i++) {
2273 int remapped = push_constant_loc[i];
2274
2275 if (remapped == -1)
2276 continue;
2277
2278 assert(remapped <= (int)i);
2279 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2280 }
2281 }
2282
2283 /**
2284 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2285 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2286 */
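/* Roughly, for a constant-indexed uniform this turns
 *
 *    add vgrf8, u5, vgrf3
 *
 * into
 *
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD vgrf9, surf_index, offset
 *    add vgrf8, vgrf9.<smeared channel>, vgrf3
 *
 * where the offset and smear come from pull_constant_loc[5].  Register
 * numbers here are made up for illustration.
 */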
2287 void
2288 fs_visitor::demote_pull_constants()
2289 {
2290 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2291 for (int i = 0; i < inst->sources; i++) {
2292 if (inst->src[i].file != UNIFORM)
2293 continue;
2294
2295 int pull_index;
2296 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2297 if (location >= uniforms) /* Out of bounds access */
2298 pull_index = -1;
2299 else
2300 pull_index = pull_constant_loc[location];
2301
2302 if (pull_index == -1)
2303 continue;
2304
2305          /* Set up the annotation tracking for newly generated instructions. */
2306 base_ir = inst->ir;
2307 current_annotation = inst->annotation;
2308
2309 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2310 fs_reg dst = vgrf(glsl_type::float_type);
2311
2312 /* Generate a pull load into dst. */
2313 if (inst->src[i].reladdr) {
2314 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2315 surf_index,
2316 *inst->src[i].reladdr,
2317 pull_index);
2318 inst->insert_before(block, &list);
2319 inst->src[i].reladdr = NULL;
2320 } else {
2321 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2322 fs_inst *pull =
2323 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2324 dst, surf_index, offset);
2325 inst->insert_before(block, pull);
2326 inst->src[i].set_smear(pull_index & 3);
2327 }
2328
2329 /* Rewrite the instruction to use the temporary VGRF. */
2330 inst->src[i].file = GRF;
2331 inst->src[i].reg = dst.reg;
2332 inst->src[i].reg_offset = 0;
2333 inst->src[i].width = dispatch_width;
2334 }
2335 }
2336 invalidate_live_intervals();
2337 }
2338
2339 bool
2340 fs_visitor::opt_algebraic()
2341 {
2342 bool progress = false;
2343
2344 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2345 switch (inst->opcode) {
2346 case BRW_OPCODE_MOV:
2347 if (inst->src[0].file != IMM)
2348 break;
2349
2350 if (inst->saturate) {
2351 if (inst->dst.type != inst->src[0].type)
2352 assert(!"unimplemented: saturate mixed types");
2353
2354 if (brw_saturate_immediate(inst->dst.type,
2355 &inst->src[0].fixed_hw_reg)) {
2356 inst->saturate = false;
2357 progress = true;
2358 }
2359 }
2360 break;
2361
2362 case BRW_OPCODE_MUL:
2363 if (inst->src[1].file != IMM)
2364 continue;
2365
2366 /* a * 1.0 = a */
2367 if (inst->src[1].is_one()) {
2368 inst->opcode = BRW_OPCODE_MOV;
2369 inst->src[1] = reg_undef;
2370 progress = true;
2371 break;
2372 }
2373
2374 /* a * -1.0 = -a */
2375 if (inst->src[1].is_negative_one()) {
2376 inst->opcode = BRW_OPCODE_MOV;
2377 inst->src[0].negate = !inst->src[0].negate;
2378 inst->src[1] = reg_undef;
2379 progress = true;
2380 break;
2381 }
2382
2383 /* a * 0.0 = 0.0 */
2384 if (inst->src[1].is_zero()) {
2385 inst->opcode = BRW_OPCODE_MOV;
2386 inst->src[0] = inst->src[1];
2387 inst->src[1] = reg_undef;
2388 progress = true;
2389 break;
2390 }
2391
2392 if (inst->src[0].file == IMM) {
2393 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2394 inst->opcode = BRW_OPCODE_MOV;
2395 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2396 inst->src[1] = reg_undef;
2397 progress = true;
2398 break;
2399 }
2400 break;
2401 case BRW_OPCODE_ADD:
2402 if (inst->src[1].file != IMM)
2403 continue;
2404
2405 /* a + 0.0 = a */
2406 if (inst->src[1].is_zero()) {
2407 inst->opcode = BRW_OPCODE_MOV;
2408 inst->src[1] = reg_undef;
2409 progress = true;
2410 break;
2411 }
2412
2413 if (inst->src[0].file == IMM) {
2414 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2415 inst->opcode = BRW_OPCODE_MOV;
2416 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2417 inst->src[1] = reg_undef;
2418 progress = true;
2419 break;
2420 }
2421 break;
2422 case BRW_OPCODE_OR:
2423 if (inst->src[0].equals(inst->src[1])) {
2424 inst->opcode = BRW_OPCODE_MOV;
2425 inst->src[1] = reg_undef;
2426 progress = true;
2427 break;
2428 }
2429 break;
2430 case BRW_OPCODE_LRP:
2431 if (inst->src[1].equals(inst->src[2])) {
2432 inst->opcode = BRW_OPCODE_MOV;
2433 inst->src[0] = inst->src[1];
2434 inst->src[1] = reg_undef;
2435 inst->src[2] = reg_undef;
2436 progress = true;
2437 break;
2438 }
2439 break;
2440 case BRW_OPCODE_CMP:
2441 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2442 inst->src[0].abs &&
2443 inst->src[0].negate &&
2444 inst->src[1].is_zero()) {
2445 inst->src[0].abs = false;
2446 inst->src[0].negate = false;
2447 inst->conditional_mod = BRW_CONDITIONAL_Z;
2448 progress = true;
2449 break;
2450 }
2451 break;
2452 case BRW_OPCODE_SEL:
2453 if (inst->src[0].equals(inst->src[1])) {
2454 inst->opcode = BRW_OPCODE_MOV;
2455 inst->src[1] = reg_undef;
2456 inst->predicate = BRW_PREDICATE_NONE;
2457 inst->predicate_inverse = false;
2458 progress = true;
2459 } else if (inst->saturate && inst->src[1].file == IMM) {
2460 switch (inst->conditional_mod) {
2461 case BRW_CONDITIONAL_LE:
2462 case BRW_CONDITIONAL_L:
2463 switch (inst->src[1].type) {
2464 case BRW_REGISTER_TYPE_F:
2465 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2466 inst->opcode = BRW_OPCODE_MOV;
2467 inst->src[1] = reg_undef;
2468 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2469 progress = true;
2470 }
2471 break;
2472 default:
2473 break;
2474 }
2475 break;
2476 case BRW_CONDITIONAL_GE:
2477 case BRW_CONDITIONAL_G:
2478 switch (inst->src[1].type) {
2479 case BRW_REGISTER_TYPE_F:
2480 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2481 inst->opcode = BRW_OPCODE_MOV;
2482 inst->src[1] = reg_undef;
2483 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2484 progress = true;
2485 }
2486 break;
2487 default:
2488 break;
2489 }
2490 default:
2491 break;
2492 }
2493 }
2494 break;
2495 case BRW_OPCODE_MAD:
2496 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2497 inst->opcode = BRW_OPCODE_MOV;
2498 inst->src[1] = reg_undef;
2499 inst->src[2] = reg_undef;
2500 progress = true;
2501 } else if (inst->src[0].is_zero()) {
2502 inst->opcode = BRW_OPCODE_MUL;
2503 inst->src[0] = inst->src[2];
2504 inst->src[2] = reg_undef;
2505 progress = true;
2506 } else if (inst->src[1].is_one()) {
2507 inst->opcode = BRW_OPCODE_ADD;
2508 inst->src[1] = inst->src[2];
2509 inst->src[2] = reg_undef;
2510 progress = true;
2511 } else if (inst->src[2].is_one()) {
2512 inst->opcode = BRW_OPCODE_ADD;
2513 inst->src[2] = reg_undef;
2514 progress = true;
2515 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2516 inst->opcode = BRW_OPCODE_ADD;
2517 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2518 inst->src[2] = reg_undef;
2519 progress = true;
2520 }
2521 break;
2522 case SHADER_OPCODE_RCP: {
2523 fs_inst *prev = (fs_inst *)inst->prev;
2524 if (prev->opcode == SHADER_OPCODE_SQRT) {
2525 if (inst->src[0].equals(prev->dst)) {
2526 inst->opcode = SHADER_OPCODE_RSQ;
2527 inst->src[0] = prev->src[0];
2528 progress = true;
2529 }
2530 }
2531 break;
2532 }
2533 default:
2534 break;
2535 }
2536
2537 /* Swap if src[0] is immediate. */
2538 if (progress && inst->is_commutative()) {
2539 if (inst->src[0].file == IMM) {
2540 fs_reg tmp = inst->src[1];
2541 inst->src[1] = inst->src[0];
2542 inst->src[0] = tmp;
2543 }
2544 }
2545 }
2546 return progress;
2547 }
2548
2549 /**
2550 * Optimize sample messages which are followed by the final RT write.
2551 *
2552  * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2553 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2554 * final texturing results copied to the framebuffer write payload and modify
2555 * them to write to the framebuffer directly.
2556 */
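/* Schematically (not literal IR), a trailing sequence like
 *
 *    tex vgrf2, ...
 *    fb_write (EOT) <payload built from vgrf2>
 *
 * becomes a single texturing SEND that is itself marked EOT, with the
 * render-target index folded into the message offset, and the now-redundant
 * fb_write is removed.
 */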
2557 bool
2558 fs_visitor::opt_sampler_eot()
2559 {
2560 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2561
2562 if (brw->gen < 9 && !brw->is_cherryview)
2563 return false;
2564
2565 /* FINISHME: It should be possible to implement this optimization when there
2566 * are multiple drawbuffers.
2567 */
2568 if (key->nr_color_regions != 1)
2569 return false;
2570
2571 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2572 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2573 assert(fb_write->eot);
2574 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2575
2576 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2577
2578 /* There wasn't one; nothing to do. */
2579 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2580 return false;
2581
2582 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2583 * It's very likely to be the previous instruction.
2584 */
2585 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2586 if (load_payload->is_head_sentinel() ||
2587 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2588 return false;
2589
2590 assert(!tex_inst->eot); /* We can't get here twice */
2591 assert((tex_inst->offset & (0xff << 24)) == 0);
2592
2593 tex_inst->offset |= fb_write->target << 24;
2594 tex_inst->eot = true;
2595 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2596
2597 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2598 * to create a new LOAD_PAYLOAD command with the same sources and a space
2599 * saved for the header. Using a new destination register not only makes sure
2600 * we have enough space, but it will make sure the dead code eliminator kills
2601 * the instruction that this will replace.
2602 */
2603 if (tex_inst->header_present)
2604 return true;
2605
2606 fs_reg send_header = vgrf(load_payload->sources + 1);
2607 fs_reg *new_sources =
2608 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2609
2610 new_sources[0] = fs_reg();
2611 for (int i = 0; i < load_payload->sources; i++)
2612 new_sources[i+1] = load_payload->src[i];
2613
2614 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2615 * requires a lot of information about the sources to appropriately figure
2616     * out the number of registers that need to be used.  Given this stage in our
2617 * optimization, we may not have the appropriate GRFs required by
2618 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2619 * manually emit the instruction.
2620 */
2621 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2622 load_payload->exec_size,
2623 send_header,
2624 new_sources,
2625 load_payload->sources + 1);
2626
2627 new_load_payload->regs_written = load_payload->regs_written + 1;
2628 tex_inst->mlen++;
2629 tex_inst->header_present = true;
2630 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2631 tex_inst->src[0] = send_header;
2632 tex_inst->dst = reg_null_ud;
2633
2634 return true;
2635 }
2636
2637 bool
2638 fs_visitor::opt_register_renaming()
2639 {
2640 bool progress = false;
2641 int depth = 0;
2642
2643 int remap[alloc.count];
2644 memset(remap, -1, sizeof(int) * alloc.count);
2645
2646 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2647 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2648 depth++;
2649 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2650 inst->opcode == BRW_OPCODE_WHILE) {
2651 depth--;
2652 }
2653
2654 /* Rewrite instruction sources. */
2655 for (int i = 0; i < inst->sources; i++) {
2656 if (inst->src[i].file == GRF &&
2657 remap[inst->src[i].reg] != -1 &&
2658 remap[inst->src[i].reg] != inst->src[i].reg) {
2659 inst->src[i].reg = remap[inst->src[i].reg];
2660 progress = true;
2661 }
2662 }
2663
2664 const int dst = inst->dst.reg;
2665
2666 if (depth == 0 &&
2667 inst->dst.file == GRF &&
2668 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2669 !inst->is_partial_write()) {
2670 if (remap[dst] == -1) {
2671 remap[dst] = dst;
2672 } else {
2673 remap[dst] = alloc.allocate(inst->dst.width / 8);
2674 inst->dst.reg = remap[dst];
2675 progress = true;
2676 }
2677 } else if (inst->dst.file == GRF &&
2678 remap[dst] != -1 &&
2679 remap[dst] != dst) {
2680 inst->dst.reg = remap[dst];
2681 progress = true;
2682 }
2683 }
2684
2685 if (progress) {
2686 invalidate_live_intervals();
2687
2688 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2689 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2690 delta_x[i].reg = remap[delta_x[i].reg];
2691 }
2692 }
2693 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2694 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2695 delta_y[i].reg = remap[delta_y[i].reg];
2696 }
2697 }
2698 }
2699
2700 return progress;
2701 }
2702
2703 /**
2704 * Remove redundant or useless discard jumps.
2705 *
2706 * For example, we can eliminate jumps in the following sequence:
2707 *
2708 * discard-jump (redundant with the next jump)
2709 * discard-jump (useless; jumps to the next instruction)
2710 * placeholder-halt
2711 */
2712 bool
2713 fs_visitor::opt_redundant_discard_jumps()
2714 {
2715 bool progress = false;
2716
2717 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2718
2719 fs_inst *placeholder_halt = NULL;
2720 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2721 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2722 placeholder_halt = inst;
2723 break;
2724 }
2725 }
2726
2727 if (!placeholder_halt)
2728 return false;
2729
2730 /* Delete any HALTs immediately before the placeholder halt. */
2731 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2732 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2733 prev = (fs_inst *) placeholder_halt->prev) {
2734 prev->remove(last_bblock);
2735 progress = true;
2736 }
2737
2738 if (progress)
2739 invalidate_live_intervals();
2740
2741 return progress;
2742 }
2743
2744 bool
2745 fs_visitor::compute_to_mrf()
2746 {
2747 bool progress = false;
2748 int next_ip = 0;
2749
2750 /* No MRFs on Gen >= 7. */
2751 if (brw->gen >= 7)
2752 return false;
2753
2754 calculate_live_intervals();
2755
2756 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2757 int ip = next_ip;
2758 next_ip++;
2759
2760 if (inst->opcode != BRW_OPCODE_MOV ||
2761 inst->is_partial_write() ||
2762 inst->dst.file != MRF || inst->src[0].file != GRF ||
2763 inst->dst.type != inst->src[0].type ||
2764 inst->src[0].abs || inst->src[0].negate ||
2765 !inst->src[0].is_contiguous() ||
2766 inst->src[0].subreg_offset)
2767 continue;
2768
2769 /* Work out which hardware MRF registers are written by this
2770 * instruction.
2771 */
2772 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2773 int mrf_high;
2774 if (inst->dst.reg & BRW_MRF_COMPR4) {
2775 mrf_high = mrf_low + 4;
2776 } else if (inst->exec_size == 16) {
2777 mrf_high = mrf_low + 1;
2778 } else {
2779 mrf_high = mrf_low;
2780 }
2781
2782 /* Can't compute-to-MRF this GRF if someone else was going to
2783 * read it later.
2784 */
2785 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2786 continue;
2787
2788 /* Found a move of a GRF to a MRF. Let's see if we can go
2789 * rewrite the thing that made this GRF to write into the MRF.
2790 */
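      /* A rough example (made-up register numbers):
       *
       *    add vgrf7, vgrf1, vgrf2
       *    mov m4, vgrf7
       *
       * becomes
       *
       *    add m4, vgrf1, vgrf2
       *
       * provided nothing reads vgrf7 afterwards.
       */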
2791 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2792 if (scan_inst->dst.file == GRF &&
2793 scan_inst->dst.reg == inst->src[0].reg) {
2794 /* Found the last thing to write our reg we want to turn
2795 * into a compute-to-MRF.
2796 */
2797
2798 /* If this one instruction didn't populate all the
2799 * channels, bail. We might be able to rewrite everything
2800 * that writes that reg, but it would require smarter
2801 * tracking to delay the rewriting until complete success.
2802 */
2803 if (scan_inst->is_partial_write())
2804 break;
2805
2806 /* Things returning more than one register would need us to
2807 * understand coalescing out more than one MOV at a time.
2808 */
2809 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2810 break;
2811
2812 /* SEND instructions can't have MRF as a destination. */
2813 if (scan_inst->mlen)
2814 break;
2815
2816 if (brw->gen == 6) {
2817 /* gen6 math instructions must have the destination be
2818 * GRF, so no compute-to-MRF for them.
2819 */
2820 if (scan_inst->is_math()) {
2821 break;
2822 }
2823 }
2824
2825 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2826 /* Found the creator of our MRF's source value. */
2827 scan_inst->dst.file = MRF;
2828 scan_inst->dst.reg = inst->dst.reg;
2829 scan_inst->saturate |= inst->saturate;
2830 inst->remove(block);
2831 progress = true;
2832 }
2833 break;
2834 }
2835
2836 /* We don't handle control flow here. Most computation of
2837        * values that end up in MRFs happens shortly before the MRF
2838 * write anyway.
2839 */
2840 if (block->start() == scan_inst)
2841 break;
2842
2843 /* You can't read from an MRF, so if someone else reads our
2844 * MRF's source GRF that we wanted to rewrite, that stops us.
2845 */
2846 bool interfered = false;
2847 for (int i = 0; i < scan_inst->sources; i++) {
2848 if (scan_inst->src[i].file == GRF &&
2849 scan_inst->src[i].reg == inst->src[0].reg &&
2850 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2851 interfered = true;
2852 }
2853 }
2854 if (interfered)
2855 break;
2856
2857 if (scan_inst->dst.file == MRF) {
2858 /* If somebody else writes our MRF here, we can't
2859 * compute-to-MRF before that.
2860 */
2861 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2862 int scan_mrf_high;
2863
2864 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2865 scan_mrf_high = scan_mrf_low + 4;
2866 } else if (scan_inst->exec_size == 16) {
2867 scan_mrf_high = scan_mrf_low + 1;
2868 } else {
2869 scan_mrf_high = scan_mrf_low;
2870 }
2871
2872 if (mrf_low == scan_mrf_low ||
2873 mrf_low == scan_mrf_high ||
2874 mrf_high == scan_mrf_low ||
2875 mrf_high == scan_mrf_high) {
2876 break;
2877 }
2878 }
2879
2880 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2881 /* Found a SEND instruction, which means that there are
2882 * live values in MRFs from base_mrf to base_mrf +
2883 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2884 * above it.
2885 */
2886 if (mrf_low >= scan_inst->base_mrf &&
2887 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2888 break;
2889 }
2890 if (mrf_high >= scan_inst->base_mrf &&
2891 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2892 break;
2893 }
2894 }
2895 }
2896 }
2897
2898 if (progress)
2899 invalidate_live_intervals();
2900
2901 return progress;
2902 }
2903
2904 /**
2905  * Emit the replicated-data clear shader: a MOV of the clear color into the
2906  * message payload followed by an FS_OPCODE_REP_FB_WRITE for each color region.
2907 */
2908 void
2909 fs_visitor::emit_repclear_shader()
2910 {
2911 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2912 int base_mrf = 1;
2913 int color_mrf = base_mrf + 2;
2914
2915 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2916 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2917 mov->force_writemask_all = true;
2918
2919 fs_inst *write;
2920 if (key->nr_color_regions == 1) {
2921 write = emit(FS_OPCODE_REP_FB_WRITE);
2922 write->saturate = key->clamp_fragment_color;
2923 write->base_mrf = color_mrf;
2924 write->target = 0;
2925 write->header_present = false;
2926 write->mlen = 1;
2927 } else {
2928 assume(key->nr_color_regions > 0);
2929 for (int i = 0; i < key->nr_color_regions; ++i) {
2930 write = emit(FS_OPCODE_REP_FB_WRITE);
2931 write->saturate = key->clamp_fragment_color;
2932 write->base_mrf = base_mrf;
2933 write->target = i;
2934 write->header_present = true;
2935 write->mlen = 3;
2936 }
2937 }
2938 write->eot = true;
2939
2940 calculate_cfg();
2941
2942 assign_constant_locations();
2943 assign_curb_setup();
2944
2945 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2946 assert(mov->src[0].file == HW_REG);
2947 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2948 }
2949
2950 /**
2951 * Walks through basic blocks, looking for repeated MRF writes and
2952 * removing the later ones.
2953 */
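/* For example, if two identical "mov m2, vgrf5" instructions appear with no
 * intervening write to m2 or vgrf5 (and no control flow or SEND in between),
 * the second MOV is removed.  Register numbers are hypothetical.
 */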
2954 bool
2955 fs_visitor::remove_duplicate_mrf_writes()
2956 {
2957 fs_inst *last_mrf_move[16];
2958 bool progress = false;
2959
2960    /* We'd need to update the MRF tracking for compressed (SIMD16) instructions, so bail. */
2961 if (dispatch_width == 16)
2962 return false;
2963
2964 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2965
2966 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2967 if (inst->is_control_flow()) {
2968 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2969 }
2970
2971 if (inst->opcode == BRW_OPCODE_MOV &&
2972 inst->dst.file == MRF) {
2973 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2974 if (prev_inst && inst->equals(prev_inst)) {
2975 inst->remove(block);
2976 progress = true;
2977 continue;
2978 }
2979 }
2980
2981 /* Clear out the last-write records for MRFs that were overwritten. */
2982 if (inst->dst.file == MRF) {
2983 last_mrf_move[inst->dst.reg] = NULL;
2984 }
2985
2986 if (inst->mlen > 0 && inst->base_mrf != -1) {
2987 /* Found a SEND instruction, which will include two or fewer
2988 * implied MRF writes. We could do better here.
2989 */
2990 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2991 last_mrf_move[inst->base_mrf + i] = NULL;
2992 }
2993 }
2994
2995 /* Clear out any MRF move records whose sources got overwritten. */
2996 if (inst->dst.file == GRF) {
2997 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2998 if (last_mrf_move[i] &&
2999 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3000 last_mrf_move[i] = NULL;
3001 }
3002 }
3003 }
3004
3005 if (inst->opcode == BRW_OPCODE_MOV &&
3006 inst->dst.file == MRF &&
3007 inst->src[0].file == GRF &&
3008 !inst->is_partial_write()) {
3009 last_mrf_move[inst->dst.reg] = inst;
3010 }
3011 }
3012
3013 if (progress)
3014 invalidate_live_intervals();
3015
3016 return progress;
3017 }
3018
3019 static void
3020 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3021 {
3022 /* Clear the flag for registers that actually got read (as expected). */
3023 for (int i = 0; i < inst->sources; i++) {
3024 int grf;
3025 if (inst->src[i].file == GRF) {
3026 grf = inst->src[i].reg;
3027 } else if (inst->src[i].file == HW_REG &&
3028 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3029 grf = inst->src[i].fixed_hw_reg.nr;
3030 } else {
3031 continue;
3032 }
3033
3034 if (grf >= first_grf &&
3035 grf < first_grf + grf_len) {
3036 deps[grf - first_grf] = false;
3037 if (inst->exec_size == 16)
3038 deps[grf - first_grf + 1] = false;
3039 }
3040 }
3041 }
3042
3043 /**
3044 * Implements this workaround for the original 965:
3045 *
3046 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3047 * check for post destination dependencies on this instruction, software
3048 * must ensure that there is no destination hazard for the case of ‘write
3049 * followed by a posted write’ shown in the following example.
3050 *
3051 * 1. mov r3 0
3052 * 2. send r3.xy <rest of send instruction>
3053 * 3. mov r2 r3
3054 *
3055 * Due to no post-destination dependency check on the ‘send’, the above
3056 * code sequence could have two instructions (1 and 2) in flight at the
3057 * same time that both consider ‘r3’ as the target of their final writes.
3058 */
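/* The fix, sketched: before the send we insert dependency-resolving MOVs
 * (DEP_RESOLVE_MOV below) that read the registers the send is about to
 * write, so an outstanding write such as (1) must complete before the
 * posted write from (2) can issue.
 */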
3059 void
3060 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3061 fs_inst *inst)
3062 {
3063 int write_len = inst->regs_written;
3064 int first_write_grf = inst->dst.reg;
3065 bool needs_dep[BRW_MAX_MRF];
3066 assert(write_len < (int)sizeof(needs_dep) - 1);
3067
3068 memset(needs_dep, false, sizeof(needs_dep));
3069 memset(needs_dep, true, write_len);
3070
3071 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3072
3073 /* Walk backwards looking for writes to registers we're writing which
3074 * aren't read since being written. If we hit the start of the program,
3075 * we assume that there are no outstanding dependencies on entry to the
3076 * program.
3077 */
3078 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3079 /* If we hit control flow, assume that there *are* outstanding
3080 * dependencies, and force their cleanup before our instruction.
3081 */
3082 if (block->start() == scan_inst) {
3083 for (int i = 0; i < write_len; i++) {
3084 if (needs_dep[i]) {
3085 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3086 }
3087 }
3088 return;
3089 }
3090
3091 /* We insert our reads as late as possible on the assumption that any
3092 * instruction but a MOV that might have left us an outstanding
3093 * dependency has more latency than a MOV.
3094 */
3095 if (scan_inst->dst.file == GRF) {
3096 for (int i = 0; i < scan_inst->regs_written; i++) {
3097 int reg = scan_inst->dst.reg + i;
3098
3099 if (reg >= first_write_grf &&
3100 reg < first_write_grf + write_len &&
3101 needs_dep[reg - first_write_grf]) {
3102 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3103 needs_dep[reg - first_write_grf] = false;
3104 if (scan_inst->exec_size == 16)
3105 needs_dep[reg - first_write_grf + 1] = false;
3106 }
3107 }
3108 }
3109
3110 /* Clear the flag for registers that actually got read (as expected). */
3111 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3112
3113 /* Continue the loop only if we haven't resolved all the dependencies */
3114 int i;
3115 for (i = 0; i < write_len; i++) {
3116 if (needs_dep[i])
3117 break;
3118 }
3119 if (i == write_len)
3120 return;
3121 }
3122 }
3123
3124 /**
3125 * Implements this workaround for the original 965:
3126 *
3127 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3128 * used as a destination register until after it has been sourced by an
3129 * instruction with a different destination register.
3130 */
3131 void
3132 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3133 {
3134 int write_len = inst->regs_written;
3135 int first_write_grf = inst->dst.reg;
3136 bool needs_dep[BRW_MAX_MRF];
3137 assert(write_len < (int)sizeof(needs_dep) - 1);
3138
3139 memset(needs_dep, false, sizeof(needs_dep));
3140 memset(needs_dep, true, write_len);
3141 /* Walk forwards looking for writes to registers we're writing which aren't
3142 * read before being written.
3143 */
3144 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3145 /* If we hit control flow, force resolve all remaining dependencies. */
3146 if (block->end() == scan_inst) {
3147 for (int i = 0; i < write_len; i++) {
3148 if (needs_dep[i])
3149 scan_inst->insert_before(block,
3150 DEP_RESOLVE_MOV(first_write_grf + i));
3151 }
3152 return;
3153 }
3154
3155 /* Clear the flag for registers that actually got read (as expected). */
3156 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3157
3158 /* We insert our reads as late as possible since they're reading the
3159 * result of a SEND, which has massive latency.
3160 */
3161 if (scan_inst->dst.file == GRF &&
3162 scan_inst->dst.reg >= first_write_grf &&
3163 scan_inst->dst.reg < first_write_grf + write_len &&
3164 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3165 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3166 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3167 }
3168
3169 /* Continue the loop only if we haven't resolved all the dependencies */
3170 int i;
3171 for (i = 0; i < write_len; i++) {
3172 if (needs_dep[i])
3173 break;
3174 }
3175 if (i == write_len)
3176 return;
3177 }
3178 }
3179
3180 void
3181 fs_visitor::insert_gen4_send_dependency_workarounds()
3182 {
3183 if (brw->gen != 4 || brw->is_g4x)
3184 return;
3185
3186 bool progress = false;
3187
3188 /* Note that we're done with register allocation, so GRF fs_regs always
3189 * have a .reg_offset of 0.
3190 */
3191
3192 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3193 if (inst->mlen != 0 && inst->dst.file == GRF) {
3194 insert_gen4_pre_send_dependency_workarounds(block, inst);
3195 insert_gen4_post_send_dependency_workarounds(block, inst);
3196 progress = true;
3197 }
3198 }
3199
3200 if (progress)
3201 invalidate_live_intervals();
3202 }
3203
3204 /**
3205 * Turns the generic expression-style uniform pull constant load instruction
3206 * into a hardware-specific series of instructions for loading a pull
3207 * constant.
3208 *
3209 * The expression style allows the CSE pass before this to optimize out
3210 * repeated loads from the same offset, and gives the pre-register-allocation
3211 * scheduling full flexibility, while the conversion to native instructions
3212 * allows the post-register-allocation scheduler the best information
3213 * possible.
3214 *
3215 * Note that execution masking for setting up pull constant loads is special:
3216 * the channels that need to be written are unrelated to the current execution
3217 * mask, since a later instruction will use one of the result channels as a
3218 * source operand for all 8 or 16 of its channels.
3219 */
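/* Sketch of the Gen7+ lowering below: a generic
 *
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD dst, surf_index, byte_offset
 *
 * becomes
 *
 *    FS_OPCODE_SET_SIMD4X2_OFFSET payload, dword_offset
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7 dst, surf_index, payload
 *
 * while on pre-Gen7 hardware the original instruction simply gets an MRF
 * (base_mrf 14, mlen 1) assigned for its message.
 */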
3220 void
3221 fs_visitor::lower_uniform_pull_constant_loads()
3222 {
3223 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3224 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3225 continue;
3226
3227 if (brw->gen >= 7) {
3228 /* The offset arg before was a vec4-aligned byte offset. We need to
3229 * turn it into a dword offset.
3230 */
3231 fs_reg const_offset_reg = inst->src[1];
3232 assert(const_offset_reg.file == IMM &&
3233 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3234 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3235 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3236
3237 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3238 * Reserve space for the register.
3239 */
3240 if (brw->gen >= 9) {
3241 payload.reg_offset++;
3242 alloc.sizes[payload.reg] = 2;
3243 }
3244
3245 /* This is actually going to be a MOV, but since only the first dword
3246 * is accessed, we have a special opcode to do just that one. Note
3247 * that this needs to be an operation that will be considered a def
3248 * by live variable analysis, or register allocation will explode.
3249 */
3250 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3251 8, payload, const_offset_reg);
3252 setup->force_writemask_all = true;
3253
3254 setup->ir = inst->ir;
3255 setup->annotation = inst->annotation;
3256 inst->insert_before(block, setup);
3257
3258 /* Similarly, this will only populate the first 4 channels of the
3259 * result register (since we only use smear values from 0-3), but we
3260 * don't tell the optimizer.
3261 */
3262 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3263 inst->src[1] = payload;
3264
3265 invalidate_live_intervals();
3266 } else {
3267 /* Before register allocation, we didn't tell the scheduler about the
3268 * MRF we use. We know it's safe to use this MRF because nothing
3269 * else does except for register spill/unspill, which generates and
3270 * uses its MRF within a single IR instruction.
3271 */
3272 inst->base_mrf = 14;
3273 inst->mlen = 1;
3274 }
3275 }
3276 }
3277
3278 bool
3279 fs_visitor::lower_load_payload()
3280 {
3281 bool progress = false;
3282
3283 int vgrf_to_reg[alloc.count];
3284 int reg_count = 0;
3285 for (unsigned i = 0; i < alloc.count; ++i) {
3286 vgrf_to_reg[i] = reg_count;
3287 reg_count += alloc.sizes[i];
3288 }
3289
3290 struct {
3291 bool written:1; /* Whether this register has ever been written */
3292 bool force_writemask_all:1;
3293 bool force_sechalf:1;
3294 } metadata[reg_count];
3295 memset(metadata, 0, sizeof(metadata));
3296
3297 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3298 if (inst->dst.file == GRF) {
3299 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3300 bool force_sechalf = inst->force_sechalf &&
3301 !inst->force_writemask_all;
3302 bool toggle_sechalf = inst->dst.width == 16 &&
3303 type_sz(inst->dst.type) == 4 &&
3304 !inst->force_writemask_all;
3305 for (int i = 0; i < inst->regs_written; ++i) {
3306 metadata[dst_reg + i].written = true;
3307 metadata[dst_reg + i].force_sechalf = force_sechalf;
3308 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3309 force_sechalf = (toggle_sechalf != force_sechalf);
3310 }
3311 }
3312
3313 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3314 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3315 fs_reg dst = inst->dst;
3316
3317 for (int i = 0; i < inst->sources; i++) {
3318 dst.width = inst->src[i].effective_width;
3319 dst.type = inst->src[i].type;
3320
3321 if (inst->src[i].file == BAD_FILE) {
3322 /* Do nothing but otherwise increment as normal */
3323 } else if (dst.file == MRF &&
3324 dst.width == 8 &&
3325 brw->has_compr4 &&
3326 i + 4 < inst->sources &&
3327 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3328 fs_reg compr4_dst = dst;
3329 compr4_dst.reg += BRW_MRF_COMPR4;
3330 compr4_dst.width = 16;
3331 fs_reg compr4_src = inst->src[i];
3332 compr4_src.width = 16;
3333 fs_inst *mov = MOV(compr4_dst, compr4_src);
3334 mov->force_writemask_all = true;
3335 inst->insert_before(block, mov);
3336 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3337 inst->src[i + 4].file = BAD_FILE;
3338 } else {
3339 fs_inst *mov = MOV(dst, inst->src[i]);
3340 if (inst->src[i].file == GRF) {
3341 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3342 inst->src[i].reg_offset;
3343 mov->force_sechalf = metadata[src_reg].force_sechalf;
3344 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3345 } else {
3346 /* We don't have any useful metadata for immediates or
3347 * uniforms. Assume that any of the channels of the
3348 * destination may be used.
3349 */
3350 assert(inst->src[i].file == IMM ||
3351 inst->src[i].file == UNIFORM);
3352 mov->force_writemask_all = true;
3353 }
3354
3355 if (dst.file == GRF) {
3356 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3357 const bool force_writemask = mov->force_writemask_all;
3358 metadata[dst_reg].force_writemask_all = force_writemask;
3359 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3360 if (dst.width * type_sz(dst.type) > 32) {
3361 assert(!mov->force_sechalf);
3362 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3363 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3364 }
3365 }
3366
3367 inst->insert_before(block, mov);
3368 }
3369
3370 dst = offset(dst, 1);
3371 }
3372
3373 inst->remove(block);
3374 progress = true;
3375 }
3376 }
3377
3378 if (progress)
3379 invalidate_live_intervals();
3380
3381 return progress;
3382 }
3383
3384 void
3385 fs_visitor::dump_instructions()
3386 {
3387 dump_instructions(NULL);
3388 }
3389
3390 void
3391 fs_visitor::dump_instructions(const char *name)
3392 {
3393 FILE *file = stderr;
3394 if (name && geteuid() != 0) {
3395 file = fopen(name, "w");
3396 if (!file)
3397 file = stderr;
3398 }
3399
3400 if (cfg) {
3401 calculate_register_pressure();
3402 int ip = 0, max_pressure = 0;
3403 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3404 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3405 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3406 dump_instruction(inst, file);
3407 ip++;
3408 }
3409 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3410 } else {
3411 int ip = 0;
3412 foreach_in_list(backend_instruction, inst, &instructions) {
3413 fprintf(file, "%4d: ", ip++);
3414 dump_instruction(inst, file);
3415 }
3416 }
3417
3418 if (file != stderr) {
3419 fclose(file);
3420 }
3421 }
3422
3423 void
3424 fs_visitor::dump_instruction(backend_instruction *be_inst)
3425 {
3426 dump_instruction(be_inst, stderr);
3427 }
3428
3429 void
3430 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3431 {
3432 fs_inst *inst = (fs_inst *)be_inst;
3433
3434 if (inst->predicate) {
3435 fprintf(file, "(%cf0.%d) ",
3436 inst->predicate_inverse ? '-' : '+',
3437 inst->flag_subreg);
3438 }
3439
3440 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3441 if (inst->saturate)
3442 fprintf(file, ".sat");
3443 if (inst->conditional_mod) {
3444 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3445 if (!inst->predicate &&
3446 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3447 inst->opcode != BRW_OPCODE_IF &&
3448 inst->opcode != BRW_OPCODE_WHILE))) {
3449 fprintf(file, ".f0.%d", inst->flag_subreg);
3450 }
3451 }
3452 fprintf(file, "(%d) ", inst->exec_size);
3453
3454
3455 switch (inst->dst.file) {
3456 case GRF:
3457 fprintf(file, "vgrf%d", inst->dst.reg);
3458 if (inst->dst.width != dispatch_width)
3459 fprintf(file, "@%d", inst->dst.width);
3460 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3461 inst->dst.subreg_offset)
3462 fprintf(file, "+%d.%d",
3463 inst->dst.reg_offset, inst->dst.subreg_offset);
3464 break;
3465 case MRF:
3466 fprintf(file, "m%d", inst->dst.reg);
3467 break;
3468 case BAD_FILE:
3469 fprintf(file, "(null)");
3470 break;
3471 case UNIFORM:
3472 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3473 break;
3474 case ATTR:
3475 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3476 break;
3477 case HW_REG:
3478 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3479 switch (inst->dst.fixed_hw_reg.nr) {
3480 case BRW_ARF_NULL:
3481 fprintf(file, "null");
3482 break;
3483 case BRW_ARF_ADDRESS:
3484 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3485 break;
3486 case BRW_ARF_ACCUMULATOR:
3487 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3488 break;
3489 case BRW_ARF_FLAG:
3490 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3491 inst->dst.fixed_hw_reg.subnr);
3492 break;
3493 default:
3494 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3495 inst->dst.fixed_hw_reg.subnr);
3496 break;
3497 }
3498 } else {
3499 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3500 }
3501 if (inst->dst.fixed_hw_reg.subnr)
3502 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3503 break;
3504 default:
3505 fprintf(file, "???");
3506 break;
3507 }
3508 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3509
3510 for (int i = 0; i < inst->sources; i++) {
3511 if (inst->src[i].negate)
3512 fprintf(file, "-");
3513 if (inst->src[i].abs)
3514 fprintf(file, "|");
3515 switch (inst->src[i].file) {
3516 case GRF:
3517 fprintf(file, "vgrf%d", inst->src[i].reg);
3518 if (inst->src[i].width != dispatch_width)
3519 fprintf(file, "@%d", inst->src[i].width);
3520 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3521 inst->src[i].subreg_offset)
3522 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3523 inst->src[i].subreg_offset);
3524 break;
3525 case MRF:
3526 fprintf(file, "***m%d***", inst->src[i].reg);
3527 break;
3528 case ATTR:
3529 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3530 break;
3531 case UNIFORM:
3532 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3533 if (inst->src[i].reladdr) {
3534 fprintf(file, "+reladdr");
3535 } else if (inst->src[i].subreg_offset) {
3536 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3537 inst->src[i].subreg_offset);
3538 }
3539 break;
3540 case BAD_FILE:
3541 fprintf(file, "(null)");
3542 break;
3543 case IMM:
3544 switch (inst->src[i].type) {
3545 case BRW_REGISTER_TYPE_F:
3546 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3547 break;
3548 case BRW_REGISTER_TYPE_W:
3549 case BRW_REGISTER_TYPE_D:
3550 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3551 break;
3552 case BRW_REGISTER_TYPE_UW:
3553 case BRW_REGISTER_TYPE_UD:
3554 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3555 break;
3556 case BRW_REGISTER_TYPE_VF:
3557 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3558 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3559 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3560 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3561 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3562 break;
3563 default:
3564 fprintf(file, "???");
3565 break;
3566 }
3567 break;
3568 case HW_REG:
3569 if (inst->src[i].fixed_hw_reg.negate)
3570 fprintf(file, "-");
3571 if (inst->src[i].fixed_hw_reg.abs)
3572 fprintf(file, "|");
3573 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3574 switch (inst->src[i].fixed_hw_reg.nr) {
3575 case BRW_ARF_NULL:
3576 fprintf(file, "null");
3577 break;
3578 case BRW_ARF_ADDRESS:
3579 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3580 break;
3581 case BRW_ARF_ACCUMULATOR:
3582 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3583 break;
3584 case BRW_ARF_FLAG:
3585 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3586 inst->src[i].fixed_hw_reg.subnr);
3587 break;
3588 default:
3589 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3590 inst->src[i].fixed_hw_reg.subnr);
3591 break;
3592 }
3593 } else {
3594 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3595 }
3596 if (inst->src[i].fixed_hw_reg.subnr)
3597 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3598 if (inst->src[i].fixed_hw_reg.abs)
3599 fprintf(file, "|");
3600 break;
3601 default:
3602 fprintf(file, "???");
3603 break;
3604 }
3605 if (inst->src[i].abs)
3606 fprintf(file, "|");
3607
3608 if (inst->src[i].file != IMM) {
3609 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3610 }
3611
3612 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3613 fprintf(file, ", ");
3614 }
3615
3616 fprintf(file, " ");
3617
3618 if (dispatch_width == 16 && inst->exec_size == 8) {
3619 if (inst->force_sechalf)
3620 fprintf(file, "2ndhalf ");
3621 else
3622 fprintf(file, "1sthalf ");
3623 }
3624
3625 fprintf(file, "\n");
3626 }
3627
3628 /**
3629 * Possibly returns an instruction that set up @param reg.
3630 *
3631 * Sometimes we want to take the result of some expression/variable
3632 * dereference tree and rewrite the instruction generating the result
3633 * of the tree. When processing the tree, we know that the
3634 * instructions generated are all writing temporaries that are dead
3635 * outside of this tree. So, if we have some instructions that write
3636 * a temporary, we're free to point that temp write somewhere else.
3637 *
3638 * Note that this doesn't guarantee that the instruction generated
3639 * only reg -- it might be the size=4 destination of a texture instruction.
3640 */
3641 fs_inst *
3642 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3643 fs_inst *end,
3644 const fs_reg &reg)
3645 {
3646 if (end == start ||
3647 end->is_partial_write() ||
3648 reg.reladdr ||
3649 !reg.equals(end->dst)) {
3650 return NULL;
3651 } else {
3652 return end;
3653 }
3654 }
3655
3656 void
3657 fs_visitor::setup_payload_gen6()
3658 {
3659 bool uses_depth =
3660 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3661 unsigned barycentric_interp_modes =
3662 (stage == MESA_SHADER_FRAGMENT) ?
3663 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3664
3665 assert(brw->gen >= 6);
3666
3667 /* R0-1: masks, pixel X/Y coordinates. */
3668 payload.num_regs = 2;
3669    /* R2: only for 32-pixel dispatch. */
3670
3671 /* R3-26: barycentric interpolation coordinates. These appear in the
3672 * same order that they appear in the brw_wm_barycentric_interp_mode
3673 * enum. Each set of coordinates occupies 2 registers if dispatch width
3674 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3675 * appear if they were enabled using the "Barycentric Interpolation
3676 * Mode" bits in WM_STATE.
3677 */
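   /* Illustrative numbers: with two barycentric modes enabled in a SIMD16
    * shader, the loop below adds 4 registers per mode, growing
    * payload.num_regs from 2 to 10.
    */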
3678 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3679 if (barycentric_interp_modes & (1 << i)) {
3680 payload.barycentric_coord_reg[i] = payload.num_regs;
3681 payload.num_regs += 2;
3682 if (dispatch_width == 16) {
3683 payload.num_regs += 2;
3684 }
3685 }
3686 }
3687
3688 /* R27: interpolated depth if the shader uses source depth. */
3689 if (uses_depth) {
3690 payload.source_depth_reg = payload.num_regs;
3691 payload.num_regs++;
3692 if (dispatch_width == 16) {
3693 /* R28: interpolated depth if not SIMD8. */
3694 payload.num_regs++;
3695 }
3696 }
3697 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3698 if (uses_depth) {
3699 payload.source_w_reg = payload.num_regs;
3700 payload.num_regs++;
3701 if (dispatch_width == 16) {
3702 /* R30: interpolated W if not SIMD8. */
3703 payload.num_regs++;
3704 }
3705 }
3706
3707 if (stage == MESA_SHADER_FRAGMENT) {
3708 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3709 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3710 prog_data->uses_pos_offset = key->compute_pos_offset;
3711 /* R31: MSAA position offsets. */
3712 if (prog_data->uses_pos_offset) {
3713 payload.sample_pos_reg = payload.num_regs;
3714 payload.num_regs++;
3715 }
3716 }
3717
3718 /* R32: MSAA input coverage mask */
3719 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3720 assert(brw->gen >= 7);
3721 payload.sample_mask_in_reg = payload.num_regs;
3722 payload.num_regs++;
3723 if (dispatch_width == 16) {
3724 /* R33: input coverage mask if not SIMD8. */
3725 payload.num_regs++;
3726 }
3727 }
3728
3729 /* R34-: bary for 32-pixel. */
3730 /* R58-59: interp W for 32-pixel. */
3731
3732 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3733 source_depth_to_render_target = true;
3734 }
3735 }
3736
3737 void
3738 fs_visitor::setup_vs_payload()
3739 {
3740 /* R0: thread header, R1: urb handles */
3741 payload.num_regs = 2;
3742 }
3743
3744 void
3745 fs_visitor::assign_binding_table_offsets()
3746 {
3747 assert(stage == MESA_SHADER_FRAGMENT);
3748 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3749 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3750 uint32_t next_binding_table_offset = 0;
3751
3752 /* If there are no color regions, we still perform an FB write to a null
3753 * renderbuffer, which we place at surface index 0.
3754 */
3755 prog_data->binding_table.render_target_start = next_binding_table_offset;
3756 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3757
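   /* For example, with two color regions the render targets occupy surface
    * indices 0 and 1, and the common entries (textures, pull constants, and
    * so on) start at index 2.
    */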
3758 assign_common_binding_table_offsets(next_binding_table_offset);
3759 }
3760
3761 void
3762 fs_visitor::calculate_register_pressure()
3763 {
3764 invalidate_live_intervals();
3765 calculate_live_intervals();
3766
3767 unsigned num_instructions = 0;
3768 foreach_block(block, cfg)
3769 num_instructions += block->instructions.length();
3770
3771 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3772
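   /* Each virtual GRF contributes its size in registers over its entire
    * [start, end] live range, so regs_live_at_ip[ip] ends up holding the
    * total number of GRFs live at instruction ip.
    */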
3773 for (unsigned reg = 0; reg < alloc.count; reg++) {
3774 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3775 regs_live_at_ip[ip] += alloc.sizes[reg];
3776 }
3777 }
3778
3779 void
3780 fs_visitor::optimize()
3781 {
3782 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3783
3784 split_virtual_grfs();
3785
3786 move_uniform_array_access_to_pull_constants();
3787 assign_constant_locations();
3788 demote_pull_constants();
3789
3790 #define OPT(pass, args...) ({ \
3791 pass_num++; \
3792 bool this_progress = pass(args); \
3793 \
3794 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3795 char filename[64]; \
3796 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3797 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3798 \
3799 backend_visitor::dump_instructions(filename); \
3800 } \
3801 \
3802 progress = progress || this_progress; \
3803 this_progress; \
3804 })
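/* OPT() runs one pass, dumps the IR after any pass that made progress when
 * the DEBUG_OPTIMIZER flag is set, folds the result into 'progress', and
 * evaluates to whether that single pass made progress, so it can be used
 * directly in a condition (as with lower_load_payload below).
 */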
3805
3806 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3807 char filename[64];
3808 snprintf(filename, 64, "%s%d-%04d-00-start",
3809 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3810
3811 backend_visitor::dump_instructions(filename);
3812 }
3813
3814 bool progress;
3815 int iteration = 0;
3816 int pass_num = 0;
3817 do {
3818 progress = false;
3819 pass_num = 0;
3820 iteration++;
3821
3822 OPT(remove_duplicate_mrf_writes);
3823
3824 OPT(opt_algebraic);
3825 OPT(opt_cse);
3826 OPT(opt_copy_propagate);
3827 OPT(opt_peephole_predicated_break);
3828 OPT(opt_cmod_propagation);
3829 OPT(dead_code_eliminate);
3830 OPT(opt_peephole_sel);
3831 OPT(dead_control_flow_eliminate, this);
3832 OPT(opt_register_renaming);
3833 OPT(opt_redundant_discard_jumps);
3834 OPT(opt_saturate_propagation);
3835 OPT(register_coalesce);
3836 OPT(compute_to_mrf);
3837
3838 OPT(compact_virtual_grfs);
3839 } while (progress);
3840
3841 pass_num = 0;
3842
3843 OPT(opt_sampler_eot);
3844
3845 if (OPT(lower_load_payload)) {
3846 split_virtual_grfs();
3847 OPT(register_coalesce);
3848 OPT(compute_to_mrf);
3849 OPT(dead_code_eliminate);
3850 }
3851
3852 OPT(opt_combine_constants);
3853
3854 lower_uniform_pull_constant_loads();
3855 }
3856
3857 /**
3858 * Three-source instructions must have a GRF/MRF destination register.
3859 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3860 */
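/* Three-source instructions such as MAD and LRP are subject to this
 * restriction; any that reach here with a null destination get a fresh GRF
 * of dispatch_width / 8 registers.
 */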
3861 void
3862 fs_visitor::fixup_3src_null_dest()
3863 {
3864 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3865 if (inst->is_3src() && inst->dst.is_null()) {
3866 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3867 inst->dst.type);
3868 }
3869 }
3870 }
3871
3872 void
3873 fs_visitor::allocate_registers()
3874 {
3875 bool allocated_without_spills;
3876
3877 static const enum instruction_scheduler_mode pre_modes[] = {
3878 SCHEDULE_PRE,
3879 SCHEDULE_PRE_NON_LIFO,
3880 SCHEDULE_PRE_LIFO,
3881 };
3882
3883 /* Try each scheduling heuristic to see if it can successfully register
3884 * allocate without spilling. They should be ordered by decreasing
3885 * performance but increasing likelihood of allocating.
3886 */
3887 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3888 schedule_instructions(pre_modes[i]);
3889
3890 if (0) {
3891 assign_regs_trivial();
3892 allocated_without_spills = true;
3893 } else {
3894 allocated_without_spills = assign_regs(false);
3895 }
3896 if (allocated_without_spills)
3897 break;
3898 }
3899
3900 if (!allocated_without_spills) {
3901 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3902 "Vertex" : "Fragment";
3903
3904 /* We assume that any spilling is worse than just dropping back to
3905 * SIMD8. There is probably some intermediate point where SIMD16 with
3906 * a couple of spills would still be better.
3907 */
3908 if (dispatch_width == 16) {
3909 fail("Failure to register allocate. Reduce number of "
3910 "live scalar values to avoid this.");
3911 } else {
3912 perf_debug("%s shader triggered register spilling. "
3913 "Try reducing the number of live scalar values to "
3914 "improve performance.\n", stage_name);
3915 }
3916
3917 /* Since we're out of heuristics, just go spill registers until we
3918 * get an allocation.
3919 */
3920 while (!assign_regs(true)) {
3921 if (failed)
3922 break;
3923 }
3924 }
3925
3926 /* This must come after all optimization and register allocation, since
3927 * it inserts dead code that happens to have side effects, and it does
3928 * so based on the actual physical registers in use.
3929 */
3930 insert_gen4_send_dependency_workarounds();
3931
3932 if (failed)
3933 return;
3934
3935 if (!allocated_without_spills)
3936 schedule_instructions(SCHEDULE_POST);
3937
3938 if (last_scratch > 0)
3939 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3940 }
3941
3942 bool
3943 fs_visitor::run_vs()
3944 {
3945 assert(stage == MESA_SHADER_VERTEX);
3946
3947 assign_common_binding_table_offsets(0);
3948 setup_vs_payload();
3949
3950 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3951 emit_shader_time_begin();
3952
3953 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
3954 emit_nir_code();
3955 } else {
3956 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3957 base_ir = ir;
3958 this->result = reg_undef;
3959 ir->accept(this);
3960 }
3961 base_ir = NULL;
3962 }
3963
3964 if (failed)
3965 return false;
3966
3967 emit_urb_writes();
3968
3969 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3970 emit_shader_time_end();
3971
3972 calculate_cfg();
3973
3974 optimize();
3975
3976 assign_curb_setup();
3977 assign_vs_urb_setup();
3978
3979 fixup_3src_null_dest();
3980 allocate_registers();
3981
3982 return !failed;
3983 }
3984
3985 bool
3986 fs_visitor::run_fs()
3987 {
3988 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3989 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3990
3991 assert(stage == MESA_SHADER_FRAGMENT);
3992
3993 sanity_param_count = prog->Parameters->NumParameters;
3994
3995 assign_binding_table_offsets();
3996
3997 if (brw->gen >= 6)
3998 setup_payload_gen6();
3999 else
4000 setup_payload_gen4();
4001
4002 if (0) {
4003 emit_dummy_fs();
4004 } else if (brw->use_rep_send && dispatch_width == 16) {
4005 emit_repclear_shader();
4006 } else {
4007 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4008 emit_shader_time_begin();
4009
4010 calculate_urb_setup();
4011 if (prog->InputsRead > 0) {
4012 if (brw->gen < 6)
4013 emit_interpolation_setup_gen4();
4014 else
4015 emit_interpolation_setup_gen6();
4016 }
4017
4018 /* We handle discards by keeping track of the still-live pixels in f0.1.
4019 * Initialize it with the dispatched pixels.
4020 */
4021 if (wm_prog_data->uses_kill) {
4022 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4023 discard_init->flag_subreg = 1;
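         /* Subregister 1 places the live-pixel mask in f0.1, leaving f0.0
          * available for ordinary conditional mods and predication.
          */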
4024 }
4025
4026 /* Generate FS IR for main(). (The visitor only descends into
4027 * functions called "main".)
4028 */
4029 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
4030 emit_nir_code();
4031 } else if (shader) {
4032 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4033 base_ir = ir;
4034 this->result = reg_undef;
4035 ir->accept(this);
4036 }
4037 } else {
4038 emit_fragment_program_code();
4039 }
4040 base_ir = NULL;
4041 if (failed)
4042 return false;
4043
4044 if (wm_prog_data->uses_kill)
4045 emit(FS_OPCODE_PLACEHOLDER_HALT);
4046
4047 if (wm_key->alpha_test_func)
4048 emit_alpha_test();
4049
4050 emit_fb_writes();
4051
4052 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4053 emit_shader_time_end();
4054
4055 calculate_cfg();
4056
4057 optimize();
4058
4059 assign_curb_setup();
4060 assign_urb_setup();
4061
4062 fixup_3src_null_dest();
4063 allocate_registers();
4064
4065 if (failed)
4066 return false;
4067 }
4068
4069 if (dispatch_width == 8)
4070 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4071 else
4072 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4073
4074 /* If any state parameters were appended, then ParameterValues could have
4075 * been realloced, in which case the driver uniform storage set up by
4076 * _mesa_associate_uniform_storage() would point to freed memory. Make
4077 * sure that didn't happen.
4078 */
4079 assert(sanity_param_count == prog->Parameters->NumParameters);
4080
4081 return !failed;
4082 }
4083
4084 const unsigned *
4085 brw_wm_fs_emit(struct brw_context *brw,
4086 void *mem_ctx,
4087 const struct brw_wm_prog_key *key,
4088 struct brw_wm_prog_data *prog_data,
4089 struct gl_fragment_program *fp,
4090 struct gl_shader_program *prog,
4091 unsigned *final_assembly_size)
4092 {
4093 bool start_busy = false;
4094 double start_time = 0;
4095
4096 if (unlikely(brw->perf_debug)) {
4097 start_busy = (brw->batch.last_bo &&
4098 drm_intel_bo_busy(brw->batch.last_bo));
4099 start_time = get_time();
4100 }
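   /* start_busy and start_time are consulted again after code generation to
    * report compiles that stalled the GPU (see the perf_debug below).
    */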
4101
4102 struct brw_shader *shader = NULL;
4103 if (prog)
4104 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4105
4106 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4107 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4108
4109 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4110 */
4111 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4112 if (!v.run_fs()) {
4113 if (prog) {
4114 prog->LinkStatus = false;
4115 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4116 }
4117
4118 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4119 v.fail_msg);
4120
4121 return NULL;
4122 }
4123
4124 cfg_t *simd16_cfg = NULL;
4125 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4126 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4127 if (!v.simd16_unsupported) {
4128 /* Try a SIMD16 compile */
4129 v2.import_uniforms(&v);
4130 if (!v2.run_fs()) {
4131 perf_debug("SIMD16 shader failed to compile, falling back to "
4132 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4133 } else {
4134 simd16_cfg = v2.cfg;
4135 }
4136 } else {
4137 perf_debug("SIMD16 shader unsupported, falling back to "
4138 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4139 }
4140 }
4141
4142 cfg_t *simd8_cfg;
4143 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4144 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4145 simd8_cfg = NULL;
4146 prog_data->no_8 = true;
4147 } else {
4148 simd8_cfg = v.cfg;
4149 prog_data->no_8 = false;
4150 }
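   /* When prog_data->no_8 is set, no SIMD8 kernel is provided; the later
    * state setup is then expected to rely solely on the SIMD16 program.
    */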
4151
4152 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4153 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4154
4155 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4156 char *name;
4157 if (prog)
4158 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4159 prog->Label ? prog->Label : "unnamed",
4160 prog->Name);
4161 else
4162 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4163
4164 g.enable_debug(name);
4165 }
4166
4167 if (simd8_cfg)
4168 g.generate_code(simd8_cfg, 8);
4169 if (simd16_cfg)
4170 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4171
4172 if (unlikely(brw->perf_debug) && shader) {
4173 if (shader->compiled_once)
4174 brw_wm_debug_recompile(brw, prog, key);
4175 shader->compiled_once = true;
4176
4177 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4178 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4179 (get_time() - start_time) * 1000);
4180 }
4181 }
4182
4183 return g.get_assembly(final_assembly_size);
4184 }
4185
4186 extern "C" bool
4187 brw_fs_precompile(struct gl_context *ctx,
4188 struct gl_shader_program *shader_prog,
4189 struct gl_program *prog)
4190 {
4191 struct brw_context *brw = brw_context(ctx);
4192 struct brw_wm_prog_key key;
4193
4194 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4195 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4196 bool program_uses_dfdy = fp->UsesDFdy;
4197
4198 memset(&key, 0, sizeof(key));
4199
4200 if (brw->gen < 6) {
4201 if (fp->UsesKill)
4202 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4203
4204 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4205 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4206
4207 /* Just assume depth testing. */
4208 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4209 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4210 }
4211
4212 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4213 BRW_FS_VARYING_INPUT_MASK) > 16)
4214 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4215
4216 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4217 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4218 for (unsigned i = 0; i < sampler_count; i++) {
4219 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4220 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4221 key.tex.swizzles[i] =
4222 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4223 } else {
4224 /* Color sampler: assume no swizzling. */
4225 key.tex.swizzles[i] = SWIZZLE_XYZW;
4226 }
4227 }
4228
4229 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4230 key.drawable_height = ctx->DrawBuffer->Height;
4231 }
4232
4233 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4234 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4235 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4236
4237 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4238 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4239 key.nr_color_regions > 1;
4240 }
4241
4242 key.program_string_id = bfp->id;
4243
4244 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4245 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4246
4247 bool success = brw_compile_wm_prog(brw, shader_prog, bfp, &key);
4248
4249 brw->wm.base.prog_offset = old_prog_offset;
4250 brw->wm.prog_data = old_prog_data;
4251
4252 return success;
4253 }