i965/fs: Only emit FS_OPCODE_PLACEHOLDER_HALT if there are discards
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
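   /* For illustration: an instruction built with exec_size == 0, a null
    * HW_REG destination and a single width-8 GRF source ends up with
    * exec_size == 8, taken from that source by the loop below.
    */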
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
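   /* For illustration: a width-16, stride-1 float destination covers
    * 16 * 4 = 64 bytes, so regs_written == 2; a stride-0 (smeared) dword
    * destination covers MAX2(0, 1) * 4 = 4 bytes, so regs_written == 1.
    */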
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
138
139 this->writes_accumulator = false;
140 }
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(brw->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
356
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
359 {
360 uint8_t exec_size = dst.width;
361 for (int i = 0; i < sources; ++i) {
362 assert(src[i].width % dst.width == 0);
363 if (src[i].width > exec_size)
364 exec_size = src[i].width;
365 }
366
367 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
368 dst, src, sources);
369 inst->regs_written = 0;
370 for (int i = 0; i < sources; ++i) {
371 /* The LOAD_PAYLOAD instruction only really makes sense if we are
372 * dealing with whole registers. If this ever changes, we can deal
373 * with it later.
374 */
375 int size = inst->src[i].effective_width * type_sz(src[i].type);
376 assert(size % 32 == 0);
377 inst->regs_written += (size + 31) / 32;
378 }
379
380 return inst;
381 }
382
383 exec_list
384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
385 const fs_reg &surf_index,
386 const fs_reg &varying_offset,
387 uint32_t const_offset)
388 {
389 exec_list instructions;
390 fs_inst *inst;
391
392 /* We have our constant surface use a pitch of 4 bytes, so our index can
393 * be any component of a vector, and then we load 4 contiguous
394 * components starting from that.
395 *
396 * We break down the const_offset to a portion added to the variable
397 * offset and a portion done using reg_offset, which means that if you
398 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
399 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
400 * CSE can later notice that those loads are all the same and eliminate
401 * the redundant ones.
402 */
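   /* For illustration: with const_offset == 7, the vec4-aligned part
    * (7 & ~3 == 4) is folded into vec4_offset here, and the remaining
    * component (7 & 3 == 3) is applied below via
    * offset(vec4_result, (const_offset & 3) * scale).
    */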
403 fs_reg vec4_offset = vgrf(glsl_type::int_type);
404 instructions.push_tail(ADD(vec4_offset,
405 varying_offset, fs_reg(const_offset & ~3)));
406
407 int scale = 1;
408 if (brw->gen == 4 && dst.width == 8) {
409 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
410 * u, v, r) as parameters, or we can just use the SIMD16 message
411 * consisting of (header, u). We choose the second, at the cost of a
412 * longer return length.
413 */
414 scale = 2;
415 }
416
417 enum opcode op;
418 if (brw->gen >= 7)
419 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
420 else
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
422
423 assert(dst.width % 8 == 0);
424 int regs_written = 4 * (dst.width / 8) * scale;
425 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
426 dst.type, dst.width);
427 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
428 inst->regs_written = regs_written;
429 instructions.push_tail(inst);
430
431 if (brw->gen < 7) {
432 inst->base_mrf = 13;
433 inst->header_present = true;
434 if (brw->gen == 4)
435 inst->mlen = 3;
436 else
437 inst->mlen = 1 + dispatch_width / 8;
438 }
439
440 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
441 instructions.push_tail(MOV(dst, result));
442
443 return instructions;
444 }
445
446 /**
447 * A helper for MOV generation for fixing up broken hardware SEND dependency
448 * handling.
449 */
450 fs_inst *
451 fs_visitor::DEP_RESOLVE_MOV(int grf)
452 {
453 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
454
455 inst->ir = NULL;
456 inst->annotation = "send dependency resolve";
457
 458     /* The caller always wants this uncompressed (SIMD8), to emit the minimal extra
 459      * dependencies and to avoid having to deal with aligning its regs to 2.
460 */
461 inst->exec_size = 8;
462
463 return inst;
464 }
465
466 bool
467 fs_inst::equals(fs_inst *inst) const
468 {
469 return (opcode == inst->opcode &&
470 dst.equals(inst->dst) &&
471 src[0].equals(inst->src[0]) &&
472 src[1].equals(inst->src[1]) &&
473 src[2].equals(inst->src[2]) &&
474 saturate == inst->saturate &&
475 predicate == inst->predicate &&
476 conditional_mod == inst->conditional_mod &&
477 mlen == inst->mlen &&
478 base_mrf == inst->base_mrf &&
479 target == inst->target &&
480 eot == inst->eot &&
481 header_present == inst->header_present &&
482 shadow_compare == inst->shadow_compare &&
483 exec_size == inst->exec_size &&
484 offset == inst->offset);
485 }
486
487 bool
488 fs_inst::overwrites_reg(const fs_reg &reg) const
489 {
490 return reg.in_range(dst, regs_written);
491 }
492
493 bool
494 fs_inst::is_send_from_grf() const
495 {
496 switch (opcode) {
497 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
498 case SHADER_OPCODE_SHADER_TIME_ADD:
499 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
500 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
501 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
502 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
503 case SHADER_OPCODE_UNTYPED_ATOMIC:
504 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
505 case SHADER_OPCODE_URB_WRITE_SIMD8:
506 return true;
507 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
508 return src[1].file == GRF;
509 case FS_OPCODE_FB_WRITE:
510 return src[0].file == GRF;
511 default:
512 if (is_tex())
513 return src[0].file == GRF;
514
515 return false;
516 }
517 }
518
519 bool
520 fs_inst::can_do_source_mods(struct brw_context *brw)
521 {
522 if (brw->gen == 6 && is_math())
523 return false;
524
525 if (is_send_from_grf())
526 return false;
527
528 if (!backend_instruction::can_do_source_mods())
529 return false;
530
531 return true;
532 }
533
534 bool
535 fs_inst::has_side_effects() const
536 {
537 return this->eot || backend_instruction::has_side_effects();
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Vector float immediate value constructor. */
585 fs_reg::fs_reg(uint8_t vf[4])
586 {
587 init();
588 this->file = IMM;
589 this->type = BRW_REGISTER_TYPE_VF;
590 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
591 }
592
593 /** Vector float immediate value constructor. */
594 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
595 {
596 init();
597 this->file = IMM;
598 this->type = BRW_REGISTER_TYPE_VF;
599 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
600 (vf1 << 8) |
601 (vf2 << 16) |
602 (vf3 << 24);
603 }
604
605 /** Fixed brw_reg. */
606 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
607 {
608 init();
609 this->file = HW_REG;
610 this->fixed_hw_reg = fixed_hw_reg;
611 this->type = fixed_hw_reg.type;
612 this->width = 1 << fixed_hw_reg.width;
613 }
614
615 bool
616 fs_reg::equals(const fs_reg &r) const
617 {
618 return (file == r.file &&
619 reg == r.reg &&
620 reg_offset == r.reg_offset &&
621 subreg_offset == r.subreg_offset &&
622 type == r.type &&
623 negate == r.negate &&
624 abs == r.abs &&
625 !reladdr && !r.reladdr &&
626 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
627 width == r.width &&
628 stride == r.stride);
629 }
630
631 fs_reg &
632 fs_reg::set_smear(unsigned subreg)
633 {
634 assert(file != HW_REG && file != IMM);
635 subreg_offset = subreg * type_sz(type);
636 stride = 0;
637 return *this;
638 }
639
640 bool
641 fs_reg::is_contiguous() const
642 {
643 return stride == 1;
644 }
645
646 int
647 fs_visitor::type_size(const struct glsl_type *type)
648 {
649 unsigned int size, i;
650
651 switch (type->base_type) {
652 case GLSL_TYPE_UINT:
653 case GLSL_TYPE_INT:
654 case GLSL_TYPE_FLOAT:
655 case GLSL_TYPE_BOOL:
656 return type->components();
657 case GLSL_TYPE_ARRAY:
658 return type_size(type->fields.array) * type->length;
659 case GLSL_TYPE_STRUCT:
660 size = 0;
661 for (i = 0; i < type->length; i++) {
662 size += type_size(type->fields.structure[i].type);
663 }
664 return size;
665 case GLSL_TYPE_SAMPLER:
666 /* Samplers take up no register space, since they're baked in at
667 * link time.
668 */
669 return 0;
670 case GLSL_TYPE_ATOMIC_UINT:
671 return 0;
672 case GLSL_TYPE_IMAGE:
673 case GLSL_TYPE_VOID:
674 case GLSL_TYPE_ERROR:
675 case GLSL_TYPE_INTERFACE:
676 case GLSL_TYPE_DOUBLE:
677 unreachable("not reached");
678 }
679
680 return 0;
681 }
682
683 /**
684 * Create a MOV to read the timestamp register.
685 *
686 * The caller is responsible for emitting the MOV. The return value is
687 * the destination of the MOV, with extra parameters set.
688 */
689 fs_reg
690 fs_visitor::get_timestamp(fs_inst **out_mov)
691 {
692 assert(brw->gen >= 7);
693
694 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
695 BRW_ARF_TIMESTAMP,
696 0),
697 BRW_REGISTER_TYPE_UD));
698
699 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
700
701 fs_inst *mov = MOV(dst, ts);
702 /* We want to read the 3 fields we care about even if it's not enabled in
703 * the dispatch.
704 */
705 mov->force_writemask_all = true;
706
707 /* The caller wants the low 32 bits of the timestamp. Since it's running
 708     * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
709 * which is plenty of time for our purposes. It is identical across the
710 * EUs, but since it's tracking GPU core speed it will increment at a
711 * varying rate as render P-states change.
712 *
713 * The caller could also check if render P-states have changed (or anything
714 * else that might disrupt timing) by setting smear to 2 and checking if
715 * that field is != 0.
716 */
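   /* As a sanity check on the figure above: 2^32 cycles at ~1.2 GHz is
    * roughly 3.6 seconds before the low 32 bits wrap.
    */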
717 dst.set_smear(0);
718
719 *out_mov = mov;
720 return dst;
721 }
722
723 void
724 fs_visitor::emit_shader_time_begin()
725 {
726 current_annotation = "shader time start";
727 fs_inst *mov;
728 shader_start_time = get_timestamp(&mov);
729 emit(mov);
730 }
731
732 void
733 fs_visitor::emit_shader_time_end()
734 {
735 current_annotation = "shader time end";
736
737 enum shader_time_shader_type type, written_type, reset_type;
738 switch (stage) {
739 case MESA_SHADER_VERTEX:
740 type = ST_VS;
741 written_type = ST_VS_WRITTEN;
742 reset_type = ST_VS_RESET;
743 break;
744 case MESA_SHADER_GEOMETRY:
745 type = ST_GS;
746 written_type = ST_GS_WRITTEN;
747 reset_type = ST_GS_RESET;
748 break;
749 case MESA_SHADER_FRAGMENT:
750 if (dispatch_width == 8) {
751 type = ST_FS8;
752 written_type = ST_FS8_WRITTEN;
753 reset_type = ST_FS8_RESET;
754 } else {
755 assert(dispatch_width == 16);
756 type = ST_FS16;
757 written_type = ST_FS16_WRITTEN;
758 reset_type = ST_FS16_RESET;
759 }
760 break;
761 default:
762 unreachable("fs_visitor::emit_shader_time_end missing code");
763 }
764
765 /* Insert our code just before the final SEND with EOT. */
766 exec_node *end = this->instructions.get_tail();
767 assert(end && ((fs_inst *) end)->eot);
768
769 fs_inst *tm_read;
770 fs_reg shader_end_time = get_timestamp(&tm_read);
771 end->insert_before(tm_read);
772
773 /* Check that there weren't any timestamp reset events (assuming these
774 * were the only two timestamp reads that happened).
775 */
776 fs_reg reset = shader_end_time;
777 reset.set_smear(2);
778 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
779 test->conditional_mod = BRW_CONDITIONAL_Z;
780 test->force_writemask_all = true;
781 end->insert_before(test);
782 end->insert_before(IF(BRW_PREDICATE_NORMAL));
783
784 fs_reg start = shader_start_time;
785 start.negate = true;
786 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
787 diff.set_smear(0);
788 fs_inst *add = ADD(diff, start, shader_end_time);
789 add->force_writemask_all = true;
790 end->insert_before(add);
791
792 /* If there were no instructions between the two timestamp gets, the diff
793 * is 2 cycles. Remove that overhead, so I can forget about that when
794 * trying to determine the time taken for single instructions.
795 */
796 add = ADD(diff, diff, fs_reg(-2u));
797 add->force_writemask_all = true;
798 end->insert_before(add);
799
800 end->insert_before(SHADER_TIME_ADD(type, diff));
801 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
802 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
803 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
804 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
805 }
806
807 fs_inst *
808 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
809 {
810 int shader_time_index =
811 brw_get_shader_time_index(brw, shader_prog, prog, type);
812 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
813
814 fs_reg payload;
815 if (dispatch_width == 8)
816 payload = vgrf(glsl_type::uvec2_type);
817 else
818 payload = vgrf(glsl_type::uint_type);
819
820 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
821 fs_reg(), payload, offset, value);
822 }
823
824 void
825 fs_visitor::vfail(const char *format, va_list va)
826 {
827 char *msg;
828
829 if (failed)
830 return;
831
832 failed = true;
833
834 msg = ralloc_vasprintf(mem_ctx, format, va);
835 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
836
837 this->fail_msg = msg;
838
839 if (debug_enabled) {
840 fprintf(stderr, "%s", msg);
841 }
842 }
843
844 void
845 fs_visitor::fail(const char *format, ...)
846 {
847 va_list va;
848
849 va_start(va, format);
850 vfail(format, va);
851 va_end(va);
852 }
853
854 /**
855 * Mark this program as impossible to compile in SIMD16 mode.
856 *
857 * During the SIMD8 compile (which happens first), we can detect and flag
858 * things that are unsupported in SIMD16 mode, so the compiler can skip
859 * the SIMD16 compile altogether.
860 *
861 * During a SIMD16 compile (if one happens anyway), this just calls fail().
862 */
863 void
864 fs_visitor::no16(const char *format, ...)
865 {
866 va_list va;
867
868 va_start(va, format);
869
870 if (dispatch_width == 16) {
871 vfail(format, va);
872 } else {
873 simd16_unsupported = true;
874
875 if (brw->perf_debug) {
876 if (no16_msg)
877 ralloc_vasprintf_append(&no16_msg, format, va);
878 else
879 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
880 }
881 }
882
883 va_end(va);
884 }
885
886 fs_inst *
887 fs_visitor::emit(enum opcode opcode)
888 {
889 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
890 }
891
892 fs_inst *
893 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
894 {
895 return emit(new(mem_ctx) fs_inst(opcode, dst));
896 }
897
898 fs_inst *
899 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
900 {
901 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
902 }
903
904 fs_inst *
905 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
906 const fs_reg &src1)
907 {
908 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
909 }
910
911 fs_inst *
912 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
913 const fs_reg &src1, const fs_reg &src2)
914 {
915 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
916 }
917
918 fs_inst *
919 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
920 fs_reg src[], int sources)
921 {
922 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
923 }
924
925 /**
 926  * Returns true if the instruction won't necessarily update an entire
 927  * destination register, e.g. because it is predicated or writes only part of it.
928 *
929 * For example, dead code elimination and live variable analysis want to know
930 * when a write to a variable screens off any preceding values that were in
931 * it.
932 */
933 bool
934 fs_inst::is_partial_write() const
935 {
936 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
937 (this->dst.width * type_sz(this->dst.type)) < 32 ||
938 !this->dst.is_contiguous());
939 }
940
941 int
942 fs_inst::regs_read(int arg) const
943 {
944 if (is_tex() && arg == 0 && src[0].file == GRF) {
945 return mlen;
946 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
947 return mlen;
948 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
949 return mlen;
950 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
951 return mlen;
952 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
953 return mlen;
954 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
955 return mlen;
956 }
957
958 switch (src[arg].file) {
959 case BAD_FILE:
960 case UNIFORM:
961 case IMM:
962 return 1;
963 case GRF:
964 case HW_REG:
965 if (src[arg].stride == 0) {
966 return 1;
967 } else {
968 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
969 return (size + 31) / 32;
970 }
971 case MRF:
972 unreachable("MRF registers are not allowed as sources");
973 default:
974 unreachable("Invalid register file");
975 }
976 }
977
978 bool
979 fs_inst::reads_flag() const
980 {
981 return predicate;
982 }
983
984 bool
985 fs_inst::writes_flag() const
986 {
987 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
988 opcode != BRW_OPCODE_IF &&
989 opcode != BRW_OPCODE_WHILE)) ||
990 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
991 }
992
993 /**
994 * Returns how many MRFs an FS opcode will write over.
995 *
996 * Note that this is not the 0 or 1 implied writes in an actual gen
997 * instruction -- the FS opcodes often generate MOVs in addition.
998 */
999 int
1000 fs_visitor::implied_mrf_writes(fs_inst *inst)
1001 {
1002 if (inst->mlen == 0)
1003 return 0;
1004
1005 if (inst->base_mrf == -1)
1006 return 0;
1007
1008 switch (inst->opcode) {
1009 case SHADER_OPCODE_RCP:
1010 case SHADER_OPCODE_RSQ:
1011 case SHADER_OPCODE_SQRT:
1012 case SHADER_OPCODE_EXP2:
1013 case SHADER_OPCODE_LOG2:
1014 case SHADER_OPCODE_SIN:
1015 case SHADER_OPCODE_COS:
1016 return 1 * dispatch_width / 8;
1017 case SHADER_OPCODE_POW:
1018 case SHADER_OPCODE_INT_QUOTIENT:
1019 case SHADER_OPCODE_INT_REMAINDER:
1020 return 2 * dispatch_width / 8;
1021 case SHADER_OPCODE_TEX:
1022 case FS_OPCODE_TXB:
1023 case SHADER_OPCODE_TXD:
1024 case SHADER_OPCODE_TXF:
1025 case SHADER_OPCODE_TXF_CMS:
1026 case SHADER_OPCODE_TXF_MCS:
1027 case SHADER_OPCODE_TG4:
1028 case SHADER_OPCODE_TG4_OFFSET:
1029 case SHADER_OPCODE_TXL:
1030 case SHADER_OPCODE_TXS:
1031 case SHADER_OPCODE_LOD:
1032 return 1;
1033 case FS_OPCODE_FB_WRITE:
1034 return 2;
1035 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1036 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1037 return 1;
1038 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1039 return inst->mlen;
1040 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1041 return 2;
1042 case SHADER_OPCODE_UNTYPED_ATOMIC:
1043 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1044 case SHADER_OPCODE_URB_WRITE_SIMD8:
1045 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1046 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1047 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1048 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1049 return 0;
1050 default:
1051 unreachable("not reached");
1052 }
1053 }
1054
1055 fs_reg
1056 fs_visitor::vgrf(const glsl_type *const type)
1057 {
1058 int reg_width = dispatch_width / 8;
1059 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1060 brw_type_for_base_type(type), dispatch_width);
1061 }
1062
1063 fs_reg
1064 fs_visitor::vgrf(int num_components)
1065 {
1066 int reg_width = dispatch_width / 8;
1067 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1068 BRW_REGISTER_TYPE_F, dispatch_width);
1069 }
1070
1071 /** Fixed HW reg constructor. */
1072 fs_reg::fs_reg(enum register_file file, int reg)
1073 {
1074 init();
1075 this->file = file;
1076 this->reg = reg;
1077 this->type = BRW_REGISTER_TYPE_F;
1078
1079 switch (file) {
1080 case UNIFORM:
1081 this->width = 1;
1082 break;
1083 default:
1084 this->width = 8;
1085 }
1086 }
1087
1088 /** Fixed HW reg constructor. */
1089 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1090 {
1091 init();
1092 this->file = file;
1093 this->reg = reg;
1094 this->type = type;
1095
1096 switch (file) {
1097 case UNIFORM:
1098 this->width = 1;
1099 break;
1100 default:
1101 this->width = 8;
1102 }
1103 }
1104
1105 /** Fixed HW reg constructor. */
1106 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1107 uint8_t width)
1108 {
1109 init();
1110 this->file = file;
1111 this->reg = reg;
1112 this->type = type;
1113 this->width = width;
1114 }
1115
1116 fs_reg *
1117 fs_visitor::variable_storage(ir_variable *var)
1118 {
1119 return (fs_reg *)hash_table_find(this->variable_ht, var);
1120 }
1121
1122 void
1123 import_uniforms_callback(const void *key,
1124 void *data,
1125 void *closure)
1126 {
1127 struct hash_table *dst_ht = (struct hash_table *)closure;
1128 const fs_reg *reg = (const fs_reg *)data;
1129
1130 if (reg->file != UNIFORM)
1131 return;
1132
1133 hash_table_insert(dst_ht, data, key);
1134 }
1135
1136 /* For SIMD16, we need to follow the uniform setup from the SIMD8 dispatch.
1137  * This brings in those uniform definitions.
1138 */
1139 void
1140 fs_visitor::import_uniforms(fs_visitor *v)
1141 {
1142 hash_table_call_foreach(v->variable_ht,
1143 import_uniforms_callback,
1144 variable_ht);
1145 this->push_constant_loc = v->push_constant_loc;
1146 this->pull_constant_loc = v->pull_constant_loc;
1147 this->uniforms = v->uniforms;
1148 this->param_size = v->param_size;
1149 }
1150
1151 /* Our support for uniforms is piggy-backed on the struct
1152 * gl_fragment_program, because that's where the values actually
1153 * get stored, rather than in some global gl_shader_program uniform
1154 * store.
1155 */
1156 void
1157 fs_visitor::setup_uniform_values(ir_variable *ir)
1158 {
1159 int namelen = strlen(ir->name);
1160
1161 /* The data for our (non-builtin) uniforms is stored in a series of
1162 * gl_uniform_driver_storage structs for each subcomponent that
1163 * glGetUniformLocation() could name. We know it's been set up in the same
1164 * order we'd walk the type, so walk the list of storage and find anything
1165     * with our name, or any component whose name starts with our name.
1166 */
1167 unsigned params_before = uniforms;
1168 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1169 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1170
1171 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1172 (storage->name[namelen] != 0 &&
1173 storage->name[namelen] != '.' &&
1174 storage->name[namelen] != '[')) {
1175 continue;
1176 }
1177
1178 unsigned slots = storage->type->component_slots();
1179 if (storage->array_elements)
1180 slots *= storage->array_elements;
1181
1182 for (unsigned i = 0; i < slots; i++) {
1183 stage_prog_data->param[uniforms++] = &storage->storage[i];
1184 }
1185 }
1186
1187 /* Make sure we actually initialized the right amount of stuff here. */
1188 assert(params_before + ir->type->component_slots() == uniforms);
1189 (void)params_before;
1190 }
1191
1192
1193 /* Our support for builtin uniforms is even scarier than non-builtin.
1194 * It sits on top of the PROG_STATE_VAR parameters that are
1195 * automatically updated from GL context state.
1196 */
1197 void
1198 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1199 {
1200 const ir_state_slot *const slots = ir->get_state_slots();
1201 assert(slots != NULL);
1202
1203 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1204       /* This state reference has already been set up by ir_to_mesa, but we'll
1205 * get the same index back here.
1206 */
1207 int index = _mesa_add_state_reference(this->prog->Parameters,
1208 (gl_state_index *)slots[i].tokens);
1209
1210 /* Add each of the unique swizzles of the element as a parameter.
1211 * This'll end up matching the expected layout of the
1212 * array/matrix/structure we're trying to fill in.
1213 */
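      /* For illustration: a vec4 state value with swizzle XYZW adds four
       * params, while a scalar one with swizzle XXXX adds only the first
       * and then stops at the repeated swizzle.
       */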
1214 int last_swiz = -1;
1215 for (unsigned int j = 0; j < 4; j++) {
1216 int swiz = GET_SWZ(slots[i].swizzle, j);
1217 if (swiz == last_swiz)
1218 break;
1219 last_swiz = swiz;
1220
1221 stage_prog_data->param[uniforms++] =
1222 &prog->Parameters->ParameterValues[index][swiz];
1223 }
1224 }
1225 }
1226
1227 fs_reg *
1228 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1229 bool origin_upper_left)
1230 {
1231 assert(stage == MESA_SHADER_FRAGMENT);
1232 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1233 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1234 fs_reg wpos = *reg;
1235 bool flip = !origin_upper_left ^ key->render_to_fbo;
1236
1237 /* gl_FragCoord.x */
1238 if (pixel_center_integer) {
1239 emit(MOV(wpos, this->pixel_x));
1240 } else {
1241 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1242 }
1243 wpos = offset(wpos, 1);
1244
1245 /* gl_FragCoord.y */
1246 if (!flip && pixel_center_integer) {
1247 emit(MOV(wpos, this->pixel_y));
1248 } else {
1249 fs_reg pixel_y = this->pixel_y;
1250 float offset = (pixel_center_integer ? 0.0 : 0.5);
1251
1252 if (flip) {
1253 pixel_y.negate = true;
1254 offset += key->drawable_height - 1.0;
1255 }
1256
1257 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1258 }
1259 wpos = offset(wpos, 1);
1260
1261 /* gl_FragCoord.z */
1262 if (brw->gen >= 6) {
1263 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1264 } else {
1265 emit(FS_OPCODE_LINTERP, wpos,
1266 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1267 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1268 interp_reg(VARYING_SLOT_POS, 2));
1269 }
1270 wpos = offset(wpos, 1);
1271
1272 /* gl_FragCoord.w: Already set up in emit_interpolation */
1273 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1274
1275 return reg;
1276 }
1277
1278 fs_inst *
1279 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1280 glsl_interp_qualifier interpolation_mode,
1281 bool is_centroid, bool is_sample)
1282 {
1283 brw_wm_barycentric_interp_mode barycoord_mode;
1284 if (brw->gen >= 6) {
1285 if (is_centroid) {
1286 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1287 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1288 else
1289 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1290 } else if (is_sample) {
1291 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1292 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1293 else
1294 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1295 } else {
1296 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1297 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1298 else
1299 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1300 }
1301 } else {
1302 /* On Ironlake and below, there is only one interpolation mode.
1303 * Centroid interpolation doesn't mean anything on this hardware --
1304 * there is no multisampling.
1305 */
1306 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1307 }
1308 return emit(FS_OPCODE_LINTERP, attr,
1309 this->delta_x[barycoord_mode],
1310 this->delta_y[barycoord_mode], interp);
1311 }
1312
1313 void
1314 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1315 const glsl_type *type,
1316 glsl_interp_qualifier interpolation_mode,
1317 int location, bool mod_centroid,
1318 bool mod_sample)
1319 {
1320 attr.type = brw_type_for_base_type(type->get_scalar_type());
1321
1322 assert(stage == MESA_SHADER_FRAGMENT);
1323 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1324 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1325
1326 unsigned int array_elements;
1327
1328 if (type->is_array()) {
1329 array_elements = type->length;
1330 if (array_elements == 0) {
1331 fail("dereferenced array '%s' has length 0\n", name);
1332 }
1333 type = type->fields.array;
1334 } else {
1335 array_elements = 1;
1336 }
1337
1338 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1339 bool is_gl_Color =
1340 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1341 if (key->flat_shade && is_gl_Color) {
1342 interpolation_mode = INTERP_QUALIFIER_FLAT;
1343 } else {
1344 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1345 }
1346 }
1347
1348 for (unsigned int i = 0; i < array_elements; i++) {
1349 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1350 if (prog_data->urb_setup[location] == -1) {
1351 /* If there's no incoming setup data for this slot, don't
1352 * emit interpolation for it.
1353 */
1354 attr = offset(attr, type->vector_elements);
1355 location++;
1356 continue;
1357 }
1358
1359 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1360 /* Constant interpolation (flat shading) case. The SF has
1361 * handed us defined values in only the constant offset
1362 * field of the setup reg.
1363 */
1364 for (unsigned int k = 0; k < type->vector_elements; k++) {
1365 struct brw_reg interp = interp_reg(location, k);
1366 interp = suboffset(interp, 3);
1367 interp.type = attr.type;
1368 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1369 attr = offset(attr, 1);
1370 }
1371 } else {
1372 /* Smooth/noperspective interpolation case. */
1373 for (unsigned int k = 0; k < type->vector_elements; k++) {
1374 struct brw_reg interp = interp_reg(location, k);
1375 if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1376 /* Get the pixel/sample mask into f0 so that we know
1377 * which pixels are lit. Then, for each channel that is
1378 * unlit, replace the centroid data with non-centroid
1379 * data.
1380 */
1381 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1382
1383 fs_inst *inst;
1384 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1385 false, false);
1386 inst->predicate = BRW_PREDICATE_NORMAL;
1387 inst->predicate_inverse = true;
1388 if (brw->has_pln)
1389 inst->no_dd_clear = true;
1390
1391 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1392 mod_centroid && !key->persample_shading,
1393 mod_sample || key->persample_shading);
1394 inst->predicate = BRW_PREDICATE_NORMAL;
1395 inst->predicate_inverse = false;
1396 if (brw->has_pln)
1397 inst->no_dd_check = true;
1398
1399 } else {
1400 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1401 mod_centroid && !key->persample_shading,
1402 mod_sample || key->persample_shading);
1403 }
1404 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1405 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1406 }
1407 attr = offset(attr, 1);
1408 }
1409
1410 }
1411 location++;
1412 }
1413 }
1414 }
1415
1416 fs_reg *
1417 fs_visitor::emit_frontfacing_interpolation()
1418 {
1419 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1420
1421 if (brw->gen >= 6) {
1422 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1423 * a boolean result from this (~0/true or 0/false).
1424 *
1425 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1426 * this task in only one instruction:
1427 * - a negation source modifier will flip the bit; and
1428 * - a W -> D type conversion will sign extend the bit into the high
1429 * word of the destination.
1430 *
1431 * An ASR 15 fills the low word of the destination.
1432 */
1433 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1434 g0.negate = true;
1435
1436 emit(ASR(*reg, g0, fs_reg(15)));
1437 } else {
1438 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1439 * a boolean result from this (1/true or 0/false).
1440 *
1441 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1442 * the negation source modifier to flip it. Unfortunately the SHR
1443 * instruction only operates on UD (or D with an abs source modifier)
1444 * sources without negation.
1445 *
1446 * Instead, use ASR (which will give ~0/true or 0/false).
1447 */
1448 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1449 g1_6.negate = true;
1450
1451 emit(ASR(*reg, g1_6, fs_reg(31)));
1452 }
1453
1454 return reg;
1455 }
1456
1457 void
1458 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1459 {
1460 assert(stage == MESA_SHADER_FRAGMENT);
1461 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1462 assert(dst.type == BRW_REGISTER_TYPE_F);
1463
1464 if (key->compute_pos_offset) {
1465 /* Convert int_sample_pos to floating point */
1466 emit(MOV(dst, int_sample_pos));
1467 /* Scale to the range [0, 1] */
1468 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1469 }
1470 else {
1471 /* From ARB_sample_shading specification:
1472 * "When rendering to a non-multisample buffer, or if multisample
1473 * rasterization is disabled, gl_SamplePosition will always be
1474       *  (0.5, 0.5)."
1475 */
1476 emit(MOV(dst, fs_reg(0.5f)));
1477 }
1478 }
1479
1480 fs_reg *
1481 fs_visitor::emit_samplepos_setup()
1482 {
1483 assert(brw->gen >= 6);
1484
1485 this->current_annotation = "compute sample position";
1486 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1487 fs_reg pos = *reg;
1488 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1489 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1490
1491 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1492 * mode will be enabled.
1493 *
1494 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1495 * R31.1:0 Position Offset X/Y for Slot[3:0]
1496 * R31.3:2 Position Offset X/Y for Slot[7:4]
1497 * .....
1498 *
1499 * The X, Y sample positions come in as bytes in thread payload. So, read
1500 * the positions using vstride=16, width=8, hstride=2.
1501 */
1502 struct brw_reg sample_pos_reg =
1503 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1504 BRW_REGISTER_TYPE_B), 16, 8, 2);
1505
1506 if (dispatch_width == 8) {
1507 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1508 } else {
1509 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1510 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1511 ->force_sechalf = true;
1512 }
1513 /* Compute gl_SamplePosition.x */
1514 compute_sample_position(pos, int_sample_x);
1515 pos = offset(pos, 1);
1516 if (dispatch_width == 8) {
1517 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1518 } else {
1519 emit(MOV(half(int_sample_y, 0),
1520 fs_reg(suboffset(sample_pos_reg, 1))));
1521 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1522 ->force_sechalf = true;
1523 }
1524 /* Compute gl_SamplePosition.y */
1525 compute_sample_position(pos, int_sample_y);
1526 return reg;
1527 }
1528
1529 fs_reg *
1530 fs_visitor::emit_sampleid_setup()
1531 {
1532 assert(stage == MESA_SHADER_FRAGMENT);
1533 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1534 assert(brw->gen >= 6);
1535
1536 this->current_annotation = "compute sample id";
1537 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1538
1539 if (key->compute_sample_id) {
1540 fs_reg t1 = vgrf(glsl_type::int_type);
1541 fs_reg t2 = vgrf(glsl_type::int_type);
1542 t2.type = BRW_REGISTER_TYPE_UW;
1543
1544 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1545 * 8x multisampling, subspan 0 will represent sample N (where N
1546 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1547 * 7. We can find the value of N by looking at R0.0 bits 7:6
1548 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1549 * (since samples are always delivered in pairs). That is, we
1550 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1551 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1552 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1553 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1554 * populating a temporary variable with the sequence (0, 1, 2, 3),
1555 * and then reading from it using vstride=1, width=4, hstride=0.
1556 * These computations hold good for 4x multisampling as well.
1557 *
1558 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1559 * the first four slots are sample 0 of subspan 0; the next four
1560 * are sample 1 of subspan 0; the third group is sample 0 of
1561 * subspan 1, and finally sample 1 of subspan 1.
1562 */
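      /* Worked example: if R0.0 bits 7:6 hold SSPI == 2, then
       * (R0.0 & 0xc0) >> 5 == 0x80 >> 5 == 4, and adding the SIMD8
       * sequence (0, 0, 0, 0, 1, 1, 1, 1) gives sample IDs 4 and 5 for the
       * two subspans.
       */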
1563 fs_inst *inst;
1564 inst = emit(BRW_OPCODE_AND, t1,
1565 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1566 fs_reg(0xc0));
1567 inst->force_writemask_all = true;
1568 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1569 inst->force_writemask_all = true;
1570 /* This works for both SIMD8 and SIMD16 */
1571 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1572 inst->force_writemask_all = true;
1573 /* This special instruction takes care of setting vstride=1,
1574 * width=4, hstride=0 of t2 during an ADD instruction.
1575 */
1576 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1577 } else {
1578 /* As per GL_ARB_sample_shading specification:
1579 * "When rendering to a non-multisample buffer, or if multisample
1580 * rasterization is disabled, gl_SampleID will always be zero."
1581 */
1582 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1583 }
1584
1585 return reg;
1586 }
1587
1588 void
1589 fs_visitor::resolve_source_modifiers(fs_reg *src)
1590 {
1591 if (!src->abs && !src->negate)
1592 return;
1593
1594 fs_reg temp = retype(vgrf(1), src->type);
1595 emit(MOV(temp, *src));
1596 *src = temp;
1597 }
1598
1599 fs_reg
1600 fs_visitor::fix_math_operand(fs_reg src)
1601 {
1602 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1603 * might be able to do better by doing execsize = 1 math and then
1604 * expanding that result out, but we would need to be careful with
1605 * masking.
1606 *
1607 * The hardware ignores source modifiers (negate and abs) on math
1608 * instructions, so we also move to a temp to set those up.
1609 */
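   /* For illustration: on gen6 a UNIFORM, IMM or negated source falls
    * through to the MOV below, which first copies it into a fresh GRF that
    * the math instruction can consume.
    */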
1610 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1611 !src.abs && !src.negate)
1612 return src;
1613
1614 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1615 * operands to math
1616 */
1617 if (brw->gen >= 7 && src.file != IMM)
1618 return src;
1619
1620 fs_reg expanded = vgrf(glsl_type::float_type);
1621 expanded.type = src.type;
1622 emit(BRW_OPCODE_MOV, expanded, src);
1623 return expanded;
1624 }
1625
1626 fs_inst *
1627 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1628 {
1629 switch (opcode) {
1630 case SHADER_OPCODE_RCP:
1631 case SHADER_OPCODE_RSQ:
1632 case SHADER_OPCODE_SQRT:
1633 case SHADER_OPCODE_EXP2:
1634 case SHADER_OPCODE_LOG2:
1635 case SHADER_OPCODE_SIN:
1636 case SHADER_OPCODE_COS:
1637 break;
1638 default:
1639 unreachable("not reached: bad math opcode");
1640 }
1641
1642 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1643 * might be able to do better by doing execsize = 1 math and then
1644 * expanding that result out, but we would need to be careful with
1645 * masking.
1646 *
1647 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1648 * instructions, so we also move to a temp to set those up.
1649 */
1650 if (brw->gen == 6 || brw->gen == 7)
1651 src = fix_math_operand(src);
1652
1653 fs_inst *inst = emit(opcode, dst, src);
1654
1655 if (brw->gen < 6) {
1656 inst->base_mrf = 2;
1657 inst->mlen = dispatch_width / 8;
1658 }
1659
1660 return inst;
1661 }
1662
1663 fs_inst *
1664 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1665 {
1666 int base_mrf = 2;
1667 fs_inst *inst;
1668
1669 if (brw->gen >= 8) {
1670 inst = emit(opcode, dst, src0, src1);
1671 } else if (brw->gen >= 6) {
1672 src0 = fix_math_operand(src0);
1673 src1 = fix_math_operand(src1);
1674
1675 inst = emit(opcode, dst, src0, src1);
1676 } else {
1677 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1678 * "Message Payload":
1679 *
1680 * "Operand0[7]. For the INT DIV functions, this operand is the
1681 * denominator."
1682 * ...
1683 * "Operand1[7]. For the INT DIV functions, this operand is the
1684 * numerator."
1685 */
1686 bool is_int_div = opcode != SHADER_OPCODE_POW;
1687 fs_reg &op0 = is_int_div ? src1 : src0;
1688 fs_reg &op1 = is_int_div ? src0 : src1;
1689
1690 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1691 inst = emit(opcode, dst, op0, reg_null_f);
1692
1693 inst->base_mrf = base_mrf;
1694 inst->mlen = 2 * dispatch_width / 8;
1695 }
1696 return inst;
1697 }
1698
1699 void
1700 fs_visitor::emit_discard_jump()
1701 {
1702 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1703
1704 /* For performance, after a discard, jump to the end of the
1705 * shader if all relevant channels have been discarded.
1706 */
1707 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1708 discard_jump->flag_subreg = 1;
1709
1710 discard_jump->predicate = (dispatch_width == 8)
1711 ? BRW_PREDICATE_ALIGN1_ANY8H
1712 : BRW_PREDICATE_ALIGN1_ANY16H;
1713 discard_jump->predicate_inverse = true;
1714 }
1715
1716 void
1717 fs_visitor::assign_curb_setup()
1718 {
1719 if (dispatch_width == 8) {
1720 prog_data->dispatch_grf_start_reg = payload.num_regs;
1721 } else {
1722 assert(stage == MESA_SHADER_FRAGMENT);
1723 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1724 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1725 }
1726
1727 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1728
1729 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1730 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1731 for (unsigned int i = 0; i < inst->sources; i++) {
1732 if (inst->src[i].file == UNIFORM) {
1733 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1734 int constant_nr;
1735 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1736 constant_nr = push_constant_loc[uniform_nr];
1737 } else {
1738 /* Section 5.11 of the OpenGL 4.1 spec says:
1739 * "Out-of-bounds reads return undefined values, which include
1740 * values from other variables of the active program or zero."
1741 * Just return the first push constant.
1742 */
1743 constant_nr = 0;
1744 }
1745
1746 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1747 constant_nr / 8,
1748 constant_nr % 8);
1749
1750 inst->src[i].file = HW_REG;
1751 inst->src[i].fixed_hw_reg = byte_offset(
1752 retype(brw_reg, inst->src[i].type),
1753 inst->src[i].subreg_offset);
1754 }
1755 }
1756 }
1757 }
1758
1759 void
1760 fs_visitor::calculate_urb_setup()
1761 {
1762 assert(stage == MESA_SHADER_FRAGMENT);
1763 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1764 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1765
1766 memset(prog_data->urb_setup, -1,
1767 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1768
1769 int urb_next = 0;
1770 /* Figure out where each of the incoming setup attributes lands. */
1771 if (brw->gen >= 6) {
1772 if (_mesa_bitcount_64(prog->InputsRead &
1773 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1774 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1775 * first 16 varying inputs, so we can put them wherever we want.
1776 * Just put them in order.
1777 *
1778 * This is useful because it means that (a) inputs not used by the
1779 * fragment shader won't take up valuable register space, and (b) we
1780 * won't have to recompile the fragment shader if it gets paired with
1781 * a different vertex (or geometry) shader.
1782 */
1783 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1784 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1785 BITFIELD64_BIT(i)) {
1786 prog_data->urb_setup[i] = urb_next++;
1787 }
1788 }
1789 } else {
1790 /* We have enough input varyings that the SF/SBE pipeline stage can't
1791 * arbitrarily rearrange them to suit our whim; we have to put them
1792 * in an order that matches the output of the previous pipeline stage
1793 * (geometry or vertex shader).
1794 */
1795 struct brw_vue_map prev_stage_vue_map;
1796 brw_compute_vue_map(brw, &prev_stage_vue_map,
1797 key->input_slots_valid);
1798 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1799 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1800 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1801 slot++) {
1802 int varying = prev_stage_vue_map.slot_to_varying[slot];
1803 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1804 * unused.
1805 */
1806 if (varying != BRW_VARYING_SLOT_COUNT &&
1807 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1808 BITFIELD64_BIT(varying))) {
1809 prog_data->urb_setup[varying] = slot - first_slot;
1810 }
1811 }
1812 urb_next = prev_stage_vue_map.num_slots - first_slot;
1813 }
1814 } else {
1815 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1816 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1817 /* Point size is packed into the header, not as a general attribute */
1818 if (i == VARYING_SLOT_PSIZ)
1819 continue;
1820
1821 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1822 /* The back color slot is skipped when the front color is
1823 * also written to. In addition, some slots can be
1824 * written in the vertex shader and not read in the
1825 * fragment shader. So the register number must always be
1826 * incremented, mapped or not.
1827 */
1828 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1829 prog_data->urb_setup[i] = urb_next;
1830 urb_next++;
1831 }
1832 }
1833
1834 /*
1835     * It's an FS-only attribute, and we did the interpolation for this
1836     * attribute in the SF thread. So count it here, too.
1837 *
1838 * See compile_sf_prog() for more info.
1839 */
1840 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1841 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1842 }
1843
1844 prog_data->num_varying_inputs = urb_next;
1845 }
1846
1847 void
1848 fs_visitor::assign_urb_setup()
1849 {
1850 assert(stage == MESA_SHADER_FRAGMENT);
1851 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1852
1853 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1854
1855 /* Offset all the urb_setup[] index by the actual position of the
1856 * setup regs, now that the location of the constants has been chosen.
1857 */
1858 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1859 if (inst->opcode == FS_OPCODE_LINTERP) {
1860 assert(inst->src[2].file == HW_REG);
1861 inst->src[2].fixed_hw_reg.nr += urb_start;
1862 }
1863
1864 if (inst->opcode == FS_OPCODE_CINTERP) {
1865 assert(inst->src[0].file == HW_REG);
1866 inst->src[0].fixed_hw_reg.nr += urb_start;
1867 }
1868 }
1869
1870 /* Each attribute is 4 setup channels, each of which is half a reg. */
1871 this->first_non_payload_grf =
1872 urb_start + prog_data->num_varying_inputs * 2;
1873 }
1874
1875 void
1876 fs_visitor::assign_vs_urb_setup()
1877 {
1878 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1879 int grf, count, slot, channel, attr;
1880
1881 assert(stage == MESA_SHADER_VERTEX);
1882 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1883 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1884 count++;
1885
1886 /* Each attribute is 4 regs. */
1887 this->first_non_payload_grf =
1888 payload.num_regs + prog_data->curb_read_length + count * 4;
1889
1890 unsigned vue_entries =
1891 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1892
1893 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1894 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1895
1896 assert(vs_prog_data->base.urb_read_length <= 15);
1897
1898 /* Rewrite all ATTR file references to the hw grf that they land in. */
1899 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1900 for (int i = 0; i < inst->sources; i++) {
1901 if (inst->src[i].file == ATTR) {
1902
1903 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1904 slot = count - 1;
1905 } else {
1906                /* Attributes come in a contiguous block, ordered by their
1907 * gl_vert_attrib value. That means we can compute the slot
1908 * number for an attribute by masking out the enabled
1909 * attributes before it and counting the bits.
1910 */
1911 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1912 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1913 BITFIELD64_MASK(attr));
1914 }
1915
1916 channel = inst->src[i].reg_offset & 3;
1917
1918 grf = payload.num_regs +
1919 prog_data->curb_read_length +
1920 slot * 4 + channel;
1921
1922 inst->src[i].file = HW_REG;
1923 inst->src[i].fixed_hw_reg =
1924 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1925 }
1926 }
1927 }
1928 }
1929
1930 /**
1931 * Split large virtual GRFs into separate components if we can.
1932 *
1933 * This is mostly duplicated with what brw_fs_vector_splitting does,
1934 * but that's really conservative because it's afraid of doing
1935 * splitting that doesn't result in real progress after the rest of
1936 * the optimization phases, which would cause infinite looping in
1937 * optimization. We can do it once here, safely. This also has the
1938 * opportunity to split interpolated values, or maybe even uniforms,
1939 * which we don't have at the IR level.
1940 *
1941 * We want to split, because virtual GRFs are what we register
1942 * allocate and spill (due to contiguousness requirements for some
1943 * instructions), and they're what we naturally generate in the
1944 * codegen process, but most virtual GRFs don't actually need to be
1945 * contiguous sets of GRFs. If we split, we'll end up with reduced
1946 * live intervals and better dead code elimination and coalescing.
1947 */
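/* A minimal hypothetical illustration (not from any real shader): suppose
 * vgrf4 was allocated as four contiguous registers, but every instruction
 * only ever reads or writes it two registers at a time, at offsets 0 and 2:
 *
 *    mov  vgrf4+0, ...
 *    mov  vgrf4+2, ...
 *    add  ...,      vgrf4+0, ...
 *    add  ...,      vgrf4+2, ...
 *
 * The pass below then leaves the boundary between slots 1 and 2 as a split
 * point, so vgrf4 is replaced by two independent two-register VGRFs with
 * correspondingly shorter live intervals.
 */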
1948 void
1949 fs_visitor::split_virtual_grfs()
1950 {
1951 int num_vars = this->alloc.count;
1952
1953 /* Count the total number of registers */
1954 int reg_count = 0;
1955 int vgrf_to_reg[num_vars];
1956 for (int i = 0; i < num_vars; i++) {
1957 vgrf_to_reg[i] = reg_count;
1958 reg_count += alloc.sizes[i];
1959 }
1960
1961 /* An array of "split points". For each register slot, this indicates
1962 * if this slot can be separated from the previous slot. Every time an
1963 * instruction uses multiple elements of a register (as a source or
1964 * destination), we mark the used slots as inseparable. Then we go
1965 * through and split the registers into the smallest pieces we can.
1966 */
1967 bool split_points[reg_count];
1968 memset(split_points, 0, sizeof(split_points));
1969
1970 /* Mark all used registers as fully splittable */
1971 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1972 if (inst->dst.file == GRF) {
1973 int reg = vgrf_to_reg[inst->dst.reg];
1974 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1975 split_points[reg + j] = true;
1976 }
1977
1978 for (int i = 0; i < inst->sources; i++) {
1979 if (inst->src[i].file == GRF) {
1980 int reg = vgrf_to_reg[inst->src[i].reg];
1981 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1982 split_points[reg + j] = true;
1983 }
1984 }
1985 }
1986
1987 if (brw->has_pln &&
1988 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1989 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1990 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1991 * Gen6, that was the only supported interpolation mode, and since Gen6,
1992 * delta_x and delta_y are in fixed hardware registers.
1993 */
1994 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1995 split_points[vgrf_to_reg[vgrf] + 1] = false;
1996 }
1997
1998 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1999 if (inst->dst.file == GRF) {
2000 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2001 for (int j = 1; j < inst->regs_written; j++)
2002 split_points[reg + j] = false;
2003 }
2004 for (int i = 0; i < inst->sources; i++) {
2005 if (inst->src[i].file == GRF) {
2006 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2007 for (int j = 1; j < inst->regs_read(i); j++)
2008 split_points[reg + j] = false;
2009 }
2010 }
2011 }
2012
2013 int new_virtual_grf[reg_count];
2014 int new_reg_offset[reg_count];
2015
2016 int reg = 0;
2017 for (int i = 0; i < num_vars; i++) {
2018       /* As a quick sanity check, the first slot should never be a split point. */
2019 assert(split_points[reg] == false);
2020
2021 /* j = 0 case */
2022 new_reg_offset[reg] = 0;
2023 reg++;
2024 int offset = 1;
2025
2026 /* j > 0 case */
2027 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2028 /* If this is a split point, reset the offset to 0 and allocate a
2029           * new virtual GRF covering the preceding offset registers.
2030 */
2031 if (split_points[reg]) {
2032 assert(offset <= MAX_VGRF_SIZE);
2033 int grf = alloc.allocate(offset);
2034 for (int k = reg - offset; k < reg; k++)
2035 new_virtual_grf[k] = grf;
2036 offset = 0;
2037 }
2038 new_reg_offset[reg] = offset;
2039 offset++;
2040 reg++;
2041 }
2042
2043 /* The last one gets the original register number */
2044 assert(offset <= MAX_VGRF_SIZE);
2045 alloc.sizes[i] = offset;
2046 for (int k = reg - offset; k < reg; k++)
2047 new_virtual_grf[k] = i;
2048 }
2049 assert(reg == reg_count);
2050
2051 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2052 if (inst->dst.file == GRF) {
2053 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2054 inst->dst.reg = new_virtual_grf[reg];
2055 inst->dst.reg_offset = new_reg_offset[reg];
2056 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2057 }
2058 for (int i = 0; i < inst->sources; i++) {
2059 if (inst->src[i].file == GRF) {
2060 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2061 inst->src[i].reg = new_virtual_grf[reg];
2062 inst->src[i].reg_offset = new_reg_offset[reg];
2063 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2064 }
2065 }
2066 }
2067 invalidate_live_intervals();
2068 }
2069
2070 /**
2071 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2072 *
2073 * During code generation, we create tons of temporary variables, many of
2074 * which get immediately killed and are never used again. Yet, in later
2075 * optimization and analysis passes, such as compute_live_intervals, we need
2076 * to loop over all the virtual GRFs. Compacting them can save a lot of
2077 * overhead.
2078 */
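/* A small sketch of the remapping with made-up numbers: if only vgrf0,
 * vgrf2 and vgrf5 are still referenced by any instruction, remap_table
 * ends up as {0, -1, 1, -1, -1, 2, ...}, alloc.count drops to 3, and all
 * GRF operands (plus delta_x/delta_y) are rewritten through the table.
 */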
2079 bool
2080 fs_visitor::compact_virtual_grfs()
2081 {
2082 bool progress = false;
2083 int remap_table[this->alloc.count];
2084 memset(remap_table, -1, sizeof(remap_table));
2085
2086 /* Mark which virtual GRFs are used. */
2087 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2088 if (inst->dst.file == GRF)
2089 remap_table[inst->dst.reg] = 0;
2090
2091 for (int i = 0; i < inst->sources; i++) {
2092 if (inst->src[i].file == GRF)
2093 remap_table[inst->src[i].reg] = 0;
2094 }
2095 }
2096
2097 /* Compact the GRF arrays. */
2098 int new_index = 0;
2099 for (unsigned i = 0; i < this->alloc.count; i++) {
2100 if (remap_table[i] == -1) {
2101 /* We just found an unused register. This means that we are
2102 * actually going to compact something.
2103 */
2104 progress = true;
2105 } else {
2106 remap_table[i] = new_index;
2107 alloc.sizes[new_index] = alloc.sizes[i];
2108 invalidate_live_intervals();
2109 ++new_index;
2110 }
2111 }
2112
2113 this->alloc.count = new_index;
2114
2115 /* Patch all the instructions to use the newly renumbered registers */
2116 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2117 if (inst->dst.file == GRF)
2118 inst->dst.reg = remap_table[inst->dst.reg];
2119
2120 for (int i = 0; i < inst->sources; i++) {
2121 if (inst->src[i].file == GRF)
2122 inst->src[i].reg = remap_table[inst->src[i].reg];
2123 }
2124 }
2125
2126 /* Patch all the references to delta_x/delta_y, since they're used in
2127 * register allocation. If they're unused, switch them to BAD_FILE so
2128 * we don't think some random VGRF is delta_x/delta_y.
2129 */
2130 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2131 if (delta_x[i].file == GRF) {
2132 if (remap_table[delta_x[i].reg] != -1) {
2133 delta_x[i].reg = remap_table[delta_x[i].reg];
2134 } else {
2135 delta_x[i].file = BAD_FILE;
2136 }
2137 }
2138 }
2139 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2140 if (delta_y[i].file == GRF) {
2141 if (remap_table[delta_y[i].reg] != -1) {
2142 delta_y[i].reg = remap_table[delta_y[i].reg];
2143 } else {
2144 delta_y[i].file = BAD_FILE;
2145 }
2146 }
2147 }
2148
2149 return progress;
2150 }
2151
2152 /*
2153 * Implements array access of uniforms by inserting a
2154 * PULL_CONSTANT_LOAD instruction.
2155 *
2156  * Unlike temporary GRF array access (which we don't support, due to
2157 * the difficulty of doing relative addressing on instruction
2158 * destinations), we could potentially do array access of uniforms
2159 * that were loaded in GRF space as push constants. In real-world
2160 * usage we've seen, though, the arrays being used are always larger
2161 * than we could load as push constants, so just always move all
2162 * uniform array access out to a pull constant buffer.
2163 */
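/* As a hypothetical example (not from the original source): given
 *
 *    uniform vec4 colors[16];
 *    ... = colors[i];
 *
 * the variable index reaches this pass as a UNIFORM source with a reladdr,
 * so all 64 float components of the array are appended to pull_param[],
 * and demote_pull_constants() later rewrites the access into a
 * VARYING_PULL_CONSTANT_LOAD from that buffer.
 */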
2164 void
2165 fs_visitor::move_uniform_array_access_to_pull_constants()
2166 {
2167 if (dispatch_width != 8)
2168 return;
2169
2170 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2171 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2172
2173 /* Walk through and find array access of uniforms. Put a copy of that
2174 * uniform in the pull constant buffer.
2175 *
2176 * Note that we don't move constant-indexed accesses to arrays. No
2177 * testing has been done of the performance impact of this choice.
2178 */
2179 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2180 for (int i = 0 ; i < inst->sources; i++) {
2181 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2182 continue;
2183
2184 int uniform = inst->src[i].reg;
2185
2186 /* If this array isn't already present in the pull constant buffer,
2187 * add it.
2188 */
2189 if (pull_constant_loc[uniform] == -1) {
2190 const gl_constant_value **values = &stage_prog_data->param[uniform];
2191
2192 assert(param_size[uniform]);
2193
2194 for (int j = 0; j < param_size[uniform]; j++) {
2195 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2196
2197 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2198 values[j];
2199 }
2200 }
2201 }
2202 }
2203 }
2204
2205 /**
2206 * Assign UNIFORM file registers to either push constants or pull constants.
2207 *
2208  * We allow a fragment shader to have more than the GL-specified minimum
2209  * value of the maximum number of fragment shader uniform components (64).
2210  * If there are too many of these, they'd fill up all of the register space.
2211 * So, this will push some of them out to the pull constant buffer and
2212 * update the program to load them.
2213 */
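/* As a rough example of the split performed below: a shader with 200 live
 * uniform components keeps the first 128 (max_push_components) as push
 * constants and demotes the remaining 72 to the pull constant buffer,
 * since we simply cut off the end of the list.
 */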
2214 void
2215 fs_visitor::assign_constant_locations()
2216 {
2217 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2218 if (dispatch_width != 8)
2219 return;
2220
2221 /* Find which UNIFORM registers are still in use. */
2222 bool is_live[uniforms];
2223 for (unsigned int i = 0; i < uniforms; i++) {
2224 is_live[i] = false;
2225 }
2226
2227 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2228 for (int i = 0; i < inst->sources; i++) {
2229 if (inst->src[i].file != UNIFORM)
2230 continue;
2231
2232 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2233 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2234 is_live[constant_nr] = true;
2235 }
2236 }
2237
2238 /* Only allow 16 registers (128 uniform components) as push constants.
2239 *
2240 * Just demote the end of the list. We could probably do better
2241 * here, demoting things that are rarely used in the program first.
2242 *
2243 * If changing this value, note the limitation about total_regs in
2244 * brw_curbe.c.
2245 */
2246 unsigned int max_push_components = 16 * 8;
2247 unsigned int num_push_constants = 0;
2248
2249 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2250
2251 for (unsigned int i = 0; i < uniforms; i++) {
2252 if (!is_live[i] || pull_constant_loc[i] != -1) {
2253 /* This UNIFORM register is either dead, or has already been demoted
2254 * to a pull const. Mark it as no longer living in the param[] array.
2255 */
2256 push_constant_loc[i] = -1;
2257 continue;
2258 }
2259
2260 if (num_push_constants < max_push_components) {
2261 /* Retain as a push constant. Record the location in the params[]
2262 * array.
2263 */
2264 push_constant_loc[i] = num_push_constants++;
2265 } else {
2266 /* Demote to a pull constant. */
2267 push_constant_loc[i] = -1;
2268
2269 int pull_index = stage_prog_data->nr_pull_params++;
2270 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2271 pull_constant_loc[i] = pull_index;
2272 }
2273 }
2274
2275 stage_prog_data->nr_params = num_push_constants;
2276
2277 /* Up until now, the param[] array has been indexed by reg + reg_offset
2278 * of UNIFORM registers. Condense it to only contain the uniforms we
2279 * chose to upload as push constants.
2280 */
2281 for (unsigned int i = 0; i < uniforms; i++) {
2282 int remapped = push_constant_loc[i];
2283
2284 if (remapped == -1)
2285 continue;
2286
2287 assert(remapped <= (int)i);
2288 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2289 }
2290 }
2291
2292 /**
2293 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2294 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2295 */
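/* A sketch of the non-reladdr path below, with made-up numbers: a UNIFORM
 * source whose pull_constant_loc is 5 becomes an
 * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD of the vec4 at byte offset
 * (5 * 4) & ~15 == 16, and the source is rewritten to the new VGRF with
 * set_smear(5 & 3) selecting component 1 of that vec4.
 */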
2296 void
2297 fs_visitor::demote_pull_constants()
2298 {
2299 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2300 for (int i = 0; i < inst->sources; i++) {
2301 if (inst->src[i].file != UNIFORM)
2302 continue;
2303
2304 int pull_index;
2305 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2306 if (location >= uniforms) /* Out of bounds access */
2307 pull_index = -1;
2308 else
2309 pull_index = pull_constant_loc[location];
2310
2311 if (pull_index == -1)
2312 continue;
2313
2314       /* Set up the annotation tracking for newly generated instructions. */
2315 base_ir = inst->ir;
2316 current_annotation = inst->annotation;
2317
2318 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2319 fs_reg dst = vgrf(glsl_type::float_type);
2320
2321 /* Generate a pull load into dst. */
2322 if (inst->src[i].reladdr) {
2323 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2324 surf_index,
2325 *inst->src[i].reladdr,
2326 pull_index);
2327 inst->insert_before(block, &list);
2328 inst->src[i].reladdr = NULL;
2329 } else {
2330 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2331 fs_inst *pull =
2332 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2333 dst, surf_index, offset);
2334 inst->insert_before(block, pull);
2335 inst->src[i].set_smear(pull_index & 3);
2336 }
2337
2338 /* Rewrite the instruction to use the temporary VGRF. */
2339 inst->src[i].file = GRF;
2340 inst->src[i].reg = dst.reg;
2341 inst->src[i].reg_offset = 0;
2342 inst->src[i].width = dispatch_width;
2343 }
2344 }
2345 invalidate_live_intervals();
2346 }
2347
2348 bool
2349 fs_visitor::opt_algebraic()
2350 {
2351 bool progress = false;
2352
2353 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2354 switch (inst->opcode) {
2355 case BRW_OPCODE_MOV:
2356 if (inst->src[0].file != IMM)
2357 break;
2358
2359 if (inst->saturate) {
2360 if (inst->dst.type != inst->src[0].type)
2361 assert(!"unimplemented: saturate mixed types");
2362
2363 if (brw_saturate_immediate(inst->dst.type,
2364 &inst->src[0].fixed_hw_reg)) {
2365 inst->saturate = false;
2366 progress = true;
2367 }
2368 }
2369 break;
2370
2371 case BRW_OPCODE_MUL:
2372 if (inst->src[1].file != IMM)
2373 continue;
2374
2375 /* a * 1.0 = a */
2376 if (inst->src[1].is_one()) {
2377 inst->opcode = BRW_OPCODE_MOV;
2378 inst->src[1] = reg_undef;
2379 progress = true;
2380 break;
2381 }
2382
2383 /* a * -1.0 = -a */
2384 if (inst->src[1].is_negative_one()) {
2385 inst->opcode = BRW_OPCODE_MOV;
2386 inst->src[0].negate = !inst->src[0].negate;
2387 inst->src[1] = reg_undef;
2388 progress = true;
2389 break;
2390 }
2391
2392 /* a * 0.0 = 0.0 */
2393 if (inst->src[1].is_zero()) {
2394 inst->opcode = BRW_OPCODE_MOV;
2395 inst->src[0] = inst->src[1];
2396 inst->src[1] = reg_undef;
2397 progress = true;
2398 break;
2399 }
2400
2401 if (inst->src[0].file == IMM) {
2402 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2403 inst->opcode = BRW_OPCODE_MOV;
2404 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2405 inst->src[1] = reg_undef;
2406 progress = true;
2407 break;
2408 }
2409 break;
2410 case BRW_OPCODE_ADD:
2411 if (inst->src[1].file != IMM)
2412 continue;
2413
2414 /* a + 0.0 = a */
2415 if (inst->src[1].is_zero()) {
2416 inst->opcode = BRW_OPCODE_MOV;
2417 inst->src[1] = reg_undef;
2418 progress = true;
2419 break;
2420 }
2421
2422 if (inst->src[0].file == IMM) {
2423 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2424 inst->opcode = BRW_OPCODE_MOV;
2425 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2426 inst->src[1] = reg_undef;
2427 progress = true;
2428 break;
2429 }
2430 break;
2431 case BRW_OPCODE_OR:
2432 if (inst->src[0].equals(inst->src[1])) {
2433 inst->opcode = BRW_OPCODE_MOV;
2434 inst->src[1] = reg_undef;
2435 progress = true;
2436 break;
2437 }
2438 break;
2439 case BRW_OPCODE_LRP:
2440 if (inst->src[1].equals(inst->src[2])) {
2441 inst->opcode = BRW_OPCODE_MOV;
2442 inst->src[0] = inst->src[1];
2443 inst->src[1] = reg_undef;
2444 inst->src[2] = reg_undef;
2445 progress = true;
2446 break;
2447 }
2448 break;
2449 case BRW_OPCODE_CMP:
2450 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2451 inst->src[0].abs &&
2452 inst->src[0].negate &&
2453 inst->src[1].is_zero()) {
2454 inst->src[0].abs = false;
2455 inst->src[0].negate = false;
2456 inst->conditional_mod = BRW_CONDITIONAL_Z;
2457 progress = true;
2458 break;
2459 }
2460 break;
2461 case BRW_OPCODE_SEL:
2462 if (inst->src[0].equals(inst->src[1])) {
2463 inst->opcode = BRW_OPCODE_MOV;
2464 inst->src[1] = reg_undef;
2465 inst->predicate = BRW_PREDICATE_NONE;
2466 inst->predicate_inverse = false;
2467 progress = true;
2468 } else if (inst->saturate && inst->src[1].file == IMM) {
2469 switch (inst->conditional_mod) {
2470 case BRW_CONDITIONAL_LE:
2471 case BRW_CONDITIONAL_L:
2472 switch (inst->src[1].type) {
2473 case BRW_REGISTER_TYPE_F:
2474 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2475 inst->opcode = BRW_OPCODE_MOV;
2476 inst->src[1] = reg_undef;
2477 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2478 progress = true;
2479 }
2480 break;
2481 default:
2482 break;
2483 }
2484 break;
2485 case BRW_CONDITIONAL_GE:
2486 case BRW_CONDITIONAL_G:
2487 switch (inst->src[1].type) {
2488 case BRW_REGISTER_TYPE_F:
2489 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2490 inst->opcode = BRW_OPCODE_MOV;
2491 inst->src[1] = reg_undef;
2492 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2493 progress = true;
2494 }
2495 break;
2496 default:
2497 break;
2498 }
2499 default:
2500 break;
2501 }
2502 }
2503 break;
2504 case BRW_OPCODE_MAD:
2505 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2506 inst->opcode = BRW_OPCODE_MOV;
2507 inst->src[1] = reg_undef;
2508 inst->src[2] = reg_undef;
2509 progress = true;
2510 } else if (inst->src[0].is_zero()) {
2511 inst->opcode = BRW_OPCODE_MUL;
2512 inst->src[0] = inst->src[2];
2513 inst->src[2] = reg_undef;
2514 progress = true;
2515 } else if (inst->src[1].is_one()) {
2516 inst->opcode = BRW_OPCODE_ADD;
2517 inst->src[1] = inst->src[2];
2518 inst->src[2] = reg_undef;
2519 progress = true;
2520 } else if (inst->src[2].is_one()) {
2521 inst->opcode = BRW_OPCODE_ADD;
2522 inst->src[2] = reg_undef;
2523 progress = true;
2524 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2525 inst->opcode = BRW_OPCODE_ADD;
2526 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2527 inst->src[2] = reg_undef;
2528 progress = true;
2529 }
2530 break;
2531 case SHADER_OPCODE_RCP: {
2532 fs_inst *prev = (fs_inst *)inst->prev;
2533 if (prev->opcode == SHADER_OPCODE_SQRT) {
2534 if (inst->src[0].equals(prev->dst)) {
2535 inst->opcode = SHADER_OPCODE_RSQ;
2536 inst->src[0] = prev->src[0];
2537 progress = true;
2538 }
2539 }
2540 break;
2541 }
2542 default:
2543 break;
2544 }
2545
2546 /* Swap if src[0] is immediate. */
2547 if (progress && inst->is_commutative()) {
2548 if (inst->src[0].file == IMM) {
2549 fs_reg tmp = inst->src[1];
2550 inst->src[1] = inst->src[0];
2551 inst->src[0] = tmp;
2552 }
2553 }
2554 }
2555 return progress;
2556 }
2557
2558 bool
2559 fs_visitor::opt_register_renaming()
2560 {
2561 bool progress = false;
2562 int depth = 0;
2563
2564 int remap[alloc.count];
2565 memset(remap, -1, sizeof(int) * alloc.count);
2566
2567 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2568 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2569 depth++;
2570 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2571 inst->opcode == BRW_OPCODE_WHILE) {
2572 depth--;
2573 }
2574
2575 /* Rewrite instruction sources. */
2576 for (int i = 0; i < inst->sources; i++) {
2577 if (inst->src[i].file == GRF &&
2578 remap[inst->src[i].reg] != -1 &&
2579 remap[inst->src[i].reg] != inst->src[i].reg) {
2580 inst->src[i].reg = remap[inst->src[i].reg];
2581 progress = true;
2582 }
2583 }
2584
2585 const int dst = inst->dst.reg;
2586
2587 if (depth == 0 &&
2588 inst->dst.file == GRF &&
2589 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2590 !inst->is_partial_write()) {
2591 if (remap[dst] == -1) {
2592 remap[dst] = dst;
2593 } else {
2594 remap[dst] = alloc.allocate(inst->dst.width / 8);
2595 inst->dst.reg = remap[dst];
2596 progress = true;
2597 }
2598 } else if (inst->dst.file == GRF &&
2599 remap[dst] != -1 &&
2600 remap[dst] != dst) {
2601 inst->dst.reg = remap[dst];
2602 progress = true;
2603 }
2604 }
2605
2606 if (progress) {
2607 invalidate_live_intervals();
2608
2609 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2610 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2611 delta_x[i].reg = remap[delta_x[i].reg];
2612 }
2613 }
2614 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2615 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2616 delta_y[i].reg = remap[delta_y[i].reg];
2617 }
2618 }
2619 }
2620
2621 return progress;
2622 }
2623
2624 /**
2625 * Remove redundant or useless discard jumps.
2626 *
2627 * For example, we can eliminate jumps in the following sequence:
2628 *
2629 * discard-jump (redundant with the next jump)
2630 * discard-jump (useless; jumps to the next instruction)
2631 * placeholder-halt
2632 */
2633 bool
2634 fs_visitor::opt_redundant_discard_jumps()
2635 {
2636 bool progress = false;
2637
2638 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2639
2640 fs_inst *placeholder_halt = NULL;
2641 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2642 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2643 placeholder_halt = inst;
2644 break;
2645 }
2646 }
2647
2648 if (!placeholder_halt)
2649 return false;
2650
2651 /* Delete any HALTs immediately before the placeholder halt. */
2652 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2653 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2654 prev = (fs_inst *) placeholder_halt->prev) {
2655 prev->remove(last_bblock);
2656 progress = true;
2657 }
2658
2659 if (progress)
2660 invalidate_live_intervals();
2661
2662 return progress;
2663 }
2664
2665 bool
2666 fs_visitor::compute_to_mrf()
2667 {
2668 bool progress = false;
2669 int next_ip = 0;
2670
2671 /* No MRFs on Gen >= 7. */
2672 if (brw->gen >= 7)
2673 return false;
2674
2675 calculate_live_intervals();
2676
2677 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2678 int ip = next_ip;
2679 next_ip++;
2680
2681 if (inst->opcode != BRW_OPCODE_MOV ||
2682 inst->is_partial_write() ||
2683 inst->dst.file != MRF || inst->src[0].file != GRF ||
2684 inst->dst.type != inst->src[0].type ||
2685 inst->src[0].abs || inst->src[0].negate ||
2686 !inst->src[0].is_contiguous() ||
2687 inst->src[0].subreg_offset)
2688 continue;
2689
2690 /* Work out which hardware MRF registers are written by this
2691 * instruction.
2692 */
2693 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2694 int mrf_high;
2695 if (inst->dst.reg & BRW_MRF_COMPR4) {
2696 mrf_high = mrf_low + 4;
2697 } else if (inst->exec_size == 16) {
2698 mrf_high = mrf_low + 1;
2699 } else {
2700 mrf_high = mrf_low;
2701 }
2702
2703 /* Can't compute-to-MRF this GRF if someone else was going to
2704 * read it later.
2705 */
2706 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2707 continue;
2708
2709       /* Found a move of a GRF to a MRF.  Let's see if we can rewrite the
2710        * instruction that produced this GRF so that it writes into the MRF.
2711 */
2712 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2713 if (scan_inst->dst.file == GRF &&
2714 scan_inst->dst.reg == inst->src[0].reg) {
2715             /* Found the last instruction to write the register we want to
2716              * turn into a compute-to-MRF.
2717 */
2718
2719 /* If this one instruction didn't populate all the
2720 * channels, bail. We might be able to rewrite everything
2721 * that writes that reg, but it would require smarter
2722 * tracking to delay the rewriting until complete success.
2723 */
2724 if (scan_inst->is_partial_write())
2725 break;
2726
2727 /* Things returning more than one register would need us to
2728 * understand coalescing out more than one MOV at a time.
2729 */
2730 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2731 break;
2732
2733 /* SEND instructions can't have MRF as a destination. */
2734 if (scan_inst->mlen)
2735 break;
2736
2737 if (brw->gen == 6) {
2738 /* gen6 math instructions must have the destination be
2739 * GRF, so no compute-to-MRF for them.
2740 */
2741 if (scan_inst->is_math()) {
2742 break;
2743 }
2744 }
2745
2746 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2747 /* Found the creator of our MRF's source value. */
2748 scan_inst->dst.file = MRF;
2749 scan_inst->dst.reg = inst->dst.reg;
2750 scan_inst->saturate |= inst->saturate;
2751 inst->remove(block);
2752 progress = true;
2753 }
2754 break;
2755 }
2756
2757 /* We don't handle control flow here. Most computation of
2758           * values that end up in MRFs happens shortly before the MRF
2759 * write anyway.
2760 */
2761 if (block->start() == scan_inst)
2762 break;
2763
2764 /* You can't read from an MRF, so if someone else reads our
2765 * MRF's source GRF that we wanted to rewrite, that stops us.
2766 */
2767 bool interfered = false;
2768 for (int i = 0; i < scan_inst->sources; i++) {
2769 if (scan_inst->src[i].file == GRF &&
2770 scan_inst->src[i].reg == inst->src[0].reg &&
2771 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2772 interfered = true;
2773 }
2774 }
2775 if (interfered)
2776 break;
2777
2778 if (scan_inst->dst.file == MRF) {
2779 /* If somebody else writes our MRF here, we can't
2780 * compute-to-MRF before that.
2781 */
2782 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2783 int scan_mrf_high;
2784
2785 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2786 scan_mrf_high = scan_mrf_low + 4;
2787 } else if (scan_inst->exec_size == 16) {
2788 scan_mrf_high = scan_mrf_low + 1;
2789 } else {
2790 scan_mrf_high = scan_mrf_low;
2791 }
2792
2793 if (mrf_low == scan_mrf_low ||
2794 mrf_low == scan_mrf_high ||
2795 mrf_high == scan_mrf_low ||
2796 mrf_high == scan_mrf_high) {
2797 break;
2798 }
2799 }
2800
2801 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2802 /* Found a SEND instruction, which means that there are
2803 * live values in MRFs from base_mrf to base_mrf +
2804 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2805 * above it.
2806 */
2807 if (mrf_low >= scan_inst->base_mrf &&
2808 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2809 break;
2810 }
2811 if (mrf_high >= scan_inst->base_mrf &&
2812 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2813 break;
2814 }
2815 }
2816 }
2817 }
2818
2819 if (progress)
2820 invalidate_live_intervals();
2821
2822 return progress;
2823 }
2824
2825 /**
2826 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2827 * instructions to FS_OPCODE_REP_FB_WRITE.
2828 */
2829 void
2830 fs_visitor::emit_repclear_shader()
2831 {
2832 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2833 int base_mrf = 1;
2834 int color_mrf = base_mrf + 2;
2835
2836 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2837 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2838 mov->force_writemask_all = true;
2839
2840 fs_inst *write;
2841 if (key->nr_color_regions == 1) {
2842 write = emit(FS_OPCODE_REP_FB_WRITE);
2843 write->saturate = key->clamp_fragment_color;
2844 write->base_mrf = color_mrf;
2845 write->target = 0;
2846 write->header_present = false;
2847 write->mlen = 1;
2848 } else {
2849 assume(key->nr_color_regions > 0);
2850 for (int i = 0; i < key->nr_color_regions; ++i) {
2851 write = emit(FS_OPCODE_REP_FB_WRITE);
2852 write->saturate = key->clamp_fragment_color;
2853 write->base_mrf = base_mrf;
2854 write->target = i;
2855 write->header_present = true;
2856 write->mlen = 3;
2857 }
2858 }
2859 write->eot = true;
2860
2861 calculate_cfg();
2862
2863 assign_constant_locations();
2864 assign_curb_setup();
2865
2866 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2867 assert(mov->src[0].file == HW_REG);
2868 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2869 }
2870
2871 /**
2872 * Walks through basic blocks, looking for repeated MRF writes and
2873 * removing the later ones.
2874 */
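/* A minimal hypothetical case: two identical "mov m2, vgrf7" instructions
 * in the same block, with no intervening write to m2 or vgrf7 and no SEND
 * using m2 in between; the tracking below notices the second MOV equals
 * the recorded one and removes it.
 */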
2875 bool
2876 fs_visitor::remove_duplicate_mrf_writes()
2877 {
2878 fs_inst *last_mrf_move[16];
2879 bool progress = false;
2880
2881    /* This pass would need to update the MRF tracking to handle compressed
2882     * instructions, so skip it in SIMD16 mode.
2883     */
2882 if (dispatch_width == 16)
2883 return false;
2884
2885 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2886
2887 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2888 if (inst->is_control_flow()) {
2889 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2890 }
2891
2892 if (inst->opcode == BRW_OPCODE_MOV &&
2893 inst->dst.file == MRF) {
2894 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2895 if (prev_inst && inst->equals(prev_inst)) {
2896 inst->remove(block);
2897 progress = true;
2898 continue;
2899 }
2900 }
2901
2902 /* Clear out the last-write records for MRFs that were overwritten. */
2903 if (inst->dst.file == MRF) {
2904 last_mrf_move[inst->dst.reg] = NULL;
2905 }
2906
2907 if (inst->mlen > 0 && inst->base_mrf != -1) {
2908 /* Found a SEND instruction, which will include two or fewer
2909 * implied MRF writes. We could do better here.
2910 */
2911 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2912 last_mrf_move[inst->base_mrf + i] = NULL;
2913 }
2914 }
2915
2916 /* Clear out any MRF move records whose sources got overwritten. */
2917 if (inst->dst.file == GRF) {
2918 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2919 if (last_mrf_move[i] &&
2920 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2921 last_mrf_move[i] = NULL;
2922 }
2923 }
2924 }
2925
2926 if (inst->opcode == BRW_OPCODE_MOV &&
2927 inst->dst.file == MRF &&
2928 inst->src[0].file == GRF &&
2929 !inst->is_partial_write()) {
2930 last_mrf_move[inst->dst.reg] = inst;
2931 }
2932 }
2933
2934 if (progress)
2935 invalidate_live_intervals();
2936
2937 return progress;
2938 }
2939
2940 static void
2941 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2942 {
2943 /* Clear the flag for registers that actually got read (as expected). */
2944 for (int i = 0; i < inst->sources; i++) {
2945 int grf;
2946 if (inst->src[i].file == GRF) {
2947 grf = inst->src[i].reg;
2948 } else if (inst->src[i].file == HW_REG &&
2949 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2950 grf = inst->src[i].fixed_hw_reg.nr;
2951 } else {
2952 continue;
2953 }
2954
2955 if (grf >= first_grf &&
2956 grf < first_grf + grf_len) {
2957 deps[grf - first_grf] = false;
2958 if (inst->exec_size == 16)
2959 deps[grf - first_grf + 1] = false;
2960 }
2961 }
2962 }
2963
2964 /**
2965 * Implements this workaround for the original 965:
2966 *
2967 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2968 * check for post destination dependencies on this instruction, software
2969 * must ensure that there is no destination hazard for the case of ‘write
2970 * followed by a posted write’ shown in the following example.
2971 *
2972 * 1. mov r3 0
2973 * 2. send r3.xy <rest of send instruction>
2974 * 3. mov r2 r3
2975 *
2976 * Due to no post-destination dependency check on the ‘send’, the above
2977 * code sequence could have two instructions (1 and 2) in flight at the
2978 * same time that both consider ‘r3’ as the target of their final writes.
2979 */
2980 void
2981 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2982 fs_inst *inst)
2983 {
2984 int write_len = inst->regs_written;
2985 int first_write_grf = inst->dst.reg;
2986 bool needs_dep[BRW_MAX_MRF];
2987 assert(write_len < (int)sizeof(needs_dep) - 1);
2988
2989 memset(needs_dep, false, sizeof(needs_dep));
2990 memset(needs_dep, true, write_len);
2991
2992 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2993
2994 /* Walk backwards looking for writes to registers we're writing which
2995 * aren't read since being written. If we hit the start of the program,
2996 * we assume that there are no outstanding dependencies on entry to the
2997 * program.
2998 */
2999 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3000 /* If we hit control flow, assume that there *are* outstanding
3001 * dependencies, and force their cleanup before our instruction.
3002 */
3003 if (block->start() == scan_inst) {
3004 for (int i = 0; i < write_len; i++) {
3005 if (needs_dep[i]) {
3006 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3007 }
3008 }
3009 return;
3010 }
3011
3012     /* We insert our reads as late as possible, on the assumption that any
3013      * instruction other than a MOV that might have left us an outstanding
3014 * dependency has more latency than a MOV.
3015 */
3016 if (scan_inst->dst.file == GRF) {
3017 for (int i = 0; i < scan_inst->regs_written; i++) {
3018 int reg = scan_inst->dst.reg + i;
3019
3020 if (reg >= first_write_grf &&
3021 reg < first_write_grf + write_len &&
3022 needs_dep[reg - first_write_grf]) {
3023 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3024 needs_dep[reg - first_write_grf] = false;
3025 if (scan_inst->exec_size == 16)
3026 needs_dep[reg - first_write_grf + 1] = false;
3027 }
3028 }
3029 }
3030
3031 /* Clear the flag for registers that actually got read (as expected). */
3032 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3033
3034 /* Continue the loop only if we haven't resolved all the dependencies */
3035 int i;
3036 for (i = 0; i < write_len; i++) {
3037 if (needs_dep[i])
3038 break;
3039 }
3040 if (i == write_len)
3041 return;
3042 }
3043 }
3044
3045 /**
3046 * Implements this workaround for the original 965:
3047 *
3048 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3049 * used as a destination register until after it has been sourced by an
3050 * instruction with a different destination register.
3051 */
3052 void
3053 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3054 {
3055 int write_len = inst->regs_written;
3056 int first_write_grf = inst->dst.reg;
3057 bool needs_dep[BRW_MAX_MRF];
3058 assert(write_len < (int)sizeof(needs_dep) - 1);
3059
3060 memset(needs_dep, false, sizeof(needs_dep));
3061 memset(needs_dep, true, write_len);
3062 /* Walk forwards looking for writes to registers we're writing which aren't
3063 * read before being written.
3064 */
3065 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3066 /* If we hit control flow, force resolve all remaining dependencies. */
3067 if (block->end() == scan_inst) {
3068 for (int i = 0; i < write_len; i++) {
3069 if (needs_dep[i])
3070 scan_inst->insert_before(block,
3071 DEP_RESOLVE_MOV(first_write_grf + i));
3072 }
3073 return;
3074 }
3075
3076 /* Clear the flag for registers that actually got read (as expected). */
3077 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3078
3079 /* We insert our reads as late as possible since they're reading the
3080 * result of a SEND, which has massive latency.
3081 */
3082 if (scan_inst->dst.file == GRF &&
3083 scan_inst->dst.reg >= first_write_grf &&
3084 scan_inst->dst.reg < first_write_grf + write_len &&
3085 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3086 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3087 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3088 }
3089
3090 /* Continue the loop only if we haven't resolved all the dependencies */
3091 int i;
3092 for (i = 0; i < write_len; i++) {
3093 if (needs_dep[i])
3094 break;
3095 }
3096 if (i == write_len)
3097 return;
3098 }
3099 }
3100
3101 void
3102 fs_visitor::insert_gen4_send_dependency_workarounds()
3103 {
3104 if (brw->gen != 4 || brw->is_g4x)
3105 return;
3106
3107 bool progress = false;
3108
3109 /* Note that we're done with register allocation, so GRF fs_regs always
3110 * have a .reg_offset of 0.
3111 */
3112
3113 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3114 if (inst->mlen != 0 && inst->dst.file == GRF) {
3115 insert_gen4_pre_send_dependency_workarounds(block, inst);
3116 insert_gen4_post_send_dependency_workarounds(block, inst);
3117 progress = true;
3118 }
3119 }
3120
3121 if (progress)
3122 invalidate_live_intervals();
3123 }
3124
3125 /**
3126 * Turns the generic expression-style uniform pull constant load instruction
3127 * into a hardware-specific series of instructions for loading a pull
3128 * constant.
3129 *
3130 * The expression style allows the CSE pass before this to optimize out
3131 * repeated loads from the same offset, and gives the pre-register-allocation
3132 * scheduling full flexibility, while the conversion to native instructions
3133 * allows the post-register-allocation scheduler the best information
3134 * possible.
3135 *
3136 * Note that execution masking for setting up pull constant loads is special:
3137 * the channels that need to be written are unrelated to the current execution
3138 * mask, since a later instruction will use one of the result channels as a
3139 * source operand for all 8 or 16 of its channels.
3140 */
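/* A rough sketch of the Gen7+ lowering below, with a made-up offset: a
 * generic load whose immediate byte offset is 48 gets a fresh payload
 * VGRF, an FS_OPCODE_SET_SIMD4X2_OFFSET that writes the dword offset 12
 * into it, and its opcode rewritten to the
 * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7 send that consumes that
 * payload.  Prior to Gen7 we instead just claim MRF 14 with mlen 1.
 */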
3141 void
3142 fs_visitor::lower_uniform_pull_constant_loads()
3143 {
3144 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3145 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3146 continue;
3147
3148 if (brw->gen >= 7) {
3149 /* The offset arg before was a vec4-aligned byte offset. We need to
3150 * turn it into a dword offset.
3151 */
3152 fs_reg const_offset_reg = inst->src[1];
3153 assert(const_offset_reg.file == IMM &&
3154 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3155 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3156 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3157
3158 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3159 * Reserve space for the register.
3160 */
3161 if (brw->gen >= 9) {
3162 payload.reg_offset++;
3163 alloc.sizes[payload.reg] = 2;
3164 }
3165
3166 /* This is actually going to be a MOV, but since only the first dword
3167 * is accessed, we have a special opcode to do just that one. Note
3168 * that this needs to be an operation that will be considered a def
3169 * by live variable analysis, or register allocation will explode.
3170 */
3171 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3172 8, payload, const_offset_reg);
3173 setup->force_writemask_all = true;
3174
3175 setup->ir = inst->ir;
3176 setup->annotation = inst->annotation;
3177 inst->insert_before(block, setup);
3178
3179 /* Similarly, this will only populate the first 4 channels of the
3180 * result register (since we only use smear values from 0-3), but we
3181 * don't tell the optimizer.
3182 */
3183 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3184 inst->src[1] = payload;
3185
3186 invalidate_live_intervals();
3187 } else {
3188 /* Before register allocation, we didn't tell the scheduler about the
3189 * MRF we use. We know it's safe to use this MRF because nothing
3190 * else does except for register spill/unspill, which generates and
3191 * uses its MRF within a single IR instruction.
3192 */
3193 inst->base_mrf = 14;
3194 inst->mlen = 1;
3195 }
3196 }
3197 }
3198
3199 bool
3200 fs_visitor::lower_load_payload()
3201 {
3202 bool progress = false;
3203
3204 int vgrf_to_reg[alloc.count];
3205 int reg_count = 0;
3206 for (unsigned i = 0; i < alloc.count; ++i) {
3207 vgrf_to_reg[i] = reg_count;
3208 reg_count += alloc.sizes[i];
3209 }
3210
3211 struct {
3212 bool written:1; /* Whether this register has ever been written */
3213 bool force_writemask_all:1;
3214 bool force_sechalf:1;
3215 } metadata[reg_count];
3216 memset(metadata, 0, sizeof(metadata));
3217
3218 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3219 if (inst->dst.file == GRF) {
3220 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3221 bool force_sechalf = inst->force_sechalf &&
3222 !inst->force_writemask_all;
3223 bool toggle_sechalf = inst->dst.width == 16 &&
3224 type_sz(inst->dst.type) == 4 &&
3225 !inst->force_writemask_all;
3226 for (int i = 0; i < inst->regs_written; ++i) {
3227 metadata[dst_reg + i].written = true;
3228 metadata[dst_reg + i].force_sechalf = force_sechalf;
3229 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3230 force_sechalf = (toggle_sechalf != force_sechalf);
3231 }
3232 }
3233
3234 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3235 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3236 fs_reg dst = inst->dst;
3237
3238 for (int i = 0; i < inst->sources; i++) {
3239 dst.width = inst->src[i].effective_width;
3240 dst.type = inst->src[i].type;
3241
3242 if (inst->src[i].file == BAD_FILE) {
3243 /* Do nothing but otherwise increment as normal */
3244 } else if (dst.file == MRF &&
3245 dst.width == 8 &&
3246 brw->has_compr4 &&
3247 i + 4 < inst->sources &&
3248 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3249 fs_reg compr4_dst = dst;
3250 compr4_dst.reg += BRW_MRF_COMPR4;
3251 compr4_dst.width = 16;
3252 fs_reg compr4_src = inst->src[i];
3253 compr4_src.width = 16;
3254 fs_inst *mov = MOV(compr4_dst, compr4_src);
3255 mov->force_writemask_all = true;
3256 inst->insert_before(block, mov);
3257 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3258 inst->src[i + 4].file = BAD_FILE;
3259 } else {
3260 fs_inst *mov = MOV(dst, inst->src[i]);
3261 if (inst->src[i].file == GRF) {
3262 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3263 inst->src[i].reg_offset;
3264 mov->force_sechalf = metadata[src_reg].force_sechalf;
3265 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3266 } else {
3267 /* We don't have any useful metadata for immediates or
3268 * uniforms. Assume that any of the channels of the
3269 * destination may be used.
3270 */
3271 assert(inst->src[i].file == IMM ||
3272 inst->src[i].file == UNIFORM);
3273 mov->force_writemask_all = true;
3274 }
3275
3276 if (dst.file == GRF) {
3277 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3278 const bool force_writemask = mov->force_writemask_all;
3279 metadata[dst_reg].force_writemask_all = force_writemask;
3280 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3281 if (dst.width * type_sz(dst.type) > 32) {
3282 assert(!mov->force_sechalf);
3283 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3284 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3285 }
3286 }
3287
3288 inst->insert_before(block, mov);
3289 }
3290
3291 dst = offset(dst, 1);
3292 }
3293
3294 inst->remove(block);
3295 progress = true;
3296 }
3297 }
3298
3299 if (progress)
3300 invalidate_live_intervals();
3301
3302 return progress;
3303 }
3304
3305 void
3306 fs_visitor::dump_instructions()
3307 {
3308 dump_instructions(NULL);
3309 }
3310
3311 void
3312 fs_visitor::dump_instructions(const char *name)
3313 {
3314 FILE *file = stderr;
3315 if (name && geteuid() != 0) {
3316 file = fopen(name, "w");
3317 if (!file)
3318 file = stderr;
3319 }
3320
3321 if (cfg) {
3322 calculate_register_pressure();
3323 int ip = 0, max_pressure = 0;
3324 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3325 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3326 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3327 dump_instruction(inst, file);
3328 ip++;
3329 }
3330 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3331 } else {
3332 int ip = 0;
3333 foreach_in_list(backend_instruction, inst, &instructions) {
3334 fprintf(file, "%4d: ", ip++);
3335 dump_instruction(inst, file);
3336 }
3337 }
3338
3339 if (file != stderr) {
3340 fclose(file);
3341 }
3342 }
3343
3344 void
3345 fs_visitor::dump_instruction(backend_instruction *be_inst)
3346 {
3347 dump_instruction(be_inst, stderr);
3348 }
3349
3350 void
3351 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3352 {
3353 fs_inst *inst = (fs_inst *)be_inst;
3354
3355 if (inst->predicate) {
3356 fprintf(file, "(%cf0.%d) ",
3357 inst->predicate_inverse ? '-' : '+',
3358 inst->flag_subreg);
3359 }
3360
3361 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3362 if (inst->saturate)
3363 fprintf(file, ".sat");
3364 if (inst->conditional_mod) {
3365 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3366 if (!inst->predicate &&
3367 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3368 inst->opcode != BRW_OPCODE_IF &&
3369 inst->opcode != BRW_OPCODE_WHILE))) {
3370 fprintf(file, ".f0.%d", inst->flag_subreg);
3371 }
3372 }
3373 fprintf(file, "(%d) ", inst->exec_size);
3374
3375
3376 switch (inst->dst.file) {
3377 case GRF:
3378 fprintf(file, "vgrf%d", inst->dst.reg);
3379 if (inst->dst.width != dispatch_width)
3380 fprintf(file, "@%d", inst->dst.width);
3381 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3382 inst->dst.subreg_offset)
3383 fprintf(file, "+%d.%d",
3384 inst->dst.reg_offset, inst->dst.subreg_offset);
3385 break;
3386 case MRF:
3387 fprintf(file, "m%d", inst->dst.reg);
3388 break;
3389 case BAD_FILE:
3390 fprintf(file, "(null)");
3391 break;
3392 case UNIFORM:
3393 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3394 break;
3395 case ATTR:
3396 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3397 break;
3398 case HW_REG:
3399 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3400 switch (inst->dst.fixed_hw_reg.nr) {
3401 case BRW_ARF_NULL:
3402 fprintf(file, "null");
3403 break;
3404 case BRW_ARF_ADDRESS:
3405 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3406 break;
3407 case BRW_ARF_ACCUMULATOR:
3408 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3409 break;
3410 case BRW_ARF_FLAG:
3411 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3412 inst->dst.fixed_hw_reg.subnr);
3413 break;
3414 default:
3415 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3416 inst->dst.fixed_hw_reg.subnr);
3417 break;
3418 }
3419 } else {
3420 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3421 }
3422 if (inst->dst.fixed_hw_reg.subnr)
3423 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3424 break;
3425 default:
3426 fprintf(file, "???");
3427 break;
3428 }
3429 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3430
3431 for (int i = 0; i < inst->sources; i++) {
3432 if (inst->src[i].negate)
3433 fprintf(file, "-");
3434 if (inst->src[i].abs)
3435 fprintf(file, "|");
3436 switch (inst->src[i].file) {
3437 case GRF:
3438 fprintf(file, "vgrf%d", inst->src[i].reg);
3439 if (inst->src[i].width != dispatch_width)
3440 fprintf(file, "@%d", inst->src[i].width);
3441 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3442 inst->src[i].subreg_offset)
3443 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3444 inst->src[i].subreg_offset);
3445 break;
3446 case MRF:
3447 fprintf(file, "***m%d***", inst->src[i].reg);
3448 break;
3449 case ATTR:
3450 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3451 break;
3452 case UNIFORM:
3453 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3454 if (inst->src[i].reladdr) {
3455 fprintf(file, "+reladdr");
3456 } else if (inst->src[i].subreg_offset) {
3457 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3458 inst->src[i].subreg_offset);
3459 }
3460 break;
3461 case BAD_FILE:
3462 fprintf(file, "(null)");
3463 break;
3464 case IMM:
3465 switch (inst->src[i].type) {
3466 case BRW_REGISTER_TYPE_F:
3467 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3468 break;
3469 case BRW_REGISTER_TYPE_W:
3470 case BRW_REGISTER_TYPE_D:
3471 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3472 break;
3473 case BRW_REGISTER_TYPE_UW:
3474 case BRW_REGISTER_TYPE_UD:
3475 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3476 break;
3477 case BRW_REGISTER_TYPE_VF:
3478 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3479 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3480 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3481 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3482 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3483 break;
3484 default:
3485 fprintf(file, "???");
3486 break;
3487 }
3488 break;
3489 case HW_REG:
3490 if (inst->src[i].fixed_hw_reg.negate)
3491 fprintf(file, "-");
3492 if (inst->src[i].fixed_hw_reg.abs)
3493 fprintf(file, "|");
3494 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3495 switch (inst->src[i].fixed_hw_reg.nr) {
3496 case BRW_ARF_NULL:
3497 fprintf(file, "null");
3498 break;
3499 case BRW_ARF_ADDRESS:
3500 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3501 break;
3502 case BRW_ARF_ACCUMULATOR:
3503 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3504 break;
3505 case BRW_ARF_FLAG:
3506 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3507 inst->src[i].fixed_hw_reg.subnr);
3508 break;
3509 default:
3510 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3511 inst->src[i].fixed_hw_reg.subnr);
3512 break;
3513 }
3514 } else {
3515 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3516 }
3517 if (inst->src[i].fixed_hw_reg.subnr)
3518 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3519 if (inst->src[i].fixed_hw_reg.abs)
3520 fprintf(file, "|");
3521 break;
3522 default:
3523 fprintf(file, "???");
3524 break;
3525 }
3526 if (inst->src[i].abs)
3527 fprintf(file, "|");
3528
3529 if (inst->src[i].file != IMM) {
3530 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3531 }
3532
3533 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3534 fprintf(file, ", ");
3535 }
3536
3537 fprintf(file, " ");
3538
3539 if (dispatch_width == 16 && inst->exec_size == 8) {
3540 if (inst->force_sechalf)
3541 fprintf(file, "2ndhalf ");
3542 else
3543 fprintf(file, "1sthalf ");
3544 }
3545
3546 fprintf(file, "\n");
3547 }
3548
3549 /**
3550 * Possibly returns an instruction that set up @param reg.
3551 *
3552 * Sometimes we want to take the result of some expression/variable
3553 * dereference tree and rewrite the instruction generating the result
3554 * of the tree. When processing the tree, we know that the
3555 * instructions generated are all writing temporaries that are dead
3556 * outside of this tree. So, if we have some instructions that write
3557 * a temporary, we're free to point that temp write somewhere else.
3558 *
3559  * Note that this doesn't guarantee that the generated instruction wrote
3560  * only reg -- it might be the size=4 destination of a texture instruction.
3561 */
3562 fs_inst *
3563 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3564 fs_inst *end,
3565 const fs_reg &reg)
3566 {
3567 if (end == start ||
3568 end->is_partial_write() ||
3569 reg.reladdr ||
3570 !reg.equals(end->dst)) {
3571 return NULL;
3572 } else {
3573 return end;
3574 }
3575 }
3576
3577 void
3578 fs_visitor::setup_payload_gen6()
3579 {
3580 bool uses_depth =
3581 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3582 unsigned barycentric_interp_modes =
3583 (stage == MESA_SHADER_FRAGMENT) ?
3584 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3585
3586 assert(brw->gen >= 6);
3587
3588 /* R0-1: masks, pixel X/Y coordinates. */
3589 payload.num_regs = 2;
3590    /* R2: only for 32-pixel dispatch. */
3591
3592 /* R3-26: barycentric interpolation coordinates. These appear in the
3593 * same order that they appear in the brw_wm_barycentric_interp_mode
3594 * enum. Each set of coordinates occupies 2 registers if dispatch width
3595 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3596 * appear if they were enabled using the "Barycentric Interpolation
3597 * Mode" bits in WM_STATE.
3598 */
3599 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3600 if (barycentric_interp_modes & (1 << i)) {
3601 payload.barycentric_coord_reg[i] = payload.num_regs;
3602 payload.num_regs += 2;
3603 if (dispatch_width == 16) {
3604 payload.num_regs += 2;
3605 }
3606 }
3607 }
3608
3609 /* R27: interpolated depth if uses source depth */
3610 if (uses_depth) {
3611 payload.source_depth_reg = payload.num_regs;
3612 payload.num_regs++;
3613 if (dispatch_width == 16) {
3614 /* R28: interpolated depth if not SIMD8. */
3615 payload.num_regs++;
3616 }
3617 }
3618 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3619 if (uses_depth) {
3620 payload.source_w_reg = payload.num_regs;
3621 payload.num_regs++;
3622 if (dispatch_width == 16) {
3623 /* R30: interpolated W if not SIMD8. */
3624 payload.num_regs++;
3625 }
3626 }
3627
3628 if (stage == MESA_SHADER_FRAGMENT) {
3629 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3630 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3631 prog_data->uses_pos_offset = key->compute_pos_offset;
3632 /* R31: MSAA position offsets. */
3633 if (prog_data->uses_pos_offset) {
3634 payload.sample_pos_reg = payload.num_regs;
3635 payload.num_regs++;
3636 }
3637 }
3638
3639 /* R32: MSAA input coverage mask */
3640 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3641 assert(brw->gen >= 7);
3642 payload.sample_mask_in_reg = payload.num_regs;
3643 payload.num_regs++;
3644 if (dispatch_width == 16) {
3645 /* R33: input coverage mask if not SIMD8. */
3646 payload.num_regs++;
3647 }
3648 }
3649
3650 /* R34-: bary for 32-pixel. */
3651 /* R58-59: interp W for 32-pixel. */
3652
3653 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3654 source_depth_to_render_target = true;
3655 }
3656 }
3657
3658 void
3659 fs_visitor::setup_vs_payload()
3660 {
3661 /* R0: thread header, R1: urb handles */
3662 payload.num_regs = 2;
3663 }
3664
3665 void
3666 fs_visitor::assign_binding_table_offsets()
3667 {
3668 assert(stage == MESA_SHADER_FRAGMENT);
3669 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3670 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3671 uint32_t next_binding_table_offset = 0;
3672
3673 /* If there are no color regions, we still perform an FB write to a null
3674 * renderbuffer, which we place at surface index 0.
3675 */
3676 prog_data->binding_table.render_target_start = next_binding_table_offset;
3677 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3678
3679 assign_common_binding_table_offsets(next_binding_table_offset);
3680 }
3681
3682 void
3683 fs_visitor::calculate_register_pressure()
3684 {
3685 invalidate_live_intervals();
3686 calculate_live_intervals();
3687
3688 unsigned num_instructions = 0;
3689 foreach_block(block, cfg)
3690 num_instructions += block->instructions.length();
3691
3692 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3693
3694 for (unsigned reg = 0; reg < alloc.count; reg++) {
3695 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3696 regs_live_at_ip[ip] += alloc.sizes[reg];
3697 }
3698 }
3699
3700 void
3701 fs_visitor::optimize()
3702 {
3703 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3704
3705 split_virtual_grfs();
3706
3707 move_uniform_array_access_to_pull_constants();
3708 assign_constant_locations();
3709 demote_pull_constants();
3710
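/* Helper for the optimization loop below: run a pass, accumulate its
 * progress, and, when INTEL_DEBUG=optimizer is set and the pass made
 * progress, dump the instruction list to a file named after the stage,
 * dispatch width, program name, iteration and pass number
 * (e.g. "fs8-0003-01-03-opt_cse").
 */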
3711 #define OPT(pass, args...) ({ \
3712 pass_num++; \
3713 bool this_progress = pass(args); \
3714 \
3715 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3716 char filename[64]; \
3717 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3718 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3719 \
3720 backend_visitor::dump_instructions(filename); \
3721 } \
3722 \
3723 progress = progress || this_progress; \
3724 this_progress; \
3725 })
3726
3727 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3728 char filename[64];
3729 snprintf(filename, 64, "%s%d-%04d-00-start",
3730 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3731
3732 backend_visitor::dump_instructions(filename);
3733 }
3734
3735 bool progress;
3736 int iteration = 0;
3737 int pass_num = 0;
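/* Run the optimization passes repeatedly until none of them reports
 * progress: one pass (e.g. copy propagation) frequently exposes new
 * opportunities for another (e.g. dead code elimination).
 */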
3738 do {
3739 progress = false;
3740 pass_num = 0;
3741 iteration++;
3742
3743 OPT(remove_duplicate_mrf_writes);
3744
3745 OPT(opt_algebraic);
3746 OPT(opt_cse);
3747 OPT(opt_copy_propagate);
3748 OPT(opt_peephole_predicated_break);
3749 OPT(opt_cmod_propagation);
3750 OPT(dead_code_eliminate);
3751 OPT(opt_peephole_sel);
3752 OPT(dead_control_flow_eliminate, this);
3753 OPT(opt_register_renaming);
3754 OPT(opt_redundant_discard_jumps);
3755 OPT(opt_saturate_propagation);
3756 OPT(register_coalesce);
3757 OPT(compute_to_mrf);
3758
3759 OPT(compact_virtual_grfs);
3760 } while (progress);
3761
3762 pass_num = 0;
3763
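/* Lowering LOAD_PAYLOAD expands it into individual MOVs and new register
 * uses, so re-split the virtual GRFs and re-run the cleanup passes if it
 * made progress.
 */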
3764 if (OPT(lower_load_payload)) {
3765 split_virtual_grfs();
3766 OPT(register_coalesce);
3767 OPT(compute_to_mrf);
3768 OPT(dead_code_eliminate);
3769 }
3770
3771 OPT(opt_combine_constants);
3772
3773 lower_uniform_pull_constant_loads();
3774 }
3775
3776 /**
3777 * Three-source instructions must have a GRF/MRF destination register.
3778 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3779 */
3780 void
3781 fs_visitor::fixup_3src_null_dest()
3782 {
3783 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3784 if (inst->is_3src() && inst->dst.is_null()) {
3785 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3786 inst->dst.type);
3787 }
3788 }
3789 }
3790
3791 void
3792 fs_visitor::allocate_registers()
3793 {
3794 bool allocated_without_spills;
3795
3796 static const enum instruction_scheduler_mode pre_modes[] = {
3797 SCHEDULE_PRE,
3798 SCHEDULE_PRE_NON_LIFO,
3799 SCHEDULE_PRE_LIFO,
3800 };
3801
3802 /* Try each scheduling heuristic to see if it can successfully register
3803 * allocate without spilling. They should be ordered by decreasing
3804 * performance but increasing likelihood of allocating.
3805 */
3806 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3807 schedule_instructions(pre_modes[i]);
3808
3809 if (0) {
3810 assign_regs_trivial();
3811 allocated_without_spills = true;
3812 } else {
3813 allocated_without_spills = assign_regs(false);
3814 }
3815 if (allocated_without_spills)
3816 break;
3817 }
3818
3819 if (!allocated_without_spills) {
3820 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3821 "Vertex" : "Fragment";
3822
3823 /* We assume that any spilling is worse than just dropping back to
3824 * SIMD8. There's probably actually some intermediate point where
3825 * SIMD16 with a couple of spills is still better.
3826 */
3827 if (dispatch_width == 16) {
3828 fail("Failure to register allocate. Reduce number of "
3829 "live scalar values to avoid this.");
3830 } else {
3831 perf_debug("%s shader triggered register spilling. "
3832 "Try reducing the number of live scalar values to "
3833 "improve performance.\n", stage_name);
3834 }
3835
3836 /* Since we're out of heuristics, just go spill registers until we
3837 * get an allocation.
3838 */
3839 while (!assign_regs(true)) {
3840 if (failed)
3841 break;
3842 }
3843 }
3844
3845 /* This must come after all optimization and register allocation, since
3846 * it inserts dead code that happens to have side effects, and it does
3847 * so based on the actual physical registers in use.
3848 */
3849 insert_gen4_send_dependency_workarounds();
3850
3851 if (failed)
3852 return;
3853
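/* If we had to spill, the scratch reads/writes were inserted after the
 * pre-RA schedule; a post-register-allocation scheduling pass presumably
 * helps cover some of that added latency.
 */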
3854 if (!allocated_without_spills)
3855 schedule_instructions(SCHEDULE_POST);
3856
3857 if (last_scratch > 0)
3858 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3859 }
3860
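/* Compile a vertex shader with the scalar backend: emit the IR (NIR or
 * GLSL IR), write the URB outputs, then optimize and allocate registers.
 */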
3861 bool
3862 fs_visitor::run_vs()
3863 {
3864 assert(stage == MESA_SHADER_VERTEX);
3865
3866 assign_common_binding_table_offsets(0);
3867 setup_vs_payload();
3868
3869 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3870 emit_shader_time_begin();
3871
3872 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
3873 emit_nir_code();
3874 } else {
3875 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3876 base_ir = ir;
3877 this->result = reg_undef;
3878 ir->accept(this);
3879 }
3880 base_ir = NULL;
3881 }
3882
3883 if (failed)
3884 return false;
3885
3886 emit_urb_writes();
3887
3888 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3889 emit_shader_time_end();
3890
3891 calculate_cfg();
3892
3893 optimize();
3894
3895 assign_curb_setup();
3896 assign_vs_urb_setup();
3897
3898 fixup_3src_null_dest();
3899 allocate_registers();
3900
3901 return !failed;
3902 }
3903
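/* Compile a fragment shader at the current dispatch width: set up the
 * payload, emit the IR, handle discards, alpha test and FB writes, then
 * optimize and allocate registers.
 */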
3904 bool
3905 fs_visitor::run_fs()
3906 {
3907 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3908 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3909
3910 assert(stage == MESA_SHADER_FRAGMENT);
3911
3912 sanity_param_count = prog->Parameters->NumParameters;
3913
3914 assign_binding_table_offsets();
3915
3916 if (brw->gen >= 6)
3917 setup_payload_gen6();
3918 else
3919 setup_payload_gen4();
3920
3921 if (0) {
3922 emit_dummy_fs();
3923 } else if (brw->use_rep_send && dispatch_width == 16) {
3924 emit_repclear_shader();
3925 } else {
3926 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3927 emit_shader_time_begin();
3928
3929 calculate_urb_setup();
3930 if (prog->InputsRead > 0) {
3931 if (brw->gen < 6)
3932 emit_interpolation_setup_gen4();
3933 else
3934 emit_interpolation_setup_gen6();
3935 }
3936
3937 /* We handle discards by keeping track of the still-live pixels in f0.1.
3938 * Initialize it with the dispatched pixels.
3939 */
3940 if (wm_prog_data->uses_kill) {
3941 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3942 discard_init->flag_subreg = 1;
3943 }
3944
3945 /* Generate FS IR for main(). (The visitor only descends into
3946 * functions called "main".)
3947 */
3948 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
3949 emit_nir_code();
3950 } else if (shader) {
3951 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3952 base_ir = ir;
3953 this->result = reg_undef;
3954 ir->accept(this);
3955 }
3956 } else {
3957 emit_fragment_program_code();
3958 }
3959 base_ir = NULL;
3960 if (failed)
3961 return false;
3962
3963 if (wm_prog_data->uses_kill)
3964 emit(FS_OPCODE_PLACEHOLDER_HALT);
3965
3966 if (wm_key->alpha_test_func)
3967 emit_alpha_test();
3968
3969 emit_fb_writes();
3970
3971 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3972 emit_shader_time_end();
3973
3974 calculate_cfg();
3975
3976 optimize();
3977
3978 assign_curb_setup();
3979 assign_urb_setup();
3980
3981 fixup_3src_null_dest();
3982 allocate_registers();
3983
3984 if (failed)
3985 return false;
3986 }
3987
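/* Record how many register blocks this dispatch width ended up using; the
 * fixed-function unit is presumably programmed with these counts when the
 * shader is set up.
 */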
3988 if (dispatch_width == 8)
3989 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3990 else
3991 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3992
3993 /* If any state parameters were appended, then ParameterValues could have
3994 * been realloced, in which case the driver uniform storage set up by
3995 * _mesa_associate_uniform_storage() would point to freed memory. Make
3996 * sure that didn't happen.
3997 */
3998 assert(sanity_param_count == prog->Parameters->NumParameters);
3999
4000 return !failed;
4001 }
4002
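/* Compile the given fragment program into native code: run the SIMD8 (and
 * optionally SIMD16) visitors, then hand the resulting CFGs to the
 * generator.
 */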
4003 const unsigned *
4004 brw_wm_fs_emit(struct brw_context *brw,
4005 void *mem_ctx,
4006 const struct brw_wm_prog_key *key,
4007 struct brw_wm_prog_data *prog_data,
4008 struct gl_fragment_program *fp,
4009 struct gl_shader_program *prog,
4010 unsigned *final_assembly_size)
4011 {
4012 bool start_busy = false;
4013 double start_time = 0;
4014
4015 if (unlikely(brw->perf_debug)) {
4016 start_busy = (brw->batch.last_bo &&
4017 drm_intel_bo_busy(brw->batch.last_bo));
4018 start_time = get_time();
4019 }
4020
4021 struct brw_shader *shader = NULL;
4022 if (prog)
4023 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4024
4025 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4026 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4027
4028 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4029 */
4030 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4031 if (!v.run_fs()) {
4032 if (prog) {
4033 prog->LinkStatus = false;
4034 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4035 }
4036
4037 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4038 v.fail_msg);
4039
4040 return NULL;
4041 }
4042
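/* Also try a SIMD16 compile, unless the SIMD8 compile reported SIMD16 as
 * unsupported or INTEL_DEBUG=no16 disabled it (use_rep_send still forces
 * the attempt).  The SIMD16 visitor reuses the SIMD8 uniform layout via
 * import_uniforms().
 */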
4043 cfg_t *simd16_cfg = NULL;
4044 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4045 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4046 if (!v.simd16_unsupported) {
4047 /* Try a SIMD16 compile */
4048 v2.import_uniforms(&v);
4049 if (!v2.run_fs()) {
4050 perf_debug("SIMD16 shader failed to compile, falling back to "
4051 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4052 } else {
4053 simd16_cfg = v2.cfg;
4054 }
4055 } else {
4056 perf_debug("SIMD16 shader unsupported, falling back to "
4057 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4058 }
4059 }
4060
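/* Decide which programs to ship.  The SIMD8 program is dropped entirely
 * when SIMD8 is disabled (INTEL_DEBUG=no8 or the driver's no_simd8 flag)
 * or on Gen4, provided a SIMD16 program is available.
 */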
4061 cfg_t *simd8_cfg;
4062 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4063 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4064 simd8_cfg = NULL;
4065 prog_data->no_8 = true;
4066 } else {
4067 simd8_cfg = v.cfg;
4068 prog_data->no_8 = false;
4069 }
4070
4071 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4072 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4073
4074 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4075 char *name;
4076 if (prog)
4077 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4078 prog->Label ? prog->Label : "unnamed",
4079 prog->Name);
4080 else
4081 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4082
4083 g.enable_debug(name);
4084 }
4085
4086 if (simd8_cfg)
4087 g.generate_code(simd8_cfg, 8);
4088 if (simd16_cfg)
4089 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4090
4091 if (unlikely(brw->perf_debug) && shader) {
4092 if (shader->compiled_once)
4093 brw_wm_debug_recompile(brw, prog, key);
4094 shader->compiled_once = true;
4095
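/* If the GPU was busy when this compile started but has gone idle by now,
 * the compile most likely stalled rendering; warn about it.
 */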
4096 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4097 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4098 (get_time() - start_time) * 1000);
4099 }
4100 }
4101
4102 return g.get_assembly(final_assembly_size);
4103 }
4104
4105 extern "C" bool
4106 brw_fs_precompile(struct gl_context *ctx,
4107 struct gl_shader_program *shader_prog,
4108 struct gl_program *prog)
4109 {
4110 struct brw_context *brw = brw_context(ctx);
4111 struct brw_wm_prog_key key;
4112
4113 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4114 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4115 bool program_uses_dfdy = fp->UsesDFdy;
4116
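/* Construct a guess at the program key most likely to be used at draw time,
 * so the program can be compiled up front at link time.  The real
 * prog_offset/prog_data are saved and restored around the compile below.
 */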
4117 memset(&key, 0, sizeof(key));
4118
4119 if (brw->gen < 6) {
4120 if (fp->UsesKill)
4121 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4122
4123 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4124 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4125
4126 /* Just assume depth testing. */
4127 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4128 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4129 }
4130
4131 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4132 BRW_FS_VARYING_INPUT_MASK) > 16)
4133 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4134
4135 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4136 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4137 for (unsigned i = 0; i < sampler_count; i++) {
4138 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4139 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4140 key.tex.swizzles[i] =
4141 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4142 } else {
4143 /* Color sampler: assume no swizzling. */
4144 key.tex.swizzles[i] = SWIZZLE_XYZW;
4145 }
4146 }
4147
4148 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4149 key.drawable_height = ctx->DrawBuffer->Height;
4150 }
4151
4152 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4153 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4154 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4155
4156 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4157 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4158 key.nr_color_regions > 1;
4159 }
4160
4161 key.program_string_id = bfp->id;
4162
4163 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4164 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4165
4166 bool success = brw_compile_wm_prog(brw, shader_prog, bfp, &key);
4167
4168 brw->wm.base.prog_offset = old_prog_offset;
4169 brw->wm.prog_data = old_prog_data;
4170
4171 return success;
4172 }