i965/fs: Make emit_shader_time_write return rather than emit.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
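      /* For example: an 8-wide float destination with stride 1 covers
       * 8 * 1 * 4 = 32 bytes, giving regs_written == 1, while a 16-wide
       * float destination covers 64 bytes and counts as 2 registers.
       */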
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
138
139 this->writes_accumulator = false;
140 }
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(brw->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
356
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
359 {
360 uint8_t exec_size = dst.width;
361 for (int i = 0; i < sources; ++i) {
362 assert(src[i].width % dst.width == 0);
363 if (src[i].width > exec_size)
364 exec_size = src[i].width;
365 }
366
367 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
368 dst, src, sources);
369 inst->regs_written = 0;
370 for (int i = 0; i < sources; ++i) {
371 /* The LOAD_PAYLOAD instruction only really makes sense if we are
372 * dealing with whole registers. If this ever changes, we can deal
373 * with it later.
374 */
375 int size = inst->src[i].effective_width * type_sz(src[i].type);
376 assert(size % 32 == 0);
377 inst->regs_written += (size + 31) / 32;
378 }
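   /* For example, a payload assembled from two 8-wide float sources adds
    * 8 * 4 = 32 bytes per source, so regs_written comes out as 2.
    */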
379
380 return inst;
381 }
382
383 exec_list
384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
385 const fs_reg &surf_index,
386 const fs_reg &varying_offset,
387 uint32_t const_offset)
388 {
389 exec_list instructions;
390 fs_inst *inst;
391
392 /* We have our constant surface use a pitch of 4 bytes, so our index can
393 * be any component of a vector, and then we load 4 contiguous
394 * components starting from that.
395 *
 396     * We break down the const_offset into a portion added to the variable
397 * offset and a portion done using reg_offset, which means that if you
398 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
399 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
400 * CSE can later notice that those loads are all the same and eliminate
401 * the redundant ones.
402 */
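   /* For example, with const_offset == 18 the ADD below folds 16
    * (18 & ~3) into the variable offset, and the final MOV picks
    * component 2 (18 & 3, times 'scale' on gen4 SIMD8) out of the
    * loaded vec4.
    */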
403 fs_reg vec4_offset = vgrf(glsl_type::int_type);
404 instructions.push_tail(ADD(vec4_offset,
405 varying_offset, fs_reg(const_offset & ~3)));
406
407 int scale = 1;
408 if (brw->gen == 4 && dst.width == 8) {
409 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
410 * u, v, r) as parameters, or we can just use the SIMD16 message
411 * consisting of (header, u). We choose the second, at the cost of a
412 * longer return length.
413 */
414 scale = 2;
415 }
416
417 enum opcode op;
418 if (brw->gen >= 7)
419 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
420 else
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
422
423 assert(dst.width % 8 == 0);
424 int regs_written = 4 * (dst.width / 8) * scale;
425 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
426 dst.type, dst.width);
427 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
428 inst->regs_written = regs_written;
429 instructions.push_tail(inst);
430
431 if (brw->gen < 7) {
432 inst->base_mrf = 13;
433 inst->header_present = true;
434 if (brw->gen == 4)
435 inst->mlen = 3;
436 else
437 inst->mlen = 1 + dispatch_width / 8;
438 }
439
440 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
441 instructions.push_tail(MOV(dst, result));
442
443 return instructions;
444 }
445
446 /**
447 * A helper for MOV generation for fixing up broken hardware SEND dependency
448 * handling.
449 */
450 fs_inst *
451 fs_visitor::DEP_RESOLVE_MOV(int grf)
452 {
453 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
454
455 inst->ir = NULL;
456 inst->annotation = "send dependency resolve";
457
 458    /* The caller always wants this uncompressed, to emit the minimal extra
 459     * dependencies and to avoid having to deal with aligning its regs to 2.
460 */
461 inst->exec_size = 8;
462
463 return inst;
464 }
465
466 bool
467 fs_inst::equals(fs_inst *inst) const
468 {
469 return (opcode == inst->opcode &&
470 dst.equals(inst->dst) &&
471 src[0].equals(inst->src[0]) &&
472 src[1].equals(inst->src[1]) &&
473 src[2].equals(inst->src[2]) &&
474 saturate == inst->saturate &&
475 predicate == inst->predicate &&
476 conditional_mod == inst->conditional_mod &&
477 mlen == inst->mlen &&
478 base_mrf == inst->base_mrf &&
479 target == inst->target &&
480 eot == inst->eot &&
481 header_present == inst->header_present &&
482 shadow_compare == inst->shadow_compare &&
483 exec_size == inst->exec_size &&
484 offset == inst->offset);
485 }
486
487 bool
488 fs_inst::overwrites_reg(const fs_reg &reg) const
489 {
490 return (reg.file == dst.file &&
491 reg.reg == dst.reg &&
492 reg.reg_offset >= dst.reg_offset &&
493 reg.reg_offset < dst.reg_offset + regs_written);
494 }
495
496 bool
497 fs_inst::is_send_from_grf() const
498 {
499 switch (opcode) {
500 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
501 case SHADER_OPCODE_SHADER_TIME_ADD:
502 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
503 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
504 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
505 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
506 case SHADER_OPCODE_UNTYPED_ATOMIC:
507 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
508 case SHADER_OPCODE_URB_WRITE_SIMD8:
509 return true;
510 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
511 return src[1].file == GRF;
512 case FS_OPCODE_FB_WRITE:
513 return src[0].file == GRF;
514 default:
515 if (is_tex())
516 return src[0].file == GRF;
517
518 return false;
519 }
520 }
521
522 bool
523 fs_inst::can_do_source_mods(struct brw_context *brw)
524 {
525 if (brw->gen == 6 && is_math())
526 return false;
527
528 if (is_send_from_grf())
529 return false;
530
531 if (!backend_instruction::can_do_source_mods())
532 return false;
533
534 return true;
535 }
536
537 void
538 fs_reg::init()
539 {
540 memset(this, 0, sizeof(*this));
541 stride = 1;
542 }
543
544 /** Generic unset register constructor. */
545 fs_reg::fs_reg()
546 {
547 init();
548 this->file = BAD_FILE;
549 }
550
551 /** Immediate value constructor. */
552 fs_reg::fs_reg(float f)
553 {
554 init();
555 this->file = IMM;
556 this->type = BRW_REGISTER_TYPE_F;
557 this->fixed_hw_reg.dw1.f = f;
558 this->width = 1;
559 }
560
561 /** Immediate value constructor. */
562 fs_reg::fs_reg(int32_t i)
563 {
564 init();
565 this->file = IMM;
566 this->type = BRW_REGISTER_TYPE_D;
567 this->fixed_hw_reg.dw1.d = i;
568 this->width = 1;
569 }
570
571 /** Immediate value constructor. */
572 fs_reg::fs_reg(uint32_t u)
573 {
574 init();
575 this->file = IMM;
576 this->type = BRW_REGISTER_TYPE_UD;
577 this->fixed_hw_reg.dw1.ud = u;
578 this->width = 1;
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf[4])
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
588 }
589
590 /** Vector float immediate value constructor. */
591 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
592 {
593 init();
594 this->file = IMM;
595 this->type = BRW_REGISTER_TYPE_VF;
596 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
597 (vf1 << 8) |
598 (vf2 << 16) |
599 (vf3 << 24);
600 }
601
602 /** Fixed brw_reg. */
603 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
604 {
605 init();
606 this->file = HW_REG;
607 this->fixed_hw_reg = fixed_hw_reg;
608 this->type = fixed_hw_reg.type;
609 this->width = 1 << fixed_hw_reg.width;
610 }
611
612 bool
613 fs_reg::equals(const fs_reg &r) const
614 {
615 return (file == r.file &&
616 reg == r.reg &&
617 reg_offset == r.reg_offset &&
618 subreg_offset == r.subreg_offset &&
619 type == r.type &&
620 negate == r.negate &&
621 abs == r.abs &&
622 !reladdr && !r.reladdr &&
623 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
624 width == r.width &&
625 stride == r.stride);
626 }
627
628 fs_reg &
629 fs_reg::set_smear(unsigned subreg)
630 {
631 assert(file != HW_REG && file != IMM);
632 subreg_offset = subreg * type_sz(type);
633 stride = 0;
634 return *this;
635 }
636
637 bool
638 fs_reg::is_contiguous() const
639 {
640 return stride == 1;
641 }
642
643 int
644 fs_visitor::type_size(const struct glsl_type *type)
645 {
646 unsigned int size, i;
647
648 switch (type->base_type) {
649 case GLSL_TYPE_UINT:
650 case GLSL_TYPE_INT:
651 case GLSL_TYPE_FLOAT:
652 case GLSL_TYPE_BOOL:
653 return type->components();
654 case GLSL_TYPE_ARRAY:
655 return type_size(type->fields.array) * type->length;
656 case GLSL_TYPE_STRUCT:
657 size = 0;
658 for (i = 0; i < type->length; i++) {
659 size += type_size(type->fields.structure[i].type);
660 }
661 return size;
662 case GLSL_TYPE_SAMPLER:
663 /* Samplers take up no register space, since they're baked in at
664 * link time.
665 */
666 return 0;
667 case GLSL_TYPE_ATOMIC_UINT:
668 return 0;
669 case GLSL_TYPE_IMAGE:
670 case GLSL_TYPE_VOID:
671 case GLSL_TYPE_ERROR:
672 case GLSL_TYPE_INTERFACE:
673 case GLSL_TYPE_DOUBLE:
674 unreachable("not reached");
675 }
676
677 return 0;
678 }
679
680 fs_reg
681 fs_visitor::get_timestamp()
682 {
683 assert(brw->gen >= 7);
684
685 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
686 BRW_ARF_TIMESTAMP,
687 0),
688 BRW_REGISTER_TYPE_UD));
689
690 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
691
692 fs_inst *mov = emit(MOV(dst, ts));
693 /* We want to read the 3 fields we care about even if it's not enabled in
694 * the dispatch.
695 */
696 mov->force_writemask_all = true;
697
698 /* The caller wants the low 32 bits of the timestamp. Since it's running
 699     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3.6 seconds,
700 * which is plenty of time for our purposes. It is identical across the
701 * EUs, but since it's tracking GPU core speed it will increment at a
702 * varying rate as render P-states change.
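    * (As a rough check: 2^32 cycles at ~1.2e9 cycles per second is about
    * 3.6 seconds between rollovers.)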
703 *
704 * The caller could also check if render P-states have changed (or anything
705 * else that might disrupt timing) by setting smear to 2 and checking if
706 * that field is != 0.
707 */
708 dst.set_smear(0);
709
710 return dst;
711 }
712
713 void
714 fs_visitor::emit_shader_time_begin()
715 {
716 current_annotation = "shader time start";
717 shader_start_time = get_timestamp();
718 }
719
720 void
721 fs_visitor::emit_shader_time_end()
722 {
723 current_annotation = "shader time end";
724
725 enum shader_time_shader_type type, written_type, reset_type;
726 switch (stage) {
727 case MESA_SHADER_VERTEX:
728 type = ST_VS;
729 written_type = ST_VS_WRITTEN;
730 reset_type = ST_VS_RESET;
731 break;
732 case MESA_SHADER_GEOMETRY:
733 type = ST_GS;
734 written_type = ST_GS_WRITTEN;
735 reset_type = ST_GS_RESET;
736 break;
737 case MESA_SHADER_FRAGMENT:
738 if (dispatch_width == 8) {
739 type = ST_FS8;
740 written_type = ST_FS8_WRITTEN;
741 reset_type = ST_FS8_RESET;
742 } else {
743 assert(dispatch_width == 16);
744 type = ST_FS16;
745 written_type = ST_FS16_WRITTEN;
746 reset_type = ST_FS16_RESET;
747 }
748 break;
749 default:
750 unreachable("fs_visitor::emit_shader_time_end missing code");
751 }
752
753 fs_reg shader_end_time = get_timestamp();
754
755 /* Check that there weren't any timestamp reset events (assuming these
756 * were the only two timestamp reads that happened).
757 */
758 fs_reg reset = shader_end_time;
759 reset.set_smear(2);
760 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
761 test->conditional_mod = BRW_CONDITIONAL_Z;
762 test->force_writemask_all = true;
763 emit(IF(BRW_PREDICATE_NORMAL));
764
765 fs_reg start = shader_start_time;
766 start.negate = true;
767 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
768 diff.set_smear(0);
769 fs_inst *add = ADD(diff, start, shader_end_time);
770 add->force_writemask_all = true;
771 emit(add);
772
773 /* If there were no instructions between the two timestamp gets, the diff
774 * is 2 cycles. Remove that overhead, so I can forget about that when
775 * trying to determine the time taken for single instructions.
776 */
777 add = ADD(diff, diff, fs_reg(-2u));
778 add->force_writemask_all = true;
779 emit(add);
780
781 emit(SHADER_TIME_ADD(type, diff));
782 emit(SHADER_TIME_ADD(written_type, fs_reg(1u)));
783 emit(BRW_OPCODE_ELSE);
784 emit(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
785 emit(BRW_OPCODE_ENDIF);
786 }
787
788 fs_inst *
789 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
790 {
791 int shader_time_index =
792 brw_get_shader_time_index(brw, shader_prog, prog, type);
793 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
794
795 fs_reg payload;
796 if (dispatch_width == 8)
797 payload = vgrf(glsl_type::uvec2_type);
798 else
799 payload = vgrf(glsl_type::uint_type);
800
801 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
802 fs_reg(), payload, offset, value);
803 }
804
805 void
806 fs_visitor::vfail(const char *format, va_list va)
807 {
808 char *msg;
809
810 if (failed)
811 return;
812
813 failed = true;
814
815 msg = ralloc_vasprintf(mem_ctx, format, va);
816 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
817
818 this->fail_msg = msg;
819
820 if (debug_enabled) {
821 fprintf(stderr, "%s", msg);
822 }
823 }
824
825 void
826 fs_visitor::fail(const char *format, ...)
827 {
828 va_list va;
829
830 va_start(va, format);
831 vfail(format, va);
832 va_end(va);
833 }
834
835 /**
836 * Mark this program as impossible to compile in SIMD16 mode.
837 *
838 * During the SIMD8 compile (which happens first), we can detect and flag
839 * things that are unsupported in SIMD16 mode, so the compiler can skip
840 * the SIMD16 compile altogether.
841 *
842 * During a SIMD16 compile (if one happens anyway), this just calls fail().
843 */
844 void
845 fs_visitor::no16(const char *format, ...)
846 {
847 va_list va;
848
849 va_start(va, format);
850
851 if (dispatch_width == 16) {
852 vfail(format, va);
853 } else {
854 simd16_unsupported = true;
855
856 if (brw->perf_debug) {
857 if (no16_msg)
858 ralloc_vasprintf_append(&no16_msg, format, va);
859 else
860 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
861 }
862 }
863
864 va_end(va);
865 }
866
867 fs_inst *
868 fs_visitor::emit(enum opcode opcode)
869 {
870 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
871 }
872
873 fs_inst *
874 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
875 {
876 return emit(new(mem_ctx) fs_inst(opcode, dst));
877 }
878
879 fs_inst *
880 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
881 {
882 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
883 }
884
885 fs_inst *
886 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
887 const fs_reg &src1)
888 {
889 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
890 }
891
892 fs_inst *
893 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
894 const fs_reg &src1, const fs_reg &src2)
895 {
896 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
897 }
898
899 fs_inst *
900 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
901 fs_reg src[], int sources)
902 {
903 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
904 }
905
906 /**
907 * Returns true if the instruction has a flag that means it won't
908 * update an entire destination register.
909 *
910 * For example, dead code elimination and live variable analysis want to know
911 * when a write to a variable screens off any preceding values that were in
912 * it.
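 *
 * For example, a predicated MOV, or an 8-wide write of a 16-bit type
 * (8 * 2 = 16 bytes, less than a full 32-byte register), both count as
 * partial writes here.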
913 */
914 bool
915 fs_inst::is_partial_write() const
916 {
917 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
918 (this->dst.width * type_sz(this->dst.type)) < 32 ||
919 !this->dst.is_contiguous());
920 }
921
922 int
923 fs_inst::regs_read(int arg) const
924 {
925 if (is_tex() && arg == 0 && src[0].file == GRF) {
926 return mlen;
927 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
928 return mlen;
929 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
930 return mlen;
931 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
932 return mlen;
933 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
934 return mlen;
935 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
936 return mlen;
937 }
938
939 switch (src[arg].file) {
940 case BAD_FILE:
941 case UNIFORM:
942 case IMM:
943 return 1;
944 case GRF:
945 case HW_REG:
946 if (src[arg].stride == 0) {
947 return 1;
948 } else {
949 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
950 return (size + 31) / 32;
951 }
952 case MRF:
953 unreachable("MRF registers are not allowed as sources");
954 default:
955 unreachable("Invalid register file");
956 }
957 }
958
959 bool
960 fs_inst::reads_flag() const
961 {
962 return predicate;
963 }
964
965 bool
966 fs_inst::writes_flag() const
967 {
968 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
969 opcode != BRW_OPCODE_IF &&
970 opcode != BRW_OPCODE_WHILE)) ||
971 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
972 }
973
974 /**
975 * Returns how many MRFs an FS opcode will write over.
976 *
977 * Note that this is not the 0 or 1 implied writes in an actual gen
978 * instruction -- the FS opcodes often generate MOVs in addition.
979 */
980 int
981 fs_visitor::implied_mrf_writes(fs_inst *inst)
982 {
983 if (inst->mlen == 0)
984 return 0;
985
986 if (inst->base_mrf == -1)
987 return 0;
988
989 switch (inst->opcode) {
990 case SHADER_OPCODE_RCP:
991 case SHADER_OPCODE_RSQ:
992 case SHADER_OPCODE_SQRT:
993 case SHADER_OPCODE_EXP2:
994 case SHADER_OPCODE_LOG2:
995 case SHADER_OPCODE_SIN:
996 case SHADER_OPCODE_COS:
997 return 1 * dispatch_width / 8;
998 case SHADER_OPCODE_POW:
999 case SHADER_OPCODE_INT_QUOTIENT:
1000 case SHADER_OPCODE_INT_REMAINDER:
1001 return 2 * dispatch_width / 8;
1002 case SHADER_OPCODE_TEX:
1003 case FS_OPCODE_TXB:
1004 case SHADER_OPCODE_TXD:
1005 case SHADER_OPCODE_TXF:
1006 case SHADER_OPCODE_TXF_CMS:
1007 case SHADER_OPCODE_TXF_MCS:
1008 case SHADER_OPCODE_TG4:
1009 case SHADER_OPCODE_TG4_OFFSET:
1010 case SHADER_OPCODE_TXL:
1011 case SHADER_OPCODE_TXS:
1012 case SHADER_OPCODE_LOD:
1013 return 1;
1014 case FS_OPCODE_FB_WRITE:
1015 return 2;
1016 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1017 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1018 return 1;
1019 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1020 return inst->mlen;
1021 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1022 return 2;
1023 case SHADER_OPCODE_UNTYPED_ATOMIC:
1024 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1025 case SHADER_OPCODE_URB_WRITE_SIMD8:
1026 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1027 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1028 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1029 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1030 return 0;
1031 default:
1032 unreachable("not reached");
1033 }
1034 }
1035
1036 fs_reg
1037 fs_visitor::vgrf(const glsl_type *const type)
1038 {
1039 int reg_width = dispatch_width / 8;
1040 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1041 brw_type_for_base_type(type), dispatch_width);
1042 }
1043
1044 fs_reg
1045 fs_visitor::vgrf(int num_components)
1046 {
1047 int reg_width = dispatch_width / 8;
1048 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1049 BRW_REGISTER_TYPE_F, dispatch_width);
1050 }
1051
1052 /** Fixed HW reg constructor. */
1053 fs_reg::fs_reg(enum register_file file, int reg)
1054 {
1055 init();
1056 this->file = file;
1057 this->reg = reg;
1058 this->type = BRW_REGISTER_TYPE_F;
1059
1060 switch (file) {
1061 case UNIFORM:
1062 this->width = 1;
1063 break;
1064 default:
1065 this->width = 8;
1066 }
1067 }
1068
1069 /** Fixed HW reg constructor. */
1070 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1071 {
1072 init();
1073 this->file = file;
1074 this->reg = reg;
1075 this->type = type;
1076
1077 switch (file) {
1078 case UNIFORM:
1079 this->width = 1;
1080 break;
1081 default:
1082 this->width = 8;
1083 }
1084 }
1085
1086 /** Fixed HW reg constructor. */
1087 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1088 uint8_t width)
1089 {
1090 init();
1091 this->file = file;
1092 this->reg = reg;
1093 this->type = type;
1094 this->width = width;
1095 }
1096
1097 fs_reg *
1098 fs_visitor::variable_storage(ir_variable *var)
1099 {
1100 return (fs_reg *)hash_table_find(this->variable_ht, var);
1101 }
1102
1103 void
1104 import_uniforms_callback(const void *key,
1105 void *data,
1106 void *closure)
1107 {
1108 struct hash_table *dst_ht = (struct hash_table *)closure;
1109 const fs_reg *reg = (const fs_reg *)data;
1110
1111 if (reg->file != UNIFORM)
1112 return;
1113
1114 hash_table_insert(dst_ht, data, key);
1115 }
1116
 1117 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
 1118  * This brings in those uniform definitions.
1119 */
1120 void
1121 fs_visitor::import_uniforms(fs_visitor *v)
1122 {
1123 hash_table_call_foreach(v->variable_ht,
1124 import_uniforms_callback,
1125 variable_ht);
1126 this->push_constant_loc = v->push_constant_loc;
1127 this->pull_constant_loc = v->pull_constant_loc;
1128 this->uniforms = v->uniforms;
1129 this->param_size = v->param_size;
1130 }
1131
1132 /* Our support for uniforms is piggy-backed on the struct
1133 * gl_fragment_program, because that's where the values actually
1134 * get stored, rather than in some global gl_shader_program uniform
1135 * store.
1136 */
1137 void
1138 fs_visitor::setup_uniform_values(ir_variable *ir)
1139 {
1140 int namelen = strlen(ir->name);
1141
1142 /* The data for our (non-builtin) uniforms is stored in a series of
1143 * gl_uniform_driver_storage structs for each subcomponent that
1144 * glGetUniformLocation() could name. We know it's been set up in the same
1145 * order we'd walk the type, so walk the list of storage and find anything
1146 * with our name, or the prefix of a component that starts with our name.
1147 */
1148 unsigned params_before = uniforms;
1149 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1150 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1151
1152 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1153 (storage->name[namelen] != 0 &&
1154 storage->name[namelen] != '.' &&
1155 storage->name[namelen] != '[')) {
1156 continue;
1157 }
1158
1159 unsigned slots = storage->type->component_slots();
1160 if (storage->array_elements)
1161 slots *= storage->array_elements;
1162
1163 for (unsigned i = 0; i < slots; i++) {
1164 stage_prog_data->param[uniforms++] = &storage->storage[i];
1165 }
1166 }
1167
1168 /* Make sure we actually initialized the right amount of stuff here. */
1169 assert(params_before + ir->type->component_slots() == uniforms);
1170 (void)params_before;
1171 }
1172
1173
1174 /* Our support for builtin uniforms is even scarier than non-builtin.
1175 * It sits on top of the PROG_STATE_VAR parameters that are
1176 * automatically updated from GL context state.
1177 */
1178 void
1179 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1180 {
1181 const ir_state_slot *const slots = ir->get_state_slots();
1182 assert(slots != NULL);
1183
1184 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1185 /* This state reference has already been setup by ir_to_mesa, but we'll
1186 * get the same index back here.
1187 */
1188 int index = _mesa_add_state_reference(this->prog->Parameters,
1189 (gl_state_index *)slots[i].tokens);
1190
1191 /* Add each of the unique swizzles of the element as a parameter.
1192 * This'll end up matching the expected layout of the
1193 * array/matrix/structure we're trying to fill in.
1194 */
1195 int last_swiz = -1;
1196 for (unsigned int j = 0; j < 4; j++) {
1197 int swiz = GET_SWZ(slots[i].swizzle, j);
1198 if (swiz == last_swiz)
1199 break;
1200 last_swiz = swiz;
1201
1202 stage_prog_data->param[uniforms++] =
1203 &prog->Parameters->ParameterValues[index][swiz];
1204 }
1205 }
1206 }
1207
1208 fs_reg *
1209 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1210 bool origin_upper_left)
1211 {
1212 assert(stage == MESA_SHADER_FRAGMENT);
1213 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1214 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1215 fs_reg wpos = *reg;
1216 bool flip = !origin_upper_left ^ key->render_to_fbo;
1217
1218 /* gl_FragCoord.x */
1219 if (pixel_center_integer) {
1220 emit(MOV(wpos, this->pixel_x));
1221 } else {
1222 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1223 }
1224 wpos = offset(wpos, 1);
1225
1226 /* gl_FragCoord.y */
1227 if (!flip && pixel_center_integer) {
1228 emit(MOV(wpos, this->pixel_y));
1229 } else {
1230 fs_reg pixel_y = this->pixel_y;
1231 float offset = (pixel_center_integer ? 0.0 : 0.5);
1232
1233 if (flip) {
1234 pixel_y.negate = true;
1235 offset += key->drawable_height - 1.0;
1236 }
1237
1238 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1239 }
1240 wpos = offset(wpos, 1);
1241
1242 /* gl_FragCoord.z */
1243 if (brw->gen >= 6) {
1244 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1245 } else {
1246 emit(FS_OPCODE_LINTERP, wpos,
1247 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1248 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1249 interp_reg(VARYING_SLOT_POS, 2));
1250 }
1251 wpos = offset(wpos, 1);
1252
1253 /* gl_FragCoord.w: Already set up in emit_interpolation */
1254 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1255
1256 return reg;
1257 }
1258
1259 fs_inst *
1260 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1261 glsl_interp_qualifier interpolation_mode,
1262 bool is_centroid, bool is_sample)
1263 {
1264 brw_wm_barycentric_interp_mode barycoord_mode;
1265 if (brw->gen >= 6) {
1266 if (is_centroid) {
1267 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1268 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1269 else
1270 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1271 } else if (is_sample) {
1272 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1273 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1274 else
1275 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1276 } else {
1277 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1278 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1279 else
1280 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1281 }
1282 } else {
1283 /* On Ironlake and below, there is only one interpolation mode.
1284 * Centroid interpolation doesn't mean anything on this hardware --
1285 * there is no multisampling.
1286 */
1287 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1288 }
1289 return emit(FS_OPCODE_LINTERP, attr,
1290 this->delta_x[barycoord_mode],
1291 this->delta_y[barycoord_mode], interp);
1292 }
1293
1294 void
1295 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1296 const glsl_type *type,
1297 glsl_interp_qualifier interpolation_mode,
1298 int location, bool mod_centroid,
1299 bool mod_sample)
1300 {
1301 attr.type = brw_type_for_base_type(type->get_scalar_type());
1302
1303 assert(stage == MESA_SHADER_FRAGMENT);
1304 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1305 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1306
1307 unsigned int array_elements;
1308
1309 if (type->is_array()) {
1310 array_elements = type->length;
1311 if (array_elements == 0) {
1312 fail("dereferenced array '%s' has length 0\n", name);
1313 }
1314 type = type->fields.array;
1315 } else {
1316 array_elements = 1;
1317 }
1318
1319 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1320 bool is_gl_Color =
1321 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1322 if (key->flat_shade && is_gl_Color) {
1323 interpolation_mode = INTERP_QUALIFIER_FLAT;
1324 } else {
1325 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1326 }
1327 }
1328
1329 for (unsigned int i = 0; i < array_elements; i++) {
1330 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1331 if (prog_data->urb_setup[location] == -1) {
1332 /* If there's no incoming setup data for this slot, don't
1333 * emit interpolation for it.
1334 */
1335 attr = offset(attr, type->vector_elements);
1336 location++;
1337 continue;
1338 }
1339
1340 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1341 /* Constant interpolation (flat shading) case. The SF has
1342 * handed us defined values in only the constant offset
1343 * field of the setup reg.
1344 */
1345 for (unsigned int k = 0; k < type->vector_elements; k++) {
1346 struct brw_reg interp = interp_reg(location, k);
1347 interp = suboffset(interp, 3);
1348 interp.type = attr.type;
1349 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1350 attr = offset(attr, 1);
1351 }
1352 } else {
1353 /* Smooth/noperspective interpolation case. */
1354 for (unsigned int k = 0; k < type->vector_elements; k++) {
1355 struct brw_reg interp = interp_reg(location, k);
1356 if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1357 /* Get the pixel/sample mask into f0 so that we know
1358 * which pixels are lit. Then, for each channel that is
1359 * unlit, replace the centroid data with non-centroid
1360 * data.
1361 */
1362 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1363
1364 fs_inst *inst;
1365 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1366 false, false);
1367 inst->predicate = BRW_PREDICATE_NORMAL;
1368 inst->predicate_inverse = true;
1369 if (brw->has_pln)
1370 inst->no_dd_clear = true;
1371
1372 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1373 mod_centroid && !key->persample_shading,
1374 mod_sample || key->persample_shading);
1375 inst->predicate = BRW_PREDICATE_NORMAL;
1376 inst->predicate_inverse = false;
1377 if (brw->has_pln)
1378 inst->no_dd_check = true;
1379
1380 } else {
1381 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1382 mod_centroid && !key->persample_shading,
1383 mod_sample || key->persample_shading);
1384 }
1385 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1386 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1387 }
1388 attr = offset(attr, 1);
1389 }
1390
1391 }
1392 location++;
1393 }
1394 }
1395 }
1396
1397 fs_reg *
1398 fs_visitor::emit_frontfacing_interpolation()
1399 {
1400 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1401
1402 if (brw->gen >= 6) {
1403 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1404 * a boolean result from this (~0/true or 0/false).
1405 *
1406 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1407 * this task in only one instruction:
1408 * - a negation source modifier will flip the bit; and
1409 * - a W -> D type conversion will sign extend the bit into the high
1410 * word of the destination.
1411 *
1412 * An ASR 15 fills the low word of the destination.
1413 */
1414 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1415 g0.negate = true;
1416
1417 emit(ASR(*reg, g0, fs_reg(15)));
1418 } else {
1419 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1420 * a boolean result from this (1/true or 0/false).
1421 *
1422 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1423 * the negation source modifier to flip it. Unfortunately the SHR
1424 * instruction only operates on UD (or D with an abs source modifier)
1425 * sources without negation.
1426 *
1427 * Instead, use ASR (which will give ~0/true or 0/false).
1428 */
1429 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1430 g1_6.negate = true;
1431
1432 emit(ASR(*reg, g1_6, fs_reg(31)));
1433 }
1434
1435 return reg;
1436 }
1437
1438 void
1439 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1440 {
1441 assert(stage == MESA_SHADER_FRAGMENT);
1442 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1443 assert(dst.type == BRW_REGISTER_TYPE_F);
1444
1445 if (key->compute_pos_offset) {
1446 /* Convert int_sample_pos to floating point */
1447 emit(MOV(dst, int_sample_pos));
1448 /* Scale to the range [0, 1] */
1449 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1450 }
1451 else {
1452 /* From ARB_sample_shading specification:
1453 * "When rendering to a non-multisample buffer, or if multisample
1454 * rasterization is disabled, gl_SamplePosition will always be
 1455     *  (0.5, 0.5)."
1456 */
1457 emit(MOV(dst, fs_reg(0.5f)));
1458 }
1459 }
1460
1461 fs_reg *
1462 fs_visitor::emit_samplepos_setup()
1463 {
1464 assert(brw->gen >= 6);
1465
1466 this->current_annotation = "compute sample position";
1467 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1468 fs_reg pos = *reg;
1469 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1470 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1471
1472 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1473 * mode will be enabled.
1474 *
1475 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1476 * R31.1:0 Position Offset X/Y for Slot[3:0]
1477 * R31.3:2 Position Offset X/Y for Slot[7:4]
1478 * .....
1479 *
1480 * The X, Y sample positions come in as bytes in thread payload. So, read
1481 * the positions using vstride=16, width=8, hstride=2.
1482 */
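   /* With a byte-typed region and hstride == 2 this walks every other
    * byte, i.e. the per-slot X offsets; the Y offsets are read the same
    * way below via suboffset(sample_pos_reg, 1).
    */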
1483 struct brw_reg sample_pos_reg =
1484 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1485 BRW_REGISTER_TYPE_B), 16, 8, 2);
1486
1487 if (dispatch_width == 8) {
1488 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1489 } else {
1490 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1491 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1492 ->force_sechalf = true;
1493 }
1494 /* Compute gl_SamplePosition.x */
1495 compute_sample_position(pos, int_sample_x);
1496 pos = offset(pos, 1);
1497 if (dispatch_width == 8) {
1498 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1499 } else {
1500 emit(MOV(half(int_sample_y, 0),
1501 fs_reg(suboffset(sample_pos_reg, 1))));
1502 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1503 ->force_sechalf = true;
1504 }
1505 /* Compute gl_SamplePosition.y */
1506 compute_sample_position(pos, int_sample_y);
1507 return reg;
1508 }
1509
1510 fs_reg *
1511 fs_visitor::emit_sampleid_setup()
1512 {
1513 assert(stage == MESA_SHADER_FRAGMENT);
1514 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1515 assert(brw->gen >= 6);
1516
1517 this->current_annotation = "compute sample id";
1518 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1519
1520 if (key->compute_sample_id) {
1521 fs_reg t1 = vgrf(glsl_type::int_type);
1522 fs_reg t2 = vgrf(glsl_type::int_type);
1523 t2.type = BRW_REGISTER_TYPE_UW;
1524
1525 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1526 * 8x multisampling, subspan 0 will represent sample N (where N
1527 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1528 * 7. We can find the value of N by looking at R0.0 bits 7:6
1529 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1530 * (since samples are always delivered in pairs). That is, we
1531 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1532 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1533 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1534 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1535 * populating a temporary variable with the sequence (0, 1, 2, 3),
1536 * and then reading from it using vstride=1, width=4, hstride=0.
1537 * These computations hold good for 4x multisampling as well.
1538 *
1539 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1540 * the first four slots are sample 0 of subspan 0; the next four
1541 * are sample 1 of subspan 0; the third group is sample 0 of
1542 * subspan 1, and finally sample 1 of subspan 1.
1543 */
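      /* For example, with SSPI == 2 (R0.0 bits 7:6 == 10b), R0.0 & 0xc0 is
       * 0x80 and shifting right by 5 gives 4 == 2 * SSPI; the SET_SAMPLE_ID
       * ADD below then yields (4, 4, 4, 4, 5, 5, 5, 5) for a SIMD8 dispatch.
       */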
1544 fs_inst *inst;
1545 inst = emit(BRW_OPCODE_AND, t1,
1546 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1547 fs_reg(0xc0));
1548 inst->force_writemask_all = true;
1549 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1550 inst->force_writemask_all = true;
1551 /* This works for both SIMD8 and SIMD16 */
1552 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1553 inst->force_writemask_all = true;
1554 /* This special instruction takes care of setting vstride=1,
1555 * width=4, hstride=0 of t2 during an ADD instruction.
1556 */
1557 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1558 } else {
1559 /* As per GL_ARB_sample_shading specification:
1560 * "When rendering to a non-multisample buffer, or if multisample
1561 * rasterization is disabled, gl_SampleID will always be zero."
1562 */
1563 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1564 }
1565
1566 return reg;
1567 }
1568
1569 void
1570 fs_visitor::resolve_source_modifiers(fs_reg *src)
1571 {
1572 if (!src->abs && !src->negate)
1573 return;
1574
1575 fs_reg temp = retype(vgrf(1), src->type);
1576 emit(MOV(temp, *src));
1577 *src = temp;
1578 }
1579
1580 fs_reg
1581 fs_visitor::fix_math_operand(fs_reg src)
1582 {
1583 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1584 * might be able to do better by doing execsize = 1 math and then
1585 * expanding that result out, but we would need to be careful with
1586 * masking.
1587 *
1588 * The hardware ignores source modifiers (negate and abs) on math
1589 * instructions, so we also move to a temp to set those up.
1590 */
1591 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1592 !src.abs && !src.negate)
1593 return src;
1594
1595 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1596 * operands to math
1597 */
1598 if (brw->gen >= 7 && src.file != IMM)
1599 return src;
1600
1601 fs_reg expanded = vgrf(glsl_type::float_type);
1602 expanded.type = src.type;
1603 emit(BRW_OPCODE_MOV, expanded, src);
1604 return expanded;
1605 }
1606
1607 fs_inst *
1608 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1609 {
1610 switch (opcode) {
1611 case SHADER_OPCODE_RCP:
1612 case SHADER_OPCODE_RSQ:
1613 case SHADER_OPCODE_SQRT:
1614 case SHADER_OPCODE_EXP2:
1615 case SHADER_OPCODE_LOG2:
1616 case SHADER_OPCODE_SIN:
1617 case SHADER_OPCODE_COS:
1618 break;
1619 default:
1620 unreachable("not reached: bad math opcode");
1621 }
1622
1623 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1624 * might be able to do better by doing execsize = 1 math and then
1625 * expanding that result out, but we would need to be careful with
1626 * masking.
1627 *
1628 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1629 * instructions, so we also move to a temp to set those up.
1630 */
1631 if (brw->gen == 6 || brw->gen == 7)
1632 src = fix_math_operand(src);
1633
1634 fs_inst *inst = emit(opcode, dst, src);
1635
1636 if (brw->gen < 6) {
1637 inst->base_mrf = 2;
1638 inst->mlen = dispatch_width / 8;
1639 }
1640
1641 return inst;
1642 }
1643
1644 fs_inst *
1645 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1646 {
1647 int base_mrf = 2;
1648 fs_inst *inst;
1649
1650 if (brw->gen >= 8) {
1651 inst = emit(opcode, dst, src0, src1);
1652 } else if (brw->gen >= 6) {
1653 src0 = fix_math_operand(src0);
1654 src1 = fix_math_operand(src1);
1655
1656 inst = emit(opcode, dst, src0, src1);
1657 } else {
1658 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1659 * "Message Payload":
1660 *
1661 * "Operand0[7]. For the INT DIV functions, this operand is the
1662 * denominator."
1663 * ...
1664 * "Operand1[7]. For the INT DIV functions, this operand is the
1665 * numerator."
1666 */
1667 bool is_int_div = opcode != SHADER_OPCODE_POW;
1668 fs_reg &op0 = is_int_div ? src1 : src0;
1669 fs_reg &op1 = is_int_div ? src0 : src1;
1670
1671 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1672 inst = emit(opcode, dst, op0, reg_null_f);
1673
1674 inst->base_mrf = base_mrf;
1675 inst->mlen = 2 * dispatch_width / 8;
1676 }
1677 return inst;
1678 }
1679
1680 void
1681 fs_visitor::assign_curb_setup()
1682 {
1683 if (dispatch_width == 8) {
1684 prog_data->dispatch_grf_start_reg = payload.num_regs;
1685 } else {
1686 assert(stage == MESA_SHADER_FRAGMENT);
1687 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1688 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1689 }
1690
1691 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1692
1693 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1694 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1695 for (unsigned int i = 0; i < inst->sources; i++) {
1696 if (inst->src[i].file == UNIFORM) {
1697 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1698 int constant_nr;
1699 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1700 constant_nr = push_constant_loc[uniform_nr];
1701 } else {
1702 /* Section 5.11 of the OpenGL 4.1 spec says:
1703 * "Out-of-bounds reads return undefined values, which include
1704 * values from other variables of the active program or zero."
1705 * Just return the first push constant.
1706 */
1707 constant_nr = 0;
1708 }
1709
1710 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1711 constant_nr / 8,
1712 constant_nr % 8);
1713
1714 inst->src[i].file = HW_REG;
1715 inst->src[i].fixed_hw_reg = byte_offset(
1716 retype(brw_reg, inst->src[i].type),
1717 inst->src[i].subreg_offset);
1718 }
1719 }
1720 }
1721 }
1722
1723 void
1724 fs_visitor::calculate_urb_setup()
1725 {
1726 assert(stage == MESA_SHADER_FRAGMENT);
1727 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1728 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1729
1730 memset(prog_data->urb_setup, -1,
1731 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1732
1733 int urb_next = 0;
1734 /* Figure out where each of the incoming setup attributes lands. */
1735 if (brw->gen >= 6) {
1736 if (_mesa_bitcount_64(prog->InputsRead &
1737 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1738 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1739 * first 16 varying inputs, so we can put them wherever we want.
1740 * Just put them in order.
1741 *
1742 * This is useful because it means that (a) inputs not used by the
1743 * fragment shader won't take up valuable register space, and (b) we
1744 * won't have to recompile the fragment shader if it gets paired with
1745 * a different vertex (or geometry) shader.
1746 */
1747 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1748 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1749 BITFIELD64_BIT(i)) {
1750 prog_data->urb_setup[i] = urb_next++;
1751 }
1752 }
1753 } else {
1754 /* We have enough input varyings that the SF/SBE pipeline stage can't
1755 * arbitrarily rearrange them to suit our whim; we have to put them
1756 * in an order that matches the output of the previous pipeline stage
1757 * (geometry or vertex shader).
1758 */
1759 struct brw_vue_map prev_stage_vue_map;
1760 brw_compute_vue_map(brw, &prev_stage_vue_map,
1761 key->input_slots_valid);
1762 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1763 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1764 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1765 slot++) {
1766 int varying = prev_stage_vue_map.slot_to_varying[slot];
1767 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1768 * unused.
1769 */
1770 if (varying != BRW_VARYING_SLOT_COUNT &&
1771 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1772 BITFIELD64_BIT(varying))) {
1773 prog_data->urb_setup[varying] = slot - first_slot;
1774 }
1775 }
1776 urb_next = prev_stage_vue_map.num_slots - first_slot;
1777 }
1778 } else {
1779 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1780 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1781 /* Point size is packed into the header, not as a general attribute */
1782 if (i == VARYING_SLOT_PSIZ)
1783 continue;
1784
1785 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1786 /* The back color slot is skipped when the front color is
1787 * also written to. In addition, some slots can be
1788 * written in the vertex shader and not read in the
1789 * fragment shader. So the register number must always be
1790 * incremented, mapped or not.
1791 */
1792 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1793 prog_data->urb_setup[i] = urb_next;
1794 urb_next++;
1795 }
1796 }
1797
1798 /*
 1799  * It's an FS-only attribute, and we did the interpolation for this attribute
 1800  * in the SF thread. So, count it here, too.
1801 *
1802 * See compile_sf_prog() for more info.
1803 */
1804 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1805 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1806 }
1807
1808 prog_data->num_varying_inputs = urb_next;
1809 }
1810
1811 void
1812 fs_visitor::assign_urb_setup()
1813 {
1814 assert(stage == MESA_SHADER_FRAGMENT);
1815 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1816
1817 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1818
1819 /* Offset all the urb_setup[] index by the actual position of the
1820 * setup regs, now that the location of the constants has been chosen.
1821 */
1822 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1823 if (inst->opcode == FS_OPCODE_LINTERP) {
1824 assert(inst->src[2].file == HW_REG);
1825 inst->src[2].fixed_hw_reg.nr += urb_start;
1826 }
1827
1828 if (inst->opcode == FS_OPCODE_CINTERP) {
1829 assert(inst->src[0].file == HW_REG);
1830 inst->src[0].fixed_hw_reg.nr += urb_start;
1831 }
1832 }
1833
1834 /* Each attribute is 4 setup channels, each of which is half a reg. */
1835 this->first_non_payload_grf =
1836 urb_start + prog_data->num_varying_inputs * 2;
1837 }
1838
1839 void
1840 fs_visitor::assign_vs_urb_setup()
1841 {
1842 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1843 int grf, count, slot, channel, attr;
1844
1845 assert(stage == MESA_SHADER_VERTEX);
1846 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1847 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1848 count++;
1849
1850 /* Each attribute is 4 regs. */
1851 this->first_non_payload_grf =
1852 payload.num_regs + prog_data->curb_read_length + count * 4;
1853
1854 unsigned vue_entries =
1855 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1856
1857 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1858 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1859
1860 assert(vs_prog_data->base.urb_read_length <= 15);
1861
1862 /* Rewrite all ATTR file references to the hw grf that they land in. */
1863 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1864 for (int i = 0; i < inst->sources; i++) {
1865 if (inst->src[i].file == ATTR) {
1866
1867 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1868 slot = count - 1;
1869 } else {
 1870             /* Attributes come in as a contiguous block, ordered by their
1871 * gl_vert_attrib value. That means we can compute the slot
1872 * number for an attribute by masking out the enabled
1873 * attributes before it and counting the bits.
1874 */
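            /* E.g. if inputs_read enables attributes 0, 1 and 3, then for
             * attr == 3 the mask keeps bits 0..2, the popcount is 2, and
             * attribute 3 lands in slot 2.
             */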
1875 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1876 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1877 BITFIELD64_MASK(attr));
1878 }
1879
1880 channel = inst->src[i].reg_offset & 3;
1881
1882 grf = payload.num_regs +
1883 prog_data->curb_read_length +
1884 slot * 4 + channel;
1885
1886 inst->src[i].file = HW_REG;
1887 inst->src[i].fixed_hw_reg =
1888 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1889 }
1890 }
1891 }
1892 }
1893
1894 /**
1895 * Split large virtual GRFs into separate components if we can.
1896 *
1897 * This is mostly duplicated with what brw_fs_vector_splitting does,
1898 * but that's really conservative because it's afraid of doing
1899 * splitting that doesn't result in real progress after the rest of
1900 * the optimization phases, which would cause infinite looping in
1901 * optimization. We can do it once here, safely. This also has the
1902 * opportunity to split interpolated values, or maybe even uniforms,
1903 * which we don't have at the IR level.
1904 *
1905 * We want to split, because virtual GRFs are what we register
1906 * allocate and spill (due to contiguousness requirements for some
1907 * instructions), and they're what we naturally generate in the
1908 * codegen process, but most virtual GRFs don't actually need to be
1909 * contiguous sets of GRFs. If we split, we'll end up with reduced
1910 * live intervals and better dead code elimination and coalescing.
1911 */
1912 void
1913 fs_visitor::split_virtual_grfs()
1914 {
1915 int num_vars = this->alloc.count;
1916
1917 /* Count the total number of registers */
1918 int reg_count = 0;
1919 int vgrf_to_reg[num_vars];
1920 for (int i = 0; i < num_vars; i++) {
1921 vgrf_to_reg[i] = reg_count;
1922 reg_count += alloc.sizes[i];
1923 }
1924
1925 /* An array of "split points". For each register slot, this indicates
1926 * if this slot can be separated from the previous slot. Every time an
1927 * instruction uses multiple elements of a register (as a source or
1928 * destination), we mark the used slots as inseparable. Then we go
1929 * through and split the registers into the smallest pieces we can.
1930 */
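   /* Illustrative example (not part of the original comment): a size-4 VGRF
    * that is only ever written and read one register at a time keeps all
    * three of its interior split points and is broken into four 1-register
    * VGRFs, while a VGRF filled by a single multi-register write (e.g. a
    * send result) has those split points cleared below and stays contiguous.
    */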
1931 bool split_points[reg_count];
1932 memset(split_points, 0, sizeof(split_points));
1933
1934 /* Mark all used registers as fully splittable */
1935 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1936 if (inst->dst.file == GRF) {
1937 int reg = vgrf_to_reg[inst->dst.reg];
1938 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1939 split_points[reg + j] = true;
1940 }
1941
1942 for (int i = 0; i < inst->sources; i++) {
1943 if (inst->src[i].file == GRF) {
1944 int reg = vgrf_to_reg[inst->src[i].reg];
1945 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1946 split_points[reg + j] = true;
1947 }
1948 }
1949 }
1950
1951 if (brw->has_pln &&
1952 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1953 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1954 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1955 * Gen6, that was the only supported interpolation mode, and since Gen6,
1956 * delta_x and delta_y are in fixed hardware registers.
1957 */
1958 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1959 split_points[vgrf_to_reg[vgrf] + 1] = false;
1960 }
1961
1962 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1963 if (inst->dst.file == GRF) {
1964 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1965 for (int j = 1; j < inst->regs_written; j++)
1966 split_points[reg + j] = false;
1967 }
1968 for (int i = 0; i < inst->sources; i++) {
1969 if (inst->src[i].file == GRF) {
1970 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1971 for (int j = 1; j < inst->regs_read(i); j++)
1972 split_points[reg + j] = false;
1973 }
1974 }
1975 }
1976
1977 int new_virtual_grf[reg_count];
1978 int new_reg_offset[reg_count];
1979
1980 int reg = 0;
1981 for (int i = 0; i < num_vars; i++) {
1982 /* The first one should always be 0 as a quick sanity check. */
1983 assert(split_points[reg] == false);
1984
1985 /* j = 0 case */
1986 new_reg_offset[reg] = 0;
1987 reg++;
1988 int offset = 1;
1989
1990 /* j > 0 case */
1991 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1992 /* If this is a split point, reset the offset to 0 and allocate a
1993 * new virtual GRF for the previous offset many registers
1994 */
1995 if (split_points[reg]) {
1996 assert(offset <= MAX_VGRF_SIZE);
1997 int grf = alloc.allocate(offset);
1998 for (int k = reg - offset; k < reg; k++)
1999 new_virtual_grf[k] = grf;
2000 offset = 0;
2001 }
2002 new_reg_offset[reg] = offset;
2003 offset++;
2004 reg++;
2005 }
2006
2007 /* The last one gets the original register number */
2008 assert(offset <= MAX_VGRF_SIZE);
2009 alloc.sizes[i] = offset;
2010 for (int k = reg - offset; k < reg; k++)
2011 new_virtual_grf[k] = i;
2012 }
2013 assert(reg == reg_count);
2014
2015 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2016 if (inst->dst.file == GRF) {
2017 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2018 inst->dst.reg = new_virtual_grf[reg];
2019 inst->dst.reg_offset = new_reg_offset[reg];
2020 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2021 }
2022 for (int i = 0; i < inst->sources; i++) {
2023 if (inst->src[i].file == GRF) {
2024 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2025 inst->src[i].reg = new_virtual_grf[reg];
2026 inst->src[i].reg_offset = new_reg_offset[reg];
2027 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2028 }
2029 }
2030 }
2031 invalidate_live_intervals();
2032 }
2033
2034 /**
2035 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2036 *
2037 * During code generation, we create tons of temporary variables, many of
2038 * which get immediately killed and are never used again. Yet, in later
2039 * optimization and analysis passes, such as compute_live_intervals, we need
2040 * to loop over all the virtual GRFs. Compacting them can save a lot of
2041 * overhead.
2042 */
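/* Illustrative example (not part of the original comment): with five VGRFs of
 * which only #0, #2 and #4 are still referenced, remap_table below ends up as
 * {0, -1, 1, -1, 2} and alloc.count drops from 5 to 3.
 */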
2043 bool
2044 fs_visitor::compact_virtual_grfs()
2045 {
2046 bool progress = false;
2047 int remap_table[this->alloc.count];
2048 memset(remap_table, -1, sizeof(remap_table));
2049
2050 /* Mark which virtual GRFs are used. */
2051 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2052 if (inst->dst.file == GRF)
2053 remap_table[inst->dst.reg] = 0;
2054
2055 for (int i = 0; i < inst->sources; i++) {
2056 if (inst->src[i].file == GRF)
2057 remap_table[inst->src[i].reg] = 0;
2058 }
2059 }
2060
2061 /* Compact the GRF arrays. */
2062 int new_index = 0;
2063 for (unsigned i = 0; i < this->alloc.count; i++) {
2064 if (remap_table[i] == -1) {
2065 /* We just found an unused register. This means that we are
2066 * actually going to compact something.
2067 */
2068 progress = true;
2069 } else {
2070 remap_table[i] = new_index;
2071 alloc.sizes[new_index] = alloc.sizes[i];
2072 invalidate_live_intervals();
2073 ++new_index;
2074 }
2075 }
2076
2077 this->alloc.count = new_index;
2078
2079 /* Patch all the instructions to use the newly renumbered registers */
2080 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2081 if (inst->dst.file == GRF)
2082 inst->dst.reg = remap_table[inst->dst.reg];
2083
2084 for (int i = 0; i < inst->sources; i++) {
2085 if (inst->src[i].file == GRF)
2086 inst->src[i].reg = remap_table[inst->src[i].reg];
2087 }
2088 }
2089
2090 /* Patch all the references to delta_x/delta_y, since they're used in
2091 * register allocation. If they're unused, switch them to BAD_FILE so
2092 * we don't think some random VGRF is delta_x/delta_y.
2093 */
2094 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2095 if (delta_x[i].file == GRF) {
2096 if (remap_table[delta_x[i].reg] != -1) {
2097 delta_x[i].reg = remap_table[delta_x[i].reg];
2098 } else {
2099 delta_x[i].file = BAD_FILE;
2100 }
2101 }
2102 }
2103 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2104 if (delta_y[i].file == GRF) {
2105 if (remap_table[delta_y[i].reg] != -1) {
2106 delta_y[i].reg = remap_table[delta_y[i].reg];
2107 } else {
2108 delta_y[i].file = BAD_FILE;
2109 }
2110 }
2111 }
2112
2113 return progress;
2114 }
2115
2116 /*
2117 * Implements array access of uniforms by inserting a
2118 * PULL_CONSTANT_LOAD instruction.
2119 *
2120 * Unlike temporary GRF array access (where we don't support it due to
2121 * the difficulty of doing relative addressing on instruction
2122 * destinations), we could potentially do array access of uniforms
2123 * that were loaded in GRF space as push constants. In real-world
2124 * usage we've seen, though, the arrays being used are always larger
2125 * than we could load as push constants, so just always move all
2126 * uniform array access out to a pull constant buffer.
2127 */
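/* Sketch of the effect (illustrative): for an access like u[i] into a uniform
 * float array u, every element of the array is appended to
 * stage_prog_data->pull_param[] here, and the reladdr-indexed UNIFORM source
 * is later turned into a VARYING_PULL_CONSTANT_LOAD by
 * demote_pull_constants().
 */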
2128 void
2129 fs_visitor::move_uniform_array_access_to_pull_constants()
2130 {
2131 if (dispatch_width != 8)
2132 return;
2133
2134 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2135 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2136
2137 /* Walk through and find array access of uniforms. Put a copy of that
2138 * uniform in the pull constant buffer.
2139 *
2140 * Note that we don't move constant-indexed accesses to arrays. No
2141 * testing has been done of the performance impact of this choice.
2142 */
2143 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2144 for (int i = 0 ; i < inst->sources; i++) {
2145 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2146 continue;
2147
2148 int uniform = inst->src[i].reg;
2149
2150 /* If this array isn't already present in the pull constant buffer,
2151 * add it.
2152 */
2153 if (pull_constant_loc[uniform] == -1) {
2154 const gl_constant_value **values = &stage_prog_data->param[uniform];
2155
2156 assert(param_size[uniform]);
2157
2158 for (int j = 0; j < param_size[uniform]; j++) {
2159 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2160
2161 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2162 values[j];
2163 }
2164 }
2165 }
2166 }
2167 }
2168
2169 /**
2170 * Assign UNIFORM file registers to either push constants or pull constants.
2171 *
2172  * We allow a fragment shader to have more than the GL-specified minimum
2173  * for the maximum number of fragment shader uniform components (64).  If
2174  * there are too many of these, they'd fill up all of the register space.
2175 * So, this will push some of them out to the pull constant buffer and
2176 * update the program to load them.
2177 */
2178 void
2179 fs_visitor::assign_constant_locations()
2180 {
2181 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2182 if (dispatch_width != 8)
2183 return;
2184
2185 /* Find which UNIFORM registers are still in use. */
2186 bool is_live[uniforms];
2187 for (unsigned int i = 0; i < uniforms; i++) {
2188 is_live[i] = false;
2189 }
2190
2191 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2192 for (int i = 0; i < inst->sources; i++) {
2193 if (inst->src[i].file != UNIFORM)
2194 continue;
2195
2196 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2197 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2198 is_live[constant_nr] = true;
2199 }
2200 }
2201
2202 /* Only allow 16 registers (128 uniform components) as push constants.
2203 *
2204 * Just demote the end of the list. We could probably do better
2205 * here, demoting things that are rarely used in the program first.
2206 *
2207 * If changing this value, note the limitation about total_regs in
2208 * brw_curbe.c.
2209 */
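   /* Illustrative example (not part of the original comment): with 200 live
    * uniform components, the first 128 keep push constant slots in the loop
    * below and the remaining 72 are demoted to the pull constant buffer.
    */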
2210 unsigned int max_push_components = 16 * 8;
2211 unsigned int num_push_constants = 0;
2212
2213 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2214
2215 for (unsigned int i = 0; i < uniforms; i++) {
2216 if (!is_live[i] || pull_constant_loc[i] != -1) {
2217 /* This UNIFORM register is either dead, or has already been demoted
2218 * to a pull const. Mark it as no longer living in the param[] array.
2219 */
2220 push_constant_loc[i] = -1;
2221 continue;
2222 }
2223
2224 if (num_push_constants < max_push_components) {
2225 /* Retain as a push constant. Record the location in the params[]
2226 * array.
2227 */
2228 push_constant_loc[i] = num_push_constants++;
2229 } else {
2230 /* Demote to a pull constant. */
2231 push_constant_loc[i] = -1;
2232
2233 int pull_index = stage_prog_data->nr_pull_params++;
2234 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2235 pull_constant_loc[i] = pull_index;
2236 }
2237 }
2238
2239 stage_prog_data->nr_params = num_push_constants;
2240
2241 /* Up until now, the param[] array has been indexed by reg + reg_offset
2242 * of UNIFORM registers. Condense it to only contain the uniforms we
2243 * chose to upload as push constants.
2244 */
2245 for (unsigned int i = 0; i < uniforms; i++) {
2246 int remapped = push_constant_loc[i];
2247
2248 if (remapped == -1)
2249 continue;
2250
2251 assert(remapped <= (int)i);
2252 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2253 }
2254 }
2255
2256 /**
2257 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2258 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2259 */
2260 void
2261 fs_visitor::demote_pull_constants()
2262 {
2263 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2264 for (int i = 0; i < inst->sources; i++) {
2265 if (inst->src[i].file != UNIFORM)
2266 continue;
2267
2268 int pull_index = pull_constant_loc[inst->src[i].reg +
2269 inst->src[i].reg_offset];
2270 if (pull_index == -1)
2271 continue;
2272
2273          /* Set up the annotation tracking for the newly generated instructions. */
2274 base_ir = inst->ir;
2275 current_annotation = inst->annotation;
2276
2277 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2278 fs_reg dst = vgrf(glsl_type::float_type);
2279
2280 /* Generate a pull load into dst. */
2281 if (inst->src[i].reladdr) {
2282 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2283 surf_index,
2284 *inst->src[i].reladdr,
2285 pull_index);
2286 inst->insert_before(block, &list);
2287 inst->src[i].reladdr = NULL;
2288 } else {
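            /* Illustrative note (assumes the 16-byte-aligned fetch used
             * below): the load always reads an aligned vec4, so e.g.
             * pull_index 6 gives byte offset 24, rounded down to 16, and
             * set_smear(6 & 3) == set_smear(2) then selects the third
             * component of the loaded vec4.
             */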
2289 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2290 fs_inst *pull =
2291 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2292 dst, surf_index, offset);
2293 inst->insert_before(block, pull);
2294 inst->src[i].set_smear(pull_index & 3);
2295 }
2296
2297 /* Rewrite the instruction to use the temporary VGRF. */
2298 inst->src[i].file = GRF;
2299 inst->src[i].reg = dst.reg;
2300 inst->src[i].reg_offset = 0;
2301 inst->src[i].width = dispatch_width;
2302 }
2303 }
2304 invalidate_live_intervals();
2305 }
2306
2307 bool
2308 fs_visitor::opt_algebraic()
2309 {
2310 bool progress = false;
2311
2312 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2313 switch (inst->opcode) {
2314 case BRW_OPCODE_MOV:
2315 if (inst->src[0].file != IMM)
2316 break;
2317
2318 if (inst->saturate) {
2319 if (inst->dst.type != inst->src[0].type)
2320 assert(!"unimplemented: saturate mixed types");
2321
2322 if (brw_saturate_immediate(inst->dst.type,
2323 &inst->src[0].fixed_hw_reg)) {
2324 inst->saturate = false;
2325 progress = true;
2326 }
2327 }
2328 break;
2329
2330 case BRW_OPCODE_MUL:
2331 if (inst->src[1].file != IMM)
2332 continue;
2333
2334 /* a * 1.0 = a */
2335 if (inst->src[1].is_one()) {
2336 inst->opcode = BRW_OPCODE_MOV;
2337 inst->src[1] = reg_undef;
2338 progress = true;
2339 break;
2340 }
2341
2342 /* a * -1.0 = -a */
2343 if (inst->src[1].is_negative_one()) {
2344 inst->opcode = BRW_OPCODE_MOV;
2345 inst->src[0].negate = !inst->src[0].negate;
2346 inst->src[1] = reg_undef;
2347 progress = true;
2348 break;
2349 }
2350
2351 /* a * 0.0 = 0.0 */
2352 if (inst->src[1].is_zero()) {
2353 inst->opcode = BRW_OPCODE_MOV;
2354 inst->src[0] = inst->src[1];
2355 inst->src[1] = reg_undef;
2356 progress = true;
2357 break;
2358 }
2359
2360 if (inst->src[0].file == IMM) {
2361 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2362 inst->opcode = BRW_OPCODE_MOV;
2363 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2364 inst->src[1] = reg_undef;
2365 progress = true;
2366 break;
2367 }
2368 break;
2369 case BRW_OPCODE_ADD:
2370 if (inst->src[1].file != IMM)
2371 continue;
2372
2373 /* a + 0.0 = a */
2374 if (inst->src[1].is_zero()) {
2375 inst->opcode = BRW_OPCODE_MOV;
2376 inst->src[1] = reg_undef;
2377 progress = true;
2378 break;
2379 }
2380
2381 if (inst->src[0].file == IMM) {
2382 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2383 inst->opcode = BRW_OPCODE_MOV;
2384 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2385 inst->src[1] = reg_undef;
2386 progress = true;
2387 break;
2388 }
2389 break;
2390 case BRW_OPCODE_OR:
2391 if (inst->src[0].equals(inst->src[1])) {
2392 inst->opcode = BRW_OPCODE_MOV;
2393 inst->src[1] = reg_undef;
2394 progress = true;
2395 break;
2396 }
2397 break;
2398 case BRW_OPCODE_LRP:
2399 if (inst->src[1].equals(inst->src[2])) {
2400 inst->opcode = BRW_OPCODE_MOV;
2401 inst->src[0] = inst->src[1];
2402 inst->src[1] = reg_undef;
2403 inst->src[2] = reg_undef;
2404 progress = true;
2405 break;
2406 }
2407 break;
2408 case BRW_OPCODE_CMP:
2409 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2410 inst->src[0].abs &&
2411 inst->src[0].negate &&
2412 inst->src[1].is_zero()) {
2413 inst->src[0].abs = false;
2414 inst->src[0].negate = false;
2415 inst->conditional_mod = BRW_CONDITIONAL_Z;
2416 progress = true;
2417 break;
2418 }
2419 break;
2420 case BRW_OPCODE_SEL:
2421 if (inst->src[0].equals(inst->src[1])) {
2422 inst->opcode = BRW_OPCODE_MOV;
2423 inst->src[1] = reg_undef;
2424 inst->predicate = BRW_PREDICATE_NONE;
2425 inst->predicate_inverse = false;
2426 progress = true;
2427 } else if (inst->saturate && inst->src[1].file == IMM) {
2428 switch (inst->conditional_mod) {
2429 case BRW_CONDITIONAL_LE:
2430 case BRW_CONDITIONAL_L:
2431 switch (inst->src[1].type) {
2432 case BRW_REGISTER_TYPE_F:
2433 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2434 inst->opcode = BRW_OPCODE_MOV;
2435 inst->src[1] = reg_undef;
2436 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2437 progress = true;
2438 }
2439 break;
2440 default:
2441 break;
2442 }
2443 break;
2444 case BRW_CONDITIONAL_GE:
2445 case BRW_CONDITIONAL_G:
2446 switch (inst->src[1].type) {
2447 case BRW_REGISTER_TYPE_F:
2448 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2449 inst->opcode = BRW_OPCODE_MOV;
2450 inst->src[1] = reg_undef;
2451 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2452 progress = true;
2453 }
2454 break;
2455 default:
2456 break;
2457 }
2458 default:
2459 break;
2460 }
2461 }
2462 break;
2463 case BRW_OPCODE_MAD:
2464 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2465 inst->opcode = BRW_OPCODE_MOV;
2466 inst->src[1] = reg_undef;
2467 inst->src[2] = reg_undef;
2468 progress = true;
2469          } else if (inst->src[0].is_zero()) {
2470             inst->opcode = BRW_OPCODE_MUL;
2471             inst->src[0] = inst->src[2];
2472             inst->src[2] = reg_undef;
                 progress = true;
2473 } else if (inst->src[1].is_one()) {
2474 inst->opcode = BRW_OPCODE_ADD;
2475 inst->src[1] = inst->src[2];
2476 inst->src[2] = reg_undef;
2477 progress = true;
2478 } else if (inst->src[2].is_one()) {
2479 inst->opcode = BRW_OPCODE_ADD;
2480 inst->src[2] = reg_undef;
2481 progress = true;
2482 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
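            /* Illustrative (not part of the original comment): with float
             * immediates, MAD dst, a, 2.0f, 3.0f is folded here into
             * ADD dst, a, 6.0f.
             */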
2483 inst->opcode = BRW_OPCODE_ADD;
2484 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2485 inst->src[2] = reg_undef;
2486 progress = true;
2487 }
2488 break;
2489 case SHADER_OPCODE_RCP: {
2490 fs_inst *prev = (fs_inst *)inst->prev;
2491 if (prev->opcode == SHADER_OPCODE_SQRT) {
2492 if (inst->src[0].equals(prev->dst)) {
2493 inst->opcode = SHADER_OPCODE_RSQ;
2494 inst->src[0] = prev->src[0];
2495 progress = true;
2496 }
2497 }
2498 break;
2499 }
2500 default:
2501 break;
2502 }
2503 }
2504
2505 return progress;
2506 }
2507
2508 bool
2509 fs_visitor::opt_register_renaming()
2510 {
2511 bool progress = false;
2512 int depth = 0;
2513
2514 int remap[alloc.count];
2515 memset(remap, -1, sizeof(int) * alloc.count);
2516
2517 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2518 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2519 depth++;
2520 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2521 inst->opcode == BRW_OPCODE_WHILE) {
2522 depth--;
2523 }
2524
2525 /* Rewrite instruction sources. */
2526 for (int i = 0; i < inst->sources; i++) {
2527 if (inst->src[i].file == GRF &&
2528 remap[inst->src[i].reg] != -1 &&
2529 remap[inst->src[i].reg] != inst->src[i].reg) {
2530 inst->src[i].reg = remap[inst->src[i].reg];
2531 progress = true;
2532 }
2533 }
2534
2535 const int dst = inst->dst.reg;
2536
2537 if (depth == 0 &&
2538 inst->dst.file == GRF &&
2539 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2540 !inst->is_partial_write()) {
2541 if (remap[dst] == -1) {
2542 remap[dst] = dst;
2543 } else {
2544 remap[dst] = alloc.allocate(inst->dst.width / 8);
2545 inst->dst.reg = remap[dst];
2546 progress = true;
2547 }
2548 } else if (inst->dst.file == GRF &&
2549 remap[dst] != -1 &&
2550 remap[dst] != dst) {
2551 inst->dst.reg = remap[dst];
2552 progress = true;
2553 }
2554 }
2555
2556 if (progress) {
2557 invalidate_live_intervals();
2558
2559 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2560 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2561 delta_x[i].reg = remap[delta_x[i].reg];
2562 }
2563 }
2564 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2565 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2566 delta_y[i].reg = remap[delta_y[i].reg];
2567 }
2568 }
2569 }
2570
2571 return progress;
2572 }
2573
2574 /**
2575 * Remove redundant or useless discard jumps.
2576 *
2577 * For example, we can eliminate jumps in the following sequence:
2578 *
2579 * discard-jump (redundant with the next jump)
2580 * discard-jump (useless; jumps to the next instruction)
2581 * placeholder-halt
2582 */
2583 bool
2584 fs_visitor::opt_redundant_discard_jumps()
2585 {
2586 bool progress = false;
2587
2588 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2589
2590 fs_inst *placeholder_halt = NULL;
2591 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2592 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2593 placeholder_halt = inst;
2594 break;
2595 }
2596 }
2597
2598 if (!placeholder_halt)
2599 return false;
2600
2601    /* Delete any discard jumps immediately before the placeholder halt. */
2602 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2603 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2604 prev = (fs_inst *) placeholder_halt->prev) {
2605 prev->remove(last_bblock);
2606 progress = true;
2607 }
2608
2609 if (progress)
2610 invalidate_live_intervals();
2611
2612 return progress;
2613 }
2614
2615 bool
2616 fs_visitor::compute_to_mrf()
2617 {
2618 bool progress = false;
2619 int next_ip = 0;
2620
2621 /* No MRFs on Gen >= 7. */
2622 if (brw->gen >= 7)
2623 return false;
2624
2625 calculate_live_intervals();
2626
2627 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2628 int ip = next_ip;
2629 next_ip++;
2630
2631 if (inst->opcode != BRW_OPCODE_MOV ||
2632 inst->is_partial_write() ||
2633 inst->dst.file != MRF || inst->src[0].file != GRF ||
2634 inst->dst.type != inst->src[0].type ||
2635 inst->src[0].abs || inst->src[0].negate ||
2636 !inst->src[0].is_contiguous() ||
2637 inst->src[0].subreg_offset)
2638 continue;
2639
2640 /* Work out which hardware MRF registers are written by this
2641 * instruction.
2642 */
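      /* Illustrative (not part of the original comment): a SIMD8 write to m2
       * gives mrf_low == mrf_high == 2, a SIMD16 write to m2 covers m2..m3
       * (mrf_high == 3), and a COMPR4 write to m2 lands in m2 and m6, hence
       * mrf_high == mrf_low + 4.
       */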
2643 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2644 int mrf_high;
2645 if (inst->dst.reg & BRW_MRF_COMPR4) {
2646 mrf_high = mrf_low + 4;
2647 } else if (inst->exec_size == 16) {
2648 mrf_high = mrf_low + 1;
2649 } else {
2650 mrf_high = mrf_low;
2651 }
2652
2653 /* Can't compute-to-MRF this GRF if someone else was going to
2654 * read it later.
2655 */
2656 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2657 continue;
2658
2659 /* Found a move of a GRF to a MRF. Let's see if we can go
2660 * rewrite the thing that made this GRF to write into the MRF.
2661 */
2662 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2663 if (scan_inst->dst.file == GRF &&
2664 scan_inst->dst.reg == inst->src[0].reg) {
2665 /* Found the last thing to write our reg we want to turn
2666 * into a compute-to-MRF.
2667 */
2668
2669 /* If this one instruction didn't populate all the
2670 * channels, bail. We might be able to rewrite everything
2671 * that writes that reg, but it would require smarter
2672 * tracking to delay the rewriting until complete success.
2673 */
2674 if (scan_inst->is_partial_write())
2675 break;
2676
2677 /* Things returning more than one register would need us to
2678 * understand coalescing out more than one MOV at a time.
2679 */
2680 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2681 break;
2682
2683 /* SEND instructions can't have MRF as a destination. */
2684 if (scan_inst->mlen)
2685 break;
2686
2687 if (brw->gen == 6) {
2688 /* gen6 math instructions must have the destination be
2689 * GRF, so no compute-to-MRF for them.
2690 */
2691 if (scan_inst->is_math()) {
2692 break;
2693 }
2694 }
2695
2696 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2697 /* Found the creator of our MRF's source value. */
2698 scan_inst->dst.file = MRF;
2699 scan_inst->dst.reg = inst->dst.reg;
2700 scan_inst->saturate |= inst->saturate;
2701 inst->remove(block);
2702 progress = true;
2703 }
2704 break;
2705 }
2706
2707 /* We don't handle control flow here. Most computation of
2708           * values that end up in MRFs happens shortly before the MRF
2709 * write anyway.
2710 */
2711 if (block->start() == scan_inst)
2712 break;
2713
2714 /* You can't read from an MRF, so if someone else reads our
2715 * MRF's source GRF that we wanted to rewrite, that stops us.
2716 */
2717 bool interfered = false;
2718 for (int i = 0; i < scan_inst->sources; i++) {
2719 if (scan_inst->src[i].file == GRF &&
2720 scan_inst->src[i].reg == inst->src[0].reg &&
2721 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2722 interfered = true;
2723 }
2724 }
2725 if (interfered)
2726 break;
2727
2728 if (scan_inst->dst.file == MRF) {
2729 /* If somebody else writes our MRF here, we can't
2730 * compute-to-MRF before that.
2731 */
2732 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2733 int scan_mrf_high;
2734
2735 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2736 scan_mrf_high = scan_mrf_low + 4;
2737 } else if (scan_inst->exec_size == 16) {
2738 scan_mrf_high = scan_mrf_low + 1;
2739 } else {
2740 scan_mrf_high = scan_mrf_low;
2741 }
2742
2743 if (mrf_low == scan_mrf_low ||
2744 mrf_low == scan_mrf_high ||
2745 mrf_high == scan_mrf_low ||
2746 mrf_high == scan_mrf_high) {
2747 break;
2748 }
2749 }
2750
2751 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2752 /* Found a SEND instruction, which means that there are
2753 * live values in MRFs from base_mrf to base_mrf +
2754 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2755 * above it.
2756 */
2757 if (mrf_low >= scan_inst->base_mrf &&
2758 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2759 break;
2760 }
2761 if (mrf_high >= scan_inst->base_mrf &&
2762 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2763 break;
2764 }
2765 }
2766 }
2767 }
2768
2769 if (progress)
2770 invalidate_live_intervals();
2771
2772 return progress;
2773 }
2774
2775 /**
2776  * Emit the replicated-data clear ("repclear") shader: the clear color is
2777  * loaded from a uniform and written with FS_OPCODE_REP_FB_WRITE messages.
2778 */
2779 void
2780 fs_visitor::emit_repclear_shader()
2781 {
2782 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2783 int base_mrf = 1;
2784 int color_mrf = base_mrf + 2;
2785
2786 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2787 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2788 mov->force_writemask_all = true;
2789
2790 fs_inst *write;
2791 if (key->nr_color_regions == 1) {
2792 write = emit(FS_OPCODE_REP_FB_WRITE);
2793 write->saturate = key->clamp_fragment_color;
2794 write->base_mrf = color_mrf;
2795 write->target = 0;
2796 write->header_present = false;
2797 write->mlen = 1;
2798 } else {
2799 assume(key->nr_color_regions > 0);
2800 for (int i = 0; i < key->nr_color_regions; ++i) {
2801 write = emit(FS_OPCODE_REP_FB_WRITE);
2802 write->saturate = key->clamp_fragment_color;
2803 write->base_mrf = base_mrf;
2804 write->target = i;
2805 write->header_present = true;
2806 write->mlen = 3;
2807 }
2808 }
2809 write->eot = true;
2810
2811 calculate_cfg();
2812
2813 assign_constant_locations();
2814 assign_curb_setup();
2815
2816 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2817 assert(mov->src[0].file == HW_REG);
2818 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2819 }
2820
2821 /**
2822 * Walks through basic blocks, looking for repeated MRF writes and
2823 * removing the later ones.
2824 */
2825 bool
2826 fs_visitor::remove_duplicate_mrf_writes()
2827 {
2828 fs_inst *last_mrf_move[16];
2829 bool progress = false;
2830
2831    /* We'd need to update the MRF tracking for compressed (SIMD16) instructions, so bail. */
2832 if (dispatch_width == 16)
2833 return false;
2834
2835 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2836
2837 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2838 if (inst->is_control_flow()) {
2839 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2840 }
2841
2842 if (inst->opcode == BRW_OPCODE_MOV &&
2843 inst->dst.file == MRF) {
2844 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2845 if (prev_inst && inst->equals(prev_inst)) {
2846 inst->remove(block);
2847 progress = true;
2848 continue;
2849 }
2850 }
2851
2852 /* Clear out the last-write records for MRFs that were overwritten. */
2853 if (inst->dst.file == MRF) {
2854 last_mrf_move[inst->dst.reg] = NULL;
2855 }
2856
2857 if (inst->mlen > 0 && inst->base_mrf != -1) {
2858 /* Found a SEND instruction, which will include two or fewer
2859 * implied MRF writes. We could do better here.
2860 */
2861 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2862 last_mrf_move[inst->base_mrf + i] = NULL;
2863 }
2864 }
2865
2866 /* Clear out any MRF move records whose sources got overwritten. */
2867 if (inst->dst.file == GRF) {
2868 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2869 if (last_mrf_move[i] &&
2870 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2871 last_mrf_move[i] = NULL;
2872 }
2873 }
2874 }
2875
2876 if (inst->opcode == BRW_OPCODE_MOV &&
2877 inst->dst.file == MRF &&
2878 inst->src[0].file == GRF &&
2879 !inst->is_partial_write()) {
2880 last_mrf_move[inst->dst.reg] = inst;
2881 }
2882 }
2883
2884 if (progress)
2885 invalidate_live_intervals();
2886
2887 return progress;
2888 }
2889
2890 static void
2891 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2892 {
2893 /* Clear the flag for registers that actually got read (as expected). */
2894 for (int i = 0; i < inst->sources; i++) {
2895 int grf;
2896 if (inst->src[i].file == GRF) {
2897 grf = inst->src[i].reg;
2898 } else if (inst->src[i].file == HW_REG &&
2899 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2900 grf = inst->src[i].fixed_hw_reg.nr;
2901 } else {
2902 continue;
2903 }
2904
2905 if (grf >= first_grf &&
2906 grf < first_grf + grf_len) {
2907 deps[grf - first_grf] = false;
2908 if (inst->exec_size == 16)
2909 deps[grf - first_grf + 1] = false;
2910 }
2911 }
2912 }
2913
2914 /**
2915 * Implements this workaround for the original 965:
2916 *
2917 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2918 * check for post destination dependencies on this instruction, software
2919 * must ensure that there is no destination hazard for the case of ‘write
2920 * followed by a posted write’ shown in the following example.
2921 *
2922 * 1. mov r3 0
2923 * 2. send r3.xy <rest of send instruction>
2924 * 3. mov r2 r3
2925 *
2926 * Due to no post-destination dependency check on the ‘send’, the above
2927 * code sequence could have two instructions (1 and 2) in flight at the
2928 * same time that both consider ‘r3’ as the target of their final writes.
2929 */
2930 void
2931 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2932 fs_inst *inst)
2933 {
2934 int write_len = inst->regs_written;
2935 int first_write_grf = inst->dst.reg;
2936 bool needs_dep[BRW_MAX_MRF];
2937 assert(write_len < (int)sizeof(needs_dep) - 1);
2938
2939 memset(needs_dep, false, sizeof(needs_dep));
2940 memset(needs_dep, true, write_len);
2941
2942 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2943
2944 /* Walk backwards looking for writes to registers we're writing which
2945 * aren't read since being written. If we hit the start of the program,
2946 * we assume that there are no outstanding dependencies on entry to the
2947 * program.
2948 */
2949 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2950 /* If we hit control flow, assume that there *are* outstanding
2951 * dependencies, and force their cleanup before our instruction.
2952 */
2953 if (block->start() == scan_inst) {
2954 for (int i = 0; i < write_len; i++) {
2955 if (needs_dep[i]) {
2956 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2957 }
2958 }
2959 return;
2960 }
2961
2962 /* We insert our reads as late as possible on the assumption that any
2963 * instruction but a MOV that might have left us an outstanding
2964 * dependency has more latency than a MOV.
2965 */
2966 if (scan_inst->dst.file == GRF) {
2967 for (int i = 0; i < scan_inst->regs_written; i++) {
2968 int reg = scan_inst->dst.reg + i;
2969
2970 if (reg >= first_write_grf &&
2971 reg < first_write_grf + write_len &&
2972 needs_dep[reg - first_write_grf]) {
2973 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2974 needs_dep[reg - first_write_grf] = false;
2975 if (scan_inst->exec_size == 16)
2976 needs_dep[reg - first_write_grf + 1] = false;
2977 }
2978 }
2979 }
2980
2981 /* Clear the flag for registers that actually got read (as expected). */
2982 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2983
2984 /* Continue the loop only if we haven't resolved all the dependencies */
2985 int i;
2986 for (i = 0; i < write_len; i++) {
2987 if (needs_dep[i])
2988 break;
2989 }
2990 if (i == write_len)
2991 return;
2992 }
2993 }
2994
2995 /**
2996 * Implements this workaround for the original 965:
2997 *
2998 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2999 * used as a destination register until after it has been sourced by an
3000 * instruction with a different destination register.
3001 */
3002 void
3003 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3004 {
3005 int write_len = inst->regs_written;
3006 int first_write_grf = inst->dst.reg;
3007 bool needs_dep[BRW_MAX_MRF];
3008 assert(write_len < (int)sizeof(needs_dep) - 1);
3009
3010 memset(needs_dep, false, sizeof(needs_dep));
3011 memset(needs_dep, true, write_len);
3012 /* Walk forwards looking for writes to registers we're writing which aren't
3013 * read before being written.
3014 */
3015 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3016 /* If we hit control flow, force resolve all remaining dependencies. */
3017 if (block->end() == scan_inst) {
3018 for (int i = 0; i < write_len; i++) {
3019 if (needs_dep[i])
3020 scan_inst->insert_before(block,
3021 DEP_RESOLVE_MOV(first_write_grf + i));
3022 }
3023 return;
3024 }
3025
3026 /* Clear the flag for registers that actually got read (as expected). */
3027 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3028
3029 /* We insert our reads as late as possible since they're reading the
3030 * result of a SEND, which has massive latency.
3031 */
3032 if (scan_inst->dst.file == GRF &&
3033 scan_inst->dst.reg >= first_write_grf &&
3034 scan_inst->dst.reg < first_write_grf + write_len &&
3035 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3036 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3037 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3038 }
3039
3040 /* Continue the loop only if we haven't resolved all the dependencies */
3041 int i;
3042 for (i = 0; i < write_len; i++) {
3043 if (needs_dep[i])
3044 break;
3045 }
3046 if (i == write_len)
3047 return;
3048 }
3049 }
3050
3051 void
3052 fs_visitor::insert_gen4_send_dependency_workarounds()
3053 {
3054 if (brw->gen != 4 || brw->is_g4x)
3055 return;
3056
3057 bool progress = false;
3058
3059 /* Note that we're done with register allocation, so GRF fs_regs always
3060 * have a .reg_offset of 0.
3061 */
3062
3063 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3064 if (inst->mlen != 0 && inst->dst.file == GRF) {
3065 insert_gen4_pre_send_dependency_workarounds(block, inst);
3066 insert_gen4_post_send_dependency_workarounds(block, inst);
3067 progress = true;
3068 }
3069 }
3070
3071 if (progress)
3072 invalidate_live_intervals();
3073 }
3074
3075 /**
3076 * Turns the generic expression-style uniform pull constant load instruction
3077 * into a hardware-specific series of instructions for loading a pull
3078 * constant.
3079 *
3080 * The expression style allows the CSE pass before this to optimize out
3081 * repeated loads from the same offset, and gives the pre-register-allocation
3082 * scheduling full flexibility, while the conversion to native instructions
3083 * allows the post-register-allocation scheduler the best information
3084 * possible.
3085 *
3086 * Note that execution masking for setting up pull constant loads is special:
3087 * the channels that need to be written are unrelated to the current execution
3088 * mask, since a later instruction will use one of the result channels as a
3089 * source operand for all 8 or 16 of its channels.
3090 */
3091 void
3092 fs_visitor::lower_uniform_pull_constant_loads()
3093 {
3094 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3095 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3096 continue;
3097
3098 if (brw->gen >= 7) {
3099 /* The offset arg before was a vec4-aligned byte offset. We need to
3100 * turn it into a dword offset.
3101 */
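         /* Illustrative: a vec4-aligned byte offset of 48 becomes dword
          * offset 12 after the division below.
          */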
3102 fs_reg const_offset_reg = inst->src[1];
3103 assert(const_offset_reg.file == IMM &&
3104 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3105 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3106 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3107
3108 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3109 * Reserve space for the register.
3110 */
3111 if (brw->gen >= 9) {
3112 payload.reg_offset++;
3113 alloc.sizes[payload.reg] = 2;
3114 }
3115
3116 /* This is actually going to be a MOV, but since only the first dword
3117 * is accessed, we have a special opcode to do just that one. Note
3118 * that this needs to be an operation that will be considered a def
3119 * by live variable analysis, or register allocation will explode.
3120 */
3121 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3122 8, payload, const_offset_reg);
3123 setup->force_writemask_all = true;
3124
3125 setup->ir = inst->ir;
3126 setup->annotation = inst->annotation;
3127 inst->insert_before(block, setup);
3128
3129 /* Similarly, this will only populate the first 4 channels of the
3130 * result register (since we only use smear values from 0-3), but we
3131 * don't tell the optimizer.
3132 */
3133 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3134 inst->src[1] = payload;
3135
3136 invalidate_live_intervals();
3137 } else {
3138 /* Before register allocation, we didn't tell the scheduler about the
3139 * MRF we use. We know it's safe to use this MRF because nothing
3140 * else does except for register spill/unspill, which generates and
3141 * uses its MRF within a single IR instruction.
3142 */
3143 inst->base_mrf = 14;
3144 inst->mlen = 1;
3145 }
3146 }
3147 }
3148
3149 bool
3150 fs_visitor::lower_load_payload()
3151 {
3152 bool progress = false;
3153
3154 int vgrf_to_reg[alloc.count];
3155 int reg_count = 0;
3156 for (unsigned i = 0; i < alloc.count; ++i) {
3157 vgrf_to_reg[i] = reg_count;
3158 reg_count += alloc.sizes[i];
3159 }
3160
3161 struct {
3162 bool written:1; /* Whether this register has ever been written */
3163 bool force_writemask_all:1;
3164 bool force_sechalf:1;
3165 } metadata[reg_count];
3166 memset(metadata, 0, sizeof(metadata));
3167
3168 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3169 if (inst->dst.file == GRF) {
3170 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3171 bool force_sechalf = inst->force_sechalf &&
3172 !inst->force_writemask_all;
3173 bool toggle_sechalf = inst->dst.width == 16 &&
3174 type_sz(inst->dst.type) == 4 &&
3175 !inst->force_writemask_all;
3176 for (int i = 0; i < inst->regs_written; ++i) {
3177 metadata[dst_reg + i].written = true;
3178 metadata[dst_reg + i].force_sechalf = force_sechalf;
3179 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3180 force_sechalf = (toggle_sechalf != force_sechalf);
3181 }
3182 }
3183
3184 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3185 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3186 fs_reg dst = inst->dst;
3187
3188 for (int i = 0; i < inst->sources; i++) {
3189 dst.width = inst->src[i].effective_width;
3190 dst.type = inst->src[i].type;
3191
3192 if (inst->src[i].file == BAD_FILE) {
3193 /* Do nothing but otherwise increment as normal */
3194 } else if (dst.file == MRF &&
3195 dst.width == 8 &&
3196 brw->has_compr4 &&
3197 i + 4 < inst->sources &&
3198 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3199 fs_reg compr4_dst = dst;
3200 compr4_dst.reg += BRW_MRF_COMPR4;
3201 compr4_dst.width = 16;
3202 fs_reg compr4_src = inst->src[i];
3203 compr4_src.width = 16;
3204 fs_inst *mov = MOV(compr4_dst, compr4_src);
3205 mov->force_writemask_all = true;
3206 inst->insert_before(block, mov);
3207 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3208 inst->src[i + 4].file = BAD_FILE;
3209 } else {
3210 fs_inst *mov = MOV(dst, inst->src[i]);
3211 if (inst->src[i].file == GRF) {
3212 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3213 inst->src[i].reg_offset;
3214 mov->force_sechalf = metadata[src_reg].force_sechalf;
3215 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3216 } else {
3217 /* We don't have any useful metadata for immediates or
3218 * uniforms. Assume that any of the channels of the
3219 * destination may be used.
3220 */
3221 assert(inst->src[i].file == IMM ||
3222 inst->src[i].file == UNIFORM);
3223 mov->force_writemask_all = true;
3224 }
3225
3226 if (dst.file == GRF) {
3227 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3228 const bool force_writemask = mov->force_writemask_all;
3229 metadata[dst_reg].force_writemask_all = force_writemask;
3230 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3231 if (dst.width * type_sz(dst.type) > 32) {
3232 assert(!mov->force_sechalf);
3233 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3234 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3235 }
3236 }
3237
3238 inst->insert_before(block, mov);
3239 }
3240
3241 dst = offset(dst, 1);
3242 }
3243
3244 inst->remove(block);
3245 progress = true;
3246 }
3247 }
3248
3249 if (progress)
3250 invalidate_live_intervals();
3251
3252 return progress;
3253 }
3254
3255 void
3256 fs_visitor::dump_instructions()
3257 {
3258 dump_instructions(NULL);
3259 }
3260
3261 void
3262 fs_visitor::dump_instructions(const char *name)
3263 {
3264 FILE *file = stderr;
3265 if (name && geteuid() != 0) {
3266 file = fopen(name, "w");
3267 if (!file)
3268 file = stderr;
3269 }
3270
3271 if (cfg) {
3272 calculate_register_pressure();
3273 int ip = 0, max_pressure = 0;
3274 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3275 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3276 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3277 dump_instruction(inst, file);
3278 ip++;
3279 }
3280 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3281 } else {
3282 int ip = 0;
3283 foreach_in_list(backend_instruction, inst, &instructions) {
3284 fprintf(file, "%4d: ", ip++);
3285 dump_instruction(inst, file);
3286 }
3287 }
3288
3289 if (file != stderr) {
3290 fclose(file);
3291 }
3292 }
3293
3294 void
3295 fs_visitor::dump_instruction(backend_instruction *be_inst)
3296 {
3297 dump_instruction(be_inst, stderr);
3298 }
3299
3300 void
3301 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3302 {
3303 fs_inst *inst = (fs_inst *)be_inst;
3304
3305 if (inst->predicate) {
3306 fprintf(file, "(%cf0.%d) ",
3307 inst->predicate_inverse ? '-' : '+',
3308 inst->flag_subreg);
3309 }
3310
3311 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3312 if (inst->saturate)
3313 fprintf(file, ".sat");
3314 if (inst->conditional_mod) {
3315 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3316 if (!inst->predicate &&
3317 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3318 inst->opcode != BRW_OPCODE_IF &&
3319 inst->opcode != BRW_OPCODE_WHILE))) {
3320 fprintf(file, ".f0.%d", inst->flag_subreg);
3321 }
3322 }
3323 fprintf(file, "(%d) ", inst->exec_size);
3324
3325
3326 switch (inst->dst.file) {
3327 case GRF:
3328 fprintf(file, "vgrf%d", inst->dst.reg);
3329 if (inst->dst.width != dispatch_width)
3330 fprintf(file, "@%d", inst->dst.width);
3331 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3332 inst->dst.subreg_offset)
3333 fprintf(file, "+%d.%d",
3334 inst->dst.reg_offset, inst->dst.subreg_offset);
3335 break;
3336 case MRF:
3337 fprintf(file, "m%d", inst->dst.reg);
3338 break;
3339 case BAD_FILE:
3340 fprintf(file, "(null)");
3341 break;
3342 case UNIFORM:
3343 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3344 break;
3345 case ATTR:
3346 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3347 break;
3348 case HW_REG:
3349 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3350 switch (inst->dst.fixed_hw_reg.nr) {
3351 case BRW_ARF_NULL:
3352 fprintf(file, "null");
3353 break;
3354 case BRW_ARF_ADDRESS:
3355 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3356 break;
3357 case BRW_ARF_ACCUMULATOR:
3358 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3359 break;
3360 case BRW_ARF_FLAG:
3361 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3362 inst->dst.fixed_hw_reg.subnr);
3363 break;
3364 default:
3365 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3366 inst->dst.fixed_hw_reg.subnr);
3367 break;
3368 }
3369 } else {
3370 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3371 }
3372 if (inst->dst.fixed_hw_reg.subnr)
3373 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3374 break;
3375 default:
3376 fprintf(file, "???");
3377 break;
3378 }
3379 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3380
3381 for (int i = 0; i < inst->sources; i++) {
3382 if (inst->src[i].negate)
3383 fprintf(file, "-");
3384 if (inst->src[i].abs)
3385 fprintf(file, "|");
3386 switch (inst->src[i].file) {
3387 case GRF:
3388 fprintf(file, "vgrf%d", inst->src[i].reg);
3389 if (inst->src[i].width != dispatch_width)
3390 fprintf(file, "@%d", inst->src[i].width);
3391 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3392 inst->src[i].subreg_offset)
3393 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3394 inst->src[i].subreg_offset);
3395 break;
3396 case MRF:
3397 fprintf(file, "***m%d***", inst->src[i].reg);
3398 break;
3399 case ATTR:
3400 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3401 break;
3402 case UNIFORM:
3403 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3404 if (inst->src[i].reladdr) {
3405 fprintf(file, "+reladdr");
3406 } else if (inst->src[i].subreg_offset) {
3407 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3408 inst->src[i].subreg_offset);
3409 }
3410 break;
3411 case BAD_FILE:
3412 fprintf(file, "(null)");
3413 break;
3414 case IMM:
3415 switch (inst->src[i].type) {
3416 case BRW_REGISTER_TYPE_F:
3417 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3418 break;
3419 case BRW_REGISTER_TYPE_W:
3420 case BRW_REGISTER_TYPE_D:
3421 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3422 break;
3423 case BRW_REGISTER_TYPE_UW:
3424 case BRW_REGISTER_TYPE_UD:
3425 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3426 break;
3427 case BRW_REGISTER_TYPE_VF:
3428 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3429 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3430 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3431 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3432 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3433 break;
3434 default:
3435 fprintf(file, "???");
3436 break;
3437 }
3438 break;
3439 case HW_REG:
3440 if (inst->src[i].fixed_hw_reg.negate)
3441 fprintf(file, "-");
3442 if (inst->src[i].fixed_hw_reg.abs)
3443 fprintf(file, "|");
3444 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3445 switch (inst->src[i].fixed_hw_reg.nr) {
3446 case BRW_ARF_NULL:
3447 fprintf(file, "null");
3448 break;
3449 case BRW_ARF_ADDRESS:
3450 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3451 break;
3452 case BRW_ARF_ACCUMULATOR:
3453 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3454 break;
3455 case BRW_ARF_FLAG:
3456 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3457 inst->src[i].fixed_hw_reg.subnr);
3458 break;
3459 default:
3460 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3461 inst->src[i].fixed_hw_reg.subnr);
3462 break;
3463 }
3464 } else {
3465 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3466 }
3467 if (inst->src[i].fixed_hw_reg.subnr)
3468 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3469 if (inst->src[i].fixed_hw_reg.abs)
3470 fprintf(file, "|");
3471 break;
3472 default:
3473 fprintf(file, "???");
3474 break;
3475 }
3476 if (inst->src[i].abs)
3477 fprintf(file, "|");
3478
3479 if (inst->src[i].file != IMM) {
3480 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3481 }
3482
3483 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3484 fprintf(file, ", ");
3485 }
3486
3487 fprintf(file, " ");
3488
3489 if (dispatch_width == 16 && inst->exec_size == 8) {
3490 if (inst->force_sechalf)
3491 fprintf(file, "2ndhalf ");
3492 else
3493 fprintf(file, "1sthalf ");
3494 }
3495
3496 fprintf(file, "\n");
3497 }
3498
3499 /**
3500 * Possibly returns an instruction that set up @param reg.
3501 *
3502 * Sometimes we want to take the result of some expression/variable
3503 * dereference tree and rewrite the instruction generating the result
3504 * of the tree. When processing the tree, we know that the
3505 * instructions generated are all writing temporaries that are dead
3506 * outside of this tree. So, if we have some instructions that write
3507 * a temporary, we're free to point that temp write somewhere else.
3508 *
3509 * Note that this doesn't guarantee that the instruction generated
3510 * only reg -- it might be the size=4 destination of a texture instruction.
3511 */
3512 fs_inst *
3513 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3514 fs_inst *end,
3515 const fs_reg &reg)
3516 {
3517 if (end == start ||
3518 end->is_partial_write() ||
3519 reg.reladdr ||
3520 !reg.equals(end->dst)) {
3521 return NULL;
3522 } else {
3523 return end;
3524 }
3525 }
3526
3527 void
3528 fs_visitor::setup_payload_gen6()
3529 {
3530 bool uses_depth =
3531 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3532 unsigned barycentric_interp_modes =
3533 (stage == MESA_SHADER_FRAGMENT) ?
3534 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3535
3536 assert(brw->gen >= 6);
3537
3538 /* R0-1: masks, pixel X/Y coordinates. */
3539 payload.num_regs = 2;
3540    /* R2: only for 32-pixel dispatch. */
3541
3542 /* R3-26: barycentric interpolation coordinates. These appear in the
3543 * same order that they appear in the brw_wm_barycentric_interp_mode
3544 * enum. Each set of coordinates occupies 2 registers if dispatch width
3545 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3546 * appear if they were enabled using the "Barycentric Interpolation
3547 * Mode" bits in WM_STATE.
3548 */
3549 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3550 if (barycentric_interp_modes & (1 << i)) {
3551 payload.barycentric_coord_reg[i] = payload.num_regs;
3552 payload.num_regs += 2;
3553 if (dispatch_width == 16) {
3554 payload.num_regs += 2;
3555 }
3556 }
3557 }
3558
3559 /* R27: interpolated depth if uses source depth */
3560 if (uses_depth) {
3561 payload.source_depth_reg = payload.num_regs;
3562 payload.num_regs++;
3563 if (dispatch_width == 16) {
3564 /* R28: interpolated depth if not SIMD8. */
3565 payload.num_regs++;
3566 }
3567 }
3568 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3569 if (uses_depth) {
3570 payload.source_w_reg = payload.num_regs;
3571 payload.num_regs++;
3572 if (dispatch_width == 16) {
3573 /* R30: interpolated W if not SIMD8. */
3574 payload.num_regs++;
3575 }
3576 }
3577
3578 if (stage == MESA_SHADER_FRAGMENT) {
3579 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3580 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3581 prog_data->uses_pos_offset = key->compute_pos_offset;
3582 /* R31: MSAA position offsets. */
3583 if (prog_data->uses_pos_offset) {
3584 payload.sample_pos_reg = payload.num_regs;
3585 payload.num_regs++;
3586 }
3587 }
3588
3589 /* R32: MSAA input coverage mask */
3590 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3591 assert(brw->gen >= 7);
3592 payload.sample_mask_in_reg = payload.num_regs;
3593 payload.num_regs++;
3594 if (dispatch_width == 16) {
3595 /* R33: input coverage mask if not SIMD8. */
3596 payload.num_regs++;
3597 }
3598 }
3599
3600 /* R34-: bary for 32-pixel. */
3601 /* R58-59: interp W for 32-pixel. */
3602
3603 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3604 source_depth_to_render_target = true;
3605 }
3606 }
3607
3608 void
3609 fs_visitor::setup_vs_payload()
3610 {
3611 /* R0: thread header, R1: urb handles */
3612 payload.num_regs = 2;
3613 }
3614
3615 void
3616 fs_visitor::assign_binding_table_offsets()
3617 {
3618 assert(stage == MESA_SHADER_FRAGMENT);
3619 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3620 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3621 uint32_t next_binding_table_offset = 0;
3622
3623 /* If there are no color regions, we still perform an FB write to a null
3624 * renderbuffer, which we place at surface index 0.
3625 */
3626 prog_data->binding_table.render_target_start = next_binding_table_offset;
3627 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3628
3629 assign_common_binding_table_offsets(next_binding_table_offset);
3630 }
3631
3632 void
3633 fs_visitor::calculate_register_pressure()
3634 {
3635 invalidate_live_intervals();
3636 calculate_live_intervals();
3637
3638 unsigned num_instructions = 0;
3639 foreach_block(block, cfg)
3640 num_instructions += block->instructions.length();
3641
3642 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3643
3644 for (unsigned reg = 0; reg < alloc.count; reg++) {
3645 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3646 regs_live_at_ip[ip] += alloc.sizes[reg];
3647 }
3648 }
3649
3650 void
3651 fs_visitor::optimize()
3652 {
3653 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3654
3655 split_virtual_grfs();
3656
3657 move_uniform_array_access_to_pull_constants();
3658 assign_constant_locations();
3659 demote_pull_constants();
3660
3661 #define OPT(pass, args...) ({ \
3662 pass_num++; \
3663 bool this_progress = pass(args); \
3664 \
3665 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3666 char filename[64]; \
3667 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3668 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3669 \
3670 backend_visitor::dump_instructions(filename); \
3671 } \
3672 \
3673 progress = progress || this_progress; \
3674 this_progress; \
3675 })
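/* Note (added annotation): the GNU ({ ... }) statement expression above
 * evaluates to this_progress, so OPT() can be used both as a plain statement
 * and in conditions such as "if (OPT(lower_load_payload))" further down.
 */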
3676
3677 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3678 char filename[64];
3679 snprintf(filename, 64, "%s%d-%04d-00-start",
3680 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3681
3682 backend_visitor::dump_instructions(filename);
3683 }
3684
3685 bool progress;
3686 int iteration = 0;
3687 int pass_num = 0;
3688 do {
3689 progress = false;
3690 pass_num = 0;
3691 iteration++;
3692
3693 OPT(remove_duplicate_mrf_writes);
3694
3695 OPT(opt_algebraic);
3696 OPT(opt_cse);
3697 OPT(opt_copy_propagate);
3698 OPT(opt_peephole_predicated_break);
3699 OPT(opt_cmod_propagation);
3700 OPT(dead_code_eliminate);
3701 OPT(opt_peephole_sel);
3702 OPT(dead_control_flow_eliminate, this);
3703 OPT(opt_register_renaming);
3704 OPT(opt_redundant_discard_jumps);
3705 OPT(opt_saturate_propagation);
3706 OPT(register_coalesce);
3707 OPT(compute_to_mrf);
3708
3709 OPT(compact_virtual_grfs);
3710 } while (progress);
3711
3712 pass_num = 0;
3713
3714 if (OPT(lower_load_payload)) {
3715 split_virtual_grfs();
3716 OPT(register_coalesce);
3717 OPT(compute_to_mrf);
3718 OPT(dead_code_eliminate);
3719 }
3720
3721 OPT(opt_combine_constants);
3722
3723 lower_uniform_pull_constant_loads();
3724 }
3725
3726 /**
3727 * Three-source instructions must have a GRF/MRF destination register.
3728 * An ARF null destination is not allowed; fix that up by allocating a temporary GRF.
3729 */
3730 void
3731 fs_visitor::fixup_3src_null_dest()
3732 {
3733 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3734 if (inst->is_3src() && inst->dst.is_null()) {
3735 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3736 inst->dst.type);
3737 }
3738 }
3739 }
3740
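/**
 * Schedule instructions and assign hardware registers.  Each pre-RA
 * scheduling mode is tried in turn until one allocates without spilling;
 * otherwise SIMD16 compiles fail (callers are expected to fall back to
 * SIMD8) and SIMD8 compiles spill until allocation succeeds.
 */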
3741 void
3742 fs_visitor::allocate_registers()
3743 {
3744 bool allocated_without_spills;
3745
3746 static const enum instruction_scheduler_mode pre_modes[] = {
3747 SCHEDULE_PRE,
3748 SCHEDULE_PRE_NON_LIFO,
3749 SCHEDULE_PRE_LIFO,
3750 };
3751
3752 /* Try each scheduling heuristic to see if it can successfully register
3753 * allocate without spilling. They should be ordered by decreasing
3754 * performance but increasing likelihood of allocating.
3755 */
3756 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3757 schedule_instructions(pre_modes[i]);
3758
3759 if (0) {
3760 assign_regs_trivial();
3761 allocated_without_spills = true;
3762 } else {
3763 allocated_without_spills = assign_regs(false);
3764 }
3765 if (allocated_without_spills)
3766 break;
3767 }
3768
3769 if (!allocated_without_spills) {
3770 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3771 "Vertex" : "Fragment";
3772
3773 /* We assume that any spilling is worse than just dropping back to
3774 * SIMD8.  In reality there is probably some intermediate point where
3775 * SIMD16 with a couple of spills would still come out ahead.
3776 */
3777 if (dispatch_width == 16) {
3778 fail("Failure to register allocate. Reduce number of "
3779 "live scalar values to avoid this.");
3780 } else {
3781 perf_debug("%s shader triggered register spilling. "
3782 "Try reducing the number of live scalar values to "
3783 "improve performance.\n", stage_name);
3784 }
3785
3786 /* Since we're out of heuristics, just go spill registers until we
3787 * get an allocation.
3788 */
3789 while (!assign_regs(true)) {
3790 if (failed)
3791 break;
3792 }
3793 }
3794
3795 /* This must come after all optimization and register allocation, since
3796 * it inserts dead code that happens to have side effects, and it does
3797 * so based on the actual physical registers in use.
3798 */
3799 insert_gen4_send_dependency_workarounds();
3800
3801 if (failed)
3802 return;
3803
3804 if (!allocated_without_spills)
3805 schedule_instructions(SCHEDULE_POST);
3806
3807 if (last_scratch > 0)
3808 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3809 }
3810
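/**
 * Generate and compile the IR for a vertex shader: visit the GLSL IR, emit
 * the URB writes, then run the shared optimization, payload/URB setup and
 * register allocation steps.  Returns false on failure.
 */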
3811 bool
3812 fs_visitor::run_vs()
3813 {
3814 assert(stage == MESA_SHADER_VERTEX);
3815
3816 assign_common_binding_table_offsets(0);
3817 setup_vs_payload();
3818
3819 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3820 emit_shader_time_begin();
3821
3822 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3823 base_ir = ir;
3824 this->result = reg_undef;
3825 ir->accept(this);
3826 }
3827 base_ir = NULL;
3828 if (failed)
3829 return false;
3830
3831 emit_urb_writes();
3832
3833 calculate_cfg();
3834
3835 optimize();
3836
3837 assign_curb_setup();
3838 assign_vs_urb_setup();
3839
3840 fixup_3src_null_dest();
3841 allocate_registers();
3842
3843 return !failed;
3844 }
3845
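/**
 * Build and compile the fragment shader IR at the current dispatch width
 * (8 or 16).  Returns false if compilation failed at this width, in which
 * case fail_msg describes the reason.
 */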
3846 bool
3847 fs_visitor::run_fs()
3848 {
3849 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3850 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3851
3852 assert(stage == MESA_SHADER_FRAGMENT);
3853
3854 sanity_param_count = prog->Parameters->NumParameters;
3855
3856 assign_binding_table_offsets();
3857
3858 if (brw->gen >= 6)
3859 setup_payload_gen6();
3860 else
3861 setup_payload_gen4();
3862
3863 if (0) {
3864 emit_dummy_fs();
3865 } else if (brw->use_rep_send && dispatch_width == 16) {
3866 emit_repclear_shader();
3867 } else {
3868 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3869 emit_shader_time_begin();
3870
3871 calculate_urb_setup();
3872 if (prog->InputsRead > 0) {
3873 if (brw->gen < 6)
3874 emit_interpolation_setup_gen4();
3875 else
3876 emit_interpolation_setup_gen6();
3877 }
3878
3879 /* We handle discards by keeping track of the still-live pixels in f0.1.
3880 * Initialize it with the dispatched pixels.
3881 */
3882 if (wm_prog_data->uses_kill) {
3883 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3884 discard_init->flag_subreg = 1;
3885 }
3886
3887 /* Generate FS IR for main().  (The visitor only descends into
3888 * functions called "main".)
3889 */
3890 if (shader) {
3891 if (getenv("INTEL_USE_NIR") != NULL) {
3892 emit_nir_code();
3893 } else {
3894 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3895 base_ir = ir;
3896 this->result = reg_undef;
3897 ir->accept(this);
3898 }
3899 }
3900 } else {
3901 emit_fragment_program_code();
3902 }
3903 base_ir = NULL;
3904 if (failed)
3905 return false;
3906
3907 emit(FS_OPCODE_PLACEHOLDER_HALT);
3908
3909 if (wm_key->alpha_test_func)
3910 emit_alpha_test();
3911
3912 emit_fb_writes();
3913
3914 calculate_cfg();
3915
3916 optimize();
3917
3918 assign_curb_setup();
3919 assign_urb_setup();
3920
3921 fixup_3src_null_dest();
3922 allocate_registers();
3923
3924 if (failed)
3925 return false;
3926 }
3927
3928 if (dispatch_width == 8)
3929 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3930 else
3931 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3932
3933 /* If any state parameters were appended, then ParameterValues could have
3934 * been realloced, in which case the driver uniform storage set up by
3935 * _mesa_associate_uniform_storage() would point to freed memory. Make
3936 * sure that didn't happen.
3937 */
3938 assert(sanity_param_count == prog->Parameters->NumParameters);
3939
3940 return !failed;
3941 }
3942
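/**
 * Compile the fragment program to native code.  A SIMD8 variant is always
 * compiled; a SIMD16 variant is attempted as well when the hardware
 * generation and debug flags allow it, and the surviving CFGs are handed to
 * the generator.
 */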
3943 const unsigned *
3944 brw_wm_fs_emit(struct brw_context *brw,
3945 void *mem_ctx,
3946 const struct brw_wm_prog_key *key,
3947 struct brw_wm_prog_data *prog_data,
3948 struct gl_fragment_program *fp,
3949 struct gl_shader_program *prog,
3950 unsigned *final_assembly_size)
3951 {
3952 bool start_busy = false;
3953 double start_time = 0;
3954
3955 if (unlikely(brw->perf_debug)) {
3956 start_busy = (brw->batch.last_bo &&
3957 drm_intel_bo_busy(brw->batch.last_bo));
3958 start_time = get_time();
3959 }
3960
3961 struct brw_shader *shader = NULL;
3962 if (prog)
3963 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3964
3965 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3966 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3967
3968 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3969 */
3970 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3971 if (!v.run_fs()) {
3972 if (prog) {
3973 prog->LinkStatus = false;
3974 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3975 }
3976
3977 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3978 v.fail_msg);
3979
3980 return NULL;
3981 }
3982
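/* With the SIMD8 compile in hand, optionally try SIMD16 as well.  This is
 * skipped on gen4, when INTEL_DEBUG=no16 is set (unless the replicated-data
 * FB write path requires SIMD16), or when the SIMD8 visitor already found
 * the shader unsupportable in SIMD16.
 */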
3983 cfg_t *simd16_cfg = NULL;
3984 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3985 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3986 brw->use_rep_send)) {
3987 if (!v.simd16_unsupported) {
3988 /* Try a SIMD16 compile */
3989 v2.import_uniforms(&v);
3990 if (!v2.run_fs()) {
3991 perf_debug("SIMD16 shader failed to compile, falling back to "
3992 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3993 } else {
3994 simd16_cfg = v2.cfg;
3995 }
3996 } else {
3997 perf_debug("SIMD16 shader unsupported, falling back to "
3998 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3999 }
4000 }
4001
4002 cfg_t *simd8_cfg;
4003 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4004 if (no_simd8 && simd16_cfg) {
4005 simd8_cfg = NULL;
4006 prog_data->no_8 = true;
4007 } else {
4008 simd8_cfg = v.cfg;
4009 prog_data->no_8 = false;
4010 }
4011
4012 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4013 &fp->Base, v.runtime_check_aads_emit, "FS");
4014
4015 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4016 char *name;
4017 if (prog)
4018 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4019 prog->Label ? prog->Label : "unnamed",
4020 prog->Name);
4021 else
4022 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4023
4024 g.enable_debug(name);
4025 }
4026
4027 if (simd8_cfg)
4028 g.generate_code(simd8_cfg, 8);
4029 if (simd16_cfg)
4030 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4031
4032 if (unlikely(brw->perf_debug) && shader) {
4033 if (shader->compiled_once)
4034 brw_wm_debug_recompile(brw, prog, key);
4035 shader->compiled_once = true;
4036
4037 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4038 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4039 (get_time() - start_time) * 1000);
4040 }
4041 }
4042
4043 return g.get_assembly(final_assembly_size);
4044 }
4045
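/**
 * Compile the fragment shader at link time using a best-guess program key,
 * then restore the previously bound WM program state.  If the guessed key
 * matches what later draw-time state produces, the real compile is a cache
 * hit.
 */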
4046 extern "C" bool
4047 brw_fs_precompile(struct gl_context *ctx,
4048 struct gl_shader_program *shader_prog,
4049 struct gl_program *prog)
4050 {
4051 struct brw_context *brw = brw_context(ctx);
4052 struct brw_wm_prog_key key;
4053
4054 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4055 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4056 bool program_uses_dfdy = fp->UsesDFdy;
4057
4058 memset(&key, 0, sizeof(key));
4059
4060 if (brw->gen < 6) {
4061 if (fp->UsesKill)
4062 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4063
4064 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4065 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4066
4067 /* Just assume depth testing. */
4068 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4069 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4070 }
4071
4072 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4073 BRW_FS_VARYING_INPUT_MASK) > 16)
4074 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4075
4076 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4077 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4078 for (unsigned i = 0; i < sampler_count; i++) {
4079 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4080 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4081 key.tex.swizzles[i] =
4082 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4083 } else {
4084 /* Color sampler: assume no swizzling. */
4085 key.tex.swizzles[i] = SWIZZLE_XYZW;
4086 }
4087 }
4088
4089 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4090 key.drawable_height = ctx->DrawBuffer->Height;
4091 }
4092
4093 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4094 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4095 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4096
4097 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4098 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4099 key.nr_color_regions > 1;
4100 }
4101
4102 key.program_string_id = bfp->id;
4103
4104 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4105 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4106
4107 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
4108
4109 brw->wm.base.prog_offset = old_prog_offset;
4110 brw->wm.prog_data = old_prog_data;
4111
4112 return success;
4113 }