i965/fs: Set smear on shader_time diff register.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
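      /* For example, an 8-wide float destination with stride 1 covers
       * 8 * 1 * 4 = 32 bytes, i.e. one 32-byte GRF, while a 16-wide float
       * destination covers 64 bytes and regs_written becomes 2.
       */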
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
138
139 this->writes_accumulator = false;
140 }
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(brw->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
356
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
359 {
360 uint8_t exec_size = dst.width;
361 for (int i = 0; i < sources; ++i) {
362 assert(src[i].width % dst.width == 0);
363 if (src[i].width > exec_size)
364 exec_size = src[i].width;
365 }
366
367 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
368 dst, src, sources);
369 inst->regs_written = 0;
370 for (int i = 0; i < sources; ++i) {
371 /* The LOAD_PAYLOAD instruction only really makes sense if we are
372 * dealing with whole registers. If this ever changes, we can deal
373 * with it later.
374 */
375 int size = inst->src[i].effective_width * type_sz(src[i].type);
376 assert(size % 32 == 0);
377 inst->regs_written += (size + 31) / 32;
378 }
379
380 return inst;
381 }
382
383 exec_list
384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
385 const fs_reg &surf_index,
386 const fs_reg &varying_offset,
387 uint32_t const_offset)
388 {
389 exec_list instructions;
390 fs_inst *inst;
391
392 /* We have our constant surface use a pitch of 4 bytes, so our index can
393 * be any component of a vector, and then we load 4 contiguous
394 * components starting from that.
395 *
396 * We break down the const_offset to a portion added to the variable
397 * offset and a portion done using reg_offset, which means that if you
398 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
399 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
400 * CSE can later notice that those loads are all the same and eliminate
401 * the redundant ones.
402 */
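   /* As a worked example: for const_offset == 6, the vec4-aligned part
    * (6 & ~3 == 4) is folded into vec4_offset below, and the remaining
    * component (6 & 3 == 2) selects which of the loaded components gets
    * MOVed into dst at the end (times `scale` when the gen4 SIMD16
    * message is used).
    */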
403 fs_reg vec4_offset = vgrf(glsl_type::int_type);
404 instructions.push_tail(ADD(vec4_offset,
405 varying_offset, fs_reg(const_offset & ~3)));
406
407 int scale = 1;
408 if (brw->gen == 4 && dst.width == 8) {
409 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
410 * u, v, r) as parameters, or we can just use the SIMD16 message
411 * consisting of (header, u). We choose the second, at the cost of a
412 * longer return length.
413 */
414 scale = 2;
415 }
416
417 enum opcode op;
418 if (brw->gen >= 7)
419 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
420 else
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
422
423 assert(dst.width % 8 == 0);
424 int regs_written = 4 * (dst.width / 8) * scale;
425 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
426 dst.type, dst.width);
427 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
428 inst->regs_written = regs_written;
429 instructions.push_tail(inst);
430
431 if (brw->gen < 7) {
432 inst->base_mrf = 13;
433 inst->header_present = true;
434 if (brw->gen == 4)
435 inst->mlen = 3;
436 else
437 inst->mlen = 1 + dispatch_width / 8;
438 }
439
440 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
441 instructions.push_tail(MOV(dst, result));
442
443 return instructions;
444 }
445
446 /**
447 * A helper for MOV generation for fixing up broken hardware SEND dependency
448 * handling.
449 */
450 fs_inst *
451 fs_visitor::DEP_RESOLVE_MOV(int grf)
452 {
453 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
454
455 inst->ir = NULL;
456 inst->annotation = "send dependency resolve";
457
458 /* The caller always wants uncompressed to emit the minimal extra
459 * dependencies, and to avoid having to deal with aligning its regs to 2.
460 */
461 inst->exec_size = 8;
462
463 return inst;
464 }
465
466 bool
467 fs_inst::equals(fs_inst *inst) const
468 {
469 return (opcode == inst->opcode &&
470 dst.equals(inst->dst) &&
471 src[0].equals(inst->src[0]) &&
472 src[1].equals(inst->src[1]) &&
473 src[2].equals(inst->src[2]) &&
474 saturate == inst->saturate &&
475 predicate == inst->predicate &&
476 conditional_mod == inst->conditional_mod &&
477 mlen == inst->mlen &&
478 base_mrf == inst->base_mrf &&
479 target == inst->target &&
480 eot == inst->eot &&
481 header_present == inst->header_present &&
482 shadow_compare == inst->shadow_compare &&
483 exec_size == inst->exec_size &&
484 offset == inst->offset);
485 }
486
487 bool
488 fs_inst::overwrites_reg(const fs_reg &reg) const
489 {
490 return (reg.file == dst.file &&
491 reg.reg == dst.reg &&
492 reg.reg_offset >= dst.reg_offset &&
493 reg.reg_offset < dst.reg_offset + regs_written);
494 }
495
496 bool
497 fs_inst::is_send_from_grf() const
498 {
499 switch (opcode) {
500 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
501 case SHADER_OPCODE_SHADER_TIME_ADD:
502 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
503 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
504 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
505 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
506 case SHADER_OPCODE_UNTYPED_ATOMIC:
507 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
508 case SHADER_OPCODE_URB_WRITE_SIMD8:
509 return true;
510 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
511 return src[1].file == GRF;
512 case FS_OPCODE_FB_WRITE:
513 return src[0].file == GRF;
514 default:
515 if (is_tex())
516 return src[0].file == GRF;
517
518 return false;
519 }
520 }
521
522 bool
523 fs_inst::can_do_source_mods(struct brw_context *brw)
524 {
525 if (brw->gen == 6 && is_math())
526 return false;
527
528 if (is_send_from_grf())
529 return false;
530
531 if (!backend_instruction::can_do_source_mods())
532 return false;
533
534 return true;
535 }
536
537 void
538 fs_reg::init()
539 {
540 memset(this, 0, sizeof(*this));
541 stride = 1;
542 }
543
544 /** Generic unset register constructor. */
545 fs_reg::fs_reg()
546 {
547 init();
548 this->file = BAD_FILE;
549 }
550
551 /** Immediate value constructor. */
552 fs_reg::fs_reg(float f)
553 {
554 init();
555 this->file = IMM;
556 this->type = BRW_REGISTER_TYPE_F;
557 this->fixed_hw_reg.dw1.f = f;
558 this->width = 1;
559 }
560
561 /** Immediate value constructor. */
562 fs_reg::fs_reg(int32_t i)
563 {
564 init();
565 this->file = IMM;
566 this->type = BRW_REGISTER_TYPE_D;
567 this->fixed_hw_reg.dw1.d = i;
568 this->width = 1;
569 }
570
571 /** Immediate value constructor. */
572 fs_reg::fs_reg(uint32_t u)
573 {
574 init();
575 this->file = IMM;
576 this->type = BRW_REGISTER_TYPE_UD;
577 this->fixed_hw_reg.dw1.ud = u;
578 this->width = 1;
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf[4])
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
588 }
589
590 /** Vector float immediate value constructor. */
591 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
592 {
593 init();
594 this->file = IMM;
595 this->type = BRW_REGISTER_TYPE_VF;
596 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
597 (vf1 << 8) |
598 (vf2 << 16) |
599 (vf3 << 24);
600 }
601
602 /** Fixed brw_reg. */
603 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
604 {
605 init();
606 this->file = HW_REG;
607 this->fixed_hw_reg = fixed_hw_reg;
608 this->type = fixed_hw_reg.type;
609 this->width = 1 << fixed_hw_reg.width;
610 }
611
612 bool
613 fs_reg::equals(const fs_reg &r) const
614 {
615 return (file == r.file &&
616 reg == r.reg &&
617 reg_offset == r.reg_offset &&
618 subreg_offset == r.subreg_offset &&
619 type == r.type &&
620 negate == r.negate &&
621 abs == r.abs &&
622 !reladdr && !r.reladdr &&
623 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
624 width == r.width &&
625 stride == r.stride);
626 }
627
628 fs_reg &
629 fs_reg::set_smear(unsigned subreg)
630 {
631 assert(file != HW_REG && file != IMM);
632 subreg_offset = subreg * type_sz(type);
633 stride = 0;
634 return *this;
635 }
636
637 bool
638 fs_reg::is_contiguous() const
639 {
640 return stride == 1;
641 }
642
643 int
644 fs_visitor::type_size(const struct glsl_type *type)
645 {
646 unsigned int size, i;
647
648 switch (type->base_type) {
649 case GLSL_TYPE_UINT:
650 case GLSL_TYPE_INT:
651 case GLSL_TYPE_FLOAT:
652 case GLSL_TYPE_BOOL:
653 return type->components();
654 case GLSL_TYPE_ARRAY:
655 return type_size(type->fields.array) * type->length;
656 case GLSL_TYPE_STRUCT:
657 size = 0;
658 for (i = 0; i < type->length; i++) {
659 size += type_size(type->fields.structure[i].type);
660 }
661 return size;
662 case GLSL_TYPE_SAMPLER:
663 /* Samplers take up no register space, since they're baked in at
664 * link time.
665 */
666 return 0;
667 case GLSL_TYPE_ATOMIC_UINT:
668 return 0;
669 case GLSL_TYPE_IMAGE:
670 case GLSL_TYPE_VOID:
671 case GLSL_TYPE_ERROR:
672 case GLSL_TYPE_INTERFACE:
673 case GLSL_TYPE_DOUBLE:
674 unreachable("not reached");
675 }
676
677 return 0;
678 }
679
680 fs_reg
681 fs_visitor::get_timestamp()
682 {
683 assert(brw->gen >= 7);
684
685 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
686 BRW_ARF_TIMESTAMP,
687 0),
688 BRW_REGISTER_TYPE_UD));
689
690 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
691
692 fs_inst *mov = emit(MOV(dst, ts));
693 /* We want to read the 3 fields we care about even if it's not enabled in
694 * the dispatch.
695 */
696 mov->force_writemask_all = true;
697
698 /* The caller wants the low 32 bits of the timestamp. Since it's running
 699     * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
700 * which is plenty of time for our purposes. It is identical across the
701 * EUs, but since it's tracking GPU core speed it will increment at a
702 * varying rate as render P-states change.
703 *
704 * The caller could also check if render P-states have changed (or anything
705 * else that might disrupt timing) by setting smear to 2 and checking if
706 * that field is != 0.
707 */
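   /* Roughly: 2^32 cycles at ~1.2e9 Hz is about 3.6 seconds, which is
    * where the "roll over every ~3 seconds" estimate above comes from.
    */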
708 dst.set_smear(0);
709
710 return dst;
711 }
712
713 void
714 fs_visitor::emit_shader_time_begin()
715 {
716 current_annotation = "shader time start";
717 shader_start_time = get_timestamp();
718 }
719
720 void
721 fs_visitor::emit_shader_time_end()
722 {
723 current_annotation = "shader time end";
724
725 enum shader_time_shader_type type, written_type, reset_type;
726 switch (stage) {
727 case MESA_SHADER_VERTEX:
728 type = ST_VS;
729 written_type = ST_VS_WRITTEN;
730 reset_type = ST_VS_RESET;
731 break;
732 case MESA_SHADER_GEOMETRY:
733 type = ST_GS;
734 written_type = ST_GS_WRITTEN;
735 reset_type = ST_GS_RESET;
736 break;
737 case MESA_SHADER_FRAGMENT:
738 if (dispatch_width == 8) {
739 type = ST_FS8;
740 written_type = ST_FS8_WRITTEN;
741 reset_type = ST_FS8_RESET;
742 } else {
743 assert(dispatch_width == 16);
744 type = ST_FS16;
745 written_type = ST_FS16_WRITTEN;
746 reset_type = ST_FS16_RESET;
747 }
748 break;
749 default:
750 unreachable("fs_visitor::emit_shader_time_end missing code");
751 }
752
753 fs_reg shader_end_time = get_timestamp();
754
755 /* Check that there weren't any timestamp reset events (assuming these
756 * were the only two timestamp reads that happened).
757 */
758 fs_reg reset = shader_end_time;
759 reset.set_smear(2);
760 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
761 test->conditional_mod = BRW_CONDITIONAL_Z;
762 test->force_writemask_all = true;
763 emit(IF(BRW_PREDICATE_NORMAL));
764
765 fs_reg start = shader_start_time;
766 start.negate = true;
767 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
768 diff.set_smear(0);
769 fs_inst *add = ADD(diff, start, shader_end_time);
770 add->force_writemask_all = true;
771 emit(add);
772
773 /* If there were no instructions between the two timestamp gets, the diff
774 * is 2 cycles. Remove that overhead, so I can forget about that when
775 * trying to determine the time taken for single instructions.
776 */
777 add = ADD(diff, diff, fs_reg(-2u));
778 add->force_writemask_all = true;
779 emit(add);
780
781 emit_shader_time_write(type, diff);
782 emit_shader_time_write(written_type, fs_reg(1u));
783 emit(BRW_OPCODE_ELSE);
784 emit_shader_time_write(reset_type, fs_reg(1u));
785 emit(BRW_OPCODE_ENDIF);
786 }
787
788 void
789 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
790 fs_reg value)
791 {
792 int shader_time_index =
793 brw_get_shader_time_index(brw, shader_prog, prog, type);
794 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
795
796 fs_reg payload;
797 if (dispatch_width == 8)
798 payload = vgrf(glsl_type::uvec2_type);
799 else
800 payload = vgrf(glsl_type::uint_type);
801
802 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
803 fs_reg(), payload, offset, value));
804 }
805
806 void
807 fs_visitor::vfail(const char *format, va_list va)
808 {
809 char *msg;
810
811 if (failed)
812 return;
813
814 failed = true;
815
816 msg = ralloc_vasprintf(mem_ctx, format, va);
817 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
818
819 this->fail_msg = msg;
820
821 if (debug_enabled) {
822 fprintf(stderr, "%s", msg);
823 }
824 }
825
826 void
827 fs_visitor::fail(const char *format, ...)
828 {
829 va_list va;
830
831 va_start(va, format);
832 vfail(format, va);
833 va_end(va);
834 }
835
836 /**
837 * Mark this program as impossible to compile in SIMD16 mode.
838 *
839 * During the SIMD8 compile (which happens first), we can detect and flag
840 * things that are unsupported in SIMD16 mode, so the compiler can skip
841 * the SIMD16 compile altogether.
842 *
843 * During a SIMD16 compile (if one happens anyway), this just calls fail().
844 */
845 void
846 fs_visitor::no16(const char *format, ...)
847 {
848 va_list va;
849
850 va_start(va, format);
851
852 if (dispatch_width == 16) {
853 vfail(format, va);
854 } else {
855 simd16_unsupported = true;
856
857 if (brw->perf_debug) {
858 if (no16_msg)
859 ralloc_vasprintf_append(&no16_msg, format, va);
860 else
861 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
862 }
863 }
864
865 va_end(va);
866 }
867
868 fs_inst *
869 fs_visitor::emit(enum opcode opcode)
870 {
871 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
872 }
873
874 fs_inst *
875 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
876 {
877 return emit(new(mem_ctx) fs_inst(opcode, dst));
878 }
879
880 fs_inst *
881 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
882 {
883 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
884 }
885
886 fs_inst *
887 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
888 const fs_reg &src1)
889 {
890 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
891 }
892
893 fs_inst *
894 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
895 const fs_reg &src1, const fs_reg &src2)
896 {
897 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
898 }
899
900 fs_inst *
901 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
902 fs_reg src[], int sources)
903 {
904 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
905 }
906
907 /**
908 * Returns true if the instruction has a flag that means it won't
909 * update an entire destination register.
910 *
911 * For example, dead code elimination and live variable analysis want to know
912 * when a write to a variable screens off any preceding values that were in
913 * it.
914 */
915 bool
916 fs_inst::is_partial_write() const
917 {
918 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
919 (this->dst.width * type_sz(this->dst.type)) < 32 ||
920 !this->dst.is_contiguous());
921 }
922
923 int
924 fs_inst::regs_read(int arg) const
925 {
926 if (is_tex() && arg == 0 && src[0].file == GRF) {
927 return mlen;
928 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
929 return mlen;
930 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
931 return mlen;
932 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
933 return mlen;
934 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
935 return mlen;
936 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
937 return mlen;
938 }
939
940 switch (src[arg].file) {
941 case BAD_FILE:
942 case UNIFORM:
943 case IMM:
944 return 1;
945 case GRF:
946 case HW_REG:
947 if (src[arg].stride == 0) {
948 return 1;
949 } else {
950 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
951 return (size + 31) / 32;
952 }
953 case MRF:
954 unreachable("MRF registers are not allowed as sources");
955 default:
956 unreachable("Invalid register file");
957 }
958 }
959
960 bool
961 fs_inst::reads_flag() const
962 {
963 return predicate;
964 }
965
966 bool
967 fs_inst::writes_flag() const
968 {
969 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
970 opcode != BRW_OPCODE_IF &&
971 opcode != BRW_OPCODE_WHILE)) ||
972 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
973 }
974
975 /**
976 * Returns how many MRFs an FS opcode will write over.
977 *
978 * Note that this is not the 0 or 1 implied writes in an actual gen
979 * instruction -- the FS opcodes often generate MOVs in addition.
980 */
981 int
982 fs_visitor::implied_mrf_writes(fs_inst *inst)
983 {
984 if (inst->mlen == 0)
985 return 0;
986
987 if (inst->base_mrf == -1)
988 return 0;
989
990 switch (inst->opcode) {
991 case SHADER_OPCODE_RCP:
992 case SHADER_OPCODE_RSQ:
993 case SHADER_OPCODE_SQRT:
994 case SHADER_OPCODE_EXP2:
995 case SHADER_OPCODE_LOG2:
996 case SHADER_OPCODE_SIN:
997 case SHADER_OPCODE_COS:
998 return 1 * dispatch_width / 8;
999 case SHADER_OPCODE_POW:
1000 case SHADER_OPCODE_INT_QUOTIENT:
1001 case SHADER_OPCODE_INT_REMAINDER:
1002 return 2 * dispatch_width / 8;
1003 case SHADER_OPCODE_TEX:
1004 case FS_OPCODE_TXB:
1005 case SHADER_OPCODE_TXD:
1006 case SHADER_OPCODE_TXF:
1007 case SHADER_OPCODE_TXF_CMS:
1008 case SHADER_OPCODE_TXF_MCS:
1009 case SHADER_OPCODE_TG4:
1010 case SHADER_OPCODE_TG4_OFFSET:
1011 case SHADER_OPCODE_TXL:
1012 case SHADER_OPCODE_TXS:
1013 case SHADER_OPCODE_LOD:
1014 return 1;
1015 case FS_OPCODE_FB_WRITE:
1016 return 2;
1017 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1018 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1019 return 1;
1020 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1021 return inst->mlen;
1022 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1023 return 2;
1024 case SHADER_OPCODE_UNTYPED_ATOMIC:
1025 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1026 case SHADER_OPCODE_URB_WRITE_SIMD8:
1027 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1028 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1029 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1030 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1031 return 0;
1032 default:
1033 unreachable("not reached");
1034 }
1035 }
1036
1037 fs_reg
1038 fs_visitor::vgrf(const glsl_type *const type)
1039 {
1040 int reg_width = dispatch_width / 8;
1041 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1042 brw_type_for_base_type(type), dispatch_width);
1043 }
1044
1045 fs_reg
1046 fs_visitor::vgrf(int num_components)
1047 {
1048 int reg_width = dispatch_width / 8;
1049 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1050 BRW_REGISTER_TYPE_F, dispatch_width);
1051 }
1052
1053 /** Fixed HW reg constructor. */
1054 fs_reg::fs_reg(enum register_file file, int reg)
1055 {
1056 init();
1057 this->file = file;
1058 this->reg = reg;
1059 this->type = BRW_REGISTER_TYPE_F;
1060
1061 switch (file) {
1062 case UNIFORM:
1063 this->width = 1;
1064 break;
1065 default:
1066 this->width = 8;
1067 }
1068 }
1069
1070 /** Fixed HW reg constructor. */
1071 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1072 {
1073 init();
1074 this->file = file;
1075 this->reg = reg;
1076 this->type = type;
1077
1078 switch (file) {
1079 case UNIFORM:
1080 this->width = 1;
1081 break;
1082 default:
1083 this->width = 8;
1084 }
1085 }
1086
1087 /** Fixed HW reg constructor. */
1088 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1089 uint8_t width)
1090 {
1091 init();
1092 this->file = file;
1093 this->reg = reg;
1094 this->type = type;
1095 this->width = width;
1096 }
1097
1098 fs_reg *
1099 fs_visitor::variable_storage(ir_variable *var)
1100 {
1101 return (fs_reg *)hash_table_find(this->variable_ht, var);
1102 }
1103
1104 void
1105 import_uniforms_callback(const void *key,
1106 void *data,
1107 void *closure)
1108 {
1109 struct hash_table *dst_ht = (struct hash_table *)closure;
1110 const fs_reg *reg = (const fs_reg *)data;
1111
1112 if (reg->file != UNIFORM)
1113 return;
1114
1115 hash_table_insert(dst_ht, data, key);
1116 }
1117
 1118 /* For SIMD16, we need to follow the uniform setup of SIMD8 dispatch.
 1119  * This brings in those uniform definitions.
1120 */
1121 void
1122 fs_visitor::import_uniforms(fs_visitor *v)
1123 {
1124 hash_table_call_foreach(v->variable_ht,
1125 import_uniforms_callback,
1126 variable_ht);
1127 this->push_constant_loc = v->push_constant_loc;
1128 this->pull_constant_loc = v->pull_constant_loc;
1129 this->uniforms = v->uniforms;
1130 this->param_size = v->param_size;
1131 }
1132
1133 /* Our support for uniforms is piggy-backed on the struct
1134 * gl_fragment_program, because that's where the values actually
1135 * get stored, rather than in some global gl_shader_program uniform
1136 * store.
1137 */
1138 void
1139 fs_visitor::setup_uniform_values(ir_variable *ir)
1140 {
1141 int namelen = strlen(ir->name);
1142
1143 /* The data for our (non-builtin) uniforms is stored in a series of
1144 * gl_uniform_driver_storage structs for each subcomponent that
1145 * glGetUniformLocation() could name. We know it's been set up in the same
1146 * order we'd walk the type, so walk the list of storage and find anything
1147 * with our name, or the prefix of a component that starts with our name.
1148 */
1149 unsigned params_before = uniforms;
1150 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1151 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1152
1153 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1154 (storage->name[namelen] != 0 &&
1155 storage->name[namelen] != '.' &&
1156 storage->name[namelen] != '[')) {
1157 continue;
1158 }
1159
1160 unsigned slots = storage->type->component_slots();
1161 if (storage->array_elements)
1162 slots *= storage->array_elements;
1163
1164 for (unsigned i = 0; i < slots; i++) {
1165 stage_prog_data->param[uniforms++] = &storage->storage[i];
1166 }
1167 }
1168
1169 /* Make sure we actually initialized the right amount of stuff here. */
1170 assert(params_before + ir->type->component_slots() == uniforms);
1171 (void)params_before;
1172 }
1173
1174
1175 /* Our support for builtin uniforms is even scarier than non-builtin.
1176 * It sits on top of the PROG_STATE_VAR parameters that are
1177 * automatically updated from GL context state.
1178 */
1179 void
1180 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1181 {
1182 const ir_state_slot *const slots = ir->get_state_slots();
1183 assert(slots != NULL);
1184
1185 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1186 /* This state reference has already been setup by ir_to_mesa, but we'll
 1187       /* This state reference has already been set up by ir_to_mesa, but we'll
1188 */
1189 int index = _mesa_add_state_reference(this->prog->Parameters,
1190 (gl_state_index *)slots[i].tokens);
1191
1192 /* Add each of the unique swizzles of the element as a parameter.
1193 * This'll end up matching the expected layout of the
1194 * array/matrix/structure we're trying to fill in.
1195 */
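      /* For example, an XYZW swizzle adds four parameters here, while a
       * scalar value replicated as XXXX adds only one, since the loop
       * below stops as soon as a component repeats the previous one.
       */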
1196 int last_swiz = -1;
1197 for (unsigned int j = 0; j < 4; j++) {
1198 int swiz = GET_SWZ(slots[i].swizzle, j);
1199 if (swiz == last_swiz)
1200 break;
1201 last_swiz = swiz;
1202
1203 stage_prog_data->param[uniforms++] =
1204 &prog->Parameters->ParameterValues[index][swiz];
1205 }
1206 }
1207 }
1208
1209 fs_reg *
1210 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1211 bool origin_upper_left)
1212 {
1213 assert(stage == MESA_SHADER_FRAGMENT);
1214 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1215 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1216 fs_reg wpos = *reg;
1217 bool flip = !origin_upper_left ^ key->render_to_fbo;
1218
1219 /* gl_FragCoord.x */
1220 if (pixel_center_integer) {
1221 emit(MOV(wpos, this->pixel_x));
1222 } else {
1223 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1224 }
1225 wpos = offset(wpos, 1);
1226
1227 /* gl_FragCoord.y */
1228 if (!flip && pixel_center_integer) {
1229 emit(MOV(wpos, this->pixel_y));
1230 } else {
1231 fs_reg pixel_y = this->pixel_y;
1232 float offset = (pixel_center_integer ? 0.0 : 0.5);
1233
1234 if (flip) {
1235 pixel_y.negate = true;
1236 offset += key->drawable_height - 1.0;
1237 }
1238
1239 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1240 }
1241 wpos = offset(wpos, 1);
1242
1243 /* gl_FragCoord.z */
1244 if (brw->gen >= 6) {
1245 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1246 } else {
1247 emit(FS_OPCODE_LINTERP, wpos,
1248 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1249 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1250 interp_reg(VARYING_SLOT_POS, 2));
1251 }
1252 wpos = offset(wpos, 1);
1253
1254 /* gl_FragCoord.w: Already set up in emit_interpolation */
1255 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1256
1257 return reg;
1258 }
1259
1260 fs_inst *
1261 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1262 glsl_interp_qualifier interpolation_mode,
1263 bool is_centroid, bool is_sample)
1264 {
1265 brw_wm_barycentric_interp_mode barycoord_mode;
1266 if (brw->gen >= 6) {
1267 if (is_centroid) {
1268 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1269 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1270 else
1271 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1272 } else if (is_sample) {
1273 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1274 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1275 else
1276 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1277 } else {
1278 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1279 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1280 else
1281 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1282 }
1283 } else {
1284 /* On Ironlake and below, there is only one interpolation mode.
1285 * Centroid interpolation doesn't mean anything on this hardware --
1286 * there is no multisampling.
1287 */
1288 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1289 }
1290 return emit(FS_OPCODE_LINTERP, attr,
1291 this->delta_x[barycoord_mode],
1292 this->delta_y[barycoord_mode], interp);
1293 }
1294
1295 void
1296 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1297 const glsl_type *type,
1298 glsl_interp_qualifier interpolation_mode,
1299 int location, bool mod_centroid,
1300 bool mod_sample)
1301 {
1302 attr.type = brw_type_for_base_type(type->get_scalar_type());
1303
1304 assert(stage == MESA_SHADER_FRAGMENT);
1305 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1306 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1307
1308 unsigned int array_elements;
1309
1310 if (type->is_array()) {
1311 array_elements = type->length;
1312 if (array_elements == 0) {
1313 fail("dereferenced array '%s' has length 0\n", name);
1314 }
1315 type = type->fields.array;
1316 } else {
1317 array_elements = 1;
1318 }
1319
1320 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1321 bool is_gl_Color =
1322 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1323 if (key->flat_shade && is_gl_Color) {
1324 interpolation_mode = INTERP_QUALIFIER_FLAT;
1325 } else {
1326 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1327 }
1328 }
1329
1330 for (unsigned int i = 0; i < array_elements; i++) {
1331 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1332 if (prog_data->urb_setup[location] == -1) {
1333 /* If there's no incoming setup data for this slot, don't
1334 * emit interpolation for it.
1335 */
1336 attr = offset(attr, type->vector_elements);
1337 location++;
1338 continue;
1339 }
1340
1341 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1342 /* Constant interpolation (flat shading) case. The SF has
1343 * handed us defined values in only the constant offset
1344 * field of the setup reg.
1345 */
1346 for (unsigned int k = 0; k < type->vector_elements; k++) {
1347 struct brw_reg interp = interp_reg(location, k);
1348 interp = suboffset(interp, 3);
1349 interp.type = attr.type;
1350 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1351 attr = offset(attr, 1);
1352 }
1353 } else {
1354 /* Smooth/noperspective interpolation case. */
1355 for (unsigned int k = 0; k < type->vector_elements; k++) {
1356 struct brw_reg interp = interp_reg(location, k);
1357 if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1358 /* Get the pixel/sample mask into f0 so that we know
1359 * which pixels are lit. Then, for each channel that is
1360 * unlit, replace the centroid data with non-centroid
1361 * data.
1362 */
1363 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1364
1365 fs_inst *inst;
1366 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1367 false, false);
1368 inst->predicate = BRW_PREDICATE_NORMAL;
1369 inst->predicate_inverse = true;
1370 if (brw->has_pln)
1371 inst->no_dd_clear = true;
1372
1373 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1374 mod_centroid && !key->persample_shading,
1375 mod_sample || key->persample_shading);
1376 inst->predicate = BRW_PREDICATE_NORMAL;
1377 inst->predicate_inverse = false;
1378 if (brw->has_pln)
1379 inst->no_dd_check = true;
1380
1381 } else {
1382 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1383 mod_centroid && !key->persample_shading,
1384 mod_sample || key->persample_shading);
1385 }
1386 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1387 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1388 }
1389 attr = offset(attr, 1);
1390 }
1391
1392 }
1393 location++;
1394 }
1395 }
1396 }
1397
1398 fs_reg *
1399 fs_visitor::emit_frontfacing_interpolation()
1400 {
1401 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1402
1403 if (brw->gen >= 6) {
1404 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1405 * a boolean result from this (~0/true or 0/false).
1406 *
1407 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1408 * this task in only one instruction:
1409 * - a negation source modifier will flip the bit; and
1410 * - a W -> D type conversion will sign extend the bit into the high
1411 * word of the destination.
1412 *
1413 * An ASR 15 fills the low word of the destination.
1414 */
1415 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1416 g0.negate = true;
1417
1418 emit(ASR(*reg, g0, fs_reg(15)));
1419 } else {
1420 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1421 * a boolean result from this (1/true or 0/false).
1422 *
1423 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1424 * the negation source modifier to flip it. Unfortunately the SHR
1425 * instruction only operates on UD (or D with an abs source modifier)
1426 * sources without negation.
1427 *
1428 * Instead, use ASR (which will give ~0/true or 0/false).
1429 */
1430 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1431 g1_6.negate = true;
1432
1433 emit(ASR(*reg, g1_6, fs_reg(31)));
1434 }
1435
1436 return reg;
1437 }
1438
1439 void
1440 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1441 {
1442 assert(stage == MESA_SHADER_FRAGMENT);
1443 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1444 assert(dst.type == BRW_REGISTER_TYPE_F);
1445
1446 if (key->compute_pos_offset) {
1447 /* Convert int_sample_pos to floating point */
1448 emit(MOV(dst, int_sample_pos));
1449 /* Scale to the range [0, 1] */
1450 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1451 }
1452 else {
1453 /* From ARB_sample_shading specification:
1454 * "When rendering to a non-multisample buffer, or if multisample
1455 * rasterization is disabled, gl_SamplePosition will always be
 1456      *  (0.5, 0.5)."
1457 */
1458 emit(MOV(dst, fs_reg(0.5f)));
1459 }
1460 }
1461
1462 fs_reg *
1463 fs_visitor::emit_samplepos_setup()
1464 {
1465 assert(brw->gen >= 6);
1466
1467 this->current_annotation = "compute sample position";
1468 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1469 fs_reg pos = *reg;
1470 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1471 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1472
1473 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1474 * mode will be enabled.
1475 *
1476 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1477 * R31.1:0 Position Offset X/Y for Slot[3:0]
1478 * R31.3:2 Position Offset X/Y for Slot[7:4]
1479 * .....
1480 *
1481 * The X, Y sample positions come in as bytes in thread payload. So, read
1482 * the positions using vstride=16, width=8, hstride=2.
1483 */
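   /* The payload bytes are interleaved X0, Y0, X1, Y1, ..., so the
    * hstride=2 region below reads the X offsets, and the same region at
    * suboffset 1 (used further down) reads the Y offsets.
    */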
1484 struct brw_reg sample_pos_reg =
1485 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1486 BRW_REGISTER_TYPE_B), 16, 8, 2);
1487
1488 if (dispatch_width == 8) {
1489 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1490 } else {
1491 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1492 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1493 ->force_sechalf = true;
1494 }
1495 /* Compute gl_SamplePosition.x */
1496 compute_sample_position(pos, int_sample_x);
1497 pos = offset(pos, 1);
1498 if (dispatch_width == 8) {
1499 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1500 } else {
1501 emit(MOV(half(int_sample_y, 0),
1502 fs_reg(suboffset(sample_pos_reg, 1))));
1503 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1504 ->force_sechalf = true;
1505 }
1506 /* Compute gl_SamplePosition.y */
1507 compute_sample_position(pos, int_sample_y);
1508 return reg;
1509 }
1510
1511 fs_reg *
1512 fs_visitor::emit_sampleid_setup()
1513 {
1514 assert(stage == MESA_SHADER_FRAGMENT);
1515 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1516 assert(brw->gen >= 6);
1517
1518 this->current_annotation = "compute sample id";
1519 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1520
1521 if (key->compute_sample_id) {
1522 fs_reg t1 = vgrf(glsl_type::int_type);
1523 fs_reg t2 = vgrf(glsl_type::int_type);
1524 t2.type = BRW_REGISTER_TYPE_UW;
1525
1526 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1527 * 8x multisampling, subspan 0 will represent sample N (where N
1528 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1529 * 7. We can find the value of N by looking at R0.0 bits 7:6
1530 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1531 * (since samples are always delivered in pairs). That is, we
1532 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1533 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1534 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1535 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1536 * populating a temporary variable with the sequence (0, 1, 2, 3),
1537 * and then reading from it using vstride=1, width=4, hstride=0.
1538 * These computations hold good for 4x multisampling as well.
1539 *
1540 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1541 * the first four slots are sample 0 of subspan 0; the next four
1542 * are sample 1 of subspan 0; the third group is sample 0 of
1543 * subspan 1, and finally sample 1 of subspan 1.
1544 */
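      /* As a concrete example: if SSPI (R0.0 bits 7:6) is 0b10, then
       * (R0.0 & 0xc0) >> 5 == 4, and adding the (0, 0, 0, 0, 1, 1, 1, 1)
       * sequence gives sample IDs 4, 4, 4, 4, 5, 5, 5, 5 in SIMD8.
       */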
1545 fs_inst *inst;
1546 inst = emit(BRW_OPCODE_AND, t1,
1547 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1548 fs_reg(0xc0));
1549 inst->force_writemask_all = true;
1550 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1551 inst->force_writemask_all = true;
1552 /* This works for both SIMD8 and SIMD16 */
1553 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1554 inst->force_writemask_all = true;
1555 /* This special instruction takes care of setting vstride=1,
1556 * width=4, hstride=0 of t2 during an ADD instruction.
1557 */
1558 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1559 } else {
1560 /* As per GL_ARB_sample_shading specification:
1561 * "When rendering to a non-multisample buffer, or if multisample
1562 * rasterization is disabled, gl_SampleID will always be zero."
1563 */
1564 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1565 }
1566
1567 return reg;
1568 }
1569
1570 void
1571 fs_visitor::resolve_source_modifiers(fs_reg *src)
1572 {
1573 if (!src->abs && !src->negate)
1574 return;
1575
1576 fs_reg temp = retype(vgrf(1), src->type);
1577 emit(MOV(temp, *src));
1578 *src = temp;
1579 }
1580
1581 fs_reg
1582 fs_visitor::fix_math_operand(fs_reg src)
1583 {
1584 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1585 * might be able to do better by doing execsize = 1 math and then
1586 * expanding that result out, but we would need to be careful with
1587 * masking.
1588 *
1589 * The hardware ignores source modifiers (negate and abs) on math
1590 * instructions, so we also move to a temp to set those up.
1591 */
1592 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1593 !src.abs && !src.negate)
1594 return src;
1595
1596 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1597 * operands to math
1598 */
1599 if (brw->gen >= 7 && src.file != IMM)
1600 return src;
1601
1602 fs_reg expanded = vgrf(glsl_type::float_type);
1603 expanded.type = src.type;
1604 emit(BRW_OPCODE_MOV, expanded, src);
1605 return expanded;
1606 }
1607
1608 fs_inst *
1609 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1610 {
1611 switch (opcode) {
1612 case SHADER_OPCODE_RCP:
1613 case SHADER_OPCODE_RSQ:
1614 case SHADER_OPCODE_SQRT:
1615 case SHADER_OPCODE_EXP2:
1616 case SHADER_OPCODE_LOG2:
1617 case SHADER_OPCODE_SIN:
1618 case SHADER_OPCODE_COS:
1619 break;
1620 default:
1621 unreachable("not reached: bad math opcode");
1622 }
1623
1624 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1625 * might be able to do better by doing execsize = 1 math and then
1626 * expanding that result out, but we would need to be careful with
1627 * masking.
1628 *
1629 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1630 * instructions, so we also move to a temp to set those up.
1631 */
1632 if (brw->gen == 6 || brw->gen == 7)
1633 src = fix_math_operand(src);
1634
1635 fs_inst *inst = emit(opcode, dst, src);
1636
1637 if (brw->gen < 6) {
1638 inst->base_mrf = 2;
1639 inst->mlen = dispatch_width / 8;
1640 }
1641
1642 return inst;
1643 }
1644
1645 fs_inst *
1646 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1647 {
1648 int base_mrf = 2;
1649 fs_inst *inst;
1650
1651 if (brw->gen >= 8) {
1652 inst = emit(opcode, dst, src0, src1);
1653 } else if (brw->gen >= 6) {
1654 src0 = fix_math_operand(src0);
1655 src1 = fix_math_operand(src1);
1656
1657 inst = emit(opcode, dst, src0, src1);
1658 } else {
1659 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1660 * "Message Payload":
1661 *
1662 * "Operand0[7]. For the INT DIV functions, this operand is the
1663 * denominator."
1664 * ...
1665 * "Operand1[7]. For the INT DIV functions, this operand is the
1666 * numerator."
1667 */
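      /* For the INT DIV opcodes, where dst = src0 / src1, src1 (the
       * denominator) therefore has to go out as Operand0 and src0 (the
       * numerator) as Operand1, which is what the swap below arranges.
       */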
1668 bool is_int_div = opcode != SHADER_OPCODE_POW;
1669 fs_reg &op0 = is_int_div ? src1 : src0;
1670 fs_reg &op1 = is_int_div ? src0 : src1;
1671
1672 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1673 inst = emit(opcode, dst, op0, reg_null_f);
1674
1675 inst->base_mrf = base_mrf;
1676 inst->mlen = 2 * dispatch_width / 8;
1677 }
1678 return inst;
1679 }
1680
1681 void
1682 fs_visitor::assign_curb_setup()
1683 {
1684 if (dispatch_width == 8) {
1685 prog_data->dispatch_grf_start_reg = payload.num_regs;
1686 } else {
1687 assert(stage == MESA_SHADER_FRAGMENT);
1688 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1689 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1690 }
1691
1692 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1693
1694 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1695 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1696 for (unsigned int i = 0; i < inst->sources; i++) {
1697 if (inst->src[i].file == UNIFORM) {
1698 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1699 int constant_nr;
1700 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1701 constant_nr = push_constant_loc[uniform_nr];
1702 } else {
1703 /* Section 5.11 of the OpenGL 4.1 spec says:
1704 * "Out-of-bounds reads return undefined values, which include
1705 * values from other variables of the active program or zero."
1706 * Just return the first push constant.
1707 */
1708 constant_nr = 0;
1709 }
1710
1711 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1712 constant_nr / 8,
1713 constant_nr % 8);
1714
1715 inst->src[i].file = HW_REG;
1716 inst->src[i].fixed_hw_reg = byte_offset(
1717 retype(brw_reg, inst->src[i].type),
1718 inst->src[i].subreg_offset);
1719 }
1720 }
1721 }
1722 }
1723
1724 void
1725 fs_visitor::calculate_urb_setup()
1726 {
1727 assert(stage == MESA_SHADER_FRAGMENT);
1728 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1729 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1730
1731 memset(prog_data->urb_setup, -1,
1732 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1733
1734 int urb_next = 0;
1735 /* Figure out where each of the incoming setup attributes lands. */
1736 if (brw->gen >= 6) {
1737 if (_mesa_bitcount_64(prog->InputsRead &
1738 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1739 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1740 * first 16 varying inputs, so we can put them wherever we want.
1741 * Just put them in order.
1742 *
1743 * This is useful because it means that (a) inputs not used by the
1744 * fragment shader won't take up valuable register space, and (b) we
1745 * won't have to recompile the fragment shader if it gets paired with
1746 * a different vertex (or geometry) shader.
1747 */
1748 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1749 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1750 BITFIELD64_BIT(i)) {
1751 prog_data->urb_setup[i] = urb_next++;
1752 }
1753 }
1754 } else {
1755 /* We have enough input varyings that the SF/SBE pipeline stage can't
1756 * arbitrarily rearrange them to suit our whim; we have to put them
1757 * in an order that matches the output of the previous pipeline stage
1758 * (geometry or vertex shader).
1759 */
1760 struct brw_vue_map prev_stage_vue_map;
1761 brw_compute_vue_map(brw, &prev_stage_vue_map,
1762 key->input_slots_valid);
1763 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1764 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1765 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1766 slot++) {
1767 int varying = prev_stage_vue_map.slot_to_varying[slot];
1768 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1769 * unused.
1770 */
1771 if (varying != BRW_VARYING_SLOT_COUNT &&
1772 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1773 BITFIELD64_BIT(varying))) {
1774 prog_data->urb_setup[varying] = slot - first_slot;
1775 }
1776 }
1777 urb_next = prev_stage_vue_map.num_slots - first_slot;
1778 }
1779 } else {
1780 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1781 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1782 /* Point size is packed into the header, not as a general attribute */
1783 if (i == VARYING_SLOT_PSIZ)
1784 continue;
1785
1786 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1787 /* The back color slot is skipped when the front color is
1788 * also written to. In addition, some slots can be
1789 * written in the vertex shader and not read in the
1790 * fragment shader. So the register number must always be
1791 * incremented, mapped or not.
1792 */
1793 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1794 prog_data->urb_setup[i] = urb_next;
1795 urb_next++;
1796 }
1797 }
1798
1799 /*
 1800     * It's an FS-only attribute, and we did interpolation for this attribute
 1801     * in the SF thread. So, count it here, too.
1802 *
1803 * See compile_sf_prog() for more info.
1804 */
1805 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1806 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1807 }
1808
1809 prog_data->num_varying_inputs = urb_next;
1810 }
1811
1812 void
1813 fs_visitor::assign_urb_setup()
1814 {
1815 assert(stage == MESA_SHADER_FRAGMENT);
1816 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1817
1818 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1819
1820 /* Offset all the urb_setup[] index by the actual position of the
1821 * setup regs, now that the location of the constants has been chosen.
1822 */
1823 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1824 if (inst->opcode == FS_OPCODE_LINTERP) {
1825 assert(inst->src[2].file == HW_REG);
1826 inst->src[2].fixed_hw_reg.nr += urb_start;
1827 }
1828
1829 if (inst->opcode == FS_OPCODE_CINTERP) {
1830 assert(inst->src[0].file == HW_REG);
1831 inst->src[0].fixed_hw_reg.nr += urb_start;
1832 }
1833 }
1834
1835 /* Each attribute is 4 setup channels, each of which is half a reg. */
1836 this->first_non_payload_grf =
1837 urb_start + prog_data->num_varying_inputs * 2;
1838 }
1839
1840 void
1841 fs_visitor::assign_vs_urb_setup()
1842 {
1843 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1844 int grf, count, slot, channel, attr;
1845
1846 assert(stage == MESA_SHADER_VERTEX);
1847 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1848 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1849 count++;
1850
1851 /* Each attribute is 4 regs. */
1852 this->first_non_payload_grf =
1853 payload.num_regs + prog_data->curb_read_length + count * 4;
1854
1855 unsigned vue_entries =
1856 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1857
1858 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1859 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1860
1861 assert(vs_prog_data->base.urb_read_length <= 15);
1862
1863 /* Rewrite all ATTR file references to the hw grf that they land in. */
1864 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1865 for (int i = 0; i < inst->sources; i++) {
1866 if (inst->src[i].file == ATTR) {
1867
1868 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1869 slot = count - 1;
1870 } else {
 1871             /* Attributes come in a contiguous block, ordered by their
1872 * gl_vert_attrib value. That means we can compute the slot
1873 * number for an attribute by masking out the enabled
1874 * attributes before it and counting the bits.
1875 */
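            /* For example, with inputs_read == 0b1011 and attr == 3, the
             * mask is 0b111, the masked value is 0b011, and the popcount
             * gives slot 2.
             */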
1876 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1877 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1878 BITFIELD64_MASK(attr));
1879 }
1880
1881 channel = inst->src[i].reg_offset & 3;
1882
1883 grf = payload.num_regs +
1884 prog_data->curb_read_length +
1885 slot * 4 + channel;
1886
1887 inst->src[i].file = HW_REG;
1888 inst->src[i].fixed_hw_reg =
1889 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1890 }
1891 }
1892 }
1893 }
1894
1895 /**
1896 * Split large virtual GRFs into separate components if we can.
1897 *
1898 * This is mostly duplicated with what brw_fs_vector_splitting does,
1899 * but that's really conservative because it's afraid of doing
1900 * splitting that doesn't result in real progress after the rest of
1901 * the optimization phases, which would cause infinite looping in
1902 * optimization. We can do it once here, safely. This also has the
1903 * opportunity to split interpolated values, or maybe even uniforms,
1904 * which we don't have at the IR level.
1905 *
1906 * We want to split, because virtual GRFs are what we register
1907 * allocate and spill (due to contiguousness requirements for some
1908 * instructions), and they're what we naturally generate in the
1909 * codegen process, but most virtual GRFs don't actually need to be
1910 * contiguous sets of GRFs. If we split, we'll end up with reduced
1911 * live intervals and better dead code elimination and coalescing.
1912 */
1913 void
1914 fs_visitor::split_virtual_grfs()
1915 {
1916 int num_vars = this->alloc.count;
1917
1918 /* Count the total number of registers */
1919 int reg_count = 0;
1920 int vgrf_to_reg[num_vars];
1921 for (int i = 0; i < num_vars; i++) {
1922 vgrf_to_reg[i] = reg_count;
1923 reg_count += alloc.sizes[i];
1924 }
1925
1926 /* An array of "split points". For each register slot, this indicates
1927 * if this slot can be separated from the previous slot. Every time an
1928 * instruction uses multiple elements of a register (as a source or
1929 * destination), we mark the used slots as inseparable. Then we go
1930 * through and split the registers into the smallest pieces we can.
1931 */
1932 bool split_points[reg_count];
1933 memset(split_points, 0, sizeof(split_points));
1934
1935 /* Mark all used registers as fully splittable */
1936 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1937 if (inst->dst.file == GRF) {
1938 int reg = vgrf_to_reg[inst->dst.reg];
1939 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1940 split_points[reg + j] = true;
1941 }
1942
1943 for (int i = 0; i < inst->sources; i++) {
1944 if (inst->src[i].file == GRF) {
1945 int reg = vgrf_to_reg[inst->src[i].reg];
1946 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1947 split_points[reg + j] = true;
1948 }
1949 }
1950 }
1951
1952 if (brw->has_pln &&
1953 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1954 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1955 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1956 * Gen6, that was the only supported interpolation mode, and since Gen6,
1957 * delta_x and delta_y are in fixed hardware registers.
1958 */
1959 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1960 split_points[vgrf_to_reg[vgrf] + 1] = false;
1961 }
1962
1963 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1964 if (inst->dst.file == GRF) {
1965 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1966 for (int j = 1; j < inst->regs_written; j++)
1967 split_points[reg + j] = false;
1968 }
1969 for (int i = 0; i < inst->sources; i++) {
1970 if (inst->src[i].file == GRF) {
1971 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1972 for (int j = 1; j < inst->regs_read(i); j++)
1973 split_points[reg + j] = false;
1974 }
1975 }
1976 }
1977
1978 int new_virtual_grf[reg_count];
1979 int new_reg_offset[reg_count];
1980
1981 int reg = 0;
1982 for (int i = 0; i < num_vars; i++) {
1983 /* The first one should always be 0 as a quick sanity check. */
1984 assert(split_points[reg] == false);
1985
1986 /* j = 0 case */
1987 new_reg_offset[reg] = 0;
1988 reg++;
1989 int offset = 1;
1990
1991 /* j > 0 case */
1992 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1993 /* If this is a split point, reset the offset to 0 and allocate a
1994 * new virtual GRF for the previous `offset` registers
1995 */
1996 if (split_points[reg]) {
1997 assert(offset <= MAX_VGRF_SIZE);
1998 int grf = alloc.allocate(offset);
1999 for (int k = reg - offset; k < reg; k++)
2000 new_virtual_grf[k] = grf;
2001 offset = 0;
2002 }
2003 new_reg_offset[reg] = offset;
2004 offset++;
2005 reg++;
2006 }
2007
2008 /* The last one gets the original register number */
2009 assert(offset <= MAX_VGRF_SIZE);
2010 alloc.sizes[i] = offset;
2011 for (int k = reg - offset; k < reg; k++)
2012 new_virtual_grf[k] = i;
2013 }
2014 assert(reg == reg_count);
2015
2016 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2017 if (inst->dst.file == GRF) {
2018 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2019 inst->dst.reg = new_virtual_grf[reg];
2020 inst->dst.reg_offset = new_reg_offset[reg];
2021 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2022 }
2023 for (int i = 0; i < inst->sources; i++) {
2024 if (inst->src[i].file == GRF) {
2025 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2026 inst->src[i].reg = new_virtual_grf[reg];
2027 inst->src[i].reg_offset = new_reg_offset[reg];
2028 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2029 }
2030 }
2031 }
2032 invalidate_live_intervals();
2033 }
2034
2035 /**
2036 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2037 *
2038 * During code generation, we create tons of temporary variables, many of
2039 * which get immediately killed and are never used again. Yet, in later
2040 * optimization and analysis passes, such as compute_live_intervals, we need
2041 * to loop over all the virtual GRFs. Compacting them can save a lot of
2042 * overhead.
2043 */
2044 bool
2045 fs_visitor::compact_virtual_grfs()
2046 {
2047 bool progress = false;
2048 int remap_table[this->alloc.count];
2049 memset(remap_table, -1, sizeof(remap_table));
2050
2051 /* Mark which virtual GRFs are used. */
2052 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2053 if (inst->dst.file == GRF)
2054 remap_table[inst->dst.reg] = 0;
2055
2056 for (int i = 0; i < inst->sources; i++) {
2057 if (inst->src[i].file == GRF)
2058 remap_table[inst->src[i].reg] = 0;
2059 }
2060 }
2061
2062 /* Compact the GRF arrays. */
2063 int new_index = 0;
2064 for (unsigned i = 0; i < this->alloc.count; i++) {
2065 if (remap_table[i] == -1) {
2066 /* We just found an unused register. This means that we are
2067 * actually going to compact something.
2068 */
2069 progress = true;
2070 } else {
2071 remap_table[i] = new_index;
2072 alloc.sizes[new_index] = alloc.sizes[i];
2073 invalidate_live_intervals();
2074 ++new_index;
2075 }
2076 }
2077
2078 this->alloc.count = new_index;
2079
2080 /* Patch all the instructions to use the newly renumbered registers */
2081 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2082 if (inst->dst.file == GRF)
2083 inst->dst.reg = remap_table[inst->dst.reg];
2084
2085 for (int i = 0; i < inst->sources; i++) {
2086 if (inst->src[i].file == GRF)
2087 inst->src[i].reg = remap_table[inst->src[i].reg];
2088 }
2089 }
2090
2091 /* Patch all the references to delta_x/delta_y, since they're used in
2092 * register allocation. If they're unused, switch them to BAD_FILE so
2093 * we don't think some random VGRF is delta_x/delta_y.
2094 */
2095 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2096 if (delta_x[i].file == GRF) {
2097 if (remap_table[delta_x[i].reg] != -1) {
2098 delta_x[i].reg = remap_table[delta_x[i].reg];
2099 } else {
2100 delta_x[i].file = BAD_FILE;
2101 }
2102 }
2103 }
2104 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2105 if (delta_y[i].file == GRF) {
2106 if (remap_table[delta_y[i].reg] != -1) {
2107 delta_y[i].reg = remap_table[delta_y[i].reg];
2108 } else {
2109 delta_y[i].file = BAD_FILE;
2110 }
2111 }
2112 }
2113
2114 return progress;
2115 }
2116
2117 /*
2118 * Implements array access of uniforms by inserting a
2119 * PULL_CONSTANT_LOAD instruction.
2120 *
2121 * Unlike temporary GRF array access (which we don't support, due to
2122 * the difficulty of doing relative addressing on instruction
2123 * destinations), we could potentially do array access of uniforms
2124 * that were loaded in GRF space as push constants. In real-world
2125 * usage we've seen, though, the arrays being used are always larger
2126 * than we could load as push constants, so just always move all
2127 * uniform array access out to a pull constant buffer.
2128 */
2129 void
2130 fs_visitor::move_uniform_array_access_to_pull_constants()
2131 {
2132 if (dispatch_width != 8)
2133 return;
2134
2135 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2136 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2137
2138 /* Walk through and find array access of uniforms. Put a copy of that
2139 * uniform in the pull constant buffer.
2140 *
2141 * Note that we don't move constant-indexed accesses to arrays. No
2142 * testing has been done of the performance impact of this choice.
2143 */
2144 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2145 for (int i = 0 ; i < inst->sources; i++) {
2146 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2147 continue;
2148
2149 int uniform = inst->src[i].reg;
2150
2151 /* If this array isn't already present in the pull constant buffer,
2152 * add it.
2153 */
2154 if (pull_constant_loc[uniform] == -1) {
2155 const gl_constant_value **values = &stage_prog_data->param[uniform];
2156
2157 assert(param_size[uniform]);
2158
2159 for (int j = 0; j < param_size[uniform]; j++) {
2160 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2161
2162 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2163 values[j];
2164 }
2165 }
2166 }
2167 }
2168 }
2169
2170 /**
2171 * Assign UNIFORM file registers to either push constants or pull constants.
2172 *
2173 * We allow a fragment shader to have more than the GL minimum for the
2174 * maximum number of fragment shader uniform components (64). If there
2175 * are too many of these, they'd fill up all of the register space.
2176 * So, this will push some of them out to the pull constant buffer and
2177 * update the program to load them.
2178 */
2179 void
2180 fs_visitor::assign_constant_locations()
2181 {
2182 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2183 if (dispatch_width != 8)
2184 return;
2185
2186 /* Find which UNIFORM registers are still in use. */
2187 bool is_live[uniforms];
2188 for (unsigned int i = 0; i < uniforms; i++) {
2189 is_live[i] = false;
2190 }
2191
2192 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2193 for (int i = 0; i < inst->sources; i++) {
2194 if (inst->src[i].file != UNIFORM)
2195 continue;
2196
2197 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2198 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2199 is_live[constant_nr] = true;
2200 }
2201 }
2202
2203 /* Only allow 16 registers (128 uniform components) as push constants.
2204 *
2205 * Just demote the end of the list. We could probably do better
2206 * here, demoting things that are rarely used in the program first.
2207 *
2208 * If changing this value, note the limitation about total_regs in
2209 * brw_curbe.c.
2210 */
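/* For example, with 200 live, not-yet-pulled uniform components, the
 * first 128 stay in push constants and the remaining 72 are demoted
 * to the pull constant buffer below.
 */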
2211 unsigned int max_push_components = 16 * 8;
2212 unsigned int num_push_constants = 0;
2213
2214 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2215
2216 for (unsigned int i = 0; i < uniforms; i++) {
2217 if (!is_live[i] || pull_constant_loc[i] != -1) {
2218 /* This UNIFORM register is either dead, or has already been demoted
2219 * to a pull const. Mark it as no longer living in the param[] array.
2220 */
2221 push_constant_loc[i] = -1;
2222 continue;
2223 }
2224
2225 if (num_push_constants < max_push_components) {
2226 /* Retain as a push constant. Record the location in the param[]
2227 * array.
2228 */
2229 push_constant_loc[i] = num_push_constants++;
2230 } else {
2231 /* Demote to a pull constant. */
2232 push_constant_loc[i] = -1;
2233
2234 int pull_index = stage_prog_data->nr_pull_params++;
2235 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2236 pull_constant_loc[i] = pull_index;
2237 }
2238 }
2239
2240 stage_prog_data->nr_params = num_push_constants;
2241
2242 /* Up until now, the param[] array has been indexed by reg + reg_offset
2243 * of UNIFORM registers. Condense it to only contain the uniforms we
2244 * chose to upload as push constants.
2245 */
2246 for (unsigned int i = 0; i < uniforms; i++) {
2247 int remapped = push_constant_loc[i];
2248
2249 if (remapped == -1)
2250 continue;
2251
2252 assert(remapped <= (int)i);
2253 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2254 }
2255 }
2256
2257 /**
2258 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2259 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2260 */
2261 void
2262 fs_visitor::demote_pull_constants()
2263 {
2264 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2265 for (int i = 0; i < inst->sources; i++) {
2266 if (inst->src[i].file != UNIFORM)
2267 continue;
2268
2269 int pull_index = pull_constant_loc[inst->src[i].reg +
2270 inst->src[i].reg_offset];
2271 if (pull_index == -1)
2272 continue;
2273
2274 /* Set up the annotation tracking for newly generated instructions. */
2275 base_ir = inst->ir;
2276 current_annotation = inst->annotation;
2277
2278 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2279 fs_reg dst = vgrf(glsl_type::float_type);
2280
2281 /* Generate a pull load into dst. */
2282 if (inst->src[i].reladdr) {
2283 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2284 surf_index,
2285 *inst->src[i].reladdr,
2286 pull_index);
2287 inst->insert_before(block, &list);
2288 inst->src[i].reladdr = NULL;
2289 } else {
2290 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2291 fs_inst *pull =
2292 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2293 dst, surf_index, offset);
2294 inst->insert_before(block, pull);
2295 inst->src[i].set_smear(pull_index & 3);
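/* For example, pull_index == 6 gives a 16-byte offset (the second
 * vec4 in the pull constant buffer) and a smear of 2, so the source
 * reads the third dword of the loaded vec4.
 */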
2296 }
2297
2298 /* Rewrite the instruction to use the temporary VGRF. */
2299 inst->src[i].file = GRF;
2300 inst->src[i].reg = dst.reg;
2301 inst->src[i].reg_offset = 0;
2302 inst->src[i].width = dispatch_width;
2303 }
2304 }
2305 invalidate_live_intervals();
2306 }
2307
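/**
 * Perform local algebraic simplifications: fold saturates into immediate
 * MOVs, strength-reduce MUL/ADD/MAD/LRP/SEL/CMP/OR with trivial or immediate
 * operands, and turn an RCP of a SQRT result into a single RSQ.
 */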
2308 bool
2309 fs_visitor::opt_algebraic()
2310 {
2311 bool progress = false;
2312
2313 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2314 switch (inst->opcode) {
2315 case BRW_OPCODE_MOV:
2316 if (inst->src[0].file != IMM)
2317 break;
2318
2319 if (inst->saturate) {
2320 if (inst->dst.type != inst->src[0].type)
2321 assert(!"unimplemented: saturate mixed types");
2322
2323 if (brw_saturate_immediate(inst->dst.type,
2324 &inst->src[0].fixed_hw_reg)) {
2325 inst->saturate = false;
2326 progress = true;
2327 }
2328 }
2329 break;
2330
2331 case BRW_OPCODE_MUL:
2332 if (inst->src[1].file != IMM)
2333 continue;
2334
2335 /* a * 1.0 = a */
2336 if (inst->src[1].is_one()) {
2337 inst->opcode = BRW_OPCODE_MOV;
2338 inst->src[1] = reg_undef;
2339 progress = true;
2340 break;
2341 }
2342
2343 /* a * -1.0 = -a */
2344 if (inst->src[1].is_negative_one()) {
2345 inst->opcode = BRW_OPCODE_MOV;
2346 inst->src[0].negate = !inst->src[0].negate;
2347 inst->src[1] = reg_undef;
2348 progress = true;
2349 break;
2350 }
2351
2352 /* a * 0.0 = 0.0 */
2353 if (inst->src[1].is_zero()) {
2354 inst->opcode = BRW_OPCODE_MOV;
2355 inst->src[0] = inst->src[1];
2356 inst->src[1] = reg_undef;
2357 progress = true;
2358 break;
2359 }
2360
2361 if (inst->src[0].file == IMM) {
2362 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2363 inst->opcode = BRW_OPCODE_MOV;
2364 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2365 inst->src[1] = reg_undef;
2366 progress = true;
2367 break;
2368 }
2369 break;
2370 case BRW_OPCODE_ADD:
2371 if (inst->src[1].file != IMM)
2372 continue;
2373
2374 /* a + 0.0 = a */
2375 if (inst->src[1].is_zero()) {
2376 inst->opcode = BRW_OPCODE_MOV;
2377 inst->src[1] = reg_undef;
2378 progress = true;
2379 break;
2380 }
2381
2382 if (inst->src[0].file == IMM) {
2383 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2384 inst->opcode = BRW_OPCODE_MOV;
2385 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2386 inst->src[1] = reg_undef;
2387 progress = true;
2388 break;
2389 }
2390 break;
2391 case BRW_OPCODE_OR:
2392 if (inst->src[0].equals(inst->src[1])) {
2393 inst->opcode = BRW_OPCODE_MOV;
2394 inst->src[1] = reg_undef;
2395 progress = true;
2396 break;
2397 }
2398 break;
2399 case BRW_OPCODE_LRP:
2400 if (inst->src[1].equals(inst->src[2])) {
2401 inst->opcode = BRW_OPCODE_MOV;
2402 inst->src[0] = inst->src[1];
2403 inst->src[1] = reg_undef;
2404 inst->src[2] = reg_undef;
2405 progress = true;
2406 break;
2407 }
2408 break;
2409 case BRW_OPCODE_CMP:
2410 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2411 inst->src[0].abs &&
2412 inst->src[0].negate &&
2413 inst->src[1].is_zero()) {
2414 inst->src[0].abs = false;
2415 inst->src[0].negate = false;
2416 inst->conditional_mod = BRW_CONDITIONAL_Z;
2417 progress = true;
2418 break;
2419 }
2420 break;
2421 case BRW_OPCODE_SEL:
2422 if (inst->src[0].equals(inst->src[1])) {
2423 inst->opcode = BRW_OPCODE_MOV;
2424 inst->src[1] = reg_undef;
2425 inst->predicate = BRW_PREDICATE_NONE;
2426 inst->predicate_inverse = false;
2427 progress = true;
2428 } else if (inst->saturate && inst->src[1].file == IMM) {
2429 switch (inst->conditional_mod) {
2430 case BRW_CONDITIONAL_LE:
2431 case BRW_CONDITIONAL_L:
2432 switch (inst->src[1].type) {
2433 case BRW_REGISTER_TYPE_F:
2434 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2435 inst->opcode = BRW_OPCODE_MOV;
2436 inst->src[1] = reg_undef;
2437 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2438 progress = true;
2439 }
2440 break;
2441 default:
2442 break;
2443 }
2444 break;
2445 case BRW_CONDITIONAL_GE:
2446 case BRW_CONDITIONAL_G:
2447 switch (inst->src[1].type) {
2448 case BRW_REGISTER_TYPE_F:
2449 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2450 inst->opcode = BRW_OPCODE_MOV;
2451 inst->src[1] = reg_undef;
2452 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2453 progress = true;
2454 }
2455 break;
2456 default:
2457 break;
2458 }
2459 default:
2460 break;
2461 }
2462 }
2463 break;
2464 case BRW_OPCODE_MAD:
2465 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2466 inst->opcode = BRW_OPCODE_MOV;
2467 inst->src[1] = reg_undef;
2468 inst->src[2] = reg_undef;
2469 progress = true;
2470 } else if (inst->src[0].is_zero()) {
2471 inst->opcode = BRW_OPCODE_MUL;
2472 inst->src[0] = inst->src[2];
2473 inst->src[2] = reg_undef;
progress = true;
2474 } else if (inst->src[1].is_one()) {
2475 inst->opcode = BRW_OPCODE_ADD;
2476 inst->src[1] = inst->src[2];
2477 inst->src[2] = reg_undef;
2478 progress = true;
2479 } else if (inst->src[2].is_one()) {
2480 inst->opcode = BRW_OPCODE_ADD;
2481 inst->src[2] = reg_undef;
2482 progress = true;
2483 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2484 inst->opcode = BRW_OPCODE_ADD;
2485 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2486 inst->src[2] = reg_undef;
2487 progress = true;
2488 }
2489 break;
2490 case SHADER_OPCODE_RCP: {
2491 fs_inst *prev = (fs_inst *)inst->prev;
2492 if (prev->opcode == SHADER_OPCODE_SQRT) {
2493 if (inst->src[0].equals(prev->dst)) {
2494 inst->opcode = SHADER_OPCODE_RSQ;
2495 inst->src[0] = prev->src[0];
2496 progress = true;
2497 }
2498 }
2499 break;
2500 }
2501 default:
2502 break;
2503 }
2504 }
2505
2506 return progress;
2507 }
2508
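/**
 * Give a fresh virtual GRF to each complete, top-level (outside of control
 * flow) redefinition of a register, and rewrite later reads to use it.
 * This separates the live intervals of the old and new values so later
 * passes see shorter live ranges.
 */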
2509 bool
2510 fs_visitor::opt_register_renaming()
2511 {
2512 bool progress = false;
2513 int depth = 0;
2514
2515 int remap[alloc.count];
2516 memset(remap, -1, sizeof(int) * alloc.count);
2517
2518 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2519 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2520 depth++;
2521 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2522 inst->opcode == BRW_OPCODE_WHILE) {
2523 depth--;
2524 }
2525
2526 /* Rewrite instruction sources. */
2527 for (int i = 0; i < inst->sources; i++) {
2528 if (inst->src[i].file == GRF &&
2529 remap[inst->src[i].reg] != -1 &&
2530 remap[inst->src[i].reg] != inst->src[i].reg) {
2531 inst->src[i].reg = remap[inst->src[i].reg];
2532 progress = true;
2533 }
2534 }
2535
2536 const int dst = inst->dst.reg;
2537
2538 if (depth == 0 &&
2539 inst->dst.file == GRF &&
2540 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2541 !inst->is_partial_write()) {
2542 if (remap[dst] == -1) {
2543 remap[dst] = dst;
2544 } else {
2545 remap[dst] = alloc.allocate(inst->dst.width / 8);
2546 inst->dst.reg = remap[dst];
2547 progress = true;
2548 }
2549 } else if (inst->dst.file == GRF &&
2550 remap[dst] != -1 &&
2551 remap[dst] != dst) {
2552 inst->dst.reg = remap[dst];
2553 progress = true;
2554 }
2555 }
2556
2557 if (progress) {
2558 invalidate_live_intervals();
2559
2560 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2561 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2562 delta_x[i].reg = remap[delta_x[i].reg];
2563 }
2564 }
2565 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2566 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2567 delta_y[i].reg = remap[delta_y[i].reg];
2568 }
2569 }
2570 }
2571
2572 return progress;
2573 }
2574
2575 /**
2576 * Remove redundant or useless discard jumps.
2577 *
2578 * For example, we can eliminate jumps in the following sequence:
2579 *
2580 * discard-jump (redundant with the next jump)
2581 * discard-jump (useless; jumps to the next instruction)
2582 * placeholder-halt
2583 */
2584 bool
2585 fs_visitor::opt_redundant_discard_jumps()
2586 {
2587 bool progress = false;
2588
2589 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2590
2591 fs_inst *placeholder_halt = NULL;
2592 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2593 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2594 placeholder_halt = inst;
2595 break;
2596 }
2597 }
2598
2599 if (!placeholder_halt)
2600 return false;
2601
2602 /* Delete any HALTs immediately before the placeholder halt. */
2603 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2604 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2605 prev = (fs_inst *) placeholder_halt->prev) {
2606 prev->remove(last_bblock);
2607 progress = true;
2608 }
2609
2610 if (progress)
2611 invalidate_live_intervals();
2612
2613 return progress;
2614 }
2615
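/**
 * Look for MOVs from a GRF to an MRF where the GRF is not read again, and
 * try to make the instruction that computed the GRF value write directly
 * into the MRF instead, eliminating the MOV (Gen4-6 only, since Gen7+ has
 * no MRFs).
 */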
2616 bool
2617 fs_visitor::compute_to_mrf()
2618 {
2619 bool progress = false;
2620 int next_ip = 0;
2621
2622 /* No MRFs on Gen >= 7. */
2623 if (brw->gen >= 7)
2624 return false;
2625
2626 calculate_live_intervals();
2627
2628 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2629 int ip = next_ip;
2630 next_ip++;
2631
2632 if (inst->opcode != BRW_OPCODE_MOV ||
2633 inst->is_partial_write() ||
2634 inst->dst.file != MRF || inst->src[0].file != GRF ||
2635 inst->dst.type != inst->src[0].type ||
2636 inst->src[0].abs || inst->src[0].negate ||
2637 !inst->src[0].is_contiguous() ||
2638 inst->src[0].subreg_offset)
2639 continue;
2640
2641 /* Work out which hardware MRF registers are written by this
2642 * instruction.
2643 */
2644 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2645 int mrf_high;
2646 if (inst->dst.reg & BRW_MRF_COMPR4) {
2647 mrf_high = mrf_low + 4;
2648 } else if (inst->exec_size == 16) {
2649 mrf_high = mrf_low + 1;
2650 } else {
2651 mrf_high = mrf_low;
2652 }
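/* For example, the (mrf_low, mrf_high) pair recorded here is (2, 3) for a
 * SIMD16 write to m2 and (2, 6) for a COMPR4 write to m2.
 */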
2653
2654 /* Can't compute-to-MRF this GRF if someone else was going to
2655 * read it later.
2656 */
2657 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2658 continue;
2659
2660 /* Found a move of a GRF to an MRF. Let's see if we can go
2661 * rewrite the thing that made this GRF to write into the MRF.
2662 */
2663 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2664 if (scan_inst->dst.file == GRF &&
2665 scan_inst->dst.reg == inst->src[0].reg) {
2666 /* Found the last instruction to write the reg we want to turn
2667 * into a compute-to-MRF.
2668 */
2669
2670 /* If this one instruction didn't populate all the
2671 * channels, bail. We might be able to rewrite everything
2672 * that writes that reg, but it would require smarter
2673 * tracking to delay the rewriting until complete success.
2674 */
2675 if (scan_inst->is_partial_write())
2676 break;
2677
2678 /* Things returning more than one register would need us to
2679 * understand coalescing out more than one MOV at a time.
2680 */
2681 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2682 break;
2683
2684 /* SEND instructions can't have MRF as a destination. */
2685 if (scan_inst->mlen)
2686 break;
2687
2688 if (brw->gen == 6) {
2689 /* gen6 math instructions must have the destination be
2690 * GRF, so no compute-to-MRF for them.
2691 */
2692 if (scan_inst->is_math()) {
2693 break;
2694 }
2695 }
2696
2697 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2698 /* Found the creator of our MRF's source value. */
2699 scan_inst->dst.file = MRF;
2700 scan_inst->dst.reg = inst->dst.reg;
2701 scan_inst->saturate |= inst->saturate;
2702 inst->remove(block);
2703 progress = true;
2704 }
2705 break;
2706 }
2707
2708 /* We don't handle control flow here. Most values that end up
2709 * in MRFs are computed shortly before the MRF write
2710 * anyway.
2711 */
2712 if (block->start() == scan_inst)
2713 break;
2714
2715 /* You can't read from an MRF, so if someone else reads our
2716 * MRF's source GRF that we wanted to rewrite, that stops us.
2717 */
2718 bool interfered = false;
2719 for (int i = 0; i < scan_inst->sources; i++) {
2720 if (scan_inst->src[i].file == GRF &&
2721 scan_inst->src[i].reg == inst->src[0].reg &&
2722 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2723 interfered = true;
2724 }
2725 }
2726 if (interfered)
2727 break;
2728
2729 if (scan_inst->dst.file == MRF) {
2730 /* If somebody else writes our MRF here, we can't
2731 * compute-to-MRF before that.
2732 */
2733 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2734 int scan_mrf_high;
2735
2736 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2737 scan_mrf_high = scan_mrf_low + 4;
2738 } else if (scan_inst->exec_size == 16) {
2739 scan_mrf_high = scan_mrf_low + 1;
2740 } else {
2741 scan_mrf_high = scan_mrf_low;
2742 }
2743
2744 if (mrf_low == scan_mrf_low ||
2745 mrf_low == scan_mrf_high ||
2746 mrf_high == scan_mrf_low ||
2747 mrf_high == scan_mrf_high) {
2748 break;
2749 }
2750 }
2751
2752 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2753 /* Found a SEND instruction, which means that there are
2754 * live values in MRFs from base_mrf to base_mrf +
2755 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2756 * above it.
2757 */
2758 if (mrf_low >= scan_inst->base_mrf &&
2759 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2760 break;
2761 }
2762 if (mrf_high >= scan_inst->base_mrf &&
2763 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2764 break;
2765 }
2766 }
2767 }
2768 }
2769
2770 if (progress)
2771 invalidate_live_intervals();
2772
2773 return progress;
2774 }
2775
2776 /**
2777 * Emit the repclear shader: load the clear color from the first uniform
2778 * and write it to every enabled color region with FS_OPCODE_REP_FB_WRITE.
2779 */
2780 void
2781 fs_visitor::emit_repclear_shader()
2782 {
2783 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2784 int base_mrf = 1;
2785 int color_mrf = base_mrf + 2;
2786
2787 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2788 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2789 mov->force_writemask_all = true;
2790
2791 fs_inst *write;
2792 if (key->nr_color_regions == 1) {
2793 write = emit(FS_OPCODE_REP_FB_WRITE);
2794 write->saturate = key->clamp_fragment_color;
2795 write->base_mrf = color_mrf;
2796 write->target = 0;
2797 write->header_present = false;
2798 write->mlen = 1;
2799 } else {
2800 assume(key->nr_color_regions > 0);
2801 for (int i = 0; i < key->nr_color_regions; ++i) {
2802 write = emit(FS_OPCODE_REP_FB_WRITE);
2803 write->saturate = key->clamp_fragment_color;
2804 write->base_mrf = base_mrf;
2805 write->target = i;
2806 write->header_present = true;
2807 write->mlen = 3;
2808 }
2809 }
2810 write->eot = true;
2811
2812 calculate_cfg();
2813
2814 assign_constant_locations();
2815 assign_curb_setup();
2816
2817 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2818 assert(mov->src[0].file == HW_REG);
2819 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2820 }
2821
2822 /**
2823 * Walks through basic blocks, looking for repeated MRF writes and
2824 * removing the later ones.
2825 */
2826 bool
2827 fs_visitor::remove_duplicate_mrf_writes()
2828 {
2829 fs_inst *last_mrf_move[16];
2830 bool progress = false;
2831
2832 /* The MRF tracking below would need updating to handle compressed instructions, so skip SIMD16. */
2833 if (dispatch_width == 16)
2834 return false;
2835
2836 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2837
2838 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2839 if (inst->is_control_flow()) {
2840 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2841 }
2842
2843 if (inst->opcode == BRW_OPCODE_MOV &&
2844 inst->dst.file == MRF) {
2845 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2846 if (prev_inst && inst->equals(prev_inst)) {
2847 inst->remove(block);
2848 progress = true;
2849 continue;
2850 }
2851 }
2852
2853 /* Clear out the last-write records for MRFs that were overwritten. */
2854 if (inst->dst.file == MRF) {
2855 last_mrf_move[inst->dst.reg] = NULL;
2856 }
2857
2858 if (inst->mlen > 0 && inst->base_mrf != -1) {
2859 /* Found a SEND instruction, which will include two or fewer
2860 * implied MRF writes. We could do better here.
2861 */
2862 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2863 last_mrf_move[inst->base_mrf + i] = NULL;
2864 }
2865 }
2866
2867 /* Clear out any MRF move records whose sources got overwritten. */
2868 if (inst->dst.file == GRF) {
2869 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2870 if (last_mrf_move[i] &&
2871 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2872 last_mrf_move[i] = NULL;
2873 }
2874 }
2875 }
2876
2877 if (inst->opcode == BRW_OPCODE_MOV &&
2878 inst->dst.file == MRF &&
2879 inst->src[0].file == GRF &&
2880 !inst->is_partial_write()) {
2881 last_mrf_move[inst->dst.reg] = inst;
2882 }
2883 }
2884
2885 if (progress)
2886 invalidate_live_intervals();
2887
2888 return progress;
2889 }
2890
2891 static void
2892 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2893 {
2894 /* Clear the flag for registers that actually got read (as expected). */
2895 for (int i = 0; i < inst->sources; i++) {
2896 int grf;
2897 if (inst->src[i].file == GRF) {
2898 grf = inst->src[i].reg;
2899 } else if (inst->src[i].file == HW_REG &&
2900 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2901 grf = inst->src[i].fixed_hw_reg.nr;
2902 } else {
2903 continue;
2904 }
2905
2906 if (grf >= first_grf &&
2907 grf < first_grf + grf_len) {
2908 deps[grf - first_grf] = false;
2909 if (inst->exec_size == 16)
2910 deps[grf - first_grf + 1] = false;
2911 }
2912 }
2913 }
2914
2915 /**
2916 * Implements this workaround for the original 965:
2917 *
2918 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2919 * check for post destination dependencies on this instruction, software
2920 * must ensure that there is no destination hazard for the case of ‘write
2921 * followed by a posted write’ shown in the following example.
2922 *
2923 * 1. mov r3 0
2924 * 2. send r3.xy <rest of send instruction>
2925 * 3. mov r2 r3
2926 *
2927 * Due to no post-destination dependency check on the ‘send’, the above
2928 * code sequence could have two instructions (1 and 2) in flight at the
2929 * same time that both consider ‘r3’ as the target of their final writes.
2930 */
2931 void
2932 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2933 fs_inst *inst)
2934 {
2935 int write_len = inst->regs_written;
2936 int first_write_grf = inst->dst.reg;
2937 bool needs_dep[BRW_MAX_MRF];
2938 assert(write_len < (int)sizeof(needs_dep) - 1);
2939
2940 memset(needs_dep, false, sizeof(needs_dep));
2941 memset(needs_dep, true, write_len);
2942
2943 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2944
2945 /* Walk backwards looking for writes to registers we're writing which
2946 * aren't read since being written. If we hit the start of the program,
2947 * we assume that there are no outstanding dependencies on entry to the
2948 * program.
2949 */
2950 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2951 /* If we hit control flow, assume that there *are* outstanding
2952 * dependencies, and force their cleanup before our instruction.
2953 */
2954 if (block->start() == scan_inst) {
2955 for (int i = 0; i < write_len; i++) {
2956 if (needs_dep[i]) {
2957 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2958 }
2959 }
2960 return;
2961 }
2962
2963 /* We insert our reads as late as possible on the assumption that any
2964 * instruction but a MOV that might have left us an outstanding
2965 * dependency has more latency than a MOV.
2966 */
2967 if (scan_inst->dst.file == GRF) {
2968 for (int i = 0; i < scan_inst->regs_written; i++) {
2969 int reg = scan_inst->dst.reg + i;
2970
2971 if (reg >= first_write_grf &&
2972 reg < first_write_grf + write_len &&
2973 needs_dep[reg - first_write_grf]) {
2974 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2975 needs_dep[reg - first_write_grf] = false;
2976 if (scan_inst->exec_size == 16)
2977 needs_dep[reg - first_write_grf + 1] = false;
2978 }
2979 }
2980 }
2981
2982 /* Clear the flag for registers that actually got read (as expected). */
2983 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2984
2985 /* Continue the loop only if we haven't resolved all the dependencies */
2986 int i;
2987 for (i = 0; i < write_len; i++) {
2988 if (needs_dep[i])
2989 break;
2990 }
2991 if (i == write_len)
2992 return;
2993 }
2994 }
2995
2996 /**
2997 * Implements this workaround for the original 965:
2998 *
2999 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3000 * used as a destination register until after it has been sourced by an
3001 * instruction with a different destination register.
3002 */
3003 void
3004 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3005 {
3006 int write_len = inst->regs_written;
3007 int first_write_grf = inst->dst.reg;
3008 bool needs_dep[BRW_MAX_MRF];
3009 assert(write_len < (int)sizeof(needs_dep) - 1);
3010
3011 memset(needs_dep, false, sizeof(needs_dep));
3012 memset(needs_dep, true, write_len);
3013 /* Walk forwards looking for writes to registers we're writing which aren't
3014 * read before being written.
3015 */
3016 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3017 /* If we hit control flow, force resolve all remaining dependencies. */
3018 if (block->end() == scan_inst) {
3019 for (int i = 0; i < write_len; i++) {
3020 if (needs_dep[i])
3021 scan_inst->insert_before(block,
3022 DEP_RESOLVE_MOV(first_write_grf + i));
3023 }
3024 return;
3025 }
3026
3027 /* Clear the flag for registers that actually got read (as expected). */
3028 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3029
3030 /* We insert our reads as late as possible since they're reading the
3031 * result of a SEND, which has massive latency.
3032 */
3033 if (scan_inst->dst.file == GRF &&
3034 scan_inst->dst.reg >= first_write_grf &&
3035 scan_inst->dst.reg < first_write_grf + write_len &&
3036 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3037 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3038 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3039 }
3040
3041 /* Continue the loop only if we haven't resolved all the dependencies */
3042 int i;
3043 for (i = 0; i < write_len; i++) {
3044 if (needs_dep[i])
3045 break;
3046 }
3047 if (i == write_len)
3048 return;
3049 }
3050 }
3051
3052 void
3053 fs_visitor::insert_gen4_send_dependency_workarounds()
3054 {
3055 if (brw->gen != 4 || brw->is_g4x)
3056 return;
3057
3058 bool progress = false;
3059
3060 /* Note that we're done with register allocation, so GRF fs_regs always
3061 * have a .reg_offset of 0.
3062 */
3063
3064 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3065 if (inst->mlen != 0 && inst->dst.file == GRF) {
3066 insert_gen4_pre_send_dependency_workarounds(block, inst);
3067 insert_gen4_post_send_dependency_workarounds(block, inst);
3068 progress = true;
3069 }
3070 }
3071
3072 if (progress)
3073 invalidate_live_intervals();
3074 }
3075
3076 /**
3077 * Turns the generic expression-style uniform pull constant load instruction
3078 * into a hardware-specific series of instructions for loading a pull
3079 * constant.
3080 *
3081 * The expression style allows the CSE pass before this to optimize out
3082 * repeated loads from the same offset, and gives the pre-register-allocation
3083 * scheduling full flexibility, while the conversion to native instructions
3084 * allows the post-register-allocation scheduler the best information
3085 * possible.
3086 *
3087 * Note that execution masking for setting up pull constant loads is special:
3088 * the channels that need to be written are unrelated to the current execution
3089 * mask, since a later instruction will use one of the result channels as a
3090 * source operand for all 8 or 16 of its channels.
3091 */
3092 void
3093 fs_visitor::lower_uniform_pull_constant_loads()
3094 {
3095 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3096 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3097 continue;
3098
3099 if (brw->gen >= 7) {
3100 /* The offset arg before was a vec4-aligned byte offset. We need to
3101 * turn it into a dword offset.
3102 */
3103 fs_reg const_offset_reg = inst->src[1];
3104 assert(const_offset_reg.file == IMM &&
3105 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3106 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3107 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3108
3109 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3110 * Reserve space for the register.
3111 */
3112 if (brw->gen >= 9) {
3113 payload.reg_offset++;
3114 alloc.sizes[payload.reg] = 2;
3115 }
3116
3117 /* This is actually going to be a MOV, but since only the first dword
3118 * is accessed, we have a special opcode to do just that one. Note
3119 * that this needs to be an operation that will be considered a def
3120 * by live variable analysis, or register allocation will explode.
3121 */
3122 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3123 8, payload, const_offset_reg);
3124 setup->force_writemask_all = true;
3125
3126 setup->ir = inst->ir;
3127 setup->annotation = inst->annotation;
3128 inst->insert_before(block, setup);
3129
3130 /* Similarly, this will only populate the first 4 channels of the
3131 * result register (since we only use smear values from 0-3), but we
3132 * don't tell the optimizer.
3133 */
3134 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3135 inst->src[1] = payload;
3136
3137 invalidate_live_intervals();
3138 } else {
3139 /* Before register allocation, we didn't tell the scheduler about the
3140 * MRF we use. We know it's safe to use this MRF because nothing
3141 * else does except for register spill/unspill, which generates and
3142 * uses its MRF within a single IR instruction.
3143 */
3144 inst->base_mrf = 14;
3145 inst->mlen = 1;
3146 }
3147 }
3148 }
3149
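/**
 * Lower SHADER_OPCODE_LOAD_PAYLOAD into a sequence of MOVs, one per source,
 * using a single COMPR4 MOV for adjacent MRF pairs when possible, and
 * propagating per-register force_writemask_all/force_sechalf metadata so the
 * copies keep the execution controls of the instructions that produced them.
 */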
3150 bool
3151 fs_visitor::lower_load_payload()
3152 {
3153 bool progress = false;
3154
3155 int vgrf_to_reg[alloc.count];
3156 int reg_count = 0;
3157 for (unsigned i = 0; i < alloc.count; ++i) {
3158 vgrf_to_reg[i] = reg_count;
3159 reg_count += alloc.sizes[i];
3160 }
3161
3162 struct {
3163 bool written:1; /* Whether this register has ever been written */
3164 bool force_writemask_all:1;
3165 bool force_sechalf:1;
3166 } metadata[reg_count];
3167 memset(metadata, 0, sizeof(metadata));
3168
3169 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3170 if (inst->dst.file == GRF) {
3171 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3172 bool force_sechalf = inst->force_sechalf &&
3173 !inst->force_writemask_all;
3174 bool toggle_sechalf = inst->dst.width == 16 &&
3175 type_sz(inst->dst.type) == 4 &&
3176 !inst->force_writemask_all;
3177 for (int i = 0; i < inst->regs_written; ++i) {
3178 metadata[dst_reg + i].written = true;
3179 metadata[dst_reg + i].force_sechalf = force_sechalf;
3180 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3181 force_sechalf = (toggle_sechalf != force_sechalf);
3182 }
3183 }
3184
3185 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3186 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3187 fs_reg dst = inst->dst;
3188
3189 for (int i = 0; i < inst->sources; i++) {
3190 dst.width = inst->src[i].effective_width;
3191 dst.type = inst->src[i].type;
3192
3193 if (inst->src[i].file == BAD_FILE) {
3194 /* Do nothing but otherwise increment as normal */
3195 } else if (dst.file == MRF &&
3196 dst.width == 8 &&
3197 brw->has_compr4 &&
3198 i + 4 < inst->sources &&
3199 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3200 fs_reg compr4_dst = dst;
3201 compr4_dst.reg += BRW_MRF_COMPR4;
3202 compr4_dst.width = 16;
3203 fs_reg compr4_src = inst->src[i];
3204 compr4_src.width = 16;
3205 fs_inst *mov = MOV(compr4_dst, compr4_src);
3206 mov->force_writemask_all = true;
3207 inst->insert_before(block, mov);
3208 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3209 inst->src[i + 4].file = BAD_FILE;
3210 } else {
3211 fs_inst *mov = MOV(dst, inst->src[i]);
3212 if (inst->src[i].file == GRF) {
3213 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3214 inst->src[i].reg_offset;
3215 mov->force_sechalf = metadata[src_reg].force_sechalf;
3216 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3217 } else {
3218 /* We don't have any useful metadata for immediates or
3219 * uniforms. Assume that any of the channels of the
3220 * destination may be used.
3221 */
3222 assert(inst->src[i].file == IMM ||
3223 inst->src[i].file == UNIFORM);
3224 mov->force_writemask_all = true;
3225 }
3226
3227 if (dst.file == GRF) {
3228 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3229 const bool force_writemask = mov->force_writemask_all;
3230 metadata[dst_reg].force_writemask_all = force_writemask;
3231 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3232 if (dst.width * type_sz(dst.type) > 32) {
3233 assert(!mov->force_sechalf);
3234 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3235 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3236 }
3237 }
3238
3239 inst->insert_before(block, mov);
3240 }
3241
3242 dst = offset(dst, 1);
3243 }
3244
3245 inst->remove(block);
3246 progress = true;
3247 }
3248 }
3249
3250 if (progress)
3251 invalidate_live_intervals();
3252
3253 return progress;
3254 }
3255
3256 void
3257 fs_visitor::dump_instructions()
3258 {
3259 dump_instructions(NULL);
3260 }
3261
3262 void
3263 fs_visitor::dump_instructions(const char *name)
3264 {
3265 FILE *file = stderr;
3266 if (name && geteuid() != 0) {
3267 file = fopen(name, "w");
3268 if (!file)
3269 file = stderr;
3270 }
3271
3272 if (cfg) {
3273 calculate_register_pressure();
3274 int ip = 0, max_pressure = 0;
3275 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3276 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3277 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3278 dump_instruction(inst, file);
3279 ip++;
3280 }
3281 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3282 } else {
3283 int ip = 0;
3284 foreach_in_list(backend_instruction, inst, &instructions) {
3285 fprintf(file, "%4d: ", ip++);
3286 dump_instruction(inst, file);
3287 }
3288 }
3289
3290 if (file != stderr) {
3291 fclose(file);
3292 }
3293 }
3294
3295 void
3296 fs_visitor::dump_instruction(backend_instruction *be_inst)
3297 {
3298 dump_instruction(be_inst, stderr);
3299 }
3300
3301 void
3302 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3303 {
3304 fs_inst *inst = (fs_inst *)be_inst;
3305
3306 if (inst->predicate) {
3307 fprintf(file, "(%cf0.%d) ",
3308 inst->predicate_inverse ? '-' : '+',
3309 inst->flag_subreg);
3310 }
3311
3312 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3313 if (inst->saturate)
3314 fprintf(file, ".sat");
3315 if (inst->conditional_mod) {
3316 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3317 if (!inst->predicate &&
3318 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3319 inst->opcode != BRW_OPCODE_IF &&
3320 inst->opcode != BRW_OPCODE_WHILE))) {
3321 fprintf(file, ".f0.%d", inst->flag_subreg);
3322 }
3323 }
3324 fprintf(file, "(%d) ", inst->exec_size);
3325
3326
3327 switch (inst->dst.file) {
3328 case GRF:
3329 fprintf(file, "vgrf%d", inst->dst.reg);
3330 if (inst->dst.width != dispatch_width)
3331 fprintf(file, "@%d", inst->dst.width);
3332 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3333 inst->dst.subreg_offset)
3334 fprintf(file, "+%d.%d",
3335 inst->dst.reg_offset, inst->dst.subreg_offset);
3336 break;
3337 case MRF:
3338 fprintf(file, "m%d", inst->dst.reg);
3339 break;
3340 case BAD_FILE:
3341 fprintf(file, "(null)");
3342 break;
3343 case UNIFORM:
3344 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3345 break;
3346 case ATTR:
3347 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3348 break;
3349 case HW_REG:
3350 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3351 switch (inst->dst.fixed_hw_reg.nr) {
3352 case BRW_ARF_NULL:
3353 fprintf(file, "null");
3354 break;
3355 case BRW_ARF_ADDRESS:
3356 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3357 break;
3358 case BRW_ARF_ACCUMULATOR:
3359 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3360 break;
3361 case BRW_ARF_FLAG:
3362 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3363 inst->dst.fixed_hw_reg.subnr);
3364 break;
3365 default:
3366 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3367 inst->dst.fixed_hw_reg.subnr);
3368 break;
3369 }
3370 } else {
3371 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3372 }
3373 if (inst->dst.fixed_hw_reg.subnr)
3374 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3375 break;
3376 default:
3377 fprintf(file, "???");
3378 break;
3379 }
3380 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3381
3382 for (int i = 0; i < inst->sources; i++) {
3383 if (inst->src[i].negate)
3384 fprintf(file, "-");
3385 if (inst->src[i].abs)
3386 fprintf(file, "|");
3387 switch (inst->src[i].file) {
3388 case GRF:
3389 fprintf(file, "vgrf%d", inst->src[i].reg);
3390 if (inst->src[i].width != dispatch_width)
3391 fprintf(file, "@%d", inst->src[i].width);
3392 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3393 inst->src[i].subreg_offset)
3394 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3395 inst->src[i].subreg_offset);
3396 break;
3397 case MRF:
3398 fprintf(file, "***m%d***", inst->src[i].reg);
3399 break;
3400 case ATTR:
3401 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3402 break;
3403 case UNIFORM:
3404 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3405 if (inst->src[i].reladdr) {
3406 fprintf(file, "+reladdr");
3407 } else if (inst->src[i].subreg_offset) {
3408 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3409 inst->src[i].subreg_offset);
3410 }
3411 break;
3412 case BAD_FILE:
3413 fprintf(file, "(null)");
3414 break;
3415 case IMM:
3416 switch (inst->src[i].type) {
3417 case BRW_REGISTER_TYPE_F:
3418 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3419 break;
3420 case BRW_REGISTER_TYPE_W:
3421 case BRW_REGISTER_TYPE_D:
3422 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3423 break;
3424 case BRW_REGISTER_TYPE_UW:
3425 case BRW_REGISTER_TYPE_UD:
3426 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3427 break;
3428 case BRW_REGISTER_TYPE_VF:
3429 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3430 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3431 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3432 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3433 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3434 break;
3435 default:
3436 fprintf(file, "???");
3437 break;
3438 }
3439 break;
3440 case HW_REG:
3441 if (inst->src[i].fixed_hw_reg.negate)
3442 fprintf(file, "-");
3443 if (inst->src[i].fixed_hw_reg.abs)
3444 fprintf(file, "|");
3445 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3446 switch (inst->src[i].fixed_hw_reg.nr) {
3447 case BRW_ARF_NULL:
3448 fprintf(file, "null");
3449 break;
3450 case BRW_ARF_ADDRESS:
3451 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3452 break;
3453 case BRW_ARF_ACCUMULATOR:
3454 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3455 break;
3456 case BRW_ARF_FLAG:
3457 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3458 inst->src[i].fixed_hw_reg.subnr);
3459 break;
3460 default:
3461 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3462 inst->src[i].fixed_hw_reg.subnr);
3463 break;
3464 }
3465 } else {
3466 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3467 }
3468 if (inst->src[i].fixed_hw_reg.subnr)
3469 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3470 if (inst->src[i].fixed_hw_reg.abs)
3471 fprintf(file, "|");
3472 break;
3473 default:
3474 fprintf(file, "???");
3475 break;
3476 }
3477 if (inst->src[i].abs)
3478 fprintf(file, "|");
3479
3480 if (inst->src[i].file != IMM) {
3481 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3482 }
3483
3484 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3485 fprintf(file, ", ");
3486 }
3487
3488 fprintf(file, " ");
3489
3490 if (dispatch_width == 16 && inst->exec_size == 8) {
3491 if (inst->force_sechalf)
3492 fprintf(file, "2ndhalf ");
3493 else
3494 fprintf(file, "1sthalf ");
3495 }
3496
3497 fprintf(file, "\n");
3498 }
3499
3500 /**
3501 * Possibly returns an instruction that set up @param reg.
3502 *
3503 * Sometimes we want to take the result of some expression/variable
3504 * dereference tree and rewrite the instruction generating the result
3505 * of the tree. When processing the tree, we know that the
3506 * instructions generated are all writing temporaries that are dead
3507 * outside of this tree. So, if we have some instructions that write
3508 * a temporary, we're free to point that temp write somewhere else.
3509 *
3510 * Note that this doesn't guarantee that the returned instruction wrote
3511 * only reg -- it might be the size=4 destination of a texture instruction.
3512 */
3513 fs_inst *
3514 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3515 fs_inst *end,
3516 const fs_reg &reg)
3517 {
3518 if (end == start ||
3519 end->is_partial_write() ||
3520 reg.reladdr ||
3521 !reg.equals(end->dst)) {
3522 return NULL;
3523 } else {
3524 return end;
3525 }
3526 }
3527
3528 void
3529 fs_visitor::setup_payload_gen6()
3530 {
3531 bool uses_depth =
3532 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3533 unsigned barycentric_interp_modes =
3534 (stage == MESA_SHADER_FRAGMENT) ?
3535 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3536
3537 assert(brw->gen >= 6);
3538
3539 /* R0-1: masks, pixel X/Y coordinates. */
3540 payload.num_regs = 2;
3541 /* R2: only for 32-pixel dispatch. */
3542
3543 /* R3-26: barycentric interpolation coordinates. These appear in the
3544 * same order that they appear in the brw_wm_barycentric_interp_mode
3545 * enum. Each set of coordinates occupies 2 registers if dispatch width
3546 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3547 * appear if they were enabled using the "Barycentric Interpolation
3548 * Mode" bits in WM_STATE.
3549 */
3550 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3551 if (barycentric_interp_modes & (1 << i)) {
3552 payload.barycentric_coord_reg[i] = payload.num_regs;
3553 payload.num_regs += 2;
3554 if (dispatch_width == 16) {
3555 payload.num_regs += 2;
3556 }
3557 }
3558 }
3559
3560 /* R27: interpolated depth if uses source depth */
3561 if (uses_depth) {
3562 payload.source_depth_reg = payload.num_regs;
3563 payload.num_regs++;
3564 if (dispatch_width == 16) {
3565 /* R28: interpolated depth if not SIMD8. */
3566 payload.num_regs++;
3567 }
3568 }
3569 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3570 if (uses_depth) {
3571 payload.source_w_reg = payload.num_regs;
3572 payload.num_regs++;
3573 if (dispatch_width == 16) {
3574 /* R30: interpolated W if not SIMD8. */
3575 payload.num_regs++;
3576 }
3577 }
3578
3579 if (stage == MESA_SHADER_FRAGMENT) {
3580 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3581 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3582 prog_data->uses_pos_offset = key->compute_pos_offset;
3583 /* R31: MSAA position offsets. */
3584 if (prog_data->uses_pos_offset) {
3585 payload.sample_pos_reg = payload.num_regs;
3586 payload.num_regs++;
3587 }
3588 }
3589
3590 /* R32: MSAA input coverage mask */
3591 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3592 assert(brw->gen >= 7);
3593 payload.sample_mask_in_reg = payload.num_regs;
3594 payload.num_regs++;
3595 if (dispatch_width == 16) {
3596 /* R33: input coverage mask if not SIMD8. */
3597 payload.num_regs++;
3598 }
3599 }
3600
3601 /* R34-: bary for 32-pixel. */
3602 /* R58-59: interp W for 32-pixel. */
3603
3604 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3605 source_depth_to_render_target = true;
3606 }
3607 }
3608
3609 void
3610 fs_visitor::setup_vs_payload()
3611 {
3612 /* R0: thread header, R1: urb handles */
3613 payload.num_regs = 2;
3614 }
3615
3616 void
3617 fs_visitor::assign_binding_table_offsets()
3618 {
3619 assert(stage == MESA_SHADER_FRAGMENT);
3620 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3621 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3622 uint32_t next_binding_table_offset = 0;
3623
3624 /* If there are no color regions, we still perform an FB write to a null
3625 * renderbuffer, which we place at surface index 0.
3626 */
3627 prog_data->binding_table.render_target_start = next_binding_table_offset;
3628 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3629
3630 assign_common_binding_table_offsets(next_binding_table_offset);
3631 }
3632
3633 void
3634 fs_visitor::calculate_register_pressure()
3635 {
3636 invalidate_live_intervals();
3637 calculate_live_intervals();
3638
3639 unsigned num_instructions = 0;
3640 foreach_block(block, cfg)
3641 num_instructions += block->instructions.length();
3642
3643 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3644
3645 for (unsigned reg = 0; reg < alloc.count; reg++) {
3646 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3647 regs_live_at_ip[ip] += alloc.sizes[reg];
3648 }
3649 }
3650
3651 void
3652 fs_visitor::optimize()
3653 {
3654 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3655
3656 split_virtual_grfs();
3657
3658 move_uniform_array_access_to_pull_constants();
3659 assign_constant_locations();
3660 demote_pull_constants();
3661
3662 #define OPT(pass, args...) ({ \
3663 pass_num++; \
3664 bool this_progress = pass(args); \
3665 \
3666 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3667 char filename[64]; \
3668 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3669 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3670 \
3671 backend_visitor::dump_instructions(filename); \
3672 } \
3673 \
3674 progress = progress || this_progress; \
3675 this_progress; \
3676 })
3677
3678 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3679 char filename[64];
3680 snprintf(filename, 64, "%s%d-%04d-00-start",
3681 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3682
3683 backend_visitor::dump_instructions(filename);
3684 }
3685
3686 bool progress;
3687 int iteration = 0;
3688 int pass_num = 0;
3689 do {
3690 progress = false;
3691 pass_num = 0;
3692 iteration++;
3693
3694 OPT(remove_duplicate_mrf_writes);
3695
3696 OPT(opt_algebraic);
3697 OPT(opt_cse);
3698 OPT(opt_copy_propagate);
3699 OPT(opt_peephole_predicated_break);
3700 OPT(opt_cmod_propagation);
3701 OPT(dead_code_eliminate);
3702 OPT(opt_peephole_sel);
3703 OPT(dead_control_flow_eliminate, this);
3704 OPT(opt_register_renaming);
3705 OPT(opt_redundant_discard_jumps);
3706 OPT(opt_saturate_propagation);
3707 OPT(register_coalesce);
3708 OPT(compute_to_mrf);
3709
3710 OPT(compact_virtual_grfs);
3711 } while (progress);
3712
3713 pass_num = 0;
3714
3715 if (OPT(lower_load_payload)) {
3716 split_virtual_grfs();
3717 OPT(register_coalesce);
3718 OPT(compute_to_mrf);
3719 OPT(dead_code_eliminate);
3720 }
3721
3722 OPT(opt_combine_constants);
3723
3724 lower_uniform_pull_constant_loads();
3725 }
3726
3727 /**
3728 * Three-source instructions must have a GRF/MRF destination register;
3729 * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
3730 */
3731 void
3732 fs_visitor::fixup_3src_null_dest()
3733 {
3734 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3735 if (inst->is_3src() && inst->dst.is_null()) {
3736 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3737 inst->dst.type);
3738 }
3739 }
3740 }
3741
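/**
 * Schedule instructions and assign hardware registers.  The pre-RA
 * scheduling heuristics are tried in order until one allocates without
 * spilling; if none does, a SIMD16 compile is failed (the caller falls
 * back to SIMD8), while a SIMD8 compile spills until allocation succeeds.
 */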
3742 void
3743 fs_visitor::allocate_registers()
3744 {
3745 bool allocated_without_spills;
3746
3747 static const enum instruction_scheduler_mode pre_modes[] = {
3748 SCHEDULE_PRE,
3749 SCHEDULE_PRE_NON_LIFO,
3750 SCHEDULE_PRE_LIFO,
3751 };
3752
3753 /* Try each scheduling heuristic to see if it can successfully register
3754 * allocate without spilling. They should be ordered by decreasing
3755 * performance but increasing likelihood of allocating.
3756 */
3757 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3758 schedule_instructions(pre_modes[i]);
3759
3760 if (0) {
3761 assign_regs_trivial();
3762 allocated_without_spills = true;
3763 } else {
3764 allocated_without_spills = assign_regs(false);
3765 }
3766 if (allocated_without_spills)
3767 break;
3768 }
3769
3770 if (!allocated_without_spills) {
3771 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3772 "Vertex" : "Fragment";
3773
3774 /* We assume that any spilling is worse than just dropping back to
3775 * SIMD8.  There is likely some intermediate point where SIMD16 with a
3776 * few spills would still win, but we don't try to find it.
3777 */
3778 if (dispatch_width == 16) {
3779 fail("Failure to register allocate. Reduce number of "
3780 "live scalar values to avoid this.");
3781 } else {
3782 perf_debug("%s shader triggered register spilling. "
3783 "Try reducing the number of live scalar values to "
3784 "improve performance.\n", stage_name);
3785 }
3786
3787 /* Since we're out of heuristics, just go spill registers until we
3788 * get an allocation.
3789 */
3790 while (!assign_regs(true)) {
3791 if (failed)
3792 break;
3793 }
3794 }
3795
3796 /* This must come after all optimization and register allocation, since
3797 * it inserts dead code that happens to have side effects, and it does
3798 * so based on the actual physical registers in use.
3799 */
3800 insert_gen4_send_dependency_workarounds();
3801
3802 if (failed)
3803 return;
3804
3805 if (!allocated_without_spills)
3806 schedule_instructions(SCHEDULE_POST);
3807
3808 if (last_scratch > 0)
3809 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3810 }
3811
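/**
 * Generate, optimize, and register allocate code for a vertex shader.
 * Returns false on failure.
 */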
3812 bool
3813 fs_visitor::run_vs()
3814 {
3815 assert(stage == MESA_SHADER_VERTEX);
3816
3817 assign_common_binding_table_offsets(0);
3818 setup_vs_payload();
3819
3820 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3821 emit_shader_time_begin();
3822
3823 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3824 base_ir = ir;
3825 this->result = reg_undef;
3826 ir->accept(this);
3827 }
3828 base_ir = NULL;
3829 if (failed)
3830 return false;
3831
3832 emit_urb_writes();
3833
3834 calculate_cfg();
3835
3836 optimize();
3837
3838 assign_curb_setup();
3839 assign_vs_urb_setup();
3840
3841 fixup_3src_null_dest();
3842 allocate_registers();
3843
3844 return !failed;
3845 }
3846
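/**
 * Generate, optimize, and register allocate code for a fragment shader at
 * the current dispatch width.  Returns false on failure, e.g. so that a
 * failed SIMD16 attempt can fall back to SIMD8.
 */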
3847 bool
3848 fs_visitor::run_fs()
3849 {
3850 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3851 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3852
3853 assert(stage == MESA_SHADER_FRAGMENT);
3854
3855 sanity_param_count = prog->Parameters->NumParameters;
3856
3857 assign_binding_table_offsets();
3858
3859 if (brw->gen >= 6)
3860 setup_payload_gen6();
3861 else
3862 setup_payload_gen4();
3863
3864 if (0) {
3865 emit_dummy_fs();
3866 } else if (brw->use_rep_send && dispatch_width == 16) {
3867 emit_repclear_shader();
3868 } else {
3869 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3870 emit_shader_time_begin();
3871
3872 calculate_urb_setup();
3873 if (prog->InputsRead > 0) {
3874 if (brw->gen < 6)
3875 emit_interpolation_setup_gen4();
3876 else
3877 emit_interpolation_setup_gen6();
3878 }
3879
3880 /* We handle discards by keeping track of the still-live pixels in f0.1.
3881 * Initialize it with the dispatched pixels.
3882 */
3883 if (wm_prog_data->uses_kill) {
3884 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3885 discard_init->flag_subreg = 1;
3886 }
3887
3888 /* Generate FS IR for main().  (The visitor only descends into
3889 * functions called "main".)
3890 */
3891 if (shader) {
3892 if (getenv("INTEL_USE_NIR") != NULL) {
3893 emit_nir_code();
3894 } else {
3895 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3896 base_ir = ir;
3897 this->result = reg_undef;
3898 ir->accept(this);
3899 }
3900 }
3901 } else {
3902 emit_fragment_program_code();
3903 }
3904 base_ir = NULL;
3905 if (failed)
3906 return false;
3907
3908 emit(FS_OPCODE_PLACEHOLDER_HALT);
3909
3910 if (wm_key->alpha_test_func)
3911 emit_alpha_test();
3912
3913 emit_fb_writes();
3914
3915 calculate_cfg();
3916
3917 optimize();
3918
3919 assign_curb_setup();
3920 assign_urb_setup();
3921
3922 fixup_3src_null_dest();
3923 allocate_registers();
3924
3925 if (failed)
3926 return false;
3927 }
3928
3929 if (dispatch_width == 8)
3930 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3931 else
3932 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3933
3934 /* If any state parameters were appended, then ParameterValues could have
3935 * been realloced, in which case the driver uniform storage set up by
3936 * _mesa_associate_uniform_storage() would point to freed memory. Make
3937 * sure that didn't happen.
3938 */
3939 assert(sanity_param_count == prog->Parameters->NumParameters);
3940
3941 return !failed;
3942 }
3943
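/**
 * Compile a fragment shader to native code: always compile a SIMD8
 * program, additionally try SIMD16 on Gen5+ when the shader supports it,
 * and run whichever CFGs succeeded through the generator.  Returns the
 * assembly, or NULL if the SIMD8 compile failed.
 */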
3944 const unsigned *
3945 brw_wm_fs_emit(struct brw_context *brw,
3946 void *mem_ctx,
3947 const struct brw_wm_prog_key *key,
3948 struct brw_wm_prog_data *prog_data,
3949 struct gl_fragment_program *fp,
3950 struct gl_shader_program *prog,
3951 unsigned *final_assembly_size)
3952 {
3953 bool start_busy = false;
3954 double start_time = 0;
3955
3956 if (unlikely(brw->perf_debug)) {
3957 start_busy = (brw->batch.last_bo &&
3958 drm_intel_bo_busy(brw->batch.last_bo));
3959 start_time = get_time();
3960 }
3961
3962 struct brw_shader *shader = NULL;
3963 if (prog)
3964 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3965
3966 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3967 brw_dump_ir("fragment", prog, shader ? &shader->base : NULL, &fp->Base);
3968
3969 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3970 */
3971 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3972 if (!v.run_fs()) {
3973 if (prog) {
3974 prog->LinkStatus = false;
3975 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3976 }
3977
3978 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3979 v.fail_msg);
3980
3981 return NULL;
3982 }
3983
3984 cfg_t *simd16_cfg = NULL;
3985 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3986 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3987 brw->use_rep_send)) {
3988 if (!v.simd16_unsupported) {
3989 /* Try a SIMD16 compile */
3990 v2.import_uniforms(&v);
3991 if (!v2.run_fs()) {
3992 perf_debug("SIMD16 shader failed to compile, falling back to "
3993 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3994 } else {
3995 simd16_cfg = v2.cfg;
3996 }
3997 } else {
3998 perf_debug("SIMD16 shader unsupported, falling back to "
3999 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4000 }
4001 }
4002
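/* Only skip the SIMD8 program when SIMD8 has been explicitly disabled and
 * a SIMD16 program is available to take its place.
 */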
4003 cfg_t *simd8_cfg;
4004 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4005 if (no_simd8 && simd16_cfg) {
4006 simd8_cfg = NULL;
4007 prog_data->no_8 = true;
4008 } else {
4009 simd8_cfg = v.cfg;
4010 prog_data->no_8 = false;
4011 }
4012
4013 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4014 &fp->Base, v.runtime_check_aads_emit, "FS");
4015
4016 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4017 char *name;
4018 if (prog)
4019 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4020 prog->Label ? prog->Label : "unnamed",
4021 prog->Name);
4022 else
4023 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4024
4025 g.enable_debug(name);
4026 }
4027
4028 if (simd8_cfg)
4029 g.generate_code(simd8_cfg, 8);
4030 if (simd16_cfg)
4031 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4032
4033 if (unlikely(brw->perf_debug) && shader) {
4034 if (shader->compiled_once)
4035 brw_wm_debug_recompile(brw, prog, key);
4036 shader->compiled_once = true;
4037
4038 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4039 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4040 (get_time() - start_time) * 1000);
4041 }
4042 }
4043
4044 return g.get_assembly(final_assembly_size);
4045 }
4046
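/**
 * Precompile the fragment shader at link time, guessing a program key from
 * the information available then, so that a likely variant is already in
 * the program cache before the first draw.
 */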
4047 extern "C" bool
4048 brw_fs_precompile(struct gl_context *ctx,
4049 struct gl_shader_program *shader_prog,
4050 struct gl_program *prog)
4051 {
4052 struct brw_context *brw = brw_context(ctx);
4053 struct brw_wm_prog_key key;
4054
4055 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4056 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4057 bool program_uses_dfdy = fp->UsesDFdy;
4058
4059 memset(&key, 0, sizeof(key));
4060
4061 if (brw->gen < 6) {
4062 if (fp->UsesKill)
4063 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4064
4065 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4066 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4067
4068 /* Just assume depth testing. */
4069 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4070 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4071 }
4072
4073 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4074 BRW_FS_VARYING_INPUT_MASK) > 16)
4075 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4076
4077 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4078 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4079 for (unsigned i = 0; i < sampler_count; i++) {
4080 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4081 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4082 key.tex.swizzles[i] =
4083 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4084 } else {
4085 /* Color sampler: assume no swizzling. */
4086 key.tex.swizzles[i] = SWIZZLE_XYZW;
4087 }
4088 }
4089
4090 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4091 key.drawable_height = ctx->DrawBuffer->Height;
4092 }
4093
4094 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4095 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4096 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4097
4098 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4099 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4100 key.nr_color_regions > 1;
4101 }
4102
4103 key.program_string_id = bfp->id;
4104
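/* Save and restore the currently bound program state so the precompile
 * doesn't clobber it.
 */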
4105 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4106 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4107
4108 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
4109
4110 brw->wm.base.prog_offset = old_prog_offset;
4111 brw->wm.prog_data = old_prog_data;
4112
4113 return success;
4114 }