1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/hash_table.h"
43 #include "brw_context.h"
44 #include "brw_eu.h"
45 #include "brw_wm.h"
46 }
47 #include "brw_fs.h"
48 #include "brw_cfg.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53 #include "program/sampler.h"
54
55 void
56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
57 const fs_reg *src, unsigned sources)
58 {
59 memset(this, 0, sizeof(*this));
60
61 this->src = new fs_reg[MAX2(sources, 3)];
62 for (unsigned i = 0; i < sources; i++)
63 this->src[i] = src[i];
64
65 this->opcode = opcode;
66 this->dst = dst;
67 this->sources = sources;
68 this->exec_size = exec_size;
69
70 assert(dst.file != IMM && dst.file != UNIFORM);
71
72 /* If exec_size == 0, try to guess it from the registers. Since all
73 * manner of things may use hardware registers, we first try to guess
74 * based on the GRF destination or any GRF/ATTR sources. If that fails,
75 * we fall back to the width of the destination register.
76 */
77 if (this->exec_size == 0) {
78 if (dst.file == GRF) {
79 this->exec_size = dst.width;
80 } else {
81 for (unsigned i = 0; i < sources; ++i) {
82 if (src[i].file != GRF && src[i].file != ATTR)
83 continue;
84
85 if (this->exec_size <= 1)
86 this->exec_size = src[i].width;
87 assert(src[i].width == 1 || src[i].width == this->exec_size);
88 }
89 }
90
91 if (this->exec_size == 0 && dst.file != BAD_FILE)
92 this->exec_size = dst.width;
93 }
94 assert(this->exec_size != 0);
95
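/* Compute each source's effective width: scalar (width == 1) GRF, HW_REG,
 * and ATTR sources, as well as IMM and UNIFORM sources, are implicitly
 * broadcast to the instruction's execution width.
 */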
96 for (unsigned i = 0; i < sources; ++i) {
97 switch (this->src[i].file) {
98 case BAD_FILE:
99 this->src[i].effective_width = 8;
100 break;
101 case GRF:
102 case HW_REG:
103 case ATTR:
104 assert(this->src[i].width > 0);
105 if (this->src[i].width == 1) {
106 this->src[i].effective_width = this->exec_size;
107 } else {
108 this->src[i].effective_width = this->src[i].width;
109 }
110 break;
111 case IMM:
112 case UNIFORM:
113 this->src[i].effective_width = this->exec_size;
114 break;
115 default:
116 unreachable("Invalid source register file");
117 }
118 }
119 this->dst.effective_width = this->exec_size;
120
121 this->conditional_mod = BRW_CONDITIONAL_NONE;
122
123 /* This will be the case for almost all instructions. */
124 switch (dst.file) {
125 case GRF:
126 case HW_REG:
127 case MRF:
128 case ATTR:
129 this->regs_written =
130 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
131 break;
132 case BAD_FILE:
133 this->regs_written = 0;
134 break;
135 case IMM:
136 case UNIFORM:
137 unreachable("Invalid destination register file");
138 default:
139 unreachable("Invalid register file");
140 }
141
142 this->writes_accumulator = false;
143 }
144
145 fs_inst::fs_inst()
146 {
147 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
148 }
149
150 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
151 {
152 init(opcode, exec_size, reg_undef, NULL, 0);
153 }
154
155 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
156 {
157 init(opcode, 0, dst, NULL, 0);
158 }
159
160 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
161 const fs_reg &src0)
162 {
163 const fs_reg src[1] = { src0 };
164 init(opcode, exec_size, dst, src, 1);
165 }
166
167 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
168 {
169 const fs_reg src[1] = { src0 };
170 init(opcode, 0, dst, src, 1);
171 }
172
173 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
174 const fs_reg &src0, const fs_reg &src1)
175 {
176 const fs_reg src[2] = { src0, src1 };
177 init(opcode, exec_size, dst, src, 2);
178 }
179
180 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
181 const fs_reg &src1)
182 {
183 const fs_reg src[2] = { src0, src1 };
184 init(opcode, 0, dst, src, 2);
185 }
186
187 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
188 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
189 {
190 const fs_reg src[3] = { src0, src1, src2 };
191 init(opcode, exec_size, dst, src, 3);
192 }
193
194 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
195 const fs_reg &src1, const fs_reg &src2)
196 {
197 const fs_reg src[3] = { src0, src1, src2 };
198 init(opcode, 0, dst, src, 3);
199 }
200
201 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
202 const fs_reg src[], unsigned sources)
203 {
204 init(opcode, 0, dst, src, sources);
205 }
206
207 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
208 const fs_reg src[], unsigned sources)
209 {
210 init(opcode, exec_width, dst, src, sources);
211 }
212
213 fs_inst::fs_inst(const fs_inst &that)
214 {
215 memcpy(this, &that, sizeof(that));
216
217 this->src = new fs_reg[MAX2(that.sources, 3)];
218
219 for (unsigned i = 0; i < that.sources; i++)
220 this->src[i] = that.src[i];
221 }
222
223 fs_inst::~fs_inst()
224 {
225 delete[] this->src;
226 }
227
228 void
229 fs_inst::resize_sources(uint8_t num_sources)
230 {
231 if (this->sources != num_sources) {
232 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
233
234 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
235 src[i] = this->src[i];
236
237 delete[] this->src;
238 this->src = src;
239 this->sources = num_sources;
240 }
241 }
242
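/* Emitter helpers: each ALUn(op) macro defines an fs_visitor method that
 * allocates a new BRW_OPCODE_##op instruction with n sources; the _ACC
 * variants additionally mark the instruction as writing the accumulator.
 */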
243 #define ALU1(op) \
244 fs_inst * \
245 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
246 { \
247 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
248 }
249
250 #define ALU2(op) \
251 fs_inst * \
252 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
253 const fs_reg &src1) \
254 { \
255 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
256 }
257
258 #define ALU2_ACC(op) \
259 fs_inst * \
260 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
261 const fs_reg &src1) \
262 { \
263 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
264 inst->writes_accumulator = true; \
265 return inst; \
266 }
267
268 #define ALU3(op) \
269 fs_inst * \
270 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
271 const fs_reg &src1, const fs_reg &src2) \
272 { \
273 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
274 }
275
276 ALU1(NOT)
277 ALU1(MOV)
278 ALU1(FRC)
279 ALU1(RNDD)
280 ALU1(RNDE)
281 ALU1(RNDZ)
282 ALU2(ADD)
283 ALU2(MUL)
284 ALU2_ACC(MACH)
285 ALU2(AND)
286 ALU2(OR)
287 ALU2(XOR)
288 ALU2(SHL)
289 ALU2(SHR)
290 ALU2(ASR)
291 ALU3(LRP)
292 ALU1(BFREV)
293 ALU3(BFE)
294 ALU2(BFI1)
295 ALU3(BFI2)
296 ALU1(FBH)
297 ALU1(FBL)
298 ALU1(CBIT)
299 ALU3(MAD)
300 ALU2_ACC(ADDC)
301 ALU2_ACC(SUBB)
302 ALU2(SEL)
303 ALU2(MAC)
304
305 /** Gen4 predicated IF. */
306 fs_inst *
307 fs_visitor::IF(enum brw_predicate predicate)
308 {
309 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
310 inst->predicate = predicate;
311 return inst;
312 }
313
314 /** Gen6 IF with embedded comparison. */
315 fs_inst *
316 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
317 enum brw_conditional_mod condition)
318 {
319 assert(brw->gen == 6);
320 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
321 reg_null_d, src0, src1);
322 inst->conditional_mod = condition;
323 return inst;
324 }
325
326 /**
327 * CMP: Sets the low bit of the destination channels with the result
328 * of the comparison, while the upper bits are undefined, and updates
329 * the flag register with the packed 16 bits of the result.
330 */
331 fs_inst *
332 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
333 enum brw_conditional_mod condition)
334 {
335 fs_inst *inst;
336
337 /* Take the instruction:
338 *
339 * CMP null<d> src0<f> src1<f>
340 *
341 * Original gen4 does type conversion to the destination type before
342 * comparison, producing garbage results for floating point comparisons.
343 *
344 * The destination type doesn't matter on newer generations, so we set the
345 * type to match src0 so we can compact the instruction.
346 */
347 dst.type = src0.type;
348 if (dst.file == HW_REG)
349 dst.fixed_hw_reg.type = dst.type;
350
351 resolve_ud_negate(&src0);
352 resolve_ud_negate(&src1);
353
354 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
355 inst->conditional_mod = condition;
356
357 return inst;
358 }
359
360 fs_inst *
361 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
362 {
363 uint8_t exec_size = dst.width;
364 for (int i = 0; i < sources; ++i) {
365 assert(src[i].width % dst.width == 0);
366 if (src[i].width > exec_size)
367 exec_size = src[i].width;
368 }
369
370 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
371 dst, src, sources);
372 inst->regs_written = 0;
373 for (int i = 0; i < sources; ++i) {
374 /* The LOAD_PAYLOAD instruction only really makes sense if we are
375 * dealing with whole registers. If this ever changes, we can deal
376 * with it later.
377 */
378 int size = inst->src[i].effective_width * type_sz(src[i].type);
379 assert(size % 32 == 0);
380 inst->regs_written += (size + 31) / 32;
381 }
382
383 return inst;
384 }
385
386 exec_list
387 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
388 const fs_reg &surf_index,
389 const fs_reg &varying_offset,
390 uint32_t const_offset)
391 {
392 exec_list instructions;
393 fs_inst *inst;
394
395 /* We have our constant surface use a pitch of 4 bytes, so our index can
396 * be any component of a vector, and then we load 4 contiguous
397 * components starting from that.
398 *
399 * We break down the const_offset to a portion added to the variable
400 * offset and a portion done using reg_offset, which means that if you
401 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
402 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
403 * CSE can later notice that those loads are all the same and eliminate
404 * the redundant ones.
405 */
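/* (const_offset & ~3) is the vec4-aligned portion folded into the variable
 * offset here; the remaining component, (const_offset & 3), is applied via
 * offset() on the result below.
 */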
406 fs_reg vec4_offset = vgrf(glsl_type::int_type);
407 instructions.push_tail(ADD(vec4_offset,
408 varying_offset, fs_reg(const_offset & ~3)));
409
410 int scale = 1;
411 if (brw->gen == 4 && dst.width == 8) {
412 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
413 * u, v, r) as parameters, or we can just use the SIMD16 message
414 * consisting of (header, u). We choose the second, at the cost of a
415 * longer return length.
416 */
417 scale = 2;
418 }
419
420 enum opcode op;
421 if (brw->gen >= 7)
422 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
423 else
424 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
425
426 assert(dst.width % 8 == 0);
427 int regs_written = 4 * (dst.width / 8) * scale;
428 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
429 dst.type, dst.width);
430 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
431 inst->regs_written = regs_written;
432 instructions.push_tail(inst);
433
434 if (brw->gen < 7) {
435 inst->base_mrf = 13;
436 inst->header_present = true;
437 if (brw->gen == 4)
438 inst->mlen = 3;
439 else
440 inst->mlen = 1 + dispatch_width / 8;
441 }
442
443 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
444 instructions.push_tail(MOV(dst, result));
445
446 return instructions;
447 }
448
449 /**
450 * A helper for MOV generation for fixing up broken hardware SEND dependency
451 * handling.
452 */
453 fs_inst *
454 fs_visitor::DEP_RESOLVE_MOV(int grf)
455 {
456 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
457
458 inst->ir = NULL;
459 inst->annotation = "send dependency resolve";
460
461 /* The caller always wants this MOV uncompressed (SIMD8), to emit the
462 * minimal extra dependencies and to avoid having to align its regs to 2.
463 */
464 inst->exec_size = 8;
465
466 return inst;
467 }
468
469 bool
470 fs_inst::equals(fs_inst *inst) const
471 {
472 return (opcode == inst->opcode &&
473 dst.equals(inst->dst) &&
474 src[0].equals(inst->src[0]) &&
475 src[1].equals(inst->src[1]) &&
476 src[2].equals(inst->src[2]) &&
477 saturate == inst->saturate &&
478 predicate == inst->predicate &&
479 conditional_mod == inst->conditional_mod &&
480 mlen == inst->mlen &&
481 base_mrf == inst->base_mrf &&
482 target == inst->target &&
483 eot == inst->eot &&
484 header_present == inst->header_present &&
485 shadow_compare == inst->shadow_compare &&
486 exec_size == inst->exec_size &&
487 offset == inst->offset);
488 }
489
490 bool
491 fs_inst::overwrites_reg(const fs_reg &reg) const
492 {
493 return (reg.file == dst.file &&
494 reg.reg == dst.reg &&
495 reg.reg_offset >= dst.reg_offset &&
496 reg.reg_offset < dst.reg_offset + regs_written);
497 }
498
499 bool
500 fs_inst::is_send_from_grf() const
501 {
502 switch (opcode) {
503 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
504 case SHADER_OPCODE_SHADER_TIME_ADD:
505 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
506 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
507 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
508 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
509 case SHADER_OPCODE_UNTYPED_ATOMIC:
510 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
511 case SHADER_OPCODE_URB_WRITE_SIMD8:
512 return true;
513 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
514 return src[1].file == GRF;
515 case FS_OPCODE_FB_WRITE:
516 return src[0].file == GRF;
517 default:
518 if (is_tex())
519 return src[0].file == GRF;
520
521 return false;
522 }
523 }
524
525 bool
526 fs_inst::can_do_source_mods(struct brw_context *brw)
527 {
528 if (brw->gen == 6 && is_math())
529 return false;
530
531 if (is_send_from_grf())
532 return false;
533
534 if (!backend_instruction::can_do_source_mods())
535 return false;
536
537 return true;
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Vector float immediate value constructor. */
585 fs_reg::fs_reg(uint8_t vf[4])
586 {
587 init();
588 this->file = IMM;
589 this->type = BRW_REGISTER_TYPE_VF;
590 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
591 }
592
593 /** Vector float immediate value constructor. */
594 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
595 {
596 init();
597 this->file = IMM;
598 this->type = BRW_REGISTER_TYPE_VF;
599 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
600 (vf1 << 8) |
601 (vf2 << 16) |
602 (vf3 << 24);
603 }
604
605 /** Fixed brw_reg. */
606 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
607 {
608 init();
609 this->file = HW_REG;
610 this->fixed_hw_reg = fixed_hw_reg;
611 this->type = fixed_hw_reg.type;
612 this->width = 1 << fixed_hw_reg.width;
613 }
614
615 bool
616 fs_reg::equals(const fs_reg &r) const
617 {
618 return (file == r.file &&
619 reg == r.reg &&
620 reg_offset == r.reg_offset &&
621 subreg_offset == r.subreg_offset &&
622 type == r.type &&
623 negate == r.negate &&
624 abs == r.abs &&
625 !reladdr && !r.reladdr &&
626 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
627 width == r.width &&
628 stride == r.stride);
629 }
630
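/* Restrict the register to a single component: point subreg_offset at the
 * requested component and use stride 0 so every channel reads that component.
 */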
631 fs_reg &
632 fs_reg::set_smear(unsigned subreg)
633 {
634 assert(file != HW_REG && file != IMM);
635 subreg_offset = subreg * type_sz(type);
636 stride = 0;
637 return *this;
638 }
639
640 bool
641 fs_reg::is_contiguous() const
642 {
643 return stride == 1;
644 }
645
646 int
647 fs_visitor::type_size(const struct glsl_type *type)
648 {
649 unsigned int size, i;
650
651 switch (type->base_type) {
652 case GLSL_TYPE_UINT:
653 case GLSL_TYPE_INT:
654 case GLSL_TYPE_FLOAT:
655 case GLSL_TYPE_BOOL:
656 return type->components();
657 case GLSL_TYPE_ARRAY:
658 return type_size(type->fields.array) * type->length;
659 case GLSL_TYPE_STRUCT:
660 size = 0;
661 for (i = 0; i < type->length; i++) {
662 size += type_size(type->fields.structure[i].type);
663 }
664 return size;
665 case GLSL_TYPE_SAMPLER:
666 /* Samplers take up no register space, since they're baked in at
667 * link time.
668 */
669 return 0;
670 case GLSL_TYPE_ATOMIC_UINT:
671 return 0;
672 case GLSL_TYPE_IMAGE:
673 case GLSL_TYPE_VOID:
674 case GLSL_TYPE_ERROR:
675 case GLSL_TYPE_INTERFACE:
676 case GLSL_TYPE_DOUBLE:
677 unreachable("not reached");
678 }
679
680 return 0;
681 }
682
683 fs_reg
684 fs_visitor::get_timestamp()
685 {
686 assert(brw->gen >= 7);
687
688 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
689 BRW_ARF_TIMESTAMP,
690 0),
691 BRW_REGISTER_TYPE_UD));
692
693 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
694
695 fs_inst *mov = emit(MOV(dst, ts));
696 /* We want to read the 3 fields we care about even if it's not enabled in
697 * the dispatch.
698 */
699 mov->force_writemask_all = true;
700
701 /* The caller wants the low 32 bits of the timestamp. Since it's running
702 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
703 * which is plenty of time for our purposes. It is identical across the
704 * EUs, but since it's tracking GPU core speed it will increment at a
705 * varying rate as render P-states change.
706 *
707 * The caller could also check if render P-states have changed (or anything
708 * else that might disrupt timing) by setting smear to 2 and checking if
709 * that field is != 0.
710 */
711 dst.set_smear(0);
712
713 return dst;
714 }
715
716 void
717 fs_visitor::emit_shader_time_begin()
718 {
719 current_annotation = "shader time start";
720 shader_start_time = get_timestamp();
721 }
722
723 void
724 fs_visitor::emit_shader_time_end()
725 {
726 current_annotation = "shader time end";
727
728 enum shader_time_shader_type type, written_type, reset_type;
729 switch (stage) {
730 case MESA_SHADER_VERTEX:
731 type = ST_VS;
732 written_type = ST_VS_WRITTEN;
733 reset_type = ST_VS_RESET;
734 break;
735 case MESA_SHADER_GEOMETRY:
736 type = ST_GS;
737 written_type = ST_GS_WRITTEN;
738 reset_type = ST_GS_RESET;
739 break;
740 case MESA_SHADER_FRAGMENT:
741 if (dispatch_width == 8) {
742 type = ST_FS8;
743 written_type = ST_FS8_WRITTEN;
744 reset_type = ST_FS8_RESET;
745 } else {
746 assert(dispatch_width == 16);
747 type = ST_FS16;
748 written_type = ST_FS16_WRITTEN;
749 reset_type = ST_FS16_RESET;
750 }
751 break;
752 default:
753 unreachable("fs_visitor::emit_shader_time_end missing code");
754 }
755
756 fs_reg shader_end_time = get_timestamp();
757
758 /* Check that there weren't any timestamp reset events (assuming these
759 * were the only two timestamp reads that happened).
760 */
761 fs_reg reset = shader_end_time;
762 reset.set_smear(2);
763 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
764 test->conditional_mod = BRW_CONDITIONAL_Z;
765 emit(IF(BRW_PREDICATE_NORMAL));
766
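/* Compute diff = shader_end_time - shader_start_time by negating the start
 * timestamp and adding it to the end timestamp.
 */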
767 fs_reg start = shader_start_time;
768 start.negate = true;
769 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
770 emit(ADD(diff, start, shader_end_time));
771
772 /* If there were no instructions between the two timestamp gets, the diff
773 * is 2 cycles. Remove that overhead, so I can forget about that when
774 * trying to determine the time taken for single instructions.
775 */
776 emit(ADD(diff, diff, fs_reg(-2u)));
777
778 emit_shader_time_write(type, diff);
779 emit_shader_time_write(written_type, fs_reg(1u));
780 emit(BRW_OPCODE_ELSE);
781 emit_shader_time_write(reset_type, fs_reg(1u));
782 emit(BRW_OPCODE_ENDIF);
783 }
784
785 void
786 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
787 fs_reg value)
788 {
789 int shader_time_index =
790 brw_get_shader_time_index(brw, shader_prog, prog, type);
791 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
792
793 fs_reg payload;
794 if (dispatch_width == 8)
795 payload = vgrf(glsl_type::uvec2_type);
796 else
797 payload = vgrf(glsl_type::uint_type);
798
799 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
800 fs_reg(), payload, offset, value));
801 }
802
803 void
804 fs_visitor::vfail(const char *format, va_list va)
805 {
806 char *msg;
807
808 if (failed)
809 return;
810
811 failed = true;
812
813 msg = ralloc_vasprintf(mem_ctx, format, va);
814 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
815
816 this->fail_msg = msg;
817
818 if (debug_enabled) {
819 fprintf(stderr, "%s", msg);
820 }
821 }
822
823 void
824 fs_visitor::fail(const char *format, ...)
825 {
826 va_list va;
827
828 va_start(va, format);
829 vfail(format, va);
830 va_end(va);
831 }
832
833 /**
834 * Mark this program as impossible to compile in SIMD16 mode.
835 *
836 * During the SIMD8 compile (which happens first), we can detect and flag
837 * things that are unsupported in SIMD16 mode, so the compiler can skip
838 * the SIMD16 compile altogether.
839 *
840 * During a SIMD16 compile (if one happens anyway), this just calls fail().
841 */
842 void
843 fs_visitor::no16(const char *format, ...)
844 {
845 va_list va;
846
847 va_start(va, format);
848
849 if (dispatch_width == 16) {
850 vfail(format, va);
851 } else {
852 simd16_unsupported = true;
853
854 if (brw->perf_debug) {
855 if (no16_msg)
856 ralloc_vasprintf_append(&no16_msg, format, va);
857 else
858 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
859 }
860 }
861
862 va_end(va);
863 }
864
865 fs_inst *
866 fs_visitor::emit(enum opcode opcode)
867 {
868 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
869 }
870
871 fs_inst *
872 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
873 {
874 return emit(new(mem_ctx) fs_inst(opcode, dst));
875 }
876
877 fs_inst *
878 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
879 {
880 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
881 }
882
883 fs_inst *
884 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
885 const fs_reg &src1)
886 {
887 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
888 }
889
890 fs_inst *
891 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
892 const fs_reg &src1, const fs_reg &src2)
893 {
894 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
895 }
896
897 fs_inst *
898 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
899 fs_reg src[], int sources)
900 {
901 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
902 }
903
904 /**
905 * Returns true if the instruction has a flag that means it won't
906 * update an entire destination register.
907 *
908 * For example, dead code elimination and live variable analysis want to know
909 * when a write to a variable screens off any preceding values that were in
910 * it.
911 */
912 bool
913 fs_inst::is_partial_write() const
914 {
915 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
916 (this->dst.width * type_sz(this->dst.type)) < 32 ||
917 !this->dst.is_contiguous());
918 }
919
920 int
921 fs_inst::regs_read(int arg) const
922 {
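/* For send-like opcodes whose message payload is built in the GRF, the
 * payload argument (src[0]) covers mlen whole registers.
 */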
923 if (is_tex() && arg == 0 && src[0].file == GRF) {
924 return mlen;
925 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
926 return mlen;
927 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
928 return mlen;
929 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
930 return mlen;
931 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
932 return mlen;
933 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
934 return mlen;
935 }
936
937 switch (src[arg].file) {
938 case BAD_FILE:
939 case UNIFORM:
940 case IMM:
941 return 1;
942 case GRF:
943 case HW_REG:
944 if (src[arg].stride == 0) {
945 return 1;
946 } else {
947 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
948 return (size + 31) / 32;
949 }
950 case MRF:
951 unreachable("MRF registers are not allowed as sources");
952 default:
953 unreachable("Invalid register file");
954 }
955 }
956
957 bool
958 fs_inst::reads_flag() const
959 {
960 return predicate;
961 }
962
963 bool
964 fs_inst::writes_flag() const
965 {
966 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
967 opcode != BRW_OPCODE_IF &&
968 opcode != BRW_OPCODE_WHILE)) ||
969 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
970 }
971
972 /**
973 * Returns how many MRFs an FS opcode will write over.
974 *
975 * Note that this is not the 0 or 1 implied writes in an actual gen
976 * instruction -- the FS opcodes often generate MOVs in addition.
977 */
978 int
979 fs_visitor::implied_mrf_writes(fs_inst *inst)
980 {
981 if (inst->mlen == 0)
982 return 0;
983
984 if (inst->base_mrf == -1)
985 return 0;
986
987 switch (inst->opcode) {
988 case SHADER_OPCODE_RCP:
989 case SHADER_OPCODE_RSQ:
990 case SHADER_OPCODE_SQRT:
991 case SHADER_OPCODE_EXP2:
992 case SHADER_OPCODE_LOG2:
993 case SHADER_OPCODE_SIN:
994 case SHADER_OPCODE_COS:
995 return 1 * dispatch_width / 8;
996 case SHADER_OPCODE_POW:
997 case SHADER_OPCODE_INT_QUOTIENT:
998 case SHADER_OPCODE_INT_REMAINDER:
999 return 2 * dispatch_width / 8;
1000 case SHADER_OPCODE_TEX:
1001 case FS_OPCODE_TXB:
1002 case SHADER_OPCODE_TXD:
1003 case SHADER_OPCODE_TXF:
1004 case SHADER_OPCODE_TXF_CMS:
1005 case SHADER_OPCODE_TXF_MCS:
1006 case SHADER_OPCODE_TG4:
1007 case SHADER_OPCODE_TG4_OFFSET:
1008 case SHADER_OPCODE_TXL:
1009 case SHADER_OPCODE_TXS:
1010 case SHADER_OPCODE_LOD:
1011 return 1;
1012 case FS_OPCODE_FB_WRITE:
1013 return 2;
1014 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1015 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1016 return 1;
1017 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1018 return inst->mlen;
1019 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1020 return 2;
1021 case SHADER_OPCODE_UNTYPED_ATOMIC:
1022 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1023 case SHADER_OPCODE_URB_WRITE_SIMD8:
1024 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1025 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1026 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1027 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1028 return 0;
1029 default:
1030 unreachable("not reached");
1031 }
1032 }
1033
1034 fs_reg
1035 fs_visitor::vgrf(const glsl_type *const type)
1036 {
1037 int reg_width = dispatch_width / 8;
1038 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1039 brw_type_for_base_type(type), dispatch_width);
1040 }
1041
1042 fs_reg
1043 fs_visitor::vgrf(int num_components)
1044 {
1045 int reg_width = dispatch_width / 8;
1046 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1047 BRW_REGISTER_TYPE_F, dispatch_width);
1048 }
1049
1050 /** Fixed HW reg constructor. */
1051 fs_reg::fs_reg(enum register_file file, int reg)
1052 {
1053 init();
1054 this->file = file;
1055 this->reg = reg;
1056 this->type = BRW_REGISTER_TYPE_F;
1057
1058 switch (file) {
1059 case UNIFORM:
1060 this->width = 1;
1061 break;
1062 default:
1063 this->width = 8;
1064 }
1065 }
1066
1067 /** Fixed HW reg constructor. */
1068 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1069 {
1070 init();
1071 this->file = file;
1072 this->reg = reg;
1073 this->type = type;
1074
1075 switch (file) {
1076 case UNIFORM:
1077 this->width = 1;
1078 break;
1079 default:
1080 this->width = 8;
1081 }
1082 }
1083
1084 /** Fixed HW reg constructor. */
1085 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1086 uint8_t width)
1087 {
1088 init();
1089 this->file = file;
1090 this->reg = reg;
1091 this->type = type;
1092 this->width = width;
1093 }
1094
1095 fs_reg *
1096 fs_visitor::variable_storage(ir_variable *var)
1097 {
1098 return (fs_reg *)hash_table_find(this->variable_ht, var);
1099 }
1100
1101 void
1102 import_uniforms_callback(const void *key,
1103 void *data,
1104 void *closure)
1105 {
1106 struct hash_table *dst_ht = (struct hash_table *)closure;
1107 const fs_reg *reg = (const fs_reg *)data;
1108
1109 if (reg->file != UNIFORM)
1110 return;
1111
1112 hash_table_insert(dst_ht, data, key);
1113 }
1114
1115 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1116 * This brings in those uniform definitions.
1117 */
1118 void
1119 fs_visitor::import_uniforms(fs_visitor *v)
1120 {
1121 hash_table_call_foreach(v->variable_ht,
1122 import_uniforms_callback,
1123 variable_ht);
1124 this->push_constant_loc = v->push_constant_loc;
1125 this->pull_constant_loc = v->pull_constant_loc;
1126 this->uniforms = v->uniforms;
1127 this->param_size = v->param_size;
1128 }
1129
1130 /* Our support for uniforms is piggy-backed on the struct
1131 * gl_fragment_program, because that's where the values actually
1132 * get stored, rather than in some global gl_shader_program uniform
1133 * store.
1134 */
1135 void
1136 fs_visitor::setup_uniform_values(ir_variable *ir)
1137 {
1138 int namelen = strlen(ir->name);
1139
1140 /* The data for our (non-builtin) uniforms is stored in a series of
1141 * gl_uniform_driver_storage structs for each subcomponent that
1142 * glGetUniformLocation() could name. We know it's been set up in the same
1143 * order we'd walk the type, so walk the list of storage and find anything
1144 * with our name, or any component of it (our name followed by '.' or '[').
1145 */
1146 unsigned params_before = uniforms;
1147 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1148 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1149
1150 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1151 (storage->name[namelen] != 0 &&
1152 storage->name[namelen] != '.' &&
1153 storage->name[namelen] != '[')) {
1154 continue;
1155 }
1156
1157 unsigned slots = storage->type->component_slots();
1158 if (storage->array_elements)
1159 slots *= storage->array_elements;
1160
1161 for (unsigned i = 0; i < slots; i++) {
1162 stage_prog_data->param[uniforms++] = &storage->storage[i];
1163 }
1164 }
1165
1166 /* Make sure we actually initialized the right amount of stuff here. */
1167 assert(params_before + ir->type->component_slots() == uniforms);
1168 (void)params_before;
1169 }
1170
1171
1172 /* Our support for builtin uniforms is even scarier than non-builtin.
1173 * It sits on top of the PROG_STATE_VAR parameters that are
1174 * automatically updated from GL context state.
1175 */
1176 void
1177 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1178 {
1179 const ir_state_slot *const slots = ir->get_state_slots();
1180 assert(slots != NULL);
1181
1182 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1183 /* This state reference has already been setup by ir_to_mesa, but we'll
1184 * get the same index back here.
1185 */
1186 int index = _mesa_add_state_reference(this->prog->Parameters,
1187 (gl_state_index *)slots[i].tokens);
1188
1189 /* Add each of the unique swizzles of the element as a parameter.
1190 * This'll end up matching the expected layout of the
1191 * array/matrix/structure we're trying to fill in.
1192 */
1193 int last_swiz = -1;
1194 for (unsigned int j = 0; j < 4; j++) {
1195 int swiz = GET_SWZ(slots[i].swizzle, j);
1196 if (swiz == last_swiz)
1197 break;
1198 last_swiz = swiz;
1199
1200 stage_prog_data->param[uniforms++] =
1201 &prog->Parameters->ParameterValues[index][swiz];
1202 }
1203 }
1204 }
1205
1206 fs_reg *
1207 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1208 bool origin_upper_left)
1209 {
1210 assert(stage == MESA_SHADER_FRAGMENT);
1211 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1212 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1213 fs_reg wpos = *reg;
1214 bool flip = !origin_upper_left ^ key->render_to_fbo;
1215
1216 /* gl_FragCoord.x */
1217 if (pixel_center_integer) {
1218 emit(MOV(wpos, this->pixel_x));
1219 } else {
1220 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1221 }
1222 wpos = offset(wpos, 1);
1223
1224 /* gl_FragCoord.y */
1225 if (!flip && pixel_center_integer) {
1226 emit(MOV(wpos, this->pixel_y));
1227 } else {
1228 fs_reg pixel_y = this->pixel_y;
1229 float offset = (pixel_center_integer ? 0.0 : 0.5);
1230
1231 if (flip) {
1232 pixel_y.negate = true;
1233 offset += key->drawable_height - 1.0;
1234 }
1235
1236 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1237 }
1238 wpos = offset(wpos, 1);
1239
1240 /* gl_FragCoord.z */
1241 if (brw->gen >= 6) {
1242 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1243 } else {
1244 emit(FS_OPCODE_LINTERP, wpos,
1245 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1246 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1247 interp_reg(VARYING_SLOT_POS, 2));
1248 }
1249 wpos = offset(wpos, 1);
1250
1251 /* gl_FragCoord.w: Already set up in emit_interpolation */
1252 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1253
1254 return reg;
1255 }
1256
1257 fs_inst *
1258 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1259 glsl_interp_qualifier interpolation_mode,
1260 bool is_centroid, bool is_sample)
1261 {
1262 brw_wm_barycentric_interp_mode barycoord_mode;
1263 if (brw->gen >= 6) {
1264 if (is_centroid) {
1265 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1266 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1267 else
1268 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1269 } else if (is_sample) {
1270 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1271 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1272 else
1273 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1274 } else {
1275 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1276 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1277 else
1278 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1279 }
1280 } else {
1281 /* On Ironlake and below, there is only one interpolation mode.
1282 * Centroid interpolation doesn't mean anything on this hardware --
1283 * there is no multisampling.
1284 */
1285 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1286 }
1287 return emit(FS_OPCODE_LINTERP, attr,
1288 this->delta_x[barycoord_mode],
1289 this->delta_y[barycoord_mode], interp);
1290 }
1291
1292 void
1293 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1294 const glsl_type *type,
1295 glsl_interp_qualifier interpolation_mode,
1296 int location, bool mod_centroid,
1297 bool mod_sample)
1298 {
1299 attr.type = brw_type_for_base_type(type->get_scalar_type());
1300
1301 assert(stage == MESA_SHADER_FRAGMENT);
1302 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1303 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1304
1305 unsigned int array_elements;
1306
1307 if (type->is_array()) {
1308 array_elements = type->length;
1309 if (array_elements == 0) {
1310 fail("dereferenced array '%s' has length 0\n", name);
1311 }
1312 type = type->fields.array;
1313 } else {
1314 array_elements = 1;
1315 }
1316
1317 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1318 bool is_gl_Color =
1319 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1320 if (key->flat_shade && is_gl_Color) {
1321 interpolation_mode = INTERP_QUALIFIER_FLAT;
1322 } else {
1323 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1324 }
1325 }
1326
1327 for (unsigned int i = 0; i < array_elements; i++) {
1328 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1329 if (prog_data->urb_setup[location] == -1) {
1330 /* If there's no incoming setup data for this slot, don't
1331 * emit interpolation for it.
1332 */
1333 attr = offset(attr, type->vector_elements);
1334 location++;
1335 continue;
1336 }
1337
1338 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1339 /* Constant interpolation (flat shading) case. The SF has
1340 * handed us defined values in only the constant offset
1341 * field of the setup reg.
1342 */
1343 for (unsigned int k = 0; k < type->vector_elements; k++) {
1344 struct brw_reg interp = interp_reg(location, k);
1345 interp = suboffset(interp, 3);
1346 interp.type = attr.type;
1347 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1348 attr = offset(attr, 1);
1349 }
1350 } else {
1351 /* Smooth/noperspective interpolation case. */
1352 for (unsigned int k = 0; k < type->vector_elements; k++) {
1353 struct brw_reg interp = interp_reg(location, k);
1354 if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1355 /* Get the pixel/sample mask into f0 so that we know
1356 * which pixels are lit. Then, for each channel that is
1357 * unlit, replace the centroid data with non-centroid
1358 * data.
1359 */
1360 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1361
1362 fs_inst *inst;
1363 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1364 false, false);
1365 inst->predicate = BRW_PREDICATE_NORMAL;
1366 inst->predicate_inverse = true;
1367 if (brw->has_pln)
1368 inst->no_dd_clear = true;
1369
1370 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1371 mod_centroid && !key->persample_shading,
1372 mod_sample || key->persample_shading);
1373 inst->predicate = BRW_PREDICATE_NORMAL;
1374 inst->predicate_inverse = false;
1375 if (brw->has_pln)
1376 inst->no_dd_check = true;
1377
1378 } else {
1379 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1380 mod_centroid && !key->persample_shading,
1381 mod_sample || key->persample_shading);
1382 }
1383 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1384 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1385 }
1386 attr = offset(attr, 1);
1387 }
1388
1389 }
1390 location++;
1391 }
1392 }
1393 }
1394
1395 fs_reg *
1396 fs_visitor::emit_frontfacing_interpolation()
1397 {
1398 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1399
1400 if (brw->gen >= 6) {
1401 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1402 * a boolean result from this (~0/true or 0/false).
1403 *
1404 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1405 * this task in only one instruction:
1406 * - a negation source modifier will flip the bit; and
1407 * - a W -> D type conversion will sign extend the bit into the high
1408 * word of the destination.
1409 *
1410 * An ASR 15 fills the low word of the destination.
1411 */
1412 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1413 g0.negate = true;
1414
1415 emit(ASR(*reg, g0, fs_reg(15)));
1416 } else {
1417 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1418 * a boolean result from this (1/true or 0/false).
1419 *
1420 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1421 * the negation source modifier to flip it. Unfortunately the SHR
1422 * instruction only operates on UD (or D with an abs source modifier)
1423 * sources without negation.
1424 *
1425 * Instead, use ASR (which will give ~0/true or 0/false).
1426 */
1427 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1428 g1_6.negate = true;
1429
1430 emit(ASR(*reg, g1_6, fs_reg(31)));
1431 }
1432
1433 return reg;
1434 }
1435
1436 void
1437 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1438 {
1439 assert(stage == MESA_SHADER_FRAGMENT);
1440 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1441 assert(dst.type == BRW_REGISTER_TYPE_F);
1442
1443 if (key->compute_pos_offset) {
1444 /* Convert int_sample_pos to floating point */
1445 emit(MOV(dst, int_sample_pos));
1446 /* Scale to the range [0, 1] */
1447 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1448 }
1449 else {
1450 /* From the ARB_sample_shading specification:
1451 * "When rendering to a non-multisample buffer, or if multisample
1452 * rasterization is disabled, gl_SamplePosition will always be
1453 * (0.5, 0.5)."
1454 */
1455 emit(MOV(dst, fs_reg(0.5f)));
1456 }
1457 }
1458
1459 fs_reg *
1460 fs_visitor::emit_samplepos_setup()
1461 {
1462 assert(brw->gen >= 6);
1463
1464 this->current_annotation = "compute sample position";
1465 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1466 fs_reg pos = *reg;
1467 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1468 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1469
1470 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1471 * mode will be enabled.
1472 *
1473 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1474 * R31.1:0 Position Offset X/Y for Slot[3:0]
1475 * R31.3:2 Position Offset X/Y for Slot[7:4]
1476 * .....
1477 *
1478 * The X, Y sample positions come in as bytes in thread payload. So, read
1479 * the positions using vstride=16, width=8, hstride=2.
1480 */
1481 struct brw_reg sample_pos_reg =
1482 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1483 BRW_REGISTER_TYPE_B), 16, 8, 2);
1484
1485 if (dispatch_width == 8) {
1486 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1487 } else {
1488 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1489 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1490 ->force_sechalf = true;
1491 }
1492 /* Compute gl_SamplePosition.x */
1493 compute_sample_position(pos, int_sample_x);
1494 pos = offset(pos, 1);
1495 if (dispatch_width == 8) {
1496 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1497 } else {
1498 emit(MOV(half(int_sample_y, 0),
1499 fs_reg(suboffset(sample_pos_reg, 1))));
1500 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1501 ->force_sechalf = true;
1502 }
1503 /* Compute gl_SamplePosition.y */
1504 compute_sample_position(pos, int_sample_y);
1505 return reg;
1506 }
1507
1508 fs_reg *
1509 fs_visitor::emit_sampleid_setup()
1510 {
1511 assert(stage == MESA_SHADER_FRAGMENT);
1512 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1513 assert(brw->gen >= 6);
1514
1515 this->current_annotation = "compute sample id";
1516 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1517
1518 if (key->compute_sample_id) {
1519 fs_reg t1 = vgrf(glsl_type::int_type);
1520 fs_reg t2 = vgrf(glsl_type::int_type);
1521 t2.type = BRW_REGISTER_TYPE_UW;
1522
1523 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1524 * 8x multisampling, subspan 0 will represent sample N (where N
1525 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1526 * 7. We can find the value of N by looking at R0.0 bits 7:6
1527 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1528 * (since samples are always delivered in pairs). That is, we
1529 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1530 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1531 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1532 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1533 * populating a temporary variable with the sequence (0, 1, 2, 3),
1534 * and then reading from it using vstride=1, width=4, hstride=0.
1535 * These computations hold good for 4x multisampling as well.
1536 *
1537 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1538 * the first four slots are sample 0 of subspan 0; the next four
1539 * are sample 1 of subspan 0; the third group is sample 0 of
1540 * subspan 1, and finally sample 1 of subspan 1.
1541 */
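/* For example, with 8x MSAA in SIMD8: if SSPI == 2, then
 * (R0.0 & 0xc0) >> 5 == 4, and adding (0, 0, 0, 0, 1, 1, 1, 1) yields
 * sample IDs 4, 4, 4, 4, 5, 5, 5, 5 for the two subspans.
 */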
1542 fs_inst *inst;
1543 inst = emit(BRW_OPCODE_AND, t1,
1544 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1545 fs_reg(0xc0));
1546 inst->force_writemask_all = true;
1547 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1548 inst->force_writemask_all = true;
1549 /* This works for both SIMD8 and SIMD16 */
1550 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1551 inst->force_writemask_all = true;
1552 /* This special instruction takes care of setting vstride=1,
1553 * width=4, hstride=0 of t2 during an ADD instruction.
1554 */
1555 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1556 } else {
1557 /* As per GL_ARB_sample_shading specification:
1558 * "When rendering to a non-multisample buffer, or if multisample
1559 * rasterization is disabled, gl_SampleID will always be zero."
1560 */
1561 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1562 }
1563
1564 return reg;
1565 }
1566
1567 fs_reg
1568 fs_visitor::fix_math_operand(fs_reg src)
1569 {
1570 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1571 * might be able to do better by doing execsize = 1 math and then
1572 * expanding that result out, but we would need to be careful with
1573 * masking.
1574 *
1575 * The hardware ignores source modifiers (negate and abs) on math
1576 * instructions, so we also move to a temp to set those up.
1577 */
1578 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1579 !src.abs && !src.negate)
1580 return src;
1581
1582 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1583 * operands with math instructions.
1584 */
1585 if (brw->gen >= 7 && src.file != IMM)
1586 return src;
1587
1588 fs_reg expanded = vgrf(glsl_type::float_type);
1589 expanded.type = src.type;
1590 emit(BRW_OPCODE_MOV, expanded, src);
1591 return expanded;
1592 }
1593
1594 fs_inst *
1595 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1596 {
1597 switch (opcode) {
1598 case SHADER_OPCODE_RCP:
1599 case SHADER_OPCODE_RSQ:
1600 case SHADER_OPCODE_SQRT:
1601 case SHADER_OPCODE_EXP2:
1602 case SHADER_OPCODE_LOG2:
1603 case SHADER_OPCODE_SIN:
1604 case SHADER_OPCODE_COS:
1605 break;
1606 default:
1607 unreachable("not reached: bad math opcode");
1608 }
1609
1610 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1611 * might be able to do better by doing execsize = 1 math and then
1612 * expanding that result out, but we would need to be careful with
1613 * masking.
1614 *
1615 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1616 * instructions, so we also move to a temp to set those up.
1617 */
1618 if (brw->gen == 6 || brw->gen == 7)
1619 src = fix_math_operand(src);
1620
1621 fs_inst *inst = emit(opcode, dst, src);
1622
1623 if (brw->gen < 6) {
1624 inst->base_mrf = 2;
1625 inst->mlen = dispatch_width / 8;
1626 }
1627
1628 return inst;
1629 }
1630
1631 fs_inst *
1632 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1633 {
1634 int base_mrf = 2;
1635 fs_inst *inst;
1636
1637 if (brw->gen >= 8) {
1638 inst = emit(opcode, dst, src0, src1);
1639 } else if (brw->gen >= 6) {
1640 src0 = fix_math_operand(src0);
1641 src1 = fix_math_operand(src1);
1642
1643 inst = emit(opcode, dst, src0, src1);
1644 } else {
1645 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1646 * "Message Payload":
1647 *
1648 * "Operand0[7]. For the INT DIV functions, this operand is the
1649 * denominator."
1650 * ...
1651 * "Operand1[7]. For the INT DIV functions, this operand is the
1652 * numerator."
1653 */
1654 bool is_int_div = opcode != SHADER_OPCODE_POW;
1655 fs_reg &op0 = is_int_div ? src1 : src0;
1656 fs_reg &op1 = is_int_div ? src0 : src1;
1657
1658 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1659 inst = emit(opcode, dst, op0, reg_null_f);
1660
1661 inst->base_mrf = base_mrf;
1662 inst->mlen = 2 * dispatch_width / 8;
1663 }
1664 return inst;
1665 }
1666
1667 void
1668 fs_visitor::assign_curb_setup()
1669 {
1670 if (dispatch_width == 8) {
1671 prog_data->dispatch_grf_start_reg = payload.num_regs;
1672 } else {
1673 assert(stage == MESA_SHADER_FRAGMENT);
1674 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1675 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1676 }
1677
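/* Each CURBE register holds 8 constants, so round nr_params up to a
 * multiple of 8 and express the read length in whole registers.
 */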
1678 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1679
1680 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1681 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1682 for (unsigned int i = 0; i < inst->sources; i++) {
1683 if (inst->src[i].file == UNIFORM) {
1684 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1685 int constant_nr;
1686 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1687 constant_nr = push_constant_loc[uniform_nr];
1688 } else {
1689 /* Section 5.11 of the OpenGL 4.1 spec says:
1690 * "Out-of-bounds reads return undefined values, which include
1691 * values from other variables of the active program or zero."
1692 * Just return the first push constant.
1693 */
1694 constant_nr = 0;
1695 }
1696
1697 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1698 constant_nr / 8,
1699 constant_nr % 8);
1700
1701 inst->src[i].file = HW_REG;
1702 inst->src[i].fixed_hw_reg = byte_offset(
1703 retype(brw_reg, inst->src[i].type),
1704 inst->src[i].subreg_offset);
1705 }
1706 }
1707 }
1708 }
1709
1710 void
1711 fs_visitor::calculate_urb_setup()
1712 {
1713 assert(stage == MESA_SHADER_FRAGMENT);
1714 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1715 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1716
1717 memset(prog_data->urb_setup, -1,
1718 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1719
1720 int urb_next = 0;
1721 /* Figure out where each of the incoming setup attributes lands. */
1722 if (brw->gen >= 6) {
1723 if (_mesa_bitcount_64(prog->InputsRead &
1724 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1725 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1726 * first 16 varying inputs, so we can put them wherever we want.
1727 * Just put them in order.
1728 *
1729 * This is useful because it means that (a) inputs not used by the
1730 * fragment shader won't take up valuable register space, and (b) we
1731 * won't have to recompile the fragment shader if it gets paired with
1732 * a different vertex (or geometry) shader.
1733 */
1734 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1735 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1736 BITFIELD64_BIT(i)) {
1737 prog_data->urb_setup[i] = urb_next++;
1738 }
1739 }
1740 } else {
1741 /* We have enough input varyings that the SF/SBE pipeline stage can't
1742 * arbitrarily rearrange them to suit our whim; we have to put them
1743 * in an order that matches the output of the previous pipeline stage
1744 * (geometry or vertex shader).
1745 */
1746 struct brw_vue_map prev_stage_vue_map;
1747 brw_compute_vue_map(brw, &prev_stage_vue_map,
1748 key->input_slots_valid);
1749 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1750 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1751 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1752 slot++) {
1753 int varying = prev_stage_vue_map.slot_to_varying[slot];
1754 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1755 * unused.
1756 */
1757 if (varying != BRW_VARYING_SLOT_COUNT &&
1758 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1759 BITFIELD64_BIT(varying))) {
1760 prog_data->urb_setup[varying] = slot - first_slot;
1761 }
1762 }
1763 urb_next = prev_stage_vue_map.num_slots - first_slot;
1764 }
1765 } else {
1766 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1767 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1768 /* Point size is packed into the header, not as a general attribute */
1769 if (i == VARYING_SLOT_PSIZ)
1770 continue;
1771
1772 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1773 /* The back color slot is skipped when the front color is
1774 * also written to. In addition, some slots can be
1775 * written in the vertex shader and not read in the
1776 * fragment shader. So the register number must always be
1777 * incremented, mapped or not.
1778 */
1779 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1780 prog_data->urb_setup[i] = urb_next;
1781 urb_next++;
1782 }
1783 }
1784
1785 /*
1786 * It's an FS-only attribute, and we did the interpolation for this
1787 * attribute in the SF thread. So, count it here, too.
1788 *
1789 * See compile_sf_prog() for more info.
1790 */
1791 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1792 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1793 }
1794
1795 prog_data->num_varying_inputs = urb_next;
1796 }
1797
1798 void
1799 fs_visitor::assign_urb_setup()
1800 {
1801 assert(stage == MESA_SHADER_FRAGMENT);
1802 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1803
1804 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1805
1806 /* Offset all the urb_setup[] index by the actual position of the
1807 * setup regs, now that the location of the constants has been chosen.
1808 */
1809 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1810 if (inst->opcode == FS_OPCODE_LINTERP) {
1811 assert(inst->src[2].file == HW_REG);
1812 inst->src[2].fixed_hw_reg.nr += urb_start;
1813 }
1814
1815 if (inst->opcode == FS_OPCODE_CINTERP) {
1816 assert(inst->src[0].file == HW_REG);
1817 inst->src[0].fixed_hw_reg.nr += urb_start;
1818 }
1819 }
1820
1821 /* Each attribute is 4 setup channels, each of which is half a reg. */
1822 this->first_non_payload_grf =
1823 urb_start + prog_data->num_varying_inputs * 2;
1824 }
1825
1826 void
1827 fs_visitor::assign_vs_urb_setup()
1828 {
1829 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1830 int grf, count, slot, channel, attr;
1831
1832 assert(stage == MESA_SHADER_VERTEX);
1833 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1834 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1835 count++;
1836
1837 /* Each attribute is 4 regs. */
1838 this->first_non_payload_grf =
1839 payload.num_regs + prog_data->curb_read_length + count * 4;
1840
1841 unsigned vue_entries =
1842 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1843
1844 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1845 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1846
1847 assert(vs_prog_data->base.urb_read_length <= 15);
1848
1849 /* Rewrite all ATTR file references to the hw grf that they land in. */
1850 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1851 for (int i = 0; i < inst->sources; i++) {
1852 if (inst->src[i].file == ATTR) {
1853
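/* An ATTR reg of VERT_ATTRIB_MAX refers to the extra slot counted above
 * when gl_VertexID/gl_InstanceID is in use, which lands after all enabled
 * attributes.
 */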
1854 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1855 slot = count - 1;
1856 } else {
1857 /* Attributes come in a contiguous block, ordered by their
1858 * gl_vert_attrib value. That means we can compute the slot
1859 * number for an attribute by masking out the enabled
1860 * attributes before it and counting the bits.
1861 */
1862 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1863 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1864 BITFIELD64_MASK(attr));
1865 }
1866
1867 channel = inst->src[i].reg_offset & 3;
1868
1869 grf = payload.num_regs +
1870 prog_data->curb_read_length +
1871 slot * 4 + channel;
1872
1873 inst->src[i].file = HW_REG;
1874 inst->src[i].fixed_hw_reg =
1875 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1876 }
1877 }
1878 }
1879 }
1880
1881 /**
1882 * Split large virtual GRFs into separate components if we can.
1883 *
1884 * This is mostly duplicated with what brw_fs_vector_splitting does,
1885 * but that's really conservative because it's afraid of doing
1886 * splitting that doesn't result in real progress after the rest of
1887 * the optimization phases, which would cause infinite looping in
1888 * optimization. We can do it once here, safely. This also has the
1889 * opportunity to split interpolated values, or maybe even uniforms,
1890 * which we don't have at the IR level.
1891 *
1892 * We want to split, because virtual GRFs are what we register
1893 * allocate and spill (due to contiguousness requirements for some
1894 * instructions), and they're what we naturally generate in the
1895 * codegen process, but most virtual GRFs don't actually need to be
1896 * contiguous sets of GRFs. If we split, we'll end up with reduced
1897 * live intervals and better dead code elimination and coalescing.
1898 */
1899 void
1900 fs_visitor::split_virtual_grfs()
1901 {
1902 int num_vars = this->alloc.count;
1903
1904 /* Count the total number of registers */
1905 int reg_count = 0;
1906 int vgrf_to_reg[num_vars];
1907 for (int i = 0; i < num_vars; i++) {
1908 vgrf_to_reg[i] = reg_count;
1909 reg_count += alloc.sizes[i];
1910 }
1911
1912 /* An array of "split points". For each register slot, this indicates
1913 * if this slot can be separated from the previous slot. Every time an
1914 * instruction uses multiple elements of a register (as a source or
1915 * destination), we mark the used slots as inseparable. Then we go
1916 * through and split the registers into the smallest pieces we can.
1917 */
1918 bool split_points[reg_count];
1919 memset(split_points, 0, sizeof(split_points));
1920
1921 /* Mark all used registers as fully splittable */
1922 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1923 if (inst->dst.file == GRF) {
1924 int reg = vgrf_to_reg[inst->dst.reg];
1925 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1926 split_points[reg + j] = true;
1927 }
1928
1929 for (int i = 0; i < inst->sources; i++) {
1930 if (inst->src[i].file == GRF) {
1931 int reg = vgrf_to_reg[inst->src[i].reg];
1932 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1933 split_points[reg + j] = true;
1934 }
1935 }
1936 }
1937
1938 if (brw->has_pln &&
1939 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1940 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1941 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1942 * Gen6, that was the only supported interpolation mode, and since Gen6,
1943 * delta_x and delta_y are in fixed hardware registers.
1944 */
1945 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1946 split_points[vgrf_to_reg[vgrf] + 1] = false;
1947 }
1948
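   /* Walk the instructions again and clear the split points inside any
    * multi-register write or read, so that those registers stay together.
    */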
1949 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1950 if (inst->dst.file == GRF) {
1951 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1952 for (int j = 1; j < inst->regs_written; j++)
1953 split_points[reg + j] = false;
1954 }
1955 for (int i = 0; i < inst->sources; i++) {
1956 if (inst->src[i].file == GRF) {
1957 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1958 for (int j = 1; j < inst->regs_read(i); j++)
1959 split_points[reg + j] = false;
1960 }
1961 }
1962 }
1963
1964 int new_virtual_grf[reg_count];
1965 int new_reg_offset[reg_count];
1966
1967 int reg = 0;
1968 for (int i = 0; i < num_vars; i++) {
1969 /* The first one should always be 0 as a quick sanity check. */
1970 assert(split_points[reg] == false);
1971
1972 /* j = 0 case */
1973 new_reg_offset[reg] = 0;
1974 reg++;
1975 int offset = 1;
1976
1977 /* j > 0 case */
1978 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1979 /* If this is a split point, reset the offset to 0 and allocate a
1980           * new virtual GRF covering the preceding 'offset' registers.
1981 */
1982 if (split_points[reg]) {
1983 assert(offset <= MAX_VGRF_SIZE);
1984 int grf = alloc.allocate(offset);
1985 for (int k = reg - offset; k < reg; k++)
1986 new_virtual_grf[k] = grf;
1987 offset = 0;
1988 }
1989 new_reg_offset[reg] = offset;
1990 offset++;
1991 reg++;
1992 }
1993
1994 /* The last one gets the original register number */
1995 assert(offset <= MAX_VGRF_SIZE);
1996 alloc.sizes[i] = offset;
1997 for (int k = reg - offset; k < reg; k++)
1998 new_virtual_grf[k] = i;
1999 }
2000 assert(reg == reg_count);
2001
2002 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2003 if (inst->dst.file == GRF) {
2004 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2005 inst->dst.reg = new_virtual_grf[reg];
2006 inst->dst.reg_offset = new_reg_offset[reg];
2007 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2008 }
2009 for (int i = 0; i < inst->sources; i++) {
2010 if (inst->src[i].file == GRF) {
2011 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2012 inst->src[i].reg = new_virtual_grf[reg];
2013 inst->src[i].reg_offset = new_reg_offset[reg];
2014 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2015 }
2016 }
2017 }
2018 invalidate_live_intervals();
2019 }
2020
2021 /**
2022 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2023 *
2024 * During code generation, we create tons of temporary variables, many of
2025 * which get immediately killed and are never used again. Yet, in later
2026 * optimization and analysis passes, such as compute_live_intervals, we need
2027 * to loop over all the virtual GRFs. Compacting them can save a lot of
2028 * overhead.
2029 */
2030 bool
2031 fs_visitor::compact_virtual_grfs()
2032 {
2033 bool progress = false;
2034 int remap_table[this->alloc.count];
2035 memset(remap_table, -1, sizeof(remap_table));
2036
2037 /* Mark which virtual GRFs are used. */
2038 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2039 if (inst->dst.file == GRF)
2040 remap_table[inst->dst.reg] = 0;
2041
2042 for (int i = 0; i < inst->sources; i++) {
2043 if (inst->src[i].file == GRF)
2044 remap_table[inst->src[i].reg] = 0;
2045 }
2046 }
2047
2048 /* Compact the GRF arrays. */
2049 int new_index = 0;
2050 for (unsigned i = 0; i < this->alloc.count; i++) {
2051 if (remap_table[i] == -1) {
2052 /* We just found an unused register. This means that we are
2053 * actually going to compact something.
2054 */
2055 progress = true;
2056 } else {
2057 remap_table[i] = new_index;
2058 alloc.sizes[new_index] = alloc.sizes[i];
2059 invalidate_live_intervals();
2060 ++new_index;
2061 }
2062 }
2063
2064 this->alloc.count = new_index;
2065
2066 /* Patch all the instructions to use the newly renumbered registers */
2067 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2068 if (inst->dst.file == GRF)
2069 inst->dst.reg = remap_table[inst->dst.reg];
2070
2071 for (int i = 0; i < inst->sources; i++) {
2072 if (inst->src[i].file == GRF)
2073 inst->src[i].reg = remap_table[inst->src[i].reg];
2074 }
2075 }
2076
2077 /* Patch all the references to delta_x/delta_y, since they're used in
2078 * register allocation. If they're unused, switch them to BAD_FILE so
2079 * we don't think some random VGRF is delta_x/delta_y.
2080 */
2081 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2082 if (delta_x[i].file == GRF) {
2083 if (remap_table[delta_x[i].reg] != -1) {
2084 delta_x[i].reg = remap_table[delta_x[i].reg];
2085 } else {
2086 delta_x[i].file = BAD_FILE;
2087 }
2088 }
2089 }
2090 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2091 if (delta_y[i].file == GRF) {
2092 if (remap_table[delta_y[i].reg] != -1) {
2093 delta_y[i].reg = remap_table[delta_y[i].reg];
2094 } else {
2095 delta_y[i].file = BAD_FILE;
2096 }
2097 }
2098 }
2099
2100 return progress;
2101 }
2102
2103 /*
2104 * Implements array access of uniforms by inserting a
2105 * PULL_CONSTANT_LOAD instruction.
2106 *
2107  * Unlike temporary GRF array access (which we don't support due to
2108 * the difficulty of doing relative addressing on instruction
2109 * destinations), we could potentially do array access of uniforms
2110 * that were loaded in GRF space as push constants. In real-world
2111 * usage we've seen, though, the arrays being used are always larger
2112 * than we could load as push constants, so just always move all
2113 * uniform array access out to a pull constant buffer.
2114 */
2115 void
2116 fs_visitor::move_uniform_array_access_to_pull_constants()
2117 {
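   /* Only the first compile (SIMD8 mode) gets to decide on pull constant
    * locations; the SIMD16 compile reuses the locations chosen here.
    */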
2118 if (dispatch_width != 8)
2119 return;
2120
2121 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2122 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2123
2124 /* Walk through and find array access of uniforms. Put a copy of that
2125 * uniform in the pull constant buffer.
2126 *
2127 * Note that we don't move constant-indexed accesses to arrays. No
2128 * testing has been done of the performance impact of this choice.
2129 */
2130 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2131 for (int i = 0 ; i < inst->sources; i++) {
2132 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2133 continue;
2134
2135 int uniform = inst->src[i].reg;
2136
2137 /* If this array isn't already present in the pull constant buffer,
2138 * add it.
2139 */
2140 if (pull_constant_loc[uniform] == -1) {
2141 const gl_constant_value **values = &stage_prog_data->param[uniform];
2142
2143 assert(param_size[uniform]);
2144
2145 for (int j = 0; j < param_size[uniform]; j++) {
2146 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2147
2148 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2149 values[j];
2150 }
2151 }
2152 }
2153 }
2154 }
2155
2156 /**
2157 * Assign UNIFORM file registers to either push constants or pull constants.
2158 *
2159  * We allow a fragment shader to have more than the GL-specified minimum
2160  * value of the maximum number of fragment shader uniform components (64).
2161  * If there are too many of these, they'd fill up all of the register space.
2162 * So, this will push some of them out to the pull constant buffer and
2163 * update the program to load them.
2164 */
2165 void
2166 fs_visitor::assign_constant_locations()
2167 {
2168 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2169 if (dispatch_width != 8)
2170 return;
2171
2172 /* Find which UNIFORM registers are still in use. */
2173 bool is_live[uniforms];
2174 for (unsigned int i = 0; i < uniforms; i++) {
2175 is_live[i] = false;
2176 }
2177
2178 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2179 for (int i = 0; i < inst->sources; i++) {
2180 if (inst->src[i].file != UNIFORM)
2181 continue;
2182
2183 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2184 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2185 is_live[constant_nr] = true;
2186 }
2187 }
2188
2189 /* Only allow 16 registers (128 uniform components) as push constants.
2190 *
2191 * Just demote the end of the list. We could probably do better
2192 * here, demoting things that are rarely used in the program first.
2193 *
2194 * If changing this value, note the limitation about total_regs in
2195 * brw_curbe.c.
2196 */
2197 unsigned int max_push_components = 16 * 8;
2198 unsigned int num_push_constants = 0;
2199
2200 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2201
2202 for (unsigned int i = 0; i < uniforms; i++) {
2203 if (!is_live[i] || pull_constant_loc[i] != -1) {
2204 /* This UNIFORM register is either dead, or has already been demoted
2205 * to a pull const. Mark it as no longer living in the param[] array.
2206 */
2207 push_constant_loc[i] = -1;
2208 continue;
2209 }
2210
2211 if (num_push_constants < max_push_components) {
2212 /* Retain as a push constant. Record the location in the params[]
2213 * array.
2214 */
2215 push_constant_loc[i] = num_push_constants++;
2216 } else {
2217 /* Demote to a pull constant. */
2218 push_constant_loc[i] = -1;
2219
2220 int pull_index = stage_prog_data->nr_pull_params++;
2221 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2222 pull_constant_loc[i] = pull_index;
2223 }
2224 }
2225
2226 stage_prog_data->nr_params = num_push_constants;
2227
2228 /* Up until now, the param[] array has been indexed by reg + reg_offset
2229 * of UNIFORM registers. Condense it to only contain the uniforms we
2230 * chose to upload as push constants.
2231 */
2232 for (unsigned int i = 0; i < uniforms; i++) {
2233 int remapped = push_constant_loc[i];
2234
2235 if (remapped == -1)
2236 continue;
2237
2238 assert(remapped <= (int)i);
2239 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2240 }
2241 }
2242
2243 /**
2244 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2245 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2246 */
2247 void
2248 fs_visitor::demote_pull_constants()
2249 {
2250 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2251 for (int i = 0; i < inst->sources; i++) {
2252 if (inst->src[i].file != UNIFORM)
2253 continue;
2254
2255 int pull_index = pull_constant_loc[inst->src[i].reg +
2256 inst->src[i].reg_offset];
2257 if (pull_index == -1)
2258 continue;
2259
2260          /* Set up the annotation tracking for newly generated instructions. */
2261 base_ir = inst->ir;
2262 current_annotation = inst->annotation;
2263
2264 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2265 fs_reg dst = vgrf(glsl_type::float_type);
2266
2267 /* Generate a pull load into dst. */
2268 if (inst->src[i].reladdr) {
2269 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2270 surf_index,
2271 *inst->src[i].reladdr,
2272 pull_index);
2273 inst->insert_before(block, &list);
2274 inst->src[i].reladdr = NULL;
2275 } else {
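               /* Constant-offset loads fetch an aligned vec4, so round the
                * byte offset down to a 16-byte boundary and use set_smear()
                * below to pick the right component out of that vec4.
                */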
2276 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2277 fs_inst *pull =
2278 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2279 dst, surf_index, offset);
2280 inst->insert_before(block, pull);
2281 inst->src[i].set_smear(pull_index & 3);
2282 }
2283
2284 /* Rewrite the instruction to use the temporary VGRF. */
2285 inst->src[i].file = GRF;
2286 inst->src[i].reg = dst.reg;
2287 inst->src[i].reg_offset = 0;
2288 inst->src[i].width = dispatch_width;
2289 }
2290 }
2291 invalidate_live_intervals();
2292 }
2293
2294 bool
2295 fs_visitor::opt_algebraic()
2296 {
2297 bool progress = false;
2298
2299 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2300 switch (inst->opcode) {
2301 case BRW_OPCODE_MOV:
2302 if (inst->src[0].file != IMM)
2303 break;
2304
2305 if (inst->saturate) {
2306 if (inst->dst.type != inst->src[0].type)
2307 assert(!"unimplemented: saturate mixed types");
2308
2309 if (brw_saturate_immediate(inst->dst.type,
2310 &inst->src[0].fixed_hw_reg)) {
2311 inst->saturate = false;
2312 progress = true;
2313 }
2314 }
2315 break;
2316
2317 case BRW_OPCODE_MUL:
2318 if (inst->src[1].file != IMM)
2319 continue;
2320
2321 /* a * 1.0 = a */
2322 if (inst->src[1].is_one()) {
2323 inst->opcode = BRW_OPCODE_MOV;
2324 inst->src[1] = reg_undef;
2325 progress = true;
2326 break;
2327 }
2328
2329 /* a * -1.0 = -a */
2330 if (inst->src[1].is_negative_one()) {
2331 inst->opcode = BRW_OPCODE_MOV;
2332 inst->src[0].negate = !inst->src[0].negate;
2333 inst->src[1] = reg_undef;
2334 progress = true;
2335 break;
2336 }
2337
2338 /* a * 0.0 = 0.0 */
2339 if (inst->src[1].is_zero()) {
2340 inst->opcode = BRW_OPCODE_MOV;
2341 inst->src[0] = inst->src[1];
2342 inst->src[1] = reg_undef;
2343 progress = true;
2344 break;
2345 }
2346
2347 if (inst->src[0].file == IMM) {
2348 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2349 inst->opcode = BRW_OPCODE_MOV;
2350 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2351 inst->src[1] = reg_undef;
2352 progress = true;
2353 break;
2354 }
2355 break;
2356 case BRW_OPCODE_ADD:
2357 if (inst->src[1].file != IMM)
2358 continue;
2359
2360 /* a + 0.0 = a */
2361 if (inst->src[1].is_zero()) {
2362 inst->opcode = BRW_OPCODE_MOV;
2363 inst->src[1] = reg_undef;
2364 progress = true;
2365 break;
2366 }
2367
2368 if (inst->src[0].file == IMM) {
2369 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2370 inst->opcode = BRW_OPCODE_MOV;
2371 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2372 inst->src[1] = reg_undef;
2373 progress = true;
2374 break;
2375 }
2376 break;
2377 case BRW_OPCODE_OR:
2378 if (inst->src[0].equals(inst->src[1])) {
2379 inst->opcode = BRW_OPCODE_MOV;
2380 inst->src[1] = reg_undef;
2381 progress = true;
2382 break;
2383 }
2384 break;
2385 case BRW_OPCODE_LRP:
2386 if (inst->src[1].equals(inst->src[2])) {
2387 inst->opcode = BRW_OPCODE_MOV;
2388 inst->src[0] = inst->src[1];
2389 inst->src[1] = reg_undef;
2390 inst->src[2] = reg_undef;
2391 progress = true;
2392 break;
2393 }
2394 break;
2395 case BRW_OPCODE_CMP:
2396 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2397 inst->src[0].abs &&
2398 inst->src[0].negate &&
2399 inst->src[1].is_zero()) {
2400 inst->src[0].abs = false;
2401 inst->src[0].negate = false;
2402 inst->conditional_mod = BRW_CONDITIONAL_Z;
2403 progress = true;
2404 break;
2405 }
2406 break;
2407 case BRW_OPCODE_SEL:
2408 if (inst->src[0].equals(inst->src[1])) {
2409 inst->opcode = BRW_OPCODE_MOV;
2410 inst->src[1] = reg_undef;
2411 inst->predicate = BRW_PREDICATE_NONE;
2412 inst->predicate_inverse = false;
2413 progress = true;
2414 } else if (inst->saturate && inst->src[1].file == IMM) {
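            /* With saturation the result is clamped to [0, 1], so a SEL
             * against an immediate lying outside that range on the relevant
             * side can never change the result; drop the SEL to a MOV.
             */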
2415 switch (inst->conditional_mod) {
2416 case BRW_CONDITIONAL_LE:
2417 case BRW_CONDITIONAL_L:
2418 switch (inst->src[1].type) {
2419 case BRW_REGISTER_TYPE_F:
2420 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2421 inst->opcode = BRW_OPCODE_MOV;
2422 inst->src[1] = reg_undef;
2423 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2424 progress = true;
2425 }
2426 break;
2427 default:
2428 break;
2429 }
2430 break;
2431 case BRW_CONDITIONAL_GE:
2432 case BRW_CONDITIONAL_G:
2433 switch (inst->src[1].type) {
2434 case BRW_REGISTER_TYPE_F:
2435 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2436 inst->opcode = BRW_OPCODE_MOV;
2437 inst->src[1] = reg_undef;
2438 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2439 progress = true;
2440 }
2441 break;
2442 default:
2443 break;
2444 }
2445 default:
2446 break;
2447 }
2448 }
2449 break;
2450 case BRW_OPCODE_MAD:
2451 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2452 inst->opcode = BRW_OPCODE_MOV;
2453 inst->src[1] = reg_undef;
2454 inst->src[2] = reg_undef;
2455 progress = true;
2456       } else if (inst->src[0].is_zero()) {
2457          inst->opcode = BRW_OPCODE_MUL;
2458          inst->src[0] = inst->src[2];
2459          inst->src[2] = reg_undef;
              progress = true;
2460 } else if (inst->src[1].is_one()) {
2461 inst->opcode = BRW_OPCODE_ADD;
2462 inst->src[1] = inst->src[2];
2463 inst->src[2] = reg_undef;
2464 progress = true;
2465 } else if (inst->src[2].is_one()) {
2466 inst->opcode = BRW_OPCODE_ADD;
2467 inst->src[2] = reg_undef;
2468 progress = true;
2469 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2470 inst->opcode = BRW_OPCODE_ADD;
2471 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2472 inst->src[2] = reg_undef;
2473 progress = true;
2474 }
2475 break;
2476 case SHADER_OPCODE_RCP: {
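         /* rcp(sqrt(x)) == rsq(x): if the previous instruction computed the
          * sqrt we are inverting, use RSQ directly on its source.
          */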
2477 fs_inst *prev = (fs_inst *)inst->prev;
2478 if (prev->opcode == SHADER_OPCODE_SQRT) {
2479 if (inst->src[0].equals(prev->dst)) {
2480 inst->opcode = SHADER_OPCODE_RSQ;
2481 inst->src[0] = prev->src[0];
2482 progress = true;
2483 }
2484 }
2485 break;
2486 }
2487 default:
2488 break;
2489 }
2490 }
2491
2492 return progress;
2493 }
2494
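/**
 * Give every complete, unpredicated write of a virtual GRF outside of control
 * flow its own register number.
 *
 * Reusing one VGRF for several unrelated values artificially ties their live
 * ranges together; renaming each full write to a fresh VGRF (and rewriting the
 * uses that follow it) lets later passes treat those values independently.
 */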
2495 bool
2496 fs_visitor::opt_register_renaming()
2497 {
2498 bool progress = false;
2499 int depth = 0;
2500
2501 int remap[alloc.count];
2502 memset(remap, -1, sizeof(int) * alloc.count);
2503
2504 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2505 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2506 depth++;
2507 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2508 inst->opcode == BRW_OPCODE_WHILE) {
2509 depth--;
2510 }
2511
2512 /* Rewrite instruction sources. */
2513 for (int i = 0; i < inst->sources; i++) {
2514 if (inst->src[i].file == GRF &&
2515 remap[inst->src[i].reg] != -1 &&
2516 remap[inst->src[i].reg] != inst->src[i].reg) {
2517 inst->src[i].reg = remap[inst->src[i].reg];
2518 progress = true;
2519 }
2520 }
2521
2522 const int dst = inst->dst.reg;
2523
2524 if (depth == 0 &&
2525 inst->dst.file == GRF &&
2526 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2527 !inst->is_partial_write()) {
2528 if (remap[dst] == -1) {
2529 remap[dst] = dst;
2530 } else {
2531 remap[dst] = alloc.allocate(inst->dst.width / 8);
2532 inst->dst.reg = remap[dst];
2533 progress = true;
2534 }
2535 } else if (inst->dst.file == GRF &&
2536 remap[dst] != -1 &&
2537 remap[dst] != dst) {
2538 inst->dst.reg = remap[dst];
2539 progress = true;
2540 }
2541 }
2542
2543 if (progress) {
2544 invalidate_live_intervals();
2545
2546 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2547 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2548 delta_x[i].reg = remap[delta_x[i].reg];
2549 }
2550 }
2551 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2552 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2553 delta_y[i].reg = remap[delta_y[i].reg];
2554 }
2555 }
2556 }
2557
2558 return progress;
2559 }
2560
2561 /**
2562 * Remove redundant or useless discard jumps.
2563 *
2564 * For example, we can eliminate jumps in the following sequence:
2565 *
2566 * discard-jump (redundant with the next jump)
2567 * discard-jump (useless; jumps to the next instruction)
2568 * placeholder-halt
2569 */
2570 bool
2571 fs_visitor::opt_redundant_discard_jumps()
2572 {
2573 bool progress = false;
2574
2575 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2576
2577 fs_inst *placeholder_halt = NULL;
2578 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2579 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2580 placeholder_halt = inst;
2581 break;
2582 }
2583 }
2584
2585 if (!placeholder_halt)
2586 return false;
2587
2588 /* Delete any HALTs immediately before the placeholder halt. */
2589 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2590 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2591 prev = (fs_inst *) placeholder_halt->prev) {
2592 prev->remove(last_bblock);
2593 progress = true;
2594 }
2595
2596 if (progress)
2597 invalidate_live_intervals();
2598
2599 return progress;
2600 }
2601
2602 bool
2603 fs_visitor::compute_to_mrf()
2604 {
2605 bool progress = false;
2606 int next_ip = 0;
2607
2608 /* No MRFs on Gen >= 7. */
2609 if (brw->gen >= 7)
2610 return false;
2611
2612 calculate_live_intervals();
2613
2614 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2615 int ip = next_ip;
2616 next_ip++;
2617
2618 if (inst->opcode != BRW_OPCODE_MOV ||
2619 inst->is_partial_write() ||
2620 inst->dst.file != MRF || inst->src[0].file != GRF ||
2621 inst->dst.type != inst->src[0].type ||
2622 inst->src[0].abs || inst->src[0].negate ||
2623 !inst->src[0].is_contiguous() ||
2624 inst->src[0].subreg_offset)
2625 continue;
2626
2627 /* Work out which hardware MRF registers are written by this
2628 * instruction.
2629 */
2630 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2631 int mrf_high;
2632 if (inst->dst.reg & BRW_MRF_COMPR4) {
2633 mrf_high = mrf_low + 4;
2634 } else if (inst->exec_size == 16) {
2635 mrf_high = mrf_low + 1;
2636 } else {
2637 mrf_high = mrf_low;
2638 }
2639
2640 /* Can't compute-to-MRF this GRF if someone else was going to
2641 * read it later.
2642 */
2643 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2644 continue;
2645
2646 /* Found a move of a GRF to a MRF. Let's see if we can go
2647 * rewrite the thing that made this GRF to write into the MRF.
2648 */
2649 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2650 if (scan_inst->dst.file == GRF &&
2651 scan_inst->dst.reg == inst->src[0].reg) {
2652          /* Found the last instruction to write the reg we want to turn
2653 * into a compute-to-MRF.
2654 */
2655
2656 /* If this one instruction didn't populate all the
2657 * channels, bail. We might be able to rewrite everything
2658 * that writes that reg, but it would require smarter
2659 * tracking to delay the rewriting until complete success.
2660 */
2661 if (scan_inst->is_partial_write())
2662 break;
2663
2664          /* Instructions that write more than one register would require
2665           * coalescing out more than one MOV at a time, which we don't do.
2666 */
2667 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2668 break;
2669
2670 /* SEND instructions can't have MRF as a destination. */
2671 if (scan_inst->mlen)
2672 break;
2673
2674 if (brw->gen == 6) {
2675 /* gen6 math instructions must have the destination be
2676 * GRF, so no compute-to-MRF for them.
2677 */
2678 if (scan_inst->is_math()) {
2679 break;
2680 }
2681 }
2682
2683 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2684 /* Found the creator of our MRF's source value. */
2685 scan_inst->dst.file = MRF;
2686 scan_inst->dst.reg = inst->dst.reg;
2687 scan_inst->saturate |= inst->saturate;
2688 inst->remove(block);
2689 progress = true;
2690 }
2691 break;
2692 }
2693
2694 /* We don't handle control flow here. Most computation of
2695        * values that end up in MRFs happens shortly before the MRF
2696 * write anyway.
2697 */
2698 if (block->start() == scan_inst)
2699 break;
2700
2701 /* You can't read from an MRF, so if someone else reads our
2702 * MRF's source GRF that we wanted to rewrite, that stops us.
2703 */
2704 bool interfered = false;
2705 for (int i = 0; i < scan_inst->sources; i++) {
2706 if (scan_inst->src[i].file == GRF &&
2707 scan_inst->src[i].reg == inst->src[0].reg &&
2708 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2709 interfered = true;
2710 }
2711 }
2712 if (interfered)
2713 break;
2714
2715 if (scan_inst->dst.file == MRF) {
2716 /* If somebody else writes our MRF here, we can't
2717 * compute-to-MRF before that.
2718 */
2719 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2720 int scan_mrf_high;
2721
2722 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2723 scan_mrf_high = scan_mrf_low + 4;
2724 } else if (scan_inst->exec_size == 16) {
2725 scan_mrf_high = scan_mrf_low + 1;
2726 } else {
2727 scan_mrf_high = scan_mrf_low;
2728 }
2729
2730 if (mrf_low == scan_mrf_low ||
2731 mrf_low == scan_mrf_high ||
2732 mrf_high == scan_mrf_low ||
2733 mrf_high == scan_mrf_high) {
2734 break;
2735 }
2736 }
2737
2738 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2739 /* Found a SEND instruction, which means that there are
2740 * live values in MRFs from base_mrf to base_mrf +
2741 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2742 * above it.
2743 */
2744 if (mrf_low >= scan_inst->base_mrf &&
2745 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2746 break;
2747 }
2748 if (mrf_high >= scan_inst->base_mrf &&
2749 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2750 break;
2751 }
2752 }
2753 }
2754 }
2755
2756 if (progress)
2757 invalidate_live_intervals();
2758
2759 return progress;
2760 }
2761
2762 /**
2763 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2764 * instructions to FS_OPCODE_REP_FB_WRITE.
2765 */
2766 void
2767 fs_visitor::emit_repclear_shader()
2768 {
2769 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2770 int base_mrf = 1;
2771 int color_mrf = base_mrf + 2;
2772
2773 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2774 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2775 mov->force_writemask_all = true;
2776
2777 fs_inst *write;
2778 if (key->nr_color_regions == 1) {
2779 write = emit(FS_OPCODE_REP_FB_WRITE);
2780 write->saturate = key->clamp_fragment_color;
2781 write->base_mrf = color_mrf;
2782 write->target = 0;
2783 write->header_present = false;
2784 write->mlen = 1;
2785 } else {
2786 assume(key->nr_color_regions > 0);
2787 for (int i = 0; i < key->nr_color_regions; ++i) {
2788 write = emit(FS_OPCODE_REP_FB_WRITE);
2789 write->saturate = key->clamp_fragment_color;
2790 write->base_mrf = base_mrf;
2791 write->target = i;
2792 write->header_present = true;
2793 write->mlen = 3;
2794 }
2795 }
2796 write->eot = true;
2797
2798 calculate_cfg();
2799
2800 assign_constant_locations();
2801 assign_curb_setup();
2802
2803 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2804 assert(mov->src[0].file == HW_REG);
2805 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2806 }
2807
2808 /**
2809 * Walks through basic blocks, looking for repeated MRF writes and
2810 * removing the later ones.
2811 */
2812 bool
2813 fs_visitor::remove_duplicate_mrf_writes()
2814 {
2815 fs_inst *last_mrf_move[16];
2816 bool progress = false;
2817
2818    /* The MRF tracking below doesn't handle compressed (SIMD16) instructions. */
2819 if (dispatch_width == 16)
2820 return false;
2821
2822 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2823
2824 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2825 if (inst->is_control_flow()) {
2826 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2827 }
2828
2829 if (inst->opcode == BRW_OPCODE_MOV &&
2830 inst->dst.file == MRF) {
2831 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2832 if (prev_inst && inst->equals(prev_inst)) {
2833 inst->remove(block);
2834 progress = true;
2835 continue;
2836 }
2837 }
2838
2839 /* Clear out the last-write records for MRFs that were overwritten. */
2840 if (inst->dst.file == MRF) {
2841 last_mrf_move[inst->dst.reg] = NULL;
2842 }
2843
2844 if (inst->mlen > 0 && inst->base_mrf != -1) {
2845 /* Found a SEND instruction, which will include two or fewer
2846 * implied MRF writes. We could do better here.
2847 */
2848 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2849 last_mrf_move[inst->base_mrf + i] = NULL;
2850 }
2851 }
2852
2853 /* Clear out any MRF move records whose sources got overwritten. */
2854 if (inst->dst.file == GRF) {
2855 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2856 if (last_mrf_move[i] &&
2857 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2858 last_mrf_move[i] = NULL;
2859 }
2860 }
2861 }
2862
2863 if (inst->opcode == BRW_OPCODE_MOV &&
2864 inst->dst.file == MRF &&
2865 inst->src[0].file == GRF &&
2866 !inst->is_partial_write()) {
2867 last_mrf_move[inst->dst.reg] = inst;
2868 }
2869 }
2870
2871 if (progress)
2872 invalidate_live_intervals();
2873
2874 return progress;
2875 }
2876
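/* Helper for the Gen4 SEND dependency workarounds below: clear deps[] for any
 * GRF in [first_grf, first_grf + grf_len) that this instruction reads, since a
 * read is enough to resolve the outstanding dependency on that register.
 */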
2877 static void
2878 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2879 int first_grf, int grf_len)
2880 {
2881 /* Clear the flag for registers that actually got read (as expected). */
2882 for (int i = 0; i < inst->sources; i++) {
2883 int grf;
2884 if (inst->src[i].file == GRF) {
2885 grf = inst->src[i].reg;
2886 } else if (inst->src[i].file == HW_REG &&
2887 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2888 grf = inst->src[i].fixed_hw_reg.nr;
2889 } else {
2890 continue;
2891 }
2892
2893 if (grf >= first_grf &&
2894 grf < first_grf + grf_len) {
2895 deps[grf - first_grf] = false;
2896 if (inst->exec_size == 16)
2897 deps[grf - first_grf + 1] = false;
2898 }
2899 }
2900 }
2901
2902 /**
2903 * Implements this workaround for the original 965:
2904 *
2905 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2906 * check for post destination dependencies on this instruction, software
2907 * must ensure that there is no destination hazard for the case of ‘write
2908 * followed by a posted write’ shown in the following example.
2909 *
2910 * 1. mov r3 0
2911 * 2. send r3.xy <rest of send instruction>
2912 * 3. mov r2 r3
2913 *
2914 * Due to no post-destination dependency check on the ‘send’, the above
2915 * code sequence could have two instructions (1 and 2) in flight at the
2916 * same time that both consider ‘r3’ as the target of their final writes.
2917 */
2918 void
2919 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2920 fs_inst *inst)
2921 {
2922 int write_len = inst->regs_written;
2923 int first_write_grf = inst->dst.reg;
2924 bool needs_dep[BRW_MAX_MRF];
2925 assert(write_len < (int)sizeof(needs_dep) - 1);
2926
2927 memset(needs_dep, false, sizeof(needs_dep));
2928 memset(needs_dep, true, write_len);
2929
2930 clear_deps_for_inst_src(inst, dispatch_width,
2931 needs_dep, first_write_grf, write_len);
2932
2933 /* Walk backwards looking for writes to registers we're writing which
2934 * aren't read since being written. If we hit the start of the program,
2935 * we assume that there are no outstanding dependencies on entry to the
2936 * program.
2937 */
2938 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2939 /* If we hit control flow, assume that there *are* outstanding
2940 * dependencies, and force their cleanup before our instruction.
2941 */
2942 if (block->start() == scan_inst) {
2943 for (int i = 0; i < write_len; i++) {
2944 if (needs_dep[i]) {
2945 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2946 }
2947 }
2948 return;
2949 }
2950
2951 /* We insert our reads as late as possible on the assumption that any
2952 * instruction but a MOV that might have left us an outstanding
2953 * dependency has more latency than a MOV.
2954 */
2955 if (scan_inst->dst.file == GRF) {
2956 for (int i = 0; i < scan_inst->regs_written; i++) {
2957 int reg = scan_inst->dst.reg + i;
2958
2959 if (reg >= first_write_grf &&
2960 reg < first_write_grf + write_len &&
2961 needs_dep[reg - first_write_grf]) {
2962 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2963 needs_dep[reg - first_write_grf] = false;
2964 if (scan_inst->exec_size == 16)
2965 needs_dep[reg - first_write_grf + 1] = false;
2966 }
2967 }
2968 }
2969
2970 /* Clear the flag for registers that actually got read (as expected). */
2971 clear_deps_for_inst_src(scan_inst, dispatch_width,
2972 needs_dep, first_write_grf, write_len);
2973
2974 /* Continue the loop only if we haven't resolved all the dependencies */
2975 int i;
2976 for (i = 0; i < write_len; i++) {
2977 if (needs_dep[i])
2978 break;
2979 }
2980 if (i == write_len)
2981 return;
2982 }
2983 }
2984
2985 /**
2986 * Implements this workaround for the original 965:
2987 *
2988 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2989 * used as a destination register until after it has been sourced by an
2990  *   instruction with a different destination register."
2991 */
2992 void
2993 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2994 {
2995 int write_len = inst->regs_written;
2996 int first_write_grf = inst->dst.reg;
2997 bool needs_dep[BRW_MAX_MRF];
2998 assert(write_len < (int)sizeof(needs_dep) - 1);
2999
3000 memset(needs_dep, false, sizeof(needs_dep));
3001 memset(needs_dep, true, write_len);
3002 /* Walk forwards looking for writes to registers we're writing which aren't
3003 * read before being written.
3004 */
3005 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3006 /* If we hit control flow, force resolve all remaining dependencies. */
3007 if (block->end() == scan_inst) {
3008 for (int i = 0; i < write_len; i++) {
3009 if (needs_dep[i])
3010 scan_inst->insert_before(block,
3011 DEP_RESOLVE_MOV(first_write_grf + i));
3012 }
3013 return;
3014 }
3015
3016 /* Clear the flag for registers that actually got read (as expected). */
3017 clear_deps_for_inst_src(scan_inst, dispatch_width,
3018 needs_dep, first_write_grf, write_len);
3019
3020 /* We insert our reads as late as possible since they're reading the
3021 * result of a SEND, which has massive latency.
3022 */
3023 if (scan_inst->dst.file == GRF &&
3024 scan_inst->dst.reg >= first_write_grf &&
3025 scan_inst->dst.reg < first_write_grf + write_len &&
3026 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3027 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3028 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3029 }
3030
3031 /* Continue the loop only if we haven't resolved all the dependencies */
3032 int i;
3033 for (i = 0; i < write_len; i++) {
3034 if (needs_dep[i])
3035 break;
3036 }
3037 if (i == write_len)
3038 return;
3039 }
3040
3041 /* If we hit the end of the program, resolve all remaining dependencies out
3042 * of paranoia.
3043 */
3044 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
3045 assert(last_inst->eot);
3046 for (int i = 0; i < write_len; i++) {
3047 if (needs_dep[i])
3048 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3049 }
3050 }
3051
3052 void
3053 fs_visitor::insert_gen4_send_dependency_workarounds()
3054 {
3055 if (brw->gen != 4 || brw->is_g4x)
3056 return;
3057
3058 bool progress = false;
3059
3060 /* Note that we're done with register allocation, so GRF fs_regs always
3061 * have a .reg_offset of 0.
3062 */
3063
3064 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3065 if (inst->mlen != 0 && inst->dst.file == GRF) {
3066 insert_gen4_pre_send_dependency_workarounds(block, inst);
3067 insert_gen4_post_send_dependency_workarounds(block, inst);
3068 progress = true;
3069 }
3070 }
3071
3072 if (progress)
3073 invalidate_live_intervals();
3074 }
3075
3076 /**
3077 * Turns the generic expression-style uniform pull constant load instruction
3078 * into a hardware-specific series of instructions for loading a pull
3079 * constant.
3080 *
3081 * The expression style allows the CSE pass before this to optimize out
3082 * repeated loads from the same offset, and gives the pre-register-allocation
3083 * scheduling full flexibility, while the conversion to native instructions
3084 * allows the post-register-allocation scheduler the best information
3085 * possible.
3086 *
3087 * Note that execution masking for setting up pull constant loads is special:
3088 * the channels that need to be written are unrelated to the current execution
3089 * mask, since a later instruction will use one of the result channels as a
3090 * source operand for all 8 or 16 of its channels.
3091 */
3092 void
3093 fs_visitor::lower_uniform_pull_constant_loads()
3094 {
3095 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3096 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3097 continue;
3098
3099 if (brw->gen >= 7) {
3100 /* The offset arg before was a vec4-aligned byte offset. We need to
3101 * turn it into a dword offset.
3102 */
3103 fs_reg const_offset_reg = inst->src[1];
3104 assert(const_offset_reg.file == IMM &&
3105 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3106 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3107 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3108
3109 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3110 * Reserve space for the register.
3111 */
3112 if (brw->gen >= 9) {
3113 payload.reg_offset++;
3114 alloc.sizes[payload.reg] = 2;
3115 }
3116
3117 /* This is actually going to be a MOV, but since only the first dword
3118 * is accessed, we have a special opcode to do just that one. Note
3119 * that this needs to be an operation that will be considered a def
3120 * by live variable analysis, or register allocation will explode.
3121 */
3122 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3123 8, payload, const_offset_reg);
3124 setup->force_writemask_all = true;
3125
3126 setup->ir = inst->ir;
3127 setup->annotation = inst->annotation;
3128 inst->insert_before(block, setup);
3129
3130 /* Similarly, this will only populate the first 4 channels of the
3131 * result register (since we only use smear values from 0-3), but we
3132 * don't tell the optimizer.
3133 */
3134 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3135 inst->src[1] = payload;
3136
3137 invalidate_live_intervals();
3138 } else {
3139 /* Before register allocation, we didn't tell the scheduler about the
3140 * MRF we use. We know it's safe to use this MRF because nothing
3141 * else does except for register spill/unspill, which generates and
3142 * uses its MRF within a single IR instruction.
3143 */
3144 inst->base_mrf = 14;
3145 inst->mlen = 1;
3146 }
3147 }
3148 }
3149
3150 bool
3151 fs_visitor::lower_load_payload()
3152 {
3153 bool progress = false;
3154
3155 int vgrf_to_reg[alloc.count];
3156 int reg_count = 0;
3157 for (unsigned i = 0; i < alloc.count; ++i) {
3158 vgrf_to_reg[i] = reg_count;
3159 reg_count += alloc.sizes[i];
3160 }
3161
3162 struct {
3163 bool written:1; /* Whether this register has ever been written */
3164 bool force_writemask_all:1;
3165 bool force_sechalf:1;
3166 } metadata[reg_count];
3167 memset(metadata, 0, sizeof(metadata));
3168
3169 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3170 if (inst->dst.file == GRF) {
3171 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3172 bool force_sechalf = inst->force_sechalf &&
3173 !inst->force_writemask_all;
3174 bool toggle_sechalf = inst->dst.width == 16 &&
3175 type_sz(inst->dst.type) == 4 &&
3176 !inst->force_writemask_all;
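         /* A 16-wide write of a 32-bit type covers two registers, one per
          * half of the channels, so (unless force_writemask_all is set) the
          * recorded force_sechalf flag flips for each successive register
          * written.
          */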
3177 for (int i = 0; i < inst->regs_written; ++i) {
3178 metadata[dst_reg + i].written = true;
3179 metadata[dst_reg + i].force_sechalf = force_sechalf;
3180 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3181 force_sechalf = (toggle_sechalf != force_sechalf);
3182 }
3183 }
3184
3185 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3186 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3187 fs_reg dst = inst->dst;
3188
3189 for (int i = 0; i < inst->sources; i++) {
3190 dst.width = inst->src[i].effective_width;
3191 dst.type = inst->src[i].type;
3192
3193 if (inst->src[i].file == BAD_FILE) {
3194                /* Nothing to emit; dst still advances as normal below. */
3195 } else if (dst.file == MRF &&
3196 dst.width == 8 &&
3197 brw->has_compr4 &&
3198 i + 4 < inst->sources &&
3199 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3200 fs_reg compr4_dst = dst;
3201 compr4_dst.reg += BRW_MRF_COMPR4;
3202 compr4_dst.width = 16;
3203 fs_reg compr4_src = inst->src[i];
3204 compr4_src.width = 16;
3205 fs_inst *mov = MOV(compr4_dst, compr4_src);
3206 mov->force_writemask_all = true;
3207 inst->insert_before(block, mov);
3208 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3209 inst->src[i + 4].file = BAD_FILE;
3210 } else {
3211 fs_inst *mov = MOV(dst, inst->src[i]);
3212 if (inst->src[i].file == GRF) {
3213 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3214 inst->src[i].reg_offset;
3215 mov->force_sechalf = metadata[src_reg].force_sechalf;
3216 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3217 } else {
3218 /* We don't have any useful metadata for immediates or
3219 * uniforms. Assume that any of the channels of the
3220 * destination may be used.
3221 */
3222 assert(inst->src[i].file == IMM ||
3223 inst->src[i].file == UNIFORM);
3224 mov->force_writemask_all = true;
3225 }
3226
3227 if (dst.file == GRF) {
3228 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3229 const bool force_writemask = mov->force_writemask_all;
3230 metadata[dst_reg].force_writemask_all = force_writemask;
3231 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3232 if (dst.width * type_sz(dst.type) > 32) {
3233 assert(!mov->force_sechalf);
3234 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3235 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3236 }
3237 }
3238
3239 inst->insert_before(block, mov);
3240 }
3241
3242 dst = offset(dst, 1);
3243 }
3244
3245 inst->remove(block);
3246 progress = true;
3247 }
3248 }
3249
3250 if (progress)
3251 invalidate_live_intervals();
3252
3253 return progress;
3254 }
3255
3256 void
3257 fs_visitor::dump_instructions()
3258 {
3259 dump_instructions(NULL);
3260 }
3261
3262 void
3263 fs_visitor::dump_instructions(const char *name)
3264 {
3265 FILE *file = stderr;
3266 if (name && geteuid() != 0) {
3267 file = fopen(name, "w");
3268 if (!file)
3269 file = stderr;
3270 }
3271
3272 if (cfg) {
3273 calculate_register_pressure();
3274 int ip = 0, max_pressure = 0;
3275 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3276 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3277 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3278 dump_instruction(inst, file);
3279 ip++;
3280 }
3281 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3282 } else {
3283 int ip = 0;
3284 foreach_in_list(backend_instruction, inst, &instructions) {
3285 fprintf(file, "%4d: ", ip++);
3286 dump_instruction(inst, file);
3287 }
3288 }
3289
3290 if (file != stderr) {
3291 fclose(file);
3292 }
3293 }
3294
3295 void
3296 fs_visitor::dump_instruction(backend_instruction *be_inst)
3297 {
3298 dump_instruction(be_inst, stderr);
3299 }
3300
3301 void
3302 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3303 {
3304 fs_inst *inst = (fs_inst *)be_inst;
3305
3306 if (inst->predicate) {
3307 fprintf(file, "(%cf0.%d) ",
3308 inst->predicate_inverse ? '-' : '+',
3309 inst->flag_subreg);
3310 }
3311
3312 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3313 if (inst->saturate)
3314 fprintf(file, ".sat");
3315 if (inst->conditional_mod) {
3316 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3317 if (!inst->predicate &&
3318 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3319 inst->opcode != BRW_OPCODE_IF &&
3320 inst->opcode != BRW_OPCODE_WHILE))) {
3321 fprintf(file, ".f0.%d", inst->flag_subreg);
3322 }
3323 }
3324 fprintf(file, "(%d) ", inst->exec_size);
3325
3326
3327 switch (inst->dst.file) {
3328 case GRF:
3329 fprintf(file, "vgrf%d", inst->dst.reg);
3330 if (inst->dst.width != dispatch_width)
3331 fprintf(file, "@%d", inst->dst.width);
3332 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3333 inst->dst.subreg_offset)
3334 fprintf(file, "+%d.%d",
3335 inst->dst.reg_offset, inst->dst.subreg_offset);
3336 break;
3337 case MRF:
3338 fprintf(file, "m%d", inst->dst.reg);
3339 break;
3340 case BAD_FILE:
3341 fprintf(file, "(null)");
3342 break;
3343 case UNIFORM:
3344 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3345 break;
3346 case ATTR:
3347 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3348 break;
3349 case HW_REG:
3350 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3351 switch (inst->dst.fixed_hw_reg.nr) {
3352 case BRW_ARF_NULL:
3353 fprintf(file, "null");
3354 break;
3355 case BRW_ARF_ADDRESS:
3356 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3357 break;
3358 case BRW_ARF_ACCUMULATOR:
3359 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3360 break;
3361 case BRW_ARF_FLAG:
3362 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3363 inst->dst.fixed_hw_reg.subnr);
3364 break;
3365 default:
3366 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3367 inst->dst.fixed_hw_reg.subnr);
3368 break;
3369 }
3370 } else {
3371 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3372 }
3373 if (inst->dst.fixed_hw_reg.subnr)
3374 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3375 break;
3376 default:
3377 fprintf(file, "???");
3378 break;
3379 }
3380 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3381
3382 for (int i = 0; i < inst->sources; i++) {
3383 if (inst->src[i].negate)
3384 fprintf(file, "-");
3385 if (inst->src[i].abs)
3386 fprintf(file, "|");
3387 switch (inst->src[i].file) {
3388 case GRF:
3389 fprintf(file, "vgrf%d", inst->src[i].reg);
3390 if (inst->src[i].width != dispatch_width)
3391 fprintf(file, "@%d", inst->src[i].width);
3392 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3393 inst->src[i].subreg_offset)
3394 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3395 inst->src[i].subreg_offset);
3396 break;
3397 case MRF:
3398 fprintf(file, "***m%d***", inst->src[i].reg);
3399 break;
3400 case ATTR:
3401 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3402 break;
3403 case UNIFORM:
3404 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3405 if (inst->src[i].reladdr) {
3406 fprintf(file, "+reladdr");
3407 } else if (inst->src[i].subreg_offset) {
3408 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3409 inst->src[i].subreg_offset);
3410 }
3411 break;
3412 case BAD_FILE:
3413 fprintf(file, "(null)");
3414 break;
3415 case IMM:
3416 switch (inst->src[i].type) {
3417 case BRW_REGISTER_TYPE_F:
3418 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3419 break;
3420 case BRW_REGISTER_TYPE_W:
3421 case BRW_REGISTER_TYPE_D:
3422 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3423 break;
3424 case BRW_REGISTER_TYPE_UW:
3425 case BRW_REGISTER_TYPE_UD:
3426 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3427 break;
3428 case BRW_REGISTER_TYPE_VF:
3429 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3430 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3431 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3432 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3433 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3434 break;
3435 default:
3436 fprintf(file, "???");
3437 break;
3438 }
3439 break;
3440 case HW_REG:
3441 if (inst->src[i].fixed_hw_reg.negate)
3442 fprintf(file, "-");
3443 if (inst->src[i].fixed_hw_reg.abs)
3444 fprintf(file, "|");
3445 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3446 switch (inst->src[i].fixed_hw_reg.nr) {
3447 case BRW_ARF_NULL:
3448 fprintf(file, "null");
3449 break;
3450 case BRW_ARF_ADDRESS:
3451 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3452 break;
3453 case BRW_ARF_ACCUMULATOR:
3454 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3455 break;
3456 case BRW_ARF_FLAG:
3457 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3458 inst->src[i].fixed_hw_reg.subnr);
3459 break;
3460 default:
3461 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3462 inst->src[i].fixed_hw_reg.subnr);
3463 break;
3464 }
3465 } else {
3466 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3467 }
3468 if (inst->src[i].fixed_hw_reg.subnr)
3469 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3470 if (inst->src[i].fixed_hw_reg.abs)
3471 fprintf(file, "|");
3472 break;
3473 default:
3474 fprintf(file, "???");
3475 break;
3476 }
3477 if (inst->src[i].abs)
3478 fprintf(file, "|");
3479
3480 if (inst->src[i].file != IMM) {
3481 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3482 }
3483
3484 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3485 fprintf(file, ", ");
3486 }
3487
3488 fprintf(file, " ");
3489
3490 if (dispatch_width == 16 && inst->exec_size == 8) {
3491 if (inst->force_sechalf)
3492 fprintf(file, "2ndhalf ");
3493 else
3494 fprintf(file, "1sthalf ");
3495 }
3496
3497 fprintf(file, "\n");
3498 }
3499
3500 /**
3501 * Possibly returns an instruction that set up @param reg.
3502 *
3503 * Sometimes we want to take the result of some expression/variable
3504 * dereference tree and rewrite the instruction generating the result
3505 * of the tree. When processing the tree, we know that the
3506 * instructions generated are all writing temporaries that are dead
3507 * outside of this tree. So, if we have some instructions that write
3508 * a temporary, we're free to point that temp write somewhere else.
3509 *
3510  * Note that this doesn't guarantee that the returned instruction only
3511  * generated reg -- it might be the size=4 destination of a texture instruction.
3512 */
3513 fs_inst *
3514 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3515 fs_inst *end,
3516 const fs_reg &reg)
3517 {
3518 if (end == start ||
3519 end->is_partial_write() ||
3520 reg.reladdr ||
3521 !reg.equals(end->dst)) {
3522 return NULL;
3523 } else {
3524 return end;
3525 }
3526 }
3527
3528 void
3529 fs_visitor::setup_payload_gen6()
3530 {
3531 bool uses_depth =
3532 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3533 unsigned barycentric_interp_modes =
3534 (stage == MESA_SHADER_FRAGMENT) ?
3535 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3536
3537 assert(brw->gen >= 6);
3538
3539 /* R0-1: masks, pixel X/Y coordinates. */
3540 payload.num_regs = 2;
3541    /* R2: only for 32-pixel dispatch. */
3542
3543 /* R3-26: barycentric interpolation coordinates. These appear in the
3544 * same order that they appear in the brw_wm_barycentric_interp_mode
3545 * enum. Each set of coordinates occupies 2 registers if dispatch width
3546 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3547 * appear if they were enabled using the "Barycentric Interpolation
3548 * Mode" bits in WM_STATE.
3549 */
3550 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3551 if (barycentric_interp_modes & (1 << i)) {
3552 payload.barycentric_coord_reg[i] = payload.num_regs;
3553 payload.num_regs += 2;
3554 if (dispatch_width == 16) {
3555 payload.num_regs += 2;
3556 }
3557 }
3558 }
3559
3560 /* R27: interpolated depth if uses source depth */
3561 if (uses_depth) {
3562 payload.source_depth_reg = payload.num_regs;
3563 payload.num_regs++;
3564 if (dispatch_width == 16) {
3565 /* R28: interpolated depth if not SIMD8. */
3566 payload.num_regs++;
3567 }
3568 }
3569 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3570 if (uses_depth) {
3571 payload.source_w_reg = payload.num_regs;
3572 payload.num_regs++;
3573 if (dispatch_width == 16) {
3574 /* R30: interpolated W if not SIMD8. */
3575 payload.num_regs++;
3576 }
3577 }
3578
3579 if (stage == MESA_SHADER_FRAGMENT) {
3580 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3581 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3582 prog_data->uses_pos_offset = key->compute_pos_offset;
3583 /* R31: MSAA position offsets. */
3584 if (prog_data->uses_pos_offset) {
3585 payload.sample_pos_reg = payload.num_regs;
3586 payload.num_regs++;
3587 }
3588 }
3589
3590 /* R32: MSAA input coverage mask */
3591 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3592 assert(brw->gen >= 7);
3593 payload.sample_mask_in_reg = payload.num_regs;
3594 payload.num_regs++;
3595 if (dispatch_width == 16) {
3596 /* R33: input coverage mask if not SIMD8. */
3597 payload.num_regs++;
3598 }
3599 }
3600
3601 /* R34-: bary for 32-pixel. */
3602 /* R58-59: interp W for 32-pixel. */
3603
3604 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3605 source_depth_to_render_target = true;
3606 }
3607 }
3608
3609 void
3610 fs_visitor::setup_vs_payload()
3611 {
3612 /* R0: thread header, R1: urb handles */
3613 payload.num_regs = 2;
3614 }
3615
3616 void
3617 fs_visitor::assign_binding_table_offsets()
3618 {
3619 assert(stage == MESA_SHADER_FRAGMENT);
3620 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3621 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3622 uint32_t next_binding_table_offset = 0;
3623
3624 /* If there are no color regions, we still perform an FB write to a null
3625 * renderbuffer, which we place at surface index 0.
3626 */
3627 prog_data->binding_table.render_target_start = next_binding_table_offset;
3628 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3629
3630 assign_common_binding_table_offsets(next_binding_table_offset);
3631 }
3632
3633 void
3634 fs_visitor::calculate_register_pressure()
3635 {
3636 invalidate_live_intervals();
3637 calculate_live_intervals();
3638
3639 unsigned num_instructions = 0;
3640 foreach_block(block, cfg)
3641 num_instructions += block->instructions.length();
3642
3643 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3644
3645 for (unsigned reg = 0; reg < alloc.count; reg++) {
3646 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3647 regs_live_at_ip[ip] += alloc.sizes[reg];
3648 }
3649 }
3650
3651 void
3652 fs_visitor::optimize()
3653 {
3654 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3655
3656 split_virtual_grfs();
3657
3658 move_uniform_array_access_to_pull_constants();
3659 assign_constant_locations();
3660 demote_pull_constants();
3661
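   /* OPT() runs a single pass, dumps the IR afterwards when the optimizer
    * debug flag is set and the pass made progress, folds the result into the
    * overall `progress` flag, and evaluates to whether this particular pass
    * changed anything.
    */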
3662 #define OPT(pass, args...) ({ \
3663 pass_num++; \
3664 bool this_progress = pass(args); \
3665 \
3666 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3667 char filename[64]; \
3668 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3669 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3670 \
3671 backend_visitor::dump_instructions(filename); \
3672 } \
3673 \
3674 progress = progress || this_progress; \
3675 this_progress; \
3676 })
3677
3678 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3679 char filename[64];
3680 snprintf(filename, 64, "%s%d-%04d-00-start",
3681 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3682
3683 backend_visitor::dump_instructions(filename);
3684 }
3685
3686 bool progress;
3687 int iteration = 0;
3688 int pass_num = 0;
3689 do {
3690 progress = false;
3691 pass_num = 0;
3692 iteration++;
3693
3694 OPT(remove_duplicate_mrf_writes);
3695
3696 OPT(opt_algebraic);
3697 OPT(opt_cse);
3698 OPT(opt_copy_propagate);
3699 OPT(opt_peephole_predicated_break);
3700 OPT(opt_cmod_propagation);
3701 OPT(dead_code_eliminate);
3702 OPT(opt_peephole_sel);
3703 OPT(dead_control_flow_eliminate, this);
3704 OPT(opt_register_renaming);
3705 OPT(opt_redundant_discard_jumps);
3706 OPT(opt_saturate_propagation);
3707 OPT(register_coalesce);
3708 OPT(compute_to_mrf);
3709
3710 OPT(compact_virtual_grfs);
3711 } while (progress);
3712
3713 pass_num = 0;
3714
3715 if (OPT(lower_load_payload)) {
3716 split_virtual_grfs();
3717 OPT(register_coalesce);
3718 OPT(compute_to_mrf);
3719 OPT(dead_code_eliminate);
3720 }
3721
3722 OPT(opt_combine_constants);
3723
3724 lower_uniform_pull_constant_loads();
3725 }
3726
3727 /**
3728 * Three-source instructions must have a GRF/MRF destination register.
3729 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3730 */
3731 void
3732 fs_visitor::fixup_3src_null_dest()
3733 {
3734 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3735 if (inst->is_3src() && inst->dst.is_null()) {
3736 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3737 inst->dst.type);
3738 }
3739 }
3740 }
3741
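/**
 * Assign hardware registers to the virtual GRFs.
 *
 * Pre-register-allocation scheduling is retried with progressively less
 * aggressive heuristics until the program allocates without spilling.  If
 * none succeeds, a SIMD16 compile is abandoned while a SIMD8 compile keeps
 * spilling until allocation works.  The Gen4 SEND dependency workarounds
 * and, when spills happened, a post-allocation scheduling pass run at the
 * end.
 */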
3742 void
3743 fs_visitor::allocate_registers()
3744 {
3745 bool allocated_without_spills;
3746
3747 static const enum instruction_scheduler_mode pre_modes[] = {
3748 SCHEDULE_PRE,
3749 SCHEDULE_PRE_NON_LIFO,
3750 SCHEDULE_PRE_LIFO,
3751 };
3752
3753 /* Try each scheduling heuristic to see if it can successfully register
3754 * allocate without spilling. They should be ordered by decreasing
3755 * performance but increasing likelihood of allocating.
3756 */
3757 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3758 schedule_instructions(pre_modes[i]);
3759
3760 if (0) {
3761 assign_regs_trivial();
3762 allocated_without_spills = true;
3763 } else {
3764 allocated_without_spills = assign_regs(false);
3765 }
3766 if (allocated_without_spills)
3767 break;
3768 }
3769
3770 if (!allocated_without_spills) {
3771 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3772 "Vertex" : "Fragment";
3773
3774 /* We assume that any spilling is worse than just dropping back to
3775 * SIMD8. There is probably some intermediate point where SIMD16 with
3776 * a couple of spills would still be faster.
3777 */
3778 if (dispatch_width == 16) {
3779 fail("Failure to register allocate. Reduce number of "
3780 "live scalar values to avoid this.");
3781 } else {
3782 perf_debug("%s shader triggered register spilling. "
3783 "Try reducing the number of live scalar values to "
3784 "improve performance.\n", stage_name);
3785 }
3786
3787 /* Since we're out of heuristics, just go spill registers until we
3788 * get an allocation.
3789 */
3790 while (!assign_regs(true)) {
3791 if (failed)
3792 break;
3793 }
3794 }
3795
3796 /* This must come after all optimization and register allocation, since
3797 * it inserts dead code that happens to have side effects, and it does
3798 * so based on the actual physical registers in use.
3799 */
3800 insert_gen4_send_dependency_workarounds();
3801
3802 if (failed)
3803 return;
3804
3805 if (!allocated_without_spills)
3806 schedule_instructions(SCHEDULE_POST);
3807
3808 if (last_scratch > 0)
3809 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3810 }
3811
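/**
 * Generate and optimize VS code from the GLSL IR.
 *
 * Builds the LIR by visiting the linked shader's IR, emits the URB writes
 * for the vertex outputs, then runs the optimizer, CURB/URB setup and
 * register allocation.  Returns false if any step failed.
 */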
3812 bool
3813 fs_visitor::run_vs()
3814 {
3815 assert(stage == MESA_SHADER_VERTEX);
3816
3817 assign_common_binding_table_offsets(0);
3818 setup_vs_payload();
3819
3820 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3821 emit_shader_time_begin();
3822
3823 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3824 base_ir = ir;
3825 this->result = reg_undef;
3826 ir->accept(this);
3827 }
3828 base_ir = NULL;
3829 if (failed)
3830 return false;
3831
3832 emit_urb_writes();
3833
3834 calculate_cfg();
3835
3836 optimize();
3837
3838 assign_curb_setup();
3839 assign_vs_urb_setup();
3840
3841 fixup_3src_null_dest();
3842 allocate_registers();
3843
3844 return !failed;
3845 }
3846
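/**
 * Generate and optimize FS code for the current dispatch width.
 *
 * Sets up the binding table and thread payload, emits interpolation and
 * discard setup, translates the shader (GLSL IR, NIR when INTEL_USE_NIR is
 * set, or ARB fragment program code), appends the framebuffer writes, and
 * then optimizes and register-allocates the result.  Returns false on
 * failure.
 */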
3847 bool
3848 fs_visitor::run_fs()
3849 {
3850 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3851 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3852
3853 assert(stage == MESA_SHADER_FRAGMENT);
3854
3855 sanity_param_count = prog->Parameters->NumParameters;
3856
3857 assign_binding_table_offsets();
3858
3859 if (brw->gen >= 6)
3860 setup_payload_gen6();
3861 else
3862 setup_payload_gen4();
3863
3864 if (0) {
3865 emit_dummy_fs();
3866 } else if (brw->use_rep_send && dispatch_width == 16) {
3867 emit_repclear_shader();
3868 } else {
3869 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3870 emit_shader_time_begin();
3871
3872 calculate_urb_setup();
3873 if (prog->InputsRead > 0) {
3874 if (brw->gen < 6)
3875 emit_interpolation_setup_gen4();
3876 else
3877 emit_interpolation_setup_gen6();
3878 }
3879
3880 /* We handle discards by keeping track of the still-live pixels in f0.1.
3881 * Initialize it with the dispatched pixels.
3882 */
3883 if (wm_prog_data->uses_kill) {
3884 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3885 discard_init->flag_subreg = 1;
3886 }
3887
3888 /* Generate FS IR for main(). (The visitor only descends into
3889 * functions called "main".)
3890 */
3891 if (shader) {
3892 if (getenv("INTEL_USE_NIR") != NULL) {
3893 emit_nir_code();
3894 } else {
3895 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3896 base_ir = ir;
3897 this->result = reg_undef;
3898 ir->accept(this);
3899 }
3900 }
3901 } else {
3902 emit_fragment_program_code();
3903 }
3904 base_ir = NULL;
3905 if (failed)
3906 return false;
3907
3908 emit(FS_OPCODE_PLACEHOLDER_HALT);
3909
3910 if (wm_key->alpha_test_func)
3911 emit_alpha_test();
3912
3913 emit_fb_writes();
3914
3915 calculate_cfg();
3916
3917 optimize();
3918
3919 assign_curb_setup();
3920 assign_urb_setup();
3921
3922 fixup_3src_null_dest();
3923 allocate_registers();
3924
3925 if (failed)
3926 return false;
3927 }
3928
3929 if (dispatch_width == 8)
3930 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3931 else
3932 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3933
3934 /* If any state parameters were appended, then ParameterValues could have
3935 * been realloced, in which case the driver uniform storage set up by
3936 * _mesa_associate_uniform_storage() would point to freed memory. Make
3937 * sure that didn't happen.
3938 */
3939 assert(sanity_param_count == prog->Parameters->NumParameters);
3940
3941 return !failed;
3942 }
3943
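/**
 * Compile a fragment shader to native code.
 *
 * A SIMD8 compile is always attempted first; on Gen5+ a SIMD16 compile is
 * tried as well unless it is disabled or unsupported.  When both succeed the
 * two programs are emitted back to back, with prog_offset_16 locating the
 * SIMD16 variant.  Returns the assembly (and its size through
 * final_assembly_size), or NULL if compilation failed.
 */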
3944 const unsigned *
3945 brw_wm_fs_emit(struct brw_context *brw,
3946 void *mem_ctx,
3947 const struct brw_wm_prog_key *key,
3948 struct brw_wm_prog_data *prog_data,
3949 struct gl_fragment_program *fp,
3950 struct gl_shader_program *prog,
3951 unsigned *final_assembly_size)
3952 {
3953 bool start_busy = false;
3954 double start_time = 0;
3955
3956 if (unlikely(brw->perf_debug)) {
3957 start_busy = (brw->batch.last_bo &&
3958 drm_intel_bo_busy(brw->batch.last_bo));
3959 start_time = get_time();
3960 }
3961
3962 struct brw_shader *shader = NULL;
3963 if (prog)
3964 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3965
3966 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3967 brw_dump_ir("fragment", prog, shader ? &shader->base : NULL, &fp->Base);
3968
3969 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3970 */
3971 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3972 if (!v.run_fs()) {
3973 if (prog) {
3974 prog->LinkStatus = false;
3975 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3976 }
3977
3978 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3979 v.fail_msg);
3980
3981 return NULL;
3982 }
3983
3984 cfg_t *simd16_cfg = NULL;
3985 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3986 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3987 brw->use_rep_send)) {
3988 if (!v.simd16_unsupported) {
3989 /* Try a SIMD16 compile */
3990 v2.import_uniforms(&v);
3991 if (!v2.run_fs()) {
3992 perf_debug("SIMD16 shader failed to compile, falling back to "
3993 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3994 } else {
3995 simd16_cfg = v2.cfg;
3996 }
3997 } else {
3998 perf_debug("SIMD16 shader unsupported, falling back to "
3999 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4000 }
4001 }
4002
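   /* Drop the SIMD8 program entirely when SIMD8 has been disabled (via
    * INTEL_DEBUG or the driver's no_simd8 flag) and a SIMD16 program is
    * available; otherwise keep it and note that in prog_data.
    */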
4003 cfg_t *simd8_cfg;
4004 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4005 if (no_simd8 && simd16_cfg) {
4006 simd8_cfg = NULL;
4007 prog_data->no_8 = true;
4008 } else {
4009 simd8_cfg = v.cfg;
4010 prog_data->no_8 = false;
4011 }
4012
4013 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4014 &fp->Base, v.runtime_check_aads_emit, "FS");
4015
4016 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4017 char *name;
4018 if (prog)
4019 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4020 prog->Label ? prog->Label : "unnamed",
4021 prog->Name);
4022 else
4023 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4024
4025 g.enable_debug(name);
4026 }
4027
4028 if (simd8_cfg)
4029 g.generate_code(simd8_cfg, 8);
4030 if (simd16_cfg)
4031 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4032
4033 if (unlikely(brw->perf_debug) && shader) {
4034 if (shader->compiled_once)
4035 brw_wm_debug_recompile(brw, prog, key);
4036 shader->compiled_once = true;
4037
4038 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4039 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4040 (get_time() - start_time) * 1000);
4041 }
4042 }
4043
4044 return g.get_assembly(final_assembly_size);
4045 }
4046
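/**
 * Precompile the fragment program with a guessed program key.
 *
 * This runs when the program is linked, before any draw-time state is known,
 * so the key is filled with the most likely values: depth test enabled on
 * Gen4-5, shadow-sampler swizzles where there is no shader channel select,
 * the number of color outputs from OutputsWritten, and so on.
 */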
4047 extern "C" bool
4048 brw_fs_precompile(struct gl_context *ctx,
4049 struct gl_shader_program *shader_prog,
4050 struct gl_program *prog)
4051 {
4052 struct brw_context *brw = brw_context(ctx);
4053 struct brw_wm_prog_key key;
4054
4055 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4056 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4057 bool program_uses_dfdy = fp->UsesDFdy;
4058
4059 memset(&key, 0, sizeof(key));
4060
4061 if (brw->gen < 6) {
4062 if (fp->UsesKill)
4063 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4064
4065 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4066 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4067
4068 /* Just assume depth testing. */
4069 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4070 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4071 }
4072
4073 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4074 BRW_FS_VARYING_INPUT_MASK) > 16)
4075 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4076
4077 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4078 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4079 for (unsigned i = 0; i < sampler_count; i++) {
4080 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4081 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4082 key.tex.swizzles[i] =
4083 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4084 } else {
4085 /* Color sampler: assume no swizzling. */
4086 key.tex.swizzles[i] = SWIZZLE_XYZW;
4087 }
4088 }
4089
4090 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4091 key.drawable_height = ctx->DrawBuffer->Height;
4092 }
4093
4094 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4095 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4096 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4097
4098 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4099 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4100 key.nr_color_regions > 1;
4101 }
4102
4103 key.program_string_id = bfp->id;
4104
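   /* do_wm_prog() updates the context's WM program state as a side effect;
    * save and restore it so the precompile doesn't disturb whatever program
    * is currently bound.
    */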
4105 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4106 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4107
4108 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
4109
4110 brw->wm.base.prog_offset = old_prog_offset;
4111 brw->wm.prog_data = old_prog_data;
4112
4113 return success;
4114 }