i965/fs: Fix lower_load_payload() to take into account non-zero reg_offset.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/hash_table.h"
43 #include "brw_context.h"
44 #include "brw_eu.h"
45 #include "brw_wm.h"
46 }
47 #include "brw_fs.h"
48 #include "brw_cfg.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53 #include "program/sampler.h"
54
55 void
56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
57 const fs_reg *src, unsigned sources)
58 {
59 memset(this, 0, sizeof(*this));
60
61 this->src = new fs_reg[MAX2(sources, 3)];
62 for (unsigned i = 0; i < sources; i++)
63 this->src[i] = src[i];
64
65 this->opcode = opcode;
66 this->dst = dst;
67 this->sources = sources;
68 this->exec_size = exec_size;
69
70 assert(dst.file != IMM && dst.file != UNIFORM);
71
72 /* If exec_size == 0, try to guess it from the registers. Since all
73 * manner of things may use hardware registers, we first try to guess
74 * based on GRF registers. If this fails, we will go ahead and take the
75 * width from the destination register.
76 */
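/* Concrete illustration of the guessing rule above: an instruction built
 * with exec_size == 0 and a SIMD16 GRF destination (dst.width == 16)
 * simply inherits exec_size 16 from the destination width; only when the
 * destination is not a GRF do the sources get consulted.
 */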
77 if (this->exec_size == 0) {
78 if (dst.file == GRF) {
79 this->exec_size = dst.width;
80 } else {
81 for (unsigned i = 0; i < sources; ++i) {
82 if (src[i].file != GRF && src[i].file != ATTR)
83 continue;
84
85 if (this->exec_size <= 1)
86 this->exec_size = src[i].width;
87 assert(src[i].width == 1 || src[i].width == this->exec_size);
88 }
89 }
90
91 if (this->exec_size == 0 && dst.file != BAD_FILE)
92 this->exec_size = dst.width;
93 }
94 assert(this->exec_size != 0);
95
96 for (unsigned i = 0; i < sources; ++i) {
97 switch (this->src[i].file) {
98 case BAD_FILE:
99 this->src[i].effective_width = 8;
100 break;
101 case GRF:
102 case HW_REG:
103 case ATTR:
104 assert(this->src[i].width > 0);
105 if (this->src[i].width == 1) {
106 this->src[i].effective_width = this->exec_size;
107 } else {
108 this->src[i].effective_width = this->src[i].width;
109 }
110 break;
111 case IMM:
112 case UNIFORM:
113 this->src[i].effective_width = this->exec_size;
114 break;
115 default:
116 unreachable("Invalid source register file");
117 }
118 }
119 this->dst.effective_width = this->exec_size;
120
121 this->conditional_mod = BRW_CONDITIONAL_NONE;
122
123 /* This will be the case for almost all instructions. */
124 switch (dst.file) {
125 case GRF:
126 case HW_REG:
127 case MRF:
128 case ATTR:
129 this->regs_written =
130 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
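/* Worked example of the formula above (illustrative arithmetic only): a
 * SIMD16 float destination with stride 1 gives
 * DIV_ROUND_UP(MAX2(16 * 1, 1) * 4, 32) == 2, i.e. two GRFs written,
 * while a scalar float destination still rounds up to one full GRF.
 */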
131 break;
132 case BAD_FILE:
133 this->regs_written = 0;
134 break;
135 case IMM:
136 case UNIFORM:
137 unreachable("Invalid destination register file");
138 default:
139 unreachable("Invalid register file");
140 }
141
142 this->writes_accumulator = false;
143 }
144
145 fs_inst::fs_inst()
146 {
147 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
148 }
149
150 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
151 {
152 init(opcode, exec_size, reg_undef, NULL, 0);
153 }
154
155 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
156 {
157 init(opcode, 0, dst, NULL, 0);
158 }
159
160 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
161 const fs_reg &src0)
162 {
163 const fs_reg src[1] = { src0 };
164 init(opcode, exec_size, dst, src, 1);
165 }
166
167 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
168 {
169 const fs_reg src[1] = { src0 };
170 init(opcode, 0, dst, src, 1);
171 }
172
173 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
174 const fs_reg &src0, const fs_reg &src1)
175 {
176 const fs_reg src[2] = { src0, src1 };
177 init(opcode, exec_size, dst, src, 2);
178 }
179
180 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
181 const fs_reg &src1)
182 {
183 const fs_reg src[2] = { src0, src1 };
184 init(opcode, 0, dst, src, 2);
185 }
186
187 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
188 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
189 {
190 const fs_reg src[3] = { src0, src1, src2 };
191 init(opcode, exec_size, dst, src, 3);
192 }
193
194 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
195 const fs_reg &src1, const fs_reg &src2)
196 {
197 const fs_reg src[3] = { src0, src1, src2 };
198 init(opcode, 0, dst, src, 3);
199 }
200
201 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
202 const fs_reg src[], unsigned sources)
203 {
204 init(opcode, 0, dst, src, sources);
205 }
206
207 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
208 const fs_reg src[], unsigned sources)
209 {
210 init(opcode, exec_width, dst, src, sources);
211 }
212
213 fs_inst::fs_inst(const fs_inst &that)
214 {
215 memcpy(this, &that, sizeof(that));
216
217 this->src = new fs_reg[MAX2(that.sources, 3)];
218
219 for (unsigned i = 0; i < that.sources; i++)
220 this->src[i] = that.src[i];
221 }
222
223 fs_inst::~fs_inst()
224 {
225 delete[] this->src;
226 }
227
228 void
229 fs_inst::resize_sources(uint8_t num_sources)
230 {
231 if (this->sources != num_sources) {
232 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
233
234 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
235 src[i] = this->src[i];
236
237 delete[] this->src;
238 this->src = src;
239 this->sources = num_sources;
240 }
241 }
242
243 #define ALU1(op) \
244 fs_inst * \
245 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
246 { \
247 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
248 }
249
250 #define ALU2(op) \
251 fs_inst * \
252 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
253 const fs_reg &src1) \
254 { \
255 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
256 }
257
258 #define ALU2_ACC(op) \
259 fs_inst * \
260 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
261 const fs_reg &src1) \
262 { \
263 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
264 inst->writes_accumulator = true; \
265 return inst; \
266 }
267
268 #define ALU3(op) \
269 fs_inst * \
270 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
271 const fs_reg &src1, const fs_reg &src2) \
272 { \
273 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
274 }
275
276 ALU1(NOT)
277 ALU1(MOV)
278 ALU1(FRC)
279 ALU1(RNDD)
280 ALU1(RNDE)
281 ALU1(RNDZ)
282 ALU2(ADD)
283 ALU2(MUL)
284 ALU2_ACC(MACH)
285 ALU2(AND)
286 ALU2(OR)
287 ALU2(XOR)
288 ALU2(SHL)
289 ALU2(SHR)
290 ALU2(ASR)
291 ALU3(LRP)
292 ALU1(BFREV)
293 ALU3(BFE)
294 ALU2(BFI1)
295 ALU3(BFI2)
296 ALU1(FBH)
297 ALU1(FBL)
298 ALU1(CBIT)
299 ALU3(MAD)
300 ALU2_ACC(ADDC)
301 ALU2_ACC(SUBB)
302 ALU2(SEL)
303 ALU2(MAC)
304
305 /** Gen4 predicated IF. */
306 fs_inst *
307 fs_visitor::IF(enum brw_predicate predicate)
308 {
309 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
310 inst->predicate = predicate;
311 return inst;
312 }
313
314 /** Gen6 IF with embedded comparison. */
315 fs_inst *
316 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
317 enum brw_conditional_mod condition)
318 {
319 assert(brw->gen == 6);
320 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
321 reg_null_d, src0, src1);
322 inst->conditional_mod = condition;
323 return inst;
324 }
325
326 /**
327 * CMP: Sets the low bit of the destination channels with the result
328 * of the comparison, while the upper bits are undefined, and updates
329 * the flag register with the packed 16 bits of the result.
330 */
331 fs_inst *
332 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
333 enum brw_conditional_mod condition)
334 {
335 fs_inst *inst;
336
337 /* Take the instruction:
338 *
339 * CMP null<d> src0<f> src1<f>
340 *
341 * Original gen4 does type conversion to the destination type before
342 * comparison, producing garbage results for floating point comparisons.
343 *
344 * The destination type doesn't matter on newer generations, so we set the
345 * type to match src0 so we can compact the instruction.
346 */
347 dst.type = src0.type;
348 if (dst.file == HW_REG)
349 dst.fixed_hw_reg.type = dst.type;
350
351 resolve_ud_negate(&src0);
352 resolve_ud_negate(&src1);
353
354 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
355 inst->conditional_mod = condition;
356
357 return inst;
358 }
359
360 fs_inst *
361 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
362 {
363 uint8_t exec_size = dst.width;
364 for (int i = 0; i < sources; ++i) {
365 assert(src[i].width % dst.width == 0);
366 if (src[i].width > exec_size)
367 exec_size = src[i].width;
368 }
369
370 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
371 dst, src, sources);
372 inst->regs_written = 0;
373 for (int i = 0; i < sources; ++i) {
374 /* The LOAD_PAYLOAD instruction only really makes sense if we are
375 * dealing with whole registers. If this ever changes, we can deal
376 * with it later.
377 */
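/* Illustrative sizes under that whole-register assumption: a source with
 * effective_width 8 and a 4-byte type contributes 8 * 4 = 32 bytes, i.e.
 * exactly one GRF; effective_width 16 contributes two GRFs.
 */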
378 int size = inst->src[i].effective_width * type_sz(src[i].type);
379 assert(size % 32 == 0);
380 inst->regs_written += (size + 31) / 32;
381 }
382
383 return inst;
384 }
385
386 exec_list
387 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
388 const fs_reg &surf_index,
389 const fs_reg &varying_offset,
390 uint32_t const_offset)
391 {
392 exec_list instructions;
393 fs_inst *inst;
394
395 /* We have our constant surface use a pitch of 4 bytes, so our index can
396 * be any component of a vector, and then we load 4 contiguous
397 * components starting from that.
398 *
399 * We break down the const_offset to a portion added to the variable
400 * offset and a portion done using reg_offset, which means that if you
401 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
402 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
403 * CSE can later notice that those loads are all the same and eliminate
404 * the redundant ones.
405 */
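/* Rough example of that split (hypothetical value): with const_offset ==
 * 13, the ADD below computes vec4_offset = varying_offset + 12, the pull
 * load fetches the whole vec4 at that offset, and the final MOV selects
 * component 13 & 3 == 1 of the returned data (times `scale`).
 */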
406 fs_reg vec4_offset = vgrf(glsl_type::int_type);
407 instructions.push_tail(ADD(vec4_offset,
408 varying_offset, fs_reg(const_offset & ~3)));
409
410 int scale = 1;
411 if (brw->gen == 4 && dst.width == 8) {
412 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
413 * u, v, r) as parameters, or we can just use the SIMD16 message
414 * consisting of (header, u). We choose the second, at the cost of a
415 * longer return length.
416 */
417 scale = 2;
418 }
419
420 enum opcode op;
421 if (brw->gen >= 7)
422 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
423 else
424 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
425
426 assert(dst.width % 8 == 0);
427 int regs_written = 4 * (dst.width / 8) * scale;
428 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
429 dst.type, dst.width);
430 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
431 inst->regs_written = regs_written;
432 instructions.push_tail(inst);
433
434 if (brw->gen < 7) {
435 inst->base_mrf = 13;
436 inst->header_present = true;
437 if (brw->gen == 4)
438 inst->mlen = 3;
439 else
440 inst->mlen = 1 + dispatch_width / 8;
441 }
442
443 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
444 instructions.push_tail(MOV(dst, result));
445
446 return instructions;
447 }
448
449 /**
450 * A helper for MOV generation for fixing up broken hardware SEND dependency
451 * handling.
452 */
453 fs_inst *
454 fs_visitor::DEP_RESOLVE_MOV(int grf)
455 {
456 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
457
458 inst->ir = NULL;
459 inst->annotation = "send dependency resolve";
460
461    /* The caller always wants this uncompressed (SIMD8), to emit the minimal
462     * extra dependencies and to avoid having to deal with aligning its regs to 2.

463 */
464 inst->exec_size = 8;
465
466 return inst;
467 }
468
469 bool
470 fs_inst::equals(fs_inst *inst) const
471 {
472 return (opcode == inst->opcode &&
473 dst.equals(inst->dst) &&
474 src[0].equals(inst->src[0]) &&
475 src[1].equals(inst->src[1]) &&
476 src[2].equals(inst->src[2]) &&
477 saturate == inst->saturate &&
478 predicate == inst->predicate &&
479 conditional_mod == inst->conditional_mod &&
480 mlen == inst->mlen &&
481 base_mrf == inst->base_mrf &&
482 target == inst->target &&
483 eot == inst->eot &&
484 header_present == inst->header_present &&
485 shadow_compare == inst->shadow_compare &&
486 exec_size == inst->exec_size &&
487 offset == inst->offset);
488 }
489
490 bool
491 fs_inst::overwrites_reg(const fs_reg &reg) const
492 {
493 return (reg.file == dst.file &&
494 reg.reg == dst.reg &&
495 reg.reg_offset >= dst.reg_offset &&
496 reg.reg_offset < dst.reg_offset + regs_written);
497 }
498
499 bool
500 fs_inst::is_send_from_grf() const
501 {
502 switch (opcode) {
503 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
504 case SHADER_OPCODE_SHADER_TIME_ADD:
505 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
506 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
507 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
508 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
509 case SHADER_OPCODE_UNTYPED_ATOMIC:
510 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
511 case SHADER_OPCODE_URB_WRITE_SIMD8:
512 return true;
513 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
514 return src[1].file == GRF;
515 case FS_OPCODE_FB_WRITE:
516 return src[0].file == GRF;
517 default:
518 if (is_tex())
519 return src[0].file == GRF;
520
521 return false;
522 }
523 }
524
525 bool
526 fs_inst::can_do_source_mods(struct brw_context *brw)
527 {
528 if (brw->gen == 6 && is_math())
529 return false;
530
531 if (is_send_from_grf())
532 return false;
533
534 if (!backend_instruction::can_do_source_mods())
535 return false;
536
537 return true;
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Vector float immediate value constructor. */
585 fs_reg::fs_reg(uint8_t vf[4])
586 {
587 init();
588 this->file = IMM;
589 this->type = BRW_REGISTER_TYPE_VF;
590 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
591 }
592
593 /** Vector float immediate value constructor. */
594 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
595 {
596 init();
597 this->file = IMM;
598 this->type = BRW_REGISTER_TYPE_VF;
599 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
600 (vf1 << 8) |
601 (vf2 << 16) |
602 (vf3 << 24);
603 }
604
605 /** Fixed brw_reg. */
606 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
607 {
608 init();
609 this->file = HW_REG;
610 this->fixed_hw_reg = fixed_hw_reg;
611 this->type = fixed_hw_reg.type;
612 this->width = 1 << fixed_hw_reg.width;
613 }
614
615 bool
616 fs_reg::equals(const fs_reg &r) const
617 {
618 return (file == r.file &&
619 reg == r.reg &&
620 reg_offset == r.reg_offset &&
621 subreg_offset == r.subreg_offset &&
622 type == r.type &&
623 negate == r.negate &&
624 abs == r.abs &&
625 !reladdr && !r.reladdr &&
626 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
627 width == r.width &&
628 stride == r.stride);
629 }
630
631 fs_reg &
632 fs_reg::set_smear(unsigned subreg)
633 {
634 assert(file != HW_REG && file != IMM);
635 subreg_offset = subreg * type_sz(type);
636 stride = 0;
637 return *this;
638 }
639
640 bool
641 fs_reg::is_contiguous() const
642 {
643 return stride == 1;
644 }
645
646 int
647 fs_visitor::type_size(const struct glsl_type *type)
648 {
649 unsigned int size, i;
650
651 switch (type->base_type) {
652 case GLSL_TYPE_UINT:
653 case GLSL_TYPE_INT:
654 case GLSL_TYPE_FLOAT:
655 case GLSL_TYPE_BOOL:
656 return type->components();
657 case GLSL_TYPE_ARRAY:
658 return type_size(type->fields.array) * type->length;
659 case GLSL_TYPE_STRUCT:
660 size = 0;
661 for (i = 0; i < type->length; i++) {
662 size += type_size(type->fields.structure[i].type);
663 }
664 return size;
665 case GLSL_TYPE_SAMPLER:
666 /* Samplers take up no register space, since they're baked in at
667 * link time.
668 */
669 return 0;
670 case GLSL_TYPE_ATOMIC_UINT:
671 return 0;
672 case GLSL_TYPE_IMAGE:
673 case GLSL_TYPE_VOID:
674 case GLSL_TYPE_ERROR:
675 case GLSL_TYPE_INTERFACE:
676 case GLSL_TYPE_DOUBLE:
677 unreachable("not reached");
678 }
679
680 return 0;
681 }
682
683 fs_reg
684 fs_visitor::get_timestamp()
685 {
686 assert(brw->gen >= 7);
687
688 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
689 BRW_ARF_TIMESTAMP,
690 0),
691 BRW_REGISTER_TYPE_UD));
692
693 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
694
695 fs_inst *mov = emit(MOV(dst, ts));
696 /* We want to read the 3 fields we care about even if it's not enabled in
697 * the dispatch.
698 */
699 mov->force_writemask_all = true;
700
701 /* The caller wants the low 32 bits of the timestamp. Since it's running
702     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
703 * which is plenty of time for our purposes. It is identical across the
704 * EUs, but since it's tracking GPU core speed it will increment at a
705 * varying rate as render P-states change.
706 *
707 * The caller could also check if render P-states have changed (or anything
708 * else that might disrupt timing) by setting smear to 2 and checking if
709 * that field is != 0.
710 */
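/* Rough numbers behind the "~3 seconds" above: a 32-bit counter ticking at
 * roughly 1.2 GHz wraps after about 2^32 / 1.2e9 = ~3.6 seconds.
 */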
711 dst.set_smear(0);
712
713 return dst;
714 }
715
716 void
717 fs_visitor::emit_shader_time_begin()
718 {
719 current_annotation = "shader time start";
720 shader_start_time = get_timestamp();
721 }
722
723 void
724 fs_visitor::emit_shader_time_end()
725 {
726 current_annotation = "shader time end";
727
728 enum shader_time_shader_type type, written_type, reset_type;
729 switch (stage) {
730 case MESA_SHADER_VERTEX:
731 type = ST_VS;
732 written_type = ST_VS_WRITTEN;
733 reset_type = ST_VS_RESET;
734 break;
735 case MESA_SHADER_GEOMETRY:
736 type = ST_GS;
737 written_type = ST_GS_WRITTEN;
738 reset_type = ST_GS_RESET;
739 break;
740 case MESA_SHADER_FRAGMENT:
741 if (dispatch_width == 8) {
742 type = ST_FS8;
743 written_type = ST_FS8_WRITTEN;
744 reset_type = ST_FS8_RESET;
745 } else {
746 assert(dispatch_width == 16);
747 type = ST_FS16;
748 written_type = ST_FS16_WRITTEN;
749 reset_type = ST_FS16_RESET;
750 }
751 break;
752 default:
753 unreachable("fs_visitor::emit_shader_time_end missing code");
754 }
755
756 fs_reg shader_end_time = get_timestamp();
757
758 /* Check that there weren't any timestamp reset events (assuming these
759 * were the only two timestamp reads that happened).
760 */
761 fs_reg reset = shader_end_time;
762 reset.set_smear(2);
763 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
764 test->conditional_mod = BRW_CONDITIONAL_Z;
765 emit(IF(BRW_PREDICATE_NORMAL));
766
767 fs_reg start = shader_start_time;
768 start.negate = true;
769 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
770 emit(ADD(diff, start, shader_end_time));
771
772 /* If there were no instructions between the two timestamp gets, the diff
773 * is 2 cycles. Remove that overhead, so I can forget about that when
774 * trying to determine the time taken for single instructions.
775 */
776 emit(ADD(diff, diff, fs_reg(-2u)));
777
778 emit_shader_time_write(type, diff);
779 emit_shader_time_write(written_type, fs_reg(1u));
780 emit(BRW_OPCODE_ELSE);
781 emit_shader_time_write(reset_type, fs_reg(1u));
782 emit(BRW_OPCODE_ENDIF);
783 }
784
785 void
786 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
787 fs_reg value)
788 {
789 int shader_time_index =
790 brw_get_shader_time_index(brw, shader_prog, prog, type);
791 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
792
793 fs_reg payload;
794 if (dispatch_width == 8)
795 payload = vgrf(glsl_type::uvec2_type);
796 else
797 payload = vgrf(glsl_type::uint_type);
798
799 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
800 fs_reg(), payload, offset, value));
801 }
802
803 void
804 fs_visitor::vfail(const char *format, va_list va)
805 {
806 char *msg;
807
808 if (failed)
809 return;
810
811 failed = true;
812
813 msg = ralloc_vasprintf(mem_ctx, format, va);
814 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
815
816 this->fail_msg = msg;
817
818 if (debug_enabled) {
819 fprintf(stderr, "%s", msg);
820 }
821 }
822
823 void
824 fs_visitor::fail(const char *format, ...)
825 {
826 va_list va;
827
828 va_start(va, format);
829 vfail(format, va);
830 va_end(va);
831 }
832
833 /**
834 * Mark this program as impossible to compile in SIMD16 mode.
835 *
836 * During the SIMD8 compile (which happens first), we can detect and flag
837 * things that are unsupported in SIMD16 mode, so the compiler can skip
838 * the SIMD16 compile altogether.
839 *
840 * During a SIMD16 compile (if one happens anyway), this just calls fail().
841 */
842 void
843 fs_visitor::no16(const char *format, ...)
844 {
845 va_list va;
846
847 va_start(va, format);
848
849 if (dispatch_width == 16) {
850 vfail(format, va);
851 } else {
852 simd16_unsupported = true;
853
854 if (brw->perf_debug) {
855 if (no16_msg)
856 ralloc_vasprintf_append(&no16_msg, format, va);
857 else
858 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
859 }
860 }
861
862 va_end(va);
863 }
864
865 fs_inst *
866 fs_visitor::emit(enum opcode opcode)
867 {
868 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
869 }
870
871 fs_inst *
872 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
873 {
874 return emit(new(mem_ctx) fs_inst(opcode, dst));
875 }
876
877 fs_inst *
878 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
879 {
880 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
881 }
882
883 fs_inst *
884 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
885 const fs_reg &src1)
886 {
887 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
888 }
889
890 fs_inst *
891 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
892 const fs_reg &src1, const fs_reg &src2)
893 {
894 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
895 }
896
897 fs_inst *
898 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
899 fs_reg src[], int sources)
900 {
901 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
902 }
903
904 /**
905 * Returns true if the instruction has a flag that means it won't
906 * update an entire destination register.
907 *
908 * For example, dead code elimination and live variable analysis want to know
909 * when a write to a variable screens off any preceding values that were in
910 * it.
911 */
912 bool
913 fs_inst::is_partial_write() const
914 {
915 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
916 (this->dst.width * type_sz(this->dst.type)) < 32 ||
917 !this->dst.is_contiguous());
918 }
919
920 int
921 fs_inst::regs_read(int arg) const
922 {
923 if (is_tex() && arg == 0 && src[0].file == GRF) {
924 return mlen;
925 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
926 return mlen;
927 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
928 return mlen;
929 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
930 return mlen;
931 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
932 return mlen;
933 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
934 return mlen;
935 }
936
937 switch (src[arg].file) {
938 case BAD_FILE:
939 case UNIFORM:
940 case IMM:
941 return 1;
942 case GRF:
943 case HW_REG:
944 if (src[arg].stride == 0) {
945 return 1;
946 } else {
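/* Worked example for the size computation below (illustrative): a source
 * with width 16, stride 2 and a 4-byte type spans 16 * 2 * 4 = 128 bytes,
 * i.e. (128 + 31) / 32 = 4 registers read.
 */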
947 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
948 return (size + 31) / 32;
949 }
950 case MRF:
951 unreachable("MRF registers are not allowed as sources");
952 default:
953 unreachable("Invalid register file");
954 }
955 }
956
957 bool
958 fs_inst::reads_flag() const
959 {
960 return predicate;
961 }
962
963 bool
964 fs_inst::writes_flag() const
965 {
966 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
967 opcode != BRW_OPCODE_IF &&
968 opcode != BRW_OPCODE_WHILE)) ||
969 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
970 }
971
972 /**
973 * Returns how many MRFs an FS opcode will write over.
974 *
975 * Note that this is not the 0 or 1 implied writes in an actual gen
976 * instruction -- the FS opcodes often generate MOVs in addition.
977 */
978 int
979 fs_visitor::implied_mrf_writes(fs_inst *inst)
980 {
981 if (inst->mlen == 0)
982 return 0;
983
984 if (inst->base_mrf == -1)
985 return 0;
986
987 switch (inst->opcode) {
988 case SHADER_OPCODE_RCP:
989 case SHADER_OPCODE_RSQ:
990 case SHADER_OPCODE_SQRT:
991 case SHADER_OPCODE_EXP2:
992 case SHADER_OPCODE_LOG2:
993 case SHADER_OPCODE_SIN:
994 case SHADER_OPCODE_COS:
995 return 1 * dispatch_width / 8;
996 case SHADER_OPCODE_POW:
997 case SHADER_OPCODE_INT_QUOTIENT:
998 case SHADER_OPCODE_INT_REMAINDER:
999 return 2 * dispatch_width / 8;
1000 case SHADER_OPCODE_TEX:
1001 case FS_OPCODE_TXB:
1002 case SHADER_OPCODE_TXD:
1003 case SHADER_OPCODE_TXF:
1004 case SHADER_OPCODE_TXF_CMS:
1005 case SHADER_OPCODE_TXF_MCS:
1006 case SHADER_OPCODE_TG4:
1007 case SHADER_OPCODE_TG4_OFFSET:
1008 case SHADER_OPCODE_TXL:
1009 case SHADER_OPCODE_TXS:
1010 case SHADER_OPCODE_LOD:
1011 return 1;
1012 case FS_OPCODE_FB_WRITE:
1013 return 2;
1014 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1015 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1016 return 1;
1017 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1018 return inst->mlen;
1019 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1020 return 2;
1021 case SHADER_OPCODE_UNTYPED_ATOMIC:
1022 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1023 case SHADER_OPCODE_URB_WRITE_SIMD8:
1024 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1025 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1026 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1027 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1028 return 0;
1029 default:
1030 unreachable("not reached");
1031 }
1032 }
1033
1034 fs_reg
1035 fs_visitor::vgrf(const glsl_type *const type)
1036 {
1037 int reg_width = dispatch_width / 8;
1038 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1039 brw_type_for_base_type(type), dispatch_width);
1040 }
1041
1042 fs_reg
1043 fs_visitor::vgrf(int num_components)
1044 {
1045 int reg_width = dispatch_width / 8;
1046 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1047 BRW_REGISTER_TYPE_F, dispatch_width);
1048 }
1049
1050 /** Fixed HW reg constructor. */
1051 fs_reg::fs_reg(enum register_file file, int reg)
1052 {
1053 init();
1054 this->file = file;
1055 this->reg = reg;
1056 this->type = BRW_REGISTER_TYPE_F;
1057
1058 switch (file) {
1059 case UNIFORM:
1060 this->width = 1;
1061 break;
1062 default:
1063 this->width = 8;
1064 }
1065 }
1066
1067 /** Fixed HW reg constructor. */
1068 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1069 {
1070 init();
1071 this->file = file;
1072 this->reg = reg;
1073 this->type = type;
1074
1075 switch (file) {
1076 case UNIFORM:
1077 this->width = 1;
1078 break;
1079 default:
1080 this->width = 8;
1081 }
1082 }
1083
1084 /** Fixed HW reg constructor. */
1085 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1086 uint8_t width)
1087 {
1088 init();
1089 this->file = file;
1090 this->reg = reg;
1091 this->type = type;
1092 this->width = width;
1093 }
1094
1095 fs_reg *
1096 fs_visitor::variable_storage(ir_variable *var)
1097 {
1098 return (fs_reg *)hash_table_find(this->variable_ht, var);
1099 }
1100
1101 void
1102 import_uniforms_callback(const void *key,
1103 void *data,
1104 void *closure)
1105 {
1106 struct hash_table *dst_ht = (struct hash_table *)closure;
1107 const fs_reg *reg = (const fs_reg *)data;
1108
1109 if (reg->file != UNIFORM)
1110 return;
1111
1112 hash_table_insert(dst_ht, data, key);
1113 }
1114
1115 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1116  * This brings in those uniform definitions.
1117 */
1118 void
1119 fs_visitor::import_uniforms(fs_visitor *v)
1120 {
1121 hash_table_call_foreach(v->variable_ht,
1122 import_uniforms_callback,
1123 variable_ht);
1124 this->push_constant_loc = v->push_constant_loc;
1125 this->pull_constant_loc = v->pull_constant_loc;
1126 this->uniforms = v->uniforms;
1127 this->param_size = v->param_size;
1128 }
1129
1130 /* Our support for uniforms is piggy-backed on the struct
1131 * gl_fragment_program, because that's where the values actually
1132 * get stored, rather than in some global gl_shader_program uniform
1133 * store.
1134 */
1135 void
1136 fs_visitor::setup_uniform_values(ir_variable *ir)
1137 {
1138 int namelen = strlen(ir->name);
1139
1140 /* The data for our (non-builtin) uniforms is stored in a series of
1141 * gl_uniform_driver_storage structs for each subcomponent that
1142 * glGetUniformLocation() could name. We know it's been set up in the same
1143 * order we'd walk the type, so walk the list of storage and find anything
1144  * with our name, or any sub-component whose name has our name as a prefix.
1145 */
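/* For instance (hypothetical names): a uniform "lights" matches storage
 * entries named "lights", "lights[2]" or "lights.position", but not
 * "lightscale", because the character after the matched prefix must be
 * '\0', '.' or '[' (see the strncmp test below).
 */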
1146 unsigned params_before = uniforms;
1147 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1148 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1149
1150 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1151 (storage->name[namelen] != 0 &&
1152 storage->name[namelen] != '.' &&
1153 storage->name[namelen] != '[')) {
1154 continue;
1155 }
1156
1157 unsigned slots = storage->type->component_slots();
1158 if (storage->array_elements)
1159 slots *= storage->array_elements;
1160
1161 for (unsigned i = 0; i < slots; i++) {
1162 stage_prog_data->param[uniforms++] = &storage->storage[i];
1163 }
1164 }
1165
1166 /* Make sure we actually initialized the right amount of stuff here. */
1167 assert(params_before + ir->type->component_slots() == uniforms);
1168 (void)params_before;
1169 }
1170
1171
1172 /* Our support for builtin uniforms is even scarier than non-builtin.
1173 * It sits on top of the PROG_STATE_VAR parameters that are
1174 * automatically updated from GL context state.
1175 */
1176 void
1177 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1178 {
1179 const ir_state_slot *const slots = ir->get_state_slots();
1180 assert(slots != NULL);
1181
1182 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1183 /* This state reference has already been setup by ir_to_mesa, but we'll
1184 * get the same index back here.
1185 */
1186 int index = _mesa_add_state_reference(this->prog->Parameters,
1187 (gl_state_index *)slots[i].tokens);
1188
1189 /* Add each of the unique swizzles of the element as a parameter.
1190 * This'll end up matching the expected layout of the
1191 * array/matrix/structure we're trying to fill in.
1192 */
1193 int last_swiz = -1;
1194 for (unsigned int j = 0; j < 4; j++) {
1195 int swiz = GET_SWZ(slots[i].swizzle, j);
1196 if (swiz == last_swiz)
1197 break;
1198 last_swiz = swiz;
1199
1200 stage_prog_data->param[uniforms++] =
1201 &prog->Parameters->ParameterValues[index][swiz];
1202 }
1203 }
1204 }
1205
1206 fs_reg *
1207 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1208 bool origin_upper_left)
1209 {
1210 assert(stage == MESA_SHADER_FRAGMENT);
1211 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1212 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1213 fs_reg wpos = *reg;
1214 bool flip = !origin_upper_left ^ key->render_to_fbo;
1215
1216 /* gl_FragCoord.x */
1217 if (pixel_center_integer) {
1218 emit(MOV(wpos, this->pixel_x));
1219 } else {
1220 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1221 }
1222 wpos = offset(wpos, 1);
1223
1224 /* gl_FragCoord.y */
1225 if (!flip && pixel_center_integer) {
1226 emit(MOV(wpos, this->pixel_y));
1227 } else {
1228 fs_reg pixel_y = this->pixel_y;
1229 float offset = (pixel_center_integer ? 0.0 : 0.5);
1230
1231 if (flip) {
1232 pixel_y.negate = true;
1233 offset += key->drawable_height - 1.0;
1234 }
1235
1236 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1237 }
1238 wpos = offset(wpos, 1);
1239
1240 /* gl_FragCoord.z */
1241 if (brw->gen >= 6) {
1242 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1243 } else {
1244 emit(FS_OPCODE_LINTERP, wpos,
1245 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1246 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1247 interp_reg(VARYING_SLOT_POS, 2));
1248 }
1249 wpos = offset(wpos, 1);
1250
1251 /* gl_FragCoord.w: Already set up in emit_interpolation */
1252 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1253
1254 return reg;
1255 }
1256
1257 fs_inst *
1258 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1259 glsl_interp_qualifier interpolation_mode,
1260 bool is_centroid, bool is_sample)
1261 {
1262 brw_wm_barycentric_interp_mode barycoord_mode;
1263 if (brw->gen >= 6) {
1264 if (is_centroid) {
1265 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1266 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1267 else
1268 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1269 } else if (is_sample) {
1270 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1271 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1272 else
1273 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1274 } else {
1275 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1276 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1277 else
1278 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1279 }
1280 } else {
1281 /* On Ironlake and below, there is only one interpolation mode.
1282 * Centroid interpolation doesn't mean anything on this hardware --
1283 * there is no multisampling.
1284 */
1285 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1286 }
1287 return emit(FS_OPCODE_LINTERP, attr,
1288 this->delta_x[barycoord_mode],
1289 this->delta_y[barycoord_mode], interp);
1290 }
1291
1292 void
1293 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1294 const glsl_type *type,
1295 glsl_interp_qualifier interpolation_mode,
1296 int location, bool mod_centroid,
1297 bool mod_sample)
1298 {
1299 attr.type = brw_type_for_base_type(type->get_scalar_type());
1300
1301 assert(stage == MESA_SHADER_FRAGMENT);
1302 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1303 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1304
1305 unsigned int array_elements;
1306
1307 if (type->is_array()) {
1308 array_elements = type->length;
1309 if (array_elements == 0) {
1310 fail("dereferenced array '%s' has length 0\n", name);
1311 }
1312 type = type->fields.array;
1313 } else {
1314 array_elements = 1;
1315 }
1316
1317 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1318 bool is_gl_Color =
1319 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1320 if (key->flat_shade && is_gl_Color) {
1321 interpolation_mode = INTERP_QUALIFIER_FLAT;
1322 } else {
1323 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1324 }
1325 }
1326
1327 for (unsigned int i = 0; i < array_elements; i++) {
1328 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1329 if (prog_data->urb_setup[location] == -1) {
1330 /* If there's no incoming setup data for this slot, don't
1331 * emit interpolation for it.
1332 */
1333 attr = offset(attr, type->vector_elements);
1334 location++;
1335 continue;
1336 }
1337
1338 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1339 /* Constant interpolation (flat shading) case. The SF has
1340 * handed us defined values in only the constant offset
1341 * field of the setup reg.
1342 */
1343 for (unsigned int k = 0; k < type->vector_elements; k++) {
1344 struct brw_reg interp = interp_reg(location, k);
1345 interp = suboffset(interp, 3);
1346 interp.type = attr.type;
1347 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1348 attr = offset(attr, 1);
1349 }
1350 } else {
1351 /* Smooth/noperspective interpolation case. */
1352 for (unsigned int k = 0; k < type->vector_elements; k++) {
1353 struct brw_reg interp = interp_reg(location, k);
1354 if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1355 /* Get the pixel/sample mask into f0 so that we know
1356 * which pixels are lit. Then, for each channel that is
1357 * unlit, replace the centroid data with non-centroid
1358 * data.
1359 */
1360 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1361
1362 fs_inst *inst;
1363 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1364 false, false);
1365 inst->predicate = BRW_PREDICATE_NORMAL;
1366 inst->predicate_inverse = true;
1367 if (brw->has_pln)
1368 inst->no_dd_clear = true;
1369
1370 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1371 mod_centroid && !key->persample_shading,
1372 mod_sample || key->persample_shading);
1373 inst->predicate = BRW_PREDICATE_NORMAL;
1374 inst->predicate_inverse = false;
1375 if (brw->has_pln)
1376 inst->no_dd_check = true;
1377
1378 } else {
1379 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1380 mod_centroid && !key->persample_shading,
1381 mod_sample || key->persample_shading);
1382 }
1383 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1384 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1385 }
1386 attr = offset(attr, 1);
1387 }
1388
1389 }
1390 location++;
1391 }
1392 }
1393 }
1394
1395 fs_reg *
1396 fs_visitor::emit_frontfacing_interpolation()
1397 {
1398 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1399
1400 if (brw->gen >= 6) {
1401 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1402 * a boolean result from this (~0/true or 0/false).
1403 *
1404 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1405 * this task in only one instruction:
1406 * - a negation source modifier will flip the bit; and
1407 * - a W -> D type conversion will sign extend the bit into the high
1408 * word of the destination.
1409 *
1410 * An ASR 15 fills the low word of the destination.
1411 */
1412 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1413 g0.negate = true;
1414
1415 emit(ASR(*reg, g0, fs_reg(15)));
1416 } else {
1417 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1418 * a boolean result from this (1/true or 0/false).
1419 *
1420 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1421 * the negation source modifier to flip it. Unfortunately the SHR
1422 * instruction only operates on UD (or D with an abs source modifier)
1423 * sources without negation.
1424 *
1425 * Instead, use ASR (which will give ~0/true or 0/false).
1426 */
1427 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1428 g1_6.negate = true;
1429
1430 emit(ASR(*reg, g1_6, fs_reg(31)));
1431 }
1432
1433 return reg;
1434 }
1435
1436 void
1437 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1438 {
1439 assert(stage == MESA_SHADER_FRAGMENT);
1440 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1441 assert(dst.type == BRW_REGISTER_TYPE_F);
1442
1443 if (key->compute_pos_offset) {
1444 /* Convert int_sample_pos to floating point */
1445 emit(MOV(dst, int_sample_pos));
1446 /* Scale to the range [0, 1] */
1447 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
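/* Illustrative mapping (assuming the payload positions are in 1/16-pixel
 * steps, which is what the 1/16 scale implies): an integer position of 8
 * ends up as 0.5, i.e. the pixel center.
 */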
1448 }
1449 else {
1450       /* From the ARB_sample_shading specification:
1451        * "When rendering to a non-multisample buffer, or if multisample
1452        *  rasterization is disabled, gl_SamplePosition will always be
1453        *  (0.5, 0.5)."
1454 */
1455 emit(MOV(dst, fs_reg(0.5f)));
1456 }
1457 }
1458
1459 fs_reg *
1460 fs_visitor::emit_samplepos_setup()
1461 {
1462 assert(brw->gen >= 6);
1463
1464 this->current_annotation = "compute sample position";
1465 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1466 fs_reg pos = *reg;
1467 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1468 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1469
1470 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1471 * mode will be enabled.
1472 *
1473 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1474 * R31.1:0 Position Offset X/Y for Slot[3:0]
1475 * R31.3:2 Position Offset X/Y for Slot[7:4]
1476 * .....
1477 *
1478 * The X, Y sample positions come in as bytes in thread payload. So, read
1479 * the positions using vstride=16, width=8, hstride=2.
1480 */
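/* With that <16;8,2> byte region, channel i reads payload byte 2*i -- the
 * X values -- and the suboffset(..., 1) reads below shift by one byte to
 * pick up the interleaved Y values.
 */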
1481 struct brw_reg sample_pos_reg =
1482 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1483 BRW_REGISTER_TYPE_B), 16, 8, 2);
1484
1485 if (dispatch_width == 8) {
1486 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1487 } else {
1488 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1489 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1490 ->force_sechalf = true;
1491 }
1492 /* Compute gl_SamplePosition.x */
1493 compute_sample_position(pos, int_sample_x);
1494 pos = offset(pos, 1);
1495 if (dispatch_width == 8) {
1496 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1497 } else {
1498 emit(MOV(half(int_sample_y, 0),
1499 fs_reg(suboffset(sample_pos_reg, 1))));
1500 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1501 ->force_sechalf = true;
1502 }
1503 /* Compute gl_SamplePosition.y */
1504 compute_sample_position(pos, int_sample_y);
1505 return reg;
1506 }
1507
1508 fs_reg *
1509 fs_visitor::emit_sampleid_setup()
1510 {
1511 assert(stage == MESA_SHADER_FRAGMENT);
1512 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1513 assert(brw->gen >= 6);
1514
1515 this->current_annotation = "compute sample id";
1516 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1517
1518 if (key->compute_sample_id) {
1519 fs_reg t1 = vgrf(glsl_type::int_type);
1520 fs_reg t2 = vgrf(glsl_type::int_type);
1521 t2.type = BRW_REGISTER_TYPE_UW;
1522
1523 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1524 * 8x multisampling, subspan 0 will represent sample N (where N
1525 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1526 * 7. We can find the value of N by looking at R0.0 bits 7:6
1527 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1528 * (since samples are always delivered in pairs). That is, we
1529 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1530 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1531 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1532 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1533 * populating a temporary variable with the sequence (0, 1, 2, 3),
1534 * and then reading from it using vstride=1, width=4, hstride=0.
1535 * These computations hold good for 4x multisampling as well.
1536 *
1537 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1538 * the first four slots are sample 0 of subspan 0; the next four
1539 * are sample 1 of subspan 0; the third group is sample 0 of
1540 * subspan 1, and finally sample 1 of subspan 1.
1541 */
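/* Worked example of the math above (hypothetical payload value): if R0.0
 * bits 7:6 (SSPI) read 2, then (R0.0 & 0xc0) >> 5 == 0x80 >> 5 == 4 == N.
 * The v-immediate 0x3210 holds (0, 1, 2, 3); reading it back with
 * vstride=1, width=4, hstride=0 replicates each element four times,
 * giving the (0, 0, 0, 0, 1, 1, 1, 1, ...) sequence that is added to N.
 */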
1542 fs_inst *inst;
1543 inst = emit(BRW_OPCODE_AND, t1,
1544 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1545 fs_reg(0xc0));
1546 inst->force_writemask_all = true;
1547 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1548 inst->force_writemask_all = true;
1549 /* This works for both SIMD8 and SIMD16 */
1550 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1551 inst->force_writemask_all = true;
1552 /* This special instruction takes care of setting vstride=1,
1553 * width=4, hstride=0 of t2 during an ADD instruction.
1554 */
1555 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1556 } else {
1557 /* As per GL_ARB_sample_shading specification:
1558 * "When rendering to a non-multisample buffer, or if multisample
1559 * rasterization is disabled, gl_SampleID will always be zero."
1560 */
1561 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1562 }
1563
1564 return reg;
1565 }
1566
1567 fs_reg
1568 fs_visitor::fix_math_operand(fs_reg src)
1569 {
1570 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1571 * might be able to do better by doing execsize = 1 math and then
1572 * expanding that result out, but we would need to be careful with
1573 * masking.
1574 *
1575 * The hardware ignores source modifiers (negate and abs) on math
1576 * instructions, so we also move to a temp to set those up.
1577 */
1578 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1579 !src.abs && !src.negate)
1580 return src;
1581
1582 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1583 * operands to math
1584 */
1585 if (brw->gen >= 7 && src.file != IMM)
1586 return src;
1587
1588 fs_reg expanded = vgrf(glsl_type::float_type);
1589 expanded.type = src.type;
1590 emit(BRW_OPCODE_MOV, expanded, src);
1591 return expanded;
1592 }
1593
1594 fs_inst *
1595 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1596 {
1597 switch (opcode) {
1598 case SHADER_OPCODE_RCP:
1599 case SHADER_OPCODE_RSQ:
1600 case SHADER_OPCODE_SQRT:
1601 case SHADER_OPCODE_EXP2:
1602 case SHADER_OPCODE_LOG2:
1603 case SHADER_OPCODE_SIN:
1604 case SHADER_OPCODE_COS:
1605 break;
1606 default:
1607 unreachable("not reached: bad math opcode");
1608 }
1609
1610 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1611 * might be able to do better by doing execsize = 1 math and then
1612 * expanding that result out, but we would need to be careful with
1613 * masking.
1614 *
1615 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1616 * instructions, so we also move to a temp to set those up.
1617 */
1618 if (brw->gen == 6 || brw->gen == 7)
1619 src = fix_math_operand(src);
1620
1621 fs_inst *inst = emit(opcode, dst, src);
1622
1623 if (brw->gen < 6) {
1624 inst->base_mrf = 2;
1625 inst->mlen = dispatch_width / 8;
1626 }
1627
1628 return inst;
1629 }
1630
1631 fs_inst *
1632 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1633 {
1634 int base_mrf = 2;
1635 fs_inst *inst;
1636
1637 if (brw->gen >= 8) {
1638 inst = emit(opcode, dst, src0, src1);
1639 } else if (brw->gen >= 6) {
1640 src0 = fix_math_operand(src0);
1641 src1 = fix_math_operand(src1);
1642
1643 inst = emit(opcode, dst, src0, src1);
1644 } else {
1645 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1646 * "Message Payload":
1647 *
1648 * "Operand0[7]. For the INT DIV functions, this operand is the
1649 * denominator."
1650 * ...
1651 * "Operand1[7]. For the INT DIV functions, this operand is the
1652 * numerator."
1653 */
1654 bool is_int_div = opcode != SHADER_OPCODE_POW;
1655 fs_reg &op0 = is_int_div ? src1 : src0;
1656 fs_reg &op1 = is_int_div ? src0 : src1;
1657
1658 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1659 inst = emit(opcode, dst, op0, reg_null_f);
1660
1661 inst->base_mrf = base_mrf;
1662 inst->mlen = 2 * dispatch_width / 8;
1663 }
1664 return inst;
1665 }
1666
1667 void
1668 fs_visitor::assign_curb_setup()
1669 {
1670 if (dispatch_width == 8) {
1671 prog_data->dispatch_grf_start_reg = payload.num_regs;
1672 } else {
1673 assert(stage == MESA_SHADER_FRAGMENT);
1674 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1675 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1676 }
1677
1678 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1679
1680 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1681 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1682 for (unsigned int i = 0; i < inst->sources; i++) {
1683 if (inst->src[i].file == UNIFORM) {
1684 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1685 int constant_nr;
1686 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1687 constant_nr = push_constant_loc[uniform_nr];
1688 } else {
1689 /* Section 5.11 of the OpenGL 4.1 spec says:
1690 * "Out-of-bounds reads return undefined values, which include
1691 * values from other variables of the active program or zero."
1692 * Just return the first push constant.
1693 */
1694 constant_nr = 0;
1695 }
1696
1697 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1698 constant_nr / 8,
1699 constant_nr % 8);
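/* Packing example (illustrative): push constants are laid out eight 32-bit
 * values per GRF, so constant_nr == 10 lands in GRF payload.num_regs + 1,
 * component 2.
 */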
1700
1701 inst->src[i].file = HW_REG;
1702 inst->src[i].fixed_hw_reg = byte_offset(
1703 retype(brw_reg, inst->src[i].type),
1704 inst->src[i].subreg_offset);
1705 }
1706 }
1707 }
1708 }
1709
1710 void
1711 fs_visitor::calculate_urb_setup()
1712 {
1713 assert(stage == MESA_SHADER_FRAGMENT);
1714 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1715 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1716
1717 memset(prog_data->urb_setup, -1,
1718 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1719
1720 int urb_next = 0;
1721 /* Figure out where each of the incoming setup attributes lands. */
1722 if (brw->gen >= 6) {
1723 if (_mesa_bitcount_64(prog->InputsRead &
1724 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1725 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1726 * first 16 varying inputs, so we can put them wherever we want.
1727 * Just put them in order.
1728 *
1729 * This is useful because it means that (a) inputs not used by the
1730 * fragment shader won't take up valuable register space, and (b) we
1731 * won't have to recompile the fragment shader if it gets paired with
1732 * a different vertex (or geometry) shader.
1733 */
1734 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1735 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1736 BITFIELD64_BIT(i)) {
1737 prog_data->urb_setup[i] = urb_next++;
1738 }
1739 }
1740 } else {
1741 /* We have enough input varyings that the SF/SBE pipeline stage can't
1742 * arbitrarily rearrange them to suit our whim; we have to put them
1743 * in an order that matches the output of the previous pipeline stage
1744 * (geometry or vertex shader).
1745 */
1746 struct brw_vue_map prev_stage_vue_map;
1747 brw_compute_vue_map(brw, &prev_stage_vue_map,
1748 key->input_slots_valid);
1749 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1750 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1751 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1752 slot++) {
1753 int varying = prev_stage_vue_map.slot_to_varying[slot];
1754 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1755 * unused.
1756 */
1757 if (varying != BRW_VARYING_SLOT_COUNT &&
1758 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1759 BITFIELD64_BIT(varying))) {
1760 prog_data->urb_setup[varying] = slot - first_slot;
1761 }
1762 }
1763 urb_next = prev_stage_vue_map.num_slots - first_slot;
1764 }
1765 } else {
1766 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1767 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1768 /* Point size is packed into the header, not as a general attribute */
1769 if (i == VARYING_SLOT_PSIZ)
1770 continue;
1771
1772 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1773 /* The back color slot is skipped when the front color is
1774 * also written to. In addition, some slots can be
1775 * written in the vertex shader and not read in the
1776 * fragment shader. So the register number must always be
1777 * incremented, mapped or not.
1778 */
1779 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1780 prog_data->urb_setup[i] = urb_next;
1781 urb_next++;
1782 }
1783 }
1784
1785 /*
1786    * It's an FS-only attribute, and we did interpolation for this attribute
1787    * in the SF thread. So, count it here, too.
1788 *
1789 * See compile_sf_prog() for more info.
1790 */
1791 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1792 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1793 }
1794
1795 prog_data->num_varying_inputs = urb_next;
1796 }
1797
1798 void
1799 fs_visitor::assign_urb_setup()
1800 {
1801 assert(stage == MESA_SHADER_FRAGMENT);
1802 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1803
1804 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1805
1806 /* Offset all the urb_setup[] index by the actual position of the
1807 * setup regs, now that the location of the constants has been chosen.
1808 */
1809 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1810 if (inst->opcode == FS_OPCODE_LINTERP) {
1811 assert(inst->src[2].file == HW_REG);
1812 inst->src[2].fixed_hw_reg.nr += urb_start;
1813 }
1814
1815 if (inst->opcode == FS_OPCODE_CINTERP) {
1816 assert(inst->src[0].file == HW_REG);
1817 inst->src[0].fixed_hw_reg.nr += urb_start;
1818 }
1819 }
1820
1821 /* Each attribute is 4 setup channels, each of which is half a reg. */
1822 this->first_non_payload_grf =
1823 urb_start + prog_data->num_varying_inputs * 2;
1824 }
1825
1826 void
1827 fs_visitor::assign_vs_urb_setup()
1828 {
1829 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1830 int grf, count, slot, channel, attr;
1831
1832 assert(stage == MESA_SHADER_VERTEX);
1833 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1834 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1835 count++;
1836
1837 /* Each attribute is 4 regs. */
1838 this->first_non_payload_grf =
1839 payload.num_regs + prog_data->curb_read_length + count * 4;
1840
1841 unsigned vue_entries =
1842 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1843
1844 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1845 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1846
1847 assert(vs_prog_data->base.urb_read_length <= 15);
1848
1849 /* Rewrite all ATTR file references to the hw grf that they land in. */
1850 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1851 for (int i = 0; i < inst->sources; i++) {
1852 if (inst->src[i].file == ATTR) {
1853
1854 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1855 slot = count - 1;
1856 } else {
1857             /* Attributes come in a contiguous block, ordered by their
1858 * gl_vert_attrib value. That means we can compute the slot
1859 * number for an attribute by masking out the enabled
1860 * attributes before it and counting the bits.
1861 */
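/* Small example of that bit counting (hypothetical mask): if inputs_read
 * has bits 0, 1 and 3 set and we are looking at attribute 3, masking with
 * BITFIELD64_MASK(3) keeps bits 0..2, whose popcount is 2 -- so the
 * attribute lives in slot 2.
 */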
1862 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1863 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1864 BITFIELD64_MASK(attr));
1865 }
1866
1867 channel = inst->src[i].reg_offset & 3;
1868
1869 grf = payload.num_regs +
1870 prog_data->curb_read_length +
1871 slot * 4 + channel;
1872
1873 inst->src[i].file = HW_REG;
1874 inst->src[i].fixed_hw_reg =
1875 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1876 }
1877 }
1878 }
1879 }
1880
1881 /**
1882 * Split large virtual GRFs into separate components if we can.
1883 *
1884 * This is mostly duplicated with what brw_fs_vector_splitting does,
1885 * but that's really conservative because it's afraid of doing
1886 * splitting that doesn't result in real progress after the rest of
1887 * the optimization phases, which would cause infinite looping in
1888 * optimization. We can do it once here, safely. This also has the
1889 * opportunity to split interpolated values, or maybe even uniforms,
1890 * which we don't have at the IR level.
1891 *
1892 * We want to split, because virtual GRFs are what we register
1893 * allocate and spill (due to contiguousness requirements for some
1894 * instructions), and they're what we naturally generate in the
1895 * codegen process, but most virtual GRFs don't actually need to be
1896 * contiguous sets of GRFs. If we split, we'll end up with reduced
1897 * live intervals and better dead code elimination and coalescing.
1898 */
1899 void
1900 fs_visitor::split_virtual_grfs()
1901 {
1902 int num_vars = this->alloc.count;
1903
1904 /* Count the total number of registers */
1905 int reg_count = 0;
1906 int vgrf_to_reg[num_vars];
1907 for (int i = 0; i < num_vars; i++) {
1908 vgrf_to_reg[i] = reg_count;
1909 reg_count += alloc.sizes[i];
1910 }
1911
1912 /* An array of "split points". For each register slot, this indicates
1913 * if this slot can be separated from the previous slot. Every time an
1914 * instruction uses multiple elements of a register (as a source or
1915 * destination), we mark the used slots as inseparable. Then we go
1916 * through and split the registers into the smallest pieces we can.
1917 */
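   /* For example (an illustrative sketch, not tool output): a 4-slot VGRF
    * whose slots are only ever accessed one register at a time keeps
    * split_points[reg + 1..3] == true and is split into four 1-register
    * VGRFs, while the 2-slot destination of a SIMD16 instruction gets
    * split_points[reg + 1] cleared below and stays contiguous.
    */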
1918 bool split_points[reg_count];
1919 memset(split_points, 0, sizeof(split_points));
1920
1921 /* Mark all used registers as fully splittable */
1922 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1923 if (inst->dst.file == GRF) {
1924 int reg = vgrf_to_reg[inst->dst.reg];
1925 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1926 split_points[reg + j] = true;
1927 }
1928
1929 for (int i = 0; i < inst->sources; i++) {
1930 if (inst->src[i].file == GRF) {
1931 int reg = vgrf_to_reg[inst->src[i].reg];
1932 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1933 split_points[reg + j] = true;
1934 }
1935 }
1936 }
1937
1938 if (brw->has_pln &&
1939 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1940 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1941 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1942 * Gen6, that was the only supported interpolation mode, and since Gen6,
1943 * delta_x and delta_y are in fixed hardware registers.
1944 */
1945 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1946 split_points[vgrf_to_reg[vgrf] + 1] = false;
1947 }
1948
1949 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1950 if (inst->dst.file == GRF) {
1951 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1952 for (int j = 1; j < inst->regs_written; j++)
1953 split_points[reg + j] = false;
1954 }
1955 for (int i = 0; i < inst->sources; i++) {
1956 if (inst->src[i].file == GRF) {
1957 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1958 for (int j = 1; j < inst->regs_read(i); j++)
1959 split_points[reg + j] = false;
1960 }
1961 }
1962 }
1963
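   /* For each original register slot, record which (possibly new) virtual GRF
    * it ends up in and its offset within that VGRF.
    */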
1964 int new_virtual_grf[reg_count];
1965 int new_reg_offset[reg_count];
1966
1967 int reg = 0;
1968 for (int i = 0; i < num_vars; i++) {
1969       /* As a quick sanity check, the first slot should never be a split point. */
1970 assert(split_points[reg] == false);
1971
1972 /* j = 0 case */
1973 new_reg_offset[reg] = 0;
1974 reg++;
1975 int offset = 1;
1976
1977 /* j > 0 case */
1978 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1979           /* If this is a split point, allocate a new virtual GRF for the
1980            * previous `offset` registers and reset the offset to 0.
1981            */
1982 if (split_points[reg]) {
1983 assert(offset <= MAX_VGRF_SIZE);
1984 int grf = alloc.allocate(offset);
1985 for (int k = reg - offset; k < reg; k++)
1986 new_virtual_grf[k] = grf;
1987 offset = 0;
1988 }
1989 new_reg_offset[reg] = offset;
1990 offset++;
1991 reg++;
1992 }
1993
1994 /* The last one gets the original register number */
1995 assert(offset <= MAX_VGRF_SIZE);
1996 alloc.sizes[i] = offset;
1997 for (int k = reg - offset; k < reg; k++)
1998 new_virtual_grf[k] = i;
1999 }
2000 assert(reg == reg_count);
2001
2002 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2003 if (inst->dst.file == GRF) {
2004 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2005 inst->dst.reg = new_virtual_grf[reg];
2006 inst->dst.reg_offset = new_reg_offset[reg];
2007 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2008 }
2009 for (int i = 0; i < inst->sources; i++) {
2010 if (inst->src[i].file == GRF) {
2011 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2012 inst->src[i].reg = new_virtual_grf[reg];
2013 inst->src[i].reg_offset = new_reg_offset[reg];
2014 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2015 }
2016 }
2017 }
2018 invalidate_live_intervals();
2019 }
2020
2021 /**
2022 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2023 *
2024 * During code generation, we create tons of temporary variables, many of
2025 * which get immediately killed and are never used again. Yet, in later
2026 * optimization and analysis passes, such as compute_live_intervals, we need
2027 * to loop over all the virtual GRFs. Compacting them can save a lot of
2028 * overhead.
2029 */
2030 bool
2031 fs_visitor::compact_virtual_grfs()
2032 {
2033 bool progress = false;
2034 int remap_table[this->alloc.count];
2035 memset(remap_table, -1, sizeof(remap_table));
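   /* remap_table[i] stays -1 for VGRFs that are never referenced; used VGRFs
    * get their compacted index filled in below.
    */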
2036
2037 /* Mark which virtual GRFs are used. */
2038 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2039 if (inst->dst.file == GRF)
2040 remap_table[inst->dst.reg] = 0;
2041
2042 for (int i = 0; i < inst->sources; i++) {
2043 if (inst->src[i].file == GRF)
2044 remap_table[inst->src[i].reg] = 0;
2045 }
2046 }
2047
2048 /* Compact the GRF arrays. */
2049 int new_index = 0;
2050 for (unsigned i = 0; i < this->alloc.count; i++) {
2051 if (remap_table[i] == -1) {
2052 /* We just found an unused register. This means that we are
2053 * actually going to compact something.
2054 */
2055 progress = true;
2056 } else {
2057 remap_table[i] = new_index;
2058 alloc.sizes[new_index] = alloc.sizes[i];
2059 invalidate_live_intervals();
2060 ++new_index;
2061 }
2062 }
2063
2064 this->alloc.count = new_index;
2065
2066 /* Patch all the instructions to use the newly renumbered registers */
2067 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2068 if (inst->dst.file == GRF)
2069 inst->dst.reg = remap_table[inst->dst.reg];
2070
2071 for (int i = 0; i < inst->sources; i++) {
2072 if (inst->src[i].file == GRF)
2073 inst->src[i].reg = remap_table[inst->src[i].reg];
2074 }
2075 }
2076
2077 /* Patch all the references to delta_x/delta_y, since they're used in
2078 * register allocation. If they're unused, switch them to BAD_FILE so
2079 * we don't think some random VGRF is delta_x/delta_y.
2080 */
2081 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2082 if (delta_x[i].file == GRF) {
2083 if (remap_table[delta_x[i].reg] != -1) {
2084 delta_x[i].reg = remap_table[delta_x[i].reg];
2085 } else {
2086 delta_x[i].file = BAD_FILE;
2087 }
2088 }
2089 }
2090 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2091 if (delta_y[i].file == GRF) {
2092 if (remap_table[delta_y[i].reg] != -1) {
2093 delta_y[i].reg = remap_table[delta_y[i].reg];
2094 } else {
2095 delta_y[i].file = BAD_FILE;
2096 }
2097 }
2098 }
2099
2100 return progress;
2101 }
2102
2103 /*
2104 * Implements array access of uniforms by inserting a
2105 * PULL_CONSTANT_LOAD instruction.
2106 *
2107  * Unlike temporary GRF array access (which we don't support, due to
2108 * the difficulty of doing relative addressing on instruction
2109 * destinations), we could potentially do array access of uniforms
2110 * that were loaded in GRF space as push constants. In real-world
2111 * usage we've seen, though, the arrays being used are always larger
2112 * than we could load as push constants, so just always move all
2113 * uniform array access out to a pull constant buffer.
2114 */
2115 void
2116 fs_visitor::move_uniform_array_access_to_pull_constants()
2117 {
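   /* Only the first (SIMD8) compile gets to decide where uniforms live;
    * compare the matching check in assign_constant_locations().
    */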
2118 if (dispatch_width != 8)
2119 return;
2120
2121 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2122 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2123
2124 /* Walk through and find array access of uniforms. Put a copy of that
2125 * uniform in the pull constant buffer.
2126 *
2127 * Note that we don't move constant-indexed accesses to arrays. No
2128 * testing has been done of the performance impact of this choice.
2129 */
2130 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2131 for (int i = 0 ; i < inst->sources; i++) {
2132 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2133 continue;
2134
2135 int uniform = inst->src[i].reg;
2136
2137 /* If this array isn't already present in the pull constant buffer,
2138 * add it.
2139 */
2140 if (pull_constant_loc[uniform] == -1) {
2141 const gl_constant_value **values = &stage_prog_data->param[uniform];
2142
2143 assert(param_size[uniform]);
2144
2145 for (int j = 0; j < param_size[uniform]; j++) {
2146 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2147
2148 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2149 values[j];
2150 }
2151 }
2152 }
2153 }
2154 }
2155
2156 /**
2157 * Assign UNIFORM file registers to either push constants or pull constants.
2158 *
2159  * We allow a fragment shader to use more than the specified minimum
2160  * maximum number of fragment shader uniform components (64).  If there
2161  * are too many of them, they would fill up all of the register space, so
2162  * this pass pushes some of them out to the pull constant buffer and
2163  * updates the program to load them from there.
2164 */
2165 void
2166 fs_visitor::assign_constant_locations()
2167 {
2168 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2169 if (dispatch_width != 8)
2170 return;
2171
2172 /* Find which UNIFORM registers are still in use. */
2173 bool is_live[uniforms];
2174 for (unsigned int i = 0; i < uniforms; i++) {
2175 is_live[i] = false;
2176 }
2177
2178 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2179 for (int i = 0; i < inst->sources; i++) {
2180 if (inst->src[i].file != UNIFORM)
2181 continue;
2182
2183 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2184 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2185 is_live[constant_nr] = true;
2186 }
2187 }
2188
2189 /* Only allow 16 registers (128 uniform components) as push constants.
2190 *
2191 * Just demote the end of the list. We could probably do better
2192 * here, demoting things that are rarely used in the program first.
2193 *
2194 * If changing this value, note the limitation about total_regs in
2195 * brw_curbe.c.
2196 */
2197 unsigned int max_push_components = 16 * 8;
2198 unsigned int num_push_constants = 0;
2199
2200 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2201
2202 for (unsigned int i = 0; i < uniforms; i++) {
2203 if (!is_live[i] || pull_constant_loc[i] != -1) {
2204 /* This UNIFORM register is either dead, or has already been demoted
2205 * to a pull const. Mark it as no longer living in the param[] array.
2206 */
2207 push_constant_loc[i] = -1;
2208 continue;
2209 }
2210
2211 if (num_push_constants < max_push_components) {
2212          /* Retain as a push constant.  Record the location in the param[]
2213 * array.
2214 */
2215 push_constant_loc[i] = num_push_constants++;
2216 } else {
2217 /* Demote to a pull constant. */
2218 push_constant_loc[i] = -1;
2219
2220 int pull_index = stage_prog_data->nr_pull_params++;
2221 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2222 pull_constant_loc[i] = pull_index;
2223 }
2224 }
2225
2226 stage_prog_data->nr_params = num_push_constants;
2227
2228 /* Up until now, the param[] array has been indexed by reg + reg_offset
2229 * of UNIFORM registers. Condense it to only contain the uniforms we
2230 * chose to upload as push constants.
2231 */
2232 for (unsigned int i = 0; i < uniforms; i++) {
2233 int remapped = push_constant_loc[i];
2234
2235 if (remapped == -1)
2236 continue;
2237
2238 assert(remapped <= (int)i);
2239 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2240 }
2241 }
2242
2243 /**
2244 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2245 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2246 */
2247 void
2248 fs_visitor::demote_pull_constants()
2249 {
2250 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2251 for (int i = 0; i < inst->sources; i++) {
2252 if (inst->src[i].file != UNIFORM)
2253 continue;
2254
2255 int pull_index = pull_constant_loc[inst->src[i].reg +
2256 inst->src[i].reg_offset];
2257 if (pull_index == -1)
2258 continue;
2259
2260          /* Set up the annotation tracking for newly generated instructions. */
2261 base_ir = inst->ir;
2262 current_annotation = inst->annotation;
2263
2264 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2265 fs_reg dst = vgrf(glsl_type::float_type);
2266
2267 /* Generate a pull load into dst. */
2268 if (inst->src[i].reladdr) {
2269 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2270 surf_index,
2271 *inst->src[i].reladdr,
2272 pull_index);
2273 inst->insert_before(block, &list);
2274 inst->src[i].reladdr = NULL;
2275 } else {
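            /* Constant-indexed case: fetch the 16-byte-aligned vec4 that
             * contains the value, then use set_smear() below to pick out the
             * desired 4-byte component.
             */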
2276 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2277 fs_inst *pull =
2278 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2279 dst, surf_index, offset);
2280 inst->insert_before(block, pull);
2281 inst->src[i].set_smear(pull_index & 3);
2282 }
2283
2284 /* Rewrite the instruction to use the temporary VGRF. */
2285 inst->src[i].file = GRF;
2286 inst->src[i].reg = dst.reg;
2287 inst->src[i].reg_offset = 0;
2288 inst->src[i].width = dispatch_width;
2289 }
2290 }
2291 invalidate_live_intervals();
2292 }
2293
2294 bool
2295 fs_visitor::opt_algebraic()
2296 {
2297 bool progress = false;
2298
2299 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2300 switch (inst->opcode) {
2301 case BRW_OPCODE_MOV:
2302 if (inst->src[0].file != IMM)
2303 break;
2304
2305 if (inst->saturate) {
2306 if (inst->dst.type != inst->src[0].type)
2307 assert(!"unimplemented: saturate mixed types");
2308
2309 if (brw_saturate_immediate(inst->dst.type,
2310 &inst->src[0].fixed_hw_reg)) {
2311 inst->saturate = false;
2312 progress = true;
2313 }
2314 }
2315 break;
2316
2317 case BRW_OPCODE_MUL:
2318 if (inst->src[1].file != IMM)
2319 continue;
2320
2321 /* a * 1.0 = a */
2322 if (inst->src[1].is_one()) {
2323 inst->opcode = BRW_OPCODE_MOV;
2324 inst->src[1] = reg_undef;
2325 progress = true;
2326 break;
2327 }
2328
2329 /* a * -1.0 = -a */
2330 if (inst->src[1].is_negative_one()) {
2331 inst->opcode = BRW_OPCODE_MOV;
2332 inst->src[0].negate = !inst->src[0].negate;
2333 inst->src[1] = reg_undef;
2334 progress = true;
2335 break;
2336 }
2337
2338 /* a * 0.0 = 0.0 */
2339 if (inst->src[1].is_zero()) {
2340 inst->opcode = BRW_OPCODE_MOV;
2341 inst->src[0] = inst->src[1];
2342 inst->src[1] = reg_undef;
2343 progress = true;
2344 break;
2345 }
2346
2347 if (inst->src[0].file == IMM) {
2348 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2349 inst->opcode = BRW_OPCODE_MOV;
2350 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2351 inst->src[1] = reg_undef;
2352 progress = true;
2353 break;
2354 }
2355 break;
2356 case BRW_OPCODE_ADD:
2357 if (inst->src[1].file != IMM)
2358 continue;
2359
2360 /* a + 0.0 = a */
2361 if (inst->src[1].is_zero()) {
2362 inst->opcode = BRW_OPCODE_MOV;
2363 inst->src[1] = reg_undef;
2364 progress = true;
2365 break;
2366 }
2367
2368 if (inst->src[0].file == IMM) {
2369 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2370 inst->opcode = BRW_OPCODE_MOV;
2371 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2372 inst->src[1] = reg_undef;
2373 progress = true;
2374 break;
2375 }
2376 break;
2377 case BRW_OPCODE_OR:
2378 if (inst->src[0].equals(inst->src[1])) {
2379 inst->opcode = BRW_OPCODE_MOV;
2380 inst->src[1] = reg_undef;
2381 progress = true;
2382 break;
2383 }
2384 break;
2385 case BRW_OPCODE_LRP:
2386 if (inst->src[1].equals(inst->src[2])) {
2387 inst->opcode = BRW_OPCODE_MOV;
2388 inst->src[0] = inst->src[1];
2389 inst->src[1] = reg_undef;
2390 inst->src[2] = reg_undef;
2391 progress = true;
2392 break;
2393 }
2394 break;
2395 case BRW_OPCODE_CMP:
2396 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2397 inst->src[0].abs &&
2398 inst->src[0].negate &&
2399 inst->src[1].is_zero()) {
2400 inst->src[0].abs = false;
2401 inst->src[0].negate = false;
2402 inst->conditional_mod = BRW_CONDITIONAL_Z;
2403 progress = true;
2404 break;
2405 }
2406 break;
2407 case BRW_OPCODE_SEL:
2408 if (inst->src[0].equals(inst->src[1])) {
2409 inst->opcode = BRW_OPCODE_MOV;
2410 inst->src[1] = reg_undef;
2411 inst->predicate = BRW_PREDICATE_NONE;
2412 inst->predicate_inverse = false;
2413 progress = true;
2414 } else if (inst->saturate && inst->src[1].file == IMM) {
2415 switch (inst->conditional_mod) {
2416 case BRW_CONDITIONAL_LE:
2417 case BRW_CONDITIONAL_L:
2418 switch (inst->src[1].type) {
2419 case BRW_REGISTER_TYPE_F:
2420 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2421 inst->opcode = BRW_OPCODE_MOV;
2422 inst->src[1] = reg_undef;
2423 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2424 progress = true;
2425 }
2426 break;
2427 default:
2428 break;
2429 }
2430 break;
2431 case BRW_CONDITIONAL_GE:
2432 case BRW_CONDITIONAL_G:
2433 switch (inst->src[1].type) {
2434 case BRW_REGISTER_TYPE_F:
2435 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2436 inst->opcode = BRW_OPCODE_MOV;
2437 inst->src[1] = reg_undef;
2438 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2439 progress = true;
2440 }
2441 break;
2442 default:
2443 break;
2444 }
2445 default:
2446 break;
2447 }
2448 }
2449 break;
2450 case BRW_OPCODE_MAD:
2451 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2452 inst->opcode = BRW_OPCODE_MOV;
2453 inst->src[1] = reg_undef;
2454 inst->src[2] = reg_undef;
2455 progress = true;
2456         } else if (inst->src[0].is_zero()) {
2457            inst->opcode = BRW_OPCODE_MUL;
2458            inst->src[0] = inst->src[2];
2459            inst->src[2] = reg_undef;
                progress = true;
2460 } else if (inst->src[1].is_one()) {
2461 inst->opcode = BRW_OPCODE_ADD;
2462 inst->src[1] = inst->src[2];
2463 inst->src[2] = reg_undef;
2464 progress = true;
2465 } else if (inst->src[2].is_one()) {
2466 inst->opcode = BRW_OPCODE_ADD;
2467 inst->src[2] = reg_undef;
2468 progress = true;
2469 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2470 inst->opcode = BRW_OPCODE_ADD;
2471 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2472 inst->src[2] = reg_undef;
2473 progress = true;
2474 }
2475 break;
2476 case SHADER_OPCODE_RCP: {
2477 fs_inst *prev = (fs_inst *)inst->prev;
2478 if (prev->opcode == SHADER_OPCODE_SQRT) {
2479 if (inst->src[0].equals(prev->dst)) {
2480 inst->opcode = SHADER_OPCODE_RSQ;
2481 inst->src[0] = prev->src[0];
2482 progress = true;
2483 }
2484 }
2485 break;
2486 }
2487 default:
2488 break;
2489 }
2490 }
2491
2492 return progress;
2493 }
2494
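/**
 * Give a fresh virtual GRF to each complete, whole-register redefinition of a
 * VGRF that occurs outside of control flow, and rewrite later uses to match.
 *
 * Breaking up these reused registers shortens live ranges, which helps dead
 * code elimination, register coalescing and register allocation.
 */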
2495 bool
2496 fs_visitor::opt_register_renaming()
2497 {
2498 bool progress = false;
2499 int depth = 0;
2500
2501 int remap[alloc.count];
2502 memset(remap, -1, sizeof(int) * alloc.count);
2503
2504 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2505 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2506 depth++;
2507 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2508 inst->opcode == BRW_OPCODE_WHILE) {
2509 depth--;
2510 }
2511
2512 /* Rewrite instruction sources. */
2513 for (int i = 0; i < inst->sources; i++) {
2514 if (inst->src[i].file == GRF &&
2515 remap[inst->src[i].reg] != -1 &&
2516 remap[inst->src[i].reg] != inst->src[i].reg) {
2517 inst->src[i].reg = remap[inst->src[i].reg];
2518 progress = true;
2519 }
2520 }
2521
2522 const int dst = inst->dst.reg;
2523
2524 if (depth == 0 &&
2525 inst->dst.file == GRF &&
2526 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2527 !inst->is_partial_write()) {
2528 if (remap[dst] == -1) {
2529 remap[dst] = dst;
2530 } else {
2531 remap[dst] = alloc.allocate(inst->dst.width / 8);
2532 inst->dst.reg = remap[dst];
2533 progress = true;
2534 }
2535 } else if (inst->dst.file == GRF &&
2536 remap[dst] != -1 &&
2537 remap[dst] != dst) {
2538 inst->dst.reg = remap[dst];
2539 progress = true;
2540 }
2541 }
2542
2543 if (progress) {
2544 invalidate_live_intervals();
2545
2546 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2547 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2548 delta_x[i].reg = remap[delta_x[i].reg];
2549 }
2550 }
2551 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2552 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2553 delta_y[i].reg = remap[delta_y[i].reg];
2554 }
2555 }
2556 }
2557
2558 return progress;
2559 }
2560
2561 bool
2562 fs_visitor::compute_to_mrf()
2563 {
2564 bool progress = false;
2565 int next_ip = 0;
2566
2567 /* No MRFs on Gen >= 7. */
2568 if (brw->gen >= 7)
2569 return false;
2570
2571 calculate_live_intervals();
2572
2573 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2574 int ip = next_ip;
2575 next_ip++;
2576
2577 if (inst->opcode != BRW_OPCODE_MOV ||
2578 inst->is_partial_write() ||
2579 inst->dst.file != MRF || inst->src[0].file != GRF ||
2580 inst->dst.type != inst->src[0].type ||
2581 inst->src[0].abs || inst->src[0].negate ||
2582 !inst->src[0].is_contiguous() ||
2583 inst->src[0].subreg_offset)
2584 continue;
2585
2586 /* Work out which hardware MRF registers are written by this
2587 * instruction.
2588 */
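      /* A compr4 write touches MRFs m and m+4, an ordinary SIMD16 write
       * touches two adjacent MRFs, and anything else writes a single MRF.
       */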
2589 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2590 int mrf_high;
2591 if (inst->dst.reg & BRW_MRF_COMPR4) {
2592 mrf_high = mrf_low + 4;
2593 } else if (inst->exec_size == 16) {
2594 mrf_high = mrf_low + 1;
2595 } else {
2596 mrf_high = mrf_low;
2597 }
2598
2599 /* Can't compute-to-MRF this GRF if someone else was going to
2600 * read it later.
2601 */
2602 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2603 continue;
2604
2605 /* Found a move of a GRF to a MRF. Let's see if we can go
2606 * rewrite the thing that made this GRF to write into the MRF.
2607 */
2608 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2609 if (scan_inst->dst.file == GRF &&
2610 scan_inst->dst.reg == inst->src[0].reg) {
2611          /* Found the last instruction to write the register we want to
2612           * turn into a compute-to-MRF.
2613 */
2614
2615 /* If this one instruction didn't populate all the
2616 * channels, bail. We might be able to rewrite everything
2617 * that writes that reg, but it would require smarter
2618 * tracking to delay the rewriting until complete success.
2619 */
2620 if (scan_inst->is_partial_write())
2621 break;
2622
2623 /* Things returning more than one register would need us to
2624 * understand coalescing out more than one MOV at a time.
2625 */
2626 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2627 break;
2628
2629 /* SEND instructions can't have MRF as a destination. */
2630 if (scan_inst->mlen)
2631 break;
2632
2633 if (brw->gen == 6) {
2634 /* gen6 math instructions must have the destination be
2635 * GRF, so no compute-to-MRF for them.
2636 */
2637 if (scan_inst->is_math()) {
2638 break;
2639 }
2640 }
2641
2642 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2643 /* Found the creator of our MRF's source value. */
2644 scan_inst->dst.file = MRF;
2645 scan_inst->dst.reg = inst->dst.reg;
2646 scan_inst->saturate |= inst->saturate;
2647 inst->remove(block);
2648 progress = true;
2649 }
2650 break;
2651 }
2652
2653 /* We don't handle control flow here. Most computation of
2654         * values that end up in MRFs happens shortly before the MRF
2655 * write anyway.
2656 */
2657 if (block->start() == scan_inst)
2658 break;
2659
2660 /* You can't read from an MRF, so if someone else reads our
2661 * MRF's source GRF that we wanted to rewrite, that stops us.
2662 */
2663 bool interfered = false;
2664 for (int i = 0; i < scan_inst->sources; i++) {
2665 if (scan_inst->src[i].file == GRF &&
2666 scan_inst->src[i].reg == inst->src[0].reg &&
2667 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2668 interfered = true;
2669 }
2670 }
2671 if (interfered)
2672 break;
2673
2674 if (scan_inst->dst.file == MRF) {
2675 /* If somebody else writes our MRF here, we can't
2676 * compute-to-MRF before that.
2677 */
2678 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2679 int scan_mrf_high;
2680
2681 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2682 scan_mrf_high = scan_mrf_low + 4;
2683 } else if (scan_inst->exec_size == 16) {
2684 scan_mrf_high = scan_mrf_low + 1;
2685 } else {
2686 scan_mrf_high = scan_mrf_low;
2687 }
2688
2689 if (mrf_low == scan_mrf_low ||
2690 mrf_low == scan_mrf_high ||
2691 mrf_high == scan_mrf_low ||
2692 mrf_high == scan_mrf_high) {
2693 break;
2694 }
2695 }
2696
2697 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2698 /* Found a SEND instruction, which means that there are
2699 * live values in MRFs from base_mrf to base_mrf +
2700 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2701 * above it.
2702 */
2703 if (mrf_low >= scan_inst->base_mrf &&
2704 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2705 break;
2706 }
2707 if (mrf_high >= scan_inst->base_mrf &&
2708 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2709 break;
2710 }
2711 }
2712 }
2713 }
2714
2715 if (progress)
2716 invalidate_live_intervals();
2717
2718 return progress;
2719 }
2720
2721 /**
2722  * Emit a minimal "replicated data" clear shader: load the clear color from
2723  * a uniform and write it to every render target with FS_OPCODE_REP_FB_WRITE.
2724 */
2725 void
2726 fs_visitor::emit_repclear_shader()
2727 {
2728 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
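   /* Message layout: when a header is needed (multiple render targets),
    * m1-m2 hold the FB write header and the replicated color follows in m3;
    * otherwise the message is just the single color register.
    */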
2729 int base_mrf = 1;
2730 int color_mrf = base_mrf + 2;
2731
2732 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2733 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2734 mov->force_writemask_all = true;
2735
2736 fs_inst *write;
2737 if (key->nr_color_regions == 1) {
2738 write = emit(FS_OPCODE_REP_FB_WRITE);
2739 write->saturate = key->clamp_fragment_color;
2740 write->base_mrf = color_mrf;
2741 write->target = 0;
2742 write->header_present = false;
2743 write->mlen = 1;
2744 } else {
2745 assume(key->nr_color_regions > 0);
2746 for (int i = 0; i < key->nr_color_regions; ++i) {
2747 write = emit(FS_OPCODE_REP_FB_WRITE);
2748 write->saturate = key->clamp_fragment_color;
2749 write->base_mrf = base_mrf;
2750 write->target = i;
2751 write->header_present = true;
2752 write->mlen = 3;
2753 }
2754 }
2755 write->eot = true;
2756
2757 calculate_cfg();
2758
2759 assign_constant_locations();
2760 assign_curb_setup();
2761
2762 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2763 assert(mov->src[0].file == HW_REG);
2764 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2765 }
2766
2767 /**
2768 * Walks through basic blocks, looking for repeated MRF writes and
2769 * removing the later ones.
2770 */
2771 bool
2772 fs_visitor::remove_duplicate_mrf_writes()
2773 {
2774 fs_inst *last_mrf_move[16];
2775 bool progress = false;
2776
2777    /* The MRF tracking below can't yet handle compressed (SIMD16) instructions. */
2778 if (dispatch_width == 16)
2779 return false;
2780
2781 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2782
2783 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2784 if (inst->is_control_flow()) {
2785 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2786 }
2787
2788 if (inst->opcode == BRW_OPCODE_MOV &&
2789 inst->dst.file == MRF) {
2790 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2791 if (prev_inst && inst->equals(prev_inst)) {
2792 inst->remove(block);
2793 progress = true;
2794 continue;
2795 }
2796 }
2797
2798 /* Clear out the last-write records for MRFs that were overwritten. */
2799 if (inst->dst.file == MRF) {
2800 last_mrf_move[inst->dst.reg] = NULL;
2801 }
2802
2803 if (inst->mlen > 0 && inst->base_mrf != -1) {
2804 /* Found a SEND instruction, which will include two or fewer
2805 * implied MRF writes. We could do better here.
2806 */
2807 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2808 last_mrf_move[inst->base_mrf + i] = NULL;
2809 }
2810 }
2811
2812 /* Clear out any MRF move records whose sources got overwritten. */
2813 if (inst->dst.file == GRF) {
2814 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2815 if (last_mrf_move[i] &&
2816 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2817 last_mrf_move[i] = NULL;
2818 }
2819 }
2820 }
2821
2822 if (inst->opcode == BRW_OPCODE_MOV &&
2823 inst->dst.file == MRF &&
2824 inst->src[0].file == GRF &&
2825 !inst->is_partial_write()) {
2826 last_mrf_move[inst->dst.reg] = inst;
2827 }
2828 }
2829
2830 if (progress)
2831 invalidate_live_intervals();
2832
2833 return progress;
2834 }
2835
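/**
 * Clear the needs-dependency flags for any GRFs in the range
 * [first_grf, first_grf + grf_len) that this instruction reads.
 */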
2836 static void
2837 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2838 int first_grf, int grf_len)
2839 {
2840 /* Clear the flag for registers that actually got read (as expected). */
2841 for (int i = 0; i < inst->sources; i++) {
2842 int grf;
2843 if (inst->src[i].file == GRF) {
2844 grf = inst->src[i].reg;
2845 } else if (inst->src[i].file == HW_REG &&
2846 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2847 grf = inst->src[i].fixed_hw_reg.nr;
2848 } else {
2849 continue;
2850 }
2851
2852 if (grf >= first_grf &&
2853 grf < first_grf + grf_len) {
2854 deps[grf - first_grf] = false;
2855 if (inst->exec_size == 16)
2856 deps[grf - first_grf + 1] = false;
2857 }
2858 }
2859 }
2860
2861 /**
2862 * Implements this workaround for the original 965:
2863 *
2864 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2865 * check for post destination dependencies on this instruction, software
2866 * must ensure that there is no destination hazard for the case of ‘write
2867 * followed by a posted write’ shown in the following example.
2868 *
2869 * 1. mov r3 0
2870 * 2. send r3.xy <rest of send instruction>
2871 * 3. mov r2 r3
2872 *
2873 * Due to no post-destination dependency check on the ‘send’, the above
2874 * code sequence could have two instructions (1 and 2) in flight at the
2875 * same time that both consider ‘r3’ as the target of their final writes.
2876  * same time that both consider ‘r3’ as the target of their final writes."
2877 void
2878 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2879 fs_inst *inst)
2880 {
2881 int write_len = inst->regs_written;
2882 int first_write_grf = inst->dst.reg;
2883 bool needs_dep[BRW_MAX_MRF];
2884 assert(write_len < (int)sizeof(needs_dep) - 1);
2885
2886 memset(needs_dep, false, sizeof(needs_dep));
2887 memset(needs_dep, true, write_len);
2888
2889 clear_deps_for_inst_src(inst, dispatch_width,
2890 needs_dep, first_write_grf, write_len);
2891
2892 /* Walk backwards looking for writes to registers we're writing which
2893 * aren't read since being written. If we hit the start of the program,
2894 * we assume that there are no outstanding dependencies on entry to the
2895 * program.
2896 */
2897 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2898 /* If we hit control flow, assume that there *are* outstanding
2899 * dependencies, and force their cleanup before our instruction.
2900 */
2901 if (block->start() == scan_inst) {
2902 for (int i = 0; i < write_len; i++) {
2903 if (needs_dep[i]) {
2904 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2905 }
2906 }
2907 return;
2908 }
2909
2910 /* We insert our reads as late as possible on the assumption that any
2911 * instruction but a MOV that might have left us an outstanding
2912 * dependency has more latency than a MOV.
2913 */
2914 if (scan_inst->dst.file == GRF) {
2915 for (int i = 0; i < scan_inst->regs_written; i++) {
2916 int reg = scan_inst->dst.reg + i;
2917
2918 if (reg >= first_write_grf &&
2919 reg < first_write_grf + write_len &&
2920 needs_dep[reg - first_write_grf]) {
2921 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2922 needs_dep[reg - first_write_grf] = false;
2923 if (scan_inst->exec_size == 16)
2924 needs_dep[reg - first_write_grf + 1] = false;
2925 }
2926 }
2927 }
2928
2929 /* Clear the flag for registers that actually got read (as expected). */
2930 clear_deps_for_inst_src(scan_inst, dispatch_width,
2931 needs_dep, first_write_grf, write_len);
2932
2933 /* Continue the loop only if we haven't resolved all the dependencies */
2934 int i;
2935 for (i = 0; i < write_len; i++) {
2936 if (needs_dep[i])
2937 break;
2938 }
2939 if (i == write_len)
2940 return;
2941 }
2942 }
2943
2944 /**
2945 * Implements this workaround for the original 965:
2946 *
2947 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2948 * used as a destination register until after it has been sourced by an
2949  *    instruction with a different destination register."
2950 */
2951 void
2952 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2953 {
2954 int write_len = inst->regs_written;
2955 int first_write_grf = inst->dst.reg;
2956 bool needs_dep[BRW_MAX_MRF];
2957 assert(write_len < (int)sizeof(needs_dep) - 1);
2958
2959 memset(needs_dep, false, sizeof(needs_dep));
2960 memset(needs_dep, true, write_len);
2961 /* Walk forwards looking for writes to registers we're writing which aren't
2962 * read before being written.
2963 */
2964 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2965 /* If we hit control flow, force resolve all remaining dependencies. */
2966 if (block->end() == scan_inst) {
2967 for (int i = 0; i < write_len; i++) {
2968 if (needs_dep[i])
2969 scan_inst->insert_before(block,
2970 DEP_RESOLVE_MOV(first_write_grf + i));
2971 }
2972 return;
2973 }
2974
2975 /* Clear the flag for registers that actually got read (as expected). */
2976 clear_deps_for_inst_src(scan_inst, dispatch_width,
2977 needs_dep, first_write_grf, write_len);
2978
2979 /* We insert our reads as late as possible since they're reading the
2980 * result of a SEND, which has massive latency.
2981 */
2982 if (scan_inst->dst.file == GRF &&
2983 scan_inst->dst.reg >= first_write_grf &&
2984 scan_inst->dst.reg < first_write_grf + write_len &&
2985 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2986 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2987 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2988 }
2989
2990 /* Continue the loop only if we haven't resolved all the dependencies */
2991 int i;
2992 for (i = 0; i < write_len; i++) {
2993 if (needs_dep[i])
2994 break;
2995 }
2996 if (i == write_len)
2997 return;
2998 }
2999
3000 /* If we hit the end of the program, resolve all remaining dependencies out
3001 * of paranoia.
3002 */
3003 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
3004 assert(last_inst->eot);
3005 for (int i = 0; i < write_len; i++) {
3006 if (needs_dep[i])
3007 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3008 }
3009 }
3010
3011 void
3012 fs_visitor::insert_gen4_send_dependency_workarounds()
3013 {
3014 if (brw->gen != 4 || brw->is_g4x)
3015 return;
3016
3017 bool progress = false;
3018
3019 /* Note that we're done with register allocation, so GRF fs_regs always
3020 * have a .reg_offset of 0.
3021 */
3022
3023 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3024 if (inst->mlen != 0 && inst->dst.file == GRF) {
3025 insert_gen4_pre_send_dependency_workarounds(block, inst);
3026 insert_gen4_post_send_dependency_workarounds(block, inst);
3027 progress = true;
3028 }
3029 }
3030
3031 if (progress)
3032 invalidate_live_intervals();
3033 }
3034
3035 /**
3036 * Turns the generic expression-style uniform pull constant load instruction
3037 * into a hardware-specific series of instructions for loading a pull
3038 * constant.
3039 *
3040 * The expression style allows the CSE pass before this to optimize out
3041 * repeated loads from the same offset, and gives the pre-register-allocation
3042 * scheduling full flexibility, while the conversion to native instructions
3043 * allows the post-register-allocation scheduler the best information
3044 * possible.
3045 *
3046 * Note that execution masking for setting up pull constant loads is special:
3047 * the channels that need to be written are unrelated to the current execution
3048 * mask, since a later instruction will use one of the result channels as a
3049 * source operand for all 8 or 16 of its channels.
3050 */
3051 void
3052 fs_visitor::lower_uniform_pull_constant_loads()
3053 {
3054 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3055 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3056 continue;
3057
3058 if (brw->gen >= 7) {
3059 /* The offset arg before was a vec4-aligned byte offset. We need to
3060 * turn it into a dword offset.
3061 */
3062 fs_reg const_offset_reg = inst->src[1];
3063 assert(const_offset_reg.file == IMM &&
3064 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3065 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3066 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3067
3068 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3069 * Reserve space for the register.
3070 */
3071 if (brw->gen >= 9) {
3072 payload.reg_offset++;
3073 alloc.sizes[payload.reg] = 2;
3074 }
3075
3076 /* This is actually going to be a MOV, but since only the first dword
3077 * is accessed, we have a special opcode to do just that one. Note
3078 * that this needs to be an operation that will be considered a def
3079 * by live variable analysis, or register allocation will explode.
3080 */
3081 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3082 8, payload, const_offset_reg);
3083 setup->force_writemask_all = true;
3084
3085 setup->ir = inst->ir;
3086 setup->annotation = inst->annotation;
3087 inst->insert_before(block, setup);
3088
3089 /* Similarly, this will only populate the first 4 channels of the
3090 * result register (since we only use smear values from 0-3), but we
3091 * don't tell the optimizer.
3092 */
3093 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3094 inst->src[1] = payload;
3095
3096 invalidate_live_intervals();
3097 } else {
3098 /* Before register allocation, we didn't tell the scheduler about the
3099 * MRF we use. We know it's safe to use this MRF because nothing
3100 * else does except for register spill/unspill, which generates and
3101 * uses its MRF within a single IR instruction.
3102 */
3103 inst->base_mrf = 14;
3104 inst->mlen = 1;
3105 }
3106 }
3107 }
3108
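/**
 * Lower SHADER_OPCODE_LOAD_PAYLOAD into a sequence of MOVs, one per source,
 * propagating the force_writemask_all/force_sechalf state of the registers
 * being copied and using compr4 MRF writes where the hardware supports them.
 */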
3109 bool
3110 fs_visitor::lower_load_payload()
3111 {
3112 bool progress = false;
3113
3114 int vgrf_to_reg[alloc.count];
3115 int reg_count = 0;
3116 for (unsigned i = 0; i < alloc.count; ++i) {
3117 vgrf_to_reg[i] = reg_count;
3118 reg_count += alloc.sizes[i];
3119 }
3120
3121 struct {
3122 bool written:1; /* Whether this register has ever been written */
3123 bool force_writemask_all:1;
3124 bool force_sechalf:1;
3125 } metadata[reg_count];
3126 memset(metadata, 0, sizeof(metadata));
3127
3128 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3129 if (inst->dst.file == GRF) {
3130 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3131 bool force_sechalf = inst->force_sechalf &&
3132 !inst->force_writemask_all;
3133 bool toggle_sechalf = inst->dst.width == 16 &&
3134 type_sz(inst->dst.type) == 4 &&
3135 !inst->force_writemask_all;
3136 for (int i = 0; i < inst->regs_written; ++i) {
3137 metadata[dst_reg + i].written = true;
3138 metadata[dst_reg + i].force_sechalf = force_sechalf;
3139 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3140 force_sechalf = (toggle_sechalf != force_sechalf);
3141 }
3142 }
3143
3144 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3145 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3146 fs_reg dst = inst->dst;
3147
3148 for (int i = 0; i < inst->sources; i++) {
3149 dst.width = inst->src[i].effective_width;
3150 dst.type = inst->src[i].type;
3151
3152 if (inst->src[i].file == BAD_FILE) {
3153 /* Do nothing but otherwise increment as normal */
3154 } else if (dst.file == MRF &&
3155 dst.width == 8 &&
3156 brw->has_compr4 &&
3157 i + 4 < inst->sources &&
3158 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3159 fs_reg compr4_dst = dst;
3160 compr4_dst.reg += BRW_MRF_COMPR4;
3161 compr4_dst.width = 16;
3162 fs_reg compr4_src = inst->src[i];
3163 compr4_src.width = 16;
3164 fs_inst *mov = MOV(compr4_dst, compr4_src);
3165 mov->force_writemask_all = true;
3166 inst->insert_before(block, mov);
3167 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3168 inst->src[i + 4].file = BAD_FILE;
3169 } else {
3170 fs_inst *mov = MOV(dst, inst->src[i]);
3171 if (inst->src[i].file == GRF) {
3172 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3173 inst->src[i].reg_offset;
3174 mov->force_sechalf = metadata[src_reg].force_sechalf;
3175 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3176 }
3177
3178 if (dst.file == GRF) {
3179 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3180 const bool force_writemask = mov->force_writemask_all;
3181 metadata[dst_reg].force_writemask_all = force_writemask;
3182 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3183 if (dst.width * type_sz(dst.type) > 32) {
3184 assert(!mov->force_sechalf);
3185 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3186 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3187 }
3188 }
3189
3190 inst->insert_before(block, mov);
3191 }
3192
3193 dst = offset(dst, 1);
3194 }
3195
3196 inst->remove(block);
3197 progress = true;
3198 }
3199 }
3200
3201 if (progress)
3202 invalidate_live_intervals();
3203
3204 return progress;
3205 }
3206
3207 void
3208 fs_visitor::dump_instructions()
3209 {
3210 dump_instructions(NULL);
3211 }
3212
3213 void
3214 fs_visitor::dump_instructions(const char *name)
3215 {
3216 FILE *file = stderr;
3217 if (name && geteuid() != 0) {
3218 file = fopen(name, "w");
3219 if (!file)
3220 file = stderr;
3221 }
3222
3223 if (cfg) {
3224 calculate_register_pressure();
3225 int ip = 0, max_pressure = 0;
3226 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3227 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3228 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3229 dump_instruction(inst, file);
3230 ip++;
3231 }
3232 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3233 } else {
3234 int ip = 0;
3235 foreach_in_list(backend_instruction, inst, &instructions) {
3236 fprintf(file, "%4d: ", ip++);
3237 dump_instruction(inst, file);
3238 }
3239 }
3240
3241 if (file != stderr) {
3242 fclose(file);
3243 }
3244 }
3245
3246 void
3247 fs_visitor::dump_instruction(backend_instruction *be_inst)
3248 {
3249 dump_instruction(be_inst, stderr);
3250 }
3251
3252 void
3253 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3254 {
3255 fs_inst *inst = (fs_inst *)be_inst;
3256
3257 if (inst->predicate) {
3258 fprintf(file, "(%cf0.%d) ",
3259 inst->predicate_inverse ? '-' : '+',
3260 inst->flag_subreg);
3261 }
3262
3263 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3264 if (inst->saturate)
3265 fprintf(file, ".sat");
3266 if (inst->conditional_mod) {
3267 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3268 if (!inst->predicate &&
3269 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3270 inst->opcode != BRW_OPCODE_IF &&
3271 inst->opcode != BRW_OPCODE_WHILE))) {
3272 fprintf(file, ".f0.%d", inst->flag_subreg);
3273 }
3274 }
3275 fprintf(file, "(%d) ", inst->exec_size);
3276
3277
3278 switch (inst->dst.file) {
3279 case GRF:
3280 fprintf(file, "vgrf%d", inst->dst.reg);
3281 if (inst->dst.width != dispatch_width)
3282 fprintf(file, "@%d", inst->dst.width);
3283 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3284 inst->dst.subreg_offset)
3285 fprintf(file, "+%d.%d",
3286 inst->dst.reg_offset, inst->dst.subreg_offset);
3287 break;
3288 case MRF:
3289 fprintf(file, "m%d", inst->dst.reg);
3290 break;
3291 case BAD_FILE:
3292 fprintf(file, "(null)");
3293 break;
3294 case UNIFORM:
3295 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3296 break;
3297 case ATTR:
3298 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3299 break;
3300 case HW_REG:
3301 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3302 switch (inst->dst.fixed_hw_reg.nr) {
3303 case BRW_ARF_NULL:
3304 fprintf(file, "null");
3305 break;
3306 case BRW_ARF_ADDRESS:
3307 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3308 break;
3309 case BRW_ARF_ACCUMULATOR:
3310 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3311 break;
3312 case BRW_ARF_FLAG:
3313 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3314 inst->dst.fixed_hw_reg.subnr);
3315 break;
3316 default:
3317 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3318 inst->dst.fixed_hw_reg.subnr);
3319 break;
3320 }
3321 } else {
3322 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3323 }
3324 if (inst->dst.fixed_hw_reg.subnr)
3325 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3326 break;
3327 default:
3328 fprintf(file, "???");
3329 break;
3330 }
3331 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3332
3333 for (int i = 0; i < inst->sources; i++) {
3334 if (inst->src[i].negate)
3335 fprintf(file, "-");
3336 if (inst->src[i].abs)
3337 fprintf(file, "|");
3338 switch (inst->src[i].file) {
3339 case GRF:
3340 fprintf(file, "vgrf%d", inst->src[i].reg);
3341 if (inst->src[i].width != dispatch_width)
3342 fprintf(file, "@%d", inst->src[i].width);
3343 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3344 inst->src[i].subreg_offset)
3345 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3346 inst->src[i].subreg_offset);
3347 break;
3348 case MRF:
3349 fprintf(file, "***m%d***", inst->src[i].reg);
3350 break;
3351 case ATTR:
3352 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3353 break;
3354 case UNIFORM:
3355 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3356 if (inst->src[i].reladdr) {
3357 fprintf(file, "+reladdr");
3358 } else if (inst->src[i].subreg_offset) {
3359 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3360 inst->src[i].subreg_offset);
3361 }
3362 break;
3363 case BAD_FILE:
3364 fprintf(file, "(null)");
3365 break;
3366 case IMM:
3367 switch (inst->src[i].type) {
3368 case BRW_REGISTER_TYPE_F:
3369 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3370 break;
3371 case BRW_REGISTER_TYPE_W:
3372 case BRW_REGISTER_TYPE_D:
3373 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3374 break;
3375 case BRW_REGISTER_TYPE_UW:
3376 case BRW_REGISTER_TYPE_UD:
3377 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3378 break;
3379 case BRW_REGISTER_TYPE_VF:
3380 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3381 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3382 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3383 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3384 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3385 break;
3386 default:
3387 fprintf(file, "???");
3388 break;
3389 }
3390 break;
3391 case HW_REG:
3392 if (inst->src[i].fixed_hw_reg.negate)
3393 fprintf(file, "-");
3394 if (inst->src[i].fixed_hw_reg.abs)
3395 fprintf(file, "|");
3396 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3397 switch (inst->src[i].fixed_hw_reg.nr) {
3398 case BRW_ARF_NULL:
3399 fprintf(file, "null");
3400 break;
3401 case BRW_ARF_ADDRESS:
3402 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3403 break;
3404 case BRW_ARF_ACCUMULATOR:
3405 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3406 break;
3407 case BRW_ARF_FLAG:
3408 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3409 inst->src[i].fixed_hw_reg.subnr);
3410 break;
3411 default:
3412 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3413 inst->src[i].fixed_hw_reg.subnr);
3414 break;
3415 }
3416 } else {
3417 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3418 }
3419 if (inst->src[i].fixed_hw_reg.subnr)
3420 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3421 if (inst->src[i].fixed_hw_reg.abs)
3422 fprintf(file, "|");
3423 break;
3424 default:
3425 fprintf(file, "???");
3426 break;
3427 }
3428 if (inst->src[i].abs)
3429 fprintf(file, "|");
3430
3431 if (inst->src[i].file != IMM) {
3432 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3433 }
3434
3435 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3436 fprintf(file, ", ");
3437 }
3438
3439 fprintf(file, " ");
3440
3441 if (dispatch_width == 16 && inst->exec_size == 8) {
3442 if (inst->force_sechalf)
3443 fprintf(file, "2ndhalf ");
3444 else
3445 fprintf(file, "1sthalf ");
3446 }
3447
3448 fprintf(file, "\n");
3449 }
3450
3451 /**
3452 * Possibly returns an instruction that set up @param reg.
3453 *
3454 * Sometimes we want to take the result of some expression/variable
3455 * dereference tree and rewrite the instruction generating the result
3456 * of the tree. When processing the tree, we know that the
3457 * instructions generated are all writing temporaries that are dead
3458 * outside of this tree. So, if we have some instructions that write
3459 * a temporary, we're free to point that temp write somewhere else.
3460 *
3461  * Note that this doesn't guarantee that the instruction only generated
3462  * reg -- it might be the size=4 destination of a texture instruction.
3463 */
3464 fs_inst *
3465 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3466 fs_inst *end,
3467 const fs_reg &reg)
3468 {
3469 if (end == start ||
3470 end->is_partial_write() ||
3471 reg.reladdr ||
3472 !reg.equals(end->dst)) {
3473 return NULL;
3474 } else {
3475 return end;
3476 }
3477 }
3478
3479 void
3480 fs_visitor::setup_payload_gen6()
3481 {
3482 bool uses_depth =
3483 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3484 unsigned barycentric_interp_modes =
3485 (stage == MESA_SHADER_FRAGMENT) ?
3486 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3487
3488 assert(brw->gen >= 6);
3489
3490 /* R0-1: masks, pixel X/Y coordinates. */
3491 payload.num_regs = 2;
3492    /* R2: only for 32-pixel dispatch. */
3493
3494 /* R3-26: barycentric interpolation coordinates. These appear in the
3495 * same order that they appear in the brw_wm_barycentric_interp_mode
3496 * enum. Each set of coordinates occupies 2 registers if dispatch width
3497 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3498 * appear if they were enabled using the "Barycentric Interpolation
3499 * Mode" bits in WM_STATE.
3500 */
3501 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3502 if (barycentric_interp_modes & (1 << i)) {
3503 payload.barycentric_coord_reg[i] = payload.num_regs;
3504 payload.num_regs += 2;
3505 if (dispatch_width == 16) {
3506 payload.num_regs += 2;
3507 }
3508 }
3509 }
3510
3511 /* R27: interpolated depth if uses source depth */
3512 if (uses_depth) {
3513 payload.source_depth_reg = payload.num_regs;
3514 payload.num_regs++;
3515 if (dispatch_width == 16) {
3516 /* R28: interpolated depth if not SIMD8. */
3517 payload.num_regs++;
3518 }
3519 }
3520 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3521 if (uses_depth) {
3522 payload.source_w_reg = payload.num_regs;
3523 payload.num_regs++;
3524 if (dispatch_width == 16) {
3525 /* R30: interpolated W if not SIMD8. */
3526 payload.num_regs++;
3527 }
3528 }
3529
3530 if (stage == MESA_SHADER_FRAGMENT) {
3531 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3532 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3533 prog_data->uses_pos_offset = key->compute_pos_offset;
3534 /* R31: MSAA position offsets. */
3535 if (prog_data->uses_pos_offset) {
3536 payload.sample_pos_reg = payload.num_regs;
3537 payload.num_regs++;
3538 }
3539 }
3540
3541 /* R32: MSAA input coverage mask */
3542 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3543 assert(brw->gen >= 7);
3544 payload.sample_mask_in_reg = payload.num_regs;
3545 payload.num_regs++;
3546 if (dispatch_width == 16) {
3547 /* R33: input coverage mask if not SIMD8. */
3548 payload.num_regs++;
3549 }
3550 }
3551
3552 /* R34-: bary for 32-pixel. */
3553 /* R58-59: interp W for 32-pixel. */
3554
3555 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3556 source_depth_to_render_target = true;
3557 }
3558 }
3559
3560 void
3561 fs_visitor::setup_vs_payload()
3562 {
3563 /* R0: thread header, R1: urb handles */
3564 payload.num_regs = 2;
3565 }
3566
3567 void
3568 fs_visitor::assign_binding_table_offsets()
3569 {
3570 assert(stage == MESA_SHADER_FRAGMENT);
3571 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3572 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3573 uint32_t next_binding_table_offset = 0;
3574
3575 /* If there are no color regions, we still perform an FB write to a null
3576 * renderbuffer, which we place at surface index 0.
3577 */
3578 prog_data->binding_table.render_target_start = next_binding_table_offset;
3579 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3580
3581 assign_common_binding_table_offsets(next_binding_table_offset);
3582 }
3583
3584 void
3585 fs_visitor::calculate_register_pressure()
3586 {
3587 invalidate_live_intervals();
3588 calculate_live_intervals();
3589
3590 unsigned num_instructions = 0;
3591 foreach_block(block, cfg)
3592 num_instructions += block->instructions.length();
3593
3594 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3595
3596 for (unsigned reg = 0; reg < alloc.count; reg++) {
3597 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3598 regs_live_at_ip[ip] += alloc.sizes[reg];
3599 }
3600 }
3601
3602 void
3603 fs_visitor::optimize()
3604 {
3605 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3606
3607 split_virtual_grfs();
3608
3609 move_uniform_array_access_to_pull_constants();
3610 assign_constant_locations();
3611 demote_pull_constants();
3612
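   /* OPT() runs a pass, accumulates the overall progress flag, and (when
    * INTEL_DEBUG=optimizer is set) dumps the instruction list after any pass
    * that reported progress.
    */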
3613 #define OPT(pass, args...) ({ \
3614 pass_num++; \
3615 bool this_progress = pass(args); \
3616 \
3617 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3618 char filename[64]; \
3619 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3620 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3621 \
3622 backend_visitor::dump_instructions(filename); \
3623 } \
3624 \
3625 progress = progress || this_progress; \
3626 this_progress; \
3627 })
3628
3629 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3630 char filename[64];
3631 snprintf(filename, 64, "%s%d-%04d-00-start",
3632 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3633
3634 backend_visitor::dump_instructions(filename);
3635 }
3636
3637 bool progress;
3638 int iteration = 0;
3639 int pass_num = 0;
3640 do {
3641 progress = false;
3642 pass_num = 0;
3643 iteration++;
3644
3645 OPT(remove_duplicate_mrf_writes);
3646
3647 OPT(opt_algebraic);
3648 OPT(opt_cse);
3649 OPT(opt_copy_propagate);
3650 OPT(opt_peephole_predicated_break);
3651 OPT(opt_cmod_propagation);
3652 OPT(dead_code_eliminate);
3653 OPT(opt_peephole_sel);
3654 OPT(dead_control_flow_eliminate, this);
3655 OPT(opt_register_renaming);
3656 OPT(opt_saturate_propagation);
3657 OPT(register_coalesce);
3658 OPT(compute_to_mrf);
3659
3660 OPT(compact_virtual_grfs);
3661 } while (progress);
3662
3663 pass_num = 0;
3664
3665 if (OPT(lower_load_payload)) {
3666 split_virtual_grfs();
3667 OPT(register_coalesce);
3668 OPT(compute_to_mrf);
3669 OPT(dead_code_eliminate);
3670 }
3671
3672 OPT(opt_combine_constants);
3673
3674 lower_uniform_pull_constant_loads();
3675 }
3676
3677 /**
3678  * Three-source instructions must have a GRF/MRF destination register.
3679 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3680 */
3681 void
3682 fs_visitor::fixup_3src_null_dest()
3683 {
3684 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3685 if (inst->is_3src() && inst->dst.is_null()) {
3686 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3687 inst->dst.type);
3688 }
3689 }
3690 }
3691
3692 void
3693 fs_visitor::allocate_registers()
3694 {
3695 bool allocated_without_spills;
3696
3697 static const enum instruction_scheduler_mode pre_modes[] = {
3698 SCHEDULE_PRE,
3699 SCHEDULE_PRE_NON_LIFO,
3700 SCHEDULE_PRE_LIFO,
3701 };
3702
3703 /* Try each scheduling heuristic to see if it can successfully register
3704 * allocate without spilling. They should be ordered by decreasing
3705 * performance but increasing likelihood of allocating.
3706 */
3707 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3708 schedule_instructions(pre_modes[i]);
3709
3710 if (0) {
3711 assign_regs_trivial();
3712 allocated_without_spills = true;
3713 } else {
3714 allocated_without_spills = assign_regs(false);
3715 }
3716 if (allocated_without_spills)
3717 break;
3718 }
3719
3720 if (!allocated_without_spills) {
3721 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3722 "Vertex" : "Fragment";
3723
3724 /* We assume that any spilling is worse than just dropping back to
3725  * SIMD8.  There is probably some intermediate point where SIMD16 with
3726  * a couple of spills would still be a win, but we don't try to find it.
3727  */
3728 if (dispatch_width == 16) {
3729 fail("Failure to register allocate.  Reduce the number of "
3730 "live scalar values to avoid this.");
3731 } else {
3732 perf_debug("%s shader triggered register spilling. "
3733 "Try reducing the number of live scalar values to "
3734 "improve performance.\n", stage_name);
3735 }
3736
3737 /* Since we're out of heuristics, just go spill registers until we
3738 * get an allocation.
3739 */
3740 while (!assign_regs(true)) {
3741 if (failed)
3742 break;
3743 }
3744 }
3745
3746 /* This must come after all optimization and register allocation, since
3747 * it inserts dead code that happens to have side effects, and it does
3748 * so based on the actual physical registers in use.
3749 */
3750 insert_gen4_send_dependency_workarounds();
3751
3752 if (failed)
3753 return;
3754
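/* If we had to spill, the schedule now contains scratch reads and writes
 * that the pre-RA scheduler never saw; a post-RA scheduling pass can
 * recover some latency hiding without changing register usage.
 */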
3755 if (!allocated_without_spills)
3756 schedule_instructions(SCHEDULE_POST);
3757
3758 if (last_scratch > 0)
3759 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3760 }
3761
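/* Vertex shader compilation on the scalar backend: visit the GLSL IR for
 * main(), emit the URB writes that deliver the outputs, build the CFG,
 * run the optimizer, then do CURB/URB setup and register allocation.
 */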
3762 bool
3763 fs_visitor::run_vs()
3764 {
3765 assert(stage == MESA_SHADER_VERTEX);
3766
3767 assign_common_binding_table_offsets(0);
3768 setup_vs_payload();
3769
3770 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3771 emit_shader_time_begin();
3772
3773 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3774 base_ir = ir;
3775 this->result = reg_undef;
3776 ir->accept(this);
3777 }
3778 base_ir = NULL;
3779 if (failed)
3780 return false;
3781
3782 emit_urb_writes();
3783
3784 calculate_cfg();
3785
3786 optimize();
3787
3788 assign_curb_setup();
3789 assign_vs_urb_setup();
3790
3791 fixup_3src_null_dest();
3792 allocate_registers();
3793
3794 return !failed;
3795 }
3796
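/* Fragment shader compilation: set up the payload for the hardware
 * generation, handle the replicated-data clear fast path, otherwise
 * translate the GLSL IR (or NIR when INTEL_USE_NIR is set) for main(),
 * append alpha test and framebuffer writes, then optimize and allocate
 * registers.
 */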
3797 bool
3798 fs_visitor::run_fs()
3799 {
3800 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3801 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3802
3803 assert(stage == MESA_SHADER_FRAGMENT);
3804
3805 sanity_param_count = prog->Parameters->NumParameters;
3806
3807 assign_binding_table_offsets();
3808
3809 if (brw->gen >= 6)
3810 setup_payload_gen6();
3811 else
3812 setup_payload_gen4();
3813
3814 if (0) {
3815 emit_dummy_fs();
3816 } else if (brw->use_rep_send && dispatch_width == 16) {
3817 emit_repclear_shader();
3818 } else {
3819 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3820 emit_shader_time_begin();
3821
3822 calculate_urb_setup();
3823 if (prog->InputsRead > 0) {
3824 if (brw->gen < 6)
3825 emit_interpolation_setup_gen4();
3826 else
3827 emit_interpolation_setup_gen6();
3828 }
3829
3830 /* We handle discards by keeping track of the still-live pixels in f0.1.
3831 * Initialize it with the dispatched pixels.
3832 */
3833 if (wm_prog_data->uses_kill) {
3834 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3835 discard_init->flag_subreg = 1;
3836 }
3837
3838 /* Generate FS IR for main().  (The visitor only descends into
3839  * functions called "main".)
3840  */
3841 if (shader) {
3842 if (getenv("INTEL_USE_NIR") != NULL) {
3843 emit_nir_code();
3844 } else {
3845 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3846 base_ir = ir;
3847 this->result = reg_undef;
3848 ir->accept(this);
3849 }
3850 }
3851 } else {
3852 emit_fragment_program_code();
3853 }
3854 base_ir = NULL;
3855 if (failed)
3856 return false;
3857
3858 emit(FS_OPCODE_PLACEHOLDER_HALT);
3859
3860 if (wm_key->alpha_test_func)
3861 emit_alpha_test();
3862
3863 emit_fb_writes();
3864
3865 calculate_cfg();
3866
3867 optimize();
3868
3869 assign_curb_setup();
3870 assign_urb_setup();
3871
3872 fixup_3src_null_dest();
3873 allocate_registers();
3874
3875 if (failed)
3876 return false;
3877 }
3878
3879 if (dispatch_width == 8)
3880 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3881 else
3882 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3883
3884 /* If any state parameters were appended, then ParameterValues could have
3885 * been realloced, in which case the driver uniform storage set up by
3886 * _mesa_associate_uniform_storage() would point to freed memory. Make
3887 * sure that didn't happen.
3888 */
3889 assert(sanity_param_count == prog->Parameters->NumParameters);
3890
3891 return !failed;
3892 }
3893
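/* Top-level entry point for fragment shader compilation: always compile a
 * SIMD8 program, additionally try SIMD16 on Gen5+ unless it is disabled or
 * unsupported, and hand whichever CFGs were produced to the generator for
 * native code emission.
 */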
3894 const unsigned *
3895 brw_wm_fs_emit(struct brw_context *brw,
3896 void *mem_ctx,
3897 const struct brw_wm_prog_key *key,
3898 struct brw_wm_prog_data *prog_data,
3899 struct gl_fragment_program *fp,
3900 struct gl_shader_program *prog,
3901 unsigned *final_assembly_size)
3902 {
3903 bool start_busy = false;
3904 double start_time = 0;
3905
3906 if (unlikely(brw->perf_debug)) {
3907 start_busy = (brw->batch.last_bo &&
3908 drm_intel_bo_busy(brw->batch.last_bo));
3909 start_time = get_time();
3910 }
3911
3912 struct brw_shader *shader = NULL;
3913 if (prog)
3914 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3915
3916 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3917 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3918
3919 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3920 */
3921 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3922 if (!v.run_fs()) {
3923 if (prog) {
3924 prog->LinkStatus = false;
3925 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3926 }
3927
3928 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3929 v.fail_msg);
3930
3931 return NULL;
3932 }
3933
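/* A SIMD16 program shades 16 pixels per instruction instead of 8, so it is
 * preferred whenever it can be compiled; a failure here only costs
 * performance (hence the perf_debug warnings below), never correctness.
 */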
3934 cfg_t *simd16_cfg = NULL;
3935 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3936 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3937 brw->use_rep_send)) {
3938 if (!v.simd16_unsupported) {
3939 /* Try a SIMD16 compile */
3940 v2.import_uniforms(&v);
3941 if (!v2.run_fs()) {
3942 perf_debug("SIMD16 shader failed to compile, falling back to "
3943 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3944 } else {
3945 simd16_cfg = v2.cfg;
3946 }
3947 } else {
3948 perf_debug("SIMD16 shader unsupported, falling back to "
3949 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3950 }
3951 }
3952
3953 cfg_t *simd8_cfg;
3954 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3955 if (no_simd8 && simd16_cfg) {
3956 simd8_cfg = NULL;
3957 prog_data->no_8 = true;
3958 } else {
3959 simd8_cfg = v.cfg;
3960 prog_data->no_8 = false;
3961 }
3962
3963 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
3964 &fp->Base, v.runtime_check_aads_emit, "FS");
3965
3966 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3967 char *name;
3968 if (prog)
3969 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
3970 prog->Label ? prog->Label : "unnamed",
3971 prog->Name);
3972 else
3973 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
3974
3975 g.enable_debug(name);
3976 }
3977
3978 if (simd8_cfg)
3979 g.generate_code(simd8_cfg, 8);
3980 if (simd16_cfg)
3981 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
3982
3983 if (unlikely(brw->perf_debug) && shader) {
3984 if (shader->compiled_once)
3985 brw_wm_debug_recompile(brw, prog, key);
3986 shader->compiled_once = true;
3987
3988 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3989 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3990 (get_time() - start_time) * 1000);
3991 }
3992 }
3993
3994 return g.get_assembly(final_assembly_size);
3995 }
3996
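/* Precompile hook: at link time we have no real GL state, so guess a
 * likely brw_wm_prog_key from the program alone and compile that variant
 * up front.  The previous program offset and prog_data are restored
 * afterwards since this compile only warms the program cache.
 */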
3997 extern "C" bool
3998 brw_fs_precompile(struct gl_context *ctx,
3999 struct gl_shader_program *shader_prog,
4000 struct gl_program *prog)
4001 {
4002 struct brw_context *brw = brw_context(ctx);
4003 struct brw_wm_prog_key key;
4004
4005 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4006 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4007 bool program_uses_dfdy = fp->UsesDFdy;
4008
4009 memset(&key, 0, sizeof(key));
4010
4011 if (brw->gen < 6) {
4012 if (fp->UsesKill)
4013 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4014
4015 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4016 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4017
4018 /* Just assume depth testing. */
4019 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4020 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4021 }
4022
4023 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4024 BRW_FS_VARYING_INPUT_MASK) > 16)
4025 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4026
4027 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4028 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4029 for (unsigned i = 0; i < sampler_count; i++) {
4030 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4031 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4032 key.tex.swizzles[i] =
4033 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4034 } else {
4035 /* Color sampler: assume no swizzling. */
4036 key.tex.swizzles[i] = SWIZZLE_XYZW;
4037 }
4038 }
4039
4040 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4041 key.drawable_height = ctx->DrawBuffer->Height;
4042 }
4043
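/* Guess the number of color buffers from the outputs the program writes,
 * ignoring the depth and sample-mask results.
 */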
4044 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4045 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4046 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4047
4048 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4049 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4050 key.nr_color_regions > 1;
4051 }
4052
4053 key.program_string_id = bfp->id;
4054
4055 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4056 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4057
4058 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
4059
4060 brw->wm.base.prog_offset = old_prog_offset;
4061 brw->wm.prog_data = old_prog_data;
4062
4063 return success;
4064 }