1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
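/**
 * Common constructor helper: copies the sources, infers exec_size from the
 * destination/source widths when the caller passes 0, computes each source's
 * effective_width, and derives regs_written from the destination region.
 */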
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
138
139 this->writes_accumulator = false;
140 }
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
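/**
 * Change the number of sources, preserving any that remain.  Like the
 * constructors, this always allocates room for at least three sources.
 */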
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(devinfo->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
356
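/**
 * Build a LOAD_PAYLOAD instruction that gathers the given sources into a
 * contiguous block of registers.  The execution size is the widest source
 * width, and regs_written is the total number of whole registers the
 * sources cover.
 */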
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
359 {
360 uint8_t exec_size = dst.width;
361 for (int i = 0; i < sources; ++i) {
362 assert(src[i].width % dst.width == 0);
363 if (src[i].width > exec_size)
364 exec_size = src[i].width;
365 }
366
367 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
368 dst, src, sources);
369 inst->regs_written = 0;
370 for (int i = 0; i < sources; ++i) {
371 /* The LOAD_PAYLOAD instruction only really makes sense if we are
372 * dealing with whole registers. If this ever changes, we can deal
373 * with it later.
374 */
375 int size = inst->src[i].effective_width * type_sz(src[i].type);
376 assert(size % 32 == 0);
377 inst->regs_written += (size + 31) / 32;
378 }
379
380 return inst;
381 }
382
383 exec_list
384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
385 const fs_reg &surf_index,
386 const fs_reg &varying_offset,
387 uint32_t const_offset)
388 {
389 exec_list instructions;
390 fs_inst *inst;
391
392 /* We have our constant surface use a pitch of 4 bytes, so our index can
393 * be any component of a vector, and then we load 4 contiguous
394 * components starting from that.
395 *
396 * We break down the const_offset to a portion added to the variable
397 * offset and a portion done using reg_offset, which means that if you
398 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
399 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
400 * CSE can later notice that those loads are all the same and eliminate
401 * the redundant ones.
402 */
403 fs_reg vec4_offset = vgrf(glsl_type::int_type);
404 instructions.push_tail(ADD(vec4_offset,
405 varying_offset, fs_reg(const_offset & ~3)));
406
407 int scale = 1;
408 if (devinfo->gen == 4 && dst.width == 8) {
409 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
410 * u, v, r) as parameters, or we can just use the SIMD16 message
411 * consisting of (header, u). We choose the second, at the cost of a
412 * longer return length.
413 */
414 scale = 2;
415 }
416
417 enum opcode op;
418 if (devinfo->gen >= 7)
419 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
420 else
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
422
423 assert(dst.width % 8 == 0);
424 int regs_written = 4 * (dst.width / 8) * scale;
425 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
426 dst.type, dst.width);
427 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
428 inst->regs_written = regs_written;
429 instructions.push_tail(inst);
430
431 if (devinfo->gen < 7) {
432 inst->base_mrf = 13;
433 inst->header_present = true;
434 if (devinfo->gen == 4)
435 inst->mlen = 3;
436 else
437 inst->mlen = 1 + dispatch_width / 8;
438 }
439
440 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
441 instructions.push_tail(MOV(dst, result));
442
443 return instructions;
444 }
445
446 /**
447 * A helper for MOV generation for fixing up broken hardware SEND dependency
448 * handling.
449 */
450 fs_inst *
451 fs_visitor::DEP_RESOLVE_MOV(int grf)
452 {
453 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
454
455 inst->ir = NULL;
456 inst->annotation = "send dependency resolve";
457
458 /* The caller always wants this MOV uncompressed, to emit the minimal extra
459 * dependencies and to avoid having to deal with aligning its regs to 2.
460 */
461 inst->exec_size = 8;
462
463 return inst;
464 }
465
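/**
 * Field-by-field equality check.  Note that only the first three sources
 * are compared, along with the message- and flag-related fields below.
 */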
466 bool
467 fs_inst::equals(fs_inst *inst) const
468 {
469 return (opcode == inst->opcode &&
470 dst.equals(inst->dst) &&
471 src[0].equals(inst->src[0]) &&
472 src[1].equals(inst->src[1]) &&
473 src[2].equals(inst->src[2]) &&
474 saturate == inst->saturate &&
475 predicate == inst->predicate &&
476 conditional_mod == inst->conditional_mod &&
477 mlen == inst->mlen &&
478 base_mrf == inst->base_mrf &&
479 target == inst->target &&
480 eot == inst->eot &&
481 header_present == inst->header_present &&
482 shadow_compare == inst->shadow_compare &&
483 exec_size == inst->exec_size &&
484 offset == inst->offset);
485 }
486
487 bool
488 fs_inst::overwrites_reg(const fs_reg &reg) const
489 {
490 return reg.in_range(dst, regs_written);
491 }
492
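/**
 * Returns true for send-like opcodes whose message payload is sourced from
 * the GRF rather than from MRFs.
 */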
493 bool
494 fs_inst::is_send_from_grf() const
495 {
496 switch (opcode) {
497 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
498 case SHADER_OPCODE_SHADER_TIME_ADD:
499 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
500 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
501 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
502 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
503 case SHADER_OPCODE_UNTYPED_ATOMIC:
504 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
505 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
506 case SHADER_OPCODE_TYPED_ATOMIC:
507 case SHADER_OPCODE_TYPED_SURFACE_READ:
508 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
509 case SHADER_OPCODE_URB_WRITE_SIMD8:
510 return true;
511 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
512 return src[1].file == GRF;
513 case FS_OPCODE_FB_WRITE:
514 return src[0].file == GRF;
515 default:
516 if (is_tex())
517 return src[0].file == GRF;
518
519 return false;
520 }
521 }
522
523 bool
524 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
525 {
526 if (devinfo->gen == 6 && is_math())
527 return false;
528
529 if (is_send_from_grf())
530 return false;
531
532 if (!backend_instruction::can_do_source_mods())
533 return false;
534
535 return true;
536 }
537
538 bool
539 fs_inst::has_side_effects() const
540 {
541 return this->eot || backend_instruction::has_side_effects();
542 }
543
544 void
545 fs_reg::init()
546 {
547 memset(this, 0, sizeof(*this));
548 stride = 1;
549 }
550
551 /** Generic unset register constructor. */
552 fs_reg::fs_reg()
553 {
554 init();
555 this->file = BAD_FILE;
556 }
557
558 /** Immediate value constructor. */
559 fs_reg::fs_reg(float f)
560 {
561 init();
562 this->file = IMM;
563 this->type = BRW_REGISTER_TYPE_F;
564 this->fixed_hw_reg.dw1.f = f;
565 this->width = 1;
566 }
567
568 /** Immediate value constructor. */
569 fs_reg::fs_reg(int32_t i)
570 {
571 init();
572 this->file = IMM;
573 this->type = BRW_REGISTER_TYPE_D;
574 this->fixed_hw_reg.dw1.d = i;
575 this->width = 1;
576 }
577
578 /** Immediate value constructor. */
579 fs_reg::fs_reg(uint32_t u)
580 {
581 init();
582 this->file = IMM;
583 this->type = BRW_REGISTER_TYPE_UD;
584 this->fixed_hw_reg.dw1.ud = u;
585 this->width = 1;
586 }
587
588 /** Vector float immediate value constructor. */
589 fs_reg::fs_reg(uint8_t vf[4])
590 {
591 init();
592 this->file = IMM;
593 this->type = BRW_REGISTER_TYPE_VF;
594 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
595 }
596
597 /** Vector float immediate value constructor. */
598 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
599 {
600 init();
601 this->file = IMM;
602 this->type = BRW_REGISTER_TYPE_VF;
603 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
604 (vf1 << 8) |
605 (vf2 << 16) |
606 (vf3 << 24);
607 }
608
609 /** Fixed brw_reg. */
610 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
611 {
612 init();
613 this->file = HW_REG;
614 this->fixed_hw_reg = fixed_hw_reg;
615 this->type = fixed_hw_reg.type;
616 this->width = 1 << fixed_hw_reg.width;
617 }
618
619 bool
620 fs_reg::equals(const fs_reg &r) const
621 {
622 return (file == r.file &&
623 reg == r.reg &&
624 reg_offset == r.reg_offset &&
625 subreg_offset == r.subreg_offset &&
626 type == r.type &&
627 negate == r.negate &&
628 abs == r.abs &&
629 !reladdr && !r.reladdr &&
630 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
631 width == r.width &&
632 stride == r.stride);
633 }
634
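/**
 * Restrict the register to the single component \p subreg, replicated to all
 * channels by setting the stride to 0.
 */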
635 fs_reg &
636 fs_reg::set_smear(unsigned subreg)
637 {
638 assert(file != HW_REG && file != IMM);
639 subreg_offset = subreg * type_sz(type);
640 stride = 0;
641 return *this;
642 }
643
644 bool
645 fs_reg::is_contiguous() const
646 {
647 return stride == 1;
648 }
649
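/**
 * Returns the number of scalar slots a GLSL type occupies in the FS backend.
 * Samplers and atomic counters take up no register space.
 */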
650 int
651 fs_visitor::type_size(const struct glsl_type *type)
652 {
653 unsigned int size, i;
654
655 switch (type->base_type) {
656 case GLSL_TYPE_UINT:
657 case GLSL_TYPE_INT:
658 case GLSL_TYPE_FLOAT:
659 case GLSL_TYPE_BOOL:
660 return type->components();
661 case GLSL_TYPE_ARRAY:
662 return type_size(type->fields.array) * type->length;
663 case GLSL_TYPE_STRUCT:
664 size = 0;
665 for (i = 0; i < type->length; i++) {
666 size += type_size(type->fields.structure[i].type);
667 }
668 return size;
669 case GLSL_TYPE_SAMPLER:
670 /* Samplers take up no register space, since they're baked in at
671 * link time.
672 */
673 return 0;
674 case GLSL_TYPE_ATOMIC_UINT:
675 return 0;
676 case GLSL_TYPE_IMAGE:
677 case GLSL_TYPE_VOID:
678 case GLSL_TYPE_ERROR:
679 case GLSL_TYPE_INTERFACE:
680 case GLSL_TYPE_DOUBLE:
681 unreachable("not reached");
682 }
683
684 return 0;
685 }
686
687 /**
688 * Create a MOV to read the timestamp register.
689 *
690 * The caller is responsible for emitting the MOV. The return value is
691 * the destination of the MOV, with extra parameters set.
692 */
693 fs_reg
694 fs_visitor::get_timestamp(fs_inst **out_mov)
695 {
696 assert(devinfo->gen >= 7);
697
698 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
699 BRW_ARF_TIMESTAMP,
700 0),
701 BRW_REGISTER_TYPE_UD));
702
703 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
704
705 fs_inst *mov = MOV(dst, ts);
706 /* We want to read the 3 fields we care about even if it's not enabled in
707 * the dispatch.
708 */
709 mov->force_writemask_all = true;
710
711 /* The caller wants the low 32 bits of the timestamp. Since it's running
712 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
713 * which is plenty of time for our purposes. It is identical across the
714 * EUs, but since it's tracking GPU core speed it will increment at a
715 * varying rate as render P-states change.
716 *
717 * The caller could also check if render P-states have changed (or anything
718 * else that might disrupt timing) by setting smear to 2 and checking if
719 * that field is != 0.
720 */
721 dst.set_smear(0);
722
723 *out_mov = mov;
724 return dst;
725 }
726
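/**
 * Record the timestamp at the top of the shader for shader-time profiling;
 * the matching read and bookkeeping are emitted by emit_shader_time_end().
 */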
727 void
728 fs_visitor::emit_shader_time_begin()
729 {
730 current_annotation = "shader time start";
731 fs_inst *mov;
732 shader_start_time = get_timestamp(&mov);
733 emit(mov);
734 }
735
736 void
737 fs_visitor::emit_shader_time_end()
738 {
739 current_annotation = "shader time end";
740
741 enum shader_time_shader_type type, written_type, reset_type;
742 switch (stage) {
743 case MESA_SHADER_VERTEX:
744 type = ST_VS;
745 written_type = ST_VS_WRITTEN;
746 reset_type = ST_VS_RESET;
747 break;
748 case MESA_SHADER_GEOMETRY:
749 type = ST_GS;
750 written_type = ST_GS_WRITTEN;
751 reset_type = ST_GS_RESET;
752 break;
753 case MESA_SHADER_FRAGMENT:
754 if (dispatch_width == 8) {
755 type = ST_FS8;
756 written_type = ST_FS8_WRITTEN;
757 reset_type = ST_FS8_RESET;
758 } else {
759 assert(dispatch_width == 16);
760 type = ST_FS16;
761 written_type = ST_FS16_WRITTEN;
762 reset_type = ST_FS16_RESET;
763 }
764 break;
765 case MESA_SHADER_COMPUTE:
766 type = ST_CS;
767 written_type = ST_CS_WRITTEN;
768 reset_type = ST_CS_RESET;
769 break;
770 default:
771 unreachable("fs_visitor::emit_shader_time_end missing code");
772 }
773
774 /* Insert our code just before the final SEND with EOT. */
775 exec_node *end = this->instructions.get_tail();
776 assert(end && ((fs_inst *) end)->eot);
777
778 fs_inst *tm_read;
779 fs_reg shader_end_time = get_timestamp(&tm_read);
780 end->insert_before(tm_read);
781
782 /* Check that there weren't any timestamp reset events (assuming these
783 * were the only two timestamp reads that happened).
784 */
785 fs_reg reset = shader_end_time;
786 reset.set_smear(2);
787 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
788 test->conditional_mod = BRW_CONDITIONAL_Z;
789 test->force_writemask_all = true;
790 end->insert_before(test);
791 end->insert_before(IF(BRW_PREDICATE_NORMAL));
792
793 fs_reg start = shader_start_time;
794 start.negate = true;
795 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
796 diff.set_smear(0);
797 fs_inst *add = ADD(diff, start, shader_end_time);
798 add->force_writemask_all = true;
799 end->insert_before(add);
800
801 /* If there were no instructions between the two timestamp gets, the diff
802 * is 2 cycles. Remove that overhead, so I can forget about that when
803 * trying to determine the time taken for single instructions.
804 */
805 add = ADD(diff, diff, fs_reg(-2u));
806 add->force_writemask_all = true;
807 end->insert_before(add);
808
809 end->insert_before(SHADER_TIME_ADD(type, diff));
810 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
811 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
812 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
813 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
814 }
815
816 fs_inst *
817 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
818 {
819 int shader_time_index =
820 brw_get_shader_time_index(brw, shader_prog, prog, type);
821 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
822
823 fs_reg payload;
824 if (dispatch_width == 8)
825 payload = vgrf(glsl_type::uvec2_type);
826 else
827 payload = vgrf(glsl_type::uint_type);
828
829 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
830 fs_reg(), payload, offset, value);
831 }
832
833 void
834 fs_visitor::vfail(const char *format, va_list va)
835 {
836 char *msg;
837
838 if (failed)
839 return;
840
841 failed = true;
842
843 msg = ralloc_vasprintf(mem_ctx, format, va);
844 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
845
846 this->fail_msg = msg;
847
848 if (debug_enabled) {
849 fprintf(stderr, "%s", msg);
850 }
851 }
852
853 void
854 fs_visitor::fail(const char *format, ...)
855 {
856 va_list va;
857
858 va_start(va, format);
859 vfail(format, va);
860 va_end(va);
861 }
862
863 /**
864 * Mark this program as impossible to compile in SIMD16 mode.
865 *
866 * During the SIMD8 compile (which happens first), we can detect and flag
867 * things that are unsupported in SIMD16 mode, so the compiler can skip
868 * the SIMD16 compile altogether.
869 *
870 * During a SIMD16 compile (if one happens anyway), this just calls fail().
871 */
872 void
873 fs_visitor::no16(const char *format, ...)
874 {
875 va_list va;
876
877 va_start(va, format);
878
879 if (dispatch_width == 16) {
880 vfail(format, va);
881 } else {
882 simd16_unsupported = true;
883
884 if (brw->perf_debug) {
885 if (no16_msg)
886 ralloc_vasprintf_append(&no16_msg, format, va);
887 else
888 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
889 }
890 }
891
892 va_end(va);
893 }
894
895 fs_inst *
896 fs_visitor::emit(enum opcode opcode)
897 {
898 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
899 }
900
901 fs_inst *
902 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
903 {
904 return emit(new(mem_ctx) fs_inst(opcode, dst));
905 }
906
907 fs_inst *
908 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
909 {
910 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
911 }
912
913 fs_inst *
914 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
915 const fs_reg &src1)
916 {
917 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
918 }
919
920 fs_inst *
921 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
922 const fs_reg &src1, const fs_reg &src2)
923 {
924 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
925 }
926
927 fs_inst *
928 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
929 fs_reg src[], int sources)
930 {
931 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
932 }
933
934 /**
935 * Returns true if the instruction has a flag that means it won't
936 * update an entire destination register.
937 *
938 * For example, dead code elimination and live variable analysis want to know
939 * when a write to a variable screens off any preceding values that were in
940 * it.
941 */
942 bool
943 fs_inst::is_partial_write() const
944 {
945 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
946 (this->dst.width * type_sz(this->dst.type)) < 32 ||
947 !this->dst.is_contiguous());
948 }
949
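/**
 * Returns the number of hardware registers read by source \p arg.  For
 * send-like opcodes the payload source covers mlen registers; otherwise the
 * size is derived from the source's width, stride, and type.
 */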
950 int
951 fs_inst::regs_read(int arg) const
952 {
953 if (is_tex() && arg == 0 && src[0].file == GRF) {
954 return mlen;
955 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
956 return mlen;
957 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
958 return mlen;
959 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
960 return mlen;
961 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
962 return mlen;
963 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
964 return mlen;
965 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
966 return mlen;
967 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
968 return mlen;
969 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
970 return mlen;
971 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
972 return mlen;
973 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
974 return exec_size / 4;
975 }
976
977 switch (src[arg].file) {
978 case BAD_FILE:
979 case UNIFORM:
980 case IMM:
981 return 1;
982 case GRF:
983 case HW_REG:
984 if (src[arg].stride == 0) {
985 return 1;
986 } else {
987 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
988 return (size + 31) / 32;
989 }
990 case MRF:
991 unreachable("MRF registers are not allowed as sources");
992 default:
993 unreachable("Invalid register file");
994 }
995 }
996
997 bool
998 fs_inst::reads_flag() const
999 {
1000 return predicate;
1001 }
1002
1003 bool
1004 fs_inst::writes_flag() const
1005 {
1006 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
1007 opcode != BRW_OPCODE_IF &&
1008 opcode != BRW_OPCODE_WHILE)) ||
1009 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
1010 }
1011
1012 /**
1013 * Returns how many MRFs an FS opcode will write over.
1014 *
1015 * Note that this is not the 0 or 1 implied writes in an actual gen
1016 * instruction -- the FS opcodes often generate MOVs in addition.
1017 */
1018 int
1019 fs_visitor::implied_mrf_writes(fs_inst *inst)
1020 {
1021 if (inst->mlen == 0)
1022 return 0;
1023
1024 if (inst->base_mrf == -1)
1025 return 0;
1026
1027 switch (inst->opcode) {
1028 case SHADER_OPCODE_RCP:
1029 case SHADER_OPCODE_RSQ:
1030 case SHADER_OPCODE_SQRT:
1031 case SHADER_OPCODE_EXP2:
1032 case SHADER_OPCODE_LOG2:
1033 case SHADER_OPCODE_SIN:
1034 case SHADER_OPCODE_COS:
1035 return 1 * dispatch_width / 8;
1036 case SHADER_OPCODE_POW:
1037 case SHADER_OPCODE_INT_QUOTIENT:
1038 case SHADER_OPCODE_INT_REMAINDER:
1039 return 2 * dispatch_width / 8;
1040 case SHADER_OPCODE_TEX:
1041 case FS_OPCODE_TXB:
1042 case SHADER_OPCODE_TXD:
1043 case SHADER_OPCODE_TXF:
1044 case SHADER_OPCODE_TXF_CMS:
1045 case SHADER_OPCODE_TXF_MCS:
1046 case SHADER_OPCODE_TG4:
1047 case SHADER_OPCODE_TG4_OFFSET:
1048 case SHADER_OPCODE_TXL:
1049 case SHADER_OPCODE_TXS:
1050 case SHADER_OPCODE_LOD:
1051 return 1;
1052 case FS_OPCODE_FB_WRITE:
1053 return 2;
1054 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1055 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1056 return 1;
1057 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1058 return inst->mlen;
1059 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1060 return 2;
1061 case SHADER_OPCODE_UNTYPED_ATOMIC:
1062 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1063 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1064 case SHADER_OPCODE_TYPED_ATOMIC:
1065 case SHADER_OPCODE_TYPED_SURFACE_READ:
1066 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1067 case SHADER_OPCODE_URB_WRITE_SIMD8:
1068 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1069 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1070 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1071 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1072 return 0;
1073 default:
1074 unreachable("not reached");
1075 }
1076 }
1077
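/**
 * Allocate a virtual GRF large enough to hold \p type at the current
 * dispatch width.
 */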
1078 fs_reg
1079 fs_visitor::vgrf(const glsl_type *const type)
1080 {
1081 int reg_width = dispatch_width / 8;
1082 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1083 brw_type_for_base_type(type), dispatch_width);
1084 }
1085
1086 fs_reg
1087 fs_visitor::vgrf(int num_components)
1088 {
1089 int reg_width = dispatch_width / 8;
1090 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1091 BRW_REGISTER_TYPE_F, dispatch_width);
1092 }
1093
1094 /** Fixed HW reg constructor. */
1095 fs_reg::fs_reg(enum register_file file, int reg)
1096 {
1097 init();
1098 this->file = file;
1099 this->reg = reg;
1100 this->type = BRW_REGISTER_TYPE_F;
1101
1102 switch (file) {
1103 case UNIFORM:
1104 this->width = 1;
1105 break;
1106 default:
1107 this->width = 8;
1108 }
1109 }
1110
1111 /** Fixed HW reg constructor. */
1112 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1113 {
1114 init();
1115 this->file = file;
1116 this->reg = reg;
1117 this->type = type;
1118
1119 switch (file) {
1120 case UNIFORM:
1121 this->width = 1;
1122 break;
1123 default:
1124 this->width = 8;
1125 }
1126 }
1127
1128 /** Fixed HW reg constructor. */
1129 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1130 uint8_t width)
1131 {
1132 init();
1133 this->file = file;
1134 this->reg = reg;
1135 this->type = type;
1136 this->width = width;
1137 }
1138
1139 fs_reg *
1140 fs_visitor::variable_storage(ir_variable *var)
1141 {
1142 return (fs_reg *)hash_table_find(this->variable_ht, var);
1143 }
1144
1145 void
1146 import_uniforms_callback(const void *key,
1147 void *data,
1148 void *closure)
1149 {
1150 struct hash_table *dst_ht = (struct hash_table *)closure;
1151 const fs_reg *reg = (const fs_reg *)data;
1152
1153 if (reg->file != UNIFORM)
1154 return;
1155
1156 hash_table_insert(dst_ht, data, key);
1157 }
1158
1159 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1160 * This brings in those uniform definitions.
1161 */
1162 void
1163 fs_visitor::import_uniforms(fs_visitor *v)
1164 {
1165 hash_table_call_foreach(v->variable_ht,
1166 import_uniforms_callback,
1167 variable_ht);
1168 this->push_constant_loc = v->push_constant_loc;
1169 this->pull_constant_loc = v->pull_constant_loc;
1170 this->uniforms = v->uniforms;
1171 this->param_size = v->param_size;
1172 }
1173
1174 /* Our support for uniforms is piggy-backed on the struct
1175 * gl_fragment_program, because that's where the values actually
1176 * get stored, rather than in some global gl_shader_program uniform
1177 * store.
1178 */
1179 void
1180 fs_visitor::setup_uniform_values(ir_variable *ir)
1181 {
1182 int namelen = strlen(ir->name);
1183
1184 /* The data for our (non-builtin) uniforms is stored in a series of
1185 * gl_uniform_driver_storage structs for each subcomponent that
1186 * glGetUniformLocation() could name. We know it's been set up in the same
1187 * order we'd walk the type, so walk the list of storage and find anything
1188 * with our name, or the prefix of a component that starts with our name.
1189 */
1190 unsigned params_before = uniforms;
1191 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1192 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1193
1194 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1195 (storage->name[namelen] != 0 &&
1196 storage->name[namelen] != '.' &&
1197 storage->name[namelen] != '[')) {
1198 continue;
1199 }
1200
1201 unsigned slots = storage->type->component_slots();
1202 if (storage->array_elements)
1203 slots *= storage->array_elements;
1204
1205 for (unsigned i = 0; i < slots; i++) {
1206 stage_prog_data->param[uniforms++] = &storage->storage[i];
1207 }
1208 }
1209
1210 /* Make sure we actually initialized the right amount of stuff here. */
1211 assert(params_before + ir->type->component_slots() == uniforms);
1212 (void)params_before;
1213 }
1214
1215
1216 /* Our support for builtin uniforms is even scarier than non-builtin.
1217 * It sits on top of the PROG_STATE_VAR parameters that are
1218 * automatically updated from GL context state.
1219 */
1220 void
1221 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1222 {
1223 const ir_state_slot *const slots = ir->get_state_slots();
1224 assert(slots != NULL);
1225
1226 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1227 /* This state reference has already been set up by ir_to_mesa, but we'll
1228 * get the same index back here.
1229 */
1230 int index = _mesa_add_state_reference(this->prog->Parameters,
1231 (gl_state_index *)slots[i].tokens);
1232
1233 /* Add each of the unique swizzles of the element as a parameter.
1234 * This'll end up matching the expected layout of the
1235 * array/matrix/structure we're trying to fill in.
1236 */
1237 int last_swiz = -1;
1238 for (unsigned int j = 0; j < 4; j++) {
1239 int swiz = GET_SWZ(slots[i].swizzle, j);
1240 if (swiz == last_swiz)
1241 break;
1242 last_swiz = swiz;
1243
1244 stage_prog_data->param[uniforms++] =
1245 &prog->Parameters->ParameterValues[index][swiz];
1246 }
1247 }
1248 }
1249
1250 fs_reg *
1251 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1252 bool origin_upper_left)
1253 {
1254 assert(stage == MESA_SHADER_FRAGMENT);
1255 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1256 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1257 fs_reg wpos = *reg;
1258 bool flip = !origin_upper_left ^ key->render_to_fbo;
1259
1260 /* gl_FragCoord.x */
1261 if (pixel_center_integer) {
1262 emit(MOV(wpos, this->pixel_x));
1263 } else {
1264 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1265 }
1266 wpos = offset(wpos, 1);
1267
1268 /* gl_FragCoord.y */
1269 if (!flip && pixel_center_integer) {
1270 emit(MOV(wpos, this->pixel_y));
1271 } else {
1272 fs_reg pixel_y = this->pixel_y;
1273 float offset = (pixel_center_integer ? 0.0 : 0.5);
1274
1275 if (flip) {
1276 pixel_y.negate = true;
1277 offset += key->drawable_height - 1.0;
1278 }
1279
1280 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1281 }
1282 wpos = offset(wpos, 1);
1283
1284 /* gl_FragCoord.z */
1285 if (devinfo->gen >= 6) {
1286 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1287 } else {
1288 emit(FS_OPCODE_LINTERP, wpos,
1289 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1290 interp_reg(VARYING_SLOT_POS, 2));
1291 }
1292 wpos = offset(wpos, 1);
1293
1294 /* gl_FragCoord.w: Already set up in emit_interpolation */
1295 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1296
1297 return reg;
1298 }
1299
1300 fs_inst *
1301 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1302 glsl_interp_qualifier interpolation_mode,
1303 bool is_centroid, bool is_sample)
1304 {
1305 brw_wm_barycentric_interp_mode barycoord_mode;
1306 if (devinfo->gen >= 6) {
1307 if (is_centroid) {
1308 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1309 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1310 else
1311 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1312 } else if (is_sample) {
1313 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1314 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1315 else
1316 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1317 } else {
1318 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1319 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1320 else
1321 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1322 }
1323 } else {
1324 /* On Ironlake and below, there is only one interpolation mode.
1325 * Centroid interpolation doesn't mean anything on this hardware --
1326 * there is no multisampling.
1327 */
1328 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1329 }
1330 return emit(FS_OPCODE_LINTERP, attr,
1331 this->delta_xy[barycoord_mode], interp);
1332 }
1333
1334 void
1335 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1336 const glsl_type *type,
1337 glsl_interp_qualifier interpolation_mode,
1338 int location, bool mod_centroid,
1339 bool mod_sample)
1340 {
1341 attr.type = brw_type_for_base_type(type->get_scalar_type());
1342
1343 assert(stage == MESA_SHADER_FRAGMENT);
1344 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1345 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1346
1347 unsigned int array_elements;
1348
1349 if (type->is_array()) {
1350 array_elements = type->length;
1351 if (array_elements == 0) {
1352 fail("dereferenced array '%s' has length 0\n", name);
1353 }
1354 type = type->fields.array;
1355 } else {
1356 array_elements = 1;
1357 }
1358
1359 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1360 bool is_gl_Color =
1361 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1362 if (key->flat_shade && is_gl_Color) {
1363 interpolation_mode = INTERP_QUALIFIER_FLAT;
1364 } else {
1365 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1366 }
1367 }
1368
1369 for (unsigned int i = 0; i < array_elements; i++) {
1370 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1371 if (prog_data->urb_setup[location] == -1) {
1372 /* If there's no incoming setup data for this slot, don't
1373 * emit interpolation for it.
1374 */
1375 attr = offset(attr, type->vector_elements);
1376 location++;
1377 continue;
1378 }
1379
1380 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1381 /* Constant interpolation (flat shading) case. The SF has
1382 * handed us defined values in only the constant offset
1383 * field of the setup reg.
1384 */
1385 for (unsigned int k = 0; k < type->vector_elements; k++) {
1386 struct brw_reg interp = interp_reg(location, k);
1387 interp = suboffset(interp, 3);
1388 interp.type = attr.type;
1389 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1390 attr = offset(attr, 1);
1391 }
1392 } else {
1393 /* Smooth/noperspective interpolation case. */
1394 for (unsigned int k = 0; k < type->vector_elements; k++) {
1395 struct brw_reg interp = interp_reg(location, k);
1396 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1397 /* Get the pixel/sample mask into f0 so that we know
1398 * which pixels are lit. Then, for each channel that is
1399 * unlit, replace the centroid data with non-centroid
1400 * data.
1401 */
1402 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1403
1404 fs_inst *inst;
1405 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1406 false, false);
1407 inst->predicate = BRW_PREDICATE_NORMAL;
1408 inst->predicate_inverse = true;
1409 if (devinfo->has_pln)
1410 inst->no_dd_clear = true;
1411
1412 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1413 mod_centroid && !key->persample_shading,
1414 mod_sample || key->persample_shading);
1415 inst->predicate = BRW_PREDICATE_NORMAL;
1416 inst->predicate_inverse = false;
1417 if (devinfo->has_pln)
1418 inst->no_dd_check = true;
1419
1420 } else {
1421 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1422 mod_centroid && !key->persample_shading,
1423 mod_sample || key->persample_shading);
1424 }
1425 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1426 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1427 }
1428 attr = offset(attr, 1);
1429 }
1430
1431 }
1432 location++;
1433 }
1434 }
1435 }
1436
1437 fs_reg *
1438 fs_visitor::emit_frontfacing_interpolation()
1439 {
1440 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1441
1442 if (devinfo->gen >= 6) {
1443 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1444 * a boolean result from this (~0/true or 0/false).
1445 *
1446 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1447 * this task in only one instruction:
1448 * - a negation source modifier will flip the bit; and
1449 * - a W -> D type conversion will sign extend the bit into the high
1450 * word of the destination.
1451 *
1452 * An ASR 15 fills the low word of the destination.
1453 */
1454 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1455 g0.negate = true;
1456
1457 emit(ASR(*reg, g0, fs_reg(15)));
1458 } else {
1459 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1460 * a boolean result from this (1/true or 0/false).
1461 *
1462 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1463 * the negation source modifier to flip it. Unfortunately the SHR
1464 * instruction only operates on UD (or D with an abs source modifier)
1465 * sources without negation.
1466 *
1467 * Instead, use ASR (which will give ~0/true or 0/false).
1468 */
1469 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1470 g1_6.negate = true;
1471
1472 emit(ASR(*reg, g1_6, fs_reg(31)));
1473 }
1474
1475 return reg;
1476 }
1477
1478 void
1479 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1480 {
1481 assert(stage == MESA_SHADER_FRAGMENT);
1482 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1483 assert(dst.type == BRW_REGISTER_TYPE_F);
1484
1485 if (key->compute_pos_offset) {
1486 /* Convert int_sample_pos to floating point */
1487 emit(MOV(dst, int_sample_pos));
1488 /* Scale to the range [0, 1] */
1489 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1490 }
1491 else {
1492 /* From the ARB_sample_shading specification:
1493 * "When rendering to a non-multisample buffer, or if multisample
1494 * rasterization is disabled, gl_SamplePosition will always be
1495 * (0.5, 0.5)."
1496 */
1497 emit(MOV(dst, fs_reg(0.5f)));
1498 }
1499 }
1500
1501 fs_reg *
1502 fs_visitor::emit_samplepos_setup()
1503 {
1504 assert(devinfo->gen >= 6);
1505
1506 this->current_annotation = "compute sample position";
1507 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1508 fs_reg pos = *reg;
1509 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1510 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1511
1512 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1513 * mode will be enabled.
1514 *
1515 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1516 * R31.1:0 Position Offset X/Y for Slot[3:0]
1517 * R31.3:2 Position Offset X/Y for Slot[7:4]
1518 * .....
1519 *
1520 * The X, Y sample positions come in as bytes in thread payload. So, read
1521 * the positions using vstride=16, width=8, hstride=2.
1522 */
1523 struct brw_reg sample_pos_reg =
1524 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1525 BRW_REGISTER_TYPE_B), 16, 8, 2);
1526
1527 if (dispatch_width == 8) {
1528 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1529 } else {
1530 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1531 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1532 ->force_sechalf = true;
1533 }
1534 /* Compute gl_SamplePosition.x */
1535 compute_sample_position(pos, int_sample_x);
1536 pos = offset(pos, 1);
1537 if (dispatch_width == 8) {
1538 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1539 } else {
1540 emit(MOV(half(int_sample_y, 0),
1541 fs_reg(suboffset(sample_pos_reg, 1))));
1542 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1543 ->force_sechalf = true;
1544 }
1545 /* Compute gl_SamplePosition.y */
1546 compute_sample_position(pos, int_sample_y);
1547 return reg;
1548 }
1549
1550 fs_reg *
1551 fs_visitor::emit_sampleid_setup()
1552 {
1553 assert(stage == MESA_SHADER_FRAGMENT);
1554 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1555 assert(devinfo->gen >= 6);
1556
1557 this->current_annotation = "compute sample id";
1558 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1559
1560 if (key->compute_sample_id) {
1561 fs_reg t1 = vgrf(glsl_type::int_type);
1562 fs_reg t2 = vgrf(glsl_type::int_type);
1563 t2.type = BRW_REGISTER_TYPE_UW;
1564
1565 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1566 * 8x multisampling, subspan 0 will represent sample N (where N
1567 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1568 * 7. We can find the value of N by looking at R0.0 bits 7:6
1569 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1570 * (since samples are always delivered in pairs). That is, we
1571 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1572 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1573 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1574 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1575 * populating a temporary variable with the sequence (0, 1, 2, 3),
1576 * and then reading from it using vstride=1, width=4, hstride=0.
1577 * These computations hold good for 4x multisampling as well.
1578 *
1579 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1580 * the first four slots are sample 0 of subspan 0; the next four
1581 * are sample 1 of subspan 0; the third group is sample 0 of
1582 * subspan 1, and finally sample 1 of subspan 1.
1583 */
1584 fs_inst *inst;
1585 inst = emit(BRW_OPCODE_AND, t1,
1586 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1587 fs_reg(0xc0));
1588 inst->force_writemask_all = true;
1589 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1590 inst->force_writemask_all = true;
1591 /* This works for both SIMD8 and SIMD16 */
1592 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1593 inst->force_writemask_all = true;
1594 /* This special instruction takes care of setting vstride=1,
1595 * width=4, hstride=0 of t2 during an ADD instruction.
1596 */
1597 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1598 } else {
1599 /* As per GL_ARB_sample_shading specification:
1600 * "When rendering to a non-multisample buffer, or if multisample
1601 * rasterization is disabled, gl_SampleID will always be zero."
1602 */
1603 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1604 }
1605
1606 return reg;
1607 }
1608
1609 void
1610 fs_visitor::resolve_source_modifiers(fs_reg *src)
1611 {
1612 if (!src->abs && !src->negate)
1613 return;
1614
1615 fs_reg temp = retype(vgrf(1), src->type);
1616 emit(MOV(temp, *src));
1617 *src = temp;
1618 }
1619
1620 fs_reg
1621 fs_visitor::fix_math_operand(fs_reg src)
1622 {
1623 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1624 * might be able to do better by doing execsize = 1 math and then
1625 * expanding that result out, but we would need to be careful with
1626 * masking.
1627 *
1628 * The hardware ignores source modifiers (negate and abs) on math
1629 * instructions, so we also move to a temp to set those up.
1630 */
1631 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1632 !src.abs && !src.negate)
1633 return src;
1634
1635 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1636 * operands to math
1637 */
1638 if (devinfo->gen >= 7 && src.file != IMM)
1639 return src;
1640
1641 fs_reg expanded = vgrf(glsl_type::float_type);
1642 expanded.type = src.type;
1643 emit(BRW_OPCODE_MOV, expanded, src);
1644 return expanded;
1645 }
1646
1647 fs_inst *
1648 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1649 {
1650 switch (opcode) {
1651 case SHADER_OPCODE_RCP:
1652 case SHADER_OPCODE_RSQ:
1653 case SHADER_OPCODE_SQRT:
1654 case SHADER_OPCODE_EXP2:
1655 case SHADER_OPCODE_LOG2:
1656 case SHADER_OPCODE_SIN:
1657 case SHADER_OPCODE_COS:
1658 break;
1659 default:
1660 unreachable("not reached: bad math opcode");
1661 }
1662
1663 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1664 * might be able to do better by doing execsize = 1 math and then
1665 * expanding that result out, but we would need to be careful with
1666 * masking.
1667 *
1668 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1669 * instructions, so we also move to a temp to set those up.
1670 */
1671 if (devinfo->gen == 6 || devinfo->gen == 7)
1672 src = fix_math_operand(src);
1673
1674 fs_inst *inst = emit(opcode, dst, src);
1675
1676 if (devinfo->gen < 6) {
1677 inst->base_mrf = 2;
1678 inst->mlen = dispatch_width / 8;
1679 }
1680
1681 return inst;
1682 }
1683
1684 fs_inst *
1685 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1686 {
1687 int base_mrf = 2;
1688 fs_inst *inst;
1689
1690 if (devinfo->gen >= 8) {
1691 inst = emit(opcode, dst, src0, src1);
1692 } else if (devinfo->gen >= 6) {
1693 src0 = fix_math_operand(src0);
1694 src1 = fix_math_operand(src1);
1695
1696 inst = emit(opcode, dst, src0, src1);
1697 } else {
1698 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1699 * "Message Payload":
1700 *
1701 * "Operand0[7]. For the INT DIV functions, this operand is the
1702 * denominator."
1703 * ...
1704 * "Operand1[7]. For the INT DIV functions, this operand is the
1705 * numerator."
1706 */
1707 bool is_int_div = opcode != SHADER_OPCODE_POW;
1708 fs_reg &op0 = is_int_div ? src1 : src0;
1709 fs_reg &op1 = is_int_div ? src0 : src1;
1710
1711 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1712 inst = emit(opcode, dst, op0, reg_null_f);
1713
1714 inst->base_mrf = base_mrf;
1715 inst->mlen = 2 * dispatch_width / 8;
1716 }
1717 return inst;
1718 }
1719
1720 void
1721 fs_visitor::emit_discard_jump()
1722 {
1723 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1724
1725 /* For performance, after a discard, jump to the end of the
1726 * shader if all relevant channels have been discarded.
1727 */
1728 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1729 discard_jump->flag_subreg = 1;
1730
1731 discard_jump->predicate = (dispatch_width == 8)
1732 ? BRW_PREDICATE_ALIGN1_ANY8H
1733 : BRW_PREDICATE_ALIGN1_ANY16H;
1734 discard_jump->predicate_inverse = true;
1735 }
1736
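/**
 * Assign locations for the push constants (CURB): record where the constant
 * payload starts for this dispatch width and rewrite every UNIFORM source to
 * the fixed hardware register it was pushed into.
 */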
1737 void
1738 fs_visitor::assign_curb_setup()
1739 {
1740 if (dispatch_width == 8) {
1741 prog_data->dispatch_grf_start_reg = payload.num_regs;
1742 } else {
1743 if (stage == MESA_SHADER_FRAGMENT) {
1744 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1745 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1746 } else if (stage == MESA_SHADER_COMPUTE) {
1747 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1748 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1749 } else {
1750 unreachable("Unsupported shader type!");
1751 }
1752 }
1753
1754 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1755
1756 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1757 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1758 for (unsigned int i = 0; i < inst->sources; i++) {
1759 if (inst->src[i].file == UNIFORM) {
1760 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1761 int constant_nr;
1762 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1763 constant_nr = push_constant_loc[uniform_nr];
1764 } else {
1765 /* Section 5.11 of the OpenGL 4.1 spec says:
1766 * "Out-of-bounds reads return undefined values, which include
1767 * values from other variables of the active program or zero."
1768 * Just return the first push constant.
1769 */
1770 constant_nr = 0;
1771 }
1772
1773 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1774 constant_nr / 8,
1775 constant_nr % 8);
1776
1777 inst->src[i].file = HW_REG;
1778 inst->src[i].fixed_hw_reg = byte_offset(
1779 retype(brw_reg, inst->src[i].type),
1780 inst->src[i].subreg_offset);
1781 }
1782 }
1783 }
1784 }
1785
1786 void
1787 fs_visitor::calculate_urb_setup()
1788 {
1789 assert(stage == MESA_SHADER_FRAGMENT);
1790 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1791 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1792
1793 memset(prog_data->urb_setup, -1,
1794 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1795
1796 int urb_next = 0;
1797 /* Figure out where each of the incoming setup attributes lands. */
1798 if (devinfo->gen >= 6) {
1799 if (_mesa_bitcount_64(prog->InputsRead &
1800 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1801 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1802 * first 16 varying inputs, so we can put them wherever we want.
1803 * Just put them in order.
1804 *
1805 * This is useful because it means that (a) inputs not used by the
1806 * fragment shader won't take up valuable register space, and (b) we
1807 * won't have to recompile the fragment shader if it gets paired with
1808 * a different vertex (or geometry) shader.
1809 */
1810 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1811 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1812 BITFIELD64_BIT(i)) {
1813 prog_data->urb_setup[i] = urb_next++;
1814 }
1815 }
1816 } else {
1817 /* We have enough input varyings that the SF/SBE pipeline stage can't
1818 * arbitrarily rearrange them to suit our whim; we have to put them
1819 * in an order that matches the output of the previous pipeline stage
1820 * (geometry or vertex shader).
1821 */
1822 struct brw_vue_map prev_stage_vue_map;
1823 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1824 key->input_slots_valid);
1825 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1826 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1827 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1828 slot++) {
1829 int varying = prev_stage_vue_map.slot_to_varying[slot];
1830 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1831 * unused.
1832 */
1833 if (varying != BRW_VARYING_SLOT_COUNT &&
1834 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1835 BITFIELD64_BIT(varying))) {
1836 prog_data->urb_setup[varying] = slot - first_slot;
1837 }
1838 }
1839 urb_next = prev_stage_vue_map.num_slots - first_slot;
1840 }
1841 } else {
1842 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1843 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1844 /* Point size is packed into the header, not as a general attribute */
1845 if (i == VARYING_SLOT_PSIZ)
1846 continue;
1847
1848 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1849 /* The back color slot is skipped when the front color is
1850 * also written to. In addition, some slots can be
1851 * written in the vertex shader and not read in the
1852 * fragment shader. So the register number must always be
1853 * incremented, mapped or not.
1854 */
1855 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1856 prog_data->urb_setup[i] = urb_next;
1857 urb_next++;
1858 }
1859 }
1860
1861 /*
1862 * It's an FS-only attribute, and we did interpolation for this attribute
1863 * in the SF thread. So count it here, too.
1864 *
1865 * See compile_sf_prog() for more info.
1866 */
1867 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1868 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1869 }
1870
1871 prog_data->num_varying_inputs = urb_next;
1872 }
1873
1874 void
1875 fs_visitor::assign_urb_setup()
1876 {
1877 assert(stage == MESA_SHADER_FRAGMENT);
1878 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1879
1880 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1881
1882    /* Offset all the urb_setup[] indices by the actual position of the
1883     * setup regs, now that the location of the constants has been chosen.
1884     */
1885 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1886 if (inst->opcode == FS_OPCODE_LINTERP) {
1887 assert(inst->src[1].file == HW_REG);
1888 inst->src[1].fixed_hw_reg.nr += urb_start;
1889 }
1890
1891 if (inst->opcode == FS_OPCODE_CINTERP) {
1892 assert(inst->src[0].file == HW_REG);
1893 inst->src[0].fixed_hw_reg.nr += urb_start;
1894 }
1895 }
1896
1897 /* Each attribute is 4 setup channels, each of which is half a reg. */
1898 this->first_non_payload_grf =
1899 urb_start + prog_data->num_varying_inputs * 2;
1900 }
1901
1902 void
1903 fs_visitor::assign_vs_urb_setup()
1904 {
1905 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1906 int grf, count, slot, channel, attr;
1907
1908 assert(stage == MESA_SHADER_VERTEX);
1909 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1910 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1911 count++;
1912
1913 /* Each attribute is 4 regs. */
1914 this->first_non_payload_grf =
1915 payload.num_regs + prog_data->curb_read_length + count * 4;
1916
1917 unsigned vue_entries =
1918 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1919
1920 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1921 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1922
1923 assert(vs_prog_data->base.urb_read_length <= 15);
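   /* urb_entry_size is counted in units of four slots (64 bytes) and
    * urb_read_length in pairs of slots, which is why vue_entries is aligned
    * to 4 and count is rounded up to an even number above.  The assert
    * reflects what appears to be a hardware limit of 15 such pairs.
    */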
1924
1925 /* Rewrite all ATTR file references to the hw grf that they land in. */
1926 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1927 for (int i = 0; i < inst->sources; i++) {
1928 if (inst->src[i].file == ATTR) {
1929
1930 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1931 slot = count - 1;
1932 } else {
1933             /* Attributes come in as a contiguous block, ordered by their
1934 * gl_vert_attrib value. That means we can compute the slot
1935 * number for an attribute by masking out the enabled
1936 * attributes before it and counting the bits.
1937 */
1938 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1939 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1940 BITFIELD64_MASK(attr));
1941 }
1942
1943 channel = inst->src[i].reg_offset & 3;
1944
1945 grf = payload.num_regs +
1946 prog_data->curb_read_length +
1947 slot * 4 + channel;
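            /* For illustration: if inputs_read has only bits 0 and 2 set and
             * this source is attribute 2 with reg_offset 1, then attr == 2,
             * slot == 1, channel == 1, so the value lives at
             * payload.num_regs + curb_read_length + 5.
             */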
1948
1949 inst->src[i].file = HW_REG;
1950 inst->src[i].fixed_hw_reg =
1951 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1952 }
1953 }
1954 }
1955 }
1956
1957 /**
1958 * Split large virtual GRFs into separate components if we can.
1959 *
1960  * This mostly duplicates what brw_fs_vector_splitting does,
1961 * but that's really conservative because it's afraid of doing
1962 * splitting that doesn't result in real progress after the rest of
1963 * the optimization phases, which would cause infinite looping in
1964 * optimization. We can do it once here, safely. This also has the
1965 * opportunity to split interpolated values, or maybe even uniforms,
1966 * which we don't have at the IR level.
1967 *
1968 * We want to split, because virtual GRFs are what we register
1969 * allocate and spill (due to contiguousness requirements for some
1970 * instructions), and they're what we naturally generate in the
1971 * codegen process, but most virtual GRFs don't actually need to be
1972 * contiguous sets of GRFs. If we split, we'll end up with reduced
1973 * live intervals and better dead code elimination and coalescing.
1974 */
1975 void
1976 fs_visitor::split_virtual_grfs()
1977 {
1978 int num_vars = this->alloc.count;
1979
1980 /* Count the total number of registers */
1981 int reg_count = 0;
1982 int vgrf_to_reg[num_vars];
1983 for (int i = 0; i < num_vars; i++) {
1984 vgrf_to_reg[i] = reg_count;
1985 reg_count += alloc.sizes[i];
1986 }
1987
1988 /* An array of "split points". For each register slot, this indicates
1989 * if this slot can be separated from the previous slot. Every time an
1990 * instruction uses multiple elements of a register (as a source or
1991 * destination), we mark the used slots as inseparable. Then we go
1992 * through and split the registers into the smallest pieces we can.
1993 */
1994 bool split_points[reg_count];
1995 memset(split_points, 0, sizeof(split_points));
1996
1997 /* Mark all used registers as fully splittable */
1998 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1999 if (inst->dst.file == GRF) {
2000 int reg = vgrf_to_reg[inst->dst.reg];
2001 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
2002 split_points[reg + j] = true;
2003 }
2004
2005 for (int i = 0; i < inst->sources; i++) {
2006 if (inst->src[i].file == GRF) {
2007 int reg = vgrf_to_reg[inst->src[i].reg];
2008 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
2009 split_points[reg + j] = true;
2010 }
2011 }
2012 }
2013
2014 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2015 if (inst->dst.file == GRF) {
2016 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2017 for (int j = 1; j < inst->regs_written; j++)
2018 split_points[reg + j] = false;
2019 }
2020 for (int i = 0; i < inst->sources; i++) {
2021 if (inst->src[i].file == GRF) {
2022 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2023 for (int j = 1; j < inst->regs_read(i); j++)
2024 split_points[reg + j] = false;
2025 }
2026 }
2027 }
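   /* At this point, for example, a four-register VGRF that is only ever
    * accessed as two independent two-register chunks still has a split point
    * at its third slot, so below it will be broken into two two-register
    * VGRFs; a VGRF that is always read or written in one piece keeps no
    * split points and is left alone.
    */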
2028
2029 int new_virtual_grf[reg_count];
2030 int new_reg_offset[reg_count];
2031
2032 int reg = 0;
2033 for (int i = 0; i < num_vars; i++) {
2034       /* The first slot can never be a split point; assert that as a quick sanity check. */
2035 assert(split_points[reg] == false);
2036
2037 /* j = 0 case */
2038 new_reg_offset[reg] = 0;
2039 reg++;
2040 int offset = 1;
2041
2042 /* j > 0 case */
2043 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2044          /* If this is a split point, reset the offset to 0 and allocate a
2045           * new virtual GRF covering the previous `offset' registers.
2046           */
2047 if (split_points[reg]) {
2048 assert(offset <= MAX_VGRF_SIZE);
2049 int grf = alloc.allocate(offset);
2050 for (int k = reg - offset; k < reg; k++)
2051 new_virtual_grf[k] = grf;
2052 offset = 0;
2053 }
2054 new_reg_offset[reg] = offset;
2055 offset++;
2056 reg++;
2057 }
2058
2059 /* The last one gets the original register number */
2060 assert(offset <= MAX_VGRF_SIZE);
2061 alloc.sizes[i] = offset;
2062 for (int k = reg - offset; k < reg; k++)
2063 new_virtual_grf[k] = i;
2064 }
2065 assert(reg == reg_count);
2066
2067 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2068 if (inst->dst.file == GRF) {
2069 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2070 inst->dst.reg = new_virtual_grf[reg];
2071 inst->dst.reg_offset = new_reg_offset[reg];
2072 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2073 }
2074 for (int i = 0; i < inst->sources; i++) {
2075 if (inst->src[i].file == GRF) {
2076 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2077 inst->src[i].reg = new_virtual_grf[reg];
2078 inst->src[i].reg_offset = new_reg_offset[reg];
2079 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2080 }
2081 }
2082 }
2083 invalidate_live_intervals();
2084 }
2085
2086 /**
2087 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2088 *
2089 * During code generation, we create tons of temporary variables, many of
2090 * which get immediately killed and are never used again. Yet, in later
2091 * optimization and analysis passes, such as compute_live_intervals, we need
2092 * to loop over all the virtual GRFs. Compacting them can save a lot of
2093 * overhead.
2094 */
2095 bool
2096 fs_visitor::compact_virtual_grfs()
2097 {
2098 bool progress = false;
2099 int remap_table[this->alloc.count];
2100 memset(remap_table, -1, sizeof(remap_table));
2101
2102 /* Mark which virtual GRFs are used. */
2103 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2104 if (inst->dst.file == GRF)
2105 remap_table[inst->dst.reg] = 0;
2106
2107 for (int i = 0; i < inst->sources; i++) {
2108 if (inst->src[i].file == GRF)
2109 remap_table[inst->src[i].reg] = 0;
2110 }
2111 }
2112
2113 /* Compact the GRF arrays. */
2114 int new_index = 0;
2115 for (unsigned i = 0; i < this->alloc.count; i++) {
2116 if (remap_table[i] == -1) {
2117 /* We just found an unused register. This means that we are
2118 * actually going to compact something.
2119 */
2120 progress = true;
2121 } else {
2122 remap_table[i] = new_index;
2123 alloc.sizes[new_index] = alloc.sizes[i];
2124 invalidate_live_intervals();
2125 ++new_index;
2126 }
2127 }
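   /* For example, if only VGRFs 0 and 2 out of three were referenced above,
    * remap_table now reads {0, -1, 1} and new_index == 2.
    */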
2128
2129 this->alloc.count = new_index;
2130
2131 /* Patch all the instructions to use the newly renumbered registers */
2132 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2133 if (inst->dst.file == GRF)
2134 inst->dst.reg = remap_table[inst->dst.reg];
2135
2136 for (int i = 0; i < inst->sources; i++) {
2137 if (inst->src[i].file == GRF)
2138 inst->src[i].reg = remap_table[inst->src[i].reg];
2139 }
2140 }
2141
2142 /* Patch all the references to delta_xy, since they're used in register
2143 * allocation. If they're unused, switch them to BAD_FILE so we don't
2144 * think some random VGRF is delta_xy.
2145 */
2146 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2147 if (delta_xy[i].file == GRF) {
2148 if (remap_table[delta_xy[i].reg] != -1) {
2149 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2150 } else {
2151 delta_xy[i].file = BAD_FILE;
2152 }
2153 }
2154 }
2155
2156 return progress;
2157 }
2158
2159 /*
2160 * Implements array access of uniforms by inserting a
2161 * PULL_CONSTANT_LOAD instruction.
2162 *
2163 * Unlike temporary GRF array access (where we don't support it due to
2164 * the difficulty of doing relative addressing on instruction
2165 * destinations), we could potentially do array access of uniforms
2166 * that were loaded in GRF space as push constants. In real-world
2167 * usage we've seen, though, the arrays being used are always larger
2168 * than we could load as push constants, so just always move all
2169 * uniform array access out to a pull constant buffer.
2170 */
2171 void
2172 fs_visitor::move_uniform_array_access_to_pull_constants()
2173 {
2174 if (dispatch_width != 8)
2175 return;
2176
2177 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2178 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2179
2180 /* Walk through and find array access of uniforms. Put a copy of that
2181 * uniform in the pull constant buffer.
2182 *
2183 * Note that we don't move constant-indexed accesses to arrays. No
2184 * testing has been done of the performance impact of this choice.
2185 */
2186 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2187 for (int i = 0 ; i < inst->sources; i++) {
2188 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2189 continue;
2190
2191 int uniform = inst->src[i].reg;
2192
2193 /* If this array isn't already present in the pull constant buffer,
2194 * add it.
2195 */
2196 if (pull_constant_loc[uniform] == -1) {
2197 const gl_constant_value **values = &stage_prog_data->param[uniform];
2198
2199 assert(param_size[uniform]);
2200
2201 for (int j = 0; j < param_size[uniform]; j++) {
2202 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2203
2204 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2205 values[j];
2206 }
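            /* Note that the whole array is copied: e.g. a uniform float[4]
             * contributes four consecutive pull_param entries, so any reladdr
             * offset within it can be resolved from the pull buffer.
             */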
2207 }
2208 }
2209 }
2210 }
2211
2212 /**
2213 * Assign UNIFORM file registers to either push constants or pull constants.
2214 *
2215 * We allow a fragment shader to have more than the specified minimum
2216 * maximum number of fragment shader uniform components (64). If
2217  * there are too many of these, they'd fill up all of the register space.
2218 * So, this will push some of them out to the pull constant buffer and
2219 * update the program to load them.
2220 */
2221 void
2222 fs_visitor::assign_constant_locations()
2223 {
2224 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2225 if (dispatch_width != 8)
2226 return;
2227
2228 /* Find which UNIFORM registers are still in use. */
2229 bool is_live[uniforms];
2230 for (unsigned int i = 0; i < uniforms; i++) {
2231 is_live[i] = false;
2232 }
2233
2234 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2235 for (int i = 0; i < inst->sources; i++) {
2236 if (inst->src[i].file != UNIFORM)
2237 continue;
2238
2239 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2240 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2241 is_live[constant_nr] = true;
2242 }
2243 }
2244
2245 /* Only allow 16 registers (128 uniform components) as push constants.
2246 *
2247 * Just demote the end of the list. We could probably do better
2248 * here, demoting things that are rarely used in the program first.
2249 *
2250 * If changing this value, note the limitation about total_regs in
2251 * brw_curbe.c.
2252 */
2253 unsigned int max_push_components = 16 * 8;
2254 unsigned int num_push_constants = 0;
2255
2256 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2257
2258 for (unsigned int i = 0; i < uniforms; i++) {
2259 if (!is_live[i] || pull_constant_loc[i] != -1) {
2260 /* This UNIFORM register is either dead, or has already been demoted
2261 * to a pull const. Mark it as no longer living in the param[] array.
2262 */
2263 push_constant_loc[i] = -1;
2264 continue;
2265 }
2266
2267 if (num_push_constants < max_push_components) {
2268          /* Retain as a push constant. Record the location in the param[]
2269 * array.
2270 */
2271 push_constant_loc[i] = num_push_constants++;
2272 } else {
2273 /* Demote to a pull constant. */
2274 push_constant_loc[i] = -1;
2275
2276 int pull_index = stage_prog_data->nr_pull_params++;
2277 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2278 pull_constant_loc[i] = pull_index;
2279 }
2280 }
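   /* For example, with 200 live uniform components and nothing demoted yet,
    * the first 128 keep push_constant_loc slots 0..127 and the remaining 72
    * are appended to the pull constant buffer.
    */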
2281
2282 stage_prog_data->nr_params = num_push_constants;
2283
2284 /* Up until now, the param[] array has been indexed by reg + reg_offset
2285 * of UNIFORM registers. Condense it to only contain the uniforms we
2286 * chose to upload as push constants.
2287 */
2288 for (unsigned int i = 0; i < uniforms; i++) {
2289 int remapped = push_constant_loc[i];
2290
2291 if (remapped == -1)
2292 continue;
2293
2294 assert(remapped <= (int)i);
2295 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2296 }
2297 }
2298
2299 /**
2300 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2301 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2302 */
2303 void
2304 fs_visitor::demote_pull_constants()
2305 {
2306 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2307 for (int i = 0; i < inst->sources; i++) {
2308 if (inst->src[i].file != UNIFORM)
2309 continue;
2310
2311 int pull_index;
2312 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2313 if (location >= uniforms) /* Out of bounds access */
2314 pull_index = -1;
2315 else
2316 pull_index = pull_constant_loc[location];
2317
2318 if (pull_index == -1)
2319 continue;
2320
2321          /* Set up the annotation tracking for newly generated instructions. */
2322 base_ir = inst->ir;
2323 current_annotation = inst->annotation;
2324
2325 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2326 fs_reg dst = vgrf(glsl_type::float_type);
2327
2328 /* Generate a pull load into dst. */
2329 if (inst->src[i].reladdr) {
2330 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2331 surf_index,
2332 *inst->src[i].reladdr,
2333 pull_index);
2334 inst->insert_before(block, &list);
2335 inst->src[i].reladdr = NULL;
2336 } else {
2337 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
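            /* pull_index counts float components; rounding the byte offset
             * down to a 16-byte boundary fetches the vec4 containing it, and
             * set_smear() below selects the right component out of that vec4.
             */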
2338 fs_inst *pull =
2339 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2340 dst, surf_index, offset);
2341 inst->insert_before(block, pull);
2342 inst->src[i].set_smear(pull_index & 3);
2343 }
2344
2345 /* Rewrite the instruction to use the temporary VGRF. */
2346 inst->src[i].file = GRF;
2347 inst->src[i].reg = dst.reg;
2348 inst->src[i].reg_offset = 0;
2349 inst->src[i].width = dispatch_width;
2350 }
2351 }
2352 invalidate_live_intervals();
2353 }
2354
2355 bool
2356 fs_visitor::opt_algebraic()
2357 {
2358 bool progress = false;
2359
2360 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2361 switch (inst->opcode) {
2362 case BRW_OPCODE_MOV:
2363 if (inst->src[0].file != IMM)
2364 break;
2365
2366 if (inst->saturate) {
2367 if (inst->dst.type != inst->src[0].type)
2368 assert(!"unimplemented: saturate mixed types");
2369
2370 if (brw_saturate_immediate(inst->dst.type,
2371 &inst->src[0].fixed_hw_reg)) {
2372 inst->saturate = false;
2373 progress = true;
2374 }
2375 }
2376 break;
2377
2378 case BRW_OPCODE_MUL:
2379 if (inst->src[1].file != IMM)
2380 continue;
2381
2382 /* a * 1.0 = a */
2383 if (inst->src[1].is_one()) {
2384 inst->opcode = BRW_OPCODE_MOV;
2385 inst->src[1] = reg_undef;
2386 progress = true;
2387 break;
2388 }
2389
2390 /* a * -1.0 = -a */
2391 if (inst->src[1].is_negative_one()) {
2392 inst->opcode = BRW_OPCODE_MOV;
2393 inst->src[0].negate = !inst->src[0].negate;
2394 inst->src[1] = reg_undef;
2395 progress = true;
2396 break;
2397 }
2398
2399 /* a * 0.0 = 0.0 */
2400 if (inst->src[1].is_zero()) {
2401 inst->opcode = BRW_OPCODE_MOV;
2402 inst->src[0] = inst->src[1];
2403 inst->src[1] = reg_undef;
2404 progress = true;
2405 break;
2406 }
2407
2408 if (inst->src[0].file == IMM) {
2409 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2410 inst->opcode = BRW_OPCODE_MOV;
2411 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2412 inst->src[1] = reg_undef;
2413 progress = true;
2414 break;
2415 }
2416 break;
2417 case BRW_OPCODE_ADD:
2418 if (inst->src[1].file != IMM)
2419 continue;
2420
2421 /* a + 0.0 = a */
2422 if (inst->src[1].is_zero()) {
2423 inst->opcode = BRW_OPCODE_MOV;
2424 inst->src[1] = reg_undef;
2425 progress = true;
2426 break;
2427 }
2428
2429 if (inst->src[0].file == IMM) {
2430 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2431 inst->opcode = BRW_OPCODE_MOV;
2432 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2433 inst->src[1] = reg_undef;
2434 progress = true;
2435 break;
2436 }
2437 break;
2438 case BRW_OPCODE_OR:
2439 if (inst->src[0].equals(inst->src[1])) {
2440 inst->opcode = BRW_OPCODE_MOV;
2441 inst->src[1] = reg_undef;
2442 progress = true;
2443 break;
2444 }
2445 break;
2446 case BRW_OPCODE_LRP:
2447 if (inst->src[1].equals(inst->src[2])) {
2448 inst->opcode = BRW_OPCODE_MOV;
2449 inst->src[0] = inst->src[1];
2450 inst->src[1] = reg_undef;
2451 inst->src[2] = reg_undef;
2452 progress = true;
2453 break;
2454 }
2455 break;
2456 case BRW_OPCODE_CMP:
2457 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2458 inst->src[0].abs &&
2459 inst->src[0].negate &&
2460 inst->src[1].is_zero()) {
2461 inst->src[0].abs = false;
2462 inst->src[0].negate = false;
2463 inst->conditional_mod = BRW_CONDITIONAL_Z;
2464 progress = true;
2465 break;
2466 }
2467 break;
2468 case BRW_OPCODE_SEL:
2469 if (inst->src[0].equals(inst->src[1])) {
2470 inst->opcode = BRW_OPCODE_MOV;
2471 inst->src[1] = reg_undef;
2472 inst->predicate = BRW_PREDICATE_NONE;
2473 inst->predicate_inverse = false;
2474 progress = true;
2475 } else if (inst->saturate && inst->src[1].file == IMM) {
2476 switch (inst->conditional_mod) {
2477 case BRW_CONDITIONAL_LE:
2478 case BRW_CONDITIONAL_L:
2479 switch (inst->src[1].type) {
2480 case BRW_REGISTER_TYPE_F:
2481 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2482 inst->opcode = BRW_OPCODE_MOV;
2483 inst->src[1] = reg_undef;
2484 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2485 progress = true;
2486 }
2487 break;
2488 default:
2489 break;
2490 }
2491 break;
2492 case BRW_CONDITIONAL_GE:
2493 case BRW_CONDITIONAL_G:
2494 switch (inst->src[1].type) {
2495 case BRW_REGISTER_TYPE_F:
2496 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2497 inst->opcode = BRW_OPCODE_MOV;
2498 inst->src[1] = reg_undef;
2499 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2500 progress = true;
2501 }
2502 break;
2503 default:
2504 break;
2505 }
2506 default:
2507 break;
2508 }
2509 }
2510 break;
2511 case BRW_OPCODE_MAD:
2512 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2513 inst->opcode = BRW_OPCODE_MOV;
2514 inst->src[1] = reg_undef;
2515 inst->src[2] = reg_undef;
2516 progress = true;
2517 } else if (inst->src[0].is_zero()) {
2518 inst->opcode = BRW_OPCODE_MUL;
2519 inst->src[0] = inst->src[2];
2520 inst->src[2] = reg_undef;
2521 progress = true;
2522 } else if (inst->src[1].is_one()) {
2523 inst->opcode = BRW_OPCODE_ADD;
2524 inst->src[1] = inst->src[2];
2525 inst->src[2] = reg_undef;
2526 progress = true;
2527 } else if (inst->src[2].is_one()) {
2528 inst->opcode = BRW_OPCODE_ADD;
2529 inst->src[2] = reg_undef;
2530 progress = true;
2531 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2532 inst->opcode = BRW_OPCODE_ADD;
2533 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2534 inst->src[2] = reg_undef;
2535 progress = true;
2536 }
2537 break;
2538 case SHADER_OPCODE_RCP: {
2539 fs_inst *prev = (fs_inst *)inst->prev;
2540 if (prev->opcode == SHADER_OPCODE_SQRT) {
2541 if (inst->src[0].equals(prev->dst)) {
2542 inst->opcode = SHADER_OPCODE_RSQ;
2543 inst->src[0] = prev->src[0];
2544 progress = true;
2545 }
2546 }
2547 break;
2548 }
2549 case SHADER_OPCODE_BROADCAST:
2550 if (is_uniform(inst->src[0])) {
2551 inst->opcode = BRW_OPCODE_MOV;
2552 inst->sources = 1;
2553 inst->force_writemask_all = true;
2554 progress = true;
2555 } else if (inst->src[1].file == IMM) {
2556 inst->opcode = BRW_OPCODE_MOV;
2557 inst->src[0] = component(inst->src[0],
2558 inst->src[1].fixed_hw_reg.dw1.ud);
2559 inst->sources = 1;
2560 inst->force_writemask_all = true;
2561 progress = true;
2562 }
2563 break;
2564
2565 default:
2566 break;
2567 }
2568
2569       /* If src[0] is an immediate, swap it into src[1] (commutative ops only). */
2570 if (progress && inst->is_commutative()) {
2571 if (inst->src[0].file == IMM) {
2572 fs_reg tmp = inst->src[1];
2573 inst->src[1] = inst->src[0];
2574 inst->src[0] = tmp;
2575 }
2576 }
2577 }
2578 return progress;
2579 }
2580
2581 /**
2582 * Optimize sample messages that have constant zero values for the trailing
2583 * texture coordinates. We can just reduce the message length for these
2584 * instructions instead of reserving a register for it. Trailing parameters
2585 * that aren't sent default to zero anyway. This will cause the dead code
2586 * eliminator to remove the MOV instruction that would otherwise be emitted to
2587 * set up the zero value.
2588 */
2589 bool
2590 fs_visitor::opt_zero_samples()
2591 {
2592 /* Gen4 infers the texturing opcode based on the message length so we can't
2593 * change it.
2594 */
2595 if (devinfo->gen < 5)
2596 return false;
2597
2598 bool progress = false;
2599
2600 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2601 if (!inst->is_tex())
2602 continue;
2603
2604 fs_inst *load_payload = (fs_inst *) inst->prev;
2605
2606 if (load_payload->is_head_sentinel() ||
2607 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2608 continue;
2609
2610       /* We don't want to remove the message header.  We also avoid removing
2611        * all of the parameters, because that seems to cause a GPU hang and I
2612        * can't find any documentation indicating that this is expected.
2613 */
2614 while (inst->mlen > inst->header_present + dispatch_width / 8 &&
2615 load_payload->src[(inst->mlen - inst->header_present) /
2616 (dispatch_width / 8) +
2617 inst->header_present - 1].is_zero()) {
2618 inst->mlen -= dispatch_width / 8;
2619 progress = true;
2620 }
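      /* Each parameter occupies dispatch_width / 8 registers, so the loop
       * above trims one parameter's worth of message length per all-zero
       * trailing source, while always keeping the header (if any) plus at
       * least one parameter.
       */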
2621 }
2622
2623 if (progress)
2624 invalidate_live_intervals();
2625
2626 return progress;
2627 }
2628
2629 /**
2630 * Optimize sample messages which are followed by the final RT write.
2631 *
2632  * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2633 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2634 * final texturing results copied to the framebuffer write payload and modify
2635 * them to write to the framebuffer directly.
2636 */
2637 bool
2638 fs_visitor::opt_sampler_eot()
2639 {
2640 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2641
2642 if (stage != MESA_SHADER_FRAGMENT)
2643 return false;
2644
2645 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2646 return false;
2647
2648 /* FINISHME: It should be possible to implement this optimization when there
2649 * are multiple drawbuffers.
2650 */
2651 if (key->nr_color_regions != 1)
2652 return false;
2653
2654 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2655 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2656 assert(fb_write->eot);
2657 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2658
2659 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2660
2661 /* There wasn't one; nothing to do. */
2662 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2663 return false;
2664
2665 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2666 * It's very likely to be the previous instruction.
2667 */
2668 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2669 if (load_payload->is_head_sentinel() ||
2670 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2671 return false;
2672
2673 assert(!tex_inst->eot); /* We can't get here twice */
2674 assert((tex_inst->offset & (0xff << 24)) == 0);
2675
2676 tex_inst->offset |= fb_write->target << 24;
2677 tex_inst->eot = true;
2678 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2679
2680 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2681 * to create a new LOAD_PAYLOAD command with the same sources and a space
2682  * saved for the header. Using a new destination register not only makes sure
2683  * we have enough space, but also lets the dead code eliminator kill the old
2684  * LOAD_PAYLOAD that this one replaces.
2685 */
2686 if (tex_inst->header_present)
2687 return true;
2688
2689 fs_reg send_header = vgrf(load_payload->sources + 1);
2690 fs_reg *new_sources =
2691 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2692
2693 new_sources[0] = fs_reg();
2694 for (int i = 0; i < load_payload->sources; i++)
2695 new_sources[i+1] = load_payload->src[i];
2696
2697    /* The LOAD_PAYLOAD helper seems like the obvious choice here.  However,
2698     * it requires a lot of information about the sources in order to figure
2699     * out how many registers need to be used.  At this stage of optimization
2700     * (after copy propagation), we may not have the GRFs that LOAD_PAYLOAD
2701     * requires, so we need to emit the instruction manually instead of using
2702     * the helper.
2703 */
2704 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2705 load_payload->exec_size,
2706 send_header,
2707 new_sources,
2708 load_payload->sources + 1);
2709
2710 new_load_payload->regs_written = load_payload->regs_written + 1;
2711 tex_inst->mlen++;
2712 tex_inst->header_present = true;
2713 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2714 tex_inst->src[0] = send_header;
2715 tex_inst->dst = reg_null_ud;
2716
2717 return true;
2718 }
2719
2720 bool
2721 fs_visitor::opt_register_renaming()
2722 {
2723 bool progress = false;
2724 int depth = 0;
2725
2726 int remap[alloc.count];
2727 memset(remap, -1, sizeof(int) * alloc.count);
2728
2729 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2730 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2731 depth++;
2732 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2733 inst->opcode == BRW_OPCODE_WHILE) {
2734 depth--;
2735 }
2736
2737 /* Rewrite instruction sources. */
2738 for (int i = 0; i < inst->sources; i++) {
2739 if (inst->src[i].file == GRF &&
2740 remap[inst->src[i].reg] != -1 &&
2741 remap[inst->src[i].reg] != inst->src[i].reg) {
2742 inst->src[i].reg = remap[inst->src[i].reg];
2743 progress = true;
2744 }
2745 }
2746
2747 const int dst = inst->dst.reg;
2748
2749 if (depth == 0 &&
2750 inst->dst.file == GRF &&
2751 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2752 !inst->is_partial_write()) {
2753 if (remap[dst] == -1) {
2754 remap[dst] = dst;
2755 } else {
2756 remap[dst] = alloc.allocate(inst->dst.width / 8);
2757 inst->dst.reg = remap[dst];
2758 progress = true;
2759 }
2760 } else if (inst->dst.file == GRF &&
2761 remap[dst] != -1 &&
2762 remap[dst] != dst) {
2763 inst->dst.reg = remap[dst];
2764 progress = true;
2765 }
2766 }
2767
2768 if (progress) {
2769 invalidate_live_intervals();
2770
2771 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2772 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2773 delta_xy[i].reg = remap[delta_xy[i].reg];
2774 }
2775 }
2776 }
2777
2778 return progress;
2779 }
2780
2781 /**
2782 * Remove redundant or useless discard jumps.
2783 *
2784 * For example, we can eliminate jumps in the following sequence:
2785 *
2786 * discard-jump (redundant with the next jump)
2787 * discard-jump (useless; jumps to the next instruction)
2788 * placeholder-halt
2789 */
2790 bool
2791 fs_visitor::opt_redundant_discard_jumps()
2792 {
2793 bool progress = false;
2794
2795 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2796
2797 fs_inst *placeholder_halt = NULL;
2798 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2799 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2800 placeholder_halt = inst;
2801 break;
2802 }
2803 }
2804
2805 if (!placeholder_halt)
2806 return false;
2807
2808    /* Delete any discard jumps immediately before the placeholder halt. */
2809 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2810 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2811 prev = (fs_inst *) placeholder_halt->prev) {
2812 prev->remove(last_bblock);
2813 progress = true;
2814 }
2815
2816 if (progress)
2817 invalidate_live_intervals();
2818
2819 return progress;
2820 }
2821
2822 bool
2823 fs_visitor::compute_to_mrf()
2824 {
2825 bool progress = false;
2826 int next_ip = 0;
2827
2828 /* No MRFs on Gen >= 7. */
2829 if (devinfo->gen >= 7)
2830 return false;
2831
2832 calculate_live_intervals();
2833
2834 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2835 int ip = next_ip;
2836 next_ip++;
2837
2838 if (inst->opcode != BRW_OPCODE_MOV ||
2839 inst->is_partial_write() ||
2840 inst->dst.file != MRF || inst->src[0].file != GRF ||
2841 inst->dst.type != inst->src[0].type ||
2842 inst->src[0].abs || inst->src[0].negate ||
2843 !inst->src[0].is_contiguous() ||
2844 inst->src[0].subreg_offset)
2845 continue;
2846
2847 /* Work out which hardware MRF registers are written by this
2848 * instruction.
2849 */
2850 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2851 int mrf_high;
2852 if (inst->dst.reg & BRW_MRF_COMPR4) {
2853 mrf_high = mrf_low + 4;
2854 } else if (inst->exec_size == 16) {
2855 mrf_high = mrf_low + 1;
2856 } else {
2857 mrf_high = mrf_low;
2858 }
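      /* mrf_low/mrf_high bracket the MRFs this MOV touches: a COMPR4 write to
       * m2 lands in m2 and m6, a plain SIMD16 write to m2 covers m2 and m3,
       * and a SIMD8 write covers just m2.  The scan below uses this range to
       * detect conflicting MRF writes and live SEND payloads.
       */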
2859
2860 /* Can't compute-to-MRF this GRF if someone else was going to
2861 * read it later.
2862 */
2863 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2864 continue;
2865
2866 /* Found a move of a GRF to a MRF. Let's see if we can go
2867 * rewrite the thing that made this GRF to write into the MRF.
2868 */
2869 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2870 if (scan_inst->dst.file == GRF &&
2871 scan_inst->dst.reg == inst->src[0].reg) {
2872             /* Found the last instruction to write the GRF we want to turn
2873              * into a compute-to-MRF.
2874              */
2875
2876 /* If this one instruction didn't populate all the
2877 * channels, bail. We might be able to rewrite everything
2878 * that writes that reg, but it would require smarter
2879 * tracking to delay the rewriting until complete success.
2880 */
2881 if (scan_inst->is_partial_write())
2882 break;
2883
2884             /* Instructions writing more than one register would require us
2885              * to understand how to coalesce more than one MOV at a time.
2886              */
2887 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2888 break;
2889
2890 /* SEND instructions can't have MRF as a destination. */
2891 if (scan_inst->mlen)
2892 break;
2893
2894 if (devinfo->gen == 6) {
2895 /* gen6 math instructions must have the destination be
2896 * GRF, so no compute-to-MRF for them.
2897 */
2898 if (scan_inst->is_math()) {
2899 break;
2900 }
2901 }
2902
2903 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2904 /* Found the creator of our MRF's source value. */
2905 scan_inst->dst.file = MRF;
2906 scan_inst->dst.reg = inst->dst.reg;
2907 scan_inst->saturate |= inst->saturate;
2908 inst->remove(block);
2909 progress = true;
2910 }
2911 break;
2912 }
2913
2914 /* We don't handle control flow here. Most computation of
2915           * values that end up in MRFs happens shortly before the MRF
2916 * write anyway.
2917 */
2918 if (block->start() == scan_inst)
2919 break;
2920
2921 /* You can't read from an MRF, so if someone else reads our
2922 * MRF's source GRF that we wanted to rewrite, that stops us.
2923 */
2924 bool interfered = false;
2925 for (int i = 0; i < scan_inst->sources; i++) {
2926 if (scan_inst->src[i].file == GRF &&
2927 scan_inst->src[i].reg == inst->src[0].reg &&
2928 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2929 interfered = true;
2930 }
2931 }
2932 if (interfered)
2933 break;
2934
2935 if (scan_inst->dst.file == MRF) {
2936 /* If somebody else writes our MRF here, we can't
2937 * compute-to-MRF before that.
2938 */
2939 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2940 int scan_mrf_high;
2941
2942 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2943 scan_mrf_high = scan_mrf_low + 4;
2944 } else if (scan_inst->exec_size == 16) {
2945 scan_mrf_high = scan_mrf_low + 1;
2946 } else {
2947 scan_mrf_high = scan_mrf_low;
2948 }
2949
2950 if (mrf_low == scan_mrf_low ||
2951 mrf_low == scan_mrf_high ||
2952 mrf_high == scan_mrf_low ||
2953 mrf_high == scan_mrf_high) {
2954 break;
2955 }
2956 }
2957
2958 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2959 /* Found a SEND instruction, which means that there are
2960 * live values in MRFs from base_mrf to base_mrf +
2961 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2962 * above it.
2963 */
2964 if (mrf_low >= scan_inst->base_mrf &&
2965 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2966 break;
2967 }
2968 if (mrf_high >= scan_inst->base_mrf &&
2969 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2970 break;
2971 }
2972 }
2973 }
2974 }
2975
2976 if (progress)
2977 invalidate_live_intervals();
2978
2979 return progress;
2980 }
2981
2982 /**
2983 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2984 * instructions to FS_OPCODE_REP_FB_WRITE.
2985 */
2986 void
2987 fs_visitor::emit_repclear_shader()
2988 {
2989 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2990 int base_mrf = 1;
2991 int color_mrf = base_mrf + 2;
2992
2993 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2994 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2995 mov->force_writemask_all = true;
2996
2997 fs_inst *write;
2998 if (key->nr_color_regions == 1) {
2999 write = emit(FS_OPCODE_REP_FB_WRITE);
3000 write->saturate = key->clamp_fragment_color;
3001 write->base_mrf = color_mrf;
3002 write->target = 0;
3003 write->header_present = false;
3004 write->mlen = 1;
3005 } else {
3006 assume(key->nr_color_regions > 0);
3007 for (int i = 0; i < key->nr_color_regions; ++i) {
3008 write = emit(FS_OPCODE_REP_FB_WRITE);
3009 write->saturate = key->clamp_fragment_color;
3010 write->base_mrf = base_mrf;
3011 write->target = i;
3012 write->header_present = true;
3013 write->mlen = 3;
3014 }
3015 }
3016 write->eot = true;
3017
3018 calculate_cfg();
3019
3020 assign_constant_locations();
3021 assign_curb_setup();
3022
3023 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
3024 assert(mov->src[0].file == HW_REG);
3025 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
3026 }
3027
3028 /**
3029 * Walks through basic blocks, looking for repeated MRF writes and
3030 * removing the later ones.
3031 */
3032 bool
3033 fs_visitor::remove_duplicate_mrf_writes()
3034 {
3035 fs_inst *last_mrf_move[16];
3036 bool progress = false;
3037
3038    /* We'd need to update the MRF tracking for compressed instructions, so bail on SIMD16 for now. */
3039 if (dispatch_width == 16)
3040 return false;
3041
3042 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3043
3044 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3045 if (inst->is_control_flow()) {
3046 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3047 }
3048
3049 if (inst->opcode == BRW_OPCODE_MOV &&
3050 inst->dst.file == MRF) {
3051 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3052 if (prev_inst && inst->equals(prev_inst)) {
3053 inst->remove(block);
3054 progress = true;
3055 continue;
3056 }
3057 }
3058
3059 /* Clear out the last-write records for MRFs that were overwritten. */
3060 if (inst->dst.file == MRF) {
3061 last_mrf_move[inst->dst.reg] = NULL;
3062 }
3063
3064 if (inst->mlen > 0 && inst->base_mrf != -1) {
3065 /* Found a SEND instruction, which will include two or fewer
3066 * implied MRF writes. We could do better here.
3067 */
3068 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3069 last_mrf_move[inst->base_mrf + i] = NULL;
3070 }
3071 }
3072
3073 /* Clear out any MRF move records whose sources got overwritten. */
3074 if (inst->dst.file == GRF) {
3075 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3076 if (last_mrf_move[i] &&
3077 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3078 last_mrf_move[i] = NULL;
3079 }
3080 }
3081 }
3082
3083 if (inst->opcode == BRW_OPCODE_MOV &&
3084 inst->dst.file == MRF &&
3085 inst->src[0].file == GRF &&
3086 !inst->is_partial_write()) {
3087 last_mrf_move[inst->dst.reg] = inst;
3088 }
3089 }
3090
3091 if (progress)
3092 invalidate_live_intervals();
3093
3094 return progress;
3095 }
3096
3097 static void
3098 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3099 {
3100 /* Clear the flag for registers that actually got read (as expected). */
3101 for (int i = 0; i < inst->sources; i++) {
3102 int grf;
3103 if (inst->src[i].file == GRF) {
3104 grf = inst->src[i].reg;
3105 } else if (inst->src[i].file == HW_REG &&
3106 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3107 grf = inst->src[i].fixed_hw_reg.nr;
3108 } else {
3109 continue;
3110 }
3111
3112 if (grf >= first_grf &&
3113 grf < first_grf + grf_len) {
3114 deps[grf - first_grf] = false;
3115 if (inst->exec_size == 16)
3116 deps[grf - first_grf + 1] = false;
3117 }
3118 }
3119 }
3120
3121 /**
3122 * Implements this workaround for the original 965:
3123 *
3124 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3125 * check for post destination dependencies on this instruction, software
3126 * must ensure that there is no destination hazard for the case of ‘write
3127 * followed by a posted write’ shown in the following example.
3128 *
3129 * 1. mov r3 0
3130 * 2. send r3.xy <rest of send instruction>
3131 * 3. mov r2 r3
3132 *
3133 * Due to no post-destination dependency check on the ‘send’, the above
3134 * code sequence could have two instructions (1 and 2) in flight at the
3135  *    same time that both consider ‘r3’ as the target of their final writes."
3136 */
3137 void
3138 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3139 fs_inst *inst)
3140 {
3141 int write_len = inst->regs_written;
3142 int first_write_grf = inst->dst.reg;
3143 bool needs_dep[BRW_MAX_MRF];
3144 assert(write_len < (int)sizeof(needs_dep) - 1);
3145
3146 memset(needs_dep, false, sizeof(needs_dep));
3147 memset(needs_dep, true, write_len);
3148
3149 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3150
3151 /* Walk backwards looking for writes to registers we're writing which
3152 * aren't read since being written. If we hit the start of the program,
3153 * we assume that there are no outstanding dependencies on entry to the
3154 * program.
3155 */
3156 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3157 /* If we hit control flow, assume that there *are* outstanding
3158 * dependencies, and force their cleanup before our instruction.
3159 */
3160 if (block->start() == scan_inst) {
3161 for (int i = 0; i < write_len; i++) {
3162 if (needs_dep[i]) {
3163 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3164 }
3165 }
3166 return;
3167 }
3168
3169 /* We insert our reads as late as possible on the assumption that any
3170 * instruction but a MOV that might have left us an outstanding
3171 * dependency has more latency than a MOV.
3172 */
3173 if (scan_inst->dst.file == GRF) {
3174 for (int i = 0; i < scan_inst->regs_written; i++) {
3175 int reg = scan_inst->dst.reg + i;
3176
3177 if (reg >= first_write_grf &&
3178 reg < first_write_grf + write_len &&
3179 needs_dep[reg - first_write_grf]) {
3180 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3181 needs_dep[reg - first_write_grf] = false;
3182 if (scan_inst->exec_size == 16)
3183 needs_dep[reg - first_write_grf + 1] = false;
3184 }
3185 }
3186 }
3187
3188 /* Clear the flag for registers that actually got read (as expected). */
3189 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3190
3191 /* Continue the loop only if we haven't resolved all the dependencies */
3192 int i;
3193 for (i = 0; i < write_len; i++) {
3194 if (needs_dep[i])
3195 break;
3196 }
3197 if (i == write_len)
3198 return;
3199 }
3200 }
3201
3202 /**
3203 * Implements this workaround for the original 965:
3204 *
3205 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3206 * used as a destination register until after it has been sourced by an
3207  *  instruction with a different destination register."
3208 */
3209 void
3210 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3211 {
3212 int write_len = inst->regs_written;
3213 int first_write_grf = inst->dst.reg;
3214 bool needs_dep[BRW_MAX_MRF];
3215 assert(write_len < (int)sizeof(needs_dep) - 1);
3216
3217 memset(needs_dep, false, sizeof(needs_dep));
3218 memset(needs_dep, true, write_len);
3219 /* Walk forwards looking for writes to registers we're writing which aren't
3220 * read before being written.
3221 */
3222 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3223 /* If we hit control flow, force resolve all remaining dependencies. */
3224 if (block->end() == scan_inst) {
3225 for (int i = 0; i < write_len; i++) {
3226 if (needs_dep[i])
3227 scan_inst->insert_before(block,
3228 DEP_RESOLVE_MOV(first_write_grf + i));
3229 }
3230 return;
3231 }
3232
3233 /* Clear the flag for registers that actually got read (as expected). */
3234 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3235
3236 /* We insert our reads as late as possible since they're reading the
3237 * result of a SEND, which has massive latency.
3238 */
3239 if (scan_inst->dst.file == GRF &&
3240 scan_inst->dst.reg >= first_write_grf &&
3241 scan_inst->dst.reg < first_write_grf + write_len &&
3242 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3243 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3244 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3245 }
3246
3247 /* Continue the loop only if we haven't resolved all the dependencies */
3248 int i;
3249 for (i = 0; i < write_len; i++) {
3250 if (needs_dep[i])
3251 break;
3252 }
3253 if (i == write_len)
3254 return;
3255 }
3256 }
3257
3258 void
3259 fs_visitor::insert_gen4_send_dependency_workarounds()
3260 {
3261 if (devinfo->gen != 4 || devinfo->is_g4x)
3262 return;
3263
3264 bool progress = false;
3265
3266 /* Note that we're done with register allocation, so GRF fs_regs always
3267 * have a .reg_offset of 0.
3268 */
3269
3270 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3271 if (inst->mlen != 0 && inst->dst.file == GRF) {
3272 insert_gen4_pre_send_dependency_workarounds(block, inst);
3273 insert_gen4_post_send_dependency_workarounds(block, inst);
3274 progress = true;
3275 }
3276 }
3277
3278 if (progress)
3279 invalidate_live_intervals();
3280 }
3281
3282 /**
3283 * Turns the generic expression-style uniform pull constant load instruction
3284 * into a hardware-specific series of instructions for loading a pull
3285 * constant.
3286 *
3287 * The expression style allows the CSE pass before this to optimize out
3288 * repeated loads from the same offset, and gives the pre-register-allocation
3289 * scheduling full flexibility, while the conversion to native instructions
3290 * allows the post-register-allocation scheduler the best information
3291 * possible.
3292 *
3293 * Note that execution masking for setting up pull constant loads is special:
3294 * the channels that need to be written are unrelated to the current execution
3295 * mask, since a later instruction will use one of the result channels as a
3296 * source operand for all 8 or 16 of its channels.
3297 */
3298 void
3299 fs_visitor::lower_uniform_pull_constant_loads()
3300 {
3301 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3302 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3303 continue;
3304
3305 if (devinfo->gen >= 7) {
3306 /* The offset arg before was a vec4-aligned byte offset. We need to
3307 * turn it into a dword offset.
3308 */
3309 fs_reg const_offset_reg = inst->src[1];
3310 assert(const_offset_reg.file == IMM &&
3311 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3312 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
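         /* e.g. a vec4-aligned byte offset of 32 becomes dword offset 8. */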
3313 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3314
3315 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3316 * Reserve space for the register.
3317 */
3318 if (devinfo->gen >= 9) {
3319 payload.reg_offset++;
3320 alloc.sizes[payload.reg] = 2;
3321 }
3322
3323 /* This is actually going to be a MOV, but since only the first dword
3324 * is accessed, we have a special opcode to do just that one. Note
3325 * that this needs to be an operation that will be considered a def
3326 * by live variable analysis, or register allocation will explode.
3327 */
3328 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3329 8, payload, const_offset_reg);
3330 setup->force_writemask_all = true;
3331
3332 setup->ir = inst->ir;
3333 setup->annotation = inst->annotation;
3334 inst->insert_before(block, setup);
3335
3336 /* Similarly, this will only populate the first 4 channels of the
3337 * result register (since we only use smear values from 0-3), but we
3338 * don't tell the optimizer.
3339 */
3340 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3341 inst->src[1] = payload;
3342
3343 invalidate_live_intervals();
3344 } else {
3345 /* Before register allocation, we didn't tell the scheduler about the
3346 * MRF we use. We know it's safe to use this MRF because nothing
3347 * else does except for register spill/unspill, which generates and
3348 * uses its MRF within a single IR instruction.
3349 */
3350 inst->base_mrf = 14;
3351 inst->mlen = 1;
3352 }
3353 }
3354 }
3355
3356 bool
3357 fs_visitor::lower_load_payload()
3358 {
3359 bool progress = false;
3360
3361 int vgrf_to_reg[alloc.count];
3362 int reg_count = 0;
3363 for (unsigned i = 0; i < alloc.count; ++i) {
3364 vgrf_to_reg[i] = reg_count;
3365 reg_count += alloc.sizes[i];
3366 }
3367
3368 struct {
3369 bool written:1; /* Whether this register has ever been written */
3370 bool force_writemask_all:1;
3371 bool force_sechalf:1;
3372 } metadata[reg_count];
3373 memset(metadata, 0, sizeof(metadata));
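   /* metadata[] tracks, per physical register of each VGRF, whether it has
    * been written and under which execution controls, so that the MOVs
    * emitted below for each LOAD_PAYLOAD source can inherit matching
    * force_sechalf / force_writemask_all flags.
    */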
3374
3375 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3376 if (inst->dst.file == GRF) {
3377 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3378 bool force_sechalf = inst->force_sechalf &&
3379 !inst->force_writemask_all;
3380 bool toggle_sechalf = inst->dst.width == 16 &&
3381 type_sz(inst->dst.type) == 4 &&
3382 !inst->force_writemask_all;
3383 for (int i = 0; i < inst->regs_written; ++i) {
3384 metadata[dst_reg + i].written = true;
3385 metadata[dst_reg + i].force_sechalf = force_sechalf;
3386 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3387 force_sechalf = (toggle_sechalf != force_sechalf);
3388 }
3389 }
3390
3391 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3392 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3393 fs_reg dst = inst->dst;
3394
3395 for (int i = 0; i < inst->sources; i++) {
3396 dst.width = inst->src[i].effective_width;
3397 dst.type = inst->src[i].type;
3398
3399 if (inst->src[i].file == BAD_FILE) {
3400             /* Emit nothing, but still advance the destination offset below as normal. */
3401 } else if (dst.file == MRF &&
3402 dst.width == 8 &&
3403 devinfo->has_compr4 &&
3404 i + 4 < inst->sources &&
3405 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3406 fs_reg compr4_dst = dst;
3407 compr4_dst.reg += BRW_MRF_COMPR4;
3408 compr4_dst.width = 16;
3409 fs_reg compr4_src = inst->src[i];
3410 compr4_src.width = 16;
3411 fs_inst *mov = MOV(compr4_dst, compr4_src);
3412 mov->force_writemask_all = true;
3413 inst->insert_before(block, mov);
3414 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3415 inst->src[i + 4].file = BAD_FILE;
3416 } else {
3417 fs_inst *mov = MOV(dst, inst->src[i]);
3418 if (inst->src[i].file == GRF) {
3419 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3420 inst->src[i].reg_offset;
3421 mov->force_sechalf = metadata[src_reg].force_sechalf;
3422 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3423 } else {
3424 /* We don't have any useful metadata for immediates or
3425 * uniforms. Assume that any of the channels of the
3426 * destination may be used.
3427 */
3428 assert(inst->src[i].file == IMM ||
3429 inst->src[i].file == UNIFORM);
3430 mov->force_writemask_all = true;
3431 }
3432
3433 if (dst.file == GRF) {
3434 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3435 const bool force_writemask = mov->force_writemask_all;
3436 metadata[dst_reg].force_writemask_all = force_writemask;
3437 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3438 if (dst.width * type_sz(dst.type) > 32) {
3439 assert(!mov->force_sechalf);
3440 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3441 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3442 }
3443 }
3444
3445 inst->insert_before(block, mov);
3446 }
3447
3448 dst = offset(dst, 1);
3449 }
3450
3451 inst->remove(block);
3452 progress = true;
3453 }
3454 }
3455
3456 if (progress)
3457 invalidate_live_intervals();
3458
3459 return progress;
3460 }
3461
3462 void
3463 fs_visitor::dump_instructions()
3464 {
3465 dump_instructions(NULL);
3466 }
3467
3468 void
3469 fs_visitor::dump_instructions(const char *name)
3470 {
3471 FILE *file = stderr;
3472 if (name && geteuid() != 0) {
3473 file = fopen(name, "w");
3474 if (!file)
3475 file = stderr;
3476 }
3477
3478 if (cfg) {
3479 calculate_register_pressure();
3480 int ip = 0, max_pressure = 0;
3481 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3482 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3483 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3484 dump_instruction(inst, file);
3485 ip++;
3486 }
3487 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3488 } else {
3489 int ip = 0;
3490 foreach_in_list(backend_instruction, inst, &instructions) {
3491 fprintf(file, "%4d: ", ip++);
3492 dump_instruction(inst, file);
3493 }
3494 }
3495
3496 if (file != stderr) {
3497 fclose(file);
3498 }
3499 }
3500
3501 void
3502 fs_visitor::dump_instruction(backend_instruction *be_inst)
3503 {
3504 dump_instruction(be_inst, stderr);
3505 }
3506
3507 void
3508 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3509 {
3510 fs_inst *inst = (fs_inst *)be_inst;
3511
3512 if (inst->predicate) {
3513 fprintf(file, "(%cf0.%d) ",
3514 inst->predicate_inverse ? '-' : '+',
3515 inst->flag_subreg);
3516 }
3517
3518 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3519 if (inst->saturate)
3520 fprintf(file, ".sat");
3521 if (inst->conditional_mod) {
3522 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3523 if (!inst->predicate &&
3524 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3525 inst->opcode != BRW_OPCODE_IF &&
3526 inst->opcode != BRW_OPCODE_WHILE))) {
3527 fprintf(file, ".f0.%d", inst->flag_subreg);
3528 }
3529 }
3530 fprintf(file, "(%d) ", inst->exec_size);
3531
3532
3533 switch (inst->dst.file) {
3534 case GRF:
3535 fprintf(file, "vgrf%d", inst->dst.reg);
3536 if (inst->dst.width != dispatch_width)
3537 fprintf(file, "@%d", inst->dst.width);
3538 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3539 inst->dst.subreg_offset)
3540 fprintf(file, "+%d.%d",
3541 inst->dst.reg_offset, inst->dst.subreg_offset);
3542 break;
3543 case MRF:
3544 fprintf(file, "m%d", inst->dst.reg);
3545 break;
3546 case BAD_FILE:
3547 fprintf(file, "(null)");
3548 break;
3549 case UNIFORM:
3550 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3551 break;
3552 case ATTR:
3553 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3554 break;
3555 case HW_REG:
3556 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3557 switch (inst->dst.fixed_hw_reg.nr) {
3558 case BRW_ARF_NULL:
3559 fprintf(file, "null");
3560 break;
3561 case BRW_ARF_ADDRESS:
3562 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3563 break;
3564 case BRW_ARF_ACCUMULATOR:
3565 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3566 break;
3567 case BRW_ARF_FLAG:
3568 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3569 inst->dst.fixed_hw_reg.subnr);
3570 break;
3571 default:
3572 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3573 inst->dst.fixed_hw_reg.subnr);
3574 break;
3575 }
3576 } else {
3577 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3578 }
3579 if (inst->dst.fixed_hw_reg.subnr)
3580 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3581 break;
3582 default:
3583 fprintf(file, "???");
3584 break;
3585 }
3586 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3587
3588 for (int i = 0; i < inst->sources; i++) {
3589 if (inst->src[i].negate)
3590 fprintf(file, "-");
3591 if (inst->src[i].abs)
3592 fprintf(file, "|");
3593 switch (inst->src[i].file) {
3594 case GRF:
3595 fprintf(file, "vgrf%d", inst->src[i].reg);
3596 if (inst->src[i].width != dispatch_width)
3597 fprintf(file, "@%d", inst->src[i].width);
3598 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3599 inst->src[i].subreg_offset)
3600 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3601 inst->src[i].subreg_offset);
3602 break;
3603 case MRF:
3604 fprintf(file, "***m%d***", inst->src[i].reg);
3605 break;
3606 case ATTR:
3607 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3608 break;
3609 case UNIFORM:
3610 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3611 if (inst->src[i].reladdr) {
3612 fprintf(file, "+reladdr");
3613 } else if (inst->src[i].subreg_offset) {
3614 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3615 inst->src[i].subreg_offset);
3616 }
3617 break;
3618 case BAD_FILE:
3619 fprintf(file, "(null)");
3620 break;
3621 case IMM:
3622 switch (inst->src[i].type) {
3623 case BRW_REGISTER_TYPE_F:
3624 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3625 break;
3626 case BRW_REGISTER_TYPE_W:
3627 case BRW_REGISTER_TYPE_D:
3628 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3629 break;
3630 case BRW_REGISTER_TYPE_UW:
3631 case BRW_REGISTER_TYPE_UD:
3632 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3633 break;
3634 case BRW_REGISTER_TYPE_VF:
3635 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3636 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3637 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3638 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3639 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3640 break;
3641 default:
3642 fprintf(file, "???");
3643 break;
3644 }
3645 break;
3646 case HW_REG:
3647 if (inst->src[i].fixed_hw_reg.negate)
3648 fprintf(file, "-");
3649 if (inst->src[i].fixed_hw_reg.abs)
3650 fprintf(file, "|");
3651 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3652 switch (inst->src[i].fixed_hw_reg.nr) {
3653 case BRW_ARF_NULL:
3654 fprintf(file, "null");
3655 break;
3656 case BRW_ARF_ADDRESS:
3657 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3658 break;
3659 case BRW_ARF_ACCUMULATOR:
3660 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3661 break;
3662 case BRW_ARF_FLAG:
3663 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3664 inst->src[i].fixed_hw_reg.subnr);
3665 break;
3666 default:
3667 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3668 inst->src[i].fixed_hw_reg.subnr);
3669 break;
3670 }
3671 } else {
3672 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3673 }
3674 if (inst->src[i].fixed_hw_reg.subnr)
3675 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3676 if (inst->src[i].fixed_hw_reg.abs)
3677 fprintf(file, "|");
3678 break;
3679 default:
3680 fprintf(file, "???");
3681 break;
3682 }
3683 if (inst->src[i].abs)
3684 fprintf(file, "|");
3685
3686 if (inst->src[i].file != IMM) {
3687 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3688 }
3689
3690 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3691 fprintf(file, ", ");
3692 }
3693
3694 fprintf(file, " ");
3695
3696 if (dispatch_width == 16 && inst->exec_size == 8) {
3697 if (inst->force_sechalf)
3698 fprintf(file, "2ndhalf ");
3699 else
3700 fprintf(file, "1sthalf ");
3701 }
3702
3703 fprintf(file, "\n");
3704 }
3705
3706 /**
3707 * Possibly returns an instruction that set up @param reg.
3708 *
3709 * Sometimes we want to take the result of some expression/variable
3710 * dereference tree and rewrite the instruction generating the result
3711 * of the tree. When processing the tree, we know that the
3712 * instructions generated are all writing temporaries that are dead
3713 * outside of this tree. So, if we have some instructions that write
3714 * a temporary, we're free to point that temp write somewhere else.
3715 *
3716 * Note that this doesn't guarantee that the returned instruction wrote
3717 * only reg -- it might be the size=4 destination of a texture instruction.
3718 */
3719 fs_inst *
3720 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3721 fs_inst *end,
3722 const fs_reg &reg)
3723 {
3724 if (end == start ||
3725 end->is_partial_write() ||
3726 reg.reladdr ||
3727 !reg.equals(end->dst)) {
3728 return NULL;
3729 } else {
3730 return end;
3731 }
3732 }
3733
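/**
 * Lays out the gen6+ fragment shader thread payload, advancing
 * payload.num_regs past the fixed header, the enabled barycentric
 * coordinate sets, source depth/W, the MSAA position offsets and the
 * input coverage mask.
 */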
3734 void
3735 fs_visitor::setup_payload_gen6()
3736 {
3737 bool uses_depth =
3738 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3739 unsigned barycentric_interp_modes =
3740 (stage == MESA_SHADER_FRAGMENT) ?
3741 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3742
3743 assert(devinfo->gen >= 6);
3744
3745 /* R0-1: masks, pixel X/Y coordinates. */
3746 payload.num_regs = 2;
3747 /* R2: only for 32-pixel dispatch. */
3748
3749 /* R3-26: barycentric interpolation coordinates. These appear in the
3750 * same order that they appear in the brw_wm_barycentric_interp_mode
3751 * enum. Each set of coordinates occupies 2 registers if dispatch width
3752 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3753 * appear if they were enabled using the "Barycentric Interpolation
3754 * Mode" bits in WM_STATE.
3755 */
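/* Illustrative example only: in a SIMD16 dispatch with a single
 * barycentric mode enabled, payload.barycentric_coord_reg[] for that
 * mode is set to 2 and payload.num_regs advances from 2 to 6, i.e.
 * four registers of coordinates for the one mode.
 */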
3756 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3757 if (barycentric_interp_modes & (1 << i)) {
3758 payload.barycentric_coord_reg[i] = payload.num_regs;
3759 payload.num_regs += 2;
3760 if (dispatch_width == 16) {
3761 payload.num_regs += 2;
3762 }
3763 }
3764 }
3765
3766 /* R27: interpolated depth if uses source depth */
3767 if (uses_depth) {
3768 payload.source_depth_reg = payload.num_regs;
3769 payload.num_regs++;
3770 if (dispatch_width == 16) {
3771 /* R28: interpolated depth if not SIMD8. */
3772 payload.num_regs++;
3773 }
3774 }
3775 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3776 if (uses_depth) {
3777 payload.source_w_reg = payload.num_regs;
3778 payload.num_regs++;
3779 if (dispatch_width == 16) {
3780 /* R30: interpolated W if not SIMD8. */
3781 payload.num_regs++;
3782 }
3783 }
3784
3785 if (stage == MESA_SHADER_FRAGMENT) {
3786 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3787 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3788 prog_data->uses_pos_offset = key->compute_pos_offset;
3789 /* R31: MSAA position offsets. */
3790 if (prog_data->uses_pos_offset) {
3791 payload.sample_pos_reg = payload.num_regs;
3792 payload.num_regs++;
3793 }
3794 }
3795
3796 /* R32: MSAA input coverage mask */
3797 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3798 assert(devinfo->gen >= 7);
3799 payload.sample_mask_in_reg = payload.num_regs;
3800 payload.num_regs++;
3801 if (dispatch_width == 16) {
3802 /* R33: input coverage mask if not SIMD8. */
3803 payload.num_regs++;
3804 }
3805 }
3806
3807 /* R34-: bary for 32-pixel. */
3808 /* R58-59: interp W for 32-pixel. */
3809
3810 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3811 source_depth_to_render_target = true;
3812 }
3813 }
3814
3815 void
3816 fs_visitor::setup_vs_payload()
3817 {
3818 /* R0: thread header, R1: urb handles */
3819 payload.num_regs = 2;
3820 }
3821
3822 void
3823 fs_visitor::setup_cs_payload()
3824 {
3825 assert(brw->gen >= 7);
3826
3827 payload.num_regs = 1;
3828 }
3829
3830 void
3831 fs_visitor::assign_binding_table_offsets()
3832 {
3833 assert(stage == MESA_SHADER_FRAGMENT);
3834 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3835 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3836 uint32_t next_binding_table_offset = 0;
3837
3838 /* If there are no color regions, we still perform an FB write to a null
3839 * renderbuffer, which we place at surface index 0.
3840 */
3841 prog_data->binding_table.render_target_start = next_binding_table_offset;
3842 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3843
3844 assign_common_binding_table_offsets(next_binding_table_offset);
3845 }
3846
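/**
 * Estimates register pressure: recomputes live intervals, then for every
 * instruction IP sums the sizes of all virtual GRFs whose live range
 * covers that IP into regs_live_at_ip[].
 */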
3847 void
3848 fs_visitor::calculate_register_pressure()
3849 {
3850 invalidate_live_intervals();
3851 calculate_live_intervals();
3852
3853 unsigned num_instructions = 0;
3854 foreach_block(block, cfg)
3855 num_instructions += block->instructions.length();
3856
3857 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3858
3859 for (unsigned reg = 0; reg < alloc.count; reg++) {
3860 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3861 regs_live_at_ip[ip] += alloc.sizes[reg];
3862 }
3863 }
3864
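/**
 * Runs the LIR optimization loop: uniform and pull-constant setup first,
 * then the OPT() passes below repeat until a whole iteration makes no
 * progress, followed by the one-shot lowering passes at the end.
 */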
3865 void
3866 fs_visitor::optimize()
3867 {
3868 split_virtual_grfs();
3869
3870 move_uniform_array_access_to_pull_constants();
3871 assign_constant_locations();
3872 demote_pull_constants();
3873
3874 #define OPT(pass, args...) ({ \
3875 pass_num++; \
3876 bool this_progress = pass(args); \
3877 \
3878 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3879 char filename[64]; \
3880 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3881 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3882 \
3883 backend_visitor::dump_instructions(filename); \
3884 } \
3885 \
3886 progress = progress || this_progress; \
3887 this_progress; \
3888 })
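/* For example (hypothetical program name and dump file): with
 * DEBUG_OPTIMIZER set in INTEL_DEBUG, OPT(opt_cse) in a SIMD8 fragment
 * shader compile of GL program 3 would, on progress during the first
 * iteration, dump the instructions to "FS8-0003-01-03-opt_cse" before
 * folding its result into `progress`.
 */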
3889
3890 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3891 char filename[64];
3892 snprintf(filename, 64, "%s%d-%04d-00-start",
3893 stage_abbrev, dispatch_width,
3894 shader_prog ? shader_prog->Name : 0);
3895
3896 backend_visitor::dump_instructions(filename);
3897 }
3898
3899 bool progress;
3900 int iteration = 0;
3901 int pass_num = 0;
3902 do {
3903 progress = false;
3904 pass_num = 0;
3905 iteration++;
3906
3907 OPT(remove_duplicate_mrf_writes);
3908
3909 OPT(opt_algebraic);
3910 OPT(opt_cse);
3911 OPT(opt_copy_propagate);
3912 OPT(opt_peephole_predicated_break);
3913 OPT(opt_cmod_propagation);
3914 OPT(dead_code_eliminate);
3915 OPT(opt_peephole_sel);
3916 OPT(dead_control_flow_eliminate, this);
3917 OPT(opt_register_renaming);
3918 OPT(opt_redundant_discard_jumps);
3919 OPT(opt_saturate_propagation);
3920 OPT(opt_zero_samples);
3921 OPT(register_coalesce);
3922 OPT(compute_to_mrf);
3923
3924 OPT(compact_virtual_grfs);
3925 } while (progress);
3926
3927 pass_num = 0;
3928
3929 OPT(opt_sampler_eot);
3930
3931 if (OPT(lower_load_payload)) {
3932 split_virtual_grfs();
3933 OPT(register_coalesce);
3934 OPT(compute_to_mrf);
3935 OPT(dead_code_eliminate);
3936 }
3937
3938 OPT(opt_combine_constants);
3939
3940 lower_uniform_pull_constant_loads();
3941 }
3942
3943 /**
3944 * Three-source instructions must have a GRF/MRF destination register.
3945 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3946 */
3947 void
3948 fs_visitor::fixup_3src_null_dest()
3949 {
3950 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3951 if (inst->is_3src() && inst->dst.is_null()) {
3952 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3953 inst->dst.type);
3954 }
3955 }
3956 }
3957
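/**
 * Schedules and register-allocates the program: each pre-RA scheduling
 * mode is tried until one allocates without spilling; otherwise SIMD16
 * compiles are failed while SIMD8 compiles spill until allocation
 * succeeds.  The gen4 send dependency workaround and, after spilling, a
 * post-RA scheduling pass run afterwards.
 */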
3958 void
3959 fs_visitor::allocate_registers()
3960 {
3961 bool allocated_without_spills;
3962
3963 static const enum instruction_scheduler_mode pre_modes[] = {
3964 SCHEDULE_PRE,
3965 SCHEDULE_PRE_NON_LIFO,
3966 SCHEDULE_PRE_LIFO,
3967 };
3968
3969 /* Try each scheduling heuristic to see if it can successfully register
3970 * allocate without spilling. They should be ordered by decreasing
3971 * performance but increasing likelihood of allocating.
3972 */
3973 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3974 schedule_instructions(pre_modes[i]);
3975
3976 if (0) {
3977 assign_regs_trivial();
3978 allocated_without_spills = true;
3979 } else {
3980 allocated_without_spills = assign_regs(false);
3981 }
3982 if (allocated_without_spills)
3983 break;
3984 }
3985
3986 if (!allocated_without_spills) {
3987 /* We assume that any spilling is worse than just dropping back to
3988 * SIMD8. There's probably actually some intermediate point where
3989 * SIMD16 with a couple of spills is still better.
3990 */
3991 if (dispatch_width == 16) {
3992 fail("Failure to register allocate. Reduce number of "
3993 "live scalar values to avoid this.");
3994 } else {
3995 perf_debug("%s shader triggered register spilling. "
3996 "Try reducing the number of live scalar values to "
3997 "improve performance.\n", stage_name);
3998 }
3999
4000 /* Since we're out of heuristics, just go spill registers until we
4001 * get an allocation.
4002 */
4003 while (!assign_regs(true)) {
4004 if (failed)
4005 break;
4006 }
4007 }
4008
4009 /* This must come after all optimization and register allocation, since
4010 * it inserts dead code that happens to have side effects, and it does
4011 * so based on the actual physical registers in use.
4012 */
4013 insert_gen4_send_dependency_workarounds();
4014
4015 if (failed)
4016 return;
4017
4018 if (!allocated_without_spills)
4019 schedule_instructions(SCHEDULE_POST);
4020
4021 if (last_scratch > 0)
4022 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4023 }
4024
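/**
 * Generates, optimizes and register-allocates the vertex shader: emits IR
 * from NIR or GLSL IR, emits the URB writes, then runs the shared
 * optimize()/allocate_registers() pipeline.
 */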
4025 bool
4026 fs_visitor::run_vs()
4027 {
4028 assert(stage == MESA_SHADER_VERTEX);
4029
4030 assign_common_binding_table_offsets(0);
4031 setup_vs_payload();
4032
4033 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4034 emit_shader_time_begin();
4035
4036 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
4037 emit_nir_code();
4038 } else {
4039 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4040 base_ir = ir;
4041 this->result = reg_undef;
4042 ir->accept(this);
4043 }
4044 base_ir = NULL;
4045 }
4046
4047 if (failed)
4048 return false;
4049
4050 emit_urb_writes();
4051
4052 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4053 emit_shader_time_end();
4054
4055 calculate_cfg();
4056
4057 optimize();
4058
4059 assign_curb_setup();
4060 assign_vs_urb_setup();
4061
4062 fixup_3src_null_dest();
4063 allocate_registers();
4064
4065 return !failed;
4066 }
4067
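/**
 * Generates, optimizes and register-allocates the fragment shader at the
 * current dispatch width: payload and interpolation setup, discard and
 * alpha-test handling where enabled, and the final framebuffer writes.
 */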
4068 bool
4069 fs_visitor::run_fs()
4070 {
4071 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4072 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4073
4074 assert(stage == MESA_SHADER_FRAGMENT);
4075
4076 sanity_param_count = prog->Parameters->NumParameters;
4077
4078 assign_binding_table_offsets();
4079
4080 if (devinfo->gen >= 6)
4081 setup_payload_gen6();
4082 else
4083 setup_payload_gen4();
4084
4085 if (0) {
4086 emit_dummy_fs();
4087 } else if (brw->use_rep_send && dispatch_width == 16) {
4088 emit_repclear_shader();
4089 } else {
4090 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4091 emit_shader_time_begin();
4092
4093 calculate_urb_setup();
4094 if (prog->InputsRead > 0) {
4095 if (devinfo->gen < 6)
4096 emit_interpolation_setup_gen4();
4097 else
4098 emit_interpolation_setup_gen6();
4099 }
4100
4101 /* We handle discards by keeping track of the still-live pixels in f0.1.
4102 * Initialize it with the dispatched pixels.
4103 */
4104 if (wm_prog_data->uses_kill) {
4105 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4106 discard_init->flag_subreg = 1;
4107 }
4108
4109 /* Generate FS IR for main(). (the visitor only descends into
4110 * functions called "main").
4111 */
4112 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
4113 emit_nir_code();
4114 } else if (shader) {
4115 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4116 base_ir = ir;
4117 this->result = reg_undef;
4118 ir->accept(this);
4119 }
4120 } else {
4121 emit_fragment_program_code();
4122 }
4123 base_ir = NULL;
4124 if (failed)
4125 return false;
4126
4127 if (wm_prog_data->uses_kill)
4128 emit(FS_OPCODE_PLACEHOLDER_HALT);
4129
4130 if (wm_key->alpha_test_func)
4131 emit_alpha_test();
4132
4133 emit_fb_writes();
4134
4135 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4136 emit_shader_time_end();
4137
4138 calculate_cfg();
4139
4140 optimize();
4141
4142 assign_curb_setup();
4143 assign_urb_setup();
4144
4145 fixup_3src_null_dest();
4146 allocate_registers();
4147
4148 if (failed)
4149 return false;
4150 }
4151
4152 if (dispatch_width == 8)
4153 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4154 else
4155 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4156
4157 /* If any state parameters were appended, then ParameterValues could have
4158 * been realloced, in which case the driver uniform storage set up by
4159 * _mesa_associate_uniform_storage() would point to freed memory. Make
4160 * sure that didn't happen.
4161 */
4162 assert(sanity_param_count == prog->Parameters->NumParameters);
4163
4164 return !failed;
4165 }
4166
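/**
 * Generates, optimizes and register-allocates the compute shader from
 * NIR, ending with the CS terminate message.
 */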
4167 bool
4168 fs_visitor::run_cs()
4169 {
4170 assert(stage == MESA_SHADER_COMPUTE);
4171 assert(shader);
4172
4173 sanity_param_count = prog->Parameters->NumParameters;
4174
4175 assign_common_binding_table_offsets(0);
4176
4177 setup_cs_payload();
4178
4179 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4180 emit_shader_time_begin();
4181
4182 emit_nir_code();
4183
4184 if (failed)
4185 return false;
4186
4187 emit_cs_terminate();
4188
4189 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4190 emit_shader_time_end();
4191
4192 calculate_cfg();
4193
4194 optimize();
4195
4196 assign_curb_setup();
4197
4198 fixup_3src_null_dest();
4199 allocate_registers();
4200
4201 if (failed)
4202 return false;
4203
4204 /* If any state parameters were appended, then ParameterValues could have
4205 * been realloced, in which case the driver uniform storage set up by
4206 * _mesa_associate_uniform_storage() would point to freed memory. Make
4207 * sure that didn't happen.
4208 */
4209 assert(sanity_param_count == prog->Parameters->NumParameters);
4210
4211 return !failed;
4212 }
4213
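/**
 * Compiles a fragment program to native code: runs the SIMD8 visitor,
 * optionally attempts a SIMD16 compile, then hands the resulting CFGs to
 * the generator and returns the assembly.
 */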
4214 const unsigned *
4215 brw_wm_fs_emit(struct brw_context *brw,
4216 void *mem_ctx,
4217 const struct brw_wm_prog_key *key,
4218 struct brw_wm_prog_data *prog_data,
4219 struct gl_fragment_program *fp,
4220 struct gl_shader_program *prog,
4221 unsigned *final_assembly_size)
4222 {
4223 bool start_busy = false;
4224 double start_time = 0;
4225
4226 if (unlikely(brw->perf_debug)) {
4227 start_busy = (brw->batch.last_bo &&
4228 drm_intel_bo_busy(brw->batch.last_bo));
4229 start_time = get_time();
4230 }
4231
4232 struct brw_shader *shader = NULL;
4233 if (prog)
4234 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4235
4236 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4237 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4238
4239 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4240 */
4241 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4242 if (!v.run_fs()) {
4243 if (prog) {
4244 prog->LinkStatus = false;
4245 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4246 }
4247
4248 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4249 v.fail_msg);
4250
4251 return NULL;
4252 }
4253
4254 cfg_t *simd16_cfg = NULL;
4255 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4256 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4257 if (!v.simd16_unsupported) {
4258 /* Try a SIMD16 compile */
4259 v2.import_uniforms(&v);
4260 if (!v2.run_fs()) {
4261 perf_debug("SIMD16 shader failed to compile, falling back to "
4262 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4263 } else {
4264 simd16_cfg = v2.cfg;
4265 }
4266 } else {
4267 perf_debug("SIMD16 shader unsupported, falling back to "
4268 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4269 }
4270 }
4271
4272 cfg_t *simd8_cfg;
4273 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4274 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4275 simd8_cfg = NULL;
4276 prog_data->no_8 = true;
4277 } else {
4278 simd8_cfg = v.cfg;
4279 prog_data->no_8 = false;
4280 }
4281
4282 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4283 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4284
4285 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4286 char *name;
4287 if (prog)
4288 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4289 prog->Label ? prog->Label : "unnamed",
4290 prog->Name);
4291 else
4292 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4293
4294 g.enable_debug(name);
4295 }
4296
4297 if (simd8_cfg)
4298 g.generate_code(simd8_cfg, 8);
4299 if (simd16_cfg)
4300 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4301
4302 if (unlikely(brw->perf_debug) && shader) {
4303 if (shader->compiled_once)
4304 brw_wm_debug_recompile(brw, prog, key);
4305 shader->compiled_once = true;
4306
4307 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4308 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4309 (get_time() - start_time) * 1000);
4310 }
4311 }
4312
4313 return g.get_assembly(final_assembly_size);
4314 }
4315
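/**
 * Precompiles a fragment program at link time using a guessed
 * brw_wm_prog_key built from state-independent program information,
 * restoring the previously bound WM program data afterwards.
 */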
4316 extern "C" bool
4317 brw_fs_precompile(struct gl_context *ctx,
4318 struct gl_shader_program *shader_prog,
4319 struct gl_program *prog)
4320 {
4321 struct brw_context *brw = brw_context(ctx);
4322 struct brw_wm_prog_key key;
4323
4324 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4325 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4326 bool program_uses_dfdy = fp->UsesDFdy;
4327
4328 memset(&key, 0, sizeof(key));
4329
4330 if (brw->gen < 6) {
4331 if (fp->UsesKill)
4332 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4333
4334 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4335 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4336
4337 /* Just assume depth testing. */
4338 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4339 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4340 }
4341
4342 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4343 BRW_FS_VARYING_INPUT_MASK) > 16)
4344 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4345
4346 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4347
4348 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4349 key.drawable_height = ctx->DrawBuffer->Height;
4350 }
4351
4352 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4353 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4354 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4355
4356 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4357 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4358 key.nr_color_regions > 1;
4359 }
4360
4361 key.program_string_id = bfp->id;
4362
4363 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4364 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4365
4366 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4367
4368 brw->wm.base.prog_offset = old_prog_offset;
4369 brw->wm.prog_data = old_prog_data;
4370
4371 return success;
4372 }
4373
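/**
 * Fills in default texture swizzles for a precompile key: shadow samplers
 * get an (X, X, X, 1) swizzle on hardware without shader channel select,
 * everything else is assumed to be unswizzled RGBA.
 */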
4374 void
4375 brw_setup_tex_for_precompile(struct brw_context *brw,
4376 struct brw_sampler_prog_key_data *tex,
4377 struct gl_program *prog)
4378 {
4379 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4380 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4381 for (unsigned i = 0; i < sampler_count; i++) {
4382 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4383 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4384 tex->swizzles[i] =
4385 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4386 } else {
4387 /* Color sampler: assume no swizzling. */
4388 tex->swizzles[i] = SWIZZLE_XYZW;
4389 }
4390 }
4391 }