i965/fs: Make LOAD_PAYLOAD take a header size
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
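   /* For a register destination this works out to the number of whole
    * 32-byte GRFs the write covers: e.g. an 8-wide float destination with
    * stride 1 spans 8 * 4 = 32 bytes (one GRF), while a 16-wide float
    * destination spans two.
    */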
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
138
139 this->writes_accumulator = false;
140 }
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(devinfo->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
356
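/* Construct a LOAD_PAYLOAD instruction.  The first header_size sources make
 * up the message header; any GRF header source must occupy exactly one full
 * register (width times type size equal to 32 bytes).  The execution size is
 * the widest of the destination and source widths, and regs_written is the
 * sum of the whole registers contributed by each source.
 */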
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
359 int header_size)
360 {
361 for (int i = 0; i < header_size; i++)
362 assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
363
364 uint8_t exec_size = dst.width;
365 for (int i = 0; i < sources; ++i) {
366 assert(src[i].width % dst.width == 0);
367 if (src[i].width > exec_size)
368 exec_size = src[i].width;
369 }
370
371 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
372 dst, src, sources);
373 inst->regs_written = 0;
374 for (int i = 0; i < sources; ++i) {
375 /* The LOAD_PAYLOAD instruction only really makes sense if we are
376 * dealing with whole registers. If this ever changes, we can deal
377 * with it later.
378 */
379 int size = inst->src[i].effective_width * type_sz(src[i].type);
380 assert(size % 32 == 0);
381 inst->regs_written += (size + 31) / 32;
382 }
383
384 return inst;
385 }
386
387 exec_list
388 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
389 const fs_reg &surf_index,
390 const fs_reg &varying_offset,
391 uint32_t const_offset)
392 {
393 exec_list instructions;
394 fs_inst *inst;
395
396 /* We have our constant surface use a pitch of 4 bytes, so our index can
397 * be any component of a vector, and then we load 4 contiguous
398 * components starting from that.
399 *
400      * We break down the const_offset into a portion added to the variable
401 * offset and a portion done using reg_offset, which means that if you
402 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
403 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
404 * CSE can later notice that those loads are all the same and eliminate
405 * the redundant ones.
406 */
407 fs_reg vec4_offset = vgrf(glsl_type::int_type);
408 instructions.push_tail(ADD(vec4_offset,
409 varying_offset, fs_reg(const_offset & ~3)));
410
411 int scale = 1;
412 if (devinfo->gen == 4 && dst.width == 8) {
413 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
414 * u, v, r) as parameters, or we can just use the SIMD16 message
415 * consisting of (header, u). We choose the second, at the cost of a
416 * longer return length.
417 */
418 scale = 2;
419 }
420
421 enum opcode op;
422 if (devinfo->gen >= 7)
423 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
424 else
425 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
426
427 assert(dst.width % 8 == 0);
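   /* The pull load returns a full vec4 per channel, so it writes four GRFs
    * per eight channels: e.g. a SIMD8 load on gen7 writes 4 registers, and
    * the gen4 SIMD16-message path (scale == 2) writes 8.
    */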
428 int regs_written = 4 * (dst.width / 8) * scale;
429 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
430 dst.type, dst.width);
431 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
432 inst->regs_written = regs_written;
433 instructions.push_tail(inst);
434
435 if (devinfo->gen < 7) {
436 inst->base_mrf = 13;
437 inst->header_size = 1;
438 if (devinfo->gen == 4)
439 inst->mlen = 3;
440 else
441 inst->mlen = 1 + dispatch_width / 8;
442 }
443
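   /* The low two bits of const_offset select the component within the loaded
    * vec4; with the gen4 SIMD16 message each component occupies two
    * registers, hence the scale factor.
    */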
444 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
445 instructions.push_tail(MOV(dst, result));
446
447 return instructions;
448 }
449
450 /**
451 * A helper for MOV generation for fixing up broken hardware SEND dependency
452 * handling.
453 */
454 fs_inst *
455 fs_visitor::DEP_RESOLVE_MOV(int grf)
456 {
457 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
458
459 inst->ir = NULL;
460 inst->annotation = "send dependency resolve";
461
462    /* The caller always wants this uncompressed (SIMD8), to emit the minimal
463     * extra dependencies and to avoid having to align its registers to 2.
464 */
465 inst->exec_size = 8;
466
467 return inst;
468 }
469
470 bool
471 fs_inst::equals(fs_inst *inst) const
472 {
473 return (opcode == inst->opcode &&
474 dst.equals(inst->dst) &&
475 src[0].equals(inst->src[0]) &&
476 src[1].equals(inst->src[1]) &&
477 src[2].equals(inst->src[2]) &&
478 saturate == inst->saturate &&
479 predicate == inst->predicate &&
480 conditional_mod == inst->conditional_mod &&
481 mlen == inst->mlen &&
482 base_mrf == inst->base_mrf &&
483 target == inst->target &&
484 eot == inst->eot &&
485 header_size == inst->header_size &&
486 shadow_compare == inst->shadow_compare &&
487 exec_size == inst->exec_size &&
488 offset == inst->offset);
489 }
490
491 bool
492 fs_inst::overwrites_reg(const fs_reg &reg) const
493 {
494 return reg.in_range(dst, regs_written);
495 }
496
497 bool
498 fs_inst::is_send_from_grf() const
499 {
500 switch (opcode) {
501 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
502 case SHADER_OPCODE_SHADER_TIME_ADD:
503 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
504 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
505 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
506 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
507 case SHADER_OPCODE_UNTYPED_ATOMIC:
508 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
509 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
510 case SHADER_OPCODE_TYPED_ATOMIC:
511 case SHADER_OPCODE_TYPED_SURFACE_READ:
512 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
513 case SHADER_OPCODE_URB_WRITE_SIMD8:
514 return true;
515 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
516 return src[1].file == GRF;
517 case FS_OPCODE_FB_WRITE:
518 return src[0].file == GRF;
519 default:
520 if (is_tex())
521 return src[0].file == GRF;
522
523 return false;
524 }
525 }
526
527 bool
528 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
529 {
530 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
531 return false;
532
533 fs_reg reg = this->src[0];
534 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
535 return false;
536
537 if (grf_alloc.sizes[reg.reg] != this->regs_written)
538 return false;
539
540 for (int i = 1; i < this->sources; i++)
541 if (!this->src[i].equals(::offset(reg, i)))
542 return false;
543
544 return true;
545 }
546
547 bool
548 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
549 {
550 if (devinfo->gen == 6 && is_math())
551 return false;
552
553 if (is_send_from_grf())
554 return false;
555
556 if (!backend_instruction::can_do_source_mods())
557 return false;
558
559 return true;
560 }
561
562 bool
563 fs_inst::has_side_effects() const
564 {
565 return this->eot || backend_instruction::has_side_effects();
566 }
567
568 void
569 fs_reg::init()
570 {
571 memset(this, 0, sizeof(*this));
572 stride = 1;
573 }
574
575 /** Generic unset register constructor. */
576 fs_reg::fs_reg()
577 {
578 init();
579 this->file = BAD_FILE;
580 }
581
582 /** Immediate value constructor. */
583 fs_reg::fs_reg(float f)
584 {
585 init();
586 this->file = IMM;
587 this->type = BRW_REGISTER_TYPE_F;
588 this->fixed_hw_reg.dw1.f = f;
589 this->width = 1;
590 }
591
592 /** Immediate value constructor. */
593 fs_reg::fs_reg(int32_t i)
594 {
595 init();
596 this->file = IMM;
597 this->type = BRW_REGISTER_TYPE_D;
598 this->fixed_hw_reg.dw1.d = i;
599 this->width = 1;
600 }
601
602 /** Immediate value constructor. */
603 fs_reg::fs_reg(uint32_t u)
604 {
605 init();
606 this->file = IMM;
607 this->type = BRW_REGISTER_TYPE_UD;
608 this->fixed_hw_reg.dw1.ud = u;
609 this->width = 1;
610 }
611
612 /** Vector float immediate value constructor. */
613 fs_reg::fs_reg(uint8_t vf[4])
614 {
615 init();
616 this->file = IMM;
617 this->type = BRW_REGISTER_TYPE_VF;
618 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
619 }
620
621 /** Vector float immediate value constructor. */
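/* Each argument is an 8-bit restricted-float (VF) encoding; the four bytes
 * are packed little-endian into a single dword, with vf0 in the low byte.
 */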
622 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
623 {
624 init();
625 this->file = IMM;
626 this->type = BRW_REGISTER_TYPE_VF;
627 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
628 (vf1 << 8) |
629 (vf2 << 16) |
630 (vf3 << 24);
631 }
632
633 /** Fixed brw_reg. */
634 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
635 {
636 init();
637 this->file = HW_REG;
638 this->fixed_hw_reg = fixed_hw_reg;
639 this->type = fixed_hw_reg.type;
640 this->width = 1 << fixed_hw_reg.width;
641 }
642
643 bool
644 fs_reg::equals(const fs_reg &r) const
645 {
646 return (file == r.file &&
647 reg == r.reg &&
648 reg_offset == r.reg_offset &&
649 subreg_offset == r.subreg_offset &&
650 type == r.type &&
651 negate == r.negate &&
652 abs == r.abs &&
653 !reladdr && !r.reladdr &&
654 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
655 width == r.width &&
656 stride == r.stride);
657 }
658
659 fs_reg &
660 fs_reg::set_smear(unsigned subreg)
661 {
662 assert(file != HW_REG && file != IMM);
663 subreg_offset = subreg * type_sz(type);
664 stride = 0;
665 return *this;
666 }
667
668 bool
669 fs_reg::is_contiguous() const
670 {
671 return stride == 1;
672 }
673
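/* Returns the storage size of a variable in scalar components: e.g. a vec4
 * counts as 4 and a mat3 as 9; arrays and structs sum their elements, and
 * samplers and atomic counters take no space.
 */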
674 int
675 fs_visitor::type_size(const struct glsl_type *type)
676 {
677 unsigned int size, i;
678
679 switch (type->base_type) {
680 case GLSL_TYPE_UINT:
681 case GLSL_TYPE_INT:
682 case GLSL_TYPE_FLOAT:
683 case GLSL_TYPE_BOOL:
684 return type->components();
685 case GLSL_TYPE_ARRAY:
686 return type_size(type->fields.array) * type->length;
687 case GLSL_TYPE_STRUCT:
688 size = 0;
689 for (i = 0; i < type->length; i++) {
690 size += type_size(type->fields.structure[i].type);
691 }
692 return size;
693 case GLSL_TYPE_SAMPLER:
694 /* Samplers take up no register space, since they're baked in at
695 * link time.
696 */
697 return 0;
698 case GLSL_TYPE_ATOMIC_UINT:
699 return 0;
700 case GLSL_TYPE_IMAGE:
701 case GLSL_TYPE_VOID:
702 case GLSL_TYPE_ERROR:
703 case GLSL_TYPE_INTERFACE:
704 case GLSL_TYPE_DOUBLE:
705 unreachable("not reached");
706 }
707
708 return 0;
709 }
710
711 /**
712 * Create a MOV to read the timestamp register.
713 *
714 * The caller is responsible for emitting the MOV. The return value is
715 * the destination of the MOV, with extra parameters set.
716 */
717 fs_reg
718 fs_visitor::get_timestamp(fs_inst **out_mov)
719 {
720 assert(devinfo->gen >= 7);
721
722 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
723 BRW_ARF_TIMESTAMP,
724 0),
725 BRW_REGISTER_TYPE_UD));
726
727 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
728
729 fs_inst *mov = MOV(dst, ts);
730 /* We want to read the 3 fields we care about even if it's not enabled in
731 * the dispatch.
732 */
733 mov->force_writemask_all = true;
734
735 /* The caller wants the low 32 bits of the timestamp. Since it's running
736       * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
737 * which is plenty of time for our purposes. It is identical across the
738 * EUs, but since it's tracking GPU core speed it will increment at a
739 * varying rate as render P-states change.
740 *
741 * The caller could also check if render P-states have changed (or anything
742 * else that might disrupt timing) by setting smear to 2 and checking if
743 * that field is != 0.
744 */
745 dst.set_smear(0);
746
747 *out_mov = mov;
748 return dst;
749 }
750
751 void
752 fs_visitor::emit_shader_time_begin()
753 {
754 current_annotation = "shader time start";
755 fs_inst *mov;
756 shader_start_time = get_timestamp(&mov);
757 emit(mov);
758 }
759
760 void
761 fs_visitor::emit_shader_time_end()
762 {
763 current_annotation = "shader time end";
764
765 enum shader_time_shader_type type, written_type, reset_type;
766 switch (stage) {
767 case MESA_SHADER_VERTEX:
768 type = ST_VS;
769 written_type = ST_VS_WRITTEN;
770 reset_type = ST_VS_RESET;
771 break;
772 case MESA_SHADER_GEOMETRY:
773 type = ST_GS;
774 written_type = ST_GS_WRITTEN;
775 reset_type = ST_GS_RESET;
776 break;
777 case MESA_SHADER_FRAGMENT:
778 if (dispatch_width == 8) {
779 type = ST_FS8;
780 written_type = ST_FS8_WRITTEN;
781 reset_type = ST_FS8_RESET;
782 } else {
783 assert(dispatch_width == 16);
784 type = ST_FS16;
785 written_type = ST_FS16_WRITTEN;
786 reset_type = ST_FS16_RESET;
787 }
788 break;
789 case MESA_SHADER_COMPUTE:
790 type = ST_CS;
791 written_type = ST_CS_WRITTEN;
792 reset_type = ST_CS_RESET;
793 break;
794 default:
795 unreachable("fs_visitor::emit_shader_time_end missing code");
796 }
797
798 /* Insert our code just before the final SEND with EOT. */
799 exec_node *end = this->instructions.get_tail();
800 assert(end && ((fs_inst *) end)->eot);
801
802 fs_inst *tm_read;
803 fs_reg shader_end_time = get_timestamp(&tm_read);
804 end->insert_before(tm_read);
805
806 /* Check that there weren't any timestamp reset events (assuming these
807 * were the only two timestamp reads that happened).
808 */
809 fs_reg reset = shader_end_time;
810 reset.set_smear(2);
811 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
812 test->conditional_mod = BRW_CONDITIONAL_Z;
813 test->force_writemask_all = true;
814 end->insert_before(test);
815 end->insert_before(IF(BRW_PREDICATE_NORMAL));
816
817 fs_reg start = shader_start_time;
818 start.negate = true;
819 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
820 diff.set_smear(0);
821 fs_inst *add = ADD(diff, start, shader_end_time);
822 add->force_writemask_all = true;
823 end->insert_before(add);
824
825 /* If there were no instructions between the two timestamp gets, the diff
826 * is 2 cycles. Remove that overhead, so I can forget about that when
827 * trying to determine the time taken for single instructions.
828 */
829 add = ADD(diff, diff, fs_reg(-2u));
830 add->force_writemask_all = true;
831 end->insert_before(add);
832
833 end->insert_before(SHADER_TIME_ADD(type, diff));
834 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
835 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
836 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
837 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
838 }
839
840 fs_inst *
841 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
842 {
843 int shader_time_index =
844 brw_get_shader_time_index(brw, shader_prog, prog, type);
845 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
846
847 fs_reg payload;
848 if (dispatch_width == 8)
849 payload = vgrf(glsl_type::uvec2_type);
850 else
851 payload = vgrf(glsl_type::uint_type);
852
853 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
854 fs_reg(), payload, offset, value);
855 }
856
857 void
858 fs_visitor::vfail(const char *format, va_list va)
859 {
860 char *msg;
861
862 if (failed)
863 return;
864
865 failed = true;
866
867 msg = ralloc_vasprintf(mem_ctx, format, va);
868 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
869
870 this->fail_msg = msg;
871
872 if (debug_enabled) {
873 fprintf(stderr, "%s", msg);
874 }
875 }
876
877 void
878 fs_visitor::fail(const char *format, ...)
879 {
880 va_list va;
881
882 va_start(va, format);
883 vfail(format, va);
884 va_end(va);
885 }
886
887 /**
888 * Mark this program as impossible to compile in SIMD16 mode.
889 *
890 * During the SIMD8 compile (which happens first), we can detect and flag
891 * things that are unsupported in SIMD16 mode, so the compiler can skip
892 * the SIMD16 compile altogether.
893 *
894 * During a SIMD16 compile (if one happens anyway), this just calls fail().
895 */
896 void
897 fs_visitor::no16(const char *format, ...)
898 {
899 va_list va;
900
901 va_start(va, format);
902
903 if (dispatch_width == 16) {
904 vfail(format, va);
905 } else {
906 simd16_unsupported = true;
907
908 if (brw->perf_debug) {
909 if (no16_msg)
910 ralloc_vasprintf_append(&no16_msg, format, va);
911 else
912 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
913 }
914 }
915
916 va_end(va);
917 }
918
919 fs_inst *
920 fs_visitor::emit(enum opcode opcode)
921 {
922 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
923 }
924
925 fs_inst *
926 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
927 {
928 return emit(new(mem_ctx) fs_inst(opcode, dst));
929 }
930
931 fs_inst *
932 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
933 {
934 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
935 }
936
937 fs_inst *
938 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
939 const fs_reg &src1)
940 {
941 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
942 }
943
944 fs_inst *
945 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
946 const fs_reg &src1, const fs_reg &src2)
947 {
948 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
949 }
950
951 fs_inst *
952 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
953 fs_reg src[], int sources)
954 {
955 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
956 }
957
958 /**
959 * Returns true if the instruction has a flag that means it won't
960 * update an entire destination register.
961 *
962 * For example, dead code elimination and live variable analysis want to know
963 * when a write to a variable screens off any preceding values that were in
964 * it.
965 */
966 bool
967 fs_inst::is_partial_write() const
968 {
969 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
970 (this->dst.width * type_sz(this->dst.type)) < 32 ||
971 !this->dst.is_contiguous());
972 }
973
974 int
975 fs_inst::regs_read(int arg) const
976 {
977 if (is_tex() && arg == 0 && src[0].file == GRF) {
978 return mlen;
979 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
980 return mlen;
981 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
982 return mlen;
983 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
984 return mlen;
985 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
986 return mlen;
987 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
988 return mlen;
989 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
990 return mlen;
991 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
992 return mlen;
993 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
994 return mlen;
995 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
996 return mlen;
997 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
998 return exec_size / 4;
999 }
1000
1001 switch (src[arg].file) {
1002 case BAD_FILE:
1003 case UNIFORM:
1004 case IMM:
1005 return 1;
1006 case GRF:
1007 case HW_REG:
1008 if (src[arg].stride == 0) {
1009 return 1;
1010 } else {
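         /* e.g. a 16-wide float source with stride 1 covers 16 * 4 = 64
          * bytes, i.e. two GRFs.
          */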
1011 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
1012 return (size + 31) / 32;
1013 }
1014 case MRF:
1015 unreachable("MRF registers are not allowed as sources");
1016 default:
1017 unreachable("Invalid register file");
1018 }
1019 }
1020
1021 bool
1022 fs_inst::reads_flag() const
1023 {
1024 return predicate;
1025 }
1026
1027 bool
1028 fs_inst::writes_flag() const
1029 {
1030 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
1031 opcode != BRW_OPCODE_IF &&
1032 opcode != BRW_OPCODE_WHILE)) ||
1033 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
1034 }
1035
1036 /**
1037 * Returns how many MRFs an FS opcode will write over.
1038 *
1039 * Note that this is not the 0 or 1 implied writes in an actual gen
1040 * instruction -- the FS opcodes often generate MOVs in addition.
1041 */
1042 int
1043 fs_visitor::implied_mrf_writes(fs_inst *inst)
1044 {
1045 if (inst->mlen == 0)
1046 return 0;
1047
1048 if (inst->base_mrf == -1)
1049 return 0;
1050
1051 switch (inst->opcode) {
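   /* The pre-gen6 math send uses one MRF per source operand per eight
    * channels, so the single-source functions below take dispatch_width / 8
    * MRFs and the two-source ones twice that.
    */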
1052 case SHADER_OPCODE_RCP:
1053 case SHADER_OPCODE_RSQ:
1054 case SHADER_OPCODE_SQRT:
1055 case SHADER_OPCODE_EXP2:
1056 case SHADER_OPCODE_LOG2:
1057 case SHADER_OPCODE_SIN:
1058 case SHADER_OPCODE_COS:
1059 return 1 * dispatch_width / 8;
1060 case SHADER_OPCODE_POW:
1061 case SHADER_OPCODE_INT_QUOTIENT:
1062 case SHADER_OPCODE_INT_REMAINDER:
1063 return 2 * dispatch_width / 8;
1064 case SHADER_OPCODE_TEX:
1065 case FS_OPCODE_TXB:
1066 case SHADER_OPCODE_TXD:
1067 case SHADER_OPCODE_TXF:
1068 case SHADER_OPCODE_TXF_CMS:
1069 case SHADER_OPCODE_TXF_MCS:
1070 case SHADER_OPCODE_TG4:
1071 case SHADER_OPCODE_TG4_OFFSET:
1072 case SHADER_OPCODE_TXL:
1073 case SHADER_OPCODE_TXS:
1074 case SHADER_OPCODE_LOD:
1075 return 1;
1076 case FS_OPCODE_FB_WRITE:
1077 return 2;
1078 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1079 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1080 return 1;
1081 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1082 return inst->mlen;
1083 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1084 return 2;
1085 case SHADER_OPCODE_UNTYPED_ATOMIC:
1086 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1087 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1088 case SHADER_OPCODE_TYPED_ATOMIC:
1089 case SHADER_OPCODE_TYPED_SURFACE_READ:
1090 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1091 case SHADER_OPCODE_URB_WRITE_SIMD8:
1092 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1093 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1094 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1095 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1096 return 0;
1097 default:
1098 unreachable("not reached");
1099 }
1100 }
1101
1102 fs_reg
1103 fs_visitor::vgrf(const glsl_type *const type)
1104 {
1105 int reg_width = dispatch_width / 8;
1106 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1107 brw_type_for_base_type(type), dispatch_width);
1108 }
1109
1110 fs_reg
1111 fs_visitor::vgrf(int num_components)
1112 {
1113 int reg_width = dispatch_width / 8;
1114 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1115 BRW_REGISTER_TYPE_F, dispatch_width);
1116 }
1117
1118 /** Fixed HW reg constructor. */
1119 fs_reg::fs_reg(enum register_file file, int reg)
1120 {
1121 init();
1122 this->file = file;
1123 this->reg = reg;
1124 this->type = BRW_REGISTER_TYPE_F;
1125
1126 switch (file) {
1127 case UNIFORM:
1128 this->width = 1;
1129 break;
1130 default:
1131 this->width = 8;
1132 }
1133 }
1134
1135 /** Fixed HW reg constructor. */
1136 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1137 {
1138 init();
1139 this->file = file;
1140 this->reg = reg;
1141 this->type = type;
1142
1143 switch (file) {
1144 case UNIFORM:
1145 this->width = 1;
1146 break;
1147 default:
1148 this->width = 8;
1149 }
1150 }
1151
1152 /** Fixed HW reg constructor. */
1153 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1154 uint8_t width)
1155 {
1156 init();
1157 this->file = file;
1158 this->reg = reg;
1159 this->type = type;
1160 this->width = width;
1161 }
1162
1163 fs_reg *
1164 fs_visitor::variable_storage(ir_variable *var)
1165 {
1166 return (fs_reg *)hash_table_find(this->variable_ht, var);
1167 }
1168
1169 void
1170 import_uniforms_callback(const void *key,
1171 void *data,
1172 void *closure)
1173 {
1174 struct hash_table *dst_ht = (struct hash_table *)closure;
1175 const fs_reg *reg = (const fs_reg *)data;
1176
1177 if (reg->file != UNIFORM)
1178 return;
1179
1180 hash_table_insert(dst_ht, data, key);
1181 }
1182
1183    /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1184     * This brings in those uniform definitions.
1185 */
1186 void
1187 fs_visitor::import_uniforms(fs_visitor *v)
1188 {
1189 hash_table_call_foreach(v->variable_ht,
1190 import_uniforms_callback,
1191 variable_ht);
1192 this->push_constant_loc = v->push_constant_loc;
1193 this->pull_constant_loc = v->pull_constant_loc;
1194 this->uniforms = v->uniforms;
1195 this->param_size = v->param_size;
1196 }
1197
1198 /* Our support for uniforms is piggy-backed on the struct
1199 * gl_fragment_program, because that's where the values actually
1200 * get stored, rather than in some global gl_shader_program uniform
1201 * store.
1202 */
1203 void
1204 fs_visitor::setup_uniform_values(ir_variable *ir)
1205 {
1206 int namelen = strlen(ir->name);
1207
1208 /* The data for our (non-builtin) uniforms is stored in a series of
1209 * gl_uniform_driver_storage structs for each subcomponent that
1210 * glGetUniformLocation() could name. We know it's been set up in the same
1211 * order we'd walk the type, so walk the list of storage and find anything
1212 * with our name, or the prefix of a component that starts with our name.
1213 */
1214 unsigned params_before = uniforms;
1215 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1216 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1217
1218 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1219 (storage->name[namelen] != 0 &&
1220 storage->name[namelen] != '.' &&
1221 storage->name[namelen] != '[')) {
1222 continue;
1223 }
1224
1225 unsigned slots = storage->type->component_slots();
1226 if (storage->array_elements)
1227 slots *= storage->array_elements;
1228
1229 for (unsigned i = 0; i < slots; i++) {
1230 stage_prog_data->param[uniforms++] = &storage->storage[i];
1231 }
1232 }
1233
1234 /* Make sure we actually initialized the right amount of stuff here. */
1235 assert(params_before + ir->type->component_slots() == uniforms);
1236 (void)params_before;
1237 }
1238
1239
1240 /* Our support for builtin uniforms is even scarier than non-builtin.
1241 * It sits on top of the PROG_STATE_VAR parameters that are
1242 * automatically updated from GL context state.
1243 */
1244 void
1245 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1246 {
1247 const ir_state_slot *const slots = ir->get_state_slots();
1248 assert(slots != NULL);
1249
1250 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1251       /* This state reference has already been set up by ir_to_mesa, but we'll
1252 * get the same index back here.
1253 */
1254 int index = _mesa_add_state_reference(this->prog->Parameters,
1255 (gl_state_index *)slots[i].tokens);
1256
1257 /* Add each of the unique swizzles of the element as a parameter.
1258 * This'll end up matching the expected layout of the
1259 * array/matrix/structure we're trying to fill in.
1260 */
1261 int last_swiz = -1;
1262 for (unsigned int j = 0; j < 4; j++) {
1263 int swiz = GET_SWZ(slots[i].swizzle, j);
1264 if (swiz == last_swiz)
1265 break;
1266 last_swiz = swiz;
1267
1268 stage_prog_data->param[uniforms++] =
1269 &prog->Parameters->ParameterValues[index][swiz];
1270 }
1271 }
1272 }
1273
1274 fs_reg *
1275 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1276 bool origin_upper_left)
1277 {
1278 assert(stage == MESA_SHADER_FRAGMENT);
1279 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1280 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1281 fs_reg wpos = *reg;
1282 bool flip = !origin_upper_left ^ key->render_to_fbo;
1283
1284 /* gl_FragCoord.x */
1285 if (pixel_center_integer) {
1286 emit(MOV(wpos, this->pixel_x));
1287 } else {
1288 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1289 }
1290 wpos = offset(wpos, 1);
1291
1292 /* gl_FragCoord.y */
1293 if (!flip && pixel_center_integer) {
1294 emit(MOV(wpos, this->pixel_y));
1295 } else {
1296 fs_reg pixel_y = this->pixel_y;
1297 float offset = (pixel_center_integer ? 0.0 : 0.5);
1298
1299 if (flip) {
1300 pixel_y.negate = true;
1301 offset += key->drawable_height - 1.0;
1302 }
1303
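      /* When flipping, pixel_y is negated so the ADD below computes
       * (drawable_height - 1 + offset) - pixel_y, the y coordinate for the
       * inverted origin; otherwise it is simply pixel_y + offset.
       */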
1304 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1305 }
1306 wpos = offset(wpos, 1);
1307
1308 /* gl_FragCoord.z */
1309 if (devinfo->gen >= 6) {
1310 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1311 } else {
1312 emit(FS_OPCODE_LINTERP, wpos,
1313 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1314 interp_reg(VARYING_SLOT_POS, 2));
1315 }
1316 wpos = offset(wpos, 1);
1317
1318 /* gl_FragCoord.w: Already set up in emit_interpolation */
1319 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1320
1321 return reg;
1322 }
1323
1324 fs_inst *
1325 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1326 glsl_interp_qualifier interpolation_mode,
1327 bool is_centroid, bool is_sample)
1328 {
1329 brw_wm_barycentric_interp_mode barycoord_mode;
1330 if (devinfo->gen >= 6) {
1331 if (is_centroid) {
1332 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1333 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1334 else
1335 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1336 } else if (is_sample) {
1337 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1338 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1339 else
1340 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1341 } else {
1342 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1343 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1344 else
1345 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1346 }
1347 } else {
1348 /* On Ironlake and below, there is only one interpolation mode.
1349 * Centroid interpolation doesn't mean anything on this hardware --
1350 * there is no multisampling.
1351 */
1352 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1353 }
1354 return emit(FS_OPCODE_LINTERP, attr,
1355 this->delta_xy[barycoord_mode], interp);
1356 }
1357
1358 void
1359 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1360 const glsl_type *type,
1361 glsl_interp_qualifier interpolation_mode,
1362 int location, bool mod_centroid,
1363 bool mod_sample)
1364 {
1365 attr.type = brw_type_for_base_type(type->get_scalar_type());
1366
1367 assert(stage == MESA_SHADER_FRAGMENT);
1368 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1369 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1370
1371 unsigned int array_elements;
1372
1373 if (type->is_array()) {
1374 array_elements = type->length;
1375 if (array_elements == 0) {
1376 fail("dereferenced array '%s' has length 0\n", name);
1377 }
1378 type = type->fields.array;
1379 } else {
1380 array_elements = 1;
1381 }
1382
1383 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1384 bool is_gl_Color =
1385 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1386 if (key->flat_shade && is_gl_Color) {
1387 interpolation_mode = INTERP_QUALIFIER_FLAT;
1388 } else {
1389 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1390 }
1391 }
1392
1393 for (unsigned int i = 0; i < array_elements; i++) {
1394 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1395 if (prog_data->urb_setup[location] == -1) {
1396 /* If there's no incoming setup data for this slot, don't
1397 * emit interpolation for it.
1398 */
1399 attr = offset(attr, type->vector_elements);
1400 location++;
1401 continue;
1402 }
1403
1404 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1405 /* Constant interpolation (flat shading) case. The SF has
1406 * handed us defined values in only the constant offset
1407 * field of the setup reg.
1408 */
1409 for (unsigned int k = 0; k < type->vector_elements; k++) {
1410 struct brw_reg interp = interp_reg(location, k);
1411 interp = suboffset(interp, 3);
1412 interp.type = attr.type;
1413 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1414 attr = offset(attr, 1);
1415 }
1416 } else {
1417 /* Smooth/noperspective interpolation case. */
1418 for (unsigned int k = 0; k < type->vector_elements; k++) {
1419 struct brw_reg interp = interp_reg(location, k);
1420 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1421 /* Get the pixel/sample mask into f0 so that we know
1422 * which pixels are lit. Then, for each channel that is
1423 * unlit, replace the centroid data with non-centroid
1424 * data.
1425 */
1426 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1427
1428 fs_inst *inst;
1429 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1430 false, false);
1431 inst->predicate = BRW_PREDICATE_NORMAL;
1432 inst->predicate_inverse = true;
1433 if (devinfo->has_pln)
1434 inst->no_dd_clear = true;
1435
1436 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1437 mod_centroid && !key->persample_shading,
1438 mod_sample || key->persample_shading);
1439 inst->predicate = BRW_PREDICATE_NORMAL;
1440 inst->predicate_inverse = false;
1441 if (devinfo->has_pln)
1442 inst->no_dd_check = true;
1443
1444 } else {
1445 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1446 mod_centroid && !key->persample_shading,
1447 mod_sample || key->persample_shading);
1448 }
1449 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1450 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1451 }
1452 attr = offset(attr, 1);
1453 }
1454
1455 }
1456 location++;
1457 }
1458 }
1459 }
1460
1461 fs_reg *
1462 fs_visitor::emit_frontfacing_interpolation()
1463 {
1464 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1465
1466 if (devinfo->gen >= 6) {
1467 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1468 * a boolean result from this (~0/true or 0/false).
1469 *
1470 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1471 * this task in only one instruction:
1472 * - a negation source modifier will flip the bit; and
1473 * - a W -> D type conversion will sign extend the bit into the high
1474 * word of the destination.
1475 *
1476 * An ASR 15 fills the low word of the destination.
1477 */
1478 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1479 g0.negate = true;
1480
1481 emit(ASR(*reg, g0, fs_reg(15)));
1482 } else {
1483 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1484 * a boolean result from this (1/true or 0/false).
1485 *
1486 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1487 * the negation source modifier to flip it. Unfortunately the SHR
1488 * instruction only operates on UD (or D with an abs source modifier)
1489 * sources without negation.
1490 *
1491 * Instead, use ASR (which will give ~0/true or 0/false).
1492 */
1493 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1494 g1_6.negate = true;
1495
1496 emit(ASR(*reg, g1_6, fs_reg(31)));
1497 }
1498
1499 return reg;
1500 }
1501
1502 void
1503 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1504 {
1505 assert(stage == MESA_SHADER_FRAGMENT);
1506 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1507 assert(dst.type == BRW_REGISTER_TYPE_F);
1508
1509 if (key->compute_pos_offset) {
1510 /* Convert int_sample_pos to floating point */
1511 emit(MOV(dst, int_sample_pos));
1512 /* Scale to the range [0, 1] */
1513 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1514 }
1515 else {
1516 /* From ARB_sample_shading specification:
1517 * "When rendering to a non-multisample buffer, or if multisample
1518 * rasterization is disabled, gl_SamplePosition will always be
1519       *  (0.5, 0.5)."
1520 */
1521 emit(MOV(dst, fs_reg(0.5f)));
1522 }
1523 }
1524
1525 fs_reg *
1526 fs_visitor::emit_samplepos_setup()
1527 {
1528 assert(devinfo->gen >= 6);
1529
1530 this->current_annotation = "compute sample position";
1531 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1532 fs_reg pos = *reg;
1533 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1534 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1535
1536 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1537 * mode will be enabled.
1538 *
1539 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1540 * R31.1:0 Position Offset X/Y for Slot[3:0]
1541 * R31.3:2 Position Offset X/Y for Slot[7:4]
1542 * .....
1543 *
1544     * The X, Y sample positions come in as bytes in the thread payload. So, read
1545 * the positions using vstride=16, width=8, hstride=2.
1546 */
1547 struct brw_reg sample_pos_reg =
1548 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1549 BRW_REGISTER_TYPE_B), 16, 8, 2);
1550
1551 if (dispatch_width == 8) {
1552 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1553 } else {
1554 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1555 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1556 ->force_sechalf = true;
1557 }
1558 /* Compute gl_SamplePosition.x */
1559 compute_sample_position(pos, int_sample_x);
1560 pos = offset(pos, 1);
1561 if (dispatch_width == 8) {
1562 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1563 } else {
1564 emit(MOV(half(int_sample_y, 0),
1565 fs_reg(suboffset(sample_pos_reg, 1))));
1566 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1567 ->force_sechalf = true;
1568 }
1569 /* Compute gl_SamplePosition.y */
1570 compute_sample_position(pos, int_sample_y);
1571 return reg;
1572 }
1573
1574 fs_reg *
1575 fs_visitor::emit_sampleid_setup()
1576 {
1577 assert(stage == MESA_SHADER_FRAGMENT);
1578 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1579 assert(devinfo->gen >= 6);
1580
1581 this->current_annotation = "compute sample id";
1582 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1583
1584 if (key->compute_sample_id) {
1585 fs_reg t1 = vgrf(glsl_type::int_type);
1586 fs_reg t2 = vgrf(glsl_type::int_type);
1587 t2.type = BRW_REGISTER_TYPE_UW;
1588
1589 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1590 * 8x multisampling, subspan 0 will represent sample N (where N
1591 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1592 * 7. We can find the value of N by looking at R0.0 bits 7:6
1593 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1594 * (since samples are always delivered in pairs). That is, we
1595 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1596 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1597 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1598 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1599 * populating a temporary variable with the sequence (0, 1, 2, 3),
1600 * and then reading from it using vstride=1, width=4, hstride=0.
1601 * These computations hold good for 4x multisampling as well.
1602 *
1603 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1604 * the first four slots are sample 0 of subspan 0; the next four
1605 * are sample 1 of subspan 0; the third group is sample 0 of
1606 * subspan 1, and finally sample 1 of subspan 1.
1607 */
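      /* For example, if R0.0 bits 7:6 read 2 (subspans starting at sample 4),
       * (R0.0 & 0xc0) >> 5 yields 4, and adding the (0, 0, 0, 0, 1, 1, 1, 1)
       * sequence gives sample IDs 4, 4, 4, 4, 5, 5, 5, 5 for SIMD8.
       */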
1608 fs_inst *inst;
1609 inst = emit(BRW_OPCODE_AND, t1,
1610 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1611 fs_reg(0xc0));
1612 inst->force_writemask_all = true;
1613 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1614 inst->force_writemask_all = true;
1615 /* This works for both SIMD8 and SIMD16 */
1616 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1617 inst->force_writemask_all = true;
1618 /* This special instruction takes care of setting vstride=1,
1619 * width=4, hstride=0 of t2 during an ADD instruction.
1620 */
1621 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1622 } else {
1623 /* As per GL_ARB_sample_shading specification:
1624 * "When rendering to a non-multisample buffer, or if multisample
1625 * rasterization is disabled, gl_SampleID will always be zero."
1626 */
1627 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1628 }
1629
1630 return reg;
1631 }
1632
1633 void
1634 fs_visitor::resolve_source_modifiers(fs_reg *src)
1635 {
1636 if (!src->abs && !src->negate)
1637 return;
1638
1639 fs_reg temp = retype(vgrf(1), src->type);
1640 emit(MOV(temp, *src));
1641 *src = temp;
1642 }
1643
1644 fs_reg
1645 fs_visitor::fix_math_operand(fs_reg src)
1646 {
1647 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1648 * might be able to do better by doing execsize = 1 math and then
1649 * expanding that result out, but we would need to be careful with
1650 * masking.
1651 *
1652 * The hardware ignores source modifiers (negate and abs) on math
1653 * instructions, so we also move to a temp to set those up.
1654 */
1655 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1656 !src.abs && !src.negate)
1657 return src;
1658
1659 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1660 * operands to math
1661 */
1662 if (devinfo->gen >= 7 && src.file != IMM)
1663 return src;
1664
1665 fs_reg expanded = vgrf(glsl_type::float_type);
1666 expanded.type = src.type;
1667 emit(BRW_OPCODE_MOV, expanded, src);
1668 return expanded;
1669 }
1670
1671 fs_inst *
1672 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1673 {
1674 switch (opcode) {
1675 case SHADER_OPCODE_RCP:
1676 case SHADER_OPCODE_RSQ:
1677 case SHADER_OPCODE_SQRT:
1678 case SHADER_OPCODE_EXP2:
1679 case SHADER_OPCODE_LOG2:
1680 case SHADER_OPCODE_SIN:
1681 case SHADER_OPCODE_COS:
1682 break;
1683 default:
1684 unreachable("not reached: bad math opcode");
1685 }
1686
1687 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1688 * might be able to do better by doing execsize = 1 math and then
1689 * expanding that result out, but we would need to be careful with
1690 * masking.
1691 *
1692 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1693 * instructions, so we also move to a temp to set those up.
1694 */
1695 if (devinfo->gen == 6 || devinfo->gen == 7)
1696 src = fix_math_operand(src);
1697
1698 fs_inst *inst = emit(opcode, dst, src);
1699
1700 if (devinfo->gen < 6) {
1701 inst->base_mrf = 2;
1702 inst->mlen = dispatch_width / 8;
1703 }
1704
1705 return inst;
1706 }
1707
1708 fs_inst *
1709 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1710 {
1711 int base_mrf = 2;
1712 fs_inst *inst;
1713
1714 if (devinfo->gen >= 8) {
1715 inst = emit(opcode, dst, src0, src1);
1716 } else if (devinfo->gen >= 6) {
1717 src0 = fix_math_operand(src0);
1718 src1 = fix_math_operand(src1);
1719
1720 inst = emit(opcode, dst, src0, src1);
1721 } else {
1722 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1723 * "Message Payload":
1724 *
1725 * "Operand0[7]. For the INT DIV functions, this operand is the
1726 * denominator."
1727 * ...
1728 * "Operand1[7]. For the INT DIV functions, this operand is the
1729 * numerator."
1730 */
1731 bool is_int_div = opcode != SHADER_OPCODE_POW;
1732 fs_reg &op0 = is_int_div ? src1 : src0;
1733 fs_reg &op1 = is_int_div ? src0 : src1;
1734
1735 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1736 inst = emit(opcode, dst, op0, reg_null_f);
1737
1738 inst->base_mrf = base_mrf;
1739 inst->mlen = 2 * dispatch_width / 8;
1740 }
1741 return inst;
1742 }
1743
1744 void
1745 fs_visitor::emit_discard_jump()
1746 {
1747 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1748
1749 /* For performance, after a discard, jump to the end of the
1750 * shader if all relevant channels have been discarded.
1751 */
1752 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1753 discard_jump->flag_subreg = 1;
1754
1755 discard_jump->predicate = (dispatch_width == 8)
1756 ? BRW_PREDICATE_ALIGN1_ANY8H
1757 : BRW_PREDICATE_ALIGN1_ANY16H;
1758 discard_jump->predicate_inverse = true;
1759 }
1760
1761 void
1762 fs_visitor::assign_curb_setup()
1763 {
1764 if (dispatch_width == 8) {
1765 prog_data->dispatch_grf_start_reg = payload.num_regs;
1766 } else {
1767 if (stage == MESA_SHADER_FRAGMENT) {
1768 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1769 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1770 } else if (stage == MESA_SHADER_COMPUTE) {
1771 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1772 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1773 } else {
1774 unreachable("Unsupported shader type!");
1775 }
1776 }
1777
1778 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1779
1780 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1781 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1782 for (unsigned int i = 0; i < inst->sources; i++) {
1783 if (inst->src[i].file == UNIFORM) {
1784 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1785 int constant_nr;
1786 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1787 constant_nr = push_constant_loc[uniform_nr];
1788 } else {
1789 /* Section 5.11 of the OpenGL 4.1 spec says:
1790 * "Out-of-bounds reads return undefined values, which include
1791 * values from other variables of the active program or zero."
1792 * Just return the first push constant.
1793 */
1794 constant_nr = 0;
1795 }
1796
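            /* Push constants are packed eight dwords to a GRF, so
             * constant_nr / 8 selects the register and constant_nr % 8 the
             * channel within it.
             */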
1797 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1798 constant_nr / 8,
1799 constant_nr % 8);
1800
1801 inst->src[i].file = HW_REG;
1802 inst->src[i].fixed_hw_reg = byte_offset(
1803 retype(brw_reg, inst->src[i].type),
1804 inst->src[i].subreg_offset);
1805 }
1806 }
1807 }
1808 }
1809
1810 void
1811 fs_visitor::calculate_urb_setup()
1812 {
1813 assert(stage == MESA_SHADER_FRAGMENT);
1814 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1815 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1816
1817 memset(prog_data->urb_setup, -1,
1818 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1819
1820 int urb_next = 0;
1821 /* Figure out where each of the incoming setup attributes lands. */
1822 if (devinfo->gen >= 6) {
1823 if (_mesa_bitcount_64(prog->InputsRead &
1824 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1825 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1826 * first 16 varying inputs, so we can put them wherever we want.
1827 * Just put them in order.
1828 *
1829 * This is useful because it means that (a) inputs not used by the
1830 * fragment shader won't take up valuable register space, and (b) we
1831 * won't have to recompile the fragment shader if it gets paired with
1832 * a different vertex (or geometry) shader.
1833 */
1834 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1835 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1836 BITFIELD64_BIT(i)) {
1837 prog_data->urb_setup[i] = urb_next++;
1838 }
1839 }
1840 } else {
1841 /* We have enough input varyings that the SF/SBE pipeline stage can't
1842 * arbitrarily rearrange them to suit our whim; we have to put them
1843 * in an order that matches the output of the previous pipeline stage
1844 * (geometry or vertex shader).
1845 */
1846 struct brw_vue_map prev_stage_vue_map;
1847 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1848 key->input_slots_valid);
1849 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1850 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1851 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1852 slot++) {
1853 int varying = prev_stage_vue_map.slot_to_varying[slot];
1854 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1855 * unused.
1856 */
1857 if (varying != BRW_VARYING_SLOT_COUNT &&
1858 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1859 BITFIELD64_BIT(varying))) {
1860 prog_data->urb_setup[varying] = slot - first_slot;
1861 }
1862 }
1863 urb_next = prev_stage_vue_map.num_slots - first_slot;
1864 }
1865 } else {
1866 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1867 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1868 /* Point size is packed into the header, not as a general attribute */
1869 if (i == VARYING_SLOT_PSIZ)
1870 continue;
1871
1872 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1873 /* The back color slot is skipped when the front color is
1874 * also written to. In addition, some slots can be
1875 * written in the vertex shader and not read in the
1876 * fragment shader. So the register number must always be
1877 * incremented, mapped or not.
1878 */
1879 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1880 prog_data->urb_setup[i] = urb_next;
1881 urb_next++;
1882 }
1883 }
1884
1885 /*
1886       * It's an FS-only attribute, and we did the interpolation for this
1887       * attribute in the SF thread. So count it here, too.
1888 *
1889 * See compile_sf_prog() for more info.
1890 */
1891 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1892 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1893 }
1894
1895 prog_data->num_varying_inputs = urb_next;
1896 }
1897
1898 void
1899 fs_visitor::assign_urb_setup()
1900 {
1901 assert(stage == MESA_SHADER_FRAGMENT);
1902 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1903
1904 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1905
1906    /* Offset all the urb_setup[] indices by the actual position of the
1907     * setup regs, now that the location of the constants has been chosen.
1908 */
1909 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1910 if (inst->opcode == FS_OPCODE_LINTERP) {
1911 assert(inst->src[1].file == HW_REG);
1912 inst->src[1].fixed_hw_reg.nr += urb_start;
1913 }
1914
1915 if (inst->opcode == FS_OPCODE_CINTERP) {
1916 assert(inst->src[0].file == HW_REG);
1917 inst->src[0].fixed_hw_reg.nr += urb_start;
1918 }
1919 }
1920
1921 /* Each attribute is 4 setup channels, each of which is half a reg. */
1922 this->first_non_payload_grf =
1923 urb_start + prog_data->num_varying_inputs * 2;
1924 }
1925
1926 void
1927 fs_visitor::assign_vs_urb_setup()
1928 {
1929 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1930 int grf, count, slot, channel, attr;
1931
1932 assert(stage == MESA_SHADER_VERTEX);
1933 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1934 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1935 count++;
1936
1937 /* Each attribute is 4 regs. */
1938 this->first_non_payload_grf =
1939 payload.num_regs + prog_data->curb_read_length + count * 4;
1940
1941 unsigned vue_entries =
1942 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1943
1944 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1945 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1946
1947 assert(vs_prog_data->base.urb_read_length <= 15);
1948
1949 /* Rewrite all ATTR file references to the hw grf that they land in. */
1950 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1951 for (int i = 0; i < inst->sources; i++) {
1952 if (inst->src[i].file == ATTR) {
1953
1954 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1955 slot = count - 1;
1956 } else {
1957             /* Attributes come in as a contiguous block, ordered by their
1958 * gl_vert_attrib value. That means we can compute the slot
1959 * number for an attribute by masking out the enabled
1960 * attributes before it and counting the bits.
1961 */
1962 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1963 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1964 BITFIELD64_MASK(attr));
1965 }
1966
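            /* Each input slot occupies four consecutive GRFs (one per
             * component), so the low two bits of reg_offset select the
             * component register within the slot.
             */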
1967 channel = inst->src[i].reg_offset & 3;
1968
1969 grf = payload.num_regs +
1970 prog_data->curb_read_length +
1971 slot * 4 + channel;
1972
1973 inst->src[i].file = HW_REG;
1974 inst->src[i].fixed_hw_reg =
1975 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1976 }
1977 }
1978 }
1979 }
1980
1981 /**
1982 * Split large virtual GRFs into separate components if we can.
1983 *
1984 * This is mostly duplicated with what brw_fs_vector_splitting does,
1985 * but that's really conservative because it's afraid of doing
1986 * splitting that doesn't result in real progress after the rest of
1987 * the optimization phases, which would cause infinite looping in
1988 * optimization. We can do it once here, safely. This also has the
1989 * opportunity to split interpolated values, or maybe even uniforms,
1990 * which we don't have at the IR level.
1991 *
1992 * We want to split, because virtual GRFs are what we register
1993 * allocate and spill (due to contiguousness requirements for some
1994 * instructions), and they're what we naturally generate in the
1995 * codegen process, but most virtual GRFs don't actually need to be
1996 * contiguous sets of GRFs. If we split, we'll end up with reduced
1997 * live intervals and better dead code elimination and coalescing.
1998 */
1999 void
2000 fs_visitor::split_virtual_grfs()
2001 {
2002 int num_vars = this->alloc.count;
2003
2004 /* Count the total number of registers */
2005 int reg_count = 0;
2006 int vgrf_to_reg[num_vars];
2007 for (int i = 0; i < num_vars; i++) {
2008 vgrf_to_reg[i] = reg_count;
2009 reg_count += alloc.sizes[i];
2010 }
2011
2012 /* An array of "split points". For each register slot, this indicates
2013 * if this slot can be separated from the previous slot. Every time an
2014 * instruction uses multiple elements of a register (as a source or
2015 * destination), we mark the used slots as inseparable. Then we go
2016 * through and split the registers into the smallest pieces we can.
2017 */
2018 bool split_points[reg_count];
2019 memset(split_points, 0, sizeof(split_points));
2020
2021 /* Mark all used registers as fully splittable */
2022 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2023 if (inst->dst.file == GRF) {
2024 int reg = vgrf_to_reg[inst->dst.reg];
2025 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
2026 split_points[reg + j] = true;
2027 }
2028
2029 for (int i = 0; i < inst->sources; i++) {
2030 if (inst->src[i].file == GRF) {
2031 int reg = vgrf_to_reg[inst->src[i].reg];
2032 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
2033 split_points[reg + j] = true;
2034 }
2035 }
2036 }
2037
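   /* The previous walk marked every slot of each used VGRF as splittable.
    * Now clear the split points inside any multi-register read or write,
    * since those accesses need their registers to stay contiguous.
    */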
2038 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2039 if (inst->dst.file == GRF) {
2040 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2041 for (int j = 1; j < inst->regs_written; j++)
2042 split_points[reg + j] = false;
2043 }
2044 for (int i = 0; i < inst->sources; i++) {
2045 if (inst->src[i].file == GRF) {
2046 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2047 for (int j = 1; j < inst->regs_read(i); j++)
2048 split_points[reg + j] = false;
2049 }
2050 }
2051 }
2052
2053 int new_virtual_grf[reg_count];
2054 int new_reg_offset[reg_count];
2055
2056 int reg = 0;
2057 for (int i = 0; i < num_vars; i++) {
2058 /* The first one should always be 0 as a quick sanity check. */
2059 assert(split_points[reg] == false);
2060
2061 /* j = 0 case */
2062 new_reg_offset[reg] = 0;
2063 reg++;
2064 int offset = 1;
2065
2066 /* j > 0 case */
2067 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2068          /* If this is a split point, reset the offset to 0 and allocate a
2069           * new virtual GRF covering the previous `offset' registers.
2070           */
2071 if (split_points[reg]) {
2072 assert(offset <= MAX_VGRF_SIZE);
2073 int grf = alloc.allocate(offset);
2074 for (int k = reg - offset; k < reg; k++)
2075 new_virtual_grf[k] = grf;
2076 offset = 0;
2077 }
2078 new_reg_offset[reg] = offset;
2079 offset++;
2080 reg++;
2081 }
2082
2083 /* The last one gets the original register number */
2084 assert(offset <= MAX_VGRF_SIZE);
2085 alloc.sizes[i] = offset;
2086 for (int k = reg - offset; k < reg; k++)
2087 new_virtual_grf[k] = i;
2088 }
2089 assert(reg == reg_count);
2090
2091 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2092 if (inst->dst.file == GRF) {
2093 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2094 inst->dst.reg = new_virtual_grf[reg];
2095 inst->dst.reg_offset = new_reg_offset[reg];
2096 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2097 }
2098 for (int i = 0; i < inst->sources; i++) {
2099 if (inst->src[i].file == GRF) {
2100 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2101 inst->src[i].reg = new_virtual_grf[reg];
2102 inst->src[i].reg_offset = new_reg_offset[reg];
2103 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2104 }
2105 }
2106 }
2107 invalidate_live_intervals();
2108 }
2109
2110 /**
2111 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2112 *
2113 * During code generation, we create tons of temporary variables, many of
2114 * which get immediately killed and are never used again. Yet, in later
2115 * optimization and analysis passes, such as compute_live_intervals, we need
2116 * to loop over all the virtual GRFs. Compacting them can save a lot of
2117 * overhead.
2118 */
2119 bool
2120 fs_visitor::compact_virtual_grfs()
2121 {
2122 bool progress = false;
2123 int remap_table[this->alloc.count];
2124 memset(remap_table, -1, sizeof(remap_table));
2125
2126 /* Mark which virtual GRFs are used. */
2127 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2128 if (inst->dst.file == GRF)
2129 remap_table[inst->dst.reg] = 0;
2130
2131 for (int i = 0; i < inst->sources; i++) {
2132 if (inst->src[i].file == GRF)
2133 remap_table[inst->src[i].reg] = 0;
2134 }
2135 }
2136
2137 /* Compact the GRF arrays. */
2138 int new_index = 0;
2139 for (unsigned i = 0; i < this->alloc.count; i++) {
2140 if (remap_table[i] == -1) {
2141 /* We just found an unused register. This means that we are
2142 * actually going to compact something.
2143 */
2144 progress = true;
2145 } else {
2146 remap_table[i] = new_index;
2147 alloc.sizes[new_index] = alloc.sizes[i];
2148 invalidate_live_intervals();
2149 ++new_index;
2150 }
2151 }
2152
2153 this->alloc.count = new_index;
2154
2155 /* Patch all the instructions to use the newly renumbered registers */
2156 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2157 if (inst->dst.file == GRF)
2158 inst->dst.reg = remap_table[inst->dst.reg];
2159
2160 for (int i = 0; i < inst->sources; i++) {
2161 if (inst->src[i].file == GRF)
2162 inst->src[i].reg = remap_table[inst->src[i].reg];
2163 }
2164 }
2165
2166 /* Patch all the references to delta_xy, since they're used in register
2167 * allocation. If they're unused, switch them to BAD_FILE so we don't
2168 * think some random VGRF is delta_xy.
2169 */
2170 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2171 if (delta_xy[i].file == GRF) {
2172 if (remap_table[delta_xy[i].reg] != -1) {
2173 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2174 } else {
2175 delta_xy[i].file = BAD_FILE;
2176 }
2177 }
2178 }
2179
2180 return progress;
2181 }
2182
2183 /*
2184 * Implements array access of uniforms by inserting a
2185 * PULL_CONSTANT_LOAD instruction.
2186 *
2187  * Unlike temporary GRF array access (which we don't support, due to
2188 * the difficulty of doing relative addressing on instruction
2189 * destinations), we could potentially do array access of uniforms
2190 * that were loaded in GRF space as push constants. In real-world
2191 * usage we've seen, though, the arrays being used are always larger
2192 * than we could load as push constants, so just always move all
2193 * uniform array access out to a pull constant buffer.
2194 */
2195 void
2196 fs_visitor::move_uniform_array_access_to_pull_constants()
2197 {
2198 if (dispatch_width != 8)
2199 return;
2200
2201 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2202 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2203
2204 /* Walk through and find array access of uniforms. Put a copy of that
2205 * uniform in the pull constant buffer.
2206 *
2207 * Note that we don't move constant-indexed accesses to arrays. No
2208 * testing has been done of the performance impact of this choice.
2209 */
2210 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2211 for (int i = 0 ; i < inst->sources; i++) {
2212 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2213 continue;
2214
2215 int uniform = inst->src[i].reg;
2216
2217 /* If this array isn't already present in the pull constant buffer,
2218 * add it.
2219 */
2220 if (pull_constant_loc[uniform] == -1) {
2221 const gl_constant_value **values = &stage_prog_data->param[uniform];
2222
2223 assert(param_size[uniform]);
2224
2225 for (int j = 0; j < param_size[uniform]; j++) {
2226 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2227
2228 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2229 values[j];
2230 }
2231 }
2232 }
2233 }
2234 }
2235
2236 /**
2237 * Assign UNIFORM file registers to either push constants or pull constants.
2238 *
2239 * We allow a fragment shader to have more than the specified minimum
2240 * maximum number of fragment shader uniform components (64). If
2241  * there are too many of these, they'd fill up all of the register space.
2242 * So, this will push some of them out to the pull constant buffer and
2243 * update the program to load them.
2244 */
2245 void
2246 fs_visitor::assign_constant_locations()
2247 {
2248 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2249 if (dispatch_width != 8)
2250 return;
2251
2252 /* Find which UNIFORM registers are still in use. */
2253 bool is_live[uniforms];
2254 for (unsigned int i = 0; i < uniforms; i++) {
2255 is_live[i] = false;
2256 }
2257
2258 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2259 for (int i = 0; i < inst->sources; i++) {
2260 if (inst->src[i].file != UNIFORM)
2261 continue;
2262
2263 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2264 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2265 is_live[constant_nr] = true;
2266 }
2267 }
2268
2269 /* Only allow 16 registers (128 uniform components) as push constants.
2270 *
2271 * Just demote the end of the list. We could probably do better
2272 * here, demoting things that are rarely used in the program first.
2273 *
2274 * If changing this value, note the limitation about total_regs in
2275 * brw_curbe.c.
2276 */
2277 unsigned int max_push_components = 16 * 8;
2278 unsigned int num_push_constants = 0;
2279
2280 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2281
2282 for (unsigned int i = 0; i < uniforms; i++) {
2283 if (!is_live[i] || pull_constant_loc[i] != -1) {
2284 /* This UNIFORM register is either dead, or has already been demoted
2285 * to a pull const. Mark it as no longer living in the param[] array.
2286 */
2287 push_constant_loc[i] = -1;
2288 continue;
2289 }
2290
2291 if (num_push_constants < max_push_components) {
2292        /* Retain as a push constant. Record the location in the param[]
2293 * array.
2294 */
2295 push_constant_loc[i] = num_push_constants++;
2296 } else {
2297 /* Demote to a pull constant. */
2298 push_constant_loc[i] = -1;
2299
2300 int pull_index = stage_prog_data->nr_pull_params++;
2301 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2302 pull_constant_loc[i] = pull_index;
2303 }
2304 }
2305
2306 stage_prog_data->nr_params = num_push_constants;
2307
2308 /* Up until now, the param[] array has been indexed by reg + reg_offset
2309 * of UNIFORM registers. Condense it to only contain the uniforms we
2310 * chose to upload as push constants.
2311 */
2312 for (unsigned int i = 0; i < uniforms; i++) {
2313 int remapped = push_constant_loc[i];
2314
2315 if (remapped == -1)
2316 continue;
2317
2318 assert(remapped <= (int)i);
2319 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2320 }
2321 }
2322
2323 /**
2324 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2325 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2326 */
2327 void
2328 fs_visitor::demote_pull_constants()
2329 {
2330 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2331 for (int i = 0; i < inst->sources; i++) {
2332 if (inst->src[i].file != UNIFORM)
2333 continue;
2334
2335 int pull_index;
2336 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2337 if (location >= uniforms) /* Out of bounds access */
2338 pull_index = -1;
2339 else
2340 pull_index = pull_constant_loc[location];
2341
2342 if (pull_index == -1)
2343 continue;
2344
2345        /* Set up the annotation tracking for newly generated instructions. */
2346 base_ir = inst->ir;
2347 current_annotation = inst->annotation;
2348
2349 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2350 fs_reg dst = vgrf(glsl_type::float_type);
2351
2352 /* Generate a pull load into dst. */
2353 if (inst->src[i].reladdr) {
2354 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2355 surf_index,
2356 *inst->src[i].reladdr,
2357 pull_index);
2358 inst->insert_before(block, &list);
2359 inst->src[i].reladdr = NULL;
2360 } else {
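            /* The pull constant load fetches an aligned vec4, so round the
             * byte offset down to a multiple of 16 and use set_smear() below
             * to pick out the component we actually want.
             */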
2361 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2362 fs_inst *pull =
2363 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2364 dst, surf_index, offset);
2365 inst->insert_before(block, pull);
2366 inst->src[i].set_smear(pull_index & 3);
2367 }
2368
2369 /* Rewrite the instruction to use the temporary VGRF. */
2370 inst->src[i].file = GRF;
2371 inst->src[i].reg = dst.reg;
2372 inst->src[i].reg_offset = 0;
2373 inst->src[i].width = dispatch_width;
2374 }
2375 }
2376 invalidate_live_intervals();
2377 }
2378
2379 bool
2380 fs_visitor::opt_algebraic()
2381 {
2382 bool progress = false;
2383
2384 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2385 switch (inst->opcode) {
2386 case BRW_OPCODE_MOV:
2387 if (inst->src[0].file != IMM)
2388 break;
2389
2390 if (inst->saturate) {
2391 if (inst->dst.type != inst->src[0].type)
2392 assert(!"unimplemented: saturate mixed types");
2393
2394 if (brw_saturate_immediate(inst->dst.type,
2395 &inst->src[0].fixed_hw_reg)) {
2396 inst->saturate = false;
2397 progress = true;
2398 }
2399 }
2400 break;
2401
2402 case BRW_OPCODE_MUL:
2403 if (inst->src[1].file != IMM)
2404 continue;
2405
2406 /* a * 1.0 = a */
2407 if (inst->src[1].is_one()) {
2408 inst->opcode = BRW_OPCODE_MOV;
2409 inst->src[1] = reg_undef;
2410 progress = true;
2411 break;
2412 }
2413
2414 /* a * -1.0 = -a */
2415 if (inst->src[1].is_negative_one()) {
2416 inst->opcode = BRW_OPCODE_MOV;
2417 inst->src[0].negate = !inst->src[0].negate;
2418 inst->src[1] = reg_undef;
2419 progress = true;
2420 break;
2421 }
2422
2423 /* a * 0.0 = 0.0 */
2424 if (inst->src[1].is_zero()) {
2425 inst->opcode = BRW_OPCODE_MOV;
2426 inst->src[0] = inst->src[1];
2427 inst->src[1] = reg_undef;
2428 progress = true;
2429 break;
2430 }
2431
2432 if (inst->src[0].file == IMM) {
2433 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2434 inst->opcode = BRW_OPCODE_MOV;
2435 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2436 inst->src[1] = reg_undef;
2437 progress = true;
2438 break;
2439 }
2440 break;
2441 case BRW_OPCODE_ADD:
2442 if (inst->src[1].file != IMM)
2443 continue;
2444
2445 /* a + 0.0 = a */
2446 if (inst->src[1].is_zero()) {
2447 inst->opcode = BRW_OPCODE_MOV;
2448 inst->src[1] = reg_undef;
2449 progress = true;
2450 break;
2451 }
2452
2453 if (inst->src[0].file == IMM) {
2454 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2455 inst->opcode = BRW_OPCODE_MOV;
2456 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2457 inst->src[1] = reg_undef;
2458 progress = true;
2459 break;
2460 }
2461 break;
2462 case BRW_OPCODE_OR:
2463 if (inst->src[0].equals(inst->src[1])) {
2464 inst->opcode = BRW_OPCODE_MOV;
2465 inst->src[1] = reg_undef;
2466 progress = true;
2467 break;
2468 }
2469 break;
2470 case BRW_OPCODE_LRP:
2471 if (inst->src[1].equals(inst->src[2])) {
2472 inst->opcode = BRW_OPCODE_MOV;
2473 inst->src[0] = inst->src[1];
2474 inst->src[1] = reg_undef;
2475 inst->src[2] = reg_undef;
2476 progress = true;
2477 break;
2478 }
2479 break;
2480 case BRW_OPCODE_CMP:
2481 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2482 inst->src[0].abs &&
2483 inst->src[0].negate &&
2484 inst->src[1].is_zero()) {
2485 inst->src[0].abs = false;
2486 inst->src[0].negate = false;
2487 inst->conditional_mod = BRW_CONDITIONAL_Z;
2488 progress = true;
2489 break;
2490 }
2491 break;
2492 case BRW_OPCODE_SEL:
2493 if (inst->src[0].equals(inst->src[1])) {
2494 inst->opcode = BRW_OPCODE_MOV;
2495 inst->src[1] = reg_undef;
2496 inst->predicate = BRW_PREDICATE_NONE;
2497 inst->predicate_inverse = false;
2498 progress = true;
2499 } else if (inst->saturate && inst->src[1].file == IMM) {
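         /* With saturation, a SEL.L/.LE whose immediate operand is >= 1.0
          * (or a SEL.G/.GE whose immediate is <= 0.0) clamps to the same
          * result as the other operand alone, so it reduces to a saturated
          * MOV.
          */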
2500 switch (inst->conditional_mod) {
2501 case BRW_CONDITIONAL_LE:
2502 case BRW_CONDITIONAL_L:
2503 switch (inst->src[1].type) {
2504 case BRW_REGISTER_TYPE_F:
2505 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2506 inst->opcode = BRW_OPCODE_MOV;
2507 inst->src[1] = reg_undef;
2508 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2509 progress = true;
2510 }
2511 break;
2512 default:
2513 break;
2514 }
2515 break;
2516 case BRW_CONDITIONAL_GE:
2517 case BRW_CONDITIONAL_G:
2518 switch (inst->src[1].type) {
2519 case BRW_REGISTER_TYPE_F:
2520 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2521 inst->opcode = BRW_OPCODE_MOV;
2522 inst->src[1] = reg_undef;
2523 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2524 progress = true;
2525 }
2526 break;
2527 default:
2528 break;
2529 }
2530 default:
2531 break;
2532 }
2533 }
2534 break;
2535 case BRW_OPCODE_MAD:
2536 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2537 inst->opcode = BRW_OPCODE_MOV;
2538 inst->src[1] = reg_undef;
2539 inst->src[2] = reg_undef;
2540 progress = true;
2541 } else if (inst->src[0].is_zero()) {
2542 inst->opcode = BRW_OPCODE_MUL;
2543 inst->src[0] = inst->src[2];
2544 inst->src[2] = reg_undef;
2545 progress = true;
2546 } else if (inst->src[1].is_one()) {
2547 inst->opcode = BRW_OPCODE_ADD;
2548 inst->src[1] = inst->src[2];
2549 inst->src[2] = reg_undef;
2550 progress = true;
2551 } else if (inst->src[2].is_one()) {
2552 inst->opcode = BRW_OPCODE_ADD;
2553 inst->src[2] = reg_undef;
2554 progress = true;
2555 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2556 inst->opcode = BRW_OPCODE_ADD;
2557 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2558 inst->src[2] = reg_undef;
2559 progress = true;
2560 }
2561 break;
2562 case SHADER_OPCODE_RCP: {
2563 fs_inst *prev = (fs_inst *)inst->prev;
2564 if (prev->opcode == SHADER_OPCODE_SQRT) {
2565 if (inst->src[0].equals(prev->dst)) {
2566 inst->opcode = SHADER_OPCODE_RSQ;
2567 inst->src[0] = prev->src[0];
2568 progress = true;
2569 }
2570 }
2571 break;
2572 }
2573 case SHADER_OPCODE_BROADCAST:
2574 if (is_uniform(inst->src[0])) {
2575 inst->opcode = BRW_OPCODE_MOV;
2576 inst->sources = 1;
2577 inst->force_writemask_all = true;
2578 progress = true;
2579 } else if (inst->src[1].file == IMM) {
2580 inst->opcode = BRW_OPCODE_MOV;
2581 inst->src[0] = component(inst->src[0],
2582 inst->src[1].fixed_hw_reg.dw1.ud);
2583 inst->sources = 1;
2584 inst->force_writemask_all = true;
2585 progress = true;
2586 }
2587 break;
2588
2589 default:
2590 break;
2591 }
2592
2593 /* Swap if src[0] is immediate. */
2594 if (progress && inst->is_commutative()) {
2595 if (inst->src[0].file == IMM) {
2596 fs_reg tmp = inst->src[1];
2597 inst->src[1] = inst->src[0];
2598 inst->src[0] = tmp;
2599 }
2600 }
2601 }
2602 return progress;
2603 }
2604
2605 /**
2606 * Optimize sample messages that have constant zero values for the trailing
2607 * texture coordinates. We can just reduce the message length for these
2608 * instructions instead of reserving a register for it. Trailing parameters
2609 * that aren't sent default to zero anyway. This will cause the dead code
2610 * eliminator to remove the MOV instruction that would otherwise be emitted to
2611 * set up the zero value.
2612 */
2613 bool
2614 fs_visitor::opt_zero_samples()
2615 {
2616 /* Gen4 infers the texturing opcode based on the message length so we can't
2617 * change it.
2618 */
2619 if (devinfo->gen < 5)
2620 return false;
2621
2622 bool progress = false;
2623
2624 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2625 if (!inst->is_tex())
2626 continue;
2627
2628 fs_inst *load_payload = (fs_inst *) inst->prev;
2629
2630 if (load_payload->is_head_sentinel() ||
2631 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2632 continue;
2633
2634      /* We don't want to remove the message header. Removing all of the
2635       * parameters is also avoided because it seems to cause a GPU hang,
2636       * though I can't find any documentation indicating that this is expected.
2637 */
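      /* The LOAD_PAYLOAD sources are the header registers (one register
       * each) followed by the parameters (dispatch_width / 8 registers
       * each), so the source index of the last parameter is
       * (mlen - header_size) / (dispatch_width / 8) + header_size - 1.
       */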
2638 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2639 load_payload->src[(inst->mlen - inst->header_size) /
2640 (dispatch_width / 8) +
2641 inst->header_size - 1].is_zero()) {
2642 inst->mlen -= dispatch_width / 8;
2643 progress = true;
2644 }
2645 }
2646
2647 if (progress)
2648 invalidate_live_intervals();
2649
2650 return progress;
2651 }
2652
2653 /**
2654 * Optimize sample messages which are followed by the final RT write.
2655 *
2656  * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2657 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2658 * final texturing results copied to the framebuffer write payload and modify
2659 * them to write to the framebuffer directly.
2660 */
2661 bool
2662 fs_visitor::opt_sampler_eot()
2663 {
2664 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2665
2666 if (stage != MESA_SHADER_FRAGMENT)
2667 return false;
2668
2669 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2670 return false;
2671
2672 /* FINISHME: It should be possible to implement this optimization when there
2673 * are multiple drawbuffers.
2674 */
2675 if (key->nr_color_regions != 1)
2676 return false;
2677
2678 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2679 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2680 assert(fb_write->eot);
2681 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2682
2683 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2684
2685 /* There wasn't one; nothing to do. */
2686 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2687 return false;
2688
2689 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2690 * It's very likely to be the previous instruction.
2691 */
2692 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2693 if (load_payload->is_head_sentinel() ||
2694 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2695 return false;
2696
2697 assert(!tex_inst->eot); /* We can't get here twice */
2698 assert((tex_inst->offset & (0xff << 24)) == 0);
2699
2700 tex_inst->offset |= fb_write->target << 24;
2701 tex_inst->eot = true;
2702 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2703
2704 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2705 * to create a new LOAD_PAYLOAD command with the same sources and a space
2706 * saved for the header. Using a new destination register not only makes sure
2707   * we have enough space, but it also lets the dead code eliminator kill
2708   * the instruction that this will replace.
2709 */
2710 if (tex_inst->header_size != 0)
2711 return true;
2712
2713 fs_reg send_header = vgrf(load_payload->sources + 1);
2714 fs_reg *new_sources =
2715 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2716
2717 new_sources[0] = fs_reg();
2718 for (int i = 0; i < load_payload->sources; i++)
2719 new_sources[i+1] = load_payload->src[i];
2720
2721  /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2722   * requires a lot of information about the sources to figure out how many
2723   * registers need to be used. At this stage in the optimization pipeline
2724   * (after copy propagation), the sources may no longer be the GRFs that
2725   * LOAD_PAYLOAD requires. Therefore, we need to emit the instruction
2726   * manually.
2727   */
2728 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2729 load_payload->exec_size,
2730 send_header,
2731 new_sources,
2732 load_payload->sources + 1);
2733
2734 new_load_payload->regs_written = load_payload->regs_written + 1;
2735 tex_inst->mlen++;
2736 tex_inst->header_size = 1;
2737 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2738 tex_inst->src[0] = send_header;
2739 tex_inst->dst = reg_null_ud;
2740
2741 return true;
2742 }
2743
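/**
 * Assign a fresh virtual GRF to each destination that is completely
 * overwritten outside of control flow, so that logically independent values
 * written to the same VGRF stop sharing a register. Splitting these webs
 * shortens live ranges, which tends to help the later optimization and
 * register allocation passes.
 */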
2744 bool
2745 fs_visitor::opt_register_renaming()
2746 {
2747 bool progress = false;
2748 int depth = 0;
2749
2750 int remap[alloc.count];
2751 memset(remap, -1, sizeof(int) * alloc.count);
2752
2753 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2754 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2755 depth++;
2756 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2757 inst->opcode == BRW_OPCODE_WHILE) {
2758 depth--;
2759 }
2760
2761 /* Rewrite instruction sources. */
2762 for (int i = 0; i < inst->sources; i++) {
2763 if (inst->src[i].file == GRF &&
2764 remap[inst->src[i].reg] != -1 &&
2765 remap[inst->src[i].reg] != inst->src[i].reg) {
2766 inst->src[i].reg = remap[inst->src[i].reg];
2767 progress = true;
2768 }
2769 }
2770
2771 const int dst = inst->dst.reg;
2772
2773 if (depth == 0 &&
2774 inst->dst.file == GRF &&
2775 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2776 !inst->is_partial_write()) {
2777 if (remap[dst] == -1) {
2778 remap[dst] = dst;
2779 } else {
2780 remap[dst] = alloc.allocate(inst->dst.width / 8);
2781 inst->dst.reg = remap[dst];
2782 progress = true;
2783 }
2784 } else if (inst->dst.file == GRF &&
2785 remap[dst] != -1 &&
2786 remap[dst] != dst) {
2787 inst->dst.reg = remap[dst];
2788 progress = true;
2789 }
2790 }
2791
2792 if (progress) {
2793 invalidate_live_intervals();
2794
2795 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2796 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2797 delta_xy[i].reg = remap[delta_xy[i].reg];
2798 }
2799 }
2800 }
2801
2802 return progress;
2803 }
2804
2805 /**
2806 * Remove redundant or useless discard jumps.
2807 *
2808 * For example, we can eliminate jumps in the following sequence:
2809 *
2810 * discard-jump (redundant with the next jump)
2811 * discard-jump (useless; jumps to the next instruction)
2812 * placeholder-halt
2813 */
2814 bool
2815 fs_visitor::opt_redundant_discard_jumps()
2816 {
2817 bool progress = false;
2818
2819 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2820
2821 fs_inst *placeholder_halt = NULL;
2822 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2823 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2824 placeholder_halt = inst;
2825 break;
2826 }
2827 }
2828
2829 if (!placeholder_halt)
2830 return false;
2831
2832   /* Delete any discard jumps immediately before the placeholder halt. */
2833 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2834 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2835 prev = (fs_inst *) placeholder_halt->prev) {
2836 prev->remove(last_bblock);
2837 progress = true;
2838 }
2839
2840 if (progress)
2841 invalidate_live_intervals();
2842
2843 return progress;
2844 }
2845
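/**
 * Look for MOVs from a GRF into an MRF and try to rewrite the instruction
 * that computed the GRF so that it writes directly into the MRF, removing
 * the intermediate MOV. This is only relevant on Gen6 and earlier, which
 * still use MRFs for message payloads.
 */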
2846 bool
2847 fs_visitor::compute_to_mrf()
2848 {
2849 bool progress = false;
2850 int next_ip = 0;
2851
2852 /* No MRFs on Gen >= 7. */
2853 if (devinfo->gen >= 7)
2854 return false;
2855
2856 calculate_live_intervals();
2857
2858 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2859 int ip = next_ip;
2860 next_ip++;
2861
2862 if (inst->opcode != BRW_OPCODE_MOV ||
2863 inst->is_partial_write() ||
2864 inst->dst.file != MRF || inst->src[0].file != GRF ||
2865 inst->dst.type != inst->src[0].type ||
2866 inst->src[0].abs || inst->src[0].negate ||
2867 !inst->src[0].is_contiguous() ||
2868 inst->src[0].subreg_offset)
2869 continue;
2870
2871 /* Work out which hardware MRF registers are written by this
2872 * instruction.
2873 */
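      /* mrf_low and mrf_high are the two MRF registers a write can touch:
       * with COMPR4 addressing the two halves land in m and m+4, while a
       * regular SIMD16 write covers m and m+1.
       */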
2874 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2875 int mrf_high;
2876 if (inst->dst.reg & BRW_MRF_COMPR4) {
2877 mrf_high = mrf_low + 4;
2878 } else if (inst->exec_size == 16) {
2879 mrf_high = mrf_low + 1;
2880 } else {
2881 mrf_high = mrf_low;
2882 }
2883
2884 /* Can't compute-to-MRF this GRF if someone else was going to
2885 * read it later.
2886 */
2887 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2888 continue;
2889
2890      /* Found a move of a GRF to an MRF. Let's see if we can rewrite the
2891       * instruction that produced this GRF so it writes into the MRF instead.
2892       */
2893 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2894 if (scan_inst->dst.file == GRF &&
2895 scan_inst->dst.reg == inst->src[0].reg) {
2896         /* Found the last instruction to write the register we want to
2897          * turn into a compute-to-MRF.
2898          */
2899
2900 /* If this one instruction didn't populate all the
2901 * channels, bail. We might be able to rewrite everything
2902 * that writes that reg, but it would require smarter
2903 * tracking to delay the rewriting until complete success.
2904 */
2905 if (scan_inst->is_partial_write())
2906 break;
2907
2908 /* Things returning more than one register would need us to
2909 * understand coalescing out more than one MOV at a time.
2910 */
2911 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2912 break;
2913
2914         /* SEND instructions can't have an MRF as a destination. */
2915 if (scan_inst->mlen)
2916 break;
2917
2918 if (devinfo->gen == 6) {
2919            /* Gen6 math instructions must have a GRF destination, so no
2920             * compute-to-MRF for them.
2921             */
2922 if (scan_inst->is_math()) {
2923 break;
2924 }
2925 }
2926
2927 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2928 /* Found the creator of our MRF's source value. */
2929 scan_inst->dst.file = MRF;
2930 scan_inst->dst.reg = inst->dst.reg;
2931 scan_inst->saturate |= inst->saturate;
2932 inst->remove(block);
2933 progress = true;
2934 }
2935 break;
2936 }
2937
2938 /* We don't handle control flow here. Most computation of
2939       * values that end up in MRFs happens shortly before the MRF
2940 * write anyway.
2941 */
2942 if (block->start() == scan_inst)
2943 break;
2944
2945 /* You can't read from an MRF, so if someone else reads our
2946 * MRF's source GRF that we wanted to rewrite, that stops us.
2947 */
2948 bool interfered = false;
2949 for (int i = 0; i < scan_inst->sources; i++) {
2950 if (scan_inst->src[i].file == GRF &&
2951 scan_inst->src[i].reg == inst->src[0].reg &&
2952 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2953 interfered = true;
2954 }
2955 }
2956 if (interfered)
2957 break;
2958
2959 if (scan_inst->dst.file == MRF) {
2960 /* If somebody else writes our MRF here, we can't
2961 * compute-to-MRF before that.
2962 */
2963 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2964 int scan_mrf_high;
2965
2966 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2967 scan_mrf_high = scan_mrf_low + 4;
2968 } else if (scan_inst->exec_size == 16) {
2969 scan_mrf_high = scan_mrf_low + 1;
2970 } else {
2971 scan_mrf_high = scan_mrf_low;
2972 }
2973
2974 if (mrf_low == scan_mrf_low ||
2975 mrf_low == scan_mrf_high ||
2976 mrf_high == scan_mrf_low ||
2977 mrf_high == scan_mrf_high) {
2978 break;
2979 }
2980 }
2981
2982 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2983 /* Found a SEND instruction, which means that there are
2984 * live values in MRFs from base_mrf to base_mrf +
2985 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2986 * above it.
2987 */
2988 if (mrf_low >= scan_inst->base_mrf &&
2989 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2990 break;
2991 }
2992 if (mrf_high >= scan_inst->base_mrf &&
2993 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2994 break;
2995 }
2996 }
2997 }
2998 }
2999
3000 if (progress)
3001 invalidate_live_intervals();
3002
3003 return progress;
3004 }
3005
3006 /**
3007 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
3008 * flow. We could probably do better here with some form of divergence
3009 * analysis.
3010 */
3011 bool
3012 fs_visitor::eliminate_find_live_channel()
3013 {
3014 bool progress = false;
3015 unsigned depth = 0;
3016
3017 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3018 switch (inst->opcode) {
3019 case BRW_OPCODE_IF:
3020 case BRW_OPCODE_DO:
3021 depth++;
3022 break;
3023
3024 case BRW_OPCODE_ENDIF:
3025 case BRW_OPCODE_WHILE:
3026 depth--;
3027 break;
3028
3029 case FS_OPCODE_DISCARD_JUMP:
3030 /* This can potentially make control flow non-uniform until the end
3031 * of the program.
3032 */
3033 return progress;
3034
3035 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
3036 if (depth == 0) {
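            /* Outside of control flow every channel that was enabled at
             * dispatch is still live, so channel 0 is always a valid answer
             * and the instruction reduces to a MOV of 0.
             */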
3037 inst->opcode = BRW_OPCODE_MOV;
3038 inst->src[0] = fs_reg(0);
3039 inst->sources = 1;
3040 inst->force_writemask_all = true;
3041 progress = true;
3042 }
3043 break;
3044
3045 default:
3046 break;
3047 }
3048 }
3049
3050 return progress;
3051 }
3052
3053 /**
3054 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
3055 * instructions to FS_OPCODE_REP_FB_WRITE.
3056 */
3057 void
3058 fs_visitor::emit_repclear_shader()
3059 {
3060 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3061 int base_mrf = 1;
3062 int color_mrf = base_mrf + 2;
3063
3064 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
3065 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
3066 mov->force_writemask_all = true;
3067
3068 fs_inst *write;
3069 if (key->nr_color_regions == 1) {
3070 write = emit(FS_OPCODE_REP_FB_WRITE);
3071 write->saturate = key->clamp_fragment_color;
3072 write->base_mrf = color_mrf;
3073 write->target = 0;
3074 write->header_size = 0;
3075 write->mlen = 1;
3076 } else {
3077 assume(key->nr_color_regions > 0);
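      /* With multiple color regions each write includes the two-register
       * message header (so the write can be directed at its particular
       * render target) ahead of the single register of replicated color
       * data, giving an mlen of 3 instead of 1.
       */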
3078 for (int i = 0; i < key->nr_color_regions; ++i) {
3079 write = emit(FS_OPCODE_REP_FB_WRITE);
3080 write->saturate = key->clamp_fragment_color;
3081 write->base_mrf = base_mrf;
3082 write->target = i;
3083 write->header_size = 2;
3084 write->mlen = 3;
3085 }
3086 }
3087 write->eot = true;
3088
3089 calculate_cfg();
3090
3091 assign_constant_locations();
3092 assign_curb_setup();
3093
3094 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
3095 assert(mov->src[0].file == HW_REG);
3096 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
3097 }
3098
3099 /**
3100 * Walks through basic blocks, looking for repeated MRF writes and
3101 * removing the later ones.
3102 */
3103 bool
3104 fs_visitor::remove_duplicate_mrf_writes()
3105 {
3106 fs_inst *last_mrf_move[16];
3107 bool progress = false;
3108
3109 /* Need to update the MRF tracking for compressed instructions. */
3110 if (dispatch_width == 16)
3111 return false;
3112
3113 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3114
3115 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3116 if (inst->is_control_flow()) {
3117 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3118 }
3119
3120 if (inst->opcode == BRW_OPCODE_MOV &&
3121 inst->dst.file == MRF) {
3122 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3123 if (prev_inst && inst->equals(prev_inst)) {
3124 inst->remove(block);
3125 progress = true;
3126 continue;
3127 }
3128 }
3129
3130 /* Clear out the last-write records for MRFs that were overwritten. */
3131 if (inst->dst.file == MRF) {
3132 last_mrf_move[inst->dst.reg] = NULL;
3133 }
3134
3135 if (inst->mlen > 0 && inst->base_mrf != -1) {
3136 /* Found a SEND instruction, which will include two or fewer
3137 * implied MRF writes. We could do better here.
3138 */
3139 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3140 last_mrf_move[inst->base_mrf + i] = NULL;
3141 }
3142 }
3143
3144 /* Clear out any MRF move records whose sources got overwritten. */
3145 if (inst->dst.file == GRF) {
3146 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3147 if (last_mrf_move[i] &&
3148 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3149 last_mrf_move[i] = NULL;
3150 }
3151 }
3152 }
3153
3154 if (inst->opcode == BRW_OPCODE_MOV &&
3155 inst->dst.file == MRF &&
3156 inst->src[0].file == GRF &&
3157 !inst->is_partial_write()) {
3158 last_mrf_move[inst->dst.reg] = inst;
3159 }
3160 }
3161
3162 if (progress)
3163 invalidate_live_intervals();
3164
3165 return progress;
3166 }
3167
3168 static void
3169 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3170 {
3171 /* Clear the flag for registers that actually got read (as expected). */
3172 for (int i = 0; i < inst->sources; i++) {
3173 int grf;
3174 if (inst->src[i].file == GRF) {
3175 grf = inst->src[i].reg;
3176 } else if (inst->src[i].file == HW_REG &&
3177 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3178 grf = inst->src[i].fixed_hw_reg.nr;
3179 } else {
3180 continue;
3181 }
3182
3183 if (grf >= first_grf &&
3184 grf < first_grf + grf_len) {
3185 deps[grf - first_grf] = false;
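         /* A SIMD16 source spans two GRFs, so clear the dependency on the
          * following register as well.
          */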
3186 if (inst->exec_size == 16)
3187 deps[grf - first_grf + 1] = false;
3188 }
3189 }
3190 }
3191
3192 /**
3193 * Implements this workaround for the original 965:
3194 *
3195 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3196 * check for post destination dependencies on this instruction, software
3197 * must ensure that there is no destination hazard for the case of ‘write
3198 * followed by a posted write’ shown in the following example.
3199 *
3200 * 1. mov r3 0
3201 * 2. send r3.xy <rest of send instruction>
3202 * 3. mov r2 r3
3203 *
3204 * Due to no post-destination dependency check on the ‘send’, the above
3205 * code sequence could have two instructions (1 and 2) in flight at the
3206 * same time that both consider ‘r3’ as the target of their final writes.
3207 */
3208 void
3209 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3210 fs_inst *inst)
3211 {
3212 int write_len = inst->regs_written;
3213 int first_write_grf = inst->dst.reg;
3214 bool needs_dep[BRW_MAX_MRF];
3215 assert(write_len < (int)sizeof(needs_dep) - 1);
3216
3217 memset(needs_dep, false, sizeof(needs_dep));
3218 memset(needs_dep, true, write_len);
3219
3220 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3221
3222   /* Walk backwards looking for writes to registers we're writing that
3223    * haven't been read since being written. If we hit the start of the program,
3224 * we assume that there are no outstanding dependencies on entry to the
3225 * program.
3226 */
3227 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3228 /* If we hit control flow, assume that there *are* outstanding
3229 * dependencies, and force their cleanup before our instruction.
3230 */
3231 if (block->start() == scan_inst) {
3232 for (int i = 0; i < write_len; i++) {
3233 if (needs_dep[i]) {
3234 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3235 }
3236 }
3237 return;
3238 }
3239
3240      /* We insert our reads as late as possible, on the assumption that any
3241       * instruction other than a MOV that might have left us an outstanding
3242       * dependency has more latency than a MOV.
3243 */
3244 if (scan_inst->dst.file == GRF) {
3245 for (int i = 0; i < scan_inst->regs_written; i++) {
3246 int reg = scan_inst->dst.reg + i;
3247
3248 if (reg >= first_write_grf &&
3249 reg < first_write_grf + write_len &&
3250 needs_dep[reg - first_write_grf]) {
3251 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3252 needs_dep[reg - first_write_grf] = false;
3253 if (scan_inst->exec_size == 16)
3254 needs_dep[reg - first_write_grf + 1] = false;
3255 }
3256 }
3257 }
3258
3259 /* Clear the flag for registers that actually got read (as expected). */
3260 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3261
3262 /* Continue the loop only if we haven't resolved all the dependencies */
3263 int i;
3264 for (i = 0; i < write_len; i++) {
3265 if (needs_dep[i])
3266 break;
3267 }
3268 if (i == write_len)
3269 return;
3270 }
3271 }
3272
3273 /**
3274 * Implements this workaround for the original 965:
3275 *
3276 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3277 * used as a destination register until after it has been sourced by an
3278 * instruction with a different destination register.
3279 */
3280 void
3281 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3282 {
3283 int write_len = inst->regs_written;
3284 int first_write_grf = inst->dst.reg;
3285 bool needs_dep[BRW_MAX_MRF];
3286 assert(write_len < (int)sizeof(needs_dep) - 1);
3287
3288 memset(needs_dep, false, sizeof(needs_dep));
3289 memset(needs_dep, true, write_len);
3290 /* Walk forwards looking for writes to registers we're writing which aren't
3291 * read before being written.
3292 */
3293 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3294 /* If we hit control flow, force resolve all remaining dependencies. */
3295 if (block->end() == scan_inst) {
3296 for (int i = 0; i < write_len; i++) {
3297 if (needs_dep[i])
3298 scan_inst->insert_before(block,
3299 DEP_RESOLVE_MOV(first_write_grf + i));
3300 }
3301 return;
3302 }
3303
3304 /* Clear the flag for registers that actually got read (as expected). */
3305 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3306
3307 /* We insert our reads as late as possible since they're reading the
3308 * result of a SEND, which has massive latency.
3309 */
3310 if (scan_inst->dst.file == GRF &&
3311 scan_inst->dst.reg >= first_write_grf &&
3312 scan_inst->dst.reg < first_write_grf + write_len &&
3313 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3314 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3315 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3316 }
3317
3318 /* Continue the loop only if we haven't resolved all the dependencies */
3319 int i;
3320 for (i = 0; i < write_len; i++) {
3321 if (needs_dep[i])
3322 break;
3323 }
3324 if (i == write_len)
3325 return;
3326 }
3327 }
3328
3329 void
3330 fs_visitor::insert_gen4_send_dependency_workarounds()
3331 {
3332 if (devinfo->gen != 4 || devinfo->is_g4x)
3333 return;
3334
3335 bool progress = false;
3336
3337 /* Note that we're done with register allocation, so GRF fs_regs always
3338 * have a .reg_offset of 0.
3339 */
3340
3341 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3342 if (inst->mlen != 0 && inst->dst.file == GRF) {
3343 insert_gen4_pre_send_dependency_workarounds(block, inst);
3344 insert_gen4_post_send_dependency_workarounds(block, inst);
3345 progress = true;
3346 }
3347 }
3348
3349 if (progress)
3350 invalidate_live_intervals();
3351 }
3352
3353 /**
3354 * Turns the generic expression-style uniform pull constant load instruction
3355 * into a hardware-specific series of instructions for loading a pull
3356 * constant.
3357 *
3358 * The expression style allows the CSE pass before this to optimize out
3359 * repeated loads from the same offset, and gives the pre-register-allocation
3360 * scheduling full flexibility, while the conversion to native instructions
3361 * allows the post-register-allocation scheduler the best information
3362 * possible.
3363 *
3364 * Note that execution masking for setting up pull constant loads is special:
3365 * the channels that need to be written are unrelated to the current execution
3366 * mask, since a later instruction will use one of the result channels as a
3367 * source operand for all 8 or 16 of its channels.
3368 */
3369 void
3370 fs_visitor::lower_uniform_pull_constant_loads()
3371 {
3372 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3373 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3374 continue;
3375
3376 if (devinfo->gen >= 7) {
3377      /* Up to this point, the offset arg has been a vec4-aligned byte offset.
3378       * We need to turn it into a dword offset.
3379 */
3380 fs_reg const_offset_reg = inst->src[1];
3381 assert(const_offset_reg.file == IMM &&
3382 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3383 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3384 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3385
3386 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3387 * Reserve space for the register.
3388 */
3389 if (devinfo->gen >= 9) {
3390 payload.reg_offset++;
3391 alloc.sizes[payload.reg] = 2;
3392 }
3393
3394 /* This is actually going to be a MOV, but since only the first dword
3395 * is accessed, we have a special opcode to do just that one. Note
3396 * that this needs to be an operation that will be considered a def
3397 * by live variable analysis, or register allocation will explode.
3398 */
3399 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3400 8, payload, const_offset_reg);
3401 setup->force_writemask_all = true;
3402
3403 setup->ir = inst->ir;
3404 setup->annotation = inst->annotation;
3405 inst->insert_before(block, setup);
3406
3407 /* Similarly, this will only populate the first 4 channels of the
3408 * result register (since we only use smear values from 0-3), but we
3409 * don't tell the optimizer.
3410 */
3411 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3412 inst->src[1] = payload;
3413
3414 invalidate_live_intervals();
3415 } else {
3416 /* Before register allocation, we didn't tell the scheduler about the
3417 * MRF we use. We know it's safe to use this MRF because nothing
3418 * else does except for register spill/unspill, which generates and
3419 * uses its MRF within a single IR instruction.
3420 */
3421 inst->base_mrf = 14;
3422 inst->mlen = 1;
3423 }
3424 }
3425 }
3426
3427 bool
3428 fs_visitor::lower_load_payload()
3429 {
3430 bool progress = false;
3431
3432 int vgrf_to_reg[alloc.count];
3433 int reg_count = 0;
3434 for (unsigned i = 0; i < alloc.count; ++i) {
3435 vgrf_to_reg[i] = reg_count;
3436 reg_count += alloc.sizes[i];
3437 }
3438
3439 struct {
3440 bool written:1; /* Whether this register has ever been written */
3441 bool force_writemask_all:1;
3442 bool force_sechalf:1;
3443 } metadata[reg_count];
3444 memset(metadata, 0, sizeof(metadata));
3445
3446 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3447 if (inst->dst.file == GRF) {
3448 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3449 bool force_sechalf = inst->force_sechalf &&
3450 !inst->force_writemask_all;
3451 bool toggle_sechalf = inst->dst.width == 16 &&
3452 type_sz(inst->dst.type) == 4 &&
3453 !inst->force_writemask_all;
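         /* A 16-wide write of 32-bit data spans two registers, with the
          * second register holding the second half of the channels, so the
          * sechalf flag alternates for each register written.
          */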
3454 for (int i = 0; i < inst->regs_written; ++i) {
3455 metadata[dst_reg + i].written = true;
3456 metadata[dst_reg + i].force_sechalf = force_sechalf;
3457 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3458 force_sechalf = (toggle_sechalf != force_sechalf);
3459 }
3460 }
3461
3462 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3463 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3464 fs_reg dst = inst->dst;
3465
3466 for (int i = 0; i < inst->sources; i++) {
3467 dst.width = inst->src[i].effective_width;
3468 dst.type = inst->src[i].type;
3469
3470 if (inst->src[i].file == BAD_FILE) {
3471            /* Do nothing, but still advance the destination as normal below. */
3472 } else if (dst.file == MRF &&
3473 dst.width == 8 &&
3474 devinfo->has_compr4 &&
3475 i + 4 < inst->sources &&
3476 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
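               /* Source i would land in MRF m and source i+4 in m+4, which is
                * exactly what a single SIMD16 MOV with COMPR4 addressing
                * writes, so emit one compressed MOV for the pair.
                */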
3477 fs_reg compr4_dst = dst;
3478 compr4_dst.reg += BRW_MRF_COMPR4;
3479 compr4_dst.width = 16;
3480 fs_reg compr4_src = inst->src[i];
3481 compr4_src.width = 16;
3482 fs_inst *mov = MOV(compr4_dst, compr4_src);
3483 mov->force_writemask_all = true;
3484 inst->insert_before(block, mov);
3485 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3486 inst->src[i + 4].file = BAD_FILE;
3487 } else {
3488 fs_inst *mov = MOV(dst, inst->src[i]);
3489 if (inst->src[i].file == GRF) {
3490 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3491 inst->src[i].reg_offset;
3492 mov->force_sechalf = metadata[src_reg].force_sechalf;
3493 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3494 } else {
3495 /* We don't have any useful metadata for immediates or
3496 * uniforms. Assume that any of the channels of the
3497 * destination may be used.
3498 */
3499 assert(inst->src[i].file == IMM ||
3500 inst->src[i].file == UNIFORM);
3501 mov->force_writemask_all = true;
3502 }
3503
3504 if (dst.file == GRF) {
3505 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3506 const bool force_writemask = mov->force_writemask_all;
3507 metadata[dst_reg].force_writemask_all = force_writemask;
3508 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3509 if (dst.width * type_sz(dst.type) > 32) {
3510 assert(!mov->force_sechalf);
3511 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3512 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3513 }
3514 }
3515
3516 inst->insert_before(block, mov);
3517 }
3518
3519 dst = offset(dst, 1);
3520 }
3521
3522 inst->remove(block);
3523 progress = true;
3524 }
3525 }
3526
3527 if (progress)
3528 invalidate_live_intervals();
3529
3530 return progress;
3531 }
3532
3533 void
3534 fs_visitor::dump_instructions()
3535 {
3536 dump_instructions(NULL);
3537 }
3538
3539 void
3540 fs_visitor::dump_instructions(const char *name)
3541 {
3542 FILE *file = stderr;
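   /* When running as root, don't write the dump to a named file (presumably
    * to avoid creating root-owned files from a privileged process); fall
    * back to stderr instead.
    */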
3543 if (name && geteuid() != 0) {
3544 file = fopen(name, "w");
3545 if (!file)
3546 file = stderr;
3547 }
3548
3549 if (cfg) {
3550 calculate_register_pressure();
3551 int ip = 0, max_pressure = 0;
3552 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3553 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3554 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3555 dump_instruction(inst, file);
3556 ip++;
3557 }
3558 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3559 } else {
3560 int ip = 0;
3561 foreach_in_list(backend_instruction, inst, &instructions) {
3562 fprintf(file, "%4d: ", ip++);
3563 dump_instruction(inst, file);
3564 }
3565 }
3566
3567 if (file != stderr) {
3568 fclose(file);
3569 }
3570 }
3571
3572 void
3573 fs_visitor::dump_instruction(backend_instruction *be_inst)
3574 {
3575 dump_instruction(be_inst, stderr);
3576 }
3577
3578 void
3579 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3580 {
3581 fs_inst *inst = (fs_inst *)be_inst;
3582
3583 if (inst->predicate) {
3584 fprintf(file, "(%cf0.%d) ",
3585 inst->predicate_inverse ? '-' : '+',
3586 inst->flag_subreg);
3587 }
3588
3589 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3590 if (inst->saturate)
3591 fprintf(file, ".sat");
3592 if (inst->conditional_mod) {
3593 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3594 if (!inst->predicate &&
3595 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3596 inst->opcode != BRW_OPCODE_IF &&
3597 inst->opcode != BRW_OPCODE_WHILE))) {
3598 fprintf(file, ".f0.%d", inst->flag_subreg);
3599 }
3600 }
3601 fprintf(file, "(%d) ", inst->exec_size);
3602
3603
3604 switch (inst->dst.file) {
3605 case GRF:
3606 fprintf(file, "vgrf%d", inst->dst.reg);
3607 if (inst->dst.width != dispatch_width)
3608 fprintf(file, "@%d", inst->dst.width);
3609 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3610 inst->dst.subreg_offset)
3611 fprintf(file, "+%d.%d",
3612 inst->dst.reg_offset, inst->dst.subreg_offset);
3613 break;
3614 case MRF:
3615 fprintf(file, "m%d", inst->dst.reg);
3616 break;
3617 case BAD_FILE:
3618 fprintf(file, "(null)");
3619 break;
3620 case UNIFORM:
3621 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3622 break;
3623 case ATTR:
3624 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3625 break;
3626 case HW_REG:
3627 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3628 switch (inst->dst.fixed_hw_reg.nr) {
3629 case BRW_ARF_NULL:
3630 fprintf(file, "null");
3631 break;
3632 case BRW_ARF_ADDRESS:
3633 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3634 break;
3635 case BRW_ARF_ACCUMULATOR:
3636 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3637 break;
3638 case BRW_ARF_FLAG:
3639 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3640 inst->dst.fixed_hw_reg.subnr);
3641 break;
3642 default:
3643 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3644 inst->dst.fixed_hw_reg.subnr);
3645 break;
3646 }
3647 } else {
3648 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3649 }
3650 if (inst->dst.fixed_hw_reg.subnr)
3651 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3652 break;
3653 default:
3654 fprintf(file, "???");
3655 break;
3656 }
3657 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3658
3659 for (int i = 0; i < inst->sources; i++) {
3660 if (inst->src[i].negate)
3661 fprintf(file, "-");
3662 if (inst->src[i].abs)
3663 fprintf(file, "|");
3664 switch (inst->src[i].file) {
3665 case GRF:
3666 fprintf(file, "vgrf%d", inst->src[i].reg);
3667 if (inst->src[i].width != dispatch_width)
3668 fprintf(file, "@%d", inst->src[i].width);
3669 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3670 inst->src[i].subreg_offset)
3671 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3672 inst->src[i].subreg_offset);
3673 break;
3674 case MRF:
3675 fprintf(file, "***m%d***", inst->src[i].reg);
3676 break;
3677 case ATTR:
3678 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3679 break;
3680 case UNIFORM:
3681 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3682 if (inst->src[i].reladdr) {
3683 fprintf(file, "+reladdr");
3684 } else if (inst->src[i].subreg_offset) {
3685 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3686 inst->src[i].subreg_offset);
3687 }
3688 break;
3689 case BAD_FILE:
3690 fprintf(file, "(null)");
3691 break;
3692 case IMM:
3693 switch (inst->src[i].type) {
3694 case BRW_REGISTER_TYPE_F:
3695 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3696 break;
3697 case BRW_REGISTER_TYPE_W:
3698 case BRW_REGISTER_TYPE_D:
3699 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3700 break;
3701 case BRW_REGISTER_TYPE_UW:
3702 case BRW_REGISTER_TYPE_UD:
3703 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3704 break;
3705 case BRW_REGISTER_TYPE_VF:
3706 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3707 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3708 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3709 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3710 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3711 break;
3712 default:
3713 fprintf(file, "???");
3714 break;
3715 }
3716 break;
3717 case HW_REG:
3718 if (inst->src[i].fixed_hw_reg.negate)
3719 fprintf(file, "-");
3720 if (inst->src[i].fixed_hw_reg.abs)
3721 fprintf(file, "|");
3722 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3723 switch (inst->src[i].fixed_hw_reg.nr) {
3724 case BRW_ARF_NULL:
3725 fprintf(file, "null");
3726 break;
3727 case BRW_ARF_ADDRESS:
3728 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3729 break;
3730 case BRW_ARF_ACCUMULATOR:
3731 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3732 break;
3733 case BRW_ARF_FLAG:
3734 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3735 inst->src[i].fixed_hw_reg.subnr);
3736 break;
3737 default:
3738 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3739 inst->src[i].fixed_hw_reg.subnr);
3740 break;
3741 }
3742 } else {
3743 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3744 }
3745 if (inst->src[i].fixed_hw_reg.subnr)
3746 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3747 if (inst->src[i].fixed_hw_reg.abs)
3748 fprintf(file, "|");
3749 break;
3750 default:
3751 fprintf(file, "???");
3752 break;
3753 }
3754 if (inst->src[i].abs)
3755 fprintf(file, "|");
3756
3757 if (inst->src[i].file != IMM) {
3758 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3759 }
3760
3761 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3762 fprintf(file, ", ");
3763 }
3764
3765 fprintf(file, " ");
3766
3767 if (dispatch_width == 16 && inst->exec_size == 8) {
3768 if (inst->force_sechalf)
3769 fprintf(file, "2ndhalf ");
3770 else
3771 fprintf(file, "1sthalf ");
3772 }
3773
3774 fprintf(file, "\n");
3775 }
3776
3777 /**
3778 * Possibly returns an instruction that set up @param reg.
3779 *
3780 * Sometimes we want to take the result of some expression/variable
3781 * dereference tree and rewrite the instruction generating the result
3782 * of the tree. When processing the tree, we know that the
3783 * instructions generated are all writing temporaries that are dead
3784 * outside of this tree. So, if we have some instructions that write
3785 * a temporary, we're free to point that temp write somewhere else.
3786 *
3787  * Note that this doesn't guarantee that the returned instruction generated
3788  * only reg -- it might be the size=4 destination of a texture instruction.
3789 */
3790 fs_inst *
3791 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3792 fs_inst *end,
3793 const fs_reg &reg)
3794 {
3795 if (end == start ||
3796 end->is_partial_write() ||
3797 reg.reladdr ||
3798 !reg.equals(end->dst)) {
3799 return NULL;
3800 } else {
3801 return end;
3802 }
3803 }
3804
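/**
 * Lay out the fixed thread payload delivered by fragment shader dispatch on
 * gen6+: R0-1 masks and pixel X/Y, then (as enabled) barycentric
 * coordinates, interpolated source depth and W, the MSAA position offsets
 * and the input coverage mask.  Records the starting register of each group
 * in payload.* and the total size in payload.num_regs.
 */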
3805 void
3806 fs_visitor::setup_payload_gen6()
3807 {
3808 bool uses_depth =
3809 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3810 unsigned barycentric_interp_modes =
3811 (stage == MESA_SHADER_FRAGMENT) ?
3812 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3813
3814 assert(devinfo->gen >= 6);
3815
3816 /* R0-1: masks, pixel X/Y coordinates. */
3817 payload.num_regs = 2;
3818    /* R2: only for 32-pixel dispatch. */
3819
3820 /* R3-26: barycentric interpolation coordinates. These appear in the
3821 * same order that they appear in the brw_wm_barycentric_interp_mode
3822 * enum. Each set of coordinates occupies 2 registers if dispatch width
3823 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3824 * appear if they were enabled using the "Barycentric Interpolation
3825 * Mode" bits in WM_STATE.
3826 */
3827 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3828 if (barycentric_interp_modes & (1 << i)) {
3829 payload.barycentric_coord_reg[i] = payload.num_regs;
3830 payload.num_regs += 2;
3831 if (dispatch_width == 16) {
3832 payload.num_regs += 2;
3833 }
3834 }
3835 }
3836
3837 /* R27: interpolated depth if uses source depth */
3838 if (uses_depth) {
3839 payload.source_depth_reg = payload.num_regs;
3840 payload.num_regs++;
3841 if (dispatch_width == 16) {
3842 /* R28: interpolated depth if not SIMD8. */
3843 payload.num_regs++;
3844 }
3845 }
3846 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3847 if (uses_depth) {
3848 payload.source_w_reg = payload.num_regs;
3849 payload.num_regs++;
3850 if (dispatch_width == 16) {
3851 /* R30: interpolated W if not SIMD8. */
3852 payload.num_regs++;
3853 }
3854 }
3855
3856 if (stage == MESA_SHADER_FRAGMENT) {
3857 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3858 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3859 prog_data->uses_pos_offset = key->compute_pos_offset;
3860 /* R31: MSAA position offsets. */
3861 if (prog_data->uses_pos_offset) {
3862 payload.sample_pos_reg = payload.num_regs;
3863 payload.num_regs++;
3864 }
3865 }
3866
3867 /* R32: MSAA input coverage mask */
3868 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3869 assert(devinfo->gen >= 7);
3870 payload.sample_mask_in_reg = payload.num_regs;
3871 payload.num_regs++;
3872 if (dispatch_width == 16) {
3873 /* R33: input coverage mask if not SIMD8. */
3874 payload.num_regs++;
3875 }
3876 }
3877
3878 /* R34-: bary for 32-pixel. */
3879 /* R58-59: interp W for 32-pixel. */
3880
3881 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3882 source_depth_to_render_target = true;
3883 }
3884 }
3885
3886 void
3887 fs_visitor::setup_vs_payload()
3888 {
3889 /* R0: thread header, R1: urb handles */
3890 payload.num_regs = 2;
3891 }
3892
3893 void
3894 fs_visitor::setup_cs_payload()
3895 {
3896 assert(brw->gen >= 7);
3897
3898 payload.num_regs = 1;
3899 }
3900
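/**
 * Reserve the start of the binding table for the render target surfaces (at
 * least one, since a shader with no color regions still performs a
 * null-renderbuffer FB write) and let the common helper lay out the
 * remaining entries.
 */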
3901 void
3902 fs_visitor::assign_binding_table_offsets()
3903 {
3904 assert(stage == MESA_SHADER_FRAGMENT);
3905 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3906 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3907 uint32_t next_binding_table_offset = 0;
3908
3909 /* If there are no color regions, we still perform an FB write to a null
3910 * renderbuffer, which we place at surface index 0.
3911 */
3912 prog_data->binding_table.render_target_start = next_binding_table_offset;
3913 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3914
3915 assign_common_binding_table_offsets(next_binding_table_offset);
3916 }
3917
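/**
 * Build regs_live_at_ip[]: for every instruction IP, the combined size (in
 * hardware registers) of all virtual GRFs whose live interval spans that IP.
 */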
3918 void
3919 fs_visitor::calculate_register_pressure()
3920 {
3921 invalidate_live_intervals();
3922 calculate_live_intervals();
3923
3924 unsigned num_instructions = 0;
3925 foreach_block(block, cfg)
3926 num_instructions += block->instructions.length();
3927
3928 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3929
3930 for (unsigned reg = 0; reg < alloc.count; reg++) {
3931 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3932 regs_live_at_ip[ip] += alloc.sizes[reg];
3933 }
3934 }
3935
3936 void
3937 fs_visitor::optimize()
3938 {
3939 split_virtual_grfs();
3940
3941 move_uniform_array_access_to_pull_constants();
3942 assign_constant_locations();
3943 demote_pull_constants();
3944
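/* OPT() runs one optimization pass, folds its result into 'progress', and,
 * when the DEBUG_OPTIMIZER flag of INTEL_DEBUG is set and the pass changed
 * something, dumps the instruction list to a file named after the stage,
 * dispatch width, shader, iteration and pass number so successive dumps can
 * be diffed.  The statement expression evaluates to the pass's own progress,
 * so OPT() can also be used as a condition, as with lower_load_payload below.
 */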
3945 #define OPT(pass, args...) ({ \
3946 pass_num++; \
3947 bool this_progress = pass(args); \
3948 \
3949 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3950 char filename[64]; \
3951 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3952 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3953 \
3954 backend_visitor::dump_instructions(filename); \
3955 } \
3956 \
3957 progress = progress || this_progress; \
3958 this_progress; \
3959 })
3960
3961 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3962 char filename[64];
3963 snprintf(filename, 64, "%s%d-%04d-00-start",
3964 stage_abbrev, dispatch_width,
3965 shader_prog ? shader_prog->Name : 0);
3966
3967 backend_visitor::dump_instructions(filename);
3968 }
3969
3970 bool progress;
3971 int iteration = 0;
3972 int pass_num = 0;
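   /* Run the pass list to a fixed point: one pass frequently exposes new
    * opportunities for another (copy propagation feeding dead code
    * elimination, for instance), so keep iterating until a whole trip
    * through the list makes no progress.
    */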
3973 do {
3974 progress = false;
3975 pass_num = 0;
3976 iteration++;
3977
3978 OPT(remove_duplicate_mrf_writes);
3979
3980 OPT(opt_algebraic);
3981 OPT(opt_cse);
3982 OPT(opt_copy_propagate);
3983 OPT(opt_peephole_predicated_break);
3984 OPT(opt_cmod_propagation);
3985 OPT(dead_code_eliminate);
3986 OPT(opt_peephole_sel);
3987 OPT(dead_control_flow_eliminate, this);
3988 OPT(opt_register_renaming);
3989 OPT(opt_redundant_discard_jumps);
3990 OPT(opt_saturate_propagation);
3991 OPT(opt_zero_samples);
3992 OPT(register_coalesce);
3993 OPT(compute_to_mrf);
3994 OPT(eliminate_find_live_channel);
3995
3996 OPT(compact_virtual_grfs);
3997 } while (progress);
3998
3999 pass_num = 0;
4000
4001 OPT(opt_sampler_eot);
4002
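   /* Lowering LOAD_PAYLOAD turns each payload build into a series of MOVs,
    * so when it makes progress it is worth re-splitting the virtual GRFs and
    * re-running the cleanup passes to coalesce those copies away.
    */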
4003 if (OPT(lower_load_payload)) {
4004 split_virtual_grfs();
4005 OPT(register_coalesce);
4006 OPT(compute_to_mrf);
4007 OPT(dead_code_eliminate);
4008 }
4009
4010 OPT(opt_combine_constants);
4011
4012 lower_uniform_pull_constant_loads();
4013 }
4014
4015 /**
4016  * Three-source instructions must have a GRF/MRF destination register;
4017  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
4018 */
4019 void
4020 fs_visitor::fixup_3src_null_dest()
4021 {
4022 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4023 if (inst->is_3src() && inst->dst.is_null()) {
4024 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4025 inst->dst.type);
4026 }
4027 }
4028 }
4029
4030 void
4031 fs_visitor::allocate_registers()
4032 {
4033 bool allocated_without_spills;
4034
4035 static const enum instruction_scheduler_mode pre_modes[] = {
4036 SCHEDULE_PRE,
4037 SCHEDULE_PRE_NON_LIFO,
4038 SCHEDULE_PRE_LIFO,
4039 };
4040
4041 /* Try each scheduling heuristic to see if it can successfully register
4042 * allocate without spilling. They should be ordered by decreasing
4043 * performance but increasing likelihood of allocating.
4044 */
4045 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4046 schedule_instructions(pre_modes[i]);
4047
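      /* The disabled branch below falls back to assign_regs_trivial(), which
       * bypasses the real allocator entirely; presumably it is only useful
       * as a debugging aid.
       */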
4048 if (0) {
4049 assign_regs_trivial();
4050 allocated_without_spills = true;
4051 } else {
4052 allocated_without_spills = assign_regs(false);
4053 }
4054 if (allocated_without_spills)
4055 break;
4056 }
4057
4058 if (!allocated_without_spills) {
4059 /* We assume that any spilling is worse than just dropping back to
4060 * SIMD8. There's probably actually some intermediate point where
4061 * SIMD16 with a couple of spills is still better.
4062 */
4063 if (dispatch_width == 16) {
4064 fail("Failure to register allocate. Reduce number of "
4065 "live scalar values to avoid this.");
4066 } else {
4067 perf_debug("%s shader triggered register spilling. "
4068 "Try reducing the number of live scalar values to "
4069 "improve performance.\n", stage_name);
4070 }
4071
4072 /* Since we're out of heuristics, just go spill registers until we
4073 * get an allocation.
4074 */
4075 while (!assign_regs(true)) {
4076 if (failed)
4077 break;
4078 }
4079 }
4080
4081 /* This must come after all optimization and register allocation, since
4082 * it inserts dead code that happens to have side effects, and it does
4083 * so based on the actual physical registers in use.
4084 */
4085 insert_gen4_send_dependency_workarounds();
4086
4087 if (failed)
4088 return;
4089
4090 if (!allocated_without_spills)
4091 schedule_instructions(SCHEDULE_POST);
4092
4093 if (last_scratch > 0)
4094 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4095 }
4096
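/**
 * Compile a vertex shader: emit the IR (NIR or GLSL IR), write the URB
 * outputs, then run the shared optimize / assign / allocate pipeline.
 * Returns false if compilation failed.
 */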
4097 bool
4098 fs_visitor::run_vs()
4099 {
4100 assert(stage == MESA_SHADER_VERTEX);
4101
4102 assign_common_binding_table_offsets(0);
4103 setup_vs_payload();
4104
4105 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4106 emit_shader_time_begin();
4107
4108 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
4109 emit_nir_code();
4110 } else {
4111 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4112 base_ir = ir;
4113 this->result = reg_undef;
4114 ir->accept(this);
4115 }
4116 base_ir = NULL;
4117 }
4118
4119 if (failed)
4120 return false;
4121
4122 emit_urb_writes();
4123
4124 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4125 emit_shader_time_end();
4126
4127 calculate_cfg();
4128
4129 optimize();
4130
4131 assign_curb_setup();
4132 assign_vs_urb_setup();
4133
4134 fixup_3src_null_dest();
4135 allocate_registers();
4136
4137 return !failed;
4138 }
4139
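/**
 * Compile a fragment shader at the current dispatch width (8 or 16 pixels):
 * set up the payload and interpolation, emit the IR and framebuffer writes,
 * then optimize and allocate registers.  Returns false on failure; for the
 * SIMD16 compile the caller treats a failure as "fall back to SIMD8".
 */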
4140 bool
4141 fs_visitor::run_fs()
4142 {
4143 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4144 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4145
4146 assert(stage == MESA_SHADER_FRAGMENT);
4147
4148 sanity_param_count = prog->Parameters->NumParameters;
4149
4150 assign_binding_table_offsets();
4151
4152 if (devinfo->gen >= 6)
4153 setup_payload_gen6();
4154 else
4155 setup_payload_gen4();
4156
4157 if (0) {
4158 emit_dummy_fs();
4159 } else if (brw->use_rep_send && dispatch_width == 16) {
4160 emit_repclear_shader();
4161 } else {
4162 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4163 emit_shader_time_begin();
4164
4165 calculate_urb_setup();
4166 if (prog->InputsRead > 0) {
4167 if (devinfo->gen < 6)
4168 emit_interpolation_setup_gen4();
4169 else
4170 emit_interpolation_setup_gen6();
4171 }
4172
4173 /* We handle discards by keeping track of the still-live pixels in f0.1.
4174 * Initialize it with the dispatched pixels.
4175 */
4176 if (wm_prog_data->uses_kill) {
4177 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4178 discard_init->flag_subreg = 1;
4179 }
4180
4181       /* Generate FS IR for main().  (The visitor only descends into
4182        * functions called "main".)
4183 */
4184 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
4185 emit_nir_code();
4186 } else if (shader) {
4187 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4188 base_ir = ir;
4189 this->result = reg_undef;
4190 ir->accept(this);
4191 }
4192 } else {
4193 emit_fragment_program_code();
4194 }
4195 base_ir = NULL;
4196 if (failed)
4197 return false;
4198
4199 if (wm_prog_data->uses_kill)
4200 emit(FS_OPCODE_PLACEHOLDER_HALT);
4201
4202 if (wm_key->alpha_test_func)
4203 emit_alpha_test();
4204
4205 emit_fb_writes();
4206
4207 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4208 emit_shader_time_end();
4209
4210 calculate_cfg();
4211
4212 optimize();
4213
4214 assign_curb_setup();
4215 assign_urb_setup();
4216
4217 fixup_3src_null_dest();
4218 allocate_registers();
4219
4220 if (failed)
4221 return false;
4222 }
4223
4224 if (dispatch_width == 8)
4225 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4226 else
4227 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4228
4229 /* If any state parameters were appended, then ParameterValues could have
4230 * been realloced, in which case the driver uniform storage set up by
4231 * _mesa_associate_uniform_storage() would point to freed memory. Make
4232 * sure that didn't happen.
4233 */
4234 assert(sanity_param_count == prog->Parameters->NumParameters);
4235
4236 return !failed;
4237 }
4238
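/**
 * Compile a compute shader (NIR only): emit the IR and the thread terminate
 * message, then run the shared optimize / allocate pipeline.
 */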
4239 bool
4240 fs_visitor::run_cs()
4241 {
4242 assert(stage == MESA_SHADER_COMPUTE);
4243 assert(shader);
4244
4245 sanity_param_count = prog->Parameters->NumParameters;
4246
4247 assign_common_binding_table_offsets(0);
4248
4249 setup_cs_payload();
4250
4251 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4252 emit_shader_time_begin();
4253
4254 emit_nir_code();
4255
4256 if (failed)
4257 return false;
4258
4259 emit_cs_terminate();
4260
4261 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4262 emit_shader_time_end();
4263
4264 calculate_cfg();
4265
4266 optimize();
4267
4268 assign_curb_setup();
4269
4270 fixup_3src_null_dest();
4271 allocate_registers();
4272
4273 if (failed)
4274 return false;
4275
4276 /* If any state parameters were appended, then ParameterValues could have
4277 * been realloced, in which case the driver uniform storage set up by
4278 * _mesa_associate_uniform_storage() would point to freed memory. Make
4279 * sure that didn't happen.
4280 */
4281 assert(sanity_param_count == prog->Parameters->NumParameters);
4282
4283 return !failed;
4284 }
4285
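/**
 * Top-level fragment shader compile entry point.  Always builds a SIMD8
 * program; additionally attempts a SIMD16 compile unless the SIMD8 visitor
 * flagged it unsupported or the DEBUG_NO16 flag of INTEL_DEBUG is set.
 * A SIMD16 failure only costs performance (perf_debug), it is not a hard
 * error.
 */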
4286 const unsigned *
4287 brw_wm_fs_emit(struct brw_context *brw,
4288 void *mem_ctx,
4289 const struct brw_wm_prog_key *key,
4290 struct brw_wm_prog_data *prog_data,
4291 struct gl_fragment_program *fp,
4292 struct gl_shader_program *prog,
4293 unsigned *final_assembly_size)
4294 {
4295 bool start_busy = false;
4296 double start_time = 0;
4297
4298 if (unlikely(brw->perf_debug)) {
4299 start_busy = (brw->batch.last_bo &&
4300 drm_intel_bo_busy(brw->batch.last_bo));
4301 start_time = get_time();
4302 }
4303
4304 struct brw_shader *shader = NULL;
4305 if (prog)
4306 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4307
4308 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4309 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4310
4311 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4312 */
4313 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4314 if (!v.run_fs()) {
4315 if (prog) {
4316 prog->LinkStatus = false;
4317 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4318 }
4319
4320 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4321 v.fail_msg);
4322
4323 return NULL;
4324 }
4325
4326 cfg_t *simd16_cfg = NULL;
4327 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4328 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4329 if (!v.simd16_unsupported) {
4330 /* Try a SIMD16 compile */
4331 v2.import_uniforms(&v);
4332 if (!v2.run_fs()) {
4333 perf_debug("SIMD16 shader failed to compile, falling back to "
4334 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4335 } else {
4336 simd16_cfg = v2.cfg;
4337 }
4338 } else {
4339 perf_debug("SIMD16 shader unsupported, falling back to "
4340 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4341 }
4342 }
4343
4344 cfg_t *simd8_cfg;
4345 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4346 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4347 simd8_cfg = NULL;
4348 prog_data->no_8 = true;
4349 } else {
4350 simd8_cfg = v.cfg;
4351 prog_data->no_8 = false;
4352 }
4353
4354 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4355 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4356
4357 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4358 char *name;
4359 if (prog)
4360 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4361 prog->Label ? prog->Label : "unnamed",
4362 prog->Name);
4363 else
4364 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4365
4366 g.enable_debug(name);
4367 }
4368
4369 if (simd8_cfg)
4370 g.generate_code(simd8_cfg, 8);
4371 if (simd16_cfg)
4372 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4373
4374 if (unlikely(brw->perf_debug) && shader) {
4375 if (shader->compiled_once)
4376 brw_wm_debug_recompile(brw, prog, key);
4377 shader->compiled_once = true;
4378
4379 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4380 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4381 (get_time() - start_time) * 1000);
4382 }
4383 }
4384
4385 return g.get_assembly(final_assembly_size);
4386 }
4387
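/**
 * Precompile hook: build a brw_wm_prog_key from guesses about the likely GL
 * state at draw time and compile the program up front, restoring the
 * previously bound program data afterwards so the precompile has no visible
 * side effects.
 */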
4388 extern "C" bool
4389 brw_fs_precompile(struct gl_context *ctx,
4390 struct gl_shader_program *shader_prog,
4391 struct gl_program *prog)
4392 {
4393 struct brw_context *brw = brw_context(ctx);
4394 struct brw_wm_prog_key key;
4395
4396 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4397 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4398 bool program_uses_dfdy = fp->UsesDFdy;
4399
4400 memset(&key, 0, sizeof(key));
4401
4402 if (brw->gen < 6) {
4403 if (fp->UsesKill)
4404 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4405
4406 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4407 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4408
4409 /* Just assume depth testing. */
4410 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4411 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4412 }
4413
4414 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4415 BRW_FS_VARYING_INPUT_MASK) > 16)
4416 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4417
4418 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4419
4420 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4421 key.drawable_height = ctx->DrawBuffer->Height;
4422 }
4423
4424 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4425 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4426 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4427
4428 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4429 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4430 key.nr_color_regions > 1;
4431 }
4432
4433 key.program_string_id = bfp->id;
4434
4435 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4436 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4437
4438 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4439
4440 brw->wm.base.prog_offset = old_prog_offset;
4441 brw->wm.prog_data = old_prog_data;
4442
4443 return success;
4444 }
4445
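/**
 * Fill in the sampler-related portion of a precompile key: without shader
 * channel select (pre-Haswell), shadow samplers are assumed to use the
 * default DEPTH_TEXTURE_MODE swizzle, and all other samplers are assumed to
 * be unswizzled.
 */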
4446 void
4447 brw_setup_tex_for_precompile(struct brw_context *brw,
4448 struct brw_sampler_prog_key_data *tex,
4449 struct gl_program *prog)
4450 {
4451 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4452 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4453 for (unsigned i = 0; i < sampler_count; i++) {
4454 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4455 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4456 tex->swizzles[i] =
4457 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4458 } else {
4459 /* Color sampler: assume no swizzling. */
4460 tex->swizzles[i] = SWIZZLE_XYZW;
4461 }
4462 }
4463 }