src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
 69    /* If exec_size == 0, try to guess it from the registers.  Since all
 70     * manner of things may use hardware registers, we first try to guess
 71     * based on GRF registers.  If that fails, we fall back to the width of
 72     * the destination register.
 73     */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 this->conditional_mod = BRW_CONDITIONAL_NONE;
94
95 /* This will be the case for almost all instructions. */
96 switch (dst.file) {
97 case GRF:
98 case HW_REG:
99 case MRF:
100 case ATTR:
101 this->regs_written =
102 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
103 break;
104 case BAD_FILE:
105 this->regs_written = 0;
106 break;
107 case IMM:
108 case UNIFORM:
109 unreachable("Invalid destination register file");
110 default:
111 unreachable("Invalid register file");
112 }
113
114 this->writes_accumulator = false;
115 }
116
117 fs_inst::fs_inst()
118 {
119 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
120 }
121
122 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
123 {
124 init(opcode, exec_size, reg_undef, NULL, 0);
125 }
126
127 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
128 {
129 init(opcode, 0, dst, NULL, 0);
130 }
131
132 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
133 const fs_reg &src0)
134 {
135 const fs_reg src[1] = { src0 };
136 init(opcode, exec_size, dst, src, 1);
137 }
138
139 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
140 {
141 const fs_reg src[1] = { src0 };
142 init(opcode, 0, dst, src, 1);
143 }
144
145 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
146 const fs_reg &src0, const fs_reg &src1)
147 {
148 const fs_reg src[2] = { src0, src1 };
149 init(opcode, exec_size, dst, src, 2);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
153 const fs_reg &src1)
154 {
155 const fs_reg src[2] = { src0, src1 };
156 init(opcode, 0, dst, src, 2);
157 }
158
159 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
160 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
161 {
162 const fs_reg src[3] = { src0, src1, src2 };
163 init(opcode, exec_size, dst, src, 3);
164 }
165
166 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
167 const fs_reg &src1, const fs_reg &src2)
168 {
169 const fs_reg src[3] = { src0, src1, src2 };
170 init(opcode, 0, dst, src, 3);
171 }
172
173 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
174 const fs_reg src[], unsigned sources)
175 {
176 init(opcode, 0, dst, src, sources);
177 }
178
179 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
180 const fs_reg src[], unsigned sources)
181 {
182 init(opcode, exec_width, dst, src, sources);
183 }
184
185 fs_inst::fs_inst(const fs_inst &that)
186 {
187 memcpy(this, &that, sizeof(that));
188
189 this->src = new fs_reg[MAX2(that.sources, 3)];
190
191 for (unsigned i = 0; i < that.sources; i++)
192 this->src[i] = that.src[i];
193 }
194
195 fs_inst::~fs_inst()
196 {
197 delete[] this->src;
198 }
199
200 void
201 fs_inst::resize_sources(uint8_t num_sources)
202 {
203 if (this->sources != num_sources) {
204 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
205
206 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
207 src[i] = this->src[i];
208
209 delete[] this->src;
210 this->src = src;
211 this->sources = num_sources;
212 }
213 }
214
215 #define ALU1(op) \
216 fs_inst * \
217 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
218 { \
219 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
220 }
221
222 #define ALU2(op) \
223 fs_inst * \
224 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
225 const fs_reg &src1) \
226 { \
227 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
228 }
229
230 #define ALU2_ACC(op) \
231 fs_inst * \
232 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
233 const fs_reg &src1) \
234 { \
235 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
236 inst->writes_accumulator = true; \
237 return inst; \
238 }
239
240 #define ALU3(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
243 const fs_reg &src1, const fs_reg &src2) \
244 { \
245 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
246 }
247
248 ALU1(NOT)
249 ALU1(MOV)
250 ALU1(FRC)
251 ALU1(RNDD)
252 ALU1(RNDE)
253 ALU1(RNDZ)
254 ALU2(ADD)
255 ALU2(MUL)
256 ALU2_ACC(MACH)
257 ALU2(AND)
258 ALU2(OR)
259 ALU2(XOR)
260 ALU2(SHL)
261 ALU2(SHR)
262 ALU2(ASR)
263 ALU3(LRP)
264 ALU1(BFREV)
265 ALU3(BFE)
266 ALU2(BFI1)
267 ALU3(BFI2)
268 ALU1(FBH)
269 ALU1(FBL)
270 ALU1(CBIT)
271 ALU3(MAD)
272 ALU2_ACC(ADDC)
273 ALU2_ACC(SUBB)
274 ALU2(SEL)
275 ALU2(MAC)
276
277 /** Gen4 predicated IF. */
278 fs_inst *
279 fs_visitor::IF(enum brw_predicate predicate)
280 {
281 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
282 inst->predicate = predicate;
283 return inst;
284 }
285
286 /** Gen6 IF with embedded comparison. */
287 fs_inst *
288 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
289 enum brw_conditional_mod condition)
290 {
291 assert(devinfo->gen == 6);
292 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
293 reg_null_d, src0, src1);
294 inst->conditional_mod = condition;
295 return inst;
296 }
297
 298 /**
 299  * CMP: Sets the low bit of each destination channel to the result
 300  * of the comparison, leaves the upper bits undefined, and updates
 301  * the flag register with the packed 16 bits of the result.
 302  */
303 fs_inst *
304 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
305 enum brw_conditional_mod condition)
306 {
307 fs_inst *inst;
308
309 /* Take the instruction:
310 *
311 * CMP null<d> src0<f> src1<f>
312 *
313 * Original gen4 does type conversion to the destination type before
314 * comparison, producing garbage results for floating point comparisons.
315 *
316 * The destination type doesn't matter on newer generations, so we set the
317 * type to match src0 so we can compact the instruction.
318 */
319 dst.type = src0.type;
320 if (dst.file == HW_REG)
321 dst.fixed_hw_reg.type = dst.type;
322
323 resolve_ud_negate(&src0);
324 resolve_ud_negate(&src1);
325
326 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
327 inst->conditional_mod = condition;
328
329 return inst;
330 }
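/* For illustration only (hypothetical operands): a typical use is to compare
 * a value against zero and set the flag register for later predication, e.g.
 *
 *    emit(CMP(reg_null_f, some_value, fs_reg(0.0f), BRW_CONDITIONAL_NZ));
 *
 * where some_value stands in for any float-typed fs_reg.
 */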
331
332 fs_inst *
333 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
334 int header_size)
335 {
336 assert(dst.width % 8 == 0);
337 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
338 dst, src, sources);
339 inst->header_size = header_size;
340
341 for (int i = 0; i < header_size; i++)
342 assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
343 inst->regs_written = header_size;
344
345 for (int i = header_size; i < sources; ++i)
346 assert(src[i].file != GRF || src[i].width == dst.width);
347 inst->regs_written += (sources - header_size) * (dst.width / 8);
348
349 return inst;
350 }
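/* Worked example of the regs_written computation above (hypothetical values):
 * with header_size == 1, sources == 3 and dst.width == 16, the header
 * contributes 1 register and each of the 2 remaining sources contributes
 * dst.width / 8 == 2 registers, so regs_written == 1 + 2 * 2 == 5.
 */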
351
352 exec_list
353 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
354 const fs_reg &surf_index,
355 const fs_reg &varying_offset,
356 uint32_t const_offset)
357 {
358 exec_list instructions;
359 fs_inst *inst;
360
361 /* We have our constant surface use a pitch of 4 bytes, so our index can
362 * be any component of a vector, and then we load 4 contiguous
363 * components starting from that.
364 *
365 * We break down the const_offset to a portion added to the variable
366 * offset and a portion done using reg_offset, which means that if you
367 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
368 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
369 * CSE can later notice that those loads are all the same and eliminate
370 * the redundant ones.
371 */
372 fs_reg vec4_offset = vgrf(glsl_type::int_type);
373 instructions.push_tail(ADD(vec4_offset,
374 varying_offset, fs_reg(const_offset & ~3)));
375
376 int scale = 1;
377 if (devinfo->gen == 4 && dst.width == 8) {
378 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
379 * u, v, r) as parameters, or we can just use the SIMD16 message
380 * consisting of (header, u). We choose the second, at the cost of a
381 * longer return length.
382 */
383 scale = 2;
384 }
385
386 enum opcode op;
387 if (devinfo->gen >= 7)
388 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
389 else
390 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
391
392 assert(dst.width % 8 == 0);
393 int regs_written = 4 * (dst.width / 8) * scale;
394 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
395 dst.type, dst.width);
396 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
397 inst->regs_written = regs_written;
398 instructions.push_tail(inst);
399
400 if (devinfo->gen < 7) {
401 inst->base_mrf = 13;
402 inst->header_size = 1;
403 if (devinfo->gen == 4)
404 inst->mlen = 3;
405 else
406 inst->mlen = 1 + dispatch_width / 8;
407 }
408
409 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
410 instructions.push_tail(MOV(dst, result));
411
412 return instructions;
413 }
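/* Worked example of the const_offset split above (hypothetical values): with
 * const_offset == 22 and scale == 1, the vec4-aligned portion is
 * 22 & ~3 == 20, so vec4_offset becomes varying_offset + 20, and the final
 * MOV picks up component 22 & 3 == 2 of the loaded vec4.
 */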
414
 415 /**
 416  * A helper that generates a MOV to work around broken hardware SEND
 417  * dependency handling.
 418  */
419 fs_inst *
420 fs_visitor::DEP_RESOLVE_MOV(int grf)
421 {
422 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
423
424 inst->ir = NULL;
425 inst->annotation = "send dependency resolve";
426
 427    /* The caller always wants an uncompressed (SIMD8) MOV, to emit the minimal
 428     * extra dependencies and to avoid having to align its registers to 2.
 429     */
430 inst->exec_size = 8;
431
432 return inst;
433 }
434
435 bool
436 fs_inst::equals(fs_inst *inst) const
437 {
438 return (opcode == inst->opcode &&
439 dst.equals(inst->dst) &&
440 src[0].equals(inst->src[0]) &&
441 src[1].equals(inst->src[1]) &&
442 src[2].equals(inst->src[2]) &&
443 saturate == inst->saturate &&
444 predicate == inst->predicate &&
445 conditional_mod == inst->conditional_mod &&
446 mlen == inst->mlen &&
447 base_mrf == inst->base_mrf &&
448 target == inst->target &&
449 eot == inst->eot &&
450 header_size == inst->header_size &&
451 shadow_compare == inst->shadow_compare &&
452 exec_size == inst->exec_size &&
453 offset == inst->offset);
454 }
455
456 bool
457 fs_inst::overwrites_reg(const fs_reg &reg) const
458 {
459 return reg.in_range(dst, regs_written);
460 }
461
462 bool
463 fs_inst::is_send_from_grf() const
464 {
465 switch (opcode) {
466 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
467 case SHADER_OPCODE_SHADER_TIME_ADD:
468 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
469 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
470 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
471 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
472 case SHADER_OPCODE_UNTYPED_ATOMIC:
473 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
474 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
475 case SHADER_OPCODE_TYPED_ATOMIC:
476 case SHADER_OPCODE_TYPED_SURFACE_READ:
477 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
478 case SHADER_OPCODE_URB_WRITE_SIMD8:
479 return true;
480 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
481 return src[1].file == GRF;
482 case FS_OPCODE_FB_WRITE:
483 return src[0].file == GRF;
484 default:
485 if (is_tex())
486 return src[0].file == GRF;
487
488 return false;
489 }
490 }
491
492 bool
493 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
494 {
495 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
496 return false;
497
498 fs_reg reg = this->src[0];
499 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
500 return false;
501
502 if (grf_alloc.sizes[reg.reg] != this->regs_written)
503 return false;
504
505 for (int i = 0; i < this->sources; i++) {
506 reg.type = this->src[i].type;
507 reg.width = this->src[i].width;
508 if (!this->src[i].equals(reg))
509 return false;
510 reg = ::offset(reg, 1);
511 }
512
513 return true;
514 }
515
516 bool
517 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
518 {
519 if (devinfo->gen == 6 && is_math())
520 return false;
521
522 if (is_send_from_grf())
523 return false;
524
525 if (!backend_instruction::can_do_source_mods())
526 return false;
527
528 return true;
529 }
530
531 bool
532 fs_inst::has_side_effects() const
533 {
534 return this->eot || backend_instruction::has_side_effects();
535 }
536
537 void
538 fs_reg::init()
539 {
540 memset(this, 0, sizeof(*this));
541 stride = 1;
542 }
543
544 /** Generic unset register constructor. */
545 fs_reg::fs_reg()
546 {
547 init();
548 this->file = BAD_FILE;
549 }
550
551 /** Immediate value constructor. */
552 fs_reg::fs_reg(float f)
553 {
554 init();
555 this->file = IMM;
556 this->type = BRW_REGISTER_TYPE_F;
557 this->fixed_hw_reg.dw1.f = f;
558 this->width = 1;
559 }
560
561 /** Immediate value constructor. */
562 fs_reg::fs_reg(int32_t i)
563 {
564 init();
565 this->file = IMM;
566 this->type = BRW_REGISTER_TYPE_D;
567 this->fixed_hw_reg.dw1.d = i;
568 this->width = 1;
569 }
570
571 /** Immediate value constructor. */
572 fs_reg::fs_reg(uint32_t u)
573 {
574 init();
575 this->file = IMM;
576 this->type = BRW_REGISTER_TYPE_UD;
577 this->fixed_hw_reg.dw1.ud = u;
578 this->width = 1;
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf[4])
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
588 }
589
590 /** Vector float immediate value constructor. */
591 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
592 {
593 init();
594 this->file = IMM;
595 this->type = BRW_REGISTER_TYPE_VF;
596 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
597 (vf1 << 8) |
598 (vf2 << 16) |
599 (vf3 << 24);
600 }
601
602 /** Fixed brw_reg. */
603 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
604 {
605 init();
606 this->file = HW_REG;
607 this->fixed_hw_reg = fixed_hw_reg;
608 this->type = fixed_hw_reg.type;
609 this->width = 1 << fixed_hw_reg.width;
610 }
611
612 bool
613 fs_reg::equals(const fs_reg &r) const
614 {
615 return (file == r.file &&
616 reg == r.reg &&
617 reg_offset == r.reg_offset &&
618 subreg_offset == r.subreg_offset &&
619 type == r.type &&
620 negate == r.negate &&
621 abs == r.abs &&
622 !reladdr && !r.reladdr &&
623 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
624 width == r.width &&
625 stride == r.stride);
626 }
627
628 fs_reg &
629 fs_reg::set_smear(unsigned subreg)
630 {
631 assert(file != HW_REG && file != IMM);
632 subreg_offset = subreg * type_sz(type);
633 stride = 0;
634 return *this;
635 }
636
637 bool
638 fs_reg::is_contiguous() const
639 {
640 return stride == 1;
641 }
642
643 int
644 fs_visitor::type_size(const struct glsl_type *type)
645 {
646 unsigned int size, i;
647
648 switch (type->base_type) {
649 case GLSL_TYPE_UINT:
650 case GLSL_TYPE_INT:
651 case GLSL_TYPE_FLOAT:
652 case GLSL_TYPE_BOOL:
653 return type->components();
654 case GLSL_TYPE_ARRAY:
655 return type_size(type->fields.array) * type->length;
656 case GLSL_TYPE_STRUCT:
657 size = 0;
658 for (i = 0; i < type->length; i++) {
659 size += type_size(type->fields.structure[i].type);
660 }
661 return size;
662 case GLSL_TYPE_SAMPLER:
663 /* Samplers take up no register space, since they're baked in at
664 * link time.
665 */
666 return 0;
667 case GLSL_TYPE_ATOMIC_UINT:
668 return 0;
669 case GLSL_TYPE_IMAGE:
670 case GLSL_TYPE_VOID:
671 case GLSL_TYPE_ERROR:
672 case GLSL_TYPE_INTERFACE:
673 case GLSL_TYPE_DOUBLE:
674 unreachable("not reached");
675 }
676
677 return 0;
678 }
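/* For illustration (hypothetical GLSL declarations): a vec4 counts as 4
 * components, "uniform vec4 a[20]" counts as 4 * 20 == 80, a struct
 * { vec3 p; float w; } counts as 3 + 1 == 4, and samplers and atomic
 * counters count as 0.
 */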
679
680 /**
681 * Create a MOV to read the timestamp register.
682 *
683 * The caller is responsible for emitting the MOV. The return value is
684 * the destination of the MOV, with extra parameters set.
685 */
686 fs_reg
687 fs_visitor::get_timestamp(fs_inst **out_mov)
688 {
689 assert(devinfo->gen >= 7);
690
691 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
692 BRW_ARF_TIMESTAMP,
693 0),
694 BRW_REGISTER_TYPE_UD));
695
696 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
697
698 fs_inst *mov = MOV(dst, ts);
 699    /* We want to read the 3 fields we care about even if the corresponding
 700     * channels are not enabled in the dispatch.
 701     */
702 mov->force_writemask_all = true;
703
704 /* The caller wants the low 32 bits of the timestamp. Since it's running
 705     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
706 * which is plenty of time for our purposes. It is identical across the
707 * EUs, but since it's tracking GPU core speed it will increment at a
708 * varying rate as render P-states change.
709 *
710 * The caller could also check if render P-states have changed (or anything
711 * else that might disrupt timing) by setting smear to 2 and checking if
712 * that field is != 0.
713 */
714 dst.set_smear(0);
715
716 *out_mov = mov;
717 return dst;
718 }
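/* Rough arithmetic behind the ~3 second figure above: 2^32 cycles at a
 * ~1.2 GHz clock is about 4.3e9 / 1.2e9 ~= 3.6 seconds between rollovers
 * of the low 32 bits.
 */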
719
720 void
721 fs_visitor::emit_shader_time_begin()
722 {
723 current_annotation = "shader time start";
724 fs_inst *mov;
725 shader_start_time = get_timestamp(&mov);
726 emit(mov);
727 }
728
729 void
730 fs_visitor::emit_shader_time_end()
731 {
732 current_annotation = "shader time end";
733
734 enum shader_time_shader_type type, written_type, reset_type;
735 switch (stage) {
736 case MESA_SHADER_VERTEX:
737 type = ST_VS;
738 written_type = ST_VS_WRITTEN;
739 reset_type = ST_VS_RESET;
740 break;
741 case MESA_SHADER_GEOMETRY:
742 type = ST_GS;
743 written_type = ST_GS_WRITTEN;
744 reset_type = ST_GS_RESET;
745 break;
746 case MESA_SHADER_FRAGMENT:
747 if (dispatch_width == 8) {
748 type = ST_FS8;
749 written_type = ST_FS8_WRITTEN;
750 reset_type = ST_FS8_RESET;
751 } else {
752 assert(dispatch_width == 16);
753 type = ST_FS16;
754 written_type = ST_FS16_WRITTEN;
755 reset_type = ST_FS16_RESET;
756 }
757 break;
758 case MESA_SHADER_COMPUTE:
759 type = ST_CS;
760 written_type = ST_CS_WRITTEN;
761 reset_type = ST_CS_RESET;
762 break;
763 default:
764 unreachable("fs_visitor::emit_shader_time_end missing code");
765 }
766
767 /* Insert our code just before the final SEND with EOT. */
768 exec_node *end = this->instructions.get_tail();
769 assert(end && ((fs_inst *) end)->eot);
770
771 fs_inst *tm_read;
772 fs_reg shader_end_time = get_timestamp(&tm_read);
773 end->insert_before(tm_read);
774
775 /* Check that there weren't any timestamp reset events (assuming these
776 * were the only two timestamp reads that happened).
777 */
778 fs_reg reset = shader_end_time;
779 reset.set_smear(2);
780 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
781 test->conditional_mod = BRW_CONDITIONAL_Z;
782 test->force_writemask_all = true;
783 end->insert_before(test);
784 end->insert_before(IF(BRW_PREDICATE_NORMAL));
785
786 fs_reg start = shader_start_time;
787 start.negate = true;
788 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
789 diff.set_smear(0);
790 fs_inst *add = ADD(diff, start, shader_end_time);
791 add->force_writemask_all = true;
792 end->insert_before(add);
793
 794    /* If there were no instructions between the two timestamp gets, the diff
 795     * is 2 cycles.  Remove that overhead so it doesn't skew the measurement
 796     * when determining the time taken by single instructions.
 797     */
798 add = ADD(diff, diff, fs_reg(-2u));
799 add->force_writemask_all = true;
800 end->insert_before(add);
801
802 end->insert_before(SHADER_TIME_ADD(type, diff));
803 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
804 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
805 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
806 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
807 }
808
809 fs_inst *
810 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
811 {
812 int shader_time_index =
813 brw_get_shader_time_index(brw, shader_prog, prog, type);
814 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
815
816 fs_reg payload;
817 if (dispatch_width == 8)
818 payload = vgrf(glsl_type::uvec2_type);
819 else
820 payload = vgrf(glsl_type::uint_type);
821
822 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
823 fs_reg(), payload, offset, value);
824 }
825
826 void
827 fs_visitor::vfail(const char *format, va_list va)
828 {
829 char *msg;
830
831 if (failed)
832 return;
833
834 failed = true;
835
836 msg = ralloc_vasprintf(mem_ctx, format, va);
837 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
838
839 this->fail_msg = msg;
840
841 if (debug_enabled) {
842 fprintf(stderr, "%s", msg);
843 }
844 }
845
846 void
847 fs_visitor::fail(const char *format, ...)
848 {
849 va_list va;
850
851 va_start(va, format);
852 vfail(format, va);
853 va_end(va);
854 }
855
856 /**
857 * Mark this program as impossible to compile in SIMD16 mode.
858 *
859 * During the SIMD8 compile (which happens first), we can detect and flag
860 * things that are unsupported in SIMD16 mode, so the compiler can skip
861 * the SIMD16 compile altogether.
862 *
863 * During a SIMD16 compile (if one happens anyway), this just calls fail().
864 */
865 void
866 fs_visitor::no16(const char *format, ...)
867 {
868 va_list va;
869
870 va_start(va, format);
871
872 if (dispatch_width == 16) {
873 vfail(format, va);
874 } else {
875 simd16_unsupported = true;
876
877 if (brw->perf_debug) {
878 if (no16_msg)
879 ralloc_vasprintf_append(&no16_msg, format, va);
880 else
881 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
882 }
883 }
884
885 va_end(va);
886 }
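/* For illustration only (hypothetical message): a caller that hits a SIMD16
 * limitation during the SIMD8 compile would flag it like
 *
 *    no16("SIMD16 unsupported for this instruction sequence\n");
 *
 * so the later SIMD16 compile can be skipped (or fail cleanly if it runs).
 */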
887
888 fs_inst *
889 fs_visitor::emit(enum opcode opcode)
890 {
891 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
892 }
893
894 fs_inst *
895 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
896 {
897 return emit(new(mem_ctx) fs_inst(opcode, dst));
898 }
899
900 fs_inst *
901 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
902 {
903 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
904 }
905
906 fs_inst *
907 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
908 const fs_reg &src1)
909 {
910 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
911 }
912
913 fs_inst *
914 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
915 const fs_reg &src1, const fs_reg &src2)
916 {
917 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
918 }
919
920 fs_inst *
921 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
922 fs_reg src[], int sources)
923 {
924 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
925 }
926
 927 /**
 928  * Returns true if the instruction is not guaranteed to update its
 929  * entire destination register.
 930  *
 931  * For example, dead code elimination and live variable analysis want to know
 932  * when a write to a variable screens off any preceding values that were in
 933  * it.
 934  */
935 bool
936 fs_inst::is_partial_write() const
937 {
938 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
939 (this->dst.width * type_sz(this->dst.type)) < 32 ||
940 !this->dst.is_contiguous());
941 }
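/* For illustration (hypothetical instructions): a SIMD8 MOV to a float GRF
 * covers 8 * 4 == 32 bytes and is a full write, while the same MOV with a
 * W-typed destination covers only 8 * 2 == 16 bytes and is partial, and any
 * predicated non-SEL instruction is also considered a partial write.
 */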
942
943 int
944 fs_inst::regs_read(int arg) const
945 {
946 if (is_tex() && arg == 0 && src[0].file == GRF) {
947 return mlen;
948 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
949 return mlen;
950 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
951 return mlen;
952 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
953 return mlen;
954 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
955 return mlen;
956 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
957 return mlen;
958 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
959 return mlen;
960 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
961 return mlen;
962 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
963 return mlen;
964 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
965 return mlen;
966 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
967 return exec_size / 4;
968 }
969
970 switch (src[arg].file) {
971 case BAD_FILE:
972 case UNIFORM:
973 case IMM:
974 return 1;
975 case GRF:
976 case HW_REG:
977 if (src[arg].stride == 0) {
978 return 1;
979 } else {
980 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
981 return (size + 31) / 32;
982 }
983 case MRF:
984 unreachable("MRF registers are not allowed as sources");
985 default:
986 unreachable("Invalid register file");
987 }
988 }
989
990 bool
991 fs_inst::reads_flag() const
992 {
993 return predicate;
994 }
995
996 bool
997 fs_inst::writes_flag() const
998 {
999 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
1000 opcode != BRW_OPCODE_IF &&
1001 opcode != BRW_OPCODE_WHILE)) ||
1002 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
1003 }
1004
1005 /**
1006 * Returns how many MRFs an FS opcode will write over.
1007 *
1008 * Note that this is not the 0 or 1 implied writes in an actual gen
1009 * instruction -- the FS opcodes often generate MOVs in addition.
1010 */
1011 int
1012 fs_visitor::implied_mrf_writes(fs_inst *inst)
1013 {
1014 if (inst->mlen == 0)
1015 return 0;
1016
1017 if (inst->base_mrf == -1)
1018 return 0;
1019
1020 switch (inst->opcode) {
1021 case SHADER_OPCODE_RCP:
1022 case SHADER_OPCODE_RSQ:
1023 case SHADER_OPCODE_SQRT:
1024 case SHADER_OPCODE_EXP2:
1025 case SHADER_OPCODE_LOG2:
1026 case SHADER_OPCODE_SIN:
1027 case SHADER_OPCODE_COS:
1028 return 1 * dispatch_width / 8;
1029 case SHADER_OPCODE_POW:
1030 case SHADER_OPCODE_INT_QUOTIENT:
1031 case SHADER_OPCODE_INT_REMAINDER:
1032 return 2 * dispatch_width / 8;
1033 case SHADER_OPCODE_TEX:
1034 case FS_OPCODE_TXB:
1035 case SHADER_OPCODE_TXD:
1036 case SHADER_OPCODE_TXF:
1037 case SHADER_OPCODE_TXF_CMS:
1038 case SHADER_OPCODE_TXF_MCS:
1039 case SHADER_OPCODE_TG4:
1040 case SHADER_OPCODE_TG4_OFFSET:
1041 case SHADER_OPCODE_TXL:
1042 case SHADER_OPCODE_TXS:
1043 case SHADER_OPCODE_LOD:
1044 return 1;
1045 case FS_OPCODE_FB_WRITE:
1046 return 2;
1047 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1048 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1049 return 1;
1050 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1051 return inst->mlen;
1052 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1053 return 2;
1054 case SHADER_OPCODE_UNTYPED_ATOMIC:
1055 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1056 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1057 case SHADER_OPCODE_TYPED_ATOMIC:
1058 case SHADER_OPCODE_TYPED_SURFACE_READ:
1059 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1060 case SHADER_OPCODE_URB_WRITE_SIMD8:
1061 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1062 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1063 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1064 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1065 return 0;
1066 default:
1067 unreachable("not reached");
1068 }
1069 }
1070
1071 fs_reg
1072 fs_visitor::vgrf(const glsl_type *const type)
1073 {
1074 int reg_width = dispatch_width / 8;
1075 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1076 brw_type_for_base_type(type), dispatch_width);
1077 }
1078
1079 fs_reg
1080 fs_visitor::vgrf(int num_components)
1081 {
1082 int reg_width = dispatch_width / 8;
1083 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1084 BRW_REGISTER_TYPE_F, dispatch_width);
1085 }
1086
1087 /** Fixed HW reg constructor. */
1088 fs_reg::fs_reg(enum register_file file, int reg)
1089 {
1090 init();
1091 this->file = file;
1092 this->reg = reg;
1093 this->type = BRW_REGISTER_TYPE_F;
1094
1095 switch (file) {
1096 case UNIFORM:
1097 this->width = 1;
1098 break;
1099 default:
1100 this->width = 8;
1101 }
1102 }
1103
1104 /** Fixed HW reg constructor. */
1105 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1106 {
1107 init();
1108 this->file = file;
1109 this->reg = reg;
1110 this->type = type;
1111
1112 switch (file) {
1113 case UNIFORM:
1114 this->width = 1;
1115 break;
1116 default:
1117 this->width = 8;
1118 }
1119 }
1120
1121 /** Fixed HW reg constructor. */
1122 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1123 uint8_t width)
1124 {
1125 init();
1126 this->file = file;
1127 this->reg = reg;
1128 this->type = type;
1129 this->width = width;
1130 }
1131
1132 fs_reg *
1133 fs_visitor::variable_storage(ir_variable *var)
1134 {
1135 return (fs_reg *)hash_table_find(this->variable_ht, var);
1136 }
1137
1138 void
1139 import_uniforms_callback(const void *key,
1140 void *data,
1141 void *closure)
1142 {
1143 struct hash_table *dst_ht = (struct hash_table *)closure;
1144 const fs_reg *reg = (const fs_reg *)data;
1145
1146 if (reg->file != UNIFORM)
1147 return;
1148
1149 hash_table_insert(dst_ht, data, key);
1150 }
1151
1152 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1153  * This brings in those uniform definitions.
1154  */
1155 void
1156 fs_visitor::import_uniforms(fs_visitor *v)
1157 {
1158 hash_table_call_foreach(v->variable_ht,
1159 import_uniforms_callback,
1160 variable_ht);
1161 this->push_constant_loc = v->push_constant_loc;
1162 this->pull_constant_loc = v->pull_constant_loc;
1163 this->uniforms = v->uniforms;
1164 this->param_size = v->param_size;
1165 }
1166
1167 /* Our support for uniforms is piggy-backed on the struct
1168 * gl_fragment_program, because that's where the values actually
1169 * get stored, rather than in some global gl_shader_program uniform
1170 * store.
1171 */
1172 void
1173 fs_visitor::setup_uniform_values(ir_variable *ir)
1174 {
1175 int namelen = strlen(ir->name);
1176
1177 /* The data for our (non-builtin) uniforms is stored in a series of
1178 * gl_uniform_driver_storage structs for each subcomponent that
1179 * glGetUniformLocation() could name. We know it's been set up in the same
1180 * order we'd walk the type, so walk the list of storage and find anything
1181 * with our name, or the prefix of a component that starts with our name.
1182 */
1183 unsigned params_before = uniforms;
1184 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1185 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1186
1187 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1188 (storage->name[namelen] != 0 &&
1189 storage->name[namelen] != '.' &&
1190 storage->name[namelen] != '[')) {
1191 continue;
1192 }
1193
1194 unsigned slots = storage->type->component_slots();
1195 if (storage->array_elements)
1196 slots *= storage->array_elements;
1197
1198 for (unsigned i = 0; i < slots; i++) {
1199 stage_prog_data->param[uniforms++] = &storage->storage[i];
1200 }
1201 }
1202
1203    /* Make sure we actually initialized the expected number of uniform slots here. */
1204 assert(params_before + ir->type->component_slots() == uniforms);
1205 (void)params_before;
1206 }
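/* For illustration (hypothetical uniform names): with ir->name == "light",
 * the prefix-plus-terminator check above matches storage entries named
 * "light", "light.position" and "light[2]", but not "lighting".
 */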
1207
1208
1209 /* Our support for builtin uniforms is even scarier than non-builtin.
1210 * It sits on top of the PROG_STATE_VAR parameters that are
1211 * automatically updated from GL context state.
1212 */
1213 void
1214 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1215 {
1216 const ir_state_slot *const slots = ir->get_state_slots();
1217 assert(slots != NULL);
1218
1219 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1220 /* This state reference has already been setup by ir_to_mesa, but we'll
1221 * get the same index back here.
1222 */
1223 int index = _mesa_add_state_reference(this->prog->Parameters,
1224 (gl_state_index *)slots[i].tokens);
1225
1226 /* Add each of the unique swizzles of the element as a parameter.
1227 * This'll end up matching the expected layout of the
1228 * array/matrix/structure we're trying to fill in.
1229 */
1230 int last_swiz = -1;
1231 for (unsigned int j = 0; j < 4; j++) {
1232 int swiz = GET_SWZ(slots[i].swizzle, j);
1233 if (swiz == last_swiz)
1234 break;
1235 last_swiz = swiz;
1236
1237 stage_prog_data->param[uniforms++] =
1238 &prog->Parameters->ParameterValues[index][swiz];
1239 }
1240 }
1241 }
1242
1243 fs_reg *
1244 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1245 bool origin_upper_left)
1246 {
1247 assert(stage == MESA_SHADER_FRAGMENT);
1248 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1249 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1250 fs_reg wpos = *reg;
1251 bool flip = !origin_upper_left ^ key->render_to_fbo;
1252
1253 /* gl_FragCoord.x */
1254 if (pixel_center_integer) {
1255 emit(MOV(wpos, this->pixel_x));
1256 } else {
1257 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1258 }
1259 wpos = offset(wpos, 1);
1260
1261 /* gl_FragCoord.y */
1262 if (!flip && pixel_center_integer) {
1263 emit(MOV(wpos, this->pixel_y));
1264 } else {
1265 fs_reg pixel_y = this->pixel_y;
1266 float offset = (pixel_center_integer ? 0.0 : 0.5);
1267
1268 if (flip) {
1269 pixel_y.negate = true;
1270 offset += key->drawable_height - 1.0;
1271 }
1272
1273 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1274 }
1275 wpos = offset(wpos, 1);
1276
1277 /* gl_FragCoord.z */
1278 if (devinfo->gen >= 6) {
1279 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1280 } else {
1281 emit(FS_OPCODE_LINTERP, wpos,
1282 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1283 interp_reg(VARYING_SLOT_POS, 2));
1284 }
1285 wpos = offset(wpos, 1);
1286
1287 /* gl_FragCoord.w: Already set up in emit_interpolation */
1288 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1289
1290 return reg;
1291 }
1292
1293 fs_inst *
1294 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1295 glsl_interp_qualifier interpolation_mode,
1296 bool is_centroid, bool is_sample)
1297 {
1298 brw_wm_barycentric_interp_mode barycoord_mode;
1299 if (devinfo->gen >= 6) {
1300 if (is_centroid) {
1301 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1302 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1303 else
1304 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1305 } else if (is_sample) {
1306 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1307 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1308 else
1309 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1310 } else {
1311 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1312 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1313 else
1314 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1315 }
1316 } else {
1317 /* On Ironlake and below, there is only one interpolation mode.
1318 * Centroid interpolation doesn't mean anything on this hardware --
1319 * there is no multisampling.
1320 */
1321 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1322 }
1323 return emit(FS_OPCODE_LINTERP, attr,
1324 this->delta_xy[barycoord_mode], interp);
1325 }
1326
1327 void
1328 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1329 const glsl_type *type,
1330 glsl_interp_qualifier interpolation_mode,
1331 int location, bool mod_centroid,
1332 bool mod_sample)
1333 {
1334 attr.type = brw_type_for_base_type(type->get_scalar_type());
1335
1336 assert(stage == MESA_SHADER_FRAGMENT);
1337 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1338 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1339
1340 unsigned int array_elements;
1341
1342 if (type->is_array()) {
1343 array_elements = type->length;
1344 if (array_elements == 0) {
1345 fail("dereferenced array '%s' has length 0\n", name);
1346 }
1347 type = type->fields.array;
1348 } else {
1349 array_elements = 1;
1350 }
1351
1352 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1353 bool is_gl_Color =
1354 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1355 if (key->flat_shade && is_gl_Color) {
1356 interpolation_mode = INTERP_QUALIFIER_FLAT;
1357 } else {
1358 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1359 }
1360 }
1361
1362 for (unsigned int i = 0; i < array_elements; i++) {
1363 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1364 if (prog_data->urb_setup[location] == -1) {
1365 /* If there's no incoming setup data for this slot, don't
1366 * emit interpolation for it.
1367 */
1368 attr = offset(attr, type->vector_elements);
1369 location++;
1370 continue;
1371 }
1372
1373 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1374 /* Constant interpolation (flat shading) case. The SF has
1375 * handed us defined values in only the constant offset
1376 * field of the setup reg.
1377 */
1378 for (unsigned int k = 0; k < type->vector_elements; k++) {
1379 struct brw_reg interp = interp_reg(location, k);
1380 interp = suboffset(interp, 3);
1381 interp.type = attr.type;
1382 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1383 attr = offset(attr, 1);
1384 }
1385 } else {
1386 /* Smooth/noperspective interpolation case. */
1387 for (unsigned int k = 0; k < type->vector_elements; k++) {
1388 struct brw_reg interp = interp_reg(location, k);
1389 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1390 /* Get the pixel/sample mask into f0 so that we know
1391 * which pixels are lit. Then, for each channel that is
1392 * unlit, replace the centroid data with non-centroid
1393 * data.
1394 */
1395 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1396
1397 fs_inst *inst;
1398 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1399 false, false);
1400 inst->predicate = BRW_PREDICATE_NORMAL;
1401 inst->predicate_inverse = true;
1402 if (devinfo->has_pln)
1403 inst->no_dd_clear = true;
1404
1405 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1406 mod_centroid && !key->persample_shading,
1407 mod_sample || key->persample_shading);
1408 inst->predicate = BRW_PREDICATE_NORMAL;
1409 inst->predicate_inverse = false;
1410 if (devinfo->has_pln)
1411 inst->no_dd_check = true;
1412
1413 } else {
1414 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1415 mod_centroid && !key->persample_shading,
1416 mod_sample || key->persample_shading);
1417 }
1418 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1419 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1420 }
1421 attr = offset(attr, 1);
1422 }
1423
1424 }
1425 location++;
1426 }
1427 }
1428 }
1429
1430 fs_reg *
1431 fs_visitor::emit_frontfacing_interpolation()
1432 {
1433 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1434
1435 if (devinfo->gen >= 6) {
1436 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1437 * a boolean result from this (~0/true or 0/false).
1438 *
1439 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1440 * this task in only one instruction:
1441 * - a negation source modifier will flip the bit; and
1442 * - a W -> D type conversion will sign extend the bit into the high
1443 * word of the destination.
1444 *
1445 * An ASR 15 fills the low word of the destination.
1446 */
1447 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1448 g0.negate = true;
1449
1450 emit(ASR(*reg, g0, fs_reg(15)));
1451 } else {
1452 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1453 * a boolean result from this (1/true or 0/false).
1454 *
1455 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1456 * the negation source modifier to flip it. Unfortunately the SHR
1457 * instruction only operates on UD (or D with an abs source modifier)
1458 * sources without negation.
1459 *
1460 * Instead, use ASR (which will give ~0/true or 0/false).
1461 */
1462 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1463 g1_6.negate = true;
1464
1465 emit(ASR(*reg, g1_6, fs_reg(31)));
1466 }
1467
1468 return reg;
1469 }
1470
1471 void
1472 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1473 {
1474 assert(stage == MESA_SHADER_FRAGMENT);
1475 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1476 assert(dst.type == BRW_REGISTER_TYPE_F);
1477
1478 if (key->compute_pos_offset) {
1479 /* Convert int_sample_pos to floating point */
1480 emit(MOV(dst, int_sample_pos));
1481 /* Scale to the range [0, 1] */
1482 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1483 }
1484 else {
1485 /* From ARB_sample_shading specification:
1486 * "When rendering to a non-multisample buffer, or if multisample
1487 * rasterization is disabled, gl_SamplePosition will always be
1488       *  (0.5, 0.5)."
1489 */
1490 emit(MOV(dst, fs_reg(0.5f)));
1491 }
1492 }
1493
1494 fs_reg *
1495 fs_visitor::emit_samplepos_setup()
1496 {
1497 assert(devinfo->gen >= 6);
1498
1499 this->current_annotation = "compute sample position";
1500 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1501 fs_reg pos = *reg;
1502 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1503 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1504
1505 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1506 * mode will be enabled.
1507 *
1508 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1509 * R31.1:0 Position Offset X/Y for Slot[3:0]
1510 * R31.3:2 Position Offset X/Y for Slot[7:4]
1511 * .....
1512 *
1513 * The X, Y sample positions come in as bytes in thread payload. So, read
1514 * the positions using vstride=16, width=8, hstride=2.
1515 */
1516 struct brw_reg sample_pos_reg =
1517 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1518 BRW_REGISTER_TYPE_B), 16, 8, 2);
1519
1520 if (dispatch_width == 8) {
1521 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1522 } else {
1523 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1524 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1525 ->force_sechalf = true;
1526 }
1527 /* Compute gl_SamplePosition.x */
1528 compute_sample_position(pos, int_sample_x);
1529 pos = offset(pos, 1);
1530 if (dispatch_width == 8) {
1531 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1532 } else {
1533 emit(MOV(half(int_sample_y, 0),
1534 fs_reg(suboffset(sample_pos_reg, 1))));
1535 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1536 ->force_sechalf = true;
1537 }
1538 /* Compute gl_SamplePosition.y */
1539 compute_sample_position(pos, int_sample_y);
1540 return reg;
1541 }
1542
1543 fs_reg *
1544 fs_visitor::emit_sampleid_setup()
1545 {
1546 assert(stage == MESA_SHADER_FRAGMENT);
1547 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1548 assert(devinfo->gen >= 6);
1549
1550 this->current_annotation = "compute sample id";
1551 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1552
1553 if (key->compute_sample_id) {
1554 fs_reg t1 = vgrf(glsl_type::int_type);
1555 fs_reg t2 = vgrf(glsl_type::int_type);
1556 t2.type = BRW_REGISTER_TYPE_UW;
1557
1558 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1559 * 8x multisampling, subspan 0 will represent sample N (where N
1560       * is 0, 2, 4 or 6) and subspan 1 will represent sample N + 1
1561       * (1, 3, 5 or 7).  We can find the value of N by looking at R0.0 bits 7:6
1562 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1563 * (since samples are always delivered in pairs). That is, we
1564 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1565 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1566 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1567 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1568 * populating a temporary variable with the sequence (0, 1, 2, 3),
1569 * and then reading from it using vstride=1, width=4, hstride=0.
1570 * These computations hold good for 4x multisampling as well.
1571 *
1572 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1573 * the first four slots are sample 0 of subspan 0; the next four
1574 * are sample 1 of subspan 0; the third group is sample 0 of
1575 * subspan 1, and finally sample 1 of subspan 1.
1576 */
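      /* Worked example of the computation above (hypothetical payload value):
       * if R0.0 bits 7:6 read 0b10, then (R0.0 & 0xc0) == 0x80 and
       * 0x80 >> 5 == 4, so the starting sample is 4; adding the SIMD8
       * sequence (0, 0, 0, 0, 1, 1, 1, 1) yields sample IDs 4,4,4,4,5,5,5,5
       * for the two subspans.
       */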
1577 fs_inst *inst;
1578 inst = emit(BRW_OPCODE_AND, t1,
1579 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1580 fs_reg(0xc0));
1581 inst->force_writemask_all = true;
1582 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1583 inst->force_writemask_all = true;
1584 /* This works for both SIMD8 and SIMD16 */
1585 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1586 inst->force_writemask_all = true;
1587 /* This special instruction takes care of setting vstride=1,
1588 * width=4, hstride=0 of t2 during an ADD instruction.
1589 */
1590 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1591 } else {
1592 /* As per GL_ARB_sample_shading specification:
1593 * "When rendering to a non-multisample buffer, or if multisample
1594 * rasterization is disabled, gl_SampleID will always be zero."
1595 */
1596 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1597 }
1598
1599 return reg;
1600 }
1601
1602 void
1603 fs_visitor::resolve_source_modifiers(fs_reg *src)
1604 {
1605 if (!src->abs && !src->negate)
1606 return;
1607
1608 fs_reg temp = retype(vgrf(1), src->type);
1609 emit(MOV(temp, *src));
1610 *src = temp;
1611 }
1612
1613 fs_reg
1614 fs_visitor::fix_math_operand(fs_reg src)
1615 {
1616 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1617 * might be able to do better by doing execsize = 1 math and then
1618 * expanding that result out, but we would need to be careful with
1619 * masking.
1620 *
1621 * The hardware ignores source modifiers (negate and abs) on math
1622 * instructions, so we also move to a temp to set those up.
1623 */
1624 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1625 !src.abs && !src.negate)
1626 return src;
1627
1628 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1629    * operands to math.
1630 */
1631 if (devinfo->gen >= 7 && src.file != IMM)
1632 return src;
1633
1634 fs_reg expanded = vgrf(glsl_type::float_type);
1635 expanded.type = src.type;
1636 emit(BRW_OPCODE_MOV, expanded, src);
1637 return expanded;
1638 }
1639
1640 fs_inst *
1641 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1642 {
1643 switch (opcode) {
1644 case SHADER_OPCODE_RCP:
1645 case SHADER_OPCODE_RSQ:
1646 case SHADER_OPCODE_SQRT:
1647 case SHADER_OPCODE_EXP2:
1648 case SHADER_OPCODE_LOG2:
1649 case SHADER_OPCODE_SIN:
1650 case SHADER_OPCODE_COS:
1651 break;
1652 default:
1653 unreachable("not reached: bad math opcode");
1654 }
1655
1656 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1657 * might be able to do better by doing execsize = 1 math and then
1658 * expanding that result out, but we would need to be careful with
1659 * masking.
1660 *
1661 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1662 * instructions, so we also move to a temp to set those up.
1663 */
1664 if (devinfo->gen == 6 || devinfo->gen == 7)
1665 src = fix_math_operand(src);
1666
1667 fs_inst *inst = emit(opcode, dst, src);
1668
1669 if (devinfo->gen < 6) {
1670 inst->base_mrf = 2;
1671 inst->mlen = dispatch_width / 8;
1672 }
1673
1674 return inst;
1675 }
1676
1677 fs_inst *
1678 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1679 {
1680 int base_mrf = 2;
1681 fs_inst *inst;
1682
1683 if (devinfo->gen >= 8) {
1684 inst = emit(opcode, dst, src0, src1);
1685 } else if (devinfo->gen >= 6) {
1686 src0 = fix_math_operand(src0);
1687 src1 = fix_math_operand(src1);
1688
1689 inst = emit(opcode, dst, src0, src1);
1690 } else {
1691 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1692 * "Message Payload":
1693 *
1694 * "Operand0[7]. For the INT DIV functions, this operand is the
1695 * denominator."
1696 * ...
1697 * "Operand1[7]. For the INT DIV functions, this operand is the
1698 * numerator."
1699 */
1700 bool is_int_div = opcode != SHADER_OPCODE_POW;
1701 fs_reg &op0 = is_int_div ? src1 : src0;
1702 fs_reg &op1 = is_int_div ? src0 : src1;
1703
1704 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1705 inst = emit(opcode, dst, op0, reg_null_f);
1706
1707 inst->base_mrf = base_mrf;
1708 inst->mlen = 2 * dispatch_width / 8;
1709 }
1710 return inst;
1711 }
1712
1713 void
1714 fs_visitor::emit_discard_jump()
1715 {
1716 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1717
1718 /* For performance, after a discard, jump to the end of the
1719 * shader if all relevant channels have been discarded.
1720 */
1721 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1722 discard_jump->flag_subreg = 1;
1723
1724 discard_jump->predicate = (dispatch_width == 8)
1725 ? BRW_PREDICATE_ALIGN1_ANY8H
1726 : BRW_PREDICATE_ALIGN1_ANY16H;
1727 discard_jump->predicate_inverse = true;
1728 }
1729
1730 void
1731 fs_visitor::assign_curb_setup()
1732 {
1733 if (dispatch_width == 8) {
1734 prog_data->dispatch_grf_start_reg = payload.num_regs;
1735 } else {
1736 if (stage == MESA_SHADER_FRAGMENT) {
1737 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1738 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1739 } else if (stage == MESA_SHADER_COMPUTE) {
1740 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1741 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1742 } else {
1743 unreachable("Unsupported shader type!");
1744 }
1745 }
1746
1747 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1748
1749 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1750 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1751 for (unsigned int i = 0; i < inst->sources; i++) {
1752 if (inst->src[i].file == UNIFORM) {
1753 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1754 int constant_nr;
1755 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1756 constant_nr = push_constant_loc[uniform_nr];
1757 } else {
1758 /* Section 5.11 of the OpenGL 4.1 spec says:
1759 * "Out-of-bounds reads return undefined values, which include
1760 * values from other variables of the active program or zero."
1761 * Just return the first push constant.
1762 */
1763 constant_nr = 0;
1764 }
1765
1766 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1767 constant_nr / 8,
1768 constant_nr % 8);
1769
1770 inst->src[i].file = HW_REG;
1771 inst->src[i].fixed_hw_reg = byte_offset(
1772 retype(brw_reg, inst->src[i].type),
1773 inst->src[i].subreg_offset);
1774 }
1775 }
1776 }
1777 }
1778
1779 void
1780 fs_visitor::calculate_urb_setup()
1781 {
1782 assert(stage == MESA_SHADER_FRAGMENT);
1783 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1784 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1785
1786 memset(prog_data->urb_setup, -1,
1787 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1788
1789 int urb_next = 0;
1790 /* Figure out where each of the incoming setup attributes lands. */
1791 if (devinfo->gen >= 6) {
1792 if (_mesa_bitcount_64(prog->InputsRead &
1793 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1794 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1795 * first 16 varying inputs, so we can put them wherever we want.
1796 * Just put them in order.
1797 *
1798 * This is useful because it means that (a) inputs not used by the
1799 * fragment shader won't take up valuable register space, and (b) we
1800 * won't have to recompile the fragment shader if it gets paired with
1801 * a different vertex (or geometry) shader.
1802 */
1803 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1804 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1805 BITFIELD64_BIT(i)) {
1806 prog_data->urb_setup[i] = urb_next++;
1807 }
1808 }
1809 } else {
1810 /* We have enough input varyings that the SF/SBE pipeline stage can't
1811 * arbitrarily rearrange them to suit our whim; we have to put them
1812 * in an order that matches the output of the previous pipeline stage
1813 * (geometry or vertex shader).
1814 */
1815 struct brw_vue_map prev_stage_vue_map;
1816 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1817 key->input_slots_valid);
1818 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1819 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1820 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1821 slot++) {
1822 int varying = prev_stage_vue_map.slot_to_varying[slot];
1823 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1824 * unused.
1825 */
1826 if (varying != BRW_VARYING_SLOT_COUNT &&
1827 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1828 BITFIELD64_BIT(varying))) {
1829 prog_data->urb_setup[varying] = slot - first_slot;
1830 }
1831 }
1832 urb_next = prev_stage_vue_map.num_slots - first_slot;
1833 }
1834 } else {
1835 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1836 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1837 /* Point size is packed into the header, not as a general attribute */
1838 if (i == VARYING_SLOT_PSIZ)
1839 continue;
1840
1841 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1842 /* The back color slot is skipped when the front color is
1843 * also written to. In addition, some slots can be
1844 * written in the vertex shader and not read in the
1845 * fragment shader. So the register number must always be
1846 * incremented, mapped or not.
1847 */
1848 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1849 prog_data->urb_setup[i] = urb_next;
1850 urb_next++;
1851 }
1852 }
1853
1854 /*
1855     * It's an FS-only attribute, and we did interpolation for this attribute
1856     * in the SF thread.  So count it here, too.
1857 *
1858 * See compile_sf_prog() for more info.
1859 */
1860 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1861 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1862 }
1863
1864 prog_data->num_varying_inputs = urb_next;
1865 }
1866
1867 void
1868 fs_visitor::assign_urb_setup()
1869 {
1870 assert(stage == MESA_SHADER_FRAGMENT);
1871 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1872
1873 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1874
1875 /* Offset all the urb_setup[] index by the actual position of the
1876 * setup regs, now that the location of the constants has been chosen.
1877 */
1878 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1879 if (inst->opcode == FS_OPCODE_LINTERP) {
1880 assert(inst->src[1].file == HW_REG);
1881 inst->src[1].fixed_hw_reg.nr += urb_start;
1882 }
1883
1884 if (inst->opcode == FS_OPCODE_CINTERP) {
1885 assert(inst->src[0].file == HW_REG);
1886 inst->src[0].fixed_hw_reg.nr += urb_start;
1887 }
1888 }
1889
1890 /* Each attribute is 4 setup channels, each of which is half a reg. */
1891 this->first_non_payload_grf =
1892 urb_start + prog_data->num_varying_inputs * 2;
1893 }
1894
1895 void
1896 fs_visitor::assign_vs_urb_setup()
1897 {
1898 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1899 int grf, count, slot, channel, attr;
1900
1901 assert(stage == MESA_SHADER_VERTEX);
1902 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1903 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1904 count++;
1905
1906 /* Each attribute is 4 regs. */
1907 this->first_non_payload_grf =
1908 payload.num_regs + prog_data->curb_read_length + count * 4;
1909
1910 unsigned vue_entries =
1911 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1912
1913 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1914 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1915
1916 assert(vs_prog_data->base.urb_read_length <= 15);
1917
1918 /* Rewrite all ATTR file references to the hw grf that they land in. */
1919 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1920 for (int i = 0; i < inst->sources; i++) {
1921 if (inst->src[i].file == ATTR) {
1922
1923 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1924 slot = count - 1;
1925 } else {
1926 /* Attributes come in as a contiguous block, ordered by their
1927 * gl_vert_attrib value. That means we can compute the slot
1928 * number for an attribute by masking out the enabled
1929 * attributes before it and counting the bits.
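*
* For example (hypothetical values): if inputs_read were 0b1011 and
* attr were 3, the slot would be
* _mesa_bitcount_64(0b1011 & BITFIELD64_MASK(3)) ==
* _mesa_bitcount_64(0b0011) == 2.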
1930 */
1931 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1932 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1933 BITFIELD64_MASK(attr));
1934 }
1935
1936 channel = inst->src[i].reg_offset & 3;
1937
1938 grf = payload.num_regs +
1939 prog_data->curb_read_length +
1940 slot * 4 + channel;
1941
1942 inst->src[i].file = HW_REG;
1943 inst->src[i].fixed_hw_reg =
1944 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1945 }
1946 }
1947 }
1948 }
1949
1950 /**
1951 * Split large virtual GRFs into separate components if we can.
1952 *
1953 * This is mostly duplicated with what brw_fs_vector_splitting does,
1954 * but that's really conservative because it's afraid of doing
1955 * splitting that doesn't result in real progress after the rest of
1956 * the optimization phases, which would cause infinite looping in
1957 * optimization. We can do it once here, safely. This also has the
1958 * opportunity to split interpolated values, or maybe even uniforms,
1959 * which we don't have at the IR level.
1960 *
1961 * We want to split, because virtual GRFs are what we register
1962 * allocate and spill (due to contiguousness requirements for some
1963 * instructions), and they're what we naturally generate in the
1964 * codegen process, but most virtual GRFs don't actually need to be
1965 * contiguous sets of GRFs. If we split, we'll end up with reduced
1966 * live intervals and better dead code elimination and coalescing.
1967 */
1968 void
1969 fs_visitor::split_virtual_grfs()
1970 {
1971 int num_vars = this->alloc.count;
1972
1973 /* Count the total number of registers */
1974 int reg_count = 0;
1975 int vgrf_to_reg[num_vars];
1976 for (int i = 0; i < num_vars; i++) {
1977 vgrf_to_reg[i] = reg_count;
1978 reg_count += alloc.sizes[i];
1979 }
1980
1981 /* An array of "split points". For each register slot, this indicates
1982 * if this slot can be separated from the previous slot. Every time an
1983 * instruction uses multiple elements of a register (as a source or
1984 * destination), we mark the used slots as inseparable. Then we go
1985 * through and split the registers into the smallest pieces we can.
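*
* For example (hypothetical sizes): a size-4 VGRF that is only ever
* accessed one register at a time keeps all of its split points and
* becomes four size-1 VGRFs, while one that is always read as two
* 2-register operands is only split down the middle.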
1986 */
1987 bool split_points[reg_count];
1988 memset(split_points, 0, sizeof(split_points));
1989
1990 /* Mark all used registers as fully splittable */
1991 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1992 if (inst->dst.file == GRF) {
1993 int reg = vgrf_to_reg[inst->dst.reg];
1994 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1995 split_points[reg + j] = true;
1996 }
1997
1998 for (int i = 0; i < inst->sources; i++) {
1999 if (inst->src[i].file == GRF) {
2000 int reg = vgrf_to_reg[inst->src[i].reg];
2001 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
2002 split_points[reg + j] = true;
2003 }
2004 }
2005 }
2006
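/* Now clear the split points inside any multi-register write or read,
* since those slots have to stay contiguous in a single VGRF.
*/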
2007 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2008 if (inst->dst.file == GRF) {
2009 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2010 for (int j = 1; j < inst->regs_written; j++)
2011 split_points[reg + j] = false;
2012 }
2013 for (int i = 0; i < inst->sources; i++) {
2014 if (inst->src[i].file == GRF) {
2015 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2016 for (int j = 1; j < inst->regs_read(i); j++)
2017 split_points[reg + j] = false;
2018 }
2019 }
2020 }
2021
2022 int new_virtual_grf[reg_count];
2023 int new_reg_offset[reg_count];
2024
2025 int reg = 0;
2026 for (int i = 0; i < num_vars; i++) {
2027 /* The first one should always be 0 as a quick sanity check. */
2028 assert(split_points[reg] == false);
2029
2030 /* j = 0 case */
2031 new_reg_offset[reg] = 0;
2032 reg++;
2033 int offset = 1;
2034
2035 /* j > 0 case */
2036 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2037 /* If this is a split point, reset the offset to 0 and allocate a
2038 * new virtual GRF covering the preceding 'offset' registers.
2039 */
2040 if (split_points[reg]) {
2041 assert(offset <= MAX_VGRF_SIZE);
2042 int grf = alloc.allocate(offset);
2043 for (int k = reg - offset; k < reg; k++)
2044 new_virtual_grf[k] = grf;
2045 offset = 0;
2046 }
2047 new_reg_offset[reg] = offset;
2048 offset++;
2049 reg++;
2050 }
2051
2052 /* The last one gets the original register number */
2053 assert(offset <= MAX_VGRF_SIZE);
2054 alloc.sizes[i] = offset;
2055 for (int k = reg - offset; k < reg; k++)
2056 new_virtual_grf[k] = i;
2057 }
2058 assert(reg == reg_count);
2059
2060 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2061 if (inst->dst.file == GRF) {
2062 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2063 inst->dst.reg = new_virtual_grf[reg];
2064 inst->dst.reg_offset = new_reg_offset[reg];
2065 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2066 }
2067 for (int i = 0; i < inst->sources; i++) {
2068 if (inst->src[i].file == GRF) {
2069 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2070 inst->src[i].reg = new_virtual_grf[reg];
2071 inst->src[i].reg_offset = new_reg_offset[reg];
2072 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2073 }
2074 }
2075 }
2076 invalidate_live_intervals();
2077 }
2078
2079 /**
2080 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2081 *
2082 * During code generation, we create tons of temporary variables, many of
2083 * which get immediately killed and are never used again. Yet, in later
2084 * optimization and analysis passes, such as compute_live_intervals, we need
2085 * to loop over all the virtual GRFs. Compacting them can save a lot of
2086 * overhead.
2087 */
2088 bool
2089 fs_visitor::compact_virtual_grfs()
2090 {
2091 bool progress = false;
2092 int remap_table[this->alloc.count];
2093 memset(remap_table, -1, sizeof(remap_table));
2094
2095 /* Mark which virtual GRFs are used. */
2096 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2097 if (inst->dst.file == GRF)
2098 remap_table[inst->dst.reg] = 0;
2099
2100 for (int i = 0; i < inst->sources; i++) {
2101 if (inst->src[i].file == GRF)
2102 remap_table[inst->src[i].reg] = 0;
2103 }
2104 }
2105
2106 /* Compact the GRF arrays. */
2107 int new_index = 0;
2108 for (unsigned i = 0; i < this->alloc.count; i++) {
2109 if (remap_table[i] == -1) {
2110 /* We just found an unused register. This means that we are
2111 * actually going to compact something.
2112 */
2113 progress = true;
2114 } else {
2115 remap_table[i] = new_index;
2116 alloc.sizes[new_index] = alloc.sizes[i];
2117 invalidate_live_intervals();
2118 ++new_index;
2119 }
2120 }
2121
2122 this->alloc.count = new_index;
2123
2124 /* Patch all the instructions to use the newly renumbered registers */
2125 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2126 if (inst->dst.file == GRF)
2127 inst->dst.reg = remap_table[inst->dst.reg];
2128
2129 for (int i = 0; i < inst->sources; i++) {
2130 if (inst->src[i].file == GRF)
2131 inst->src[i].reg = remap_table[inst->src[i].reg];
2132 }
2133 }
2134
2135 /* Patch all the references to delta_xy, since they're used in register
2136 * allocation. If they're unused, switch them to BAD_FILE so we don't
2137 * think some random VGRF is delta_xy.
2138 */
2139 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2140 if (delta_xy[i].file == GRF) {
2141 if (remap_table[delta_xy[i].reg] != -1) {
2142 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2143 } else {
2144 delta_xy[i].file = BAD_FILE;
2145 }
2146 }
2147 }
2148
2149 return progress;
2150 }
2151
2152 /*
2153 * Implements array access of uniforms by inserting a
2154 * PULL_CONSTANT_LOAD instruction.
2155 *
2156 * Unlike temporary GRF array access (which we don't support, due to
2157 * the difficulty of doing relative addressing on instruction
2158 * destinations), we could potentially do array access of uniforms
2159 * that were loaded in GRF space as push constants. In real-world
2160 * usage we've seen, though, the arrays being used are always larger
2161 * than we could load as push constants, so just always move all
2162 * uniform array access out to a pull constant buffer.
2163 */
2164 void
2165 fs_visitor::move_uniform_array_access_to_pull_constants()
2166 {
2167 if (dispatch_width != 8)
2168 return;
2169
2170 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2171 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2172
2173 /* Walk through and find array access of uniforms. Put a copy of that
2174 * uniform in the pull constant buffer.
2175 *
2176 * Note that we don't move constant-indexed accesses to arrays. No
2177 * testing has been done of the performance impact of this choice.
2178 */
2179 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2180 for (int i = 0 ; i < inst->sources; i++) {
2181 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2182 continue;
2183
2184 int uniform = inst->src[i].reg;
2185
2186 /* If this array isn't already present in the pull constant buffer,
2187 * add it.
2188 */
2189 if (pull_constant_loc[uniform] == -1) {
2190 const gl_constant_value **values = &stage_prog_data->param[uniform];
2191
2192 assert(param_size[uniform]);
2193
2194 for (int j = 0; j < param_size[uniform]; j++) {
2195 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2196
2197 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2198 values[j];
2199 }
2200 }
2201 }
2202 }
2203 }
2204
2205 /**
2206 * Assign UNIFORM file registers to either push constants or pull constants.
2207 *
2208 * We allow a fragment shader to have more than the minimum maximum number
2209 * of fragment shader uniform components required by the spec (64). If
2210 * there are too many of these, they'd fill up all of the register space.
2211 * So, this will push some of them out to the pull constant buffer and
2212 * update the program to load them.
2213 */
2214 void
2215 fs_visitor::assign_constant_locations()
2216 {
2217 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2218 if (dispatch_width != 8)
2219 return;
2220
2221 /* Find which UNIFORM registers are still in use. */
2222 bool is_live[uniforms];
2223 for (unsigned int i = 0; i < uniforms; i++) {
2224 is_live[i] = false;
2225 }
2226
2227 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2228 for (int i = 0; i < inst->sources; i++) {
2229 if (inst->src[i].file != UNIFORM)
2230 continue;
2231
2232 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2233 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2234 is_live[constant_nr] = true;
2235 }
2236 }
2237
2238 /* Only allow 16 registers (128 uniform components) as push constants.
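* (Each GRF holds eight 32-bit components, so 16 GRFs * 8 = 128.)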
2239 *
2240 * Just demote the end of the list. We could probably do better
2241 * here, demoting things that are rarely used in the program first.
2242 *
2243 * If changing this value, note the limitation about total_regs in
2244 * brw_curbe.c.
2245 */
2246 unsigned int max_push_components = 16 * 8;
2247 unsigned int num_push_constants = 0;
2248
2249 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2250
2251 for (unsigned int i = 0; i < uniforms; i++) {
2252 if (!is_live[i] || pull_constant_loc[i] != -1) {
2253 /* This UNIFORM register is either dead, or has already been demoted
2254 * to a pull const. Mark it as no longer living in the param[] array.
2255 */
2256 push_constant_loc[i] = -1;
2257 continue;
2258 }
2259
2260 if (num_push_constants < max_push_components) {
2261 /* Retain as a push constant. Record the location in the params[]
2262 * array.
2263 */
2264 push_constant_loc[i] = num_push_constants++;
2265 } else {
2266 /* Demote to a pull constant. */
2267 push_constant_loc[i] = -1;
2268
2269 int pull_index = stage_prog_data->nr_pull_params++;
2270 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2271 pull_constant_loc[i] = pull_index;
2272 }
2273 }
2274
2275 stage_prog_data->nr_params = num_push_constants;
2276
2277 /* Up until now, the param[] array has been indexed by reg + reg_offset
2278 * of UNIFORM registers. Condense it to only contain the uniforms we
2279 * chose to upload as push constants.
2280 */
2281 for (unsigned int i = 0; i < uniforms; i++) {
2282 int remapped = push_constant_loc[i];
2283
2284 if (remapped == -1)
2285 continue;
2286
2287 assert(remapped <= (int)i);
2288 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2289 }
2290 }
2291
2292 /**
2293 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2294 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2295 */
2296 void
2297 fs_visitor::demote_pull_constants()
2298 {
2299 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2300 for (int i = 0; i < inst->sources; i++) {
2301 if (inst->src[i].file != UNIFORM)
2302 continue;
2303
2304 int pull_index;
2305 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2306 if (location >= uniforms) /* Out of bounds access */
2307 pull_index = -1;
2308 else
2309 pull_index = pull_constant_loc[location];
2310
2311 if (pull_index == -1)
2312 continue;
2313
2314 /* Set up the annotation tracking for newly generated instructions. */
2315 base_ir = inst->ir;
2316 current_annotation = inst->annotation;
2317
2318 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2319 fs_reg dst = vgrf(glsl_type::float_type);
2320
2321 /* Generate a pull load into dst. */
2322 if (inst->src[i].reladdr) {
2323 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2324 surf_index,
2325 *inst->src[i].reladdr,
2326 pull_index);
2327 inst->insert_before(block, &list);
2328 inst->src[i].reladdr = NULL;
2329 } else {
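/* Uniform pull loads fetch an aligned vec4, so round the byte offset
* (pull_index * 4) down to a 16-byte boundary and then smear the
* desired component (pull_index & 3) of the result across the register.
*/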
2330 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2331 fs_inst *pull =
2332 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2333 dst, surf_index, offset);
2334 inst->insert_before(block, pull);
2335 inst->src[i].set_smear(pull_index & 3);
2336 }
2337
2338 /* Rewrite the instruction to use the temporary VGRF. */
2339 inst->src[i].file = GRF;
2340 inst->src[i].reg = dst.reg;
2341 inst->src[i].reg_offset = 0;
2342 inst->src[i].width = dispatch_width;
2343 }
2344 }
2345 invalidate_live_intervals();
2346 }
2347
2348 bool
2349 fs_visitor::opt_algebraic()
2350 {
2351 bool progress = false;
2352
2353 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2354 switch (inst->opcode) {
2355 case BRW_OPCODE_MOV:
2356 if (inst->src[0].file != IMM)
2357 break;
2358
2359 if (inst->saturate) {
2360 if (inst->dst.type != inst->src[0].type)
2361 assert(!"unimplemented: saturate mixed types");
2362
2363 if (brw_saturate_immediate(inst->dst.type,
2364 &inst->src[0].fixed_hw_reg)) {
2365 inst->saturate = false;
2366 progress = true;
2367 }
2368 }
2369 break;
2370
2371 case BRW_OPCODE_MUL:
2372 if (inst->src[1].file != IMM)
2373 continue;
2374
2375 /* a * 1.0 = a */
2376 if (inst->src[1].is_one()) {
2377 inst->opcode = BRW_OPCODE_MOV;
2378 inst->src[1] = reg_undef;
2379 progress = true;
2380 break;
2381 }
2382
2383 /* a * -1.0 = -a */
2384 if (inst->src[1].is_negative_one()) {
2385 inst->opcode = BRW_OPCODE_MOV;
2386 inst->src[0].negate = !inst->src[0].negate;
2387 inst->src[1] = reg_undef;
2388 progress = true;
2389 break;
2390 }
2391
2392 /* a * 0.0 = 0.0 */
2393 if (inst->src[1].is_zero()) {
2394 inst->opcode = BRW_OPCODE_MOV;
2395 inst->src[0] = inst->src[1];
2396 inst->src[1] = reg_undef;
2397 progress = true;
2398 break;
2399 }
2400
2401 if (inst->src[0].file == IMM) {
2402 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2403 inst->opcode = BRW_OPCODE_MOV;
2404 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2405 inst->src[1] = reg_undef;
2406 progress = true;
2407 break;
2408 }
2409 break;
2410 case BRW_OPCODE_ADD:
2411 if (inst->src[1].file != IMM)
2412 continue;
2413
2414 /* a + 0.0 = a */
2415 if (inst->src[1].is_zero()) {
2416 inst->opcode = BRW_OPCODE_MOV;
2417 inst->src[1] = reg_undef;
2418 progress = true;
2419 break;
2420 }
2421
2422 if (inst->src[0].file == IMM) {
2423 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2424 inst->opcode = BRW_OPCODE_MOV;
2425 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2426 inst->src[1] = reg_undef;
2427 progress = true;
2428 break;
2429 }
2430 break;
2431 case BRW_OPCODE_OR:
2432 if (inst->src[0].equals(inst->src[1])) {
2433 inst->opcode = BRW_OPCODE_MOV;
2434 inst->src[1] = reg_undef;
2435 progress = true;
2436 break;
2437 }
2438 break;
2439 case BRW_OPCODE_LRP:
2440 if (inst->src[1].equals(inst->src[2])) {
2441 inst->opcode = BRW_OPCODE_MOV;
2442 inst->src[0] = inst->src[1];
2443 inst->src[1] = reg_undef;
2444 inst->src[2] = reg_undef;
2445 progress = true;
2446 break;
2447 }
2448 break;
2449 case BRW_OPCODE_CMP:
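/* -|x| >= 0 can only hold when x == 0, so a GE comparison of a
* negated-absolute-value source against zero is equivalent to a
* plain equality-to-zero check on the unmodified source.
*/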
2450 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2451 inst->src[0].abs &&
2452 inst->src[0].negate &&
2453 inst->src[1].is_zero()) {
2454 inst->src[0].abs = false;
2455 inst->src[0].negate = false;
2456 inst->conditional_mod = BRW_CONDITIONAL_Z;
2457 progress = true;
2458 break;
2459 }
2460 break;
2461 case BRW_OPCODE_SEL:
2462 if (inst->src[0].equals(inst->src[1])) {
2463 inst->opcode = BRW_OPCODE_MOV;
2464 inst->src[1] = reg_undef;
2465 inst->predicate = BRW_PREDICATE_NONE;
2466 inst->predicate_inverse = false;
2467 progress = true;
2468 } else if (inst->saturate && inst->src[1].file == IMM) {
2469 switch (inst->conditional_mod) {
2470 case BRW_CONDITIONAL_LE:
2471 case BRW_CONDITIONAL_L:
2472 switch (inst->src[1].type) {
2473 case BRW_REGISTER_TYPE_F:
2474 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2475 inst->opcode = BRW_OPCODE_MOV;
2476 inst->src[1] = reg_undef;
2477 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2478 progress = true;
2479 }
2480 break;
2481 default:
2482 break;
2483 }
2484 break;
2485 case BRW_CONDITIONAL_GE:
2486 case BRW_CONDITIONAL_G:
2487 switch (inst->src[1].type) {
2488 case BRW_REGISTER_TYPE_F:
2489 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2490 inst->opcode = BRW_OPCODE_MOV;
2491 inst->src[1] = reg_undef;
2492 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2493 progress = true;
2494 }
2495 break;
2496 default:
2497 break;
2498 }
2499 default:
2500 break;
2501 }
2502 }
2503 break;
2504 case BRW_OPCODE_MAD:
2505 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2506 inst->opcode = BRW_OPCODE_MOV;
2507 inst->src[1] = reg_undef;
2508 inst->src[2] = reg_undef;
2509 progress = true;
2510 } else if (inst->src[0].is_zero()) {
2511 inst->opcode = BRW_OPCODE_MUL;
2512 inst->src[0] = inst->src[2];
2513 inst->src[2] = reg_undef;
2514 progress = true;
2515 } else if (inst->src[1].is_one()) {
2516 inst->opcode = BRW_OPCODE_ADD;
2517 inst->src[1] = inst->src[2];
2518 inst->src[2] = reg_undef;
2519 progress = true;
2520 } else if (inst->src[2].is_one()) {
2521 inst->opcode = BRW_OPCODE_ADD;
2522 inst->src[2] = reg_undef;
2523 progress = true;
2524 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2525 inst->opcode = BRW_OPCODE_ADD;
2526 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2527 inst->src[2] = reg_undef;
2528 progress = true;
2529 }
2530 break;
2531 case SHADER_OPCODE_RCP: {
2532 fs_inst *prev = (fs_inst *)inst->prev;
2533 if (prev->opcode == SHADER_OPCODE_SQRT) {
2534 if (inst->src[0].equals(prev->dst)) {
2535 inst->opcode = SHADER_OPCODE_RSQ;
2536 inst->src[0] = prev->src[0];
2537 progress = true;
2538 }
2539 }
2540 break;
2541 }
2542 case SHADER_OPCODE_BROADCAST:
2543 if (is_uniform(inst->src[0])) {
2544 inst->opcode = BRW_OPCODE_MOV;
2545 inst->sources = 1;
2546 inst->force_writemask_all = true;
2547 progress = true;
2548 } else if (inst->src[1].file == IMM) {
2549 inst->opcode = BRW_OPCODE_MOV;
2550 inst->src[0] = component(inst->src[0],
2551 inst->src[1].fixed_hw_reg.dw1.ud);
2552 inst->sources = 1;
2553 inst->force_writemask_all = true;
2554 progress = true;
2555 }
2556 break;
2557
2558 default:
2559 break;
2560 }
2561
2562 /* Swap if src[0] is immediate. */
2563 if (progress && inst->is_commutative()) {
2564 if (inst->src[0].file == IMM) {
2565 fs_reg tmp = inst->src[1];
2566 inst->src[1] = inst->src[0];
2567 inst->src[0] = tmp;
2568 }
2569 }
2570 }
2571 return progress;
2572 }
2573
2574 /**
2575 * Optimize sample messages that have constant zero values for the trailing
2576 * texture coordinates. We can just reduce the message length for these
2577 * instructions instead of reserving a register for it. Trailing parameters
2578 * that aren't sent default to zero anyway. This will cause the dead code
2579 * eliminator to remove the MOV instruction that would otherwise be emitted to
2580 * set up the zero value.
2581 */
2582 bool
2583 fs_visitor::opt_zero_samples()
2584 {
2585 /* Gen4 infers the texturing opcode based on the message length so we can't
2586 * change it.
2587 */
2588 if (devinfo->gen < 5)
2589 return false;
2590
2591 bool progress = false;
2592
2593 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2594 if (!inst->is_tex())
2595 continue;
2596
2597 fs_inst *load_payload = (fs_inst *) inst->prev;
2598
2599 if (load_payload->is_head_sentinel() ||
2600 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2601 continue;
2602
2603 /* We don't want to remove the message header or the first parameter.
2604 * Removing the first parameter is not allowed; see the Haswell PRM
2605 * volume 7, page 149:
2606 *
2607 * "Parameter 0 is required except for the sampleinfo message, which
2608 * has no parameter 0"
2609 */
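/* Each trailing parameter occupies dispatch_width / 8 payload registers.
* The LOAD_PAYLOAD source holding the last parameter is therefore at
* index (mlen - header_size) / (dispatch_width / 8) + header_size - 1;
* keep trimming whole parameters while that source is a known zero.
*/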
2610 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2611 load_payload->src[(inst->mlen - inst->header_size) /
2612 (dispatch_width / 8) +
2613 inst->header_size - 1].is_zero()) {
2614 inst->mlen -= dispatch_width / 8;
2615 progress = true;
2616 }
2617 }
2618
2619 if (progress)
2620 invalidate_live_intervals();
2621
2622 return progress;
2623 }
2624
2625 /**
2626 * Optimize sample messages which are followed by the final RT write.
2627 *
2628 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2629 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2630 * final texturing results copied to the framebuffer write payload and modify
2631 * them to write to the framebuffer directly.
2632 */
2633 bool
2634 fs_visitor::opt_sampler_eot()
2635 {
2636 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2637
2638 if (stage != MESA_SHADER_FRAGMENT)
2639 return false;
2640
2641 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2642 return false;
2643
2644 /* FINISHME: It should be possible to implement this optimization when there
2645 * are multiple drawbuffers.
2646 */
2647 if (key->nr_color_regions != 1)
2648 return false;
2649
2650 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2651 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2652 assert(fb_write->eot);
2653 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2654
2655 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2656
2657 /* There wasn't one; nothing to do. */
2658 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2659 return false;
2660
2661 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2662 * It's very likely to be the previous instruction.
2663 */
2664 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2665 if (load_payload->is_head_sentinel() ||
2666 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2667 return false;
2668
2669 assert(!tex_inst->eot); /* We can't get here twice */
2670 assert((tex_inst->offset & (0xff << 24)) == 0);
2671
2672 tex_inst->offset |= fb_write->target << 24;
2673 tex_inst->eot = true;
2674 tex_inst->dst = reg_null_ud;
2675 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2676
2677 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2678 * to create a new LOAD_PAYLOAD command with the same sources and a space
2679 * saved for the header. Using a new destination register not only makes sure
2680 * we have enough space, but also ensures that the dead code eliminator kills
2681 * the instruction that this will replace.
2682 */
2683 if (tex_inst->header_size != 0)
2684 return true;
2685
2686 fs_reg send_header = vgrf(load_payload->sources + 1);
2687 fs_reg *new_sources =
2688 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2689
2690 new_sources[0] = fs_reg();
2691 for (int i = 0; i < load_payload->sources; i++)
2692 new_sources[i+1] = load_payload->src[i];
2693
2694 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2695 * requires a lot of information about the sources to appropriately figure
2696 * out the number of registers that need to be used. Given this stage in our
2697 * optimization, we may not have the appropriate GRFs required by
2698 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2699 * manually emit the instruction.
2700 */
2701 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2702 load_payload->exec_size,
2703 send_header,
2704 new_sources,
2705 load_payload->sources + 1);
2706
2707 new_load_payload->regs_written = load_payload->regs_written + 1;
2708 new_load_payload->header_size = 1;
2709 tex_inst->mlen++;
2710 tex_inst->header_size = 1;
2711 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2712 tex_inst->src[0] = send_header;
2713
2714 return true;
2715 }
2716
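/**
* Rename virtual GRFs that are completely redefined.
*
* Whenever a VGRF is fully rewritten (a whole-register, non-partial write
* outside of control flow) after it has already been defined, allocate a
* fresh VGRF for the new definition and rewrite subsequent uses to match.
* This keeps unrelated values from sharing a register and artificially
* extending live ranges.
*/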
2717 bool
2718 fs_visitor::opt_register_renaming()
2719 {
2720 bool progress = false;
2721 int depth = 0;
2722
2723 int remap[alloc.count];
2724 memset(remap, -1, sizeof(int) * alloc.count);
2725
2726 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2727 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2728 depth++;
2729 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2730 inst->opcode == BRW_OPCODE_WHILE) {
2731 depth--;
2732 }
2733
2734 /* Rewrite instruction sources. */
2735 for (int i = 0; i < inst->sources; i++) {
2736 if (inst->src[i].file == GRF &&
2737 remap[inst->src[i].reg] != -1 &&
2738 remap[inst->src[i].reg] != inst->src[i].reg) {
2739 inst->src[i].reg = remap[inst->src[i].reg];
2740 progress = true;
2741 }
2742 }
2743
2744 const int dst = inst->dst.reg;
2745
2746 if (depth == 0 &&
2747 inst->dst.file == GRF &&
2748 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2749 !inst->is_partial_write()) {
2750 if (remap[dst] == -1) {
2751 remap[dst] = dst;
2752 } else {
2753 remap[dst] = alloc.allocate(inst->dst.width / 8);
2754 inst->dst.reg = remap[dst];
2755 progress = true;
2756 }
2757 } else if (inst->dst.file == GRF &&
2758 remap[dst] != -1 &&
2759 remap[dst] != dst) {
2760 inst->dst.reg = remap[dst];
2761 progress = true;
2762 }
2763 }
2764
2765 if (progress) {
2766 invalidate_live_intervals();
2767
2768 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2769 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2770 delta_xy[i].reg = remap[delta_xy[i].reg];
2771 }
2772 }
2773 }
2774
2775 return progress;
2776 }
2777
2778 /**
2779 * Remove redundant or useless discard jumps.
2780 *
2781 * For example, we can eliminate jumps in the following sequence:
2782 *
2783 * discard-jump (redundant with the next jump)
2784 * discard-jump (useless; jumps to the next instruction)
2785 * placeholder-halt
2786 */
2787 bool
2788 fs_visitor::opt_redundant_discard_jumps()
2789 {
2790 bool progress = false;
2791
2792 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2793
2794 fs_inst *placeholder_halt = NULL;
2795 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2796 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2797 placeholder_halt = inst;
2798 break;
2799 }
2800 }
2801
2802 if (!placeholder_halt)
2803 return false;
2804
2805 /* Delete any discard jumps (HALTs) immediately before the placeholder halt. */
2806 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2807 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2808 prev = (fs_inst *) placeholder_halt->prev) {
2809 prev->remove(last_bblock);
2810 progress = true;
2811 }
2812
2813 if (progress)
2814 invalidate_live_intervals();
2815
2816 return progress;
2817 }
2818
2819 bool
2820 fs_visitor::compute_to_mrf()
2821 {
2822 bool progress = false;
2823 int next_ip = 0;
2824
2825 /* No MRFs on Gen >= 7. */
2826 if (devinfo->gen >= 7)
2827 return false;
2828
2829 calculate_live_intervals();
2830
2831 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2832 int ip = next_ip;
2833 next_ip++;
2834
2835 if (inst->opcode != BRW_OPCODE_MOV ||
2836 inst->is_partial_write() ||
2837 inst->dst.file != MRF || inst->src[0].file != GRF ||
2838 inst->dst.type != inst->src[0].type ||
2839 inst->src[0].abs || inst->src[0].negate ||
2840 !inst->src[0].is_contiguous() ||
2841 inst->src[0].subreg_offset)
2842 continue;
2843
2844 /* Work out which hardware MRF registers are written by this
2845 * instruction.
2846 */
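/* A COMPR4 destination writes both mrf_low and mrf_low + 4, while a
* SIMD16 write covers two adjacent MRFs; record both endpoints so the
* scan below can check for conflicts.
*/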
2847 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2848 int mrf_high;
2849 if (inst->dst.reg & BRW_MRF_COMPR4) {
2850 mrf_high = mrf_low + 4;
2851 } else if (inst->exec_size == 16) {
2852 mrf_high = mrf_low + 1;
2853 } else {
2854 mrf_high = mrf_low;
2855 }
2856
2857 /* Can't compute-to-MRF this GRF if someone else was going to
2858 * read it later.
2859 */
2860 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2861 continue;
2862
2863 /* Found a move of a GRF to an MRF. Let's see if we can
2864 * rewrite the instruction that produced this GRF to write into the MRF.
2865 */
2866 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2867 if (scan_inst->dst.file == GRF &&
2868 scan_inst->dst.reg == inst->src[0].reg) {
2869 /* Found the last instruction to write the reg we want to turn
2870 * into a compute-to-MRF.
2871 */
2872
2873 /* If this one instruction didn't populate all the
2874 * channels, bail. We might be able to rewrite everything
2875 * that writes that reg, but it would require smarter
2876 * tracking to delay the rewriting until complete success.
2877 */
2878 if (scan_inst->is_partial_write())
2879 break;
2880
2881 /* Instructions writing more than one register would need us to
2882 * understand coalescing out more than one MOV at a time.
2883 */
2884 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2885 break;
2886
2887 /* SEND instructions can't have MRF as a destination. */
2888 if (scan_inst->mlen)
2889 break;
2890
2891 if (devinfo->gen == 6) {
2892 /* gen6 math instructions must have the destination be
2893 * GRF, so no compute-to-MRF for them.
2894 */
2895 if (scan_inst->is_math()) {
2896 break;
2897 }
2898 }
2899
2900 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2901 /* Found the creator of our MRF's source value. */
2902 scan_inst->dst.file = MRF;
2903 scan_inst->dst.reg = inst->dst.reg;
2904 scan_inst->saturate |= inst->saturate;
2905 inst->remove(block);
2906 progress = true;
2907 }
2908 break;
2909 }
2910
2911 /* We don't handle control flow here. Most computation of
2912 * values that end up in MRFs happens shortly before the MRF
2913 * write anyway.
2914 */
2915 if (block->start() == scan_inst)
2916 break;
2917
2918 /* You can't read from an MRF, so if someone else reads our
2919 * MRF's source GRF that we wanted to rewrite, that stops us.
2920 */
2921 bool interfered = false;
2922 for (int i = 0; i < scan_inst->sources; i++) {
2923 if (scan_inst->src[i].file == GRF &&
2924 scan_inst->src[i].reg == inst->src[0].reg &&
2925 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2926 interfered = true;
2927 }
2928 }
2929 if (interfered)
2930 break;
2931
2932 if (scan_inst->dst.file == MRF) {
2933 /* If somebody else writes our MRF here, we can't
2934 * compute-to-MRF before that.
2935 */
2936 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2937 int scan_mrf_high;
2938
2939 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2940 scan_mrf_high = scan_mrf_low + 4;
2941 } else if (scan_inst->exec_size == 16) {
2942 scan_mrf_high = scan_mrf_low + 1;
2943 } else {
2944 scan_mrf_high = scan_mrf_low;
2945 }
2946
2947 if (mrf_low == scan_mrf_low ||
2948 mrf_low == scan_mrf_high ||
2949 mrf_high == scan_mrf_low ||
2950 mrf_high == scan_mrf_high) {
2951 break;
2952 }
2953 }
2954
2955 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2956 /* Found a SEND instruction, which means that there are
2957 * live values in MRFs from base_mrf to base_mrf +
2958 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2959 * above it.
2960 */
2961 if (mrf_low >= scan_inst->base_mrf &&
2962 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2963 break;
2964 }
2965 if (mrf_high >= scan_inst->base_mrf &&
2966 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2967 break;
2968 }
2969 }
2970 }
2971 }
2972
2973 if (progress)
2974 invalidate_live_intervals();
2975
2976 return progress;
2977 }
2978
2979 /**
2980 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2981 * flow. We could probably do better here with some form of divergence
2982 * analysis.
2983 */
2984 bool
2985 fs_visitor::eliminate_find_live_channel()
2986 {
2987 bool progress = false;
2988 unsigned depth = 0;
2989
2990 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2991 switch (inst->opcode) {
2992 case BRW_OPCODE_IF:
2993 case BRW_OPCODE_DO:
2994 depth++;
2995 break;
2996
2997 case BRW_OPCODE_ENDIF:
2998 case BRW_OPCODE_WHILE:
2999 depth--;
3000 break;
3001
3002 case FS_OPCODE_DISCARD_JUMP:
3003 /* This can potentially make control flow non-uniform until the end
3004 * of the program.
3005 */
3006 return progress;
3007
3008 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
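/* When not nested inside any control flow (and before any discard
* jump), the pass assumes channel 0 is a valid live channel and folds
* the scan down to a constant 0.
*/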
3009 if (depth == 0) {
3010 inst->opcode = BRW_OPCODE_MOV;
3011 inst->src[0] = fs_reg(0);
3012 inst->sources = 1;
3013 inst->force_writemask_all = true;
3014 progress = true;
3015 }
3016 break;
3017
3018 default:
3019 break;
3020 }
3021 }
3022
3023 return progress;
3024 }
3025
3026 /**
3027 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
3028 * instructions to FS_OPCODE_REP_FB_WRITE.
3029 */
3030 void
3031 fs_visitor::emit_repclear_shader()
3032 {
3033 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3034 int base_mrf = 1;
3035 int color_mrf = base_mrf + 2;
3036
3037 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
3038 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
3039 mov->force_writemask_all = true;
3040
3041 fs_inst *write;
3042 if (key->nr_color_regions == 1) {
3043 write = emit(FS_OPCODE_REP_FB_WRITE);
3044 write->saturate = key->clamp_fragment_color;
3045 write->base_mrf = color_mrf;
3046 write->target = 0;
3047 write->header_size = 0;
3048 write->mlen = 1;
3049 } else {
3050 assume(key->nr_color_regions > 0);
3051 for (int i = 0; i < key->nr_color_regions; ++i) {
3052 write = emit(FS_OPCODE_REP_FB_WRITE);
3053 write->saturate = key->clamp_fragment_color;
3054 write->base_mrf = base_mrf;
3055 write->target = i;
3056 write->header_size = 2;
3057 write->mlen = 3;
3058 }
3059 }
3060 write->eot = true;
3061
3062 calculate_cfg();
3063
3064 assign_constant_locations();
3065 assign_curb_setup();
3066
3067 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
3068 assert(mov->src[0].file == HW_REG);
3069 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
3070 }
3071
3072 /**
3073 * Walks through basic blocks, looking for repeated MRF writes and
3074 * removing the later ones.
3075 */
3076 bool
3077 fs_visitor::remove_duplicate_mrf_writes()
3078 {
3079 fs_inst *last_mrf_move[16];
3080 bool progress = false;
3081
3082 /* Need to update the MRF tracking for compressed instructions. */
3083 if (dispatch_width == 16)
3084 return false;
3085
3086 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3087
3088 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3089 if (inst->is_control_flow()) {
3090 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3091 }
3092
3093 if (inst->opcode == BRW_OPCODE_MOV &&
3094 inst->dst.file == MRF) {
3095 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3096 if (prev_inst && inst->equals(prev_inst)) {
3097 inst->remove(block);
3098 progress = true;
3099 continue;
3100 }
3101 }
3102
3103 /* Clear out the last-write records for MRFs that were overwritten. */
3104 if (inst->dst.file == MRF) {
3105 last_mrf_move[inst->dst.reg] = NULL;
3106 }
3107
3108 if (inst->mlen > 0 && inst->base_mrf != -1) {
3109 /* Found a SEND instruction, which will include two or fewer
3110 * implied MRF writes. We could do better here.
3111 */
3112 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3113 last_mrf_move[inst->base_mrf + i] = NULL;
3114 }
3115 }
3116
3117 /* Clear out any MRF move records whose sources got overwritten. */
3118 if (inst->dst.file == GRF) {
3119 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3120 if (last_mrf_move[i] &&
3121 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3122 last_mrf_move[i] = NULL;
3123 }
3124 }
3125 }
3126
3127 if (inst->opcode == BRW_OPCODE_MOV &&
3128 inst->dst.file == MRF &&
3129 inst->src[0].file == GRF &&
3130 !inst->is_partial_write()) {
3131 last_mrf_move[inst->dst.reg] = inst;
3132 }
3133 }
3134
3135 if (progress)
3136 invalidate_live_intervals();
3137
3138 return progress;
3139 }
3140
3141 static void
3142 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3143 {
3144 /* Clear the flag for registers that actually got read (as expected). */
3145 for (int i = 0; i < inst->sources; i++) {
3146 int grf;
3147 if (inst->src[i].file == GRF) {
3148 grf = inst->src[i].reg;
3149 } else if (inst->src[i].file == HW_REG &&
3150 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3151 grf = inst->src[i].fixed_hw_reg.nr;
3152 } else {
3153 continue;
3154 }
3155
3156 if (grf >= first_grf &&
3157 grf < first_grf + grf_len) {
3158 deps[grf - first_grf] = false;
3159 if (inst->exec_size == 16)
3160 deps[grf - first_grf + 1] = false;
3161 }
3162 }
3163 }
3164
3165 /**
3166 * Implements this workaround for the original 965:
3167 *
3168 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3169 * check for post destination dependencies on this instruction, software
3170 * must ensure that there is no destination hazard for the case of ‘write
3171 * followed by a posted write’ shown in the following example.
3172 *
3173 * 1. mov r3 0
3174 * 2. send r3.xy <rest of send instruction>
3175 * 3. mov r2 r3
3176 *
3177 * Due to no post-destination dependency check on the ‘send’, the above
3178 * code sequence could have two instructions (1 and 2) in flight at the
3179 * same time that both consider ‘r3’ as the target of their final writes.
3180 */
3181 void
3182 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3183 fs_inst *inst)
3184 {
3185 int write_len = inst->regs_written;
3186 int first_write_grf = inst->dst.reg;
3187 bool needs_dep[BRW_MAX_MRF];
3188 assert(write_len < (int)sizeof(needs_dep) - 1);
3189
3190 memset(needs_dep, false, sizeof(needs_dep));
3191 memset(needs_dep, true, write_len);
3192
3193 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3194
3195 /* Walk backwards looking for writes to registers we're writing which
3196 * aren't read since being written. If we hit the start of the program,
3197 * we assume that there are no outstanding dependencies on entry to the
3198 * program.
3199 */
3200 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3201 /* If we hit control flow, assume that there *are* outstanding
3202 * dependencies, and force their cleanup before our instruction.
3203 */
3204 if (block->start() == scan_inst) {
3205 for (int i = 0; i < write_len; i++) {
3206 if (needs_dep[i]) {
3207 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3208 }
3209 }
3210 return;
3211 }
3212
3213 /* We insert our reads as late as possible on the assumption that any
3214 * instruction but a MOV that might have left us an outstanding
3215 * dependency has more latency than a MOV.
3216 */
3217 if (scan_inst->dst.file == GRF) {
3218 for (int i = 0; i < scan_inst->regs_written; i++) {
3219 int reg = scan_inst->dst.reg + i;
3220
3221 if (reg >= first_write_grf &&
3222 reg < first_write_grf + write_len &&
3223 needs_dep[reg - first_write_grf]) {
3224 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3225 needs_dep[reg - first_write_grf] = false;
3226 if (scan_inst->exec_size == 16)
3227 needs_dep[reg - first_write_grf + 1] = false;
3228 }
3229 }
3230 }
3231
3232 /* Clear the flag for registers that actually got read (as expected). */
3233 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3234
3235 /* Continue the loop only if we haven't resolved all the dependencies */
3236 int i;
3237 for (i = 0; i < write_len; i++) {
3238 if (needs_dep[i])
3239 break;
3240 }
3241 if (i == write_len)
3242 return;
3243 }
3244 }
3245
3246 /**
3247 * Implements this workaround for the original 965:
3248 *
3249 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3250 * used as a destination register until after it has been sourced by an
3251 * instruction with a different destination register.
3252 */
3253 void
3254 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3255 {
3256 int write_len = inst->regs_written;
3257 int first_write_grf = inst->dst.reg;
3258 bool needs_dep[BRW_MAX_MRF];
3259 assert(write_len < (int)sizeof(needs_dep) - 1);
3260
3261 memset(needs_dep, false, sizeof(needs_dep));
3262 memset(needs_dep, true, write_len);
3263 /* Walk forwards looking for writes to registers we're writing which aren't
3264 * read before being written.
3265 */
3266 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3267 /* If we hit control flow, force resolve all remaining dependencies. */
3268 if (block->end() == scan_inst) {
3269 for (int i = 0; i < write_len; i++) {
3270 if (needs_dep[i])
3271 scan_inst->insert_before(block,
3272 DEP_RESOLVE_MOV(first_write_grf + i));
3273 }
3274 return;
3275 }
3276
3277 /* Clear the flag for registers that actually got read (as expected). */
3278 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3279
3280 /* We insert our reads as late as possible since they're reading the
3281 * result of a SEND, which has massive latency.
3282 */
3283 if (scan_inst->dst.file == GRF &&
3284 scan_inst->dst.reg >= first_write_grf &&
3285 scan_inst->dst.reg < first_write_grf + write_len &&
3286 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3287 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3288 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3289 }
3290
3291 /* Continue the loop only if we haven't resolved all the dependencies */
3292 int i;
3293 for (i = 0; i < write_len; i++) {
3294 if (needs_dep[i])
3295 break;
3296 }
3297 if (i == write_len)
3298 return;
3299 }
3300 }
3301
3302 void
3303 fs_visitor::insert_gen4_send_dependency_workarounds()
3304 {
3305 if (devinfo->gen != 4 || devinfo->is_g4x)
3306 return;
3307
3308 bool progress = false;
3309
3310 /* Note that we're done with register allocation, so GRF fs_regs always
3311 * have a .reg_offset of 0.
3312 */
3313
3314 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3315 if (inst->mlen != 0 && inst->dst.file == GRF) {
3316 insert_gen4_pre_send_dependency_workarounds(block, inst);
3317 insert_gen4_post_send_dependency_workarounds(block, inst);
3318 progress = true;
3319 }
3320 }
3321
3322 if (progress)
3323 invalidate_live_intervals();
3324 }
3325
3326 /**
3327 * Turns the generic expression-style uniform pull constant load instruction
3328 * into a hardware-specific series of instructions for loading a pull
3329 * constant.
3330 *
3331 * The expression style allows the CSE pass before this to optimize out
3332 * repeated loads from the same offset, and gives the pre-register-allocation
3333 * scheduling full flexibility, while the conversion to native instructions
3334 * allows the post-register-allocation scheduler the best information
3335 * possible.
3336 *
3337 * Note that execution masking for setting up pull constant loads is special:
3338 * the channels that need to be written are unrelated to the current execution
3339 * mask, since a later instruction will use one of the result channels as a
3340 * source operand for all 8 or 16 of its channels.
3341 */
3342 void
3343 fs_visitor::lower_uniform_pull_constant_loads()
3344 {
3345 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3346 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3347 continue;
3348
3349 if (devinfo->gen >= 7) {
3350 /* Up to this point the offset arg has been a vec4-aligned byte offset. We need to
3351 * turn it into a dword offset.
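* E.g. (hypothetical value), a byte offset of 32 becomes dword offset 8.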
3352 */
3353 fs_reg const_offset_reg = inst->src[1];
3354 assert(const_offset_reg.file == IMM &&
3355 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3356 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3357 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3358
3359 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3360 * Reserve space for the register.
3361 */
3362 if (devinfo->gen >= 9) {
3363 payload.reg_offset++;
3364 alloc.sizes[payload.reg] = 2;
3365 }
3366
3367 /* This is actually going to be a MOV, but since only the first dword
3368 * is accessed, we have a special opcode to do just that one. Note
3369 * that this needs to be an operation that will be considered a def
3370 * by live variable analysis, or register allocation will explode.
3371 */
3372 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3373 8, payload, const_offset_reg);
3374 setup->force_writemask_all = true;
3375
3376 setup->ir = inst->ir;
3377 setup->annotation = inst->annotation;
3378 inst->insert_before(block, setup);
3379
3380 /* Similarly, this will only populate the first 4 channels of the
3381 * result register (since we only use smear values from 0-3), but we
3382 * don't tell the optimizer.
3383 */
3384 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3385 inst->src[1] = payload;
3386
3387 invalidate_live_intervals();
3388 } else {
3389 /* Before register allocation, we didn't tell the scheduler about the
3390 * MRF we use. We know it's safe to use this MRF because nothing
3391 * else does except for register spill/unspill, which generates and
3392 * uses its MRF within a single IR instruction.
3393 */
3394 inst->base_mrf = 14;
3395 inst->mlen = 1;
3396 }
3397 }
3398 }
3399
3400 bool
3401 fs_visitor::lower_load_payload()
3402 {
3403 bool progress = false;
3404
3405 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3406 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3407 continue;
3408
3409 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3410 assert(inst->saturate == false);
3411
3412 fs_reg dst = inst->dst;
3413
3414 /* Get rid of COMPR4. We'll add it back in if we need it */
3415 if (dst.file == MRF)
3416 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
3417
3418 dst.width = 8;
3419 for (uint8_t i = 0; i < inst->header_size; i++) {
3420 if (inst->src[i].file != BAD_FILE) {
3421 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3422 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3423 mov_src.width = 8;
3424 fs_inst *mov = MOV(mov_dst, mov_src);
3425 mov->force_writemask_all = true;
3426 inst->insert_before(block, mov);
3427 }
3428 dst = offset(dst, 1);
3429 }
3430
3431 dst.width = inst->exec_size;
3432 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
3433 inst->exec_size > 8) {
3434 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3435 * a straightforward copy. Instead, the result of the
3436 * LOAD_PAYLOAD is treated as interleaved and the first four
3437 * non-header sources are unpacked as:
3438 *
3439 * m + 0: r0
3440 * m + 1: g0
3441 * m + 2: b0
3442 * m + 3: a0
3443 * m + 4: r1
3444 * m + 5: g1
3445 * m + 6: b1
3446 * m + 7: a1
3447 *
3448 * This is used for gen <= 5 fb writes.
3449 */
3450 assert(inst->exec_size == 16);
3451 assert(inst->header_size + 4 <= inst->sources);
3452 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3453 if (inst->src[i].file != BAD_FILE) {
3454 if (devinfo->has_compr4) {
3455 fs_reg compr4_dst = retype(dst, inst->src[i].type);
3456 compr4_dst.reg |= BRW_MRF_COMPR4;
3457
3458 fs_inst *mov = MOV(compr4_dst, inst->src[i]);
3459 mov->force_writemask_all = inst->force_writemask_all;
3460 inst->insert_before(block, mov);
3461 } else {
3462 /* Platform doesn't have COMPR4. We have to fake it */
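/* Emit two SIMD8 MOVs instead: the first half to this register and
* the second half (force_sechalf) four registers further on, matching
* the interleaved layout COMPR4 would have produced.
*/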
3463 fs_reg mov_dst = retype(dst, inst->src[i].type);
3464 mov_dst.width = 8;
3465
3466 fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0));
3467 mov->force_writemask_all = inst->force_writemask_all;
3468 inst->insert_before(block, mov);
3469
3470 mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1));
3471 mov->force_writemask_all = inst->force_writemask_all;
3472 mov->force_sechalf = true;
3473 inst->insert_before(block, mov);
3474 }
3475 }
3476
3477 dst.reg++;
3478 }
3479
3480 /* The loop above only ever incremented us through the first set
3481 * of 4 registers. However, thanks to the magic of COMPR4, we
3482 * actually wrote to the first 8 registers, so we need to take
3483 * that into account now.
3484 */
3485 dst.reg += 4;
3486
3487 /* The COMPR4 code took care of the first 4 sources. We'll let
3488 * the regular path handle any remaining sources. Yes, we are
3489 * modifying the instruction but we're about to delete it so
3490 * this really doesn't hurt anything.
3491 */
3492 inst->header_size += 4;
3493 }
3494
3495 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3496 if (inst->src[i].file != BAD_FILE) {
3497 fs_inst *mov = MOV(retype(dst, inst->src[i].type),
3498 inst->src[i]);
3499 mov->force_writemask_all = inst->force_writemask_all;
3500 inst->insert_before(block, mov);
3501 }
3502 dst = offset(dst, 1);
3503 }
3504
3505 inst->remove(block);
3506 progress = true;
3507 }
3508
3509 if (progress)
3510 invalidate_live_intervals();
3511
3512 return progress;
3513 }
3514
3515 void
3516 fs_visitor::dump_instructions()
3517 {
3518 dump_instructions(NULL);
3519 }
3520
3521 void
3522 fs_visitor::dump_instructions(const char *name)
3523 {
3524 FILE *file = stderr;
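/* Only redirect the dump to the named file for non-root users; on
* fopen failure (or as root) fall back to stderr.
*/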
3525 if (name && geteuid() != 0) {
3526 file = fopen(name, "w");
3527 if (!file)
3528 file = stderr;
3529 }
3530
3531 if (cfg) {
3532 calculate_register_pressure();
3533 int ip = 0, max_pressure = 0;
3534 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3535 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3536 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3537 dump_instruction(inst, file);
3538 ip++;
3539 }
3540 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3541 } else {
3542 int ip = 0;
3543 foreach_in_list(backend_instruction, inst, &instructions) {
3544 fprintf(file, "%4d: ", ip++);
3545 dump_instruction(inst, file);
3546 }
3547 }
3548
3549 if (file != stderr) {
3550 fclose(file);
3551 }
3552 }
3553
3554 void
3555 fs_visitor::dump_instruction(backend_instruction *be_inst)
3556 {
3557 dump_instruction(be_inst, stderr);
3558 }
3559
3560 void
3561 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3562 {
3563 fs_inst *inst = (fs_inst *)be_inst;
3564
3565 if (inst->predicate) {
3566 fprintf(file, "(%cf0.%d) ",
3567 inst->predicate_inverse ? '-' : '+',
3568 inst->flag_subreg);
3569 }
3570
3571 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3572 if (inst->saturate)
3573 fprintf(file, ".sat");
3574 if (inst->conditional_mod) {
3575 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3576 if (!inst->predicate &&
3577 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3578 inst->opcode != BRW_OPCODE_IF &&
3579 inst->opcode != BRW_OPCODE_WHILE))) {
3580 fprintf(file, ".f0.%d", inst->flag_subreg);
3581 }
3582 }
3583 fprintf(file, "(%d) ", inst->exec_size);
3584
3585
3586 switch (inst->dst.file) {
3587 case GRF:
3588 fprintf(file, "vgrf%d", inst->dst.reg);
3589 if (inst->dst.width != dispatch_width)
3590 fprintf(file, "@%d", inst->dst.width);
3591 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3592 inst->dst.subreg_offset)
3593 fprintf(file, "+%d.%d",
3594 inst->dst.reg_offset, inst->dst.subreg_offset);
3595 break;
3596 case MRF:
3597 fprintf(file, "m%d", inst->dst.reg);
3598 break;
3599 case BAD_FILE:
3600 fprintf(file, "(null)");
3601 break;
3602 case UNIFORM:
3603 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3604 break;
3605 case ATTR:
3606 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3607 break;
3608 case HW_REG:
3609 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3610 switch (inst->dst.fixed_hw_reg.nr) {
3611 case BRW_ARF_NULL:
3612 fprintf(file, "null");
3613 break;
3614 case BRW_ARF_ADDRESS:
3615 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3616 break;
3617 case BRW_ARF_ACCUMULATOR:
3618 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3619 break;
3620 case BRW_ARF_FLAG:
3621 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3622 inst->dst.fixed_hw_reg.subnr);
3623 break;
3624 default:
3625 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3626 inst->dst.fixed_hw_reg.subnr);
3627 break;
3628 }
3629 } else {
3630 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3631 }
3632 if (inst->dst.fixed_hw_reg.subnr)
3633 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3634 break;
3635 default:
3636 fprintf(file, "???");
3637 break;
3638 }
3639 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3640
3641 for (int i = 0; i < inst->sources; i++) {
3642 if (inst->src[i].negate)
3643 fprintf(file, "-");
3644 if (inst->src[i].abs)
3645 fprintf(file, "|");
3646 switch (inst->src[i].file) {
3647 case GRF:
3648 fprintf(file, "vgrf%d", inst->src[i].reg);
3649 if (inst->src[i].width != dispatch_width)
3650 fprintf(file, "@%d", inst->src[i].width);
3651 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3652 inst->src[i].subreg_offset)
3653 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3654 inst->src[i].subreg_offset);
3655 break;
3656 case MRF:
3657 fprintf(file, "***m%d***", inst->src[i].reg);
3658 break;
3659 case ATTR:
3660 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3661 break;
3662 case UNIFORM:
3663 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3664 if (inst->src[i].reladdr) {
3665 fprintf(file, "+reladdr");
3666 } else if (inst->src[i].subreg_offset) {
3667 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3668 inst->src[i].subreg_offset);
3669 }
3670 break;
3671 case BAD_FILE:
3672 fprintf(file, "(null)");
3673 break;
3674 case IMM:
3675 switch (inst->src[i].type) {
3676 case BRW_REGISTER_TYPE_F:
3677 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3678 break;
3679 case BRW_REGISTER_TYPE_W:
3680 case BRW_REGISTER_TYPE_D:
3681 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3682 break;
3683 case BRW_REGISTER_TYPE_UW:
3684 case BRW_REGISTER_TYPE_UD:
3685 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3686 break;
3687 case BRW_REGISTER_TYPE_VF:
3688 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3689 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3690 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3691 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3692 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3693 break;
3694 default:
3695 fprintf(file, "???");
3696 break;
3697 }
3698 break;
3699 case HW_REG:
3700 if (inst->src[i].fixed_hw_reg.negate)
3701 fprintf(file, "-");
3702 if (inst->src[i].fixed_hw_reg.abs)
3703 fprintf(file, "|");
3704 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3705 switch (inst->src[i].fixed_hw_reg.nr) {
3706 case BRW_ARF_NULL:
3707 fprintf(file, "null");
3708 break;
3709 case BRW_ARF_ADDRESS:
3710 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3711 break;
3712 case BRW_ARF_ACCUMULATOR:
3713 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3714 break;
3715 case BRW_ARF_FLAG:
3716 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3717 inst->src[i].fixed_hw_reg.subnr);
3718 break;
3719 default:
3720 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3721 inst->src[i].fixed_hw_reg.subnr);
3722 break;
3723 }
3724 } else {
3725 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3726 }
3727 if (inst->src[i].fixed_hw_reg.subnr)
3728 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3729 if (inst->src[i].fixed_hw_reg.abs)
3730 fprintf(file, "|");
3731 break;
3732 default:
3733 fprintf(file, "???");
3734 break;
3735 }
3736 if (inst->src[i].abs)
3737 fprintf(file, "|");
3738
3739 if (inst->src[i].file != IMM) {
3740 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3741 }
3742
3743 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3744 fprintf(file, ", ");
3745 }
3746
3747 fprintf(file, " ");
3748
3749 if (dispatch_width == 16 && inst->exec_size == 8) {
3750 if (inst->force_sechalf)
3751 fprintf(file, "2ndhalf ");
3752 else
3753 fprintf(file, "1sthalf ");
3754 }
3755
3756 fprintf(file, "\n");
3757 }
3758
3759 /**
3760 * Possibly returns an instruction that set up @param reg.
3761 *
3762 * Sometimes we want to take the result of some expression/variable
3763 * dereference tree and rewrite the instruction generating the result
3764 * of the tree. When processing the tree, we know that the
3765 * instructions generated are all writing temporaries that are dead
3766 * outside of this tree. So, if we have some instructions that write
3767 * a temporary, we're free to point that temp write somewhere else.
3768 *
3769 * Note that this doesn't guarantee that the returned instruction wrote
3770 * only reg -- it might be the size=4 destination of a texture instruction.
3771 */
3772 fs_inst *
3773 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3774 fs_inst *end,
3775 const fs_reg &reg)
3776 {
3777 if (end == start ||
3778 end->is_partial_write() ||
3779 reg.reladdr ||
3780 !reg.equals(end->dst)) {
3781 return NULL;
3782 } else {
3783 return end;
3784 }
3785 }
3786
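/**
 * Set up the fragment shader thread payload layout for gen6 and later.
 *
 * Walks the fixed payload registers in order (masks and pixel X/Y
 * coordinates, barycentric coordinates for each enabled interpolation
 * mode, source depth, source W, the MSAA position offsets and the input
 * coverage mask), recording the starting register of each section and
 * the total number of payload registers consumed.
 */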
3787 void
3788 fs_visitor::setup_payload_gen6()
3789 {
3790 bool uses_depth =
3791 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3792 unsigned barycentric_interp_modes =
3793 (stage == MESA_SHADER_FRAGMENT) ?
3794 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3795
3796 assert(devinfo->gen >= 6);
3797
3798 /* R0-1: masks, pixel X/Y coordinates. */
3799 payload.num_regs = 2;
3800 /* R2: only for 32-pixel dispatch. */
3801
3802 /* R3-26: barycentric interpolation coordinates. These appear in the
3803 * same order that they appear in the brw_wm_barycentric_interp_mode
3804 * enum. Each set of coordinates occupies 2 registers if dispatch width
3805 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3806 * appear if they were enabled using the "Barycentric Interpolation
3807 * Mode" bits in WM_STATE.
3808 */
3809 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3810 if (barycentric_interp_modes & (1 << i)) {
3811 payload.barycentric_coord_reg[i] = payload.num_regs;
3812 payload.num_regs += 2;
3813 if (dispatch_width == 16) {
3814 payload.num_regs += 2;
3815 }
3816 }
3817 }
3818
3819 /* R27: interpolated depth if uses source depth */
3820 if (uses_depth) {
3821 payload.source_depth_reg = payload.num_regs;
3822 payload.num_regs++;
3823 if (dispatch_width == 16) {
3824 /* R28: interpolated depth if not SIMD8. */
3825 payload.num_regs++;
3826 }
3827 }
3828 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3829 if (uses_depth) {
3830 payload.source_w_reg = payload.num_regs;
3831 payload.num_regs++;
3832 if (dispatch_width == 16) {
3833 /* R30: interpolated W if not SIMD8. */
3834 payload.num_regs++;
3835 }
3836 }
3837
3838 if (stage == MESA_SHADER_FRAGMENT) {
3839 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3840 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3841 prog_data->uses_pos_offset = key->compute_pos_offset;
3842 /* R31: MSAA position offsets. */
3843 if (prog_data->uses_pos_offset) {
3844 payload.sample_pos_reg = payload.num_regs;
3845 payload.num_regs++;
3846 }
3847 }
3848
3849 /* R32: MSAA input coverage mask */
3850 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3851 assert(devinfo->gen >= 7);
3852 payload.sample_mask_in_reg = payload.num_regs;
3853 payload.num_regs++;
3854 if (dispatch_width == 16) {
3855 /* R33: input coverage mask if not SIMD8. */
3856 payload.num_regs++;
3857 }
3858 }
3859
3860 /* R34-: bary for 32-pixel. */
3861 /* R58-59: interp W for 32-pixel. */
3862
3863 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3864 source_depth_to_render_target = true;
3865 }
3866 }
3867
3868 void
3869 fs_visitor::setup_vs_payload()
3870 {
3871 /* R0: thread header, R1: urb handles */
3872 payload.num_regs = 2;
3873 }
3874
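/**
 * Set up the compute shader thread payload: only R0, the thread header,
 * is counted here.
 */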
3875 void
3876 fs_visitor::setup_cs_payload()
3877 {
3878 assert(brw->gen >= 7);
3879
3880 payload.num_regs = 1;
3881 }
3882
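/**
 * Assign binding table slots for the fragment shader.
 *
 * Render targets occupy the first slots (at least one, since a null
 * renderbuffer is still written when there are no color regions),
 * followed by the entries shared with the other stages (textures, pull
 * constants and so on).
 */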
3883 void
3884 fs_visitor::assign_binding_table_offsets()
3885 {
3886 assert(stage == MESA_SHADER_FRAGMENT);
3887 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3888 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3889 uint32_t next_binding_table_offset = 0;
3890
3891 /* If there are no color regions, we still perform an FB write to a null
3892 * renderbuffer, which we place at surface index 0.
3893 */
3894 prog_data->binding_table.render_target_start = next_binding_table_offset;
3895 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3896
3897 assign_common_binding_table_offsets(next_binding_table_offset);
3898 }
3899
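/**
 * Estimate register pressure by summing, for each instruction IP, the
 * sizes of all virtual GRFs whose live intervals cover that IP.  The
 * per-IP totals are stored in regs_live_at_ip.
 */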
3900 void
3901 fs_visitor::calculate_register_pressure()
3902 {
3903 invalidate_live_intervals();
3904 calculate_live_intervals();
3905
3906 unsigned num_instructions = 0;
3907 foreach_block(block, cfg)
3908 num_instructions += block->instructions.length();
3909
3910 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3911
3912 for (unsigned reg = 0; reg < alloc.count; reg++) {
3913 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3914 regs_live_at_ip[ip] += alloc.sizes[reg];
3915 }
3916 }
3917
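/**
 * Run the LIR optimization loop.
 *
 * After lowering uniform array access and assigning constant locations,
 * the passes below are repeated until none of them makes further
 * progress, followed by a handful of one-shot lowering passes.
 */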
3918 void
3919 fs_visitor::optimize()
3920 {
3921 split_virtual_grfs();
3922
3923 move_uniform_array_access_to_pull_constants();
3924 assign_constant_locations();
3925 demote_pull_constants();
3926
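/* Run an optimization pass; if INTEL_DEBUG=optimizer is set and the pass
 * made progress, dump the resulting instruction list to a per-pass file.
 * The pass's result is accumulated into the enclosing `progress' flag and
 * also returned.
 */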
3927 #define OPT(pass, args...) ({ \
3928 pass_num++; \
3929 bool this_progress = pass(args); \
3930 \
3931 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3932 char filename[64]; \
3933 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3934 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3935 \
3936 backend_visitor::dump_instructions(filename); \
3937 } \
3938 \
3939 progress = progress || this_progress; \
3940 this_progress; \
3941 })
3942
3943 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3944 char filename[64];
3945 snprintf(filename, 64, "%s%d-%04d-00-start",
3946 stage_abbrev, dispatch_width,
3947 shader_prog ? shader_prog->Name : 0);
3948
3949 backend_visitor::dump_instructions(filename);
3950 }
3951
3952 bool progress;
3953 int iteration = 0;
3954 int pass_num = 0;
3955 do {
3956 progress = false;
3957 pass_num = 0;
3958 iteration++;
3959
3960 OPT(remove_duplicate_mrf_writes);
3961
3962 OPT(opt_algebraic);
3963 OPT(opt_cse);
3964 OPT(opt_copy_propagate);
3965 OPT(opt_peephole_predicated_break);
3966 OPT(opt_cmod_propagation);
3967 OPT(dead_code_eliminate);
3968 OPT(opt_peephole_sel);
3969 OPT(dead_control_flow_eliminate, this);
3970 OPT(opt_register_renaming);
3971 OPT(opt_redundant_discard_jumps);
3972 OPT(opt_saturate_propagation);
3973 OPT(opt_zero_samples);
3974 OPT(register_coalesce);
3975 OPT(compute_to_mrf);
3976 OPT(eliminate_find_live_channel);
3977
3978 OPT(compact_virtual_grfs);
3979 } while (progress);
3980
3981 pass_num = 0;
3982
3983 OPT(opt_sampler_eot);
3984
3985 if (OPT(lower_load_payload)) {
3986 split_virtual_grfs();
3987 OPT(register_coalesce);
3988 OPT(compute_to_mrf);
3989 OPT(dead_code_eliminate);
3990 }
3991
3992 OPT(opt_combine_constants);
3993
3994 lower_uniform_pull_constant_loads();
3995 }
3996
3997 /**
3998 * A three-source instruction must have a GRF/MRF destination register.
3999 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
4000 */
4001 void
4002 fs_visitor::fixup_3src_null_dest()
4003 {
4004 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4005 if (inst->is_3src() && inst->dst.is_null()) {
4006 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4007 inst->dst.type);
4008 }
4009 }
4010 }
4011
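/**
 * Schedule and register-allocate the program.
 *
 * Each pre-RA scheduling heuristic is tried in turn until one allows
 * allocation without spilling; if none does, registers are spilled until
 * allocation succeeds (or the SIMD16 compile is abandoned).  Post-RA
 * scheduling and the scratch space size are handled afterwards.
 */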
4012 void
4013 fs_visitor::allocate_registers()
4014 {
4015 bool allocated_without_spills;
4016
4017 static const enum instruction_scheduler_mode pre_modes[] = {
4018 SCHEDULE_PRE,
4019 SCHEDULE_PRE_NON_LIFO,
4020 SCHEDULE_PRE_LIFO,
4021 };
4022
4023 /* Try each scheduling heuristic to see if it can successfully register
4024 * allocate without spilling. They should be ordered by decreasing
4025 * performance but increasing likelihood of allocating.
4026 */
4027 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4028 schedule_instructions(pre_modes[i]);
4029
4030 if (0) {
4031 assign_regs_trivial();
4032 allocated_without_spills = true;
4033 } else {
4034 allocated_without_spills = assign_regs(false);
4035 }
4036 if (allocated_without_spills)
4037 break;
4038 }
4039
4040 if (!allocated_without_spills) {
4041 /* We assume that any spilling is worse than just dropping back to
4042 * SIMD8. There's probably actually some intermediate point where
4043 * SIMD16 with a couple of spills is still better.
4044 */
4045 if (dispatch_width == 16) {
4046 fail("Failure to register allocate. Reduce number of "
4047 "live scalar values to avoid this.");
4048 } else {
4049 perf_debug("%s shader triggered register spilling. "
4050 "Try reducing the number of live scalar values to "
4051 "improve performance.\n", stage_name);
4052 }
4053
4054 /* Since we're out of heuristics, just go spill registers until we
4055 * get an allocation.
4056 */
4057 while (!assign_regs(true)) {
4058 if (failed)
4059 break;
4060 }
4061 }
4062
4063 /* This must come after all optimization and register allocation, since
4064 * it inserts dead code that happens to have side effects, and it does
4065 * so based on the actual physical registers in use.
4066 */
4067 insert_gen4_send_dependency_workarounds();
4068
4069 if (failed)
4070 return;
4071
4072 if (!allocated_without_spills)
4073 schedule_instructions(SCHEDULE_POST);
4074
4075 if (last_scratch > 0)
4076 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4077 }
4078
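/**
 * Generate, optimize and register-allocate code for a vertex shader.
 * Returns false if compilation failed.
 */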
4079 bool
4080 fs_visitor::run_vs()
4081 {
4082 assert(stage == MESA_SHADER_VERTEX);
4083
4084 assign_common_binding_table_offsets(0);
4085 setup_vs_payload();
4086
4087 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4088 emit_shader_time_begin();
4089
4090 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
4091 emit_nir_code();
4092 } else {
4093 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4094 base_ir = ir;
4095 this->result = reg_undef;
4096 ir->accept(this);
4097 }
4098 base_ir = NULL;
4099 }
4100
4101 if (failed)
4102 return false;
4103
4104 emit_urb_writes();
4105
4106 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4107 emit_shader_time_end();
4108
4109 calculate_cfg();
4110
4111 optimize();
4112
4113 assign_curb_setup();
4114 assign_vs_urb_setup();
4115
4116 fixup_3src_null_dest();
4117 allocate_registers();
4118
4119 return !failed;
4120 }
4121
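/**
 * Generate, optimize and register-allocate code for a fragment shader at
 * the current dispatch width.  Returns false if compilation failed.
 */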
4122 bool
4123 fs_visitor::run_fs()
4124 {
4125 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4126 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4127
4128 assert(stage == MESA_SHADER_FRAGMENT);
4129
4130 sanity_param_count = prog->Parameters->NumParameters;
4131
4132 assign_binding_table_offsets();
4133
4134 if (devinfo->gen >= 6)
4135 setup_payload_gen6();
4136 else
4137 setup_payload_gen4();
4138
4139 if (0) {
4140 emit_dummy_fs();
4141 } else if (brw->use_rep_send && dispatch_width == 16) {
4142 emit_repclear_shader();
4143 } else {
4144 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4145 emit_shader_time_begin();
4146
4147 calculate_urb_setup();
4148 if (prog->InputsRead > 0) {
4149 if (devinfo->gen < 6)
4150 emit_interpolation_setup_gen4();
4151 else
4152 emit_interpolation_setup_gen6();
4153 }
4154
4155 /* We handle discards by keeping track of the still-live pixels in f0.1.
4156 * Initialize it with the dispatched pixels.
4157 */
4158 if (wm_prog_data->uses_kill) {
4159 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4160 discard_init->flag_subreg = 1;
4161 }
4162
4163 /* Generate FS IR for main(). (The visitor only descends into
4164 * functions called "main".)
4165 */
4166 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
4167 emit_nir_code();
4168 } else if (shader) {
4169 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4170 base_ir = ir;
4171 this->result = reg_undef;
4172 ir->accept(this);
4173 }
4174 } else {
4175 emit_fragment_program_code();
4176 }
4177 base_ir = NULL;
4178 if (failed)
4179 return false;
4180
4181 if (wm_prog_data->uses_kill)
4182 emit(FS_OPCODE_PLACEHOLDER_HALT);
4183
4184 if (wm_key->alpha_test_func)
4185 emit_alpha_test();
4186
4187 emit_fb_writes();
4188
4189 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4190 emit_shader_time_end();
4191
4192 calculate_cfg();
4193
4194 optimize();
4195
4196 assign_curb_setup();
4197 assign_urb_setup();
4198
4199 fixup_3src_null_dest();
4200 allocate_registers();
4201
4202 if (failed)
4203 return false;
4204 }
4205
4206 if (dispatch_width == 8)
4207 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4208 else
4209 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4210
4211 /* If any state parameters were appended, then ParameterValues could have
4212 * been realloced, in which case the driver uniform storage set up by
4213 * _mesa_associate_uniform_storage() would point to freed memory. Make
4214 * sure that didn't happen.
4215 */
4216 assert(sanity_param_count == prog->Parameters->NumParameters);
4217
4218 return !failed;
4219 }
4220
4221 bool
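/**
 * Generate, optimize and register-allocate code for a compute shader.
 * Returns false if compilation failed.
 */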
4222 fs_visitor::run_cs()
4223 {
4224 assert(stage == MESA_SHADER_COMPUTE);
4225 assert(shader);
4226
4227 sanity_param_count = prog->Parameters->NumParameters;
4228
4229 assign_common_binding_table_offsets(0);
4230
4231 setup_cs_payload();
4232
4233 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4234 emit_shader_time_begin();
4235
4236 emit_nir_code();
4237
4238 if (failed)
4239 return false;
4240
4241 emit_cs_terminate();
4242
4243 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4244 emit_shader_time_end();
4245
4246 calculate_cfg();
4247
4248 optimize();
4249
4250 assign_curb_setup();
4251
4252 fixup_3src_null_dest();
4253 allocate_registers();
4254
4255 if (failed)
4256 return false;
4257
4258 /* If any state parameters were appended, then ParameterValues could have
4259 * been realloced, in which case the driver uniform storage set up by
4260 * _mesa_associate_uniform_storage() would point to freed memory. Make
4261 * sure that didn't happen.
4262 */
4263 assert(sanity_param_count == prog->Parameters->NumParameters);
4264
4265 return !failed;
4266 }
4267
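/**
 * Compile a fragment shader.
 *
 * A SIMD8 compile is always run first; a SIMD16 compile is also attempted
 * unless the SIMD8 visitor marked it unsupported or it has been disabled
 * for debugging.  Native code is then generated for the dispatch widths
 * that will actually be used.
 */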
4268 const unsigned *
4269 brw_wm_fs_emit(struct brw_context *brw,
4270 void *mem_ctx,
4271 const struct brw_wm_prog_key *key,
4272 struct brw_wm_prog_data *prog_data,
4273 struct gl_fragment_program *fp,
4274 struct gl_shader_program *prog,
4275 unsigned *final_assembly_size)
4276 {
4277 bool start_busy = false;
4278 double start_time = 0;
4279
4280 if (unlikely(brw->perf_debug)) {
4281 start_busy = (brw->batch.last_bo &&
4282 drm_intel_bo_busy(brw->batch.last_bo));
4283 start_time = get_time();
4284 }
4285
4286 struct brw_shader *shader = NULL;
4287 if (prog)
4288 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4289
4290 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4291 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4292
4293 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4294 */
4295 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4296 if (!v.run_fs()) {
4297 if (prog) {
4298 prog->LinkStatus = false;
4299 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4300 }
4301
4302 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4303 v.fail_msg);
4304
4305 return NULL;
4306 }
4307
4308 cfg_t *simd16_cfg = NULL;
4309 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4310 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4311 if (!v.simd16_unsupported) {
4312 /* Try a SIMD16 compile */
4313 v2.import_uniforms(&v);
4314 if (!v2.run_fs()) {
4315 perf_debug("SIMD16 shader failed to compile, falling back to "
4316 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4317 } else {
4318 simd16_cfg = v2.cfg;
4319 }
4320 } else {
4321 perf_debug("SIMD16 shader unsupported, falling back to "
4322 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4323 }
4324 }
4325
4326 cfg_t *simd8_cfg;
4327 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4328 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4329 simd8_cfg = NULL;
4330 prog_data->no_8 = true;
4331 } else {
4332 simd8_cfg = v.cfg;
4333 prog_data->no_8 = false;
4334 }
4335
4336 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4337 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4338
4339 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4340 char *name;
4341 if (prog)
4342 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4343 prog->Label ? prog->Label : "unnamed",
4344 prog->Name);
4345 else
4346 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4347
4348 g.enable_debug(name);
4349 }
4350
4351 if (simd8_cfg)
4352 g.generate_code(simd8_cfg, 8);
4353 if (simd16_cfg)
4354 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4355
4356 if (unlikely(brw->perf_debug) && shader) {
4357 if (shader->compiled_once)
4358 brw_wm_debug_recompile(brw, prog, key);
4359 shader->compiled_once = true;
4360
4361 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4362 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4363 (get_time() - start_time) * 1000);
4364 }
4365 }
4366
4367 return g.get_assembly(final_assembly_size);
4368 }
4369
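/**
 * Precompile the fragment program at link time.
 *
 * Builds a program key that guesses the likely non-shader state (depth
 * test enabled, default texture swizzles, etc.) and compiles with it, so
 * that the first draw call usually finds a matching variant in the
 * program cache.  The previously bound program data is restored
 * afterwards.
 */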
4370 extern "C" bool
4371 brw_fs_precompile(struct gl_context *ctx,
4372 struct gl_shader_program *shader_prog,
4373 struct gl_program *prog)
4374 {
4375 struct brw_context *brw = brw_context(ctx);
4376 struct brw_wm_prog_key key;
4377
4378 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4379 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4380 bool program_uses_dfdy = fp->UsesDFdy;
4381
4382 memset(&key, 0, sizeof(key));
4383
4384 if (brw->gen < 6) {
4385 if (fp->UsesKill)
4386 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4387
4388 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4389 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4390
4391 /* Just assume depth testing. */
4392 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4393 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4394 }
4395
4396 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4397 BRW_FS_VARYING_INPUT_MASK) > 16)
4398 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4399
4400 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4401
4402 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4403 key.drawable_height = ctx->DrawBuffer->Height;
4404 }
4405
4406 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4407 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4408 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4409
4410 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4411 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4412 key.nr_color_regions > 1;
4413 }
4414
4415 key.program_string_id = bfp->id;
4416
4417 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4418 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4419
4420 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4421
4422 brw->wm.base.prog_offset = old_prog_offset;
4423 brw->wm.prog_data = old_prog_data;
4424
4425 return success;
4426 }
4427
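/**
 * Fill in the texture portion of a precompile program key.
 *
 * Without the real sampler state we assume the defaults: shadow samplers
 * get the (X, X, X, 1) DEPTH_TEXTURE_MODE swizzle on hardware lacking
 * shader channel select, and everything else is left unswizzled.
 */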
4428 void
4429 brw_setup_tex_for_precompile(struct brw_context *brw,
4430 struct brw_sampler_prog_key_data *tex,
4431 struct gl_program *prog)
4432 {
4433 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4434 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4435 for (unsigned i = 0; i < sampler_count; i++) {
4436 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4437 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4438 tex->swizzles[i] =
4439 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4440 } else {
4441 /* Color sampler: assume no swizzling. */
4442 tex->swizzles[i] = SWIZZLE_XYZW;
4443 }
4444 }
4445 }