i965/fs: Migrate lower_load_payload to the IR builder.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 using namespace brw;
53
54 void
55 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
56 const fs_reg *src, unsigned sources)
57 {
58 memset(this, 0, sizeof(*this));
59
60 this->src = new fs_reg[MAX2(sources, 3)];
61 for (unsigned i = 0; i < sources; i++)
62 this->src[i] = src[i];
63
64 this->opcode = opcode;
65 this->dst = dst;
66 this->sources = sources;
67 this->exec_size = exec_size;
68
69 assert(dst.file != IMM && dst.file != UNIFORM);
70
71 /* If exec_size == 0, try to guess it from the registers. Since all
72 * manner of things may use hardware registers, we first try to guess
73 * based on GRF registers. If this fails, we will go ahead and take the
74 * width from the destination register.
75 */
76 if (this->exec_size == 0) {
77 if (dst.file == GRF) {
78 this->exec_size = dst.width;
79 } else {
80 for (unsigned i = 0; i < sources; ++i) {
81 if (src[i].file != GRF && src[i].file != ATTR)
82 continue;
83
84 if (this->exec_size <= 1)
85 this->exec_size = src[i].width;
86 assert(src[i].width == 1 || src[i].width == this->exec_size);
87 }
88 }
89
90 if (this->exec_size == 0 && dst.file != BAD_FILE)
91 this->exec_size = dst.width;
92 }
93 assert(this->exec_size != 0);
94
95 this->conditional_mod = BRW_CONDITIONAL_NONE;
96
97 /* This will be the case for almost all instructions. */
98 switch (dst.file) {
99 case GRF:
100 case HW_REG:
101 case MRF:
102 case ATTR:
103 this->regs_written =
104 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
105 break;
106 case BAD_FILE:
107 this->regs_written = 0;
108 break;
109 case IMM:
110 case UNIFORM:
111 unreachable("Invalid destination register file");
112 default:
113 unreachable("Invalid register file");
114 }
115
116 this->writes_accumulator = false;
117 }
118
119 fs_inst::fs_inst()
120 {
121 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
122 }
123
124 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
125 {
126 init(opcode, exec_size, reg_undef, NULL, 0);
127 }
128
129 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
130 {
131 init(opcode, 0, dst, NULL, 0);
132 }
133
134 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
135 const fs_reg &src0)
136 {
137 const fs_reg src[1] = { src0 };
138 init(opcode, exec_size, dst, src, 1);
139 }
140
141 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
142 {
143 const fs_reg src[1] = { src0 };
144 init(opcode, 0, dst, src, 1);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
148 const fs_reg &src0, const fs_reg &src1)
149 {
150 const fs_reg src[2] = { src0, src1 };
151 init(opcode, exec_size, dst, src, 2);
152 }
153
154 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
155 const fs_reg &src1)
156 {
157 const fs_reg src[2] = { src0, src1 };
158 init(opcode, 0, dst, src, 2);
159 }
160
161 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
162 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
163 {
164 const fs_reg src[3] = { src0, src1, src2 };
165 init(opcode, exec_size, dst, src, 3);
166 }
167
168 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
169 const fs_reg &src1, const fs_reg &src2)
170 {
171 const fs_reg src[3] = { src0, src1, src2 };
172 init(opcode, 0, dst, src, 3);
173 }
174
175 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
176 const fs_reg src[], unsigned sources)
177 {
178 init(opcode, 0, dst, src, sources);
179 }
180
181 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
182 const fs_reg src[], unsigned sources)
183 {
184 init(opcode, exec_width, dst, src, sources);
185 }
186
187 fs_inst::fs_inst(const fs_inst &that)
188 {
189 memcpy(this, &that, sizeof(that));
190
191 this->src = new fs_reg[MAX2(that.sources, 3)];
192
193 for (unsigned i = 0; i < that.sources; i++)
194 this->src[i] = that.src[i];
195 }
196
197 fs_inst::~fs_inst()
198 {
199 delete[] this->src;
200 }
201
202 void
203 fs_inst::resize_sources(uint8_t num_sources)
204 {
205 if (this->sources != num_sources) {
206 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
207
208 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
209 src[i] = this->src[i];
210
211 delete[] this->src;
212 this->src = src;
213 this->sources = num_sources;
214 }
215 }
216
217 #define ALU1(op) \
218 fs_inst * \
219 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
220 { \
221 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
222 }
223
224 #define ALU2(op) \
225 fs_inst * \
226 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
227 const fs_reg &src1) \
228 { \
229 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
230 }
231
232 #define ALU2_ACC(op) \
233 fs_inst * \
234 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
235 const fs_reg &src1) \
236 { \
237 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
238 inst->writes_accumulator = true; \
239 return inst; \
240 }
241
242 #define ALU3(op) \
243 fs_inst * \
244 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
245 const fs_reg &src1, const fs_reg &src2) \
246 { \
247 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
248 }
249
250 ALU1(NOT)
251 ALU1(MOV)
252 ALU1(FRC)
253 ALU1(RNDD)
254 ALU1(RNDE)
255 ALU1(RNDZ)
256 ALU2(ADD)
257 ALU2(MUL)
258 ALU2_ACC(MACH)
259 ALU2(AND)
260 ALU2(OR)
261 ALU2(XOR)
262 ALU2(SHL)
263 ALU2(SHR)
264 ALU2(ASR)
265 ALU3(LRP)
266 ALU1(BFREV)
267 ALU3(BFE)
268 ALU2(BFI1)
269 ALU3(BFI2)
270 ALU1(FBH)
271 ALU1(FBL)
272 ALU1(CBIT)
273 ALU3(MAD)
274 ALU2_ACC(ADDC)
275 ALU2_ACC(SUBB)
276 ALU2(SEL)
277 ALU2(MAC)
278
279 /** Gen4 predicated IF. */
280 fs_inst *
281 fs_visitor::IF(enum brw_predicate predicate)
282 {
283 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
284 inst->predicate = predicate;
285 return inst;
286 }
287
288 /** Gen6 IF with embedded comparison. */
289 fs_inst *
290 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
291 enum brw_conditional_mod condition)
292 {
293 assert(devinfo->gen == 6);
294 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
295 reg_null_d, src0, src1);
296 inst->conditional_mod = condition;
297 return inst;
298 }
299
300 /**
301 * CMP: Sets the low bit of the destination channels with the result
302 * of the comparison, while the upper bits are undefined, and updates
303 * the flag register with the packed 16 bits of the result.
304 */
305 fs_inst *
306 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
307 enum brw_conditional_mod condition)
308 {
309 fs_inst *inst;
310
311 /* Take the instruction:
312 *
313 * CMP null<d> src0<f> src1<f>
314 *
315 * Original gen4 does type conversion to the destination type before
316 * comparison, producing garbage results for floating point comparisons.
317 *
318 * The destination type doesn't matter on newer generations, so we set the
319 * type to match src0 so we can compact the instruction.
320 */
321 dst.type = src0.type;
322 if (dst.file == HW_REG)
323 dst.fixed_hw_reg.type = dst.type;
324
325 resolve_ud_negate(&src0);
326 resolve_ud_negate(&src1);
327
328 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
329 inst->conditional_mod = condition;
330
331 return inst;
332 }
333
334 fs_inst *
335 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
336 int header_size)
337 {
338 assert(dst.width % 8 == 0);
339 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
340 dst, src, sources);
341 inst->header_size = header_size;
342
343 for (int i = 0; i < header_size; i++)
344 assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
345 inst->regs_written = header_size;
346
347 for (int i = header_size; i < sources; ++i)
348 assert(src[i].file != GRF || src[i].width == dst.width);
349 inst->regs_written += (sources - header_size) * (dst.width / 8);
350
351 return inst;
352 }
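/* Illustrative sketch (not part of the original file): how the regs_written
 * arithmetic in LOAD_PAYLOAD above works out for a hypothetical SIMD16
 * payload with one header register and two per-channel sources:
 *
 *    header_size = 1, sources = 3, dst.width = 16
 *    regs_written = 1 + (3 - 1) * (16 / 8) = 5 GRFs
 *
 * i.e. one GRF for the header plus two GRFs for each full-width source.
 */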
353
354 exec_list
355 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
356 const fs_reg &surf_index,
357 const fs_reg &varying_offset,
358 uint32_t const_offset)
359 {
360 exec_list instructions;
361 fs_inst *inst;
362
363 /* We have our constant surface use a pitch of 4 bytes, so our index can
364 * be any component of a vector, and then we load 4 contiguous
365 * components starting from that.
366 *
367 * We break down the const_offset to a portion added to the variable
368 * offset and a portion done using reg_offset, which means that if you
369 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
370 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
371 * CSE can later notice that those loads are all the same and eliminate
372 * the redundant ones.
373 */
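   /* A hedged worked example of the decomposition above (the values are made
    * up): with const_offset = 22, the ADD below folds 22 & ~3 = 20 into
    * vec4_offset, and the final MOV at the end of this function reads
    * component (22 & 3) * scale = 2 * scale of the vec4 that comes back, so
    * the same vec4 load can be CSE'd across a[5].x .. a[5].w style accesses.
    */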
374 fs_reg vec4_offset = vgrf(glsl_type::int_type);
375 instructions.push_tail(ADD(vec4_offset,
376 varying_offset, fs_reg(const_offset & ~3)));
377
378 int scale = 1;
379 if (devinfo->gen == 4 && dst.width == 8) {
380 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
381 * u, v, r) as parameters, or we can just use the SIMD16 message
382 * consisting of (header, u). We choose the second, at the cost of a
383 * longer return length.
384 */
385 scale = 2;
386 }
387
388 enum opcode op;
389 if (devinfo->gen >= 7)
390 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
391 else
392 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
393
394 assert(dst.width % 8 == 0);
395 int regs_written = 4 * (dst.width / 8) * scale;
396 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
397 dst.type, dst.width);
398 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
399 inst->regs_written = regs_written;
400 instructions.push_tail(inst);
401
402 if (devinfo->gen < 7) {
403 inst->base_mrf = 13;
404 inst->header_size = 1;
405 if (devinfo->gen == 4)
406 inst->mlen = 3;
407 else
408 inst->mlen = 1 + dispatch_width / 8;
409 }
410
411 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
412 instructions.push_tail(MOV(dst, result));
413
414 return instructions;
415 }
416
417 /**
418 * A helper for MOV generation for fixing up broken hardware SEND dependency
419 * handling.
420 */
421 fs_inst *
422 fs_visitor::DEP_RESOLVE_MOV(int grf)
423 {
424 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
425
426 inst->ir = NULL;
427 inst->annotation = "send dependency resolve";
428
429 /* The caller always wants uncompressed to emit the minimal extra
430 * dependencies, and to avoid having to deal with aligning its regs to 2.
431 */
432 inst->exec_size = 8;
433
434 return inst;
435 }
436
437 bool
438 fs_inst::equals(fs_inst *inst) const
439 {
440 return (opcode == inst->opcode &&
441 dst.equals(inst->dst) &&
442 src[0].equals(inst->src[0]) &&
443 src[1].equals(inst->src[1]) &&
444 src[2].equals(inst->src[2]) &&
445 saturate == inst->saturate &&
446 predicate == inst->predicate &&
447 conditional_mod == inst->conditional_mod &&
448 mlen == inst->mlen &&
449 base_mrf == inst->base_mrf &&
450 target == inst->target &&
451 eot == inst->eot &&
452 header_size == inst->header_size &&
453 shadow_compare == inst->shadow_compare &&
454 exec_size == inst->exec_size &&
455 offset == inst->offset);
456 }
457
458 bool
459 fs_inst::overwrites_reg(const fs_reg &reg) const
460 {
461 return reg.in_range(dst, regs_written);
462 }
463
464 bool
465 fs_inst::is_send_from_grf() const
466 {
467 switch (opcode) {
468 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
469 case SHADER_OPCODE_SHADER_TIME_ADD:
470 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
471 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
472 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
473 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
474 case SHADER_OPCODE_UNTYPED_ATOMIC:
475 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
476 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
477 case SHADER_OPCODE_TYPED_ATOMIC:
478 case SHADER_OPCODE_TYPED_SURFACE_READ:
479 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
480 case SHADER_OPCODE_URB_WRITE_SIMD8:
481 return true;
482 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
483 return src[1].file == GRF;
484 case FS_OPCODE_FB_WRITE:
485 return src[0].file == GRF;
486 default:
487 if (is_tex())
488 return src[0].file == GRF;
489
490 return false;
491 }
492 }
493
494 bool
495 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
496 {
497 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
498 return false;
499
500 fs_reg reg = this->src[0];
501 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
502 return false;
503
504 if (grf_alloc.sizes[reg.reg] != this->regs_written)
505 return false;
506
507 for (int i = 0; i < this->sources; i++) {
508 reg.type = this->src[i].type;
509 reg.width = this->src[i].width;
510 if (!this->src[i].equals(reg))
511 return false;
512 reg = ::offset(reg, 1);
513 }
514
515 return true;
516 }
517
518 bool
519 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
520 {
521 if (devinfo->gen == 6 && is_math())
522 return false;
523
524 if (is_send_from_grf())
525 return false;
526
527 if (!backend_instruction::can_do_source_mods())
528 return false;
529
530 return true;
531 }
532
533 bool
534 fs_inst::has_side_effects() const
535 {
536 return this->eot || backend_instruction::has_side_effects();
537 }
538
539 void
540 fs_reg::init()
541 {
542 memset(this, 0, sizeof(*this));
543 stride = 1;
544 }
545
546 /** Generic unset register constructor. */
547 fs_reg::fs_reg()
548 {
549 init();
550 this->file = BAD_FILE;
551 }
552
553 /** Immediate value constructor. */
554 fs_reg::fs_reg(float f)
555 {
556 init();
557 this->file = IMM;
558 this->type = BRW_REGISTER_TYPE_F;
559 this->fixed_hw_reg.dw1.f = f;
560 this->width = 1;
561 }
562
563 /** Immediate value constructor. */
564 fs_reg::fs_reg(int32_t i)
565 {
566 init();
567 this->file = IMM;
568 this->type = BRW_REGISTER_TYPE_D;
569 this->fixed_hw_reg.dw1.d = i;
570 this->width = 1;
571 }
572
573 /** Immediate value constructor. */
574 fs_reg::fs_reg(uint32_t u)
575 {
576 init();
577 this->file = IMM;
578 this->type = BRW_REGISTER_TYPE_UD;
579 this->fixed_hw_reg.dw1.ud = u;
580 this->width = 1;
581 }
582
583 /** Vector float immediate value constructor. */
584 fs_reg::fs_reg(uint8_t vf[4])
585 {
586 init();
587 this->file = IMM;
588 this->type = BRW_REGISTER_TYPE_VF;
589 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
590 }
591
592 /** Vector float immediate value constructor. */
593 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
594 {
595 init();
596 this->file = IMM;
597 this->type = BRW_REGISTER_TYPE_VF;
598 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
599 (vf1 << 8) |
600 (vf2 << 16) |
601 (vf3 << 24);
602 }
603
604 /** Fixed brw_reg. */
605 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
606 {
607 init();
608 this->file = HW_REG;
609 this->fixed_hw_reg = fixed_hw_reg;
610 this->type = fixed_hw_reg.type;
611 this->width = 1 << fixed_hw_reg.width;
612 }
613
614 bool
615 fs_reg::equals(const fs_reg &r) const
616 {
617 return (file == r.file &&
618 reg == r.reg &&
619 reg_offset == r.reg_offset &&
620 subreg_offset == r.subreg_offset &&
621 type == r.type &&
622 negate == r.negate &&
623 abs == r.abs &&
624 !reladdr && !r.reladdr &&
625 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
626 width == r.width &&
627 stride == r.stride);
628 }
629
630 fs_reg &
631 fs_reg::set_smear(unsigned subreg)
632 {
633 assert(file != HW_REG && file != IMM);
634 subreg_offset = subreg * type_sz(type);
635 stride = 0;
636 return *this;
637 }
638
639 bool
640 fs_reg::is_contiguous() const
641 {
642 return stride == 1;
643 }
644
645 int
646 fs_visitor::type_size(const struct glsl_type *type)
647 {
648 unsigned int size, i;
649
650 switch (type->base_type) {
651 case GLSL_TYPE_UINT:
652 case GLSL_TYPE_INT:
653 case GLSL_TYPE_FLOAT:
654 case GLSL_TYPE_BOOL:
655 return type->components();
656 case GLSL_TYPE_ARRAY:
657 return type_size(type->fields.array) * type->length;
658 case GLSL_TYPE_STRUCT:
659 size = 0;
660 for (i = 0; i < type->length; i++) {
661 size += type_size(type->fields.structure[i].type);
662 }
663 return size;
664 case GLSL_TYPE_SAMPLER:
665 /* Samplers take up no register space, since they're baked in at
666 * link time.
667 */
668 return 0;
669 case GLSL_TYPE_ATOMIC_UINT:
670 return 0;
671 case GLSL_TYPE_IMAGE:
672 case GLSL_TYPE_VOID:
673 case GLSL_TYPE_ERROR:
674 case GLSL_TYPE_INTERFACE:
675 case GLSL_TYPE_DOUBLE:
676 unreachable("not reached");
677 }
678
679 return 0;
680 }
681
682 /**
683 * Create a MOV to read the timestamp register.
684 *
685 * The caller is responsible for emitting the MOV. The return value is
686 * the destination of the MOV, with extra parameters set.
687 */
688 fs_reg
689 fs_visitor::get_timestamp(fs_inst **out_mov)
690 {
691 assert(devinfo->gen >= 7);
692
693 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
694 BRW_ARF_TIMESTAMP,
695 0),
696 BRW_REGISTER_TYPE_UD));
697
698 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
699
700 fs_inst *mov = MOV(dst, ts);
 701    /* We want to read the 3 fields we care about even for channels that are
 702     * not enabled in the dispatch.
703 */
704 mov->force_writemask_all = true;
705
706 /* The caller wants the low 32 bits of the timestamp. Since it's running
 707     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
708 * which is plenty of time for our purposes. It is identical across the
709 * EUs, but since it's tracking GPU core speed it will increment at a
710 * varying rate as render P-states change.
711 *
712 * The caller could also check if render P-states have changed (or anything
713 * else that might disrupt timing) by setting smear to 2 and checking if
714 * that field is != 0.
715 */
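   /* Rough arithmetic behind the "~3 seconds" figure above (approximation,
    * not from the original source): a 32-bit counter ticking at ~1.2 GHz
    * wraps after about 2^32 / 1.2e9 ≈ 3.6 seconds.
    */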
716 dst.set_smear(0);
717
718 *out_mov = mov;
719 return dst;
720 }
721
722 void
723 fs_visitor::emit_shader_time_begin()
724 {
725 current_annotation = "shader time start";
726 fs_inst *mov;
727 shader_start_time = get_timestamp(&mov);
728 emit(mov);
729 }
730
731 void
732 fs_visitor::emit_shader_time_end()
733 {
734 current_annotation = "shader time end";
735
736 enum shader_time_shader_type type, written_type, reset_type;
737 switch (stage) {
738 case MESA_SHADER_VERTEX:
739 type = ST_VS;
740 written_type = ST_VS_WRITTEN;
741 reset_type = ST_VS_RESET;
742 break;
743 case MESA_SHADER_GEOMETRY:
744 type = ST_GS;
745 written_type = ST_GS_WRITTEN;
746 reset_type = ST_GS_RESET;
747 break;
748 case MESA_SHADER_FRAGMENT:
749 if (dispatch_width == 8) {
750 type = ST_FS8;
751 written_type = ST_FS8_WRITTEN;
752 reset_type = ST_FS8_RESET;
753 } else {
754 assert(dispatch_width == 16);
755 type = ST_FS16;
756 written_type = ST_FS16_WRITTEN;
757 reset_type = ST_FS16_RESET;
758 }
759 break;
760 case MESA_SHADER_COMPUTE:
761 type = ST_CS;
762 written_type = ST_CS_WRITTEN;
763 reset_type = ST_CS_RESET;
764 break;
765 default:
766 unreachable("fs_visitor::emit_shader_time_end missing code");
767 }
768
769 /* Insert our code just before the final SEND with EOT. */
770 exec_node *end = this->instructions.get_tail();
771 assert(end && ((fs_inst *) end)->eot);
772
773 fs_inst *tm_read;
774 fs_reg shader_end_time = get_timestamp(&tm_read);
775 end->insert_before(tm_read);
776
777 /* Check that there weren't any timestamp reset events (assuming these
778 * were the only two timestamp reads that happened).
779 */
780 fs_reg reset = shader_end_time;
781 reset.set_smear(2);
782 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
783 test->conditional_mod = BRW_CONDITIONAL_Z;
784 test->force_writemask_all = true;
785 end->insert_before(test);
786 end->insert_before(IF(BRW_PREDICATE_NORMAL));
787
788 fs_reg start = shader_start_time;
789 start.negate = true;
790 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
791 diff.set_smear(0);
792 fs_inst *add = ADD(diff, start, shader_end_time);
793 add->force_writemask_all = true;
794 end->insert_before(add);
795
796 /* If there were no instructions between the two timestamp gets, the diff
797 * is 2 cycles. Remove that overhead, so I can forget about that when
798 * trying to determine the time taken for single instructions.
799 */
800 add = ADD(diff, diff, fs_reg(-2u));
801 add->force_writemask_all = true;
802 end->insert_before(add);
803
804 end->insert_before(SHADER_TIME_ADD(type, diff));
805 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
806 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
807 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
808 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
809 }
810
811 fs_inst *
812 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
813 {
814 int shader_time_index =
815 brw_get_shader_time_index(brw, shader_prog, prog, type);
816 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
817
818 fs_reg payload;
819 if (dispatch_width == 8)
820 payload = vgrf(glsl_type::uvec2_type);
821 else
822 payload = vgrf(glsl_type::uint_type);
823
824 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
825 fs_reg(), payload, offset, value);
826 }
827
828 void
829 fs_visitor::vfail(const char *format, va_list va)
830 {
831 char *msg;
832
833 if (failed)
834 return;
835
836 failed = true;
837
838 msg = ralloc_vasprintf(mem_ctx, format, va);
839 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
840
841 this->fail_msg = msg;
842
843 if (debug_enabled) {
844 fprintf(stderr, "%s", msg);
845 }
846 }
847
848 void
849 fs_visitor::fail(const char *format, ...)
850 {
851 va_list va;
852
853 va_start(va, format);
854 vfail(format, va);
855 va_end(va);
856 }
857
858 /**
859 * Mark this program as impossible to compile in SIMD16 mode.
860 *
861 * During the SIMD8 compile (which happens first), we can detect and flag
862 * things that are unsupported in SIMD16 mode, so the compiler can skip
863 * the SIMD16 compile altogether.
864 *
865 * During a SIMD16 compile (if one happens anyway), this just calls fail().
866 */
867 void
868 fs_visitor::no16(const char *format, ...)
869 {
870 va_list va;
871
872 va_start(va, format);
873
874 if (dispatch_width == 16) {
875 vfail(format, va);
876 } else {
877 simd16_unsupported = true;
878
879 if (brw->perf_debug) {
880 if (no16_msg)
881 ralloc_vasprintf_append(&no16_msg, format, va);
882 else
883 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
884 }
885 }
886
887 va_end(va);
888 }
889
890 fs_inst *
891 fs_visitor::emit(enum opcode opcode)
892 {
893 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
894 }
895
896 fs_inst *
897 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
898 {
899 return emit(new(mem_ctx) fs_inst(opcode, dst));
900 }
901
902 fs_inst *
903 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
904 {
905 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
906 }
907
908 fs_inst *
909 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
910 const fs_reg &src1)
911 {
912 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
913 }
914
915 fs_inst *
916 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
917 const fs_reg &src1, const fs_reg &src2)
918 {
919 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
920 }
921
922 fs_inst *
923 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
924 fs_reg src[], int sources)
925 {
926 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
927 }
928
929 /**
930 * Returns true if the instruction has a flag that means it won't
931 * update an entire destination register.
932 *
933 * For example, dead code elimination and live variable analysis want to know
934 * when a write to a variable screens off any preceding values that were in
935 * it.
936 */
937 bool
938 fs_inst::is_partial_write() const
939 {
940 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
941 (this->dst.width * type_sz(this->dst.type)) < 32 ||
942 !this->dst.is_contiguous());
943 }
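/* For instance (illustration only, restating the predicate above): a
 * predicated instruction that isn't a SEL, a SIMD8 write of a 16-bit type
 * (8 channels * 2 bytes = 16 bytes < 32), or a strided (non-contiguous)
 * destination all make is_partial_write() return true.
 */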
944
945 int
946 fs_inst::regs_read(int arg) const
947 {
948 if (is_tex() && arg == 0 && src[0].file == GRF) {
949 return mlen;
950 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
951 return mlen;
952 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
953 return mlen;
954 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
955 return mlen;
956 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
957 return mlen;
958 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
959 return mlen;
960 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
961 return mlen;
962 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
963 return mlen;
964 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
965 return mlen;
966 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
967 return mlen;
968 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
969 return exec_size / 4;
970 }
971
972 switch (src[arg].file) {
973 case BAD_FILE:
974 case UNIFORM:
975 case IMM:
976 return 1;
977 case GRF:
978 case HW_REG:
979 if (src[arg].stride == 0) {
980 return 1;
981 } else {
982 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
983 return (size + 31) / 32;
984 }
985 case MRF:
986 unreachable("MRF registers are not allowed as sources");
987 default:
988 unreachable("Invalid register file");
989 }
990 }
991
992 bool
993 fs_inst::reads_flag() const
994 {
995 return predicate;
996 }
997
998 bool
999 fs_inst::writes_flag() const
1000 {
1001 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
1002 opcode != BRW_OPCODE_IF &&
1003 opcode != BRW_OPCODE_WHILE)) ||
1004 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
1005 }
1006
1007 /**
1008 * Returns how many MRFs an FS opcode will write over.
1009 *
1010 * Note that this is not the 0 or 1 implied writes in an actual gen
1011 * instruction -- the FS opcodes often generate MOVs in addition.
1012 */
1013 int
1014 fs_visitor::implied_mrf_writes(fs_inst *inst)
1015 {
1016 if (inst->mlen == 0)
1017 return 0;
1018
1019 if (inst->base_mrf == -1)
1020 return 0;
1021
1022 switch (inst->opcode) {
1023 case SHADER_OPCODE_RCP:
1024 case SHADER_OPCODE_RSQ:
1025 case SHADER_OPCODE_SQRT:
1026 case SHADER_OPCODE_EXP2:
1027 case SHADER_OPCODE_LOG2:
1028 case SHADER_OPCODE_SIN:
1029 case SHADER_OPCODE_COS:
1030 return 1 * dispatch_width / 8;
1031 case SHADER_OPCODE_POW:
1032 case SHADER_OPCODE_INT_QUOTIENT:
1033 case SHADER_OPCODE_INT_REMAINDER:
1034 return 2 * dispatch_width / 8;
1035 case SHADER_OPCODE_TEX:
1036 case FS_OPCODE_TXB:
1037 case SHADER_OPCODE_TXD:
1038 case SHADER_OPCODE_TXF:
1039 case SHADER_OPCODE_TXF_CMS:
1040 case SHADER_OPCODE_TXF_MCS:
1041 case SHADER_OPCODE_TG4:
1042 case SHADER_OPCODE_TG4_OFFSET:
1043 case SHADER_OPCODE_TXL:
1044 case SHADER_OPCODE_TXS:
1045 case SHADER_OPCODE_LOD:
1046 return 1;
1047 case FS_OPCODE_FB_WRITE:
1048 return 2;
1049 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1050 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1051 return 1;
1052 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1053 return inst->mlen;
1054 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1055 return inst->mlen;
1056 case SHADER_OPCODE_UNTYPED_ATOMIC:
1057 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1058 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1059 case SHADER_OPCODE_TYPED_ATOMIC:
1060 case SHADER_OPCODE_TYPED_SURFACE_READ:
1061 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1062 case SHADER_OPCODE_URB_WRITE_SIMD8:
1063 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1064 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1065 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1066 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1067 return 0;
1068 default:
1069 unreachable("not reached");
1070 }
1071 }
1072
1073 fs_reg
1074 fs_visitor::vgrf(const glsl_type *const type)
1075 {
1076 int reg_width = dispatch_width / 8;
1077 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1078 brw_type_for_base_type(type), dispatch_width);
1079 }
1080
1081 fs_reg
1082 fs_visitor::vgrf(int num_components)
1083 {
1084 int reg_width = dispatch_width / 8;
1085 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1086 BRW_REGISTER_TYPE_F, dispatch_width);
1087 }
1088
1089 /** Fixed HW reg constructor. */
1090 fs_reg::fs_reg(enum register_file file, int reg)
1091 {
1092 init();
1093 this->file = file;
1094 this->reg = reg;
1095 this->type = BRW_REGISTER_TYPE_F;
1096
1097 switch (file) {
1098 case UNIFORM:
1099 this->width = 1;
1100 break;
1101 default:
1102 this->width = 8;
1103 }
1104 }
1105
1106 /** Fixed HW reg constructor. */
1107 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1108 {
1109 init();
1110 this->file = file;
1111 this->reg = reg;
1112 this->type = type;
1113
1114 switch (file) {
1115 case UNIFORM:
1116 this->width = 1;
1117 break;
1118 default:
1119 this->width = 8;
1120 }
1121 }
1122
1123 /** Fixed HW reg constructor. */
1124 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1125 uint8_t width)
1126 {
1127 init();
1128 this->file = file;
1129 this->reg = reg;
1130 this->type = type;
1131 this->width = width;
1132 }
1133
 1134 /* For SIMD16, we need to reuse the uniform setup of the SIMD8 dispatch.
 1135  * This brings in those uniform definitions.
1136 */
1137 void
1138 fs_visitor::import_uniforms(fs_visitor *v)
1139 {
1140 this->push_constant_loc = v->push_constant_loc;
1141 this->pull_constant_loc = v->pull_constant_loc;
1142 this->uniforms = v->uniforms;
1143 this->param_size = v->param_size;
1144 }
1145
1146 fs_reg *
1147 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1148 bool origin_upper_left)
1149 {
1150 assert(stage == MESA_SHADER_FRAGMENT);
1151 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1152 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1153 fs_reg wpos = *reg;
1154 bool flip = !origin_upper_left ^ key->render_to_fbo;
1155
1156 /* gl_FragCoord.x */
1157 if (pixel_center_integer) {
1158 emit(MOV(wpos, this->pixel_x));
1159 } else {
1160 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1161 }
1162 wpos = offset(wpos, 1);
1163
1164 /* gl_FragCoord.y */
1165 if (!flip && pixel_center_integer) {
1166 emit(MOV(wpos, this->pixel_y));
1167 } else {
1168 fs_reg pixel_y = this->pixel_y;
1169 float offset = (pixel_center_integer ? 0.0 : 0.5);
1170
1171 if (flip) {
1172 pixel_y.negate = true;
1173 offset += key->drawable_height - 1.0;
1174 }
1175
1176 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1177 }
1178 wpos = offset(wpos, 1);
1179
1180 /* gl_FragCoord.z */
1181 if (devinfo->gen >= 6) {
1182 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1183 } else {
1184 emit(FS_OPCODE_LINTERP, wpos,
1185 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1186 interp_reg(VARYING_SLOT_POS, 2));
1187 }
1188 wpos = offset(wpos, 1);
1189
1190 /* gl_FragCoord.w: Already set up in emit_interpolation */
1191 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1192
1193 return reg;
1194 }
1195
1196 fs_inst *
1197 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1198 glsl_interp_qualifier interpolation_mode,
1199 bool is_centroid, bool is_sample)
1200 {
1201 brw_wm_barycentric_interp_mode barycoord_mode;
1202 if (devinfo->gen >= 6) {
1203 if (is_centroid) {
1204 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1205 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1206 else
1207 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1208 } else if (is_sample) {
1209 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1210 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1211 else
1212 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1213 } else {
1214 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1215 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1216 else
1217 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1218 }
1219 } else {
1220 /* On Ironlake and below, there is only one interpolation mode.
1221 * Centroid interpolation doesn't mean anything on this hardware --
1222 * there is no multisampling.
1223 */
1224 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1225 }
1226 return emit(FS_OPCODE_LINTERP, attr,
1227 this->delta_xy[barycoord_mode], interp);
1228 }
1229
1230 void
1231 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1232 const glsl_type *type,
1233 glsl_interp_qualifier interpolation_mode,
1234 int location, bool mod_centroid,
1235 bool mod_sample)
1236 {
1237 attr.type = brw_type_for_base_type(type->get_scalar_type());
1238
1239 assert(stage == MESA_SHADER_FRAGMENT);
1240 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1241 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1242
1243 unsigned int array_elements;
1244
1245 if (type->is_array()) {
1246 array_elements = type->length;
1247 if (array_elements == 0) {
1248 fail("dereferenced array '%s' has length 0\n", name);
1249 }
1250 type = type->fields.array;
1251 } else {
1252 array_elements = 1;
1253 }
1254
1255 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1256 bool is_gl_Color =
1257 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1258 if (key->flat_shade && is_gl_Color) {
1259 interpolation_mode = INTERP_QUALIFIER_FLAT;
1260 } else {
1261 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1262 }
1263 }
1264
1265 for (unsigned int i = 0; i < array_elements; i++) {
1266 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1267 if (prog_data->urb_setup[location] == -1) {
1268 /* If there's no incoming setup data for this slot, don't
1269 * emit interpolation for it.
1270 */
1271 attr = offset(attr, type->vector_elements);
1272 location++;
1273 continue;
1274 }
1275
1276 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1277 /* Constant interpolation (flat shading) case. The SF has
1278 * handed us defined values in only the constant offset
1279 * field of the setup reg.
1280 */
1281 for (unsigned int k = 0; k < type->vector_elements; k++) {
1282 struct brw_reg interp = interp_reg(location, k);
1283 interp = suboffset(interp, 3);
1284 interp.type = attr.type;
1285 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1286 attr = offset(attr, 1);
1287 }
1288 } else {
1289 /* Smooth/noperspective interpolation case. */
1290 for (unsigned int k = 0; k < type->vector_elements; k++) {
1291 struct brw_reg interp = interp_reg(location, k);
1292 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1293 /* Get the pixel/sample mask into f0 so that we know
1294 * which pixels are lit. Then, for each channel that is
1295 * unlit, replace the centroid data with non-centroid
1296 * data.
1297 */
1298 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1299
1300 fs_inst *inst;
1301 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1302 false, false);
1303 inst->predicate = BRW_PREDICATE_NORMAL;
1304 inst->predicate_inverse = true;
1305 if (devinfo->has_pln)
1306 inst->no_dd_clear = true;
1307
1308 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1309 mod_centroid && !key->persample_shading,
1310 mod_sample || key->persample_shading);
1311 inst->predicate = BRW_PREDICATE_NORMAL;
1312 inst->predicate_inverse = false;
1313 if (devinfo->has_pln)
1314 inst->no_dd_check = true;
1315
1316 } else {
1317 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1318 mod_centroid && !key->persample_shading,
1319 mod_sample || key->persample_shading);
1320 }
1321 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1322 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1323 }
1324 attr = offset(attr, 1);
1325 }
1326
1327 }
1328 location++;
1329 }
1330 }
1331 }
1332
1333 fs_reg *
1334 fs_visitor::emit_frontfacing_interpolation()
1335 {
1336 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1337
1338 if (devinfo->gen >= 6) {
1339 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1340 * a boolean result from this (~0/true or 0/false).
1341 *
1342 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1343 * this task in only one instruction:
1344 * - a negation source modifier will flip the bit; and
1345 * - a W -> D type conversion will sign extend the bit into the high
1346 * word of the destination.
1347 *
1348 * An ASR 15 fills the low word of the destination.
1349 */
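      /* Sketch of the sequence above for a front-facing polygon (illustrative
       * only): bit 15 of g0.0:W is 0, the negate modifier flips it to 1, the
       * W -> D conversion sign-extends that bit through the high word, and
       * ASR 15 then smears it across the low word, yielding ~0 (true).  A
       * back-facing polygon ends up as 0 (false).
       */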
1350 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1351 g0.negate = true;
1352
1353 emit(ASR(*reg, g0, fs_reg(15)));
1354 } else {
1355 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1356 * a boolean result from this (1/true or 0/false).
1357 *
1358 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1359 * the negation source modifier to flip it. Unfortunately the SHR
1360 * instruction only operates on UD (or D with an abs source modifier)
1361 * sources without negation.
1362 *
1363 * Instead, use ASR (which will give ~0/true or 0/false).
1364 */
1365 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1366 g1_6.negate = true;
1367
1368 emit(ASR(*reg, g1_6, fs_reg(31)));
1369 }
1370
1371 return reg;
1372 }
1373
1374 void
1375 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1376 {
1377 assert(stage == MESA_SHADER_FRAGMENT);
1378 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1379 assert(dst.type == BRW_REGISTER_TYPE_F);
1380
1381 if (key->compute_pos_offset) {
1382 /* Convert int_sample_pos to floating point */
1383 emit(MOV(dst, int_sample_pos));
1384 /* Scale to the range [0, 1] */
1385 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1386 }
1387 else {
1388 /* From ARB_sample_shading specification:
1389 * "When rendering to a non-multisample buffer, or if multisample
1390 * rasterization is disabled, gl_SamplePosition will always be
 1391        *  (0.5, 0.5)."
1392 */
1393 emit(MOV(dst, fs_reg(0.5f)));
1394 }
1395 }
1396
1397 fs_reg *
1398 fs_visitor::emit_samplepos_setup()
1399 {
1400 assert(devinfo->gen >= 6);
1401
1402 this->current_annotation = "compute sample position";
1403 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1404 fs_reg pos = *reg;
1405 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1406 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1407
1408 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1409 * mode will be enabled.
1410 *
1411 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1412 * R31.1:0 Position Offset X/Y for Slot[3:0]
1413 * R31.3:2 Position Offset X/Y for Slot[7:4]
1414 * .....
1415 *
 1416     * The X, Y sample positions come in as bytes in the thread payload. So, read
1417 * the positions using vstride=16, width=8, hstride=2.
1418 */
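   /* As a rough picture of the region above (assumed layout, following the
    * PRM quote): the payload bytes are packed as x0, y0, x1, y1, ..., so
    * reading with vstride=16, width=8, hstride=2 picks out every other byte
    * (the X offsets), and the suboffset-by-1 reads below pick out the Y
    * offsets.
    */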
1419 struct brw_reg sample_pos_reg =
1420 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1421 BRW_REGISTER_TYPE_B), 16, 8, 2);
1422
1423 if (dispatch_width == 8) {
1424 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1425 } else {
1426 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1427 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1428 ->force_sechalf = true;
1429 }
1430 /* Compute gl_SamplePosition.x */
1431 compute_sample_position(pos, int_sample_x);
1432 pos = offset(pos, 1);
1433 if (dispatch_width == 8) {
1434 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1435 } else {
1436 emit(MOV(half(int_sample_y, 0),
1437 fs_reg(suboffset(sample_pos_reg, 1))));
1438 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1439 ->force_sechalf = true;
1440 }
1441 /* Compute gl_SamplePosition.y */
1442 compute_sample_position(pos, int_sample_y);
1443 return reg;
1444 }
1445
1446 fs_reg *
1447 fs_visitor::emit_sampleid_setup()
1448 {
1449 assert(stage == MESA_SHADER_FRAGMENT);
1450 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1451 assert(devinfo->gen >= 6);
1452
1453 this->current_annotation = "compute sample id";
1454 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1455
1456 if (key->compute_sample_id) {
1457 fs_reg t1 = vgrf(glsl_type::int_type);
1458 fs_reg t2 = vgrf(glsl_type::int_type);
1459 t2.type = BRW_REGISTER_TYPE_UW;
1460
1461 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1462 * 8x multisampling, subspan 0 will represent sample N (where N
1463 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1464 * 7. We can find the value of N by looking at R0.0 bits 7:6
1465 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1466 * (since samples are always delivered in pairs). That is, we
1467 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1468 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1469 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1470 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1471 * populating a temporary variable with the sequence (0, 1, 2, 3),
1472 * and then reading from it using vstride=1, width=4, hstride=0.
1473 * These computations hold good for 4x multisampling as well.
1474 *
1475 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1476 * the first four slots are sample 0 of subspan 0; the next four
1477 * are sample 1 of subspan 0; the third group is sample 0 of
1478 * subspan 1, and finally sample 1 of subspan 1.
1479 */
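      /* A worked example of the math described above (made-up register
       * value): if R0.0 bits 7:6 read back as 0b10, then
       * (R0.0 & 0xc0) >> 5 = 4, so N = 4; adding the SIMD8 sequence
       * (0, 0, 0, 0, 1, 1, 1, 1) gives per-channel sample IDs
       * (4, 4, 4, 4, 5, 5, 5, 5).
       */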
1480 fs_inst *inst;
1481 inst = emit(BRW_OPCODE_AND, t1,
1482 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1483 fs_reg(0xc0));
1484 inst->force_writemask_all = true;
1485 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1486 inst->force_writemask_all = true;
1487 /* This works for both SIMD8 and SIMD16 */
1488 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1489 inst->force_writemask_all = true;
1490 /* This special instruction takes care of setting vstride=1,
1491 * width=4, hstride=0 of t2 during an ADD instruction.
1492 */
1493 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1494 } else {
1495 /* As per GL_ARB_sample_shading specification:
1496 * "When rendering to a non-multisample buffer, or if multisample
1497 * rasterization is disabled, gl_SampleID will always be zero."
1498 */
1499 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1500 }
1501
1502 return reg;
1503 }
1504
1505 void
1506 fs_visitor::resolve_source_modifiers(fs_reg *src)
1507 {
1508 if (!src->abs && !src->negate)
1509 return;
1510
1511 fs_reg temp = retype(vgrf(1), src->type);
1512 emit(MOV(temp, *src));
1513 *src = temp;
1514 }
1515
1516 fs_reg
1517 fs_visitor::fix_math_operand(fs_reg src)
1518 {
1519 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1520 * might be able to do better by doing execsize = 1 math and then
1521 * expanding that result out, but we would need to be careful with
1522 * masking.
1523 *
1524 * The hardware ignores source modifiers (negate and abs) on math
1525 * instructions, so we also move to a temp to set those up.
1526 */
1527 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1528 !src.abs && !src.negate)
1529 return src;
1530
1531 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1532 * operands to math
1533 */
1534 if (devinfo->gen >= 7 && src.file != IMM)
1535 return src;
1536
1537 fs_reg expanded = vgrf(glsl_type::float_type);
1538 expanded.type = src.type;
1539 emit(BRW_OPCODE_MOV, expanded, src);
1540 return expanded;
1541 }
1542
1543 fs_inst *
1544 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1545 {
1546 switch (opcode) {
1547 case SHADER_OPCODE_RCP:
1548 case SHADER_OPCODE_RSQ:
1549 case SHADER_OPCODE_SQRT:
1550 case SHADER_OPCODE_EXP2:
1551 case SHADER_OPCODE_LOG2:
1552 case SHADER_OPCODE_SIN:
1553 case SHADER_OPCODE_COS:
1554 break;
1555 default:
1556 unreachable("not reached: bad math opcode");
1557 }
1558
1559 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1560 * might be able to do better by doing execsize = 1 math and then
1561 * expanding that result out, but we would need to be careful with
1562 * masking.
1563 *
1564 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1565 * instructions, so we also move to a temp to set those up.
1566 */
1567 if (devinfo->gen == 6 || devinfo->gen == 7)
1568 src = fix_math_operand(src);
1569
1570 fs_inst *inst = emit(opcode, dst, src);
1571
1572 if (devinfo->gen < 6) {
1573 inst->base_mrf = 2;
1574 inst->mlen = dispatch_width / 8;
1575 }
1576
1577 return inst;
1578 }
1579
1580 fs_inst *
1581 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1582 {
1583 int base_mrf = 2;
1584 fs_inst *inst;
1585
1586 if (devinfo->gen >= 8) {
1587 inst = emit(opcode, dst, src0, src1);
1588 } else if (devinfo->gen >= 6) {
1589 src0 = fix_math_operand(src0);
1590 src1 = fix_math_operand(src1);
1591
1592 inst = emit(opcode, dst, src0, src1);
1593 } else {
1594 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1595 * "Message Payload":
1596 *
1597 * "Operand0[7]. For the INT DIV functions, this operand is the
1598 * denominator."
1599 * ...
1600 * "Operand1[7]. For the INT DIV functions, this operand is the
1601 * numerator."
1602 */
1603 bool is_int_div = opcode != SHADER_OPCODE_POW;
1604 fs_reg &op0 = is_int_div ? src1 : src0;
1605 fs_reg &op1 = is_int_div ? src0 : src1;
1606
1607 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1608 inst = emit(opcode, dst, op0, reg_null_f);
1609
1610 inst->base_mrf = base_mrf;
1611 inst->mlen = 2 * dispatch_width / 8;
1612 }
1613 return inst;
1614 }
1615
1616 void
1617 fs_visitor::emit_discard_jump()
1618 {
1619 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1620
1621 /* For performance, after a discard, jump to the end of the
1622 * shader if all relevant channels have been discarded.
1623 */
1624 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1625 discard_jump->flag_subreg = 1;
1626
1627 discard_jump->predicate = (dispatch_width == 8)
1628 ? BRW_PREDICATE_ALIGN1_ANY8H
1629 : BRW_PREDICATE_ALIGN1_ANY16H;
1630 discard_jump->predicate_inverse = true;
1631 }
1632
1633 void
1634 fs_visitor::assign_curb_setup()
1635 {
1636 if (dispatch_width == 8) {
1637 prog_data->dispatch_grf_start_reg = payload.num_regs;
1638 } else {
1639 if (stage == MESA_SHADER_FRAGMENT) {
1640 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1641 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1642 } else if (stage == MESA_SHADER_COMPUTE) {
1643 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1644 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1645 } else {
1646 unreachable("Unsupported shader type!");
1647 }
1648 }
1649
1650 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1651
1652 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1653 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1654 for (unsigned int i = 0; i < inst->sources; i++) {
1655 if (inst->src[i].file == UNIFORM) {
1656 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1657 int constant_nr;
1658 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1659 constant_nr = push_constant_loc[uniform_nr];
1660 } else {
1661 /* Section 5.11 of the OpenGL 4.1 spec says:
1662 * "Out-of-bounds reads return undefined values, which include
1663 * values from other variables of the active program or zero."
1664 * Just return the first push constant.
1665 */
1666 constant_nr = 0;
1667 }
1668
1669 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1670 constant_nr / 8,
1671 constant_nr % 8);
1672
1673 inst->src[i].file = HW_REG;
1674 inst->src[i].fixed_hw_reg = byte_offset(
1675 retype(brw_reg, inst->src[i].type),
1676 inst->src[i].subreg_offset);
1677 }
1678 }
1679 }
1680 }
1681
1682 void
1683 fs_visitor::calculate_urb_setup()
1684 {
1685 assert(stage == MESA_SHADER_FRAGMENT);
1686 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1687 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1688
1689 memset(prog_data->urb_setup, -1,
1690 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1691
1692 int urb_next = 0;
1693 /* Figure out where each of the incoming setup attributes lands. */
1694 if (devinfo->gen >= 6) {
1695 if (_mesa_bitcount_64(prog->InputsRead &
1696 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1697 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1698 * first 16 varying inputs, so we can put them wherever we want.
1699 * Just put them in order.
1700 *
1701 * This is useful because it means that (a) inputs not used by the
1702 * fragment shader won't take up valuable register space, and (b) we
1703 * won't have to recompile the fragment shader if it gets paired with
1704 * a different vertex (or geometry) shader.
1705 */
1706 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1707 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1708 BITFIELD64_BIT(i)) {
1709 prog_data->urb_setup[i] = urb_next++;
1710 }
1711 }
1712 } else {
1713 /* We have enough input varyings that the SF/SBE pipeline stage can't
1714 * arbitrarily rearrange them to suit our whim; we have to put them
1715 * in an order that matches the output of the previous pipeline stage
1716 * (geometry or vertex shader).
1717 */
1718 struct brw_vue_map prev_stage_vue_map;
1719 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1720 key->input_slots_valid);
1721 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1722 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1723 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1724 slot++) {
1725 int varying = prev_stage_vue_map.slot_to_varying[slot];
1726 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1727 * unused.
1728 */
1729 if (varying != BRW_VARYING_SLOT_COUNT &&
1730 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1731 BITFIELD64_BIT(varying))) {
1732 prog_data->urb_setup[varying] = slot - first_slot;
1733 }
1734 }
1735 urb_next = prev_stage_vue_map.num_slots - first_slot;
1736 }
1737 } else {
1738 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1739 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1740 /* Point size is packed into the header, not as a general attribute */
1741 if (i == VARYING_SLOT_PSIZ)
1742 continue;
1743
1744 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1745 /* The back color slot is skipped when the front color is
1746 * also written to. In addition, some slots can be
1747 * written in the vertex shader and not read in the
1748 * fragment shader. So the register number must always be
1749 * incremented, mapped or not.
1750 */
1751 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1752 prog_data->urb_setup[i] = urb_next;
1753 urb_next++;
1754 }
1755 }
1756
1757 /*
 1758        * It's an FS-only attribute, and we did interpolation for this attribute
 1759        * in the SF thread.  So count it here, too.
1760 *
1761 * See compile_sf_prog() for more info.
1762 */
1763 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1764 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1765 }
1766
1767 prog_data->num_varying_inputs = urb_next;
1768 }
1769
1770 void
1771 fs_visitor::assign_urb_setup()
1772 {
1773 assert(stage == MESA_SHADER_FRAGMENT);
1774 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1775
1776 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1777
1778 /* Offset all the urb_setup[] index by the actual position of the
1779 * setup regs, now that the location of the constants has been chosen.
1780 */
1781 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1782 if (inst->opcode == FS_OPCODE_LINTERP) {
1783 assert(inst->src[1].file == HW_REG);
1784 inst->src[1].fixed_hw_reg.nr += urb_start;
1785 }
1786
1787 if (inst->opcode == FS_OPCODE_CINTERP) {
1788 assert(inst->src[0].file == HW_REG);
1789 inst->src[0].fixed_hw_reg.nr += urb_start;
1790 }
1791 }
1792
1793 /* Each attribute is 4 setup channels, each of which is half a reg. */
1794 this->first_non_payload_grf =
1795 urb_start + prog_data->num_varying_inputs * 2;
1796 }
1797
1798 void
1799 fs_visitor::assign_vs_urb_setup()
1800 {
1801 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1802 int grf, count, slot, channel, attr;
1803
1804 assert(stage == MESA_SHADER_VERTEX);
1805 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1806 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1807 count++;
1808
1809 /* Each attribute is 4 regs. */
1810 this->first_non_payload_grf =
1811 payload.num_regs + prog_data->curb_read_length + count * 4;
1812
1813 unsigned vue_entries =
1814 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1815
1816 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1817 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1818
1819 assert(vs_prog_data->base.urb_read_length <= 15);
1820
1821 /* Rewrite all ATTR file references to the hw grf that they land in. */
1822 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1823 for (int i = 0; i < inst->sources; i++) {
1824 if (inst->src[i].file == ATTR) {
1825
1826 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1827 slot = count - 1;
1828 } else {
 1829             /* Attributes arrive in a contiguous block, ordered by their
1830 * gl_vert_attrib value. That means we can compute the slot
1831 * number for an attribute by masking out the enabled
1832 * attributes before it and counting the bits.
1833 */
1834 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1835 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1836 BITFIELD64_MASK(attr));
1837 }
1838
1839 channel = inst->src[i].reg_offset & 3;
1840
1841 grf = payload.num_regs +
1842 prog_data->curb_read_length +
1843 slot * 4 + channel;
1844
1845 inst->src[i].file = HW_REG;
1846 inst->src[i].fixed_hw_reg =
1847 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1848 }
1849 }
1850 }
1851 }
1852
1853 /**
1854 * Split large virtual GRFs into separate components if we can.
1855 *
1856 * This is mostly duplicated with what brw_fs_vector_splitting does,
1857 * but that's really conservative because it's afraid of doing
1858 * splitting that doesn't result in real progress after the rest of
1859 * the optimization phases, which would cause infinite looping in
1860 * optimization. We can do it once here, safely. This also has the
1861 * opportunity to split interpolated values, or maybe even uniforms,
1862 * which we don't have at the IR level.
1863 *
1864 * We want to split, because virtual GRFs are what we register
1865 * allocate and spill (due to contiguousness requirements for some
1866 * instructions), and they're what we naturally generate in the
1867 * codegen process, but most virtual GRFs don't actually need to be
1868 * contiguous sets of GRFs. If we split, we'll end up with reduced
1869 * live intervals and better dead code elimination and coalescing.
1870 */
1871 void
1872 fs_visitor::split_virtual_grfs()
1873 {
1874 int num_vars = this->alloc.count;
1875
1876 /* Count the total number of registers */
1877 int reg_count = 0;
1878 int vgrf_to_reg[num_vars];
1879 for (int i = 0; i < num_vars; i++) {
1880 vgrf_to_reg[i] = reg_count;
1881 reg_count += alloc.sizes[i];
1882 }
1883
1884 /* An array of "split points". For each register slot, this indicates
1885 * if this slot can be separated from the previous slot. Every time an
1886 * instruction uses multiple elements of a register (as a source or
1887 * destination), we mark the used slots as inseparable. Then we go
1888 * through and split the registers into the smallest pieces we can.
1889 */
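   /* Illustrative example: consider a 4-register VGRF whose registers 0-1
    * are always accessed together by one instruction and registers 2-3 by
    * another.  The boundaries inside each access are marked inseparable,
    * but the boundary between registers 1 and 2 remains a split point, so
    * the VGRF is split into two 2-register VGRFs.
    */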
1890 bool split_points[reg_count];
1891 memset(split_points, 0, sizeof(split_points));
1892
1893 /* Mark all used registers as fully splittable */
1894 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1895 if (inst->dst.file == GRF) {
1896 int reg = vgrf_to_reg[inst->dst.reg];
1897 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1898 split_points[reg + j] = true;
1899 }
1900
1901 for (int i = 0; i < inst->sources; i++) {
1902 if (inst->src[i].file == GRF) {
1903 int reg = vgrf_to_reg[inst->src[i].reg];
1904 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1905 split_points[reg + j] = true;
1906 }
1907 }
1908 }
1909
1910 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1911 if (inst->dst.file == GRF) {
1912 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1913 for (int j = 1; j < inst->regs_written; j++)
1914 split_points[reg + j] = false;
1915 }
1916 for (int i = 0; i < inst->sources; i++) {
1917 if (inst->src[i].file == GRF) {
1918 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1919 for (int j = 1; j < inst->regs_read(i); j++)
1920 split_points[reg + j] = false;
1921 }
1922 }
1923 }
1924
1925 int new_virtual_grf[reg_count];
1926 int new_reg_offset[reg_count];
1927
1928 int reg = 0;
1929 for (int i = 0; i < num_vars; i++) {
1930 /* The first one should always be 0 as a quick sanity check. */
1931 assert(split_points[reg] == false);
1932
1933 /* j = 0 case */
1934 new_reg_offset[reg] = 0;
1935 reg++;
1936 int offset = 1;
1937
1938 /* j > 0 case */
1939 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1940 /* If this is a split point, reset the offset to 0 and allocate a
1941           * new virtual GRF covering the previous 'offset' registers.
1942 */
1943 if (split_points[reg]) {
1944 assert(offset <= MAX_VGRF_SIZE);
1945 int grf = alloc.allocate(offset);
1946 for (int k = reg - offset; k < reg; k++)
1947 new_virtual_grf[k] = grf;
1948 offset = 0;
1949 }
1950 new_reg_offset[reg] = offset;
1951 offset++;
1952 reg++;
1953 }
1954
1955 /* The last one gets the original register number */
1956 assert(offset <= MAX_VGRF_SIZE);
1957 alloc.sizes[i] = offset;
1958 for (int k = reg - offset; k < reg; k++)
1959 new_virtual_grf[k] = i;
1960 }
1961 assert(reg == reg_count);
1962
1963 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1964 if (inst->dst.file == GRF) {
1965 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1966 inst->dst.reg = new_virtual_grf[reg];
1967 inst->dst.reg_offset = new_reg_offset[reg];
1968 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1969 }
1970 for (int i = 0; i < inst->sources; i++) {
1971 if (inst->src[i].file == GRF) {
1972 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1973 inst->src[i].reg = new_virtual_grf[reg];
1974 inst->src[i].reg_offset = new_reg_offset[reg];
1975 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1976 }
1977 }
1978 }
1979 invalidate_live_intervals();
1980 }
1981
1982 /**
1983 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1984 *
1985 * During code generation, we create tons of temporary variables, many of
1986 * which get immediately killed and are never used again. Yet, in later
1987 * optimization and analysis passes, such as compute_live_intervals, we need
1988 * to loop over all the virtual GRFs. Compacting them can save a lot of
1989 * overhead.
1990 */
1991 bool
1992 fs_visitor::compact_virtual_grfs()
1993 {
1994 bool progress = false;
1995 int remap_table[this->alloc.count];
1996 memset(remap_table, -1, sizeof(remap_table));
1997
1998 /* Mark which virtual GRFs are used. */
1999 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2000 if (inst->dst.file == GRF)
2001 remap_table[inst->dst.reg] = 0;
2002
2003 for (int i = 0; i < inst->sources; i++) {
2004 if (inst->src[i].file == GRF)
2005 remap_table[inst->src[i].reg] = 0;
2006 }
2007 }
2008
2009 /* Compact the GRF arrays. */
2010 int new_index = 0;
2011 for (unsigned i = 0; i < this->alloc.count; i++) {
2012 if (remap_table[i] == -1) {
2013 /* We just found an unused register. This means that we are
2014 * actually going to compact something.
2015 */
2016 progress = true;
2017 } else {
2018 remap_table[i] = new_index;
2019 alloc.sizes[new_index] = alloc.sizes[i];
2020 invalidate_live_intervals();
2021 ++new_index;
2022 }
2023 }
2024
2025 this->alloc.count = new_index;
2026
2027 /* Patch all the instructions to use the newly renumbered registers */
2028 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2029 if (inst->dst.file == GRF)
2030 inst->dst.reg = remap_table[inst->dst.reg];
2031
2032 for (int i = 0; i < inst->sources; i++) {
2033 if (inst->src[i].file == GRF)
2034 inst->src[i].reg = remap_table[inst->src[i].reg];
2035 }
2036 }
2037
2038 /* Patch all the references to delta_xy, since they're used in register
2039 * allocation. If they're unused, switch them to BAD_FILE so we don't
2040 * think some random VGRF is delta_xy.
2041 */
2042 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2043 if (delta_xy[i].file == GRF) {
2044 if (remap_table[delta_xy[i].reg] != -1) {
2045 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2046 } else {
2047 delta_xy[i].file = BAD_FILE;
2048 }
2049 }
2050 }
2051
2052 return progress;
2053 }
2054
2055 /*
2056 * Implements array access of uniforms by inserting a
2057 * PULL_CONSTANT_LOAD instruction.
2058 *
2059  * Unlike temporary GRF array access (which we don't support, due to
2060 * the difficulty of doing relative addressing on instruction
2061 * destinations), we could potentially do array access of uniforms
2062 * that were loaded in GRF space as push constants. In real-world
2063 * usage we've seen, though, the arrays being used are always larger
2064 * than we could load as push constants, so just always move all
2065 * uniform array access out to a pull constant buffer.
2066 */
2067 void
2068 fs_visitor::move_uniform_array_access_to_pull_constants()
2069 {
2070 if (dispatch_width != 8)
2071 return;
2072
2073 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2074 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2075
2076 /* Walk through and find array access of uniforms. Put a copy of that
2077 * uniform in the pull constant buffer.
2078 *
2079 * Note that we don't move constant-indexed accesses to arrays. No
2080 * testing has been done of the performance impact of this choice.
2081 */
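   /* For example (hypothetical GLSL): with "uniform vec4 kernel[32]", an
    * access like kernel[i] with non-constant i reaches us as a UNIFORM
    * source carrying a reladdr and is copied to the pull constant buffer,
    * while kernel[3] has no reladdr and stays eligible for push constants.
    */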
2082 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2083 for (int i = 0 ; i < inst->sources; i++) {
2084 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2085 continue;
2086
2087 int uniform = inst->src[i].reg;
2088
2089 /* If this array isn't already present in the pull constant buffer,
2090 * add it.
2091 */
2092 if (pull_constant_loc[uniform] == -1) {
2093 const gl_constant_value **values = &stage_prog_data->param[uniform];
2094
2095 assert(param_size[uniform]);
2096
2097 for (int j = 0; j < param_size[uniform]; j++) {
2098 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2099
2100 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2101 values[j];
2102 }
2103 }
2104 }
2105 }
2106 }
2107
2108 /**
2109 * Assign UNIFORM file registers to either push constants or pull constants.
2110 *
2111  * We allow a fragment shader to have more than the spec's required minimum
2112  * maximum number of fragment shader uniform components (64). If there are
2113  * too many of these, they'd fill up all of the register space.
2114 * So, this will push some of them out to the pull constant buffer and
2115 * update the program to load them.
2116 */
2117 void
2118 fs_visitor::assign_constant_locations()
2119 {
2120 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2121 if (dispatch_width != 8)
2122 return;
2123
2124 /* Find which UNIFORM registers are still in use. */
2125 bool is_live[uniforms];
2126 for (unsigned int i = 0; i < uniforms; i++) {
2127 is_live[i] = false;
2128 }
2129
2130 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2131 for (int i = 0; i < inst->sources; i++) {
2132 if (inst->src[i].file != UNIFORM)
2133 continue;
2134
2135 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2136 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2137 is_live[constant_nr] = true;
2138 }
2139 }
2140
2141 /* Only allow 16 registers (128 uniform components) as push constants.
2142 *
2143 * Just demote the end of the list. We could probably do better
2144 * here, demoting things that are rarely used in the program first.
2145 *
2146 * If changing this value, note the limitation about total_regs in
2147 * brw_curbe.c.
2148 */
2149 unsigned int max_push_components = 16 * 8;
2150 unsigned int num_push_constants = 0;
2151
2152 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2153
2154 for (unsigned int i = 0; i < uniforms; i++) {
2155 if (!is_live[i] || pull_constant_loc[i] != -1) {
2156 /* This UNIFORM register is either dead, or has already been demoted
2157 * to a pull const. Mark it as no longer living in the param[] array.
2158 */
2159 push_constant_loc[i] = -1;
2160 continue;
2161 }
2162
2163 if (num_push_constants < max_push_components) {
2164          /* Retain as a push constant. Record the location in the param[]
2165 * array.
2166 */
2167 push_constant_loc[i] = num_push_constants++;
2168 } else {
2169 /* Demote to a pull constant. */
2170 push_constant_loc[i] = -1;
2171
2172 int pull_index = stage_prog_data->nr_pull_params++;
2173 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2174 pull_constant_loc[i] = pull_index;
2175 }
2176 }
2177
2178 stage_prog_data->nr_params = num_push_constants;
2179
2180 /* Up until now, the param[] array has been indexed by reg + reg_offset
2181 * of UNIFORM registers. Condense it to only contain the uniforms we
2182 * chose to upload as push constants.
2183 */
2184 for (unsigned int i = 0; i < uniforms; i++) {
2185 int remapped = push_constant_loc[i];
2186
2187 if (remapped == -1)
2188 continue;
2189
2190 assert(remapped <= (int)i);
2191 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2192 }
2193 }
2194
2195 /**
2196 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2197 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2198 */
2199 void
2200 fs_visitor::demote_pull_constants()
2201 {
2202 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2203 for (int i = 0; i < inst->sources; i++) {
2204 if (inst->src[i].file != UNIFORM)
2205 continue;
2206
2207 int pull_index;
2208 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2209 if (location >= uniforms) /* Out of bounds access */
2210 pull_index = -1;
2211 else
2212 pull_index = pull_constant_loc[location];
2213
2214 if (pull_index == -1)
2215 continue;
2216
2217          /* Set up the annotation tracking for newly generated instructions. */
2218 base_ir = inst->ir;
2219 current_annotation = inst->annotation;
2220
2221 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2222 fs_reg dst = vgrf(glsl_type::float_type);
2223
2224 /* Generate a pull load into dst. */
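         /* Note on the constant-index case below: the byte offset is rounded
          * down to a 16-byte (vec4) boundary, and set_smear() then selects
          * the matching component (pull_index & 3) out of the four dwords
          * returned by the load.
          */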
2225 if (inst->src[i].reladdr) {
2226 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2227 surf_index,
2228 *inst->src[i].reladdr,
2229 pull_index);
2230 inst->insert_before(block, &list);
2231 inst->src[i].reladdr = NULL;
2232 } else {
2233 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2234 fs_inst *pull =
2235 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2236 dst, surf_index, offset);
2237 inst->insert_before(block, pull);
2238 inst->src[i].set_smear(pull_index & 3);
2239 }
2240
2241 /* Rewrite the instruction to use the temporary VGRF. */
2242 inst->src[i].file = GRF;
2243 inst->src[i].reg = dst.reg;
2244 inst->src[i].reg_offset = 0;
2245 inst->src[i].width = dispatch_width;
2246 }
2247 }
2248 invalidate_live_intervals();
2249 }
2250
2251 bool
2252 fs_visitor::opt_algebraic()
2253 {
2254 bool progress = false;
2255
2256 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2257 switch (inst->opcode) {
2258 case BRW_OPCODE_MOV:
2259 if (inst->src[0].file != IMM)
2260 break;
2261
2262 if (inst->saturate) {
2263 if (inst->dst.type != inst->src[0].type)
2264 assert(!"unimplemented: saturate mixed types");
2265
2266 if (brw_saturate_immediate(inst->dst.type,
2267 &inst->src[0].fixed_hw_reg)) {
2268 inst->saturate = false;
2269 progress = true;
2270 }
2271 }
2272 break;
2273
2274 case BRW_OPCODE_MUL:
2275 if (inst->src[1].file != IMM)
2276 continue;
2277
2278 /* a * 1.0 = a */
2279 if (inst->src[1].is_one()) {
2280 inst->opcode = BRW_OPCODE_MOV;
2281 inst->src[1] = reg_undef;
2282 progress = true;
2283 break;
2284 }
2285
2286 /* a * -1.0 = -a */
2287 if (inst->src[1].is_negative_one()) {
2288 inst->opcode = BRW_OPCODE_MOV;
2289 inst->src[0].negate = !inst->src[0].negate;
2290 inst->src[1] = reg_undef;
2291 progress = true;
2292 break;
2293 }
2294
2295 /* a * 0.0 = 0.0 */
2296 if (inst->src[1].is_zero()) {
2297 inst->opcode = BRW_OPCODE_MOV;
2298 inst->src[0] = inst->src[1];
2299 inst->src[1] = reg_undef;
2300 progress = true;
2301 break;
2302 }
2303
2304 if (inst->src[0].file == IMM) {
2305 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2306 inst->opcode = BRW_OPCODE_MOV;
2307 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2308 inst->src[1] = reg_undef;
2309 progress = true;
2310 break;
2311 }
2312 break;
2313 case BRW_OPCODE_ADD:
2314 if (inst->src[1].file != IMM)
2315 continue;
2316
2317 /* a + 0.0 = a */
2318 if (inst->src[1].is_zero()) {
2319 inst->opcode = BRW_OPCODE_MOV;
2320 inst->src[1] = reg_undef;
2321 progress = true;
2322 break;
2323 }
2324
2325 if (inst->src[0].file == IMM) {
2326 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2327 inst->opcode = BRW_OPCODE_MOV;
2328 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2329 inst->src[1] = reg_undef;
2330 progress = true;
2331 break;
2332 }
2333 break;
2334 case BRW_OPCODE_OR:
2335 if (inst->src[0].equals(inst->src[1])) {
2336 inst->opcode = BRW_OPCODE_MOV;
2337 inst->src[1] = reg_undef;
2338 progress = true;
2339 break;
2340 }
2341 break;
2342 case BRW_OPCODE_LRP:
2343 if (inst->src[1].equals(inst->src[2])) {
2344 inst->opcode = BRW_OPCODE_MOV;
2345 inst->src[0] = inst->src[1];
2346 inst->src[1] = reg_undef;
2347 inst->src[2] = reg_undef;
2348 progress = true;
2349 break;
2350 }
2351 break;
2352 case BRW_OPCODE_CMP:
2353 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2354 inst->src[0].abs &&
2355 inst->src[0].negate &&
2356 inst->src[1].is_zero()) {
2357 inst->src[0].abs = false;
2358 inst->src[0].negate = false;
2359 inst->conditional_mod = BRW_CONDITIONAL_Z;
2360 progress = true;
2361 break;
2362 }
2363 break;
2364 case BRW_OPCODE_SEL:
2365 if (inst->src[0].equals(inst->src[1])) {
2366 inst->opcode = BRW_OPCODE_MOV;
2367 inst->src[1] = reg_undef;
2368 inst->predicate = BRW_PREDICATE_NONE;
2369 inst->predicate_inverse = false;
2370 progress = true;
2371 } else if (inst->saturate && inst->src[1].file == IMM) {
2372 switch (inst->conditional_mod) {
2373 case BRW_CONDITIONAL_LE:
2374 case BRW_CONDITIONAL_L:
2375 switch (inst->src[1].type) {
2376 case BRW_REGISTER_TYPE_F:
2377 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2378 inst->opcode = BRW_OPCODE_MOV;
2379 inst->src[1] = reg_undef;
2380 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2381 progress = true;
2382 }
2383 break;
2384 default:
2385 break;
2386 }
2387 break;
2388 case BRW_CONDITIONAL_GE:
2389 case BRW_CONDITIONAL_G:
2390 switch (inst->src[1].type) {
2391 case BRW_REGISTER_TYPE_F:
2392 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2393 inst->opcode = BRW_OPCODE_MOV;
2394 inst->src[1] = reg_undef;
2395 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2396 progress = true;
2397 }
2398 break;
2399 default:
2400 break;
2401 }
2402 default:
2403 break;
2404 }
2405 }
2406 break;
2407 case BRW_OPCODE_MAD:
2408 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2409 inst->opcode = BRW_OPCODE_MOV;
2410 inst->src[1] = reg_undef;
2411 inst->src[2] = reg_undef;
2412 progress = true;
2413 } else if (inst->src[0].is_zero()) {
2414 inst->opcode = BRW_OPCODE_MUL;
2415 inst->src[0] = inst->src[2];
2416 inst->src[2] = reg_undef;
2417 progress = true;
2418 } else if (inst->src[1].is_one()) {
2419 inst->opcode = BRW_OPCODE_ADD;
2420 inst->src[1] = inst->src[2];
2421 inst->src[2] = reg_undef;
2422 progress = true;
2423 } else if (inst->src[2].is_one()) {
2424 inst->opcode = BRW_OPCODE_ADD;
2425 inst->src[2] = reg_undef;
2426 progress = true;
2427 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2428 inst->opcode = BRW_OPCODE_ADD;
2429 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2430 inst->src[2] = reg_undef;
2431 progress = true;
2432 }
2433 break;
2434 case SHADER_OPCODE_RCP: {
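      /* 1/sqrt(x) == rsq(x): if the operand of this RCP is produced by the
       * immediately preceding SQRT, fold the pair into a single RSQ.  The
       * SQRT itself is left for dead code elimination to remove if it has
       * no other users.
       */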
2435 fs_inst *prev = (fs_inst *)inst->prev;
2436 if (prev->opcode == SHADER_OPCODE_SQRT) {
2437 if (inst->src[0].equals(prev->dst)) {
2438 inst->opcode = SHADER_OPCODE_RSQ;
2439 inst->src[0] = prev->src[0];
2440 progress = true;
2441 }
2442 }
2443 break;
2444 }
2445 case SHADER_OPCODE_BROADCAST:
2446 if (is_uniform(inst->src[0])) {
2447 inst->opcode = BRW_OPCODE_MOV;
2448 inst->sources = 1;
2449 inst->force_writemask_all = true;
2450 progress = true;
2451 } else if (inst->src[1].file == IMM) {
2452 inst->opcode = BRW_OPCODE_MOV;
2453 inst->src[0] = component(inst->src[0],
2454 inst->src[1].fixed_hw_reg.dw1.ud);
2455 inst->sources = 1;
2456 inst->force_writemask_all = true;
2457 progress = true;
2458 }
2459 break;
2460
2461 default:
2462 break;
2463 }
2464
2465 /* Swap if src[0] is immediate. */
2466 if (progress && inst->is_commutative()) {
2467 if (inst->src[0].file == IMM) {
2468 fs_reg tmp = inst->src[1];
2469 inst->src[1] = inst->src[0];
2470 inst->src[0] = tmp;
2471 }
2472 }
2473 }
2474 return progress;
2475 }
2476
2477 /**
2478 * Optimize sample messages that have constant zero values for the trailing
2479 * texture coordinates. We can just reduce the message length for these
2480 * instructions instead of reserving a register for it. Trailing parameters
2481 * that aren't sent default to zero anyway. This will cause the dead code
2482 * eliminator to remove the MOV instruction that would otherwise be emitted to
2483 * set up the zero value.
2484 */
2485 bool
2486 fs_visitor::opt_zero_samples()
2487 {
2488 /* Gen4 infers the texturing opcode based on the message length so we can't
2489 * change it.
2490 */
2491 if (devinfo->gen < 5)
2492 return false;
2493
2494 bool progress = false;
2495
2496 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2497 if (!inst->is_tex())
2498 continue;
2499
2500 fs_inst *load_payload = (fs_inst *) inst->prev;
2501
2502 if (load_payload->is_head_sentinel() ||
2503 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2504 continue;
2505
2506 /* We don't want to remove the message header or the first parameter.
2507       * Removing the first parameter is not allowed; see the Haswell PRM
2508 * volume 7, page 149:
2509 *
2510 * "Parameter 0 is required except for the sampleinfo message, which
2511 * has no parameter 0"
2512 */
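      /* Worked example (hypothetical message): a SIMD8 sample with mlen 5
       * and a one-register header has its parameters in src[1]..src[4] of
       * the LOAD_PAYLOAD.  (mlen - header_size) / (dispatch_width / 8) +
       * header_size - 1 == 4, so src[4] is tested first and mlen shrinks by
       * one register per trailing zero parameter, stopping before
       * parameter 0.
       */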
2513 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2514 load_payload->src[(inst->mlen - inst->header_size) /
2515 (dispatch_width / 8) +
2516 inst->header_size - 1].is_zero()) {
2517 inst->mlen -= dispatch_width / 8;
2518 progress = true;
2519 }
2520 }
2521
2522 if (progress)
2523 invalidate_live_intervals();
2524
2525 return progress;
2526 }
2527
2528 /**
2529 * Optimize sample messages which are followed by the final RT write.
2530 *
2531  * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2532 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2533 * final texturing results copied to the framebuffer write payload and modify
2534 * them to write to the framebuffer directly.
2535 */
2536 bool
2537 fs_visitor::opt_sampler_eot()
2538 {
2539 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2540
2541 if (stage != MESA_SHADER_FRAGMENT)
2542 return false;
2543
2544 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2545 return false;
2546
2547 /* FINISHME: It should be possible to implement this optimization when there
2548 * are multiple drawbuffers.
2549 */
2550 if (key->nr_color_regions != 1)
2551 return false;
2552
2553 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2554 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2555 assert(fb_write->eot);
2556 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2557
2558 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2559
2560 /* There wasn't one; nothing to do. */
2561 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2562 return false;
2563
2564    /* This optimization doesn't seem to work for textureGather for some
2565 * reason. I can't find any documentation or known workarounds to indicate
2566 * that this is expected, but considering that it is probably pretty
2567 * unlikely that a shader would directly write out the results from
2568 * textureGather we might as well just disable it.
2569 */
2570 if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2571 tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2572 return false;
2573
2574 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2575 * It's very likely to be the previous instruction.
2576 */
2577 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2578 if (load_payload->is_head_sentinel() ||
2579 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2580 return false;
2581
2582 assert(!tex_inst->eot); /* We can't get here twice */
2583 assert((tex_inst->offset & (0xff << 24)) == 0);
2584
2585 tex_inst->offset |= fb_write->target << 24;
2586 tex_inst->eot = true;
2587 tex_inst->dst = bld.null_reg_ud();
2588 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2589
2590 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2591 * to create a new LOAD_PAYLOAD command with the same sources and a space
2592    * saved for the header. Using a new destination register not only makes sure
2593    * we have enough space, but it also lets the dead code eliminator kill
2594 * the instruction that this will replace.
2595 */
2596 if (tex_inst->header_size != 0)
2597 return true;
2598
2599 fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
2600 load_payload->sources + 1);
2601 fs_reg *new_sources =
2602 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2603
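   /* Leave the first source as BAD_FILE: lower_load_payload() skips such
    * sources without emitting a MOV, which reserves one register of the new
    * payload for the message header.
    */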
2604 new_sources[0] = fs_reg();
2605 for (int i = 0; i < load_payload->sources; i++)
2606 new_sources[i+1] = load_payload->src[i];
2607
2608 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2609 * requires a lot of information about the sources to appropriately figure
2610 * out the number of registers needed to be used. Given this stage in our
2611 * optimization, we may not have the appropriate GRFs required by
2612 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2613 * manually emit the instruction.
2614 */
2615 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2616 load_payload->exec_size,
2617 send_header,
2618 new_sources,
2619 load_payload->sources + 1);
2620
2621 new_load_payload->regs_written = load_payload->regs_written + 1;
2622 new_load_payload->header_size = 1;
2623 tex_inst->mlen++;
2624 tex_inst->header_size = 1;
2625 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2626 tex_inst->src[0] = send_header;
2627
2628 return true;
2629 }
2630
2631 bool
2632 fs_visitor::opt_register_renaming()
2633 {
2634 bool progress = false;
2635 int depth = 0;
2636
2637 int remap[alloc.count];
2638 memset(remap, -1, sizeof(int) * alloc.count);
2639
2640 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2641 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2642 depth++;
2643 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2644 inst->opcode == BRW_OPCODE_WHILE) {
2645 depth--;
2646 }
2647
2648 /* Rewrite instruction sources. */
2649 for (int i = 0; i < inst->sources; i++) {
2650 if (inst->src[i].file == GRF &&
2651 remap[inst->src[i].reg] != -1 &&
2652 remap[inst->src[i].reg] != inst->src[i].reg) {
2653 inst->src[i].reg = remap[inst->src[i].reg];
2654 progress = true;
2655 }
2656 }
2657
2658 const int dst = inst->dst.reg;
2659
2660 if (depth == 0 &&
2661 inst->dst.file == GRF &&
2662 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2663 !inst->is_partial_write()) {
2664 if (remap[dst] == -1) {
2665 remap[dst] = dst;
2666 } else {
2667 remap[dst] = alloc.allocate(inst->dst.width / 8);
2668 inst->dst.reg = remap[dst];
2669 progress = true;
2670 }
2671 } else if (inst->dst.file == GRF &&
2672 remap[dst] != -1 &&
2673 remap[dst] != dst) {
2674 inst->dst.reg = remap[dst];
2675 progress = true;
2676 }
2677 }
2678
2679 if (progress) {
2680 invalidate_live_intervals();
2681
2682 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2683 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2684 delta_xy[i].reg = remap[delta_xy[i].reg];
2685 }
2686 }
2687 }
2688
2689 return progress;
2690 }
2691
2692 /**
2693 * Remove redundant or useless discard jumps.
2694 *
2695 * For example, we can eliminate jumps in the following sequence:
2696 *
2697 * discard-jump (redundant with the next jump)
2698 * discard-jump (useless; jumps to the next instruction)
2699 * placeholder-halt
2700 */
2701 bool
2702 fs_visitor::opt_redundant_discard_jumps()
2703 {
2704 bool progress = false;
2705
2706 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2707
2708 fs_inst *placeholder_halt = NULL;
2709 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2710 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2711 placeholder_halt = inst;
2712 break;
2713 }
2714 }
2715
2716 if (!placeholder_halt)
2717 return false;
2718
2719 /* Delete any HALTs immediately before the placeholder halt. */
2720 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2721 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2722 prev = (fs_inst *) placeholder_halt->prev) {
2723 prev->remove(last_bblock);
2724 progress = true;
2725 }
2726
2727 if (progress)
2728 invalidate_live_intervals();
2729
2730 return progress;
2731 }
2732
2733 bool
2734 fs_visitor::compute_to_mrf()
2735 {
2736 bool progress = false;
2737 int next_ip = 0;
2738
2739 /* No MRFs on Gen >= 7. */
2740 if (devinfo->gen >= 7)
2741 return false;
2742
2743 calculate_live_intervals();
2744
2745 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2746 int ip = next_ip;
2747 next_ip++;
2748
2749 if (inst->opcode != BRW_OPCODE_MOV ||
2750 inst->is_partial_write() ||
2751 inst->dst.file != MRF || inst->src[0].file != GRF ||
2752 inst->dst.type != inst->src[0].type ||
2753 inst->src[0].abs || inst->src[0].negate ||
2754 !inst->src[0].is_contiguous() ||
2755 inst->src[0].subreg_offset)
2756 continue;
2757
2758 /* Work out which hardware MRF registers are written by this
2759 * instruction.
2760 */
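      /* For example, a SIMD16 write to m2 also fills m3 (mrf_high = mrf_low + 1),
       * while a COMPR4 write to m2 lands in m2 and m6 (mrf_high = mrf_low + 4).
       */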
2761 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2762 int mrf_high;
2763 if (inst->dst.reg & BRW_MRF_COMPR4) {
2764 mrf_high = mrf_low + 4;
2765 } else if (inst->exec_size == 16) {
2766 mrf_high = mrf_low + 1;
2767 } else {
2768 mrf_high = mrf_low;
2769 }
2770
2771 /* Can't compute-to-MRF this GRF if someone else was going to
2772 * read it later.
2773 */
2774 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2775 continue;
2776
2777       /* Found a move of a GRF to an MRF.  Let's see if we can rewrite
2778        * the instruction that produced this GRF to write into the MRF instead.
2779 */
2780 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2781 if (scan_inst->dst.file == GRF &&
2782 scan_inst->dst.reg == inst->src[0].reg) {
2783             /* Found the last instruction to write the register we want to
2784              * turn into a compute-to-MRF.
2785 */
2786
2787 /* If this one instruction didn't populate all the
2788 * channels, bail. We might be able to rewrite everything
2789 * that writes that reg, but it would require smarter
2790 * tracking to delay the rewriting until complete success.
2791 */
2792 if (scan_inst->is_partial_write())
2793 break;
2794
2795             /* Instructions that write more than one register would need us to
2796 * understand coalescing out more than one MOV at a time.
2797 */
2798 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2799 break;
2800
2801 /* SEND instructions can't have MRF as a destination. */
2802 if (scan_inst->mlen)
2803 break;
2804
2805 if (devinfo->gen == 6) {
2806 /* gen6 math instructions must have the destination be
2807 * GRF, so no compute-to-MRF for them.
2808 */
2809 if (scan_inst->is_math()) {
2810 break;
2811 }
2812 }
2813
2814 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2815 /* Found the creator of our MRF's source value. */
2816 scan_inst->dst.file = MRF;
2817 scan_inst->dst.reg = inst->dst.reg;
2818 scan_inst->saturate |= inst->saturate;
2819 inst->remove(block);
2820 progress = true;
2821 }
2822 break;
2823 }
2824
2825 /* We don't handle control flow here. Most computation of
2826           * values that end up in MRFs happens shortly before the MRF
2827 * write anyway.
2828 */
2829 if (block->start() == scan_inst)
2830 break;
2831
2832 /* You can't read from an MRF, so if someone else reads our
2833 * MRF's source GRF that we wanted to rewrite, that stops us.
2834 */
2835 bool interfered = false;
2836 for (int i = 0; i < scan_inst->sources; i++) {
2837 if (scan_inst->src[i].file == GRF &&
2838 scan_inst->src[i].reg == inst->src[0].reg &&
2839 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2840 interfered = true;
2841 }
2842 }
2843 if (interfered)
2844 break;
2845
2846 if (scan_inst->dst.file == MRF) {
2847 /* If somebody else writes our MRF here, we can't
2848 * compute-to-MRF before that.
2849 */
2850 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2851 int scan_mrf_high;
2852
2853 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2854 scan_mrf_high = scan_mrf_low + 4;
2855 } else if (scan_inst->exec_size == 16) {
2856 scan_mrf_high = scan_mrf_low + 1;
2857 } else {
2858 scan_mrf_high = scan_mrf_low;
2859 }
2860
2861 if (mrf_low == scan_mrf_low ||
2862 mrf_low == scan_mrf_high ||
2863 mrf_high == scan_mrf_low ||
2864 mrf_high == scan_mrf_high) {
2865 break;
2866 }
2867 }
2868
2869 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2870 /* Found a SEND instruction, which means that there are
2871 * live values in MRFs from base_mrf to base_mrf +
2872 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2873 * above it.
2874 */
2875 if (mrf_low >= scan_inst->base_mrf &&
2876 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2877 break;
2878 }
2879 if (mrf_high >= scan_inst->base_mrf &&
2880 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2881 break;
2882 }
2883 }
2884 }
2885 }
2886
2887 if (progress)
2888 invalidate_live_intervals();
2889
2890 return progress;
2891 }
2892
2893 /**
2894 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2895 * flow. We could probably do better here with some form of divergence
2896 * analysis.
2897 */
2898 bool
2899 fs_visitor::eliminate_find_live_channel()
2900 {
2901 bool progress = false;
2902 unsigned depth = 0;
2903
2904 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2905 switch (inst->opcode) {
2906 case BRW_OPCODE_IF:
2907 case BRW_OPCODE_DO:
2908 depth++;
2909 break;
2910
2911 case BRW_OPCODE_ENDIF:
2912 case BRW_OPCODE_WHILE:
2913 depth--;
2914 break;
2915
2916 case FS_OPCODE_DISCARD_JUMP:
2917 /* This can potentially make control flow non-uniform until the end
2918 * of the program.
2919 */
2920 return progress;
2921
2922 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
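         /* At depth 0 no divergent control flow (and no DISCARD_JUMP) has
          * been encountered yet, so the pass assumes channel 0 is a live
          * channel and replaces the scan with an immediate 0; see the note
          * above about divergence analysis.
          */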
2923 if (depth == 0) {
2924 inst->opcode = BRW_OPCODE_MOV;
2925 inst->src[0] = fs_reg(0);
2926 inst->sources = 1;
2927 inst->force_writemask_all = true;
2928 progress = true;
2929 }
2930 break;
2931
2932 default:
2933 break;
2934 }
2935 }
2936
2937 return progress;
2938 }
2939
2940 /**
2941 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2942 * instructions to FS_OPCODE_REP_FB_WRITE.
2943 */
2944 void
2945 fs_visitor::emit_repclear_shader()
2946 {
2947 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2948 int base_mrf = 1;
2949 int color_mrf = base_mrf + 2;
2950
2951 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2952 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2953 mov->force_writemask_all = true;
2954
2955 fs_inst *write;
2956 if (key->nr_color_regions == 1) {
2957 write = emit(FS_OPCODE_REP_FB_WRITE);
2958 write->saturate = key->clamp_fragment_color;
2959 write->base_mrf = color_mrf;
2960 write->target = 0;
2961 write->header_size = 0;
2962 write->mlen = 1;
2963 } else {
2964 assume(key->nr_color_regions > 0);
2965 for (int i = 0; i < key->nr_color_regions; ++i) {
2966 write = emit(FS_OPCODE_REP_FB_WRITE);
2967 write->saturate = key->clamp_fragment_color;
2968 write->base_mrf = base_mrf;
2969 write->target = i;
2970 write->header_size = 2;
2971 write->mlen = 3;
2972 }
2973 }
2974 write->eot = true;
2975
2976 calculate_cfg();
2977
2978 assign_constant_locations();
2979 assign_curb_setup();
2980
2981 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2982 assert(mov->src[0].file == HW_REG);
2983 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2984 }
2985
2986 /**
2987 * Walks through basic blocks, looking for repeated MRF writes and
2988 * removing the later ones.
2989 */
2990 bool
2991 fs_visitor::remove_duplicate_mrf_writes()
2992 {
2993 fs_inst *last_mrf_move[16];
2994 bool progress = false;
2995
2996 /* Need to update the MRF tracking for compressed instructions. */
2997 if (dispatch_width == 16)
2998 return false;
2999
3000 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3001
3002 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3003 if (inst->is_control_flow()) {
3004 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3005 }
3006
3007 if (inst->opcode == BRW_OPCODE_MOV &&
3008 inst->dst.file == MRF) {
3009 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3010 if (prev_inst && inst->equals(prev_inst)) {
3011 inst->remove(block);
3012 progress = true;
3013 continue;
3014 }
3015 }
3016
3017 /* Clear out the last-write records for MRFs that were overwritten. */
3018 if (inst->dst.file == MRF) {
3019 last_mrf_move[inst->dst.reg] = NULL;
3020 }
3021
3022 if (inst->mlen > 0 && inst->base_mrf != -1) {
3023 /* Found a SEND instruction, which will include two or fewer
3024 * implied MRF writes. We could do better here.
3025 */
3026 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3027 last_mrf_move[inst->base_mrf + i] = NULL;
3028 }
3029 }
3030
3031 /* Clear out any MRF move records whose sources got overwritten. */
3032 if (inst->dst.file == GRF) {
3033 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3034 if (last_mrf_move[i] &&
3035 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3036 last_mrf_move[i] = NULL;
3037 }
3038 }
3039 }
3040
3041 if (inst->opcode == BRW_OPCODE_MOV &&
3042 inst->dst.file == MRF &&
3043 inst->src[0].file == GRF &&
3044 !inst->is_partial_write()) {
3045 last_mrf_move[inst->dst.reg] = inst;
3046 }
3047 }
3048
3049 if (progress)
3050 invalidate_live_intervals();
3051
3052 return progress;
3053 }
3054
3055 static void
3056 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3057 {
3058 /* Clear the flag for registers that actually got read (as expected). */
3059 for (int i = 0; i < inst->sources; i++) {
3060 int grf;
3061 if (inst->src[i].file == GRF) {
3062 grf = inst->src[i].reg;
3063 } else if (inst->src[i].file == HW_REG &&
3064 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3065 grf = inst->src[i].fixed_hw_reg.nr;
3066 } else {
3067 continue;
3068 }
3069
3070 if (grf >= first_grf &&
3071 grf < first_grf + grf_len) {
3072 deps[grf - first_grf] = false;
3073 if (inst->exec_size == 16)
3074 deps[grf - first_grf + 1] = false;
3075 }
3076 }
3077 }
3078
3079 /**
3080 * Implements this workaround for the original 965:
3081 *
3082 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3083 * check for post destination dependencies on this instruction, software
3084 * must ensure that there is no destination hazard for the case of ‘write
3085 * followed by a posted write’ shown in the following example.
3086 *
3087 * 1. mov r3 0
3088 * 2. send r3.xy <rest of send instruction>
3089 * 3. mov r2 r3
3090 *
3091 * Due to no post-destination dependency check on the ‘send’, the above
3092 * code sequence could have two instructions (1 and 2) in flight at the
3093 * same time that both consider ‘r3’ as the target of their final writes.
3094 */
3095 void
3096 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3097 fs_inst *inst)
3098 {
3099 int write_len = inst->regs_written;
3100 int first_write_grf = inst->dst.reg;
3101 bool needs_dep[BRW_MAX_MRF];
3102 assert(write_len < (int)sizeof(needs_dep) - 1);
3103
3104 memset(needs_dep, false, sizeof(needs_dep));
3105 memset(needs_dep, true, write_len);
3106
3107 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3108
3109 /* Walk backwards looking for writes to registers we're writing which
3110 * aren't read since being written. If we hit the start of the program,
3111 * we assume that there are no outstanding dependencies on entry to the
3112 * program.
3113 */
3114 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3115 /* If we hit control flow, assume that there *are* outstanding
3116 * dependencies, and force their cleanup before our instruction.
3117 */
3118 if (block->start() == scan_inst) {
3119 for (int i = 0; i < write_len; i++) {
3120 if (needs_dep[i]) {
3121 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3122 }
3123 }
3124 return;
3125 }
3126
3127 /* We insert our reads as late as possible on the assumption that any
3128 * instruction but a MOV that might have left us an outstanding
3129 * dependency has more latency than a MOV.
3130 */
3131 if (scan_inst->dst.file == GRF) {
3132 for (int i = 0; i < scan_inst->regs_written; i++) {
3133 int reg = scan_inst->dst.reg + i;
3134
3135 if (reg >= first_write_grf &&
3136 reg < first_write_grf + write_len &&
3137 needs_dep[reg - first_write_grf]) {
3138 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3139 needs_dep[reg - first_write_grf] = false;
3140 if (scan_inst->exec_size == 16)
3141 needs_dep[reg - first_write_grf + 1] = false;
3142 }
3143 }
3144 }
3145
3146 /* Clear the flag for registers that actually got read (as expected). */
3147 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3148
3149 /* Continue the loop only if we haven't resolved all the dependencies */
3150 int i;
3151 for (i = 0; i < write_len; i++) {
3152 if (needs_dep[i])
3153 break;
3154 }
3155 if (i == write_len)
3156 return;
3157 }
3158 }
3159
3160 /**
3161 * Implements this workaround for the original 965:
3162 *
3163 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3164 * used as a destination register until after it has been sourced by an
3165 * instruction with a different destination register.
3166 */
3167 void
3168 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3169 {
3170 int write_len = inst->regs_written;
3171 int first_write_grf = inst->dst.reg;
3172 bool needs_dep[BRW_MAX_MRF];
3173 assert(write_len < (int)sizeof(needs_dep) - 1);
3174
3175 memset(needs_dep, false, sizeof(needs_dep));
3176 memset(needs_dep, true, write_len);
3177 /* Walk forwards looking for writes to registers we're writing which aren't
3178 * read before being written.
3179 */
3180 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3181 /* If we hit control flow, force resolve all remaining dependencies. */
3182 if (block->end() == scan_inst) {
3183 for (int i = 0; i < write_len; i++) {
3184 if (needs_dep[i])
3185 scan_inst->insert_before(block,
3186 DEP_RESOLVE_MOV(first_write_grf + i));
3187 }
3188 return;
3189 }
3190
3191 /* Clear the flag for registers that actually got read (as expected). */
3192 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3193
3194 /* We insert our reads as late as possible since they're reading the
3195 * result of a SEND, which has massive latency.
3196 */
3197 if (scan_inst->dst.file == GRF &&
3198 scan_inst->dst.reg >= first_write_grf &&
3199 scan_inst->dst.reg < first_write_grf + write_len &&
3200 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3201 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3202 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3203 }
3204
3205 /* Continue the loop only if we haven't resolved all the dependencies */
3206 int i;
3207 for (i = 0; i < write_len; i++) {
3208 if (needs_dep[i])
3209 break;
3210 }
3211 if (i == write_len)
3212 return;
3213 }
3214 }
3215
3216 void
3217 fs_visitor::insert_gen4_send_dependency_workarounds()
3218 {
3219 if (devinfo->gen != 4 || devinfo->is_g4x)
3220 return;
3221
3222 bool progress = false;
3223
3224 /* Note that we're done with register allocation, so GRF fs_regs always
3225 * have a .reg_offset of 0.
3226 */
3227
3228 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3229 if (inst->mlen != 0 && inst->dst.file == GRF) {
3230 insert_gen4_pre_send_dependency_workarounds(block, inst);
3231 insert_gen4_post_send_dependency_workarounds(block, inst);
3232 progress = true;
3233 }
3234 }
3235
3236 if (progress)
3237 invalidate_live_intervals();
3238 }
3239
3240 /**
3241 * Turns the generic expression-style uniform pull constant load instruction
3242 * into a hardware-specific series of instructions for loading a pull
3243 * constant.
3244 *
3245 * The expression style allows the CSE pass before this to optimize out
3246 * repeated loads from the same offset, and gives the pre-register-allocation
3247 * scheduling full flexibility, while the conversion to native instructions
3248 * allows the post-register-allocation scheduler the best information
3249 * possible.
3250 *
3251 * Note that execution masking for setting up pull constant loads is special:
3252 * the channels that need to be written are unrelated to the current execution
3253 * mask, since a later instruction will use one of the result channels as a
3254 * source operand for all 8 or 16 of its channels.
3255 */
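/* Roughly, on Gen7+ (illustrative example; the register numbers, surface
 * index and byte offset are made up):
 *
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD       vgrf4, surf_index 5, offset 48
 *
 * becomes:
 *
 *    FS_OPCODE_SET_SIMD4X2_OFFSET               vgrf9, 12
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7  vgrf4, surf_index 5, vgrf9
 *
 * where 12 is the 48-byte offset converted to dwords and vgrf9 is a newly
 * allocated payload register.
 */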
3256 void
3257 fs_visitor::lower_uniform_pull_constant_loads()
3258 {
3259 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3260 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3261 continue;
3262
3263 if (devinfo->gen >= 7) {
3264 /* The offset arg before was a vec4-aligned byte offset. We need to
3265 * turn it into a dword offset.
3266 */
3267 fs_reg const_offset_reg = inst->src[1];
3268 assert(const_offset_reg.file == IMM &&
3269 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3270 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3271 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3272
3273 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3274 * Reserve space for the register.
3275 */
3276 if (devinfo->gen >= 9) {
3277 payload.reg_offset++;
3278 alloc.sizes[payload.reg] = 2;
3279 }
3280
3281 /* This is actually going to be a MOV, but since only the first dword
3282 * is accessed, we have a special opcode to do just that one. Note
3283 * that this needs to be an operation that will be considered a def
3284 * by live variable analysis, or register allocation will explode.
3285 */
3286 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3287 8, payload, const_offset_reg);
3288 setup->force_writemask_all = true;
3289
3290 setup->ir = inst->ir;
3291 setup->annotation = inst->annotation;
3292 inst->insert_before(block, setup);
3293
3294 /* Similarly, this will only populate the first 4 channels of the
3295 * result register (since we only use smear values from 0-3), but we
3296 * don't tell the optimizer.
3297 */
3298 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3299 inst->src[1] = payload;
3300
3301 invalidate_live_intervals();
3302 } else {
3303 /* Before register allocation, we didn't tell the scheduler about the
3304 * MRF we use. We know it's safe to use this MRF because nothing
3305 * else does except for register spill/unspill, which generates and
3306 * uses its MRF within a single IR instruction.
3307 */
3308 inst->base_mrf = 14;
3309 inst->mlen = 1;
3310 }
3311 }
3312 }
3313
3314 bool
3315 fs_visitor::lower_load_payload()
3316 {
3317 bool progress = false;
3318
3319 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3320 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3321 continue;
3322
3323 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3324 assert(inst->saturate == false);
3325
3326 const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
3327 .exec_all(inst->force_writemask_all)
3328 .at(block, inst);
3329 fs_reg dst = inst->dst;
3330
3331 /* Get rid of COMPR4. We'll add it back in if we need it */
3332 if (dst.file == MRF)
3333 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
3334
3335 dst.width = 8;
3336 for (uint8_t i = 0; i < inst->header_size; i++) {
3337 if (inst->src[i].file != BAD_FILE) {
3338 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3339 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3340 mov_src.width = 8;
3341 ibld.exec_all().MOV(mov_dst, mov_src);
3342 }
3343 dst = offset(dst, 1);
3344 }
3345
3346 dst.width = inst->exec_size;
3347 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
3348 inst->exec_size > 8) {
3349 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3350 * a straightforward copy. Instead, the result of the
3351 * LOAD_PAYLOAD is treated as interleaved and the first four
3352 * non-header sources are unpacked as:
3353 *
3354 * m + 0: r0
3355 * m + 1: g0
3356 * m + 2: b0
3357 * m + 3: a0
3358 * m + 4: r1
3359 * m + 5: g1
3360 * m + 6: b1
3361 * m + 7: a1
3362 *
3363 * This is used for gen <= 5 fb writes.
3364 */
3365 assert(inst->exec_size == 16);
3366 assert(inst->header_size + 4 <= inst->sources);
3367 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3368 if (inst->src[i].file != BAD_FILE) {
3369 if (devinfo->has_compr4) {
3370 fs_reg compr4_dst = retype(dst, inst->src[i].type);
3371 compr4_dst.reg |= BRW_MRF_COMPR4;
3372 ibld.MOV(compr4_dst, inst->src[i]);
3373 } else {
3374 /* Platform doesn't have COMPR4. We have to fake it */
3375 fs_reg mov_dst = retype(dst, inst->src[i].type);
3376 mov_dst.width = 8;
3377 ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
3378 ibld.half(1).MOV(offset(mov_dst, 4), half(inst->src[i], 1));
3379 }
3380 }
3381
3382 dst.reg++;
3383 }
3384
3385 /* The loop above only ever incremented us through the first set
3386 * of 4 registers. However, thanks to the magic of COMPR4, we
3387 * actually wrote to the first 8 registers, so we need to take
3388 * that into account now.
3389 */
3390 dst.reg += 4;
3391
3392 /* The COMPR4 code took care of the first 4 sources. We'll let
3393 * the regular path handle any remaining sources. Yes, we are
3394 * modifying the instruction but we're about to delete it so
3395 * this really doesn't hurt anything.
3396 */
3397 inst->header_size += 4;
3398 }
3399
3400 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3401 if (inst->src[i].file != BAD_FILE)
3402 ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
3403 dst = offset(dst, 1);
3404 }
3405
3406 inst->remove(block);
3407 progress = true;
3408 }
3409
3410 if (progress)
3411 invalidate_live_intervals();
3412
3413 return progress;
3414 }
3415
3416 bool
3417 fs_visitor::lower_integer_multiplication()
3418 {
3419 bool progress = false;
3420
3421 /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3422 * directly, but Cherryview cannot.
3423 */
3424 if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3425 return false;
3426
3427 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3428 if (inst->opcode != BRW_OPCODE_MUL ||
3429 inst->dst.is_accumulator() ||
3430 (inst->dst.type != BRW_REGISTER_TYPE_D &&
3431 inst->dst.type != BRW_REGISTER_TYPE_UD))
3432 continue;
3433
3434 #define insert(instr) inst->insert_before(block, instr)
3435
3436 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3437 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3438 * src1 are used.
3439 *
3440 * If multiplying by an immediate value that fits in 16-bits, do a
3441 * single MUL instruction with that value in the proper location.
3442 */
3443 if (inst->src[1].file == IMM &&
3444 inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3445 if (devinfo->gen < 7) {
3446 fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3447 inst->dst.type, dispatch_width);
3448 insert(MOV(imm, inst->src[1]));
3449 insert(MUL(inst->dst, imm, inst->src[0]));
3450 } else {
3451 insert(MUL(inst->dst, inst->src[0], inst->src[1]));
3452 }
3453 } else {
3454 /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3455 * do 32-bit integer multiplication in one instruction, but instead
3456 * must do a sequence (which actually calculates a 64-bit result):
3457 *
3458 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3459 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3460 * mov(8) g2<1>D acc0<8,8,1>D
3461 *
3462        * But on Gen > 6, the ability to use the second accumulator register
3463 * (acc1) for non-float data types was removed, preventing a simple
3464 * implementation in SIMD16. A 16-channel result can be calculated by
3465 * executing the three instructions twice in SIMD8, once with quarter
3466 * control of 1Q for the first eight channels and again with 2Q for
3467 * the second eight channels.
3468 *
3469 * Which accumulator register is implicitly accessed (by AccWrEnable
3470 * for instance) is determined by the quarter control. Unfortunately
3471 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3472 * implicit accumulator access by an instruction with 2Q will access
3473 * acc1 regardless of whether the data type is usable in acc1.
3474 *
3475 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3476 * integer data types.
3477 *
3478 * Since we only want the low 32-bits of the result, we can do two
3479 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3480 * adjust the high result and add them (like the mach is doing):
3481 *
3482 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3483 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3484 * shl(8) g9<1>D g8<8,8,1>D 16D
3485 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3486 *
3487 * We avoid the shl instruction by realizing that we only want to add
3488 * the low 16-bits of the "high" result to the high 16-bits of the
3489 * "low" result and using proper regioning on the add:
3490 *
3491 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3492 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3493 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3494 *
3495 * Since it does not use the (single) accumulator register, we can
3496 * schedule multi-component multiplications much better.
3497 */
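         /* Put differently: writing src1 as (hi16 << 16) + lo16, the low
          * 32 bits of src0 * src1 equal src0 * lo16 + ((src0 * hi16) << 16)
          * modulo 2^32, and the strided UW ADD emitted below performs that
          * final shift-and-add in a single instruction.
          */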
3498
3499 if (inst->conditional_mod && inst->dst.is_null()) {
3500 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3501 inst->dst.type, dispatch_width);
3502 }
3503 fs_reg low = inst->dst;
3504 fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3505 inst->dst.type, dispatch_width);
3506
3507          if (devinfo->gen >= 7) {
3508 fs_reg src1_0_w = inst->src[1];
3509 fs_reg src1_1_w = inst->src[1];
3510
3511 if (inst->src[1].file == IMM) {
3512 src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3513 src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3514 } else {
3515 src1_0_w.type = BRW_REGISTER_TYPE_UW;
3516 src1_0_w.stride = 2;
3517
3518 src1_1_w.type = BRW_REGISTER_TYPE_UW;
3519 src1_1_w.stride = 2;
3520 src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3521 }
3522 insert(MUL(low, inst->src[0], src1_0_w));
3523 insert(MUL(high, inst->src[0], src1_1_w));
3524 } else {
3525 fs_reg src0_0_w = inst->src[0];
3526 fs_reg src0_1_w = inst->src[0];
3527
3528 src0_0_w.type = BRW_REGISTER_TYPE_UW;
3529 src0_0_w.stride = 2;
3530
3531 src0_1_w.type = BRW_REGISTER_TYPE_UW;
3532 src0_1_w.stride = 2;
3533 src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3534
3535 insert(MUL(low, src0_0_w, inst->src[1]));
3536 insert(MUL(high, src0_1_w, inst->src[1]));
3537 }
3538
3539 fs_reg dst = inst->dst;
3540 dst.type = BRW_REGISTER_TYPE_UW;
3541 dst.subreg_offset = 2;
3542 dst.stride = 2;
3543
3544 high.type = BRW_REGISTER_TYPE_UW;
3545 high.stride = 2;
3546
3547 low.type = BRW_REGISTER_TYPE_UW;
3548 low.subreg_offset = 2;
3549 low.stride = 2;
3550
3551 insert(ADD(dst, low, high));
3552
3553 if (inst->conditional_mod) {
3554 fs_reg null(retype(brw_null_reg(), inst->dst.type));
3555 fs_inst *mov = MOV(null, inst->dst);
3556 mov->conditional_mod = inst->conditional_mod;
3557 insert(mov);
3558 }
3559 }
3560 #undef insert
3561
3562 inst->remove(block);
3563 progress = true;
3564 }
3565
3566 if (progress)
3567 invalidate_live_intervals();
3568
3569 return progress;
3570 }
3571
3572 void
3573 fs_visitor::dump_instructions()
3574 {
3575 dump_instructions(NULL);
3576 }
3577
3578 void
3579 fs_visitor::dump_instructions(const char *name)
3580 {
3581 FILE *file = stderr;
3582 if (name && geteuid() != 0) {
3583 file = fopen(name, "w");
3584 if (!file)
3585 file = stderr;
3586 }
3587
3588 if (cfg) {
3589 calculate_register_pressure();
3590 int ip = 0, max_pressure = 0;
3591 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3592 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3593 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3594 dump_instruction(inst, file);
3595 ip++;
3596 }
3597 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3598 } else {
3599 int ip = 0;
3600 foreach_in_list(backend_instruction, inst, &instructions) {
3601 fprintf(file, "%4d: ", ip++);
3602 dump_instruction(inst, file);
3603 }
3604 }
3605
3606 if (file != stderr) {
3607 fclose(file);
3608 }
3609 }
3610
3611 void
3612 fs_visitor::dump_instruction(backend_instruction *be_inst)
3613 {
3614 dump_instruction(be_inst, stderr);
3615 }
3616
3617 void
3618 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3619 {
3620 fs_inst *inst = (fs_inst *)be_inst;
3621
3622 if (inst->predicate) {
3623 fprintf(file, "(%cf0.%d) ",
3624 inst->predicate_inverse ? '-' : '+',
3625 inst->flag_subreg);
3626 }
3627
3628 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3629 if (inst->saturate)
3630 fprintf(file, ".sat");
3631 if (inst->conditional_mod) {
3632 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3633 if (!inst->predicate &&
3634 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3635 inst->opcode != BRW_OPCODE_IF &&
3636 inst->opcode != BRW_OPCODE_WHILE))) {
3637 fprintf(file, ".f0.%d", inst->flag_subreg);
3638 }
3639 }
3640 fprintf(file, "(%d) ", inst->exec_size);
3641
3642 if (inst->mlen) {
3643 fprintf(file, "(mlen: %d) ", inst->mlen);
3644 }
3645
3646 switch (inst->dst.file) {
3647 case GRF:
3648 fprintf(file, "vgrf%d", inst->dst.reg);
3649 if (inst->dst.width != dispatch_width)
3650 fprintf(file, "@%d", inst->dst.width);
3651 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3652 inst->dst.subreg_offset)
3653 fprintf(file, "+%d.%d",
3654 inst->dst.reg_offset, inst->dst.subreg_offset);
3655 break;
3656 case MRF:
3657 fprintf(file, "m%d", inst->dst.reg);
3658 break;
3659 case BAD_FILE:
3660 fprintf(file, "(null)");
3661 break;
3662 case UNIFORM:
3663 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3664 break;
3665 case ATTR:
3666 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3667 break;
3668 case HW_REG:
3669 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3670 switch (inst->dst.fixed_hw_reg.nr) {
3671 case BRW_ARF_NULL:
3672 fprintf(file, "null");
3673 break;
3674 case BRW_ARF_ADDRESS:
3675 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3676 break;
3677 case BRW_ARF_ACCUMULATOR:
3678 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3679 break;
3680 case BRW_ARF_FLAG:
3681 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3682 inst->dst.fixed_hw_reg.subnr);
3683 break;
3684 default:
3685 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3686 inst->dst.fixed_hw_reg.subnr);
3687 break;
3688 }
3689 } else {
3690 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3691 }
3692 if (inst->dst.fixed_hw_reg.subnr)
3693 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3694 break;
3695 default:
3696 fprintf(file, "???");
3697 break;
3698 }
3699 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3700
3701 for (int i = 0; i < inst->sources; i++) {
3702 if (inst->src[i].negate)
3703 fprintf(file, "-");
3704 if (inst->src[i].abs)
3705 fprintf(file, "|");
3706 switch (inst->src[i].file) {
3707 case GRF:
3708 fprintf(file, "vgrf%d", inst->src[i].reg);
3709 if (inst->src[i].width != dispatch_width)
3710 fprintf(file, "@%d", inst->src[i].width);
3711 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3712 inst->src[i].subreg_offset)
3713 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3714 inst->src[i].subreg_offset);
3715 break;
3716 case MRF:
3717 fprintf(file, "***m%d***", inst->src[i].reg);
3718 break;
3719 case ATTR:
3720 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3721 break;
3722 case UNIFORM:
3723 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3724 if (inst->src[i].reladdr) {
3725 fprintf(file, "+reladdr");
3726 } else if (inst->src[i].subreg_offset) {
3727 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3728 inst->src[i].subreg_offset);
3729 }
3730 break;
3731 case BAD_FILE:
3732 fprintf(file, "(null)");
3733 break;
3734 case IMM:
3735 switch (inst->src[i].type) {
3736 case BRW_REGISTER_TYPE_F:
3737 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3738 break;
3739 case BRW_REGISTER_TYPE_W:
3740 case BRW_REGISTER_TYPE_D:
3741 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3742 break;
3743 case BRW_REGISTER_TYPE_UW:
3744 case BRW_REGISTER_TYPE_UD:
3745 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3746 break;
3747 case BRW_REGISTER_TYPE_VF:
3748 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3749 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3750 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3751 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3752 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3753 break;
3754 default:
3755 fprintf(file, "???");
3756 break;
3757 }
3758 break;
3759 case HW_REG:
3760 if (inst->src[i].fixed_hw_reg.negate)
3761 fprintf(file, "-");
3762 if (inst->src[i].fixed_hw_reg.abs)
3763 fprintf(file, "|");
3764 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3765 switch (inst->src[i].fixed_hw_reg.nr) {
3766 case BRW_ARF_NULL:
3767 fprintf(file, "null");
3768 break;
3769 case BRW_ARF_ADDRESS:
3770 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3771 break;
3772 case BRW_ARF_ACCUMULATOR:
3773 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3774 break;
3775 case BRW_ARF_FLAG:
3776 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3777 inst->src[i].fixed_hw_reg.subnr);
3778 break;
3779 default:
3780 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3781 inst->src[i].fixed_hw_reg.subnr);
3782 break;
3783 }
3784 } else {
3785 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3786 }
3787 if (inst->src[i].fixed_hw_reg.subnr)
3788 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3789 if (inst->src[i].fixed_hw_reg.abs)
3790 fprintf(file, "|");
3791 break;
3792 default:
3793 fprintf(file, "???");
3794 break;
3795 }
3796 if (inst->src[i].abs)
3797 fprintf(file, "|");
3798
3799 if (inst->src[i].file != IMM) {
3800 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3801 }
3802
3803 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3804 fprintf(file, ", ");
3805 }
3806
3807 fprintf(file, " ");
3808
3809 if (dispatch_width == 16 && inst->exec_size == 8) {
3810 if (inst->force_sechalf)
3811 fprintf(file, "2ndhalf ");
3812 else
3813 fprintf(file, "1sthalf ");
3814 }
3815
3816 fprintf(file, "\n");
3817 }
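/* As a rough illustration of the format produced above (register numbers
 * made up), a dumped SIMD8 instruction looks something like:
 *
 *    (+f0.0) add(8) vgrf7:F, vgrf3:F, -vgrf4:F
 *
 * i.e. optional predicate, opcode with saturate/cmod and execution size,
 * destination, comma-separated sources, and, for SIMD16 programs split into
 * 8-wide halves, a trailing "1sthalf"/"2ndhalf" marker.
 */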
3818
3819 /**
3820 * Possibly returns an instruction that set up @param reg.
3821 *
3822 * Sometimes we want to take the result of some expression/variable
3823 * dereference tree and rewrite the instruction generating the result
3824 * of the tree. When processing the tree, we know that the
3825 * instructions generated are all writing temporaries that are dead
3826 * outside of this tree. So, if we have some instructions that write
3827 * a temporary, we're free to point that temp write somewhere else.
3828 *
3829 * Note that this doesn't guarantee that the returned instruction wrote
3830 * only reg -- it might be the size=4 destination of a texture instruction.
3831 */
3832 fs_inst *
3833 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3834 fs_inst *end,
3835 const fs_reg &reg)
3836 {
3837 if (end == start ||
3838 end->is_partial_write() ||
3839 reg.reladdr ||
3840 !reg.equals(end->dst)) {
3841 return NULL;
3842 } else {
3843 return end;
3844 }
3845 }
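/* A minimal, hypothetical usage sketch (first, last, tmp and real_dst are
 * caller-owned names, not part of this file): when the value in a temporary
 * vgrf is only needed once, the caller can retarget its producer instead of
 * emitting an extra MOV:
 *
 *    fs_inst *producer = get_instruction_generating_reg(first, last, tmp);
 *    if (producer)
 *       producer->dst = real_dst;   // rewrite in place, no copy needed
 */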
3846
3847 void
3848 fs_visitor::setup_payload_gen6()
3849 {
3850 bool uses_depth =
3851 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3852 unsigned barycentric_interp_modes =
3853 (stage == MESA_SHADER_FRAGMENT) ?
3854 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3855
3856 assert(devinfo->gen >= 6);
3857
3858 /* R0-1: masks, pixel X/Y coordinates. */
3859 payload.num_regs = 2;
3860 /* R2: only for 32-pixel dispatch. */
3861
3862 /* R3-26: barycentric interpolation coordinates. These appear in the
3863 * same order that they appear in the brw_wm_barycentric_interp_mode
3864 * enum. Each set of coordinates occupies 2 registers if dispatch width
3865 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3866 * appear if they were enabled using the "Barycentric Interpolation
3867 * Mode" bits in WM_STATE.
3868 */
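/* Worked example of the loop below: a SIMD16 shader that only uses
 * BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC ends up with
 * barycentric_coord_reg[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] == 2 and
 * payload.num_regs == 6 afterwards (2 header regs plus 4 coordinate regs).
 */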
3869 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3870 if (barycentric_interp_modes & (1 << i)) {
3871 payload.barycentric_coord_reg[i] = payload.num_regs;
3872 payload.num_regs += 2;
3873 if (dispatch_width == 16) {
3874 payload.num_regs += 2;
3875 }
3876 }
3877 }
3878
3879 /* R27: interpolated depth if uses source depth */
3880 if (uses_depth) {
3881 payload.source_depth_reg = payload.num_regs;
3882 payload.num_regs++;
3883 if (dispatch_width == 16) {
3884 /* R28: interpolated depth if not SIMD8. */
3885 payload.num_regs++;
3886 }
3887 }
3888 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3889 if (uses_depth) {
3890 payload.source_w_reg = payload.num_regs;
3891 payload.num_regs++;
3892 if (dispatch_width == 16) {
3893 /* R30: interpolated W if not SIMD8. */
3894 payload.num_regs++;
3895 }
3896 }
3897
3898 if (stage == MESA_SHADER_FRAGMENT) {
3899 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3900 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3901 prog_data->uses_pos_offset = key->compute_pos_offset;
3902 /* R31: MSAA position offsets. */
3903 if (prog_data->uses_pos_offset) {
3904 payload.sample_pos_reg = payload.num_regs;
3905 payload.num_regs++;
3906 }
3907 }
3908
3909 /* R32: MSAA input coverage mask */
3910 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3911 assert(devinfo->gen >= 7);
3912 payload.sample_mask_in_reg = payload.num_regs;
3913 payload.num_regs++;
3914 if (dispatch_width == 16) {
3915 /* R33: input coverage mask if not SIMD8. */
3916 payload.num_regs++;
3917 }
3918 }
3919
3920 /* R34-: bary for 32-pixel. */
3921 /* R58-59: interp W for 32-pixel. */
3922
3923 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3924 source_depth_to_render_target = true;
3925 }
3926 }
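/* Purely illustrative summary: a SIMD8 fragment shader with a single
 * barycentric mode enabled and no source depth/W, position offsets or
 * coverage mask leaves payload.num_regs == 4 -- two header registers plus
 * one pair of barycentric coordinate registers.
 */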
3927
3928 void
3929 fs_visitor::setup_vs_payload()
3930 {
3931 /* R0: thread header, R1: urb handles */
3932 payload.num_regs = 2;
3933 }
3934
3935 void
3936 fs_visitor::setup_cs_payload()
3937 {
3938 assert(brw->gen >= 7);
3939
3940 payload.num_regs = 1;
3941 }
3942
3943 void
3944 fs_visitor::assign_binding_table_offsets()
3945 {
3946 assert(stage == MESA_SHADER_FRAGMENT);
3947 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3948 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3949 uint32_t next_binding_table_offset = 0;
3950
3951 /* If there are no color regions, we still perform an FB write to a null
3952 * renderbuffer, which we place at surface index 0.
3953 */
3954 prog_data->binding_table.render_target_start = next_binding_table_offset;
3955 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3956
3957 assign_common_binding_table_offsets(next_binding_table_offset);
3958 }
3959
3960 void
3961 fs_visitor::calculate_register_pressure()
3962 {
3963 invalidate_live_intervals();
3964 calculate_live_intervals();
3965
3966 unsigned num_instructions = 0;
3967 foreach_block(block, cfg)
3968 num_instructions += block->instructions.length();
3969
3970 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3971
3972 for (unsigned reg = 0; reg < alloc.count; reg++) {
3973 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3974 regs_live_at_ip[ip] += alloc.sizes[reg];
3975 }
3976 }
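/* After this runs, regs_live_at_ip[ip] holds the number of physical GRFs
 * (alloc.sizes units, not virtual GRF counts) whose live ranges cover
 * instruction ip.  For example, a size-2 vgrf live from ip 3 through ip 7
 * contributes 2 to each of those five entries.
 */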
3977
3978 void
3979 fs_visitor::optimize()
3980 {
3981 /* bld is the common builder object pointing at the end of the program we
3982 * used to translate it into i965 IR. For the optimization and lowering
3983 * passes coming next, any code added after the end of the program without
3984 * having explicitly called fs_builder::at() clearly points at a mistake.
3985 * Ideally optimization passes wouldn't be part of the visitor so they
3986 * wouldn't have access to bld at all, but they do, so just in case some
3987 * pass forgets to ask for a location explicitly, set it to NULL here to
3988 * make it trip.
3989 */
3990 bld = bld.at(NULL, NULL);
3991
3992 split_virtual_grfs();
3993
3994 move_uniform_array_access_to_pull_constants();
3995 assign_constant_locations();
3996 demote_pull_constants();
3997
3998 #define OPT(pass, args...) ({ \
3999 pass_num++; \
4000 bool this_progress = pass(args); \
4001 \
4002 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
4003 char filename[64]; \
4004 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
4005 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
4006 \
4007 backend_shader::dump_instructions(filename); \
4008 } \
4009 \
4010 progress = progress || this_progress; \
4011 this_progress; \
4012 })
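/* When DEBUG_OPTIMIZER is set, every pass that makes progress dumps the IR
 * to a file named from the format string above.  For instance (hypothetical
 * GL program name 3), the opt_algebraic run in the first iteration of a
 * SIMD8 fragment shader would land in "FS8-0003-01-02-opt_algebraic".
 */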
4013
4014 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
4015 char filename[64];
4016 snprintf(filename, 64, "%s%d-%04d-00-start",
4017 stage_abbrev, dispatch_width,
4018 shader_prog ? shader_prog->Name : 0);
4019
4020 backend_shader::dump_instructions(filename);
4021 }
4022
4023 bool progress;
4024 int iteration = 0;
4025 int pass_num = 0;
4026 do {
4027 progress = false;
4028 pass_num = 0;
4029 iteration++;
4030
4031 OPT(remove_duplicate_mrf_writes);
4032
4033 OPT(opt_algebraic);
4034 OPT(opt_cse);
4035 OPT(opt_copy_propagate);
4036 OPT(opt_peephole_predicated_break);
4037 OPT(opt_cmod_propagation);
4038 OPT(dead_code_eliminate);
4039 OPT(opt_peephole_sel);
4040 OPT(dead_control_flow_eliminate, this);
4041 OPT(opt_register_renaming);
4042 OPT(opt_redundant_discard_jumps);
4043 OPT(opt_saturate_propagation);
4044 OPT(opt_zero_samples);
4045 OPT(register_coalesce);
4046 OPT(compute_to_mrf);
4047 OPT(eliminate_find_live_channel);
4048
4049 OPT(compact_virtual_grfs);
4050 } while (progress);
4051
4052 pass_num = 0;
4053
4054 OPT(opt_sampler_eot);
4055
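/* Lowering LOAD_PAYLOAD turns each payload construction into a series of
 * plain MOVs, which tends to expose fresh opportunities for the cleanup
 * passes below, so they get one more run here.
 */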
4056 if (OPT(lower_load_payload)) {
4057 split_virtual_grfs();
4058 OPT(register_coalesce);
4059 OPT(compute_to_mrf);
4060 OPT(dead_code_eliminate);
4061 }
4062
4063 OPT(opt_combine_constants);
4064 OPT(lower_integer_multiplication);
4065
4066 lower_uniform_pull_constant_loads();
4067 }
4068
4069 /**
4070 * A three-source instruction must have a GRF/MRF destination register.
4071 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
4072 */
4073 void
4074 fs_visitor::fixup_3src_null_dest()
4075 {
4076 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4077 if (inst->is_3src() && inst->dst.is_null()) {
4078 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4079 inst->dst.type);
4080 }
4081 }
4082 }
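/* Illustrative before/after of the fixup above (registers hypothetical):
 *
 *    mad(8) null:F, vgrf1:F, vgrf2:F, vgrf3:F
 *
 * becomes
 *
 *    mad(8) vgrf9:F, vgrf1:F, vgrf2:F, vgrf3:F
 *
 * where vgrf9 is a freshly allocated, otherwise unused temporary.
 */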
4083
4084 void
4085 fs_visitor::allocate_registers()
4086 {
4087 bool allocated_without_spills;
4088
4089 static const enum instruction_scheduler_mode pre_modes[] = {
4090 SCHEDULE_PRE,
4091 SCHEDULE_PRE_NON_LIFO,
4092 SCHEDULE_PRE_LIFO,
4093 };
4094
4095 /* Try each scheduling heuristic to see if it can successfully register
4096 * allocate without spilling. They should be ordered by decreasing
4097 * performance but increasing likelihood of allocating.
4098 */
4099 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4100 schedule_instructions(pre_modes[i]);
4101
4102 if (0) {
4103 assign_regs_trivial();
4104 allocated_without_spills = true;
4105 } else {
4106 allocated_without_spills = assign_regs(false);
4107 }
4108 if (allocated_without_spills)
4109 break;
4110 }
4111
4112 if (!allocated_without_spills) {
4113 /* We assume that any spilling is worse than just dropping back to
4114 * SIMD8. There's probably some intermediate point where SIMD16 with a
4115 * couple of spills is still better.
4116 */
4117 if (dispatch_width == 16) {
4118 fail("Failure to register allocate. Reduce number of "
4119 "live scalar values to avoid this.");
4120 } else {
4121 perf_debug("%s shader triggered register spilling. "
4122 "Try reducing the number of live scalar values to "
4123 "improve performance.\n", stage_name);
4124 }
4125
4126 /* Since we're out of heuristics, just go spill registers until we
4127 * get an allocation.
4128 */
4129 while (!assign_regs(true)) {
4130 if (failed)
4131 break;
4132 }
4133 }
4134
4135 /* This must come after all optimization and register allocation, since
4136 * it inserts dead code that happens to have side effects, and it does
4137 * so based on the actual physical registers in use.
4138 */
4139 insert_gen4_send_dependency_workarounds();
4140
4141 if (failed)
4142 return;
4143
4144 if (!allocated_without_spills)
4145 schedule_instructions(SCHEDULE_POST);
4146
4147 if (last_scratch > 0)
4148 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4149 }
4150
4151 bool
4152 fs_visitor::run_vs()
4153 {
4154 assert(stage == MESA_SHADER_VERTEX);
4155
4156 assign_common_binding_table_offsets(0);
4157 setup_vs_payload();
4158
4159 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4160 emit_shader_time_begin();
4161
4162 emit_nir_code();
4163
4164 if (failed)
4165 return false;
4166
4167 emit_urb_writes();
4168
4169 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4170 emit_shader_time_end();
4171
4172 calculate_cfg();
4173
4174 optimize();
4175
4176 assign_curb_setup();
4177 assign_vs_urb_setup();
4178
4179 fixup_3src_null_dest();
4180 allocate_registers();
4181
4182 return !failed;
4183 }
4184
4185 bool
4186 fs_visitor::run_fs()
4187 {
4188 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4189 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4190
4191 assert(stage == MESA_SHADER_FRAGMENT);
4192
4193 sanity_param_count = prog->Parameters->NumParameters;
4194
4195 assign_binding_table_offsets();
4196
4197 if (devinfo->gen >= 6)
4198 setup_payload_gen6();
4199 else
4200 setup_payload_gen4();
4201
4202 if (0) {
4203 emit_dummy_fs();
4204 } else if (brw->use_rep_send && dispatch_width == 16) {
4205 emit_repclear_shader();
4206 } else {
4207 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4208 emit_shader_time_begin();
4209
4210 calculate_urb_setup();
4211 if (prog->InputsRead > 0) {
4212 if (devinfo->gen < 6)
4213 emit_interpolation_setup_gen4();
4214 else
4215 emit_interpolation_setup_gen6();
4216 }
4217
4218 /* We handle discards by keeping track of the still-live pixels in f0.1.
4219 * Initialize it with the dispatched pixels.
4220 */
4221 if (wm_prog_data->uses_kill) {
4222 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4223 discard_init->flag_subreg = 1;
4224 }
4225
4226 /* Generate FS IR for main(). (The visitor only descends into
4227 * functions called "main".)
4228 */
4229 emit_nir_code();
4230
4231 if (failed)
4232 return false;
4233
4234 if (wm_prog_data->uses_kill)
4235 emit(FS_OPCODE_PLACEHOLDER_HALT);
4236
4237 if (wm_key->alpha_test_func)
4238 emit_alpha_test();
4239
4240 emit_fb_writes();
4241
4242 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4243 emit_shader_time_end();
4244
4245 calculate_cfg();
4246
4247 optimize();
4248
4249 assign_curb_setup();
4250 assign_urb_setup();
4251
4252 fixup_3src_null_dest();
4253 allocate_registers();
4254
4255 if (failed)
4256 return false;
4257 }
4258
4259 if (dispatch_width == 8)
4260 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4261 else
4262 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4263
4264 /* If any state parameters were appended, then ParameterValues could have
4265 * been realloced, in which case the driver uniform storage set up by
4266 * _mesa_associate_uniform_storage() would point to freed memory. Make
4267 * sure that didn't happen.
4268 */
4269 assert(sanity_param_count == prog->Parameters->NumParameters);
4270
4271 return !failed;
4272 }
4273
4274 bool
4275 fs_visitor::run_cs()
4276 {
4277 assert(stage == MESA_SHADER_COMPUTE);
4278 assert(shader);
4279
4280 sanity_param_count = prog->Parameters->NumParameters;
4281
4282 assign_common_binding_table_offsets(0);
4283
4284 setup_cs_payload();
4285
4286 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4287 emit_shader_time_begin();
4288
4289 emit_nir_code();
4290
4291 if (failed)
4292 return false;
4293
4294 emit_cs_terminate();
4295
4296 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4297 emit_shader_time_end();
4298
4299 calculate_cfg();
4300
4301 optimize();
4302
4303 assign_curb_setup();
4304
4305 fixup_3src_null_dest();
4306 allocate_registers();
4307
4308 if (failed)
4309 return false;
4310
4311 /* If any state parameters were appended, then ParameterValues could have
4312 * been realloced, in which case the driver uniform storage set up by
4313 * _mesa_associate_uniform_storage() would point to freed memory. Make
4314 * sure that didn't happen.
4315 */
4316 assert(sanity_param_count == prog->Parameters->NumParameters);
4317
4318 return !failed;
4319 }
4320
4321 const unsigned *
4322 brw_wm_fs_emit(struct brw_context *brw,
4323 void *mem_ctx,
4324 const struct brw_wm_prog_key *key,
4325 struct brw_wm_prog_data *prog_data,
4326 struct gl_fragment_program *fp,
4327 struct gl_shader_program *prog,
4328 unsigned *final_assembly_size)
4329 {
4330 bool start_busy = false;
4331 double start_time = 0;
4332
4333 if (unlikely(brw->perf_debug)) {
4334 start_busy = (brw->batch.last_bo &&
4335 drm_intel_bo_busy(brw->batch.last_bo));
4336 start_time = get_time();
4337 }
4338
4339 struct brw_shader *shader = NULL;
4340 if (prog)
4341 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4342
4343 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4344 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4345
4346 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4347 */
4348 fs_visitor v(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4349 prog, &fp->Base, 8);
4350 if (!v.run_fs()) {
4351 if (prog) {
4352 prog->LinkStatus = false;
4353 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4354 }
4355
4356 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4357 v.fail_msg);
4358
4359 return NULL;
4360 }
4361
4362 cfg_t *simd16_cfg = NULL;
4363 fs_visitor v2(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4364 prog, &fp->Base, 16);
4365 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4366 if (!v.simd16_unsupported) {
4367 /* Try a SIMD16 compile */
4368 v2.import_uniforms(&v);
4369 if (!v2.run_fs()) {
4370 perf_debug("SIMD16 shader failed to compile, falling back to "
4371 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4372 } else {
4373 simd16_cfg = v2.cfg;
4374 }
4375 } else {
4376 perf_debug("SIMD16 shader unsupported, falling back to "
4377 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4378 }
4379 }
4380
4381 cfg_t *simd8_cfg;
4382 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4383 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4384 simd8_cfg = NULL;
4385 prog_data->no_8 = true;
4386 } else {
4387 simd8_cfg = v.cfg;
4388 prog_data->no_8 = false;
4389 }
4390
4391 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4392 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4393
4394 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4395 char *name;
4396 if (prog)
4397 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4398 prog->Label ? prog->Label : "unnamed",
4399 prog->Name);
4400 else
4401 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4402
4403 g.enable_debug(name);
4404 }
4405
4406 if (simd8_cfg)
4407 g.generate_code(simd8_cfg, 8);
4408 if (simd16_cfg)
4409 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4410
4411 if (unlikely(brw->perf_debug) && shader) {
4412 if (shader->compiled_once)
4413 brw_wm_debug_recompile(brw, prog, key);
4414 shader->compiled_once = true;
4415
4416 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4417 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4418 (get_time() - start_time) * 1000);
4419 }
4420 }
4421
4422 return g.get_assembly(final_assembly_size);
4423 }
4424
4425 extern "C" bool
4426 brw_fs_precompile(struct gl_context *ctx,
4427 struct gl_shader_program *shader_prog,
4428 struct gl_program *prog)
4429 {
4430 struct brw_context *brw = brw_context(ctx);
4431 struct brw_wm_prog_key key;
4432
4433 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4434 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4435 bool program_uses_dfdy = fp->UsesDFdy;
4436
4437 memset(&key, 0, sizeof(key));
4438
4439 if (brw->gen < 6) {
4440 if (fp->UsesKill)
4441 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4442
4443 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4444 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4445
4446 /* Just assume depth testing. */
4447 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4448 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4449 }
4450
4451 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4452 BRW_FS_VARYING_INPUT_MASK) > 16)
4453 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4454
4455 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4456
4457 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4458 key.drawable_height = ctx->DrawBuffer->Height;
4459 }
4460
4461 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4462 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4463 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4464
4465 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4466 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4467 key.nr_color_regions > 1;
4468 }
4469
4470 key.program_string_id = bfp->id;
4471
4472 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4473 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4474
4475 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4476
4477 brw->wm.base.prog_offset = old_prog_offset;
4478 brw->wm.prog_data = old_prog_data;
4479
4480 return success;
4481 }
4482
4483 void
4484 brw_setup_tex_for_precompile(struct brw_context *brw,
4485 struct brw_sampler_prog_key_data *tex,
4486 struct gl_program *prog)
4487 {
4488 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4489 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4490 for (unsigned i = 0; i < sampler_count; i++) {
4491 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4492 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4493 tex->swizzles[i] =
4494 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4495 } else {
4496 /* Color sampler: assume no swizzling. */
4497 tex->swizzles[i] = SWIZZLE_XYZW;
4498 }
4499 }
4500 }