i965/fs: Migrate FS interpolation code to the IR builder.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 using namespace brw;
53
54 void
55 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
56 const fs_reg *src, unsigned sources)
57 {
58 memset(this, 0, sizeof(*this));
59
60 this->src = new fs_reg[MAX2(sources, 3)];
61 for (unsigned i = 0; i < sources; i++)
62 this->src[i] = src[i];
63
64 this->opcode = opcode;
65 this->dst = dst;
66 this->sources = sources;
67 this->exec_size = exec_size;
68
69 assert(dst.file != IMM && dst.file != UNIFORM);
70
71 /* If exec_size == 0, try to guess it from the registers. Since all
72 * manner of things may use hardware registers, we first try to guess
73 * based on GRF registers. If this fails, we will go ahead and take the
74 * width from the destination register.
75 */
76 if (this->exec_size == 0) {
77 if (dst.file == GRF) {
78 this->exec_size = dst.width;
79 } else {
80 for (unsigned i = 0; i < sources; ++i) {
81 if (src[i].file != GRF && src[i].file != ATTR)
82 continue;
83
84 if (this->exec_size <= 1)
85 this->exec_size = src[i].width;
86 assert(src[i].width == 1 || src[i].width == this->exec_size);
87 }
88 }
89
90 if (this->exec_size == 0 && dst.file != BAD_FILE)
91 this->exec_size = dst.width;
92 }
93 assert(this->exec_size != 0);
94
95 this->conditional_mod = BRW_CONDITIONAL_NONE;
96
97 /* This will be the case for almost all instructions. */
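   /* As a worked example of the formula below: a 16-wide float destination
    * with stride 1 covers DIV_ROUND_UP(16 * 1 * 4, 32) == 2 GRFs, while a
    * stride-0 destination still counts as a single GRF thanks to the MAX2.
    */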
98 switch (dst.file) {
99 case GRF:
100 case HW_REG:
101 case MRF:
102 case ATTR:
103 this->regs_written =
104 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
105 break;
106 case BAD_FILE:
107 this->regs_written = 0;
108 break;
109 case IMM:
110 case UNIFORM:
111 unreachable("Invalid destination register file");
112 default:
113 unreachable("Invalid register file");
114 }
115
116 this->writes_accumulator = false;
117 }
118
119 fs_inst::fs_inst()
120 {
121 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
122 }
123
124 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
125 {
126 init(opcode, exec_size, reg_undef, NULL, 0);
127 }
128
129 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
130 {
131 init(opcode, 0, dst, NULL, 0);
132 }
133
134 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
135 const fs_reg &src0)
136 {
137 const fs_reg src[1] = { src0 };
138 init(opcode, exec_size, dst, src, 1);
139 }
140
141 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
142 {
143 const fs_reg src[1] = { src0 };
144 init(opcode, 0, dst, src, 1);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
148 const fs_reg &src0, const fs_reg &src1)
149 {
150 const fs_reg src[2] = { src0, src1 };
151 init(opcode, exec_size, dst, src, 2);
152 }
153
154 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
155 const fs_reg &src1)
156 {
157 const fs_reg src[2] = { src0, src1 };
158 init(opcode, 0, dst, src, 2);
159 }
160
161 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
162 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
163 {
164 const fs_reg src[3] = { src0, src1, src2 };
165 init(opcode, exec_size, dst, src, 3);
166 }
167
168 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
169 const fs_reg &src1, const fs_reg &src2)
170 {
171 const fs_reg src[3] = { src0, src1, src2 };
172 init(opcode, 0, dst, src, 3);
173 }
174
175 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
176 const fs_reg src[], unsigned sources)
177 {
178 init(opcode, 0, dst, src, sources);
179 }
180
181 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
182 const fs_reg src[], unsigned sources)
183 {
184 init(opcode, exec_width, dst, src, sources);
185 }
186
187 fs_inst::fs_inst(const fs_inst &that)
188 {
189 memcpy(this, &that, sizeof(that));
190
191 this->src = new fs_reg[MAX2(that.sources, 3)];
192
193 for (unsigned i = 0; i < that.sources; i++)
194 this->src[i] = that.src[i];
195 }
196
197 fs_inst::~fs_inst()
198 {
199 delete[] this->src;
200 }
201
202 void
203 fs_inst::resize_sources(uint8_t num_sources)
204 {
205 if (this->sources != num_sources) {
206 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
207
208 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
209 src[i] = this->src[i];
210
211 delete[] this->src;
212 this->src = src;
213 this->sources = num_sources;
214 }
215 }
216
217 #define ALU1(op) \
218 fs_inst * \
219 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
220 { \
221 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
222 }
223
224 #define ALU2(op) \
225 fs_inst * \
226 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
227 const fs_reg &src1) \
228 { \
229 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
230 }
231
232 #define ALU2_ACC(op) \
233 fs_inst * \
234 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
235 const fs_reg &src1) \
236 { \
237 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
238 inst->writes_accumulator = true; \
239 return inst; \
240 }
241
242 #define ALU3(op) \
243 fs_inst * \
244 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
245 const fs_reg &src1, const fs_reg &src2) \
246 { \
247 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
248 }
249
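/* For reference, ALU2(ADD) expands (modulo line breaks) to:
 *
 *    fs_inst *
 *    fs_visitor::ADD(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */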
250 ALU1(NOT)
251 ALU1(MOV)
252 ALU1(FRC)
253 ALU1(RNDD)
254 ALU1(RNDE)
255 ALU1(RNDZ)
256 ALU2(ADD)
257 ALU2(MUL)
258 ALU2_ACC(MACH)
259 ALU2(AND)
260 ALU2(OR)
261 ALU2(XOR)
262 ALU2(SHL)
263 ALU2(SHR)
264 ALU2(ASR)
265 ALU3(LRP)
266 ALU1(BFREV)
267 ALU3(BFE)
268 ALU2(BFI1)
269 ALU3(BFI2)
270 ALU1(FBH)
271 ALU1(FBL)
272 ALU1(CBIT)
273 ALU3(MAD)
274 ALU2_ACC(ADDC)
275 ALU2_ACC(SUBB)
276 ALU2(SEL)
277 ALU2(MAC)
278
279 /** Gen4 predicated IF. */
280 fs_inst *
281 fs_visitor::IF(enum brw_predicate predicate)
282 {
283 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
284 inst->predicate = predicate;
285 return inst;
286 }
287
288 /** Gen6 IF with embedded comparison. */
289 fs_inst *
290 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
291 enum brw_conditional_mod condition)
292 {
293 assert(devinfo->gen == 6);
294 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
295 reg_null_d, src0, src1);
296 inst->conditional_mod = condition;
297 return inst;
298 }
299
300 /**
301 * CMP: Sets the low bit of the destination channels with the result
302 * of the comparison, while the upper bits are undefined, and updates
303 * the flag register with the packed 16 bits of the result.
304 */
305 fs_inst *
306 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
307 enum brw_conditional_mod condition)
308 {
309 fs_inst *inst;
310
311 /* Take the instruction:
312 *
313 * CMP null<d> src0<f> src1<f>
314 *
315 * Original gen4 does type conversion to the destination type before
316 * comparison, producing garbage results for floating point comparisons.
317 *
318 * The destination type doesn't matter on newer generations, so we set the
319 * type to match src0 so we can compact the instruction.
320 */
321 dst.type = src0.type;
322 if (dst.file == HW_REG)
323 dst.fixed_hw_reg.type = dst.type;
324
325 resolve_ud_negate(&src0);
326 resolve_ud_negate(&src1);
327
328 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
329 inst->conditional_mod = condition;
330
331 return inst;
332 }
333
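/* Build a LOAD_PAYLOAD instruction and account for the GRFs it writes:
 * header_size registers for the header plus (dst.width / 8) registers per
 * remaining per-channel source.  For example, a SIMD16 payload with one
 * header register and three per-channel sources is counted as
 * 1 + 3 * (16 / 8) == 7 GRFs.
 */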
334 fs_inst *
335 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
336 int header_size)
337 {
338 assert(dst.width % 8 == 0);
339 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
340 dst, src, sources);
341 inst->header_size = header_size;
342
343 for (int i = 0; i < header_size; i++)
344 assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
345 inst->regs_written = header_size;
346
347 for (int i = header_size; i < sources; ++i)
348 assert(src[i].file != GRF || src[i].width == dst.width);
349 inst->regs_written += (sources - header_size) * (dst.width / 8);
350
351 return inst;
352 }
353
354 void
355 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
356 const fs_reg &dst,
357 const fs_reg &surf_index,
358 const fs_reg &varying_offset,
359 uint32_t const_offset)
360 {
361 /* We have our constant surface use a pitch of 4 bytes, so our index can
362 * be any component of a vector, and then we load 4 contiguous
363 * components starting from that.
364 *
365 * We break down the const_offset to a portion added to the variable
366 * offset and a portion done using reg_offset, which means that if you
367 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
368 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
369 * CSE can later notice that those loads are all the same and eliminate
370 * the redundant ones.
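    *
    * As a small worked example: with const_offset == 7, vec4_offset below is
    * biased by (7 & ~3) == 4, the message loads a whole vec4 from there, and
    * the final MOV reads component (7 & 3) == 3 of the returned vec4 (scaled
    * by the longer return length used on the gen4 SIMD16 path).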
371 */
372 fs_reg vec4_offset = vgrf(glsl_type::int_type);
373 bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
374
375 int scale = 1;
376 if (devinfo->gen == 4 && dst.width == 8) {
377 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
378 * u, v, r) as parameters, or we can just use the SIMD16 message
379 * consisting of (header, u). We choose the second, at the cost of a
380 * longer return length.
381 */
382 scale = 2;
383 }
384
385 enum opcode op;
386 if (devinfo->gen >= 7)
387 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
388 else
389 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
390
391 assert(dst.width % 8 == 0);
392 int regs_written = 4 * (dst.width / 8) * scale;
393 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
394 dst.type, dst.width);
395 fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
396 inst->regs_written = regs_written;
397
398 if (devinfo->gen < 7) {
399 inst->base_mrf = 13;
400 inst->header_size = 1;
401 if (devinfo->gen == 4)
402 inst->mlen = 3;
403 else
404 inst->mlen = 1 + dispatch_width / 8;
405 }
406
407 bld.MOV(dst, offset(vec4_result, (const_offset & 3) * scale));
408 }
409
410 /**
411 * A helper for MOV generation for fixing up broken hardware SEND dependency
412 * handling.
413 */
414 void
415 fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
416 {
417 /* The caller always wants the uncompressed (SIMD8) form so that it emits
418 * the minimal extra dependencies and doesn't have to align its regs to 2.
419 */
420 const fs_builder ubld = bld.annotate("send dependency resolve")
421 .half(0);
422
423 ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
424 }
425
426 bool
427 fs_inst::equals(fs_inst *inst) const
428 {
429 return (opcode == inst->opcode &&
430 dst.equals(inst->dst) &&
431 src[0].equals(inst->src[0]) &&
432 src[1].equals(inst->src[1]) &&
433 src[2].equals(inst->src[2]) &&
434 saturate == inst->saturate &&
435 predicate == inst->predicate &&
436 conditional_mod == inst->conditional_mod &&
437 mlen == inst->mlen &&
438 base_mrf == inst->base_mrf &&
439 target == inst->target &&
440 eot == inst->eot &&
441 header_size == inst->header_size &&
442 shadow_compare == inst->shadow_compare &&
443 exec_size == inst->exec_size &&
444 offset == inst->offset);
445 }
446
447 bool
448 fs_inst::overwrites_reg(const fs_reg &reg) const
449 {
450 return reg.in_range(dst, regs_written);
451 }
452
453 bool
454 fs_inst::is_send_from_grf() const
455 {
456 switch (opcode) {
457 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
458 case SHADER_OPCODE_SHADER_TIME_ADD:
459 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
460 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
461 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
462 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
463 case SHADER_OPCODE_UNTYPED_ATOMIC:
464 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
465 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
466 case SHADER_OPCODE_TYPED_ATOMIC:
467 case SHADER_OPCODE_TYPED_SURFACE_READ:
468 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
469 case SHADER_OPCODE_URB_WRITE_SIMD8:
470 return true;
471 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
472 return src[1].file == GRF;
473 case FS_OPCODE_FB_WRITE:
474 return src[0].file == GRF;
475 default:
476 if (is_tex())
477 return src[0].file == GRF;
478
479 return false;
480 }
481 }
482
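/* Roughly: returns true if this LOAD_PAYLOAD simply copies one whole,
 * contiguous virtual GRF, i.e. each source is the next consecutive register
 * of src[0]'s VGRF and that VGRF's allocated size matches regs_written.
 */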
483 bool
484 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
485 {
486 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
487 return false;
488
489 fs_reg reg = this->src[0];
490 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
491 return false;
492
493 if (grf_alloc.sizes[reg.reg] != this->regs_written)
494 return false;
495
496 for (int i = 0; i < this->sources; i++) {
497 reg.type = this->src[i].type;
498 reg.width = this->src[i].width;
499 if (!this->src[i].equals(reg))
500 return false;
501 reg = ::offset(reg, 1);
502 }
503
504 return true;
505 }
506
507 bool
508 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
509 {
510 if (devinfo->gen == 6 && is_math())
511 return false;
512
513 if (is_send_from_grf())
514 return false;
515
516 if (!backend_instruction::can_do_source_mods())
517 return false;
518
519 return true;
520 }
521
522 bool
523 fs_inst::has_side_effects() const
524 {
525 return this->eot || backend_instruction::has_side_effects();
526 }
527
528 void
529 fs_reg::init()
530 {
531 memset(this, 0, sizeof(*this));
532 stride = 1;
533 }
534
535 /** Generic unset register constructor. */
536 fs_reg::fs_reg()
537 {
538 init();
539 this->file = BAD_FILE;
540 }
541
542 /** Immediate value constructor. */
543 fs_reg::fs_reg(float f)
544 {
545 init();
546 this->file = IMM;
547 this->type = BRW_REGISTER_TYPE_F;
548 this->fixed_hw_reg.dw1.f = f;
549 this->width = 1;
550 }
551
552 /** Immediate value constructor. */
553 fs_reg::fs_reg(int32_t i)
554 {
555 init();
556 this->file = IMM;
557 this->type = BRW_REGISTER_TYPE_D;
558 this->fixed_hw_reg.dw1.d = i;
559 this->width = 1;
560 }
561
562 /** Immediate value constructor. */
563 fs_reg::fs_reg(uint32_t u)
564 {
565 init();
566 this->file = IMM;
567 this->type = BRW_REGISTER_TYPE_UD;
568 this->fixed_hw_reg.dw1.ud = u;
569 this->width = 1;
570 }
571
572 /** Vector float immediate value constructor. */
573 fs_reg::fs_reg(uint8_t vf[4])
574 {
575 init();
576 this->file = IMM;
577 this->type = BRW_REGISTER_TYPE_VF;
578 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
588 (vf1 << 8) |
589 (vf2 << 16) |
590 (vf3 << 24);
591 }
592
593 /** Fixed brw_reg. */
594 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
595 {
596 init();
597 this->file = HW_REG;
598 this->fixed_hw_reg = fixed_hw_reg;
599 this->type = fixed_hw_reg.type;
600 this->width = 1 << fixed_hw_reg.width;
601 }
602
603 bool
604 fs_reg::equals(const fs_reg &r) const
605 {
606 return (file == r.file &&
607 reg == r.reg &&
608 reg_offset == r.reg_offset &&
609 subreg_offset == r.subreg_offset &&
610 type == r.type &&
611 negate == r.negate &&
612 abs == r.abs &&
613 !reladdr && !r.reladdr &&
614 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
615 width == r.width &&
616 stride == r.stride);
617 }
618
619 fs_reg &
620 fs_reg::set_smear(unsigned subreg)
621 {
622 assert(file != HW_REG && file != IMM);
623 subreg_offset = subreg * type_sz(type);
624 stride = 0;
625 return *this;
626 }
627
628 bool
629 fs_reg::is_contiguous() const
630 {
631 return stride == 1;
632 }
633
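/* Returns the number of scalar slots a GLSL type occupies in this backend,
 * e.g. 4 for a vec4 and 4 * 20 == 80 for "uniform vec4 a[20]"; samplers and
 * atomic counters occupy no slots.
 */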
634 int
635 fs_visitor::type_size(const struct glsl_type *type)
636 {
637 unsigned int size, i;
638
639 switch (type->base_type) {
640 case GLSL_TYPE_UINT:
641 case GLSL_TYPE_INT:
642 case GLSL_TYPE_FLOAT:
643 case GLSL_TYPE_BOOL:
644 return type->components();
645 case GLSL_TYPE_ARRAY:
646 return type_size(type->fields.array) * type->length;
647 case GLSL_TYPE_STRUCT:
648 size = 0;
649 for (i = 0; i < type->length; i++) {
650 size += type_size(type->fields.structure[i].type);
651 }
652 return size;
653 case GLSL_TYPE_SAMPLER:
654 /* Samplers take up no register space, since they're baked in at
655 * link time.
656 */
657 return 0;
658 case GLSL_TYPE_ATOMIC_UINT:
659 return 0;
660 case GLSL_TYPE_IMAGE:
661 case GLSL_TYPE_VOID:
662 case GLSL_TYPE_ERROR:
663 case GLSL_TYPE_INTERFACE:
664 case GLSL_TYPE_DOUBLE:
665 unreachable("not reached");
666 }
667
668 return 0;
669 }
670
671 /**
672 * Emit a MOV to read the timestamp register and return its destination.
673 *
674 * The MOV is emitted with force_writemask_all so every channel is written,
675 * and the result is smeared so that only the low dword is read back.
676 */
677 fs_reg
678 fs_visitor::get_timestamp(const fs_builder &bld)
679 {
680 assert(devinfo->gen >= 7);
681
682 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
683 BRW_ARF_TIMESTAMP,
684 0),
685 BRW_REGISTER_TYPE_UD));
686
687 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
688
689 /* We want to read the 3 fields we care about even if it's not enabled in
690 * the dispatch.
691 */
692 bld.exec_all().MOV(dst, ts);
693
694 /* The caller wants the low 32 bits of the timestamp. Since it's running
695 * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
696 * which is plenty of time for our purposes. It is identical across the
697 * EUs, but since it's tracking GPU core speed it will increment at a
698 * varying rate as render P-states change.
699 *
700 * The caller could also check if render P-states have changed (or anything
701 * else that might disrupt timing) by setting smear to 2 and checking if
702 * that field is != 0.
703 */
704 dst.set_smear(0);
705
706 return dst;
707 }
708
709 void
710 fs_visitor::emit_shader_time_begin()
711 {
712 shader_start_time = get_timestamp(bld.annotate("shader time start"));
713 }
714
715 void
716 fs_visitor::emit_shader_time_end()
717 {
718 enum shader_time_shader_type type, written_type, reset_type;
719 switch (stage) {
720 case MESA_SHADER_VERTEX:
721 type = ST_VS;
722 written_type = ST_VS_WRITTEN;
723 reset_type = ST_VS_RESET;
724 break;
725 case MESA_SHADER_GEOMETRY:
726 type = ST_GS;
727 written_type = ST_GS_WRITTEN;
728 reset_type = ST_GS_RESET;
729 break;
730 case MESA_SHADER_FRAGMENT:
731 if (dispatch_width == 8) {
732 type = ST_FS8;
733 written_type = ST_FS8_WRITTEN;
734 reset_type = ST_FS8_RESET;
735 } else {
736 assert(dispatch_width == 16);
737 type = ST_FS16;
738 written_type = ST_FS16_WRITTEN;
739 reset_type = ST_FS16_RESET;
740 }
741 break;
742 case MESA_SHADER_COMPUTE:
743 type = ST_CS;
744 written_type = ST_CS_WRITTEN;
745 reset_type = ST_CS_RESET;
746 break;
747 default:
748 unreachable("fs_visitor::emit_shader_time_end missing code");
749 }
750
751 /* Insert our code just before the final SEND with EOT. */
752 exec_node *end = this->instructions.get_tail();
753 assert(end && ((fs_inst *) end)->eot);
754 const fs_builder ibld = bld.annotate("shader time end")
755 .exec_all().at(NULL, end);
756
757 fs_reg shader_end_time = get_timestamp(ibld);
758
759 /* Check that there weren't any timestamp reset events (assuming these
760 * were the only two timestamp reads that happened).
761 */
762 fs_reg reset = shader_end_time;
763 reset.set_smear(2);
764 set_condmod(BRW_CONDITIONAL_Z,
765 ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
766 ibld.IF(BRW_PREDICATE_NORMAL);
767
768 fs_reg start = shader_start_time;
769 start.negate = true;
770 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
771 diff.set_smear(0);
772 ibld.ADD(diff, start, shader_end_time);
773
774 /* If there were no instructions between the two timestamp gets, the diff
775 * is 2 cycles. Remove that overhead, so I can forget about that when
776 * trying to determine the time taken for single instructions.
777 */
778 ibld.ADD(diff, diff, fs_reg(-2u));
779 SHADER_TIME_ADD(ibld, type, diff);
780 SHADER_TIME_ADD(ibld, written_type, fs_reg(1u));
781 ibld.emit(BRW_OPCODE_ELSE);
782 SHADER_TIME_ADD(ibld, reset_type, fs_reg(1u));
783 ibld.emit(BRW_OPCODE_ENDIF);
784 }
785
786 void
787 fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
788 enum shader_time_shader_type type, fs_reg value)
789 {
790 int shader_time_index =
791 brw_get_shader_time_index(brw, shader_prog, prog, type);
792 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
793
794 fs_reg payload;
795 if (dispatch_width == 8)
796 payload = vgrf(glsl_type::uvec2_type);
797 else
798 payload = vgrf(glsl_type::uint_type);
799
800 bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
801 }
802
803 void
804 fs_visitor::vfail(const char *format, va_list va)
805 {
806 char *msg;
807
808 if (failed)
809 return;
810
811 failed = true;
812
813 msg = ralloc_vasprintf(mem_ctx, format, va);
814 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
815
816 this->fail_msg = msg;
817
818 if (debug_enabled) {
819 fprintf(stderr, "%s", msg);
820 }
821 }
822
823 void
824 fs_visitor::fail(const char *format, ...)
825 {
826 va_list va;
827
828 va_start(va, format);
829 vfail(format, va);
830 va_end(va);
831 }
832
833 /**
834 * Mark this program as impossible to compile in SIMD16 mode.
835 *
836 * During the SIMD8 compile (which happens first), we can detect and flag
837 * things that are unsupported in SIMD16 mode, so the compiler can skip
838 * the SIMD16 compile altogether.
839 *
840 * During a SIMD16 compile (if one happens anyway), this just calls fail().
841 */
842 void
843 fs_visitor::no16(const char *format, ...)
844 {
845 va_list va;
846
847 va_start(va, format);
848
849 if (dispatch_width == 16) {
850 vfail(format, va);
851 } else {
852 simd16_unsupported = true;
853
854 if (brw->perf_debug) {
855 if (no16_msg)
856 ralloc_vasprintf_append(&no16_msg, format, va);
857 else
858 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
859 }
860 }
861
862 va_end(va);
863 }
864
865 fs_inst *
866 fs_visitor::emit(enum opcode opcode)
867 {
868 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
869 }
870
871 fs_inst *
872 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
873 {
874 return emit(new(mem_ctx) fs_inst(opcode, dst));
875 }
876
877 fs_inst *
878 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
879 {
880 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
881 }
882
883 fs_inst *
884 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
885 const fs_reg &src1)
886 {
887 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
888 }
889
890 fs_inst *
891 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
892 const fs_reg &src1, const fs_reg &src2)
893 {
894 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
895 }
896
897 fs_inst *
898 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
899 fs_reg src[], int sources)
900 {
901 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
902 }
903
904 /**
905 * Returns true if the instruction has a flag that means it won't
906 * update an entire destination register.
907 *
908 * For example, dead code elimination and live variable analysis want to know
909 * when a write to a variable screens off any preceding values that were in
910 * it.
911 */
912 bool
913 fs_inst::is_partial_write() const
914 {
915 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
916 (this->dst.width * type_sz(this->dst.type)) < 32 ||
917 !this->dst.is_contiguous());
918 }
919
920 int
921 fs_inst::regs_read(int arg) const
922 {
923 if (is_tex() && arg == 0 && src[0].file == GRF) {
924 return mlen;
925 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
926 return mlen;
927 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
928 return mlen;
929 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
930 return mlen;
931 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
932 return mlen;
933 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
934 return mlen;
935 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
936 return mlen;
937 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
938 return mlen;
939 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
940 return mlen;
941 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
942 return mlen;
943 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
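      /* The barycentric deltas occupy one GRF per four channels. */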
944 return exec_size / 4;
945 }
946
947 switch (src[arg].file) {
948 case BAD_FILE:
949 case UNIFORM:
950 case IMM:
951 return 1;
952 case GRF:
953 case HW_REG:
954 if (src[arg].stride == 0) {
955 return 1;
956 } else {
957 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
958 return (size + 31) / 32;
959 }
960 case MRF:
961 unreachable("MRF registers are not allowed as sources");
962 default:
963 unreachable("Invalid register file");
964 }
965 }
966
967 bool
968 fs_inst::reads_flag() const
969 {
970 return predicate;
971 }
972
973 bool
974 fs_inst::writes_flag() const
975 {
976 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
977 opcode != BRW_OPCODE_IF &&
978 opcode != BRW_OPCODE_WHILE)) ||
979 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
980 }
981
982 /**
983 * Returns how many MRFs an FS opcode will write over.
984 *
985 * Note that this is not the 0 or 1 implied writes in an actual gen
986 * instruction -- the FS opcodes often generate MOVs in addition.
987 */
988 int
989 fs_visitor::implied_mrf_writes(fs_inst *inst)
990 {
991 if (inst->mlen == 0)
992 return 0;
993
994 if (inst->base_mrf == -1)
995 return 0;
996
997 switch (inst->opcode) {
998 case SHADER_OPCODE_RCP:
999 case SHADER_OPCODE_RSQ:
1000 case SHADER_OPCODE_SQRT:
1001 case SHADER_OPCODE_EXP2:
1002 case SHADER_OPCODE_LOG2:
1003 case SHADER_OPCODE_SIN:
1004 case SHADER_OPCODE_COS:
1005 return 1 * dispatch_width / 8;
1006 case SHADER_OPCODE_POW:
1007 case SHADER_OPCODE_INT_QUOTIENT:
1008 case SHADER_OPCODE_INT_REMAINDER:
1009 return 2 * dispatch_width / 8;
1010 case SHADER_OPCODE_TEX:
1011 case FS_OPCODE_TXB:
1012 case SHADER_OPCODE_TXD:
1013 case SHADER_OPCODE_TXF:
1014 case SHADER_OPCODE_TXF_CMS:
1015 case SHADER_OPCODE_TXF_MCS:
1016 case SHADER_OPCODE_TG4:
1017 case SHADER_OPCODE_TG4_OFFSET:
1018 case SHADER_OPCODE_TXL:
1019 case SHADER_OPCODE_TXS:
1020 case SHADER_OPCODE_LOD:
1021 return 1;
1022 case FS_OPCODE_FB_WRITE:
1023 return 2;
1024 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1025 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1026 return 1;
1027 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1028 return inst->mlen;
1029 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1030 return inst->mlen;
1031 case SHADER_OPCODE_UNTYPED_ATOMIC:
1032 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1033 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1034 case SHADER_OPCODE_TYPED_ATOMIC:
1035 case SHADER_OPCODE_TYPED_SURFACE_READ:
1036 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1037 case SHADER_OPCODE_URB_WRITE_SIMD8:
1038 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1039 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1040 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1041 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1042 return 0;
1043 default:
1044 unreachable("not reached");
1045 }
1046 }
1047
1048 fs_reg
1049 fs_visitor::vgrf(const glsl_type *const type)
1050 {
1051 int reg_width = dispatch_width / 8;
1052 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1053 brw_type_for_base_type(type), dispatch_width);
1054 }
1055
1056 fs_reg
1057 fs_visitor::vgrf(int num_components)
1058 {
1059 int reg_width = dispatch_width / 8;
1060 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1061 BRW_REGISTER_TYPE_F, dispatch_width);
1062 }
1063
1064 /** Fixed HW reg constructor. */
1065 fs_reg::fs_reg(enum register_file file, int reg)
1066 {
1067 init();
1068 this->file = file;
1069 this->reg = reg;
1070 this->type = BRW_REGISTER_TYPE_F;
1071
1072 switch (file) {
1073 case UNIFORM:
1074 this->width = 1;
1075 break;
1076 default:
1077 this->width = 8;
1078 }
1079 }
1080
1081 /** Fixed HW reg constructor. */
1082 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1083 {
1084 init();
1085 this->file = file;
1086 this->reg = reg;
1087 this->type = type;
1088
1089 switch (file) {
1090 case UNIFORM:
1091 this->width = 1;
1092 break;
1093 default:
1094 this->width = 8;
1095 }
1096 }
1097
1098 /** Fixed HW reg constructor. */
1099 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1100 uint8_t width)
1101 {
1102 init();
1103 this->file = file;
1104 this->reg = reg;
1105 this->type = type;
1106 this->width = width;
1107 }
1108
1109 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1110 * This brings in those uniform definitions.
1111 */
1112 void
1113 fs_visitor::import_uniforms(fs_visitor *v)
1114 {
1115 this->push_constant_loc = v->push_constant_loc;
1116 this->pull_constant_loc = v->pull_constant_loc;
1117 this->uniforms = v->uniforms;
1118 this->param_size = v->param_size;
1119 }
1120
1121 fs_reg *
1122 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1123 bool origin_upper_left)
1124 {
1125 assert(stage == MESA_SHADER_FRAGMENT);
1126 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1127 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1128 fs_reg wpos = *reg;
1129 bool flip = !origin_upper_left ^ key->render_to_fbo;
1130
1131 /* gl_FragCoord.x */
1132 if (pixel_center_integer) {
1133 bld.MOV(wpos, this->pixel_x);
1134 } else {
1135 bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
1136 }
1137 wpos = offset(wpos, 1);
1138
1139 /* gl_FragCoord.y */
1140 if (!flip && pixel_center_integer) {
1141 bld.MOV(wpos, this->pixel_y);
1142 } else {
1143 fs_reg pixel_y = this->pixel_y;
1144 float offset = (pixel_center_integer ? 0.0 : 0.5);
1145
1146 if (flip) {
1147 pixel_y.negate = true;
1148 offset += key->drawable_height - 1.0;
1149 }
1150
1151 bld.ADD(wpos, pixel_y, fs_reg(offset));
1152 }
1153 wpos = offset(wpos, 1);
1154
1155 /* gl_FragCoord.z */
1156 if (devinfo->gen >= 6) {
1157 bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
1158 } else {
1159 bld.emit(FS_OPCODE_LINTERP, wpos,
1160 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1161 interp_reg(VARYING_SLOT_POS, 2));
1162 }
1163 wpos = offset(wpos, 1);
1164
1165 /* gl_FragCoord.w: Already set up in emit_interpolation */
1166 bld.MOV(wpos, this->wpos_w);
1167
1168 return reg;
1169 }
1170
1171 fs_inst *
1172 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1173 glsl_interp_qualifier interpolation_mode,
1174 bool is_centroid, bool is_sample)
1175 {
1176 brw_wm_barycentric_interp_mode barycoord_mode;
1177 if (devinfo->gen >= 6) {
1178 if (is_centroid) {
1179 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1180 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1181 else
1182 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1183 } else if (is_sample) {
1184 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1185 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1186 else
1187 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1188 } else {
1189 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1190 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1191 else
1192 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1193 }
1194 } else {
1195 /* On Ironlake and below, there is only one interpolation mode.
1196 * Centroid interpolation doesn't mean anything on this hardware --
1197 * there is no multisampling.
1198 */
1199 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1200 }
1201 return bld.emit(FS_OPCODE_LINTERP, attr,
1202 this->delta_xy[barycoord_mode], interp);
1203 }
1204
1205 void
1206 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1207 const glsl_type *type,
1208 glsl_interp_qualifier interpolation_mode,
1209 int location, bool mod_centroid,
1210 bool mod_sample)
1211 {
1212 attr.type = brw_type_for_base_type(type->get_scalar_type());
1213
1214 assert(stage == MESA_SHADER_FRAGMENT);
1215 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1216 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1217
1218 unsigned int array_elements;
1219
1220 if (type->is_array()) {
1221 array_elements = type->length;
1222 if (array_elements == 0) {
1223 fail("dereferenced array '%s' has length 0\n", name);
1224 }
1225 type = type->fields.array;
1226 } else {
1227 array_elements = 1;
1228 }
1229
1230 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1231 bool is_gl_Color =
1232 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1233 if (key->flat_shade && is_gl_Color) {
1234 interpolation_mode = INTERP_QUALIFIER_FLAT;
1235 } else {
1236 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1237 }
1238 }
1239
1240 for (unsigned int i = 0; i < array_elements; i++) {
1241 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1242 if (prog_data->urb_setup[location] == -1) {
1243 /* If there's no incoming setup data for this slot, don't
1244 * emit interpolation for it.
1245 */
1246 attr = offset(attr, type->vector_elements);
1247 location++;
1248 continue;
1249 }
1250
1251 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1252 /* Constant interpolation (flat shading) case. The SF has
1253 * handed us defined values in only the constant offset
1254 * field of the setup reg.
1255 */
1256 for (unsigned int k = 0; k < type->vector_elements; k++) {
1257 struct brw_reg interp = interp_reg(location, k);
1258 interp = suboffset(interp, 3);
1259 interp.type = attr.type;
1260 bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1261 attr = offset(attr, 1);
1262 }
1263 } else {
1264 /* Smooth/noperspective interpolation case. */
1265 for (unsigned int k = 0; k < type->vector_elements; k++) {
1266 struct brw_reg interp = interp_reg(location, k);
1267 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1268 /* Get the pixel/sample mask into f0 so that we know
1269 * which pixels are lit. Then, for each channel that is
1270 * unlit, replace the centroid data with non-centroid
1271 * data.
1272 */
1273 bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1274
1275 fs_inst *inst;
1276 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1277 false, false);
1278 inst->predicate = BRW_PREDICATE_NORMAL;
1279 inst->predicate_inverse = true;
1280 if (devinfo->has_pln)
1281 inst->no_dd_clear = true;
1282
1283 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1284 mod_centroid && !key->persample_shading,
1285 mod_sample || key->persample_shading);
1286 inst->predicate = BRW_PREDICATE_NORMAL;
1287 inst->predicate_inverse = false;
1288 if (devinfo->has_pln)
1289 inst->no_dd_check = true;
1290
1291 } else {
1292 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1293 mod_centroid && !key->persample_shading,
1294 mod_sample || key->persample_shading);
1295 }
1296 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1297 bld.MUL(attr, attr, this->pixel_w);
1298 }
1299 attr = offset(attr, 1);
1300 }
1301
1302 }
1303 location++;
1304 }
1305 }
1306 }
1307
1308 fs_reg *
1309 fs_visitor::emit_frontfacing_interpolation()
1310 {
1311 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1312
1313 if (devinfo->gen >= 6) {
1314 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1315 * a boolean result from this (~0/true or 0/false).
1316 *
1317 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1318 * this task in only one instruction:
1319 * - a negation source modifier will flip the bit; and
1320 * - a W -> D type conversion will sign extend the bit into the high
1321 * word of the destination.
1322 *
1323 * An ASR 15 fills the low word of the destination.
1324 */
1325 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1326 g0.negate = true;
1327
1328 bld.ASR(*reg, g0, fs_reg(15));
1329 } else {
1330 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1331 * a boolean result from this (1/true or 0/false).
1332 *
1333 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1334 * the negation source modifier to flip it. Unfortunately the SHR
1335 * instruction only operates on UD (or D with an abs source modifier)
1336 * sources without negation.
1337 *
1338 * Instead, use ASR (which will give ~0/true or 0/false).
1339 */
1340 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1341 g1_6.negate = true;
1342
1343 bld.ASR(*reg, g1_6, fs_reg(31));
1344 }
1345
1346 return reg;
1347 }
1348
1349 void
1350 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1351 {
1352 assert(stage == MESA_SHADER_FRAGMENT);
1353 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1354 assert(dst.type == BRW_REGISTER_TYPE_F);
1355
1356 if (key->compute_pos_offset) {
1357 /* Convert int_sample_pos to floating point */
1358 emit(MOV(dst, int_sample_pos));
1359 /* Scale to the range [0, 1] */
1360 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1361 }
1362 else {
1363 /* From ARB_sample_shading specification:
1364 * "When rendering to a non-multisample buffer, or if multisample
1365 * rasterization is disabled, gl_SamplePosition will always be
1366 * (0.5, 0.5)."
1367 */
1368 emit(MOV(dst, fs_reg(0.5f)));
1369 }
1370 }
1371
1372 fs_reg *
1373 fs_visitor::emit_samplepos_setup()
1374 {
1375 assert(devinfo->gen >= 6);
1376
1377 this->current_annotation = "compute sample position";
1378 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1379 fs_reg pos = *reg;
1380 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1381 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1382
1383 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1384 * mode will be enabled.
1385 *
1386 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1387 * R31.1:0 Position Offset X/Y for Slot[3:0]
1388 * R31.3:2 Position Offset X/Y for Slot[7:4]
1389 * .....
1390 *
1391 * The X, Y sample positions come in as bytes in thread payload. So, read
1392 * the positions using vstride=16, width=8, hstride=2.
1393 */
1394 struct brw_reg sample_pos_reg =
1395 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1396 BRW_REGISTER_TYPE_B), 16, 8, 2);
1397
1398 if (dispatch_width == 8) {
1399 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1400 } else {
1401 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1402 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1403 ->force_sechalf = true;
1404 }
1405 /* Compute gl_SamplePosition.x */
1406 compute_sample_position(pos, int_sample_x);
1407 pos = offset(pos, 1);
1408 if (dispatch_width == 8) {
1409 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1410 } else {
1411 emit(MOV(half(int_sample_y, 0),
1412 fs_reg(suboffset(sample_pos_reg, 1))));
1413 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1414 ->force_sechalf = true;
1415 }
1416 /* Compute gl_SamplePosition.y */
1417 compute_sample_position(pos, int_sample_y);
1418 return reg;
1419 }
1420
1421 fs_reg *
1422 fs_visitor::emit_sampleid_setup()
1423 {
1424 assert(stage == MESA_SHADER_FRAGMENT);
1425 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1426 assert(devinfo->gen >= 6);
1427
1428 this->current_annotation = "compute sample id";
1429 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1430
1431 if (key->compute_sample_id) {
1432 fs_reg t1 = vgrf(glsl_type::int_type);
1433 fs_reg t2 = vgrf(glsl_type::int_type);
1434 t2.type = BRW_REGISTER_TYPE_UW;
1435
1436 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1437 * 8x multisampling, subspan 0 will represent sample N (where N
1438 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1439 * 7. We can find the value of N by looking at R0.0 bits 7:6
1440 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1441 * (since samples are always delivered in pairs). That is, we
1442 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1443 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1444 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1445 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1446 * populating a temporary variable with the sequence (0, 1, 2, 3),
1447 * and then reading from it using vstride=1, width=4, hstride=0.
1448 * These computations hold good for 4x multisampling as well.
1449 *
1450 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1451 * the first four slots are sample 0 of subspan 0; the next four
1452 * are sample 1 of subspan 0; the third group is sample 0 of
1453 * subspan 1, and finally sample 1 of subspan 1.
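    *
    * As a worked example, if SSPI (R0.0 bits 7:6) is 2, then
    * (R0.0 & 0xc0) >> 5 == 4, and the SIMD8 sample IDs come out as
    * (4, 4, 4, 4, 5, 5, 5, 5).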
1454 */
1455 fs_inst *inst;
1456 inst = emit(BRW_OPCODE_AND, t1,
1457 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1458 fs_reg(0xc0));
1459 inst->force_writemask_all = true;
1460 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1461 inst->force_writemask_all = true;
1462 /* This works for both SIMD8 and SIMD16 */
1463 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1464 inst->force_writemask_all = true;
1465 /* This special instruction takes care of setting vstride=1,
1466 * width=4, hstride=0 of t2 during an ADD instruction.
1467 */
1468 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1469 } else {
1470 /* As per GL_ARB_sample_shading specification:
1471 * "When rendering to a non-multisample buffer, or if multisample
1472 * rasterization is disabled, gl_SampleID will always be zero."
1473 */
1474 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1475 }
1476
1477 return reg;
1478 }
1479
1480 void
1481 fs_visitor::resolve_source_modifiers(fs_reg *src)
1482 {
1483 if (!src->abs && !src->negate)
1484 return;
1485
1486 fs_reg temp = retype(vgrf(1), src->type);
1487 emit(MOV(temp, *src));
1488 *src = temp;
1489 }
1490
1491 fs_reg
1492 fs_visitor::fix_math_operand(fs_reg src)
1493 {
1494 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1495 * might be able to do better by doing execsize = 1 math and then
1496 * expanding that result out, but we would need to be careful with
1497 * masking.
1498 *
1499 * The hardware ignores source modifiers (negate and abs) on math
1500 * instructions, so we also move to a temp to set those up.
1501 */
1502 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1503 !src.abs && !src.negate)
1504 return src;
1505
1506 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1507 * operands to math
1508 */
1509 if (devinfo->gen >= 7 && src.file != IMM)
1510 return src;
1511
1512 fs_reg expanded = vgrf(glsl_type::float_type);
1513 expanded.type = src.type;
1514 emit(BRW_OPCODE_MOV, expanded, src);
1515 return expanded;
1516 }
1517
1518 fs_inst *
1519 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1520 {
1521 switch (opcode) {
1522 case SHADER_OPCODE_RCP:
1523 case SHADER_OPCODE_RSQ:
1524 case SHADER_OPCODE_SQRT:
1525 case SHADER_OPCODE_EXP2:
1526 case SHADER_OPCODE_LOG2:
1527 case SHADER_OPCODE_SIN:
1528 case SHADER_OPCODE_COS:
1529 break;
1530 default:
1531 unreachable("not reached: bad math opcode");
1532 }
1533
1534 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1535 * might be able to do better by doing execsize = 1 math and then
1536 * expanding that result out, but we would need to be careful with
1537 * masking.
1538 *
1539 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1540 * instructions, so we also move to a temp to set those up.
1541 */
1542 if (devinfo->gen == 6 || devinfo->gen == 7)
1543 src = fix_math_operand(src);
1544
1545 fs_inst *inst = emit(opcode, dst, src);
1546
1547 if (devinfo->gen < 6) {
1548 inst->base_mrf = 2;
1549 inst->mlen = dispatch_width / 8;
1550 }
1551
1552 return inst;
1553 }
1554
1555 fs_inst *
1556 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1557 {
1558 int base_mrf = 2;
1559 fs_inst *inst;
1560
1561 if (devinfo->gen >= 8) {
1562 inst = emit(opcode, dst, src0, src1);
1563 } else if (devinfo->gen >= 6) {
1564 src0 = fix_math_operand(src0);
1565 src1 = fix_math_operand(src1);
1566
1567 inst = emit(opcode, dst, src0, src1);
1568 } else {
1569 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1570 * "Message Payload":
1571 *
1572 * "Operand0[7]. For the INT DIV functions, this operand is the
1573 * denominator."
1574 * ...
1575 * "Operand1[7]. For the INT DIV functions, this operand is the
1576 * numerator."
1577 */
1578 bool is_int_div = opcode != SHADER_OPCODE_POW;
1579 fs_reg &op0 = is_int_div ? src1 : src0;
1580 fs_reg &op1 = is_int_div ? src0 : src1;
1581
1582 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1583 inst = emit(opcode, dst, op0, reg_null_f);
1584
1585 inst->base_mrf = base_mrf;
1586 inst->mlen = 2 * dispatch_width / 8;
1587 }
1588 return inst;
1589 }
1590
1591 void
1592 fs_visitor::emit_discard_jump()
1593 {
1594 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1595
1596 /* For performance, after a discard, jump to the end of the
1597 * shader if all relevant channels have been discarded.
1598 */
1599 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1600 discard_jump->flag_subreg = 1;
1601
1602 discard_jump->predicate = (dispatch_width == 8)
1603 ? BRW_PREDICATE_ALIGN1_ANY8H
1604 : BRW_PREDICATE_ALIGN1_ANY16H;
1605 discard_jump->predicate_inverse = true;
1606 }
1607
1608 void
1609 fs_visitor::assign_curb_setup()
1610 {
1611 if (dispatch_width == 8) {
1612 prog_data->dispatch_grf_start_reg = payload.num_regs;
1613 } else {
1614 if (stage == MESA_SHADER_FRAGMENT) {
1615 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1616 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1617 } else if (stage == MESA_SHADER_COMPUTE) {
1618 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1619 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1620 } else {
1621 unreachable("Unsupported shader type!");
1622 }
1623 }
1624
1625 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1626
1627 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1628 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1629 for (unsigned int i = 0; i < inst->sources; i++) {
1630 if (inst->src[i].file == UNIFORM) {
1631 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1632 int constant_nr;
1633 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1634 constant_nr = push_constant_loc[uniform_nr];
1635 } else {
1636 /* Section 5.11 of the OpenGL 4.1 spec says:
1637 * "Out-of-bounds reads return undefined values, which include
1638 * values from other variables of the active program or zero."
1639 * Just return the first push constant.
1640 */
1641 constant_nr = 0;
1642 }
1643
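            /* E.g. push constant 11 lands at channel 11 % 8 == 3 of GRF
             * payload.num_regs + 11 / 8 == payload.num_regs + 1.
             */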
1644 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1645 constant_nr / 8,
1646 constant_nr % 8);
1647
1648 inst->src[i].file = HW_REG;
1649 inst->src[i].fixed_hw_reg = byte_offset(
1650 retype(brw_reg, inst->src[i].type),
1651 inst->src[i].subreg_offset);
1652 }
1653 }
1654 }
1655 }
1656
1657 void
1658 fs_visitor::calculate_urb_setup()
1659 {
1660 assert(stage == MESA_SHADER_FRAGMENT);
1661 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1662 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1663
1664 memset(prog_data->urb_setup, -1,
1665 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1666
1667 int urb_next = 0;
1668 /* Figure out where each of the incoming setup attributes lands. */
1669 if (devinfo->gen >= 6) {
1670 if (_mesa_bitcount_64(prog->InputsRead &
1671 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1672 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1673 * first 16 varying inputs, so we can put them wherever we want.
1674 * Just put them in order.
1675 *
1676 * This is useful because it means that (a) inputs not used by the
1677 * fragment shader won't take up valuable register space, and (b) we
1678 * won't have to recompile the fragment shader if it gets paired with
1679 * a different vertex (or geometry) shader.
1680 */
1681 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1682 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1683 BITFIELD64_BIT(i)) {
1684 prog_data->urb_setup[i] = urb_next++;
1685 }
1686 }
1687 } else {
1688 /* We have enough input varyings that the SF/SBE pipeline stage can't
1689 * arbitrarily rearrange them to suit our whim; we have to put them
1690 * in an order that matches the output of the previous pipeline stage
1691 * (geometry or vertex shader).
1692 */
1693 struct brw_vue_map prev_stage_vue_map;
1694 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1695 key->input_slots_valid);
1696 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1697 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1698 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1699 slot++) {
1700 int varying = prev_stage_vue_map.slot_to_varying[slot];
1701 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1702 * unused.
1703 */
1704 if (varying != BRW_VARYING_SLOT_COUNT &&
1705 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1706 BITFIELD64_BIT(varying))) {
1707 prog_data->urb_setup[varying] = slot - first_slot;
1708 }
1709 }
1710 urb_next = prev_stage_vue_map.num_slots - first_slot;
1711 }
1712 } else {
1713 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1714 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1715 /* Point size is packed into the header, not as a general attribute */
1716 if (i == VARYING_SLOT_PSIZ)
1717 continue;
1718
1719 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1720 /* The back color slot is skipped when the front color is
1721 * also written to. In addition, some slots can be
1722 * written in the vertex shader and not read in the
1723 * fragment shader. So the register number must always be
1724 * incremented, mapped or not.
1725 */
1726 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1727 prog_data->urb_setup[i] = urb_next;
1728 urb_next++;
1729 }
1730 }
1731
1732 /*
1733 * It's an FS-only attribute, and we did interpolation for this attribute
1734 * in the SF thread. So, count it here, too.
1735 *
1736 * See compile_sf_prog() for more info.
1737 */
1738 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1739 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1740 }
1741
1742 prog_data->num_varying_inputs = urb_next;
1743 }
1744
1745 void
1746 fs_visitor::assign_urb_setup()
1747 {
1748 assert(stage == MESA_SHADER_FRAGMENT);
1749 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1750
1751 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1752
1753 /* Offset all the urb_setup[] indices by the actual position of the
1754 * setup regs, now that the location of the constants has been chosen.
1755 */
1756 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1757 if (inst->opcode == FS_OPCODE_LINTERP) {
1758 assert(inst->src[1].file == HW_REG);
1759 inst->src[1].fixed_hw_reg.nr += urb_start;
1760 }
1761
1762 if (inst->opcode == FS_OPCODE_CINTERP) {
1763 assert(inst->src[0].file == HW_REG);
1764 inst->src[0].fixed_hw_reg.nr += urb_start;
1765 }
1766 }
1767
1768 /* Each attribute is 4 setup channels, each of which is half a reg. */
1769 this->first_non_payload_grf =
1770 urb_start + prog_data->num_varying_inputs * 2;
1771 }
1772
1773 void
1774 fs_visitor::assign_vs_urb_setup()
1775 {
1776 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1777 int grf, count, slot, channel, attr;
1778
1779 assert(stage == MESA_SHADER_VERTEX);
1780 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1781 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1782 count++;
1783
1784 /* Each attribute is 4 regs. */
1785 this->first_non_payload_grf =
1786 payload.num_regs + prog_data->curb_read_length + count * 4;
1787
1788 unsigned vue_entries =
1789 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1790
1791 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1792 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1793
1794 assert(vs_prog_data->base.urb_read_length <= 15);
1795
1796 /* Rewrite all ATTR file references to the hw grf that they land in. */
1797 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1798 for (int i = 0; i < inst->sources; i++) {
1799 if (inst->src[i].file == ATTR) {
1800
1801 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1802 slot = count - 1;
1803 } else {
1804 /* Attributes come in a contiguous block, ordered by their
1805 * gl_vert_attrib value. That means we can compute the slot
1806 * number for an attribute by counting the enabled attributes
1807 * that precede it in inputs_read.
1808 */
1809 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1810 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1811 BITFIELD64_MASK(attr));
1812 }
1813
1814 channel = inst->src[i].reg_offset & 3;
1815
1816 grf = payload.num_regs +
1817 prog_data->curb_read_length +
1818 slot * 4 + channel;
1819
1820 inst->src[i].file = HW_REG;
1821 inst->src[i].fixed_hw_reg =
1822 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1823 }
1824 }
1825 }
1826 }
1827
1828 /**
1829 * Split large virtual GRFs into separate components if we can.
1830 *
1831 * This mostly duplicates what brw_fs_vector_splitting does, but that
1832 * pass is really conservative because it's afraid of doing
1833 * splitting that doesn't result in real progress after the rest of
1834 * the optimization phases, which would cause infinite looping in
1835 * optimization. We can do it once here, safely. This also has the
1836 * opportunity to split interpolated values, or maybe even uniforms,
1837 * which we don't have at the IR level.
1838 *
1839 * We want to split, because virtual GRFs are what we register
1840 * allocate and spill (due to contiguousness requirements for some
1841 * instructions), and they're what we naturally generate in the
1842 * codegen process, but most virtual GRFs don't actually need to be
1843 * contiguous sets of GRFs. If we split, we'll end up with reduced
1844 * live intervals and better dead code elimination and coalescing.
1845 */
1846 void
1847 fs_visitor::split_virtual_grfs()
1848 {
1849 int num_vars = this->alloc.count;
1850
1851 /* Count the total number of registers */
1852 int reg_count = 0;
1853 int vgrf_to_reg[num_vars];
1854 for (int i = 0; i < num_vars; i++) {
1855 vgrf_to_reg[i] = reg_count;
1856 reg_count += alloc.sizes[i];
1857 }
1858
1859 /* An array of "split points". For each register slot, this indicates
1860 * if this slot can be separated from the previous slot. Every time an
1861 * instruction uses multiple elements of a register (as a source or
1862 * destination), we mark the used slots as inseparable. Then we go
1863 * through and split the registers into the smallest pieces we can.
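    *
    * For example, a size-4 VGRF that is only ever accessed one register at a
    * time keeps all three interior split points and is split into four
    * size-1 VGRFs.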
1864 */
1865 bool split_points[reg_count];
1866 memset(split_points, 0, sizeof(split_points));
1867
1868 /* Mark all used registers as fully splittable */
1869 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1870 if (inst->dst.file == GRF) {
1871 int reg = vgrf_to_reg[inst->dst.reg];
1872 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1873 split_points[reg + j] = true;
1874 }
1875
1876 for (int i = 0; i < inst->sources; i++) {
1877 if (inst->src[i].file == GRF) {
1878 int reg = vgrf_to_reg[inst->src[i].reg];
1879 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1880 split_points[reg + j] = true;
1881 }
1882 }
1883 }
1884
1885 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1886 if (inst->dst.file == GRF) {
1887 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1888 for (int j = 1; j < inst->regs_written; j++)
1889 split_points[reg + j] = false;
1890 }
1891 for (int i = 0; i < inst->sources; i++) {
1892 if (inst->src[i].file == GRF) {
1893 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1894 for (int j = 1; j < inst->regs_read(i); j++)
1895 split_points[reg + j] = false;
1896 }
1897 }
1898 }
1899
1900 int new_virtual_grf[reg_count];
1901 int new_reg_offset[reg_count];
1902
1903 int reg = 0;
1904 for (int i = 0; i < num_vars; i++) {
1905 /* The first one should always be 0 as a quick sanity check. */
1906 assert(split_points[reg] == false);
1907
1908 /* j = 0 case */
1909 new_reg_offset[reg] = 0;
1910 reg++;
1911 int offset = 1;
1912
1913 /* j > 0 case */
1914 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1915 /* If this is a split point, reset the offset to 0 and allocate a
1916 * new virtual GRF for the previous offset many registers
1917 */
1918 if (split_points[reg]) {
1919 assert(offset <= MAX_VGRF_SIZE);
1920 int grf = alloc.allocate(offset);
1921 for (int k = reg - offset; k < reg; k++)
1922 new_virtual_grf[k] = grf;
1923 offset = 0;
1924 }
1925 new_reg_offset[reg] = offset;
1926 offset++;
1927 reg++;
1928 }
1929
1930 /* The last one gets the original register number */
1931 assert(offset <= MAX_VGRF_SIZE);
1932 alloc.sizes[i] = offset;
1933 for (int k = reg - offset; k < reg; k++)
1934 new_virtual_grf[k] = i;
1935 }
1936 assert(reg == reg_count);
1937
1938 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1939 if (inst->dst.file == GRF) {
1940 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1941 inst->dst.reg = new_virtual_grf[reg];
1942 inst->dst.reg_offset = new_reg_offset[reg];
1943 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1944 }
1945 for (int i = 0; i < inst->sources; i++) {
1946 if (inst->src[i].file == GRF) {
1947 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1948 inst->src[i].reg = new_virtual_grf[reg];
1949 inst->src[i].reg_offset = new_reg_offset[reg];
1950 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1951 }
1952 }
1953 }
1954 invalidate_live_intervals();
1955 }
1956
1957 /**
1958 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1959 *
1960 * During code generation, we create tons of temporary variables, many of
1961 * which get immediately killed and are never used again. Yet, in later
1962 * optimization and analysis passes, such as compute_live_intervals, we need
1963 * to loop over all the virtual GRFs. Compacting them can save a lot of
1964 * overhead.
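 *
 * Illustrative sketch: with VGRFs {0, 1, 2, 3} where only 0 and 2 are still
 * referenced, remap_table ends up as {0, -1, 1, -1} and alloc.count drops
 * from 4 to 2.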
1965 */
1966 bool
1967 fs_visitor::compact_virtual_grfs()
1968 {
1969 bool progress = false;
1970 int remap_table[this->alloc.count];
1971 memset(remap_table, -1, sizeof(remap_table));
1972
1973 /* Mark which virtual GRFs are used. */
1974 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1975 if (inst->dst.file == GRF)
1976 remap_table[inst->dst.reg] = 0;
1977
1978 for (int i = 0; i < inst->sources; i++) {
1979 if (inst->src[i].file == GRF)
1980 remap_table[inst->src[i].reg] = 0;
1981 }
1982 }
1983
1984 /* Compact the GRF arrays. */
1985 int new_index = 0;
1986 for (unsigned i = 0; i < this->alloc.count; i++) {
1987 if (remap_table[i] == -1) {
1988 /* We just found an unused register. This means that we are
1989 * actually going to compact something.
1990 */
1991 progress = true;
1992 } else {
1993 remap_table[i] = new_index;
1994 alloc.sizes[new_index] = alloc.sizes[i];
1995 invalidate_live_intervals();
1996 ++new_index;
1997 }
1998 }
1999
2000 this->alloc.count = new_index;
2001
2002 /* Patch all the instructions to use the newly renumbered registers */
2003 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2004 if (inst->dst.file == GRF)
2005 inst->dst.reg = remap_table[inst->dst.reg];
2006
2007 for (int i = 0; i < inst->sources; i++) {
2008 if (inst->src[i].file == GRF)
2009 inst->src[i].reg = remap_table[inst->src[i].reg];
2010 }
2011 }
2012
2013 /* Patch all the references to delta_xy, since they're used in register
2014 * allocation. If they're unused, switch them to BAD_FILE so we don't
2015 * think some random VGRF is delta_xy.
2016 */
2017 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2018 if (delta_xy[i].file == GRF) {
2019 if (remap_table[delta_xy[i].reg] != -1) {
2020 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2021 } else {
2022 delta_xy[i].file = BAD_FILE;
2023 }
2024 }
2025 }
2026
2027 return progress;
2028 }
2029
2030 /*
2031 * Implements array access of uniforms by inserting a
2032 * PULL_CONSTANT_LOAD instruction.
2033 *
2034 * Unlike temporary GRF array access (which we don't support, due to
2035 * the difficulty of doing relative addressing on instruction
2036 * destinations), we could potentially do array access of uniforms
2037 * that were loaded in GRF space as push constants. In real-world
2038 * usage we've seen, though, the arrays being used are always larger
2039 * than we could load as push constants, so just always move all
2040 * uniform array access out to a pull constant buffer.
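 *
 * For example (illustrative), indexing "uniform float kernel[64]" with a
 * non-constant subscript appends every element of the array to pull_param[]
 * here; the access itself is later rewritten into a
 * VARYING_PULL_CONSTANT_LOAD by demote_pull_constants().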
2041 */
2042 void
2043 fs_visitor::move_uniform_array_access_to_pull_constants()
2044 {
2045 if (dispatch_width != 8)
2046 return;
2047
2048 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2049 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2050
2051 /* Walk through and find array access of uniforms. Put a copy of that
2052 * uniform in the pull constant buffer.
2053 *
2054 * Note that we don't move constant-indexed accesses to arrays. No
2055 * testing has been done of the performance impact of this choice.
2056 */
2057 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2058 for (int i = 0 ; i < inst->sources; i++) {
2059 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2060 continue;
2061
2062 int uniform = inst->src[i].reg;
2063
2064 /* If this array isn't already present in the pull constant buffer,
2065 * add it.
2066 */
2067 if (pull_constant_loc[uniform] == -1) {
2068 const gl_constant_value **values = &stage_prog_data->param[uniform];
2069
2070 assert(param_size[uniform]);
2071
2072 for (int j = 0; j < param_size[uniform]; j++) {
2073 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2074
2075 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2076 values[j];
2077 }
2078 }
2079 }
2080 }
2081 }
2082
2083 /**
2084 * Assign UNIFORM file registers to either push constants or pull constants.
2085 *
2086 * We allow a fragment shader to have more than the spec's minimum
2087 * required maximum number of fragment shader uniform components (64).
2088 * If there are too many of these, they'd fill up all of the register space.
2089 * So, this will push some of them out to the pull constant buffer and
2090 * update the program to load them.
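 *
 * Rough example: with the 16-register (128-component) push budget below, a
 * shader with 200 live uniform components keeps the first 128 as push
 * constants and demotes the remaining 72 to the pull constant buffer.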
2091 */
2092 void
2093 fs_visitor::assign_constant_locations()
2094 {
2095 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2096 if (dispatch_width != 8)
2097 return;
2098
2099 /* Find which UNIFORM registers are still in use. */
2100 bool is_live[uniforms];
2101 for (unsigned int i = 0; i < uniforms; i++) {
2102 is_live[i] = false;
2103 }
2104
2105 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2106 for (int i = 0; i < inst->sources; i++) {
2107 if (inst->src[i].file != UNIFORM)
2108 continue;
2109
2110 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2111 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2112 is_live[constant_nr] = true;
2113 }
2114 }
2115
2116 /* Only allow 16 registers (128 uniform components) as push constants.
2117 *
2118 * Just demote the end of the list. We could probably do better
2119 * here, demoting things that are rarely used in the program first.
2120 *
2121 * If changing this value, note the limitation about total_regs in
2122 * brw_curbe.c.
2123 */
2124 unsigned int max_push_components = 16 * 8;
2125 unsigned int num_push_constants = 0;
2126
2127 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2128
2129 for (unsigned int i = 0; i < uniforms; i++) {
2130 if (!is_live[i] || pull_constant_loc[i] != -1) {
2131 /* This UNIFORM register is either dead, or has already been demoted
2132 * to a pull const. Mark it as no longer living in the param[] array.
2133 */
2134 push_constant_loc[i] = -1;
2135 continue;
2136 }
2137
2138 if (num_push_constants < max_push_components) {
2139 /* Retain as a push constant. Record the location in the params[]
2140 * array.
2141 */
2142 push_constant_loc[i] = num_push_constants++;
2143 } else {
2144 /* Demote to a pull constant. */
2145 push_constant_loc[i] = -1;
2146
2147 int pull_index = stage_prog_data->nr_pull_params++;
2148 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2149 pull_constant_loc[i] = pull_index;
2150 }
2151 }
2152
2153 stage_prog_data->nr_params = num_push_constants;
2154
2155 /* Up until now, the param[] array has been indexed by reg + reg_offset
2156 * of UNIFORM registers. Condense it to only contain the uniforms we
2157 * chose to upload as push constants.
2158 */
2159 for (unsigned int i = 0; i < uniforms; i++) {
2160 int remapped = push_constant_loc[i];
2161
2162 if (remapped == -1)
2163 continue;
2164
2165 assert(remapped <= (int)i);
2166 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2167 }
2168 }
2169
2170 /**
2171 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2172 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
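 *
 * Constant-indexed accesses load the 16-byte-aligned block containing the
 * value and then smear the right component out of it; reladdr accesses are
 * turned into VARYING_PULL_CONSTANT_LOADs instead.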
2173 */
2174 void
2175 fs_visitor::demote_pull_constants()
2176 {
2177 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2178 for (int i = 0; i < inst->sources; i++) {
2179 if (inst->src[i].file != UNIFORM)
2180 continue;
2181
2182 int pull_index;
2183 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2184 if (location >= uniforms) /* Out of bounds access */
2185 pull_index = -1;
2186 else
2187 pull_index = pull_constant_loc[location];
2188
2189 if (pull_index == -1)
2190 continue;
2191
2192 /* Set up the annotation tracking for newly generated instructions. */
2193 const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
2194 .at(block, inst);
2195 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2196 fs_reg dst = vgrf(glsl_type::float_type);
2197
2198 /* Generate a pull load into dst. */
2199 if (inst->src[i].reladdr) {
2200 VARYING_PULL_CONSTANT_LOAD(ibld, dst,
2201 surf_index,
2202 *inst->src[i].reladdr,
2203 pull_index);
2204 inst->src[i].reladdr = NULL;
2205 } else {
2206 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2207 ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
2208 dst, surf_index, offset);
2209 inst->src[i].set_smear(pull_index & 3);
2210 }
2211
2212 /* Rewrite the instruction to use the temporary VGRF. */
2213 inst->src[i].file = GRF;
2214 inst->src[i].reg = dst.reg;
2215 inst->src[i].reg_offset = 0;
2216 inst->src[i].width = dispatch_width;
2217 }
2218 }
2219 invalidate_live_intervals();
2220 }
2221
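/**
 * Local algebraic simplifications: fold immediate operands and strength-
 * reduce patterns such as "a * 1.0", "a + 0.0", MADs whose operands are
 * 0.0 or 1.0, and rcp(sqrt(x)) -> rsq(x) into cheaper instructions.
 */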
2222 bool
2223 fs_visitor::opt_algebraic()
2224 {
2225 bool progress = false;
2226
2227 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2228 switch (inst->opcode) {
2229 case BRW_OPCODE_MOV:
2230 if (inst->src[0].file != IMM)
2231 break;
2232
2233 if (inst->saturate) {
2234 if (inst->dst.type != inst->src[0].type)
2235 assert(!"unimplemented: saturate mixed types");
2236
2237 if (brw_saturate_immediate(inst->dst.type,
2238 &inst->src[0].fixed_hw_reg)) {
2239 inst->saturate = false;
2240 progress = true;
2241 }
2242 }
2243 break;
2244
2245 case BRW_OPCODE_MUL:
2246 if (inst->src[1].file != IMM)
2247 continue;
2248
2249 /* a * 1.0 = a */
2250 if (inst->src[1].is_one()) {
2251 inst->opcode = BRW_OPCODE_MOV;
2252 inst->src[1] = reg_undef;
2253 progress = true;
2254 break;
2255 }
2256
2257 /* a * -1.0 = -a */
2258 if (inst->src[1].is_negative_one()) {
2259 inst->opcode = BRW_OPCODE_MOV;
2260 inst->src[0].negate = !inst->src[0].negate;
2261 inst->src[1] = reg_undef;
2262 progress = true;
2263 break;
2264 }
2265
2266 /* a * 0.0 = 0.0 */
2267 if (inst->src[1].is_zero()) {
2268 inst->opcode = BRW_OPCODE_MOV;
2269 inst->src[0] = inst->src[1];
2270 inst->src[1] = reg_undef;
2271 progress = true;
2272 break;
2273 }
2274
2275 if (inst->src[0].file == IMM) {
2276 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2277 inst->opcode = BRW_OPCODE_MOV;
2278 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2279 inst->src[1] = reg_undef;
2280 progress = true;
2281 break;
2282 }
2283 break;
2284 case BRW_OPCODE_ADD:
2285 if (inst->src[1].file != IMM)
2286 continue;
2287
2288 /* a + 0.0 = a */
2289 if (inst->src[1].is_zero()) {
2290 inst->opcode = BRW_OPCODE_MOV;
2291 inst->src[1] = reg_undef;
2292 progress = true;
2293 break;
2294 }
2295
2296 if (inst->src[0].file == IMM) {
2297 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2298 inst->opcode = BRW_OPCODE_MOV;
2299 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2300 inst->src[1] = reg_undef;
2301 progress = true;
2302 break;
2303 }
2304 break;
2305 case BRW_OPCODE_OR:
2306 if (inst->src[0].equals(inst->src[1])) {
2307 inst->opcode = BRW_OPCODE_MOV;
2308 inst->src[1] = reg_undef;
2309 progress = true;
2310 break;
2311 }
2312 break;
2313 case BRW_OPCODE_LRP:
2314 if (inst->src[1].equals(inst->src[2])) {
2315 inst->opcode = BRW_OPCODE_MOV;
2316 inst->src[0] = inst->src[1];
2317 inst->src[1] = reg_undef;
2318 inst->src[2] = reg_undef;
2319 progress = true;
2320 break;
2321 }
2322 break;
2323 case BRW_OPCODE_CMP:
2324 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2325 inst->src[0].abs &&
2326 inst->src[0].negate &&
2327 inst->src[1].is_zero()) {
2328 inst->src[0].abs = false;
2329 inst->src[0].negate = false;
2330 inst->conditional_mod = BRW_CONDITIONAL_Z;
2331 progress = true;
2332 break;
2333 }
2334 break;
2335 case BRW_OPCODE_SEL:
2336 if (inst->src[0].equals(inst->src[1])) {
2337 inst->opcode = BRW_OPCODE_MOV;
2338 inst->src[1] = reg_undef;
2339 inst->predicate = BRW_PREDICATE_NONE;
2340 inst->predicate_inverse = false;
2341 progress = true;
2342 } else if (inst->saturate && inst->src[1].file == IMM) {
2343 switch (inst->conditional_mod) {
2344 case BRW_CONDITIONAL_LE:
2345 case BRW_CONDITIONAL_L:
2346 switch (inst->src[1].type) {
2347 case BRW_REGISTER_TYPE_F:
2348 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2349 inst->opcode = BRW_OPCODE_MOV;
2350 inst->src[1] = reg_undef;
2351 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2352 progress = true;
2353 }
2354 break;
2355 default:
2356 break;
2357 }
2358 break;
2359 case BRW_CONDITIONAL_GE:
2360 case BRW_CONDITIONAL_G:
2361 switch (inst->src[1].type) {
2362 case BRW_REGISTER_TYPE_F:
2363 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2364 inst->opcode = BRW_OPCODE_MOV;
2365 inst->src[1] = reg_undef;
2366 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2367 progress = true;
2368 }
2369 break;
2370 default:
2371 break;
2372 }
2373 default:
2374 break;
2375 }
2376 }
2377 break;
2378 case BRW_OPCODE_MAD:
2379 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2380 inst->opcode = BRW_OPCODE_MOV;
2381 inst->src[1] = reg_undef;
2382 inst->src[2] = reg_undef;
2383 progress = true;
2384 } else if (inst->src[0].is_zero()) {
2385 inst->opcode = BRW_OPCODE_MUL;
2386 inst->src[0] = inst->src[2];
2387 inst->src[2] = reg_undef;
2388 progress = true;
2389 } else if (inst->src[1].is_one()) {
2390 inst->opcode = BRW_OPCODE_ADD;
2391 inst->src[1] = inst->src[2];
2392 inst->src[2] = reg_undef;
2393 progress = true;
2394 } else if (inst->src[2].is_one()) {
2395 inst->opcode = BRW_OPCODE_ADD;
2396 inst->src[2] = reg_undef;
2397 progress = true;
2398 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2399 inst->opcode = BRW_OPCODE_ADD;
2400 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2401 inst->src[2] = reg_undef;
2402 progress = true;
2403 }
2404 break;
2405 case SHADER_OPCODE_RCP: {
2406 fs_inst *prev = (fs_inst *)inst->prev;
2407 if (prev->opcode == SHADER_OPCODE_SQRT) {
2408 if (inst->src[0].equals(prev->dst)) {
2409 inst->opcode = SHADER_OPCODE_RSQ;
2410 inst->src[0] = prev->src[0];
2411 progress = true;
2412 }
2413 }
2414 break;
2415 }
2416 case SHADER_OPCODE_BROADCAST:
2417 if (is_uniform(inst->src[0])) {
2418 inst->opcode = BRW_OPCODE_MOV;
2419 inst->sources = 1;
2420 inst->force_writemask_all = true;
2421 progress = true;
2422 } else if (inst->src[1].file == IMM) {
2423 inst->opcode = BRW_OPCODE_MOV;
2424 inst->src[0] = component(inst->src[0],
2425 inst->src[1].fixed_hw_reg.dw1.ud);
2426 inst->sources = 1;
2427 inst->force_writemask_all = true;
2428 progress = true;
2429 }
2430 break;
2431
2432 default:
2433 break;
2434 }
2435
2436 /* Swap if src[0] is immediate. */
2437 if (progress && inst->is_commutative()) {
2438 if (inst->src[0].file == IMM) {
2439 fs_reg tmp = inst->src[1];
2440 inst->src[1] = inst->src[0];
2441 inst->src[0] = tmp;
2442 }
2443 }
2444 }
2445 return progress;
2446 }
2447
2448 /**
2449 * Optimize sample messages that have constant zero values for the trailing
2450 * texture coordinates. We can just reduce the message length for these
2451 * instructions instead of reserving a register for it. Trailing parameters
2452 * that aren't sent default to zero anyway. This will cause the dead code
2453 * eliminator to remove the MOV instruction that would otherwise be emitted to
2454 * set up the zero value.
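 *
 * E.g. (illustrative) a sample message whose last coordinate registers hold
 * the constant 0.0 can have those registers dropped from the payload, with
 * mlen reduced to match.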
2455 */
2456 bool
2457 fs_visitor::opt_zero_samples()
2458 {
2459 /* Gen4 infers the texturing opcode based on the message length so we can't
2460 * change it.
2461 */
2462 if (devinfo->gen < 5)
2463 return false;
2464
2465 bool progress = false;
2466
2467 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2468 if (!inst->is_tex())
2469 continue;
2470
2471 fs_inst *load_payload = (fs_inst *) inst->prev;
2472
2473 if (load_payload->is_head_sentinel() ||
2474 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2475 continue;
2476
2477 /* We don't want to remove the message header or the first parameter.
2478 * Removing the first parameter is not allowed, see the Haswell PRM
2479 * volume 7, page 149:
2480 *
2481 * "Parameter 0 is required except for the sampleinfo message, which
2482 * has no parameter 0"
2483 */
2484 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2485 load_payload->src[(inst->mlen - inst->header_size) /
2486 (dispatch_width / 8) +
2487 inst->header_size - 1].is_zero()) {
2488 inst->mlen -= dispatch_width / 8;
2489 progress = true;
2490 }
2491 }
2492
2493 if (progress)
2494 invalidate_live_intervals();
2495
2496 return progress;
2497 }
2498
2499 /**
2500 * Optimize sample messages which are followed by the final RT write.
2501 *
2502 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2503 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2504 * final texturing results copied to the framebuffer write payload and modify
2505 * them to write to the framebuffer directly.
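 *
 * Roughly: "texture into tmp; fb_write tmp" becomes a single texturing SEND
 * with EOT set and the render target number folded into the message offset.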
2506 */
2507 bool
2508 fs_visitor::opt_sampler_eot()
2509 {
2510 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2511
2512 if (stage != MESA_SHADER_FRAGMENT)
2513 return false;
2514
2515 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2516 return false;
2517
2518 /* FINISHME: It should be possible to implement this optimization when there
2519 * are multiple drawbuffers.
2520 */
2521 if (key->nr_color_regions != 1)
2522 return false;
2523
2524 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2525 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2526 assert(fb_write->eot);
2527 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2528
2529 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2530
2531 /* There wasn't one; nothing to do. */
2532 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2533 return false;
2534
2535 /* This optimisation doesn't seem to work for textureGather for some
2536 * reason. I can't find any documentation or known workarounds to indicate
2537 * that this is expected, but considering that it is probably pretty
2538 * unlikely that a shader would directly write out the results from
2539 * textureGather we might as well just disable it.
2540 */
2541 if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2542 tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2543 return false;
2544
2545 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2546 * It's very likely to be the previous instruction.
2547 */
2548 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2549 if (load_payload->is_head_sentinel() ||
2550 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2551 return false;
2552
2553 assert(!tex_inst->eot); /* We can't get here twice */
2554 assert((tex_inst->offset & (0xff << 24)) == 0);
2555
2556 tex_inst->offset |= fb_write->target << 24;
2557 tex_inst->eot = true;
2558 tex_inst->dst = bld.null_reg_ud();
2559 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2560
2561 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2562 * to create a new LOAD_PAYLOAD command with the same sources and a space
2563 * saved for the header. Using a new destination register not only makes sure
2564 * we have enough space, but it will make sure the dead code eliminator kills
2565 * the instruction that this will replace.
2566 */
2567 if (tex_inst->header_size != 0)
2568 return true;
2569
2570 fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
2571 load_payload->sources + 1);
2572 fs_reg *new_sources =
2573 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2574
2575 new_sources[0] = fs_reg();
2576 for (int i = 0; i < load_payload->sources; i++)
2577 new_sources[i+1] = load_payload->src[i];
2578
2579 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2580 * requires a lot of information about the sources to appropriately figure
2581 * out the number of registers needed to be used. Given this stage in our
2582 * optimization, we may not have the appropriate GRFs required by
2583 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2584 * manually emit the instruction.
2585 */
2586 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2587 load_payload->exec_size,
2588 send_header,
2589 new_sources,
2590 load_payload->sources + 1);
2591
2592 new_load_payload->regs_written = load_payload->regs_written + 1;
2593 new_load_payload->header_size = 1;
2594 tex_inst->mlen++;
2595 tex_inst->header_size = 1;
2596 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2597 tex_inst->src[0] = send_header;
2598
2599 return true;
2600 }
2601
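/**
 * Give each complete, unconditional redefinition of a VGRF outside of
 * control flow a fresh register number, so that the old and new values get
 * separate live intervals (a lightweight form of register renaming).
 */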
2602 bool
2603 fs_visitor::opt_register_renaming()
2604 {
2605 bool progress = false;
2606 int depth = 0;
2607
2608 int remap[alloc.count];
2609 memset(remap, -1, sizeof(int) * alloc.count);
2610
2611 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2612 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2613 depth++;
2614 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2615 inst->opcode == BRW_OPCODE_WHILE) {
2616 depth--;
2617 }
2618
2619 /* Rewrite instruction sources. */
2620 for (int i = 0; i < inst->sources; i++) {
2621 if (inst->src[i].file == GRF &&
2622 remap[inst->src[i].reg] != -1 &&
2623 remap[inst->src[i].reg] != inst->src[i].reg) {
2624 inst->src[i].reg = remap[inst->src[i].reg];
2625 progress = true;
2626 }
2627 }
2628
2629 const int dst = inst->dst.reg;
2630
2631 if (depth == 0 &&
2632 inst->dst.file == GRF &&
2633 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2634 !inst->is_partial_write()) {
2635 if (remap[dst] == -1) {
2636 remap[dst] = dst;
2637 } else {
2638 remap[dst] = alloc.allocate(inst->dst.width / 8);
2639 inst->dst.reg = remap[dst];
2640 progress = true;
2641 }
2642 } else if (inst->dst.file == GRF &&
2643 remap[dst] != -1 &&
2644 remap[dst] != dst) {
2645 inst->dst.reg = remap[dst];
2646 progress = true;
2647 }
2648 }
2649
2650 if (progress) {
2651 invalidate_live_intervals();
2652
2653 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2654 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2655 delta_xy[i].reg = remap[delta_xy[i].reg];
2656 }
2657 }
2658 }
2659
2660 return progress;
2661 }
2662
2663 /**
2664 * Remove redundant or useless discard jumps.
2665 *
2666 * For example, we can eliminate jumps in the following sequence:
2667 *
2668 * discard-jump (redundant with the next jump)
2669 * discard-jump (useless; jumps to the next instruction)
2670 * placeholder-halt
2671 */
2672 bool
2673 fs_visitor::opt_redundant_discard_jumps()
2674 {
2675 bool progress = false;
2676
2677 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2678
2679 fs_inst *placeholder_halt = NULL;
2680 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2681 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2682 placeholder_halt = inst;
2683 break;
2684 }
2685 }
2686
2687 if (!placeholder_halt)
2688 return false;
2689
2690 /* Delete any discard jumps (HALTs) immediately before the placeholder halt. */
2691 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2692 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2693 prev = (fs_inst *) placeholder_halt->prev) {
2694 prev->remove(last_bblock);
2695 progress = true;
2696 }
2697
2698 if (progress)
2699 invalidate_live_intervals();
2700
2701 return progress;
2702 }
2703
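/**
 * Try to turn "compute into a GRF, then MOV the GRF into an MRF" into the
 * computation writing the MRF directly, eliminating the copy (only relevant
 * on Gen4-6, since Gen7+ has no MRFs).
 */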
2704 bool
2705 fs_visitor::compute_to_mrf()
2706 {
2707 bool progress = false;
2708 int next_ip = 0;
2709
2710 /* No MRFs on Gen >= 7. */
2711 if (devinfo->gen >= 7)
2712 return false;
2713
2714 calculate_live_intervals();
2715
2716 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2717 int ip = next_ip;
2718 next_ip++;
2719
2720 if (inst->opcode != BRW_OPCODE_MOV ||
2721 inst->is_partial_write() ||
2722 inst->dst.file != MRF || inst->src[0].file != GRF ||
2723 inst->dst.type != inst->src[0].type ||
2724 inst->src[0].abs || inst->src[0].negate ||
2725 !inst->src[0].is_contiguous() ||
2726 inst->src[0].subreg_offset)
2727 continue;
2728
2729 /* Work out which hardware MRF registers are written by this
2730 * instruction.
2731 */
2732 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2733 int mrf_high;
2734 if (inst->dst.reg & BRW_MRF_COMPR4) {
2735 mrf_high = mrf_low + 4;
2736 } else if (inst->exec_size == 16) {
2737 mrf_high = mrf_low + 1;
2738 } else {
2739 mrf_high = mrf_low;
2740 }
2741
2742 /* Can't compute-to-MRF this GRF if someone else was going to
2743 * read it later.
2744 */
2745 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2746 continue;
2747
2748 /* Found a move of a GRF to a MRF. Let's see if we can go
2749 * rewrite the thing that made this GRF to write into the MRF.
2750 */
2751 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2752 if (scan_inst->dst.file == GRF &&
2753 scan_inst->dst.reg == inst->src[0].reg) {
2754 /* Found the last thing to write our reg we want to turn
2755 * into a compute-to-MRF.
2756 */
2757
2758 /* If this one instruction didn't populate all the
2759 * channels, bail. We might be able to rewrite everything
2760 * that writes that reg, but it would require smarter
2761 * tracking to delay the rewriting until complete success.
2762 */
2763 if (scan_inst->is_partial_write())
2764 break;
2765
2766 /* Things returning more than one register would need us to
2767 * understand coalescing out more than one MOV at a time.
2768 */
2769 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2770 break;
2771
2772 /* SEND instructions can't have MRF as a destination. */
2773 if (scan_inst->mlen)
2774 break;
2775
2776 if (devinfo->gen == 6) {
2777 /* gen6 math instructions must have the destination be
2778 * GRF, so no compute-to-MRF for them.
2779 */
2780 if (scan_inst->is_math()) {
2781 break;
2782 }
2783 }
2784
2785 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2786 /* Found the creator of our MRF's source value. */
2787 scan_inst->dst.file = MRF;
2788 scan_inst->dst.reg = inst->dst.reg;
2789 scan_inst->saturate |= inst->saturate;
2790 inst->remove(block);
2791 progress = true;
2792 }
2793 break;
2794 }
2795
2796 /* We don't handle control flow here. Most computation of
2797 * values that end up in MRFs are shortly before the MRF
2798 * write anyway.
2799 */
2800 if (block->start() == scan_inst)
2801 break;
2802
2803 /* You can't read from an MRF, so if someone else reads our
2804 * MRF's source GRF that we wanted to rewrite, that stops us.
2805 */
2806 bool interfered = false;
2807 for (int i = 0; i < scan_inst->sources; i++) {
2808 if (scan_inst->src[i].file == GRF &&
2809 scan_inst->src[i].reg == inst->src[0].reg &&
2810 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2811 interfered = true;
2812 }
2813 }
2814 if (interfered)
2815 break;
2816
2817 if (scan_inst->dst.file == MRF) {
2818 /* If somebody else writes our MRF here, we can't
2819 * compute-to-MRF before that.
2820 */
2821 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2822 int scan_mrf_high;
2823
2824 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2825 scan_mrf_high = scan_mrf_low + 4;
2826 } else if (scan_inst->exec_size == 16) {
2827 scan_mrf_high = scan_mrf_low + 1;
2828 } else {
2829 scan_mrf_high = scan_mrf_low;
2830 }
2831
2832 if (mrf_low == scan_mrf_low ||
2833 mrf_low == scan_mrf_high ||
2834 mrf_high == scan_mrf_low ||
2835 mrf_high == scan_mrf_high) {
2836 break;
2837 }
2838 }
2839
2840 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2841 /* Found a SEND instruction, which means that there are
2842 * live values in MRFs from base_mrf to base_mrf +
2843 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2844 * above it.
2845 */
2846 if (mrf_low >= scan_inst->base_mrf &&
2847 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2848 break;
2849 }
2850 if (mrf_high >= scan_inst->base_mrf &&
2851 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2852 break;
2853 }
2854 }
2855 }
2856 }
2857
2858 if (progress)
2859 invalidate_live_intervals();
2860
2861 return progress;
2862 }
2863
2864 /**
2865 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2866 * flow. We could probably do better here with some form of divergence
2867 * analysis.
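 *
 * Outside of control flow the pass assumes channel 0 is enabled and simply
 * replaces the instruction with "mov dst, 0".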
2868 */
2869 bool
2870 fs_visitor::eliminate_find_live_channel()
2871 {
2872 bool progress = false;
2873 unsigned depth = 0;
2874
2875 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2876 switch (inst->opcode) {
2877 case BRW_OPCODE_IF:
2878 case BRW_OPCODE_DO:
2879 depth++;
2880 break;
2881
2882 case BRW_OPCODE_ENDIF:
2883 case BRW_OPCODE_WHILE:
2884 depth--;
2885 break;
2886
2887 case FS_OPCODE_DISCARD_JUMP:
2888 /* This can potentially make control flow non-uniform until the end
2889 * of the program.
2890 */
2891 return progress;
2892
2893 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2894 if (depth == 0) {
2895 inst->opcode = BRW_OPCODE_MOV;
2896 inst->src[0] = fs_reg(0);
2897 inst->sources = 1;
2898 inst->force_writemask_all = true;
2899 progress = true;
2900 }
2901 break;
2902
2903 default:
2904 break;
2905 }
2906 }
2907
2908 return progress;
2909 }
2910
2911 /**
2912 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2913 * instructions to FS_OPCODE_REP_FB_WRITE.
2914 */
2915 void
2916 fs_visitor::emit_repclear_shader()
2917 {
2918 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2919 int base_mrf = 1;
2920 int color_mrf = base_mrf + 2;
2921
2922 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2923 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2924 mov->force_writemask_all = true;
2925
2926 fs_inst *write;
2927 if (key->nr_color_regions == 1) {
2928 write = emit(FS_OPCODE_REP_FB_WRITE);
2929 write->saturate = key->clamp_fragment_color;
2930 write->base_mrf = color_mrf;
2931 write->target = 0;
2932 write->header_size = 0;
2933 write->mlen = 1;
2934 } else {
2935 assume(key->nr_color_regions > 0);
2936 for (int i = 0; i < key->nr_color_regions; ++i) {
2937 write = emit(FS_OPCODE_REP_FB_WRITE);
2938 write->saturate = key->clamp_fragment_color;
2939 write->base_mrf = base_mrf;
2940 write->target = i;
2941 write->header_size = 2;
2942 write->mlen = 3;
2943 }
2944 }
2945 write->eot = true;
2946
2947 calculate_cfg();
2948
2949 assign_constant_locations();
2950 assign_curb_setup();
2951
2952 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2953 assert(mov->src[0].file == HW_REG);
2954 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2955 }
2956
2957 /**
2958 * Walks through basic blocks, looking for repeated MRF writes and
2959 * removing the later ones.
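 *
 * E.g. (illustrative) two identical "mov m3, vgrf5" writes with no
 * intervening write to m3 or to vgrf5: the second MOV is removed.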
2960 */
2961 bool
2962 fs_visitor::remove_duplicate_mrf_writes()
2963 {
2964 fs_inst *last_mrf_move[16];
2965 bool progress = false;
2966
2967 /* Need to update the MRF tracking for compressed instructions. */
2968 if (dispatch_width == 16)
2969 return false;
2970
2971 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2972
2973 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2974 if (inst->is_control_flow()) {
2975 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2976 }
2977
2978 if (inst->opcode == BRW_OPCODE_MOV &&
2979 inst->dst.file == MRF) {
2980 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2981 if (prev_inst && inst->equals(prev_inst)) {
2982 inst->remove(block);
2983 progress = true;
2984 continue;
2985 }
2986 }
2987
2988 /* Clear out the last-write records for MRFs that were overwritten. */
2989 if (inst->dst.file == MRF) {
2990 last_mrf_move[inst->dst.reg] = NULL;
2991 }
2992
2993 if (inst->mlen > 0 && inst->base_mrf != -1) {
2994 /* Found a SEND instruction, which will include two or fewer
2995 * implied MRF writes. We could do better here.
2996 */
2997 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2998 last_mrf_move[inst->base_mrf + i] = NULL;
2999 }
3000 }
3001
3002 /* Clear out any MRF move records whose sources got overwritten. */
3003 if (inst->dst.file == GRF) {
3004 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3005 if (last_mrf_move[i] &&
3006 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3007 last_mrf_move[i] = NULL;
3008 }
3009 }
3010 }
3011
3012 if (inst->opcode == BRW_OPCODE_MOV &&
3013 inst->dst.file == MRF &&
3014 inst->src[0].file == GRF &&
3015 !inst->is_partial_write()) {
3016 last_mrf_move[inst->dst.reg] = inst;
3017 }
3018 }
3019
3020 if (progress)
3021 invalidate_live_intervals();
3022
3023 return progress;
3024 }
3025
3026 static void
3027 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3028 {
3029 /* Clear the flag for registers that actually got read (as expected). */
3030 for (int i = 0; i < inst->sources; i++) {
3031 int grf;
3032 if (inst->src[i].file == GRF) {
3033 grf = inst->src[i].reg;
3034 } else if (inst->src[i].file == HW_REG &&
3035 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3036 grf = inst->src[i].fixed_hw_reg.nr;
3037 } else {
3038 continue;
3039 }
3040
3041 if (grf >= first_grf &&
3042 grf < first_grf + grf_len) {
3043 deps[grf - first_grf] = false;
3044 if (inst->exec_size == 16)
3045 deps[grf - first_grf + 1] = false;
3046 }
3047 }
3048 }
3049
3050 /**
3051 * Implements this workaround for the original 965:
3052 *
3053 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3054 * check for post destination dependencies on this instruction, software
3055 * must ensure that there is no destination hazard for the case of ‘write
3056 * followed by a posted write’ shown in the following example.
3057 *
3058 * 1. mov r3 0
3059 * 2. send r3.xy <rest of send instruction>
3060 * 3. mov r2 r3
3061 *
3062 * Due to no post-destination dependency check on the ‘send’, the above
3063 * code sequence could have two instructions (1 and 2) in flight at the
3064 * same time that both consider ‘r3’ as the target of their final writes.
3065 */
3066 void
3067 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3068 fs_inst *inst)
3069 {
3070 int write_len = inst->regs_written;
3071 int first_write_grf = inst->dst.reg;
3072 bool needs_dep[BRW_MAX_MRF];
3073 assert(write_len < (int)sizeof(needs_dep) - 1);
3074
3075 memset(needs_dep, false, sizeof(needs_dep));
3076 memset(needs_dep, true, write_len);
3077
3078 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3079
3080 /* Walk backwards looking for writes to registers we're writing which
3081 * aren't read since being written. If we hit the start of the program,
3082 * we assume that there are no outstanding dependencies on entry to the
3083 * program.
3084 */
3085 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3086 /* If we hit control flow, assume that there *are* outstanding
3087 * dependencies, and force their cleanup before our instruction.
3088 */
3089 if (block->start() == scan_inst) {
3090 for (int i = 0; i < write_len; i++) {
3091 if (needs_dep[i])
3092 DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
3093 }
3094 return;
3095 }
3096
3097 /* We insert our reads as late as possible on the assumption that any
3098 * instruction but a MOV that might have left us an outstanding
3099 * dependency has more latency than a MOV.
3100 */
3101 if (scan_inst->dst.file == GRF) {
3102 for (int i = 0; i < scan_inst->regs_written; i++) {
3103 int reg = scan_inst->dst.reg + i;
3104
3105 if (reg >= first_write_grf &&
3106 reg < first_write_grf + write_len &&
3107 needs_dep[reg - first_write_grf]) {
3108 DEP_RESOLVE_MOV(bld.at(block, inst), reg);
3109 needs_dep[reg - first_write_grf] = false;
3110 if (scan_inst->exec_size == 16)
3111 needs_dep[reg - first_write_grf + 1] = false;
3112 }
3113 }
3114 }
3115
3116 /* Clear the flag for registers that actually got read (as expected). */
3117 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3118
3119 /* Continue the loop only if we haven't resolved all the dependencies */
3120 int i;
3121 for (i = 0; i < write_len; i++) {
3122 if (needs_dep[i])
3123 break;
3124 }
3125 if (i == write_len)
3126 return;
3127 }
3128 }
3129
3130 /**
3131 * Implements this workaround for the original 965:
3132 *
3133 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3134 * used as a destination register until after it has been sourced by an
3135 * instruction with a different destination register.
3136 */
3137 void
3138 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3139 {
3140 int write_len = inst->regs_written;
3141 int first_write_grf = inst->dst.reg;
3142 bool needs_dep[BRW_MAX_MRF];
3143 assert(write_len < (int)sizeof(needs_dep) - 1);
3144
3145 memset(needs_dep, false, sizeof(needs_dep));
3146 memset(needs_dep, true, write_len);
3147 /* Walk forwards looking for writes to registers we're writing which aren't
3148 * read before being written.
3149 */
3150 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3151 /* If we hit control flow, force resolve all remaining dependencies. */
3152 if (block->end() == scan_inst) {
3153 for (int i = 0; i < write_len; i++) {
3154 if (needs_dep[i])
3155 DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
3156 }
3157 return;
3158 }
3159
3160 /* Clear the flag for registers that actually got read (as expected). */
3161 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3162
3163 /* We insert our reads as late as possible since they're reading the
3164 * result of a SEND, which has massive latency.
3165 */
3166 if (scan_inst->dst.file == GRF &&
3167 scan_inst->dst.reg >= first_write_grf &&
3168 scan_inst->dst.reg < first_write_grf + write_len &&
3169 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3170 DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
3171 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3172 }
3173
3174 /* Continue the loop only if we haven't resolved all the dependencies */
3175 int i;
3176 for (i = 0; i < write_len; i++) {
3177 if (needs_dep[i])
3178 break;
3179 }
3180 if (i == write_len)
3181 return;
3182 }
3183 }
3184
3185 void
3186 fs_visitor::insert_gen4_send_dependency_workarounds()
3187 {
3188 if (devinfo->gen != 4 || devinfo->is_g4x)
3189 return;
3190
3191 bool progress = false;
3192
3193 /* Note that we're done with register allocation, so GRF fs_regs always
3194 * have a .reg_offset of 0.
3195 */
3196
3197 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3198 if (inst->mlen != 0 && inst->dst.file == GRF) {
3199 insert_gen4_pre_send_dependency_workarounds(block, inst);
3200 insert_gen4_post_send_dependency_workarounds(block, inst);
3201 progress = true;
3202 }
3203 }
3204
3205 if (progress)
3206 invalidate_live_intervals();
3207 }
3208
3209 /**
3210 * Turns the generic expression-style uniform pull constant load instruction
3211 * into a hardware-specific series of instructions for loading a pull
3212 * constant.
3213 *
3214 * The expression style allows the CSE pass before this to optimize out
3215 * repeated loads from the same offset, and gives the pre-register-allocation
3216 * scheduling full flexibility, while the conversion to native instructions
3217 * allows the post-register-allocation scheduler the best information
3218 * possible.
3219 *
3220 * Note that execution masking for setting up pull constant loads is special:
3221 * the channels that need to be written are unrelated to the current execution
3222 * mask, since a later instruction will use one of the result channels as a
3223 * source operand for all 8 or 16 of its channels.
3224 */
3225 void
3226 fs_visitor::lower_uniform_pull_constant_loads()
3227 {
3228 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3229 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3230 continue;
3231
3232 if (devinfo->gen >= 7) {
3233 /* The offset arg before was a vec4-aligned byte offset. We need to
3234 * turn it into a dword offset.
3235 */
3236 fs_reg const_offset_reg = inst->src[1];
3237 assert(const_offset_reg.file == IMM &&
3238 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3239 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3240 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3241
3242 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3243 * Reserve space for the register.
3244 */
3245 if (devinfo->gen >= 9) {
3246 payload.reg_offset++;
3247 alloc.sizes[payload.reg] = 2;
3248 }
3249
3250 /* This is actually going to be a MOV, but since only the first dword
3251 * is accessed, we have a special opcode to do just that one. Note
3252 * that this needs to be an operation that will be considered a def
3253 * by live variable analysis, or register allocation will explode.
3254 */
3255 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3256 8, payload, const_offset_reg);
3257 setup->force_writemask_all = true;
3258
3259 setup->ir = inst->ir;
3260 setup->annotation = inst->annotation;
3261 inst->insert_before(block, setup);
3262
3263 /* Similarly, this will only populate the first 4 channels of the
3264 * result register (since we only use smear values from 0-3), but we
3265 * don't tell the optimizer.
3266 */
3267 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3268 inst->src[1] = payload;
3269
3270 invalidate_live_intervals();
3271 } else {
3272 /* Before register allocation, we didn't tell the scheduler about the
3273 * MRF we use. We know it's safe to use this MRF because nothing
3274 * else does except for register spill/unspill, which generates and
3275 * uses its MRF within a single IR instruction.
3276 */
3277 inst->base_mrf = 14;
3278 inst->mlen = 1;
3279 }
3280 }
3281 }
3282
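/**
 * Expand SHADER_OPCODE_LOAD_PAYLOAD into the series of MOVs that actually
 * assemble the message payload, handling header sources and the COMPR4
 * interleaving used for Gen4-5 framebuffer writes.
 */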
3283 bool
3284 fs_visitor::lower_load_payload()
3285 {
3286 bool progress = false;
3287
3288 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3289 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3290 continue;
3291
3292 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3293 assert(inst->saturate == false);
3294
3295 const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
3296 .exec_all(inst->force_writemask_all)
3297 .at(block, inst);
3298 fs_reg dst = inst->dst;
3299
3300 /* Get rid of COMPR4. We'll add it back in if we need it */
3301 if (dst.file == MRF)
3302 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
3303
3304 dst.width = 8;
3305 for (uint8_t i = 0; i < inst->header_size; i++) {
3306 if (inst->src[i].file != BAD_FILE) {
3307 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3308 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3309 mov_src.width = 8;
3310 ibld.exec_all().MOV(mov_dst, mov_src);
3311 }
3312 dst = offset(dst, 1);
3313 }
3314
3315 dst.width = inst->exec_size;
3316 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
3317 inst->exec_size > 8) {
3318 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3319 * a straightforward copy. Instead, the result of the
3320 * LOAD_PAYLOAD is treated as interleaved and the first four
3321 * non-header sources are unpacked as:
3322 *
3323 * m + 0: r0
3324 * m + 1: g0
3325 * m + 2: b0
3326 * m + 3: a0
3327 * m + 4: r1
3328 * m + 5: g1
3329 * m + 6: b1
3330 * m + 7: a1
3331 *
3332 * This is used for gen <= 5 fb writes.
3333 */
3334 assert(inst->exec_size == 16);
3335 assert(inst->header_size + 4 <= inst->sources);
3336 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3337 if (inst->src[i].file != BAD_FILE) {
3338 if (devinfo->has_compr4) {
3339 fs_reg compr4_dst = retype(dst, inst->src[i].type);
3340 compr4_dst.reg |= BRW_MRF_COMPR4;
3341 ibld.MOV(compr4_dst, inst->src[i]);
3342 } else {
3343 /* Platform doesn't have COMPR4. We have to fake it */
3344 fs_reg mov_dst = retype(dst, inst->src[i].type);
3345 mov_dst.width = 8;
3346 ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
3347 ibld.half(1).MOV(offset(mov_dst, 4), half(inst->src[i], 1));
3348 }
3349 }
3350
3351 dst.reg++;
3352 }
3353
3354 /* The loop above only ever incremented us through the first set
3355 * of 4 registers. However, thanks to the magic of COMPR4, we
3356 * actually wrote to the first 8 registers, so we need to take
3357 * that into account now.
3358 */
3359 dst.reg += 4;
3360
3361 /* The COMPR4 code took care of the first 4 sources. We'll let
3362 * the regular path handle any remaining sources. Yes, we are
3363 * modifying the instruction but we're about to delete it so
3364 * this really doesn't hurt anything.
3365 */
3366 inst->header_size += 4;
3367 }
3368
3369 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3370 if (inst->src[i].file != BAD_FILE)
3371 ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
3372 dst = offset(dst, 1);
3373 }
3374
3375 inst->remove(block);
3376 progress = true;
3377 }
3378
3379 if (progress)
3380 invalidate_live_intervals();
3381
3382 return progress;
3383 }
3384
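/**
 * Lower 32-bit x 32-bit integer MULs that the hardware can't do in a single
 * instruction into either a single 16-bit-source MUL (when multiplying by a
 * small immediate) or the partial-product sequence described in the
 * comments below.
 */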
3385 bool
3386 fs_visitor::lower_integer_multiplication()
3387 {
3388 bool progress = false;
3389
3390 /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3391 * directly, but Cherryview cannot.
3392 */
3393 if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3394 return false;
3395
3396 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3397 if (inst->opcode != BRW_OPCODE_MUL ||
3398 inst->dst.is_accumulator() ||
3399 (inst->dst.type != BRW_REGISTER_TYPE_D &&
3400 inst->dst.type != BRW_REGISTER_TYPE_UD))
3401 continue;
3402
3403 const fs_builder ibld = bld.at(block, inst);
3404
3405 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3406 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3407 * src1 are used.
3408 *
3409 * If multiplying by an immediate value that fits in 16-bits, do a
3410 * single MUL instruction with that value in the proper location.
3411 */
3412 if (inst->src[1].file == IMM &&
3413 inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3414 if (devinfo->gen < 7) {
3415 fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3416 inst->dst.type, dispatch_width);
3417 ibld.MOV(imm, inst->src[1]);
3418 ibld.MUL(inst->dst, imm, inst->src[0]);
3419 } else {
3420 ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
3421 }
3422 } else {
3423 /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3424 * do 32-bit integer multiplication in one instruction, but instead
3425 * must do a sequence (which actually calculates a 64-bit result):
3426 *
3427 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3428 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3429 * mov(8) g2<1>D acc0<8,8,1>D
3430 *
3431 * But on Gen > 6, the ability to use second accumulator register
3432 * (acc1) for non-float data types was removed, preventing a simple
3433 * implementation in SIMD16. A 16-channel result can be calculated by
3434 * executing the three instructions twice in SIMD8, once with quarter
3435 * control of 1Q for the first eight channels and again with 2Q for
3436 * the second eight channels.
3437 *
3438 * Which accumulator register is implicitly accessed (by AccWrEnable
3439 * for instance) is determined by the quarter control. Unfortunately
3440 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3441 * implicit accumulator access by an instruction with 2Q will access
3442 * acc1 regardless of whether the data type is usable in acc1.
3443 *
3444 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3445 * integer data types.
3446 *
3447 * Since we only want the low 32-bits of the result, we can do two
3448 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3449 * adjust the high result and add them (like the mach is doing):
3450 *
3451 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3452 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3453 * shl(8) g9<1>D g8<8,8,1>D 16D
3454 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3455 *
3456 * We avoid the shl instruction by realizing that we only want to add
3457 * the low 16-bits of the "high" result to the high 16-bits of the
3458 * "low" result and using proper regioning on the add:
3459 *
3460 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3461 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3462 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3463 *
3464 * Since it does not use the (single) accumulator register, we can
3465 * schedule multi-component multiplications much better.
3466 */
3467
3468 if (inst->conditional_mod && inst->dst.is_null()) {
3469 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3470 inst->dst.type, dispatch_width);
3471 }
3472 fs_reg low = inst->dst;
3473 fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3474 inst->dst.type, dispatch_width);
3475
3476 if (devinfo->gen >= 7) {
3477 fs_reg src1_0_w = inst->src[1];
3478 fs_reg src1_1_w = inst->src[1];
3479
3480 if (inst->src[1].file == IMM) {
3481 src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3482 src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3483 } else {
3484 src1_0_w.type = BRW_REGISTER_TYPE_UW;
3485 src1_0_w.stride = 2;
3486
3487 src1_1_w.type = BRW_REGISTER_TYPE_UW;
3488 src1_1_w.stride = 2;
3489 src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3490 }
3491 ibld.MUL(low, inst->src[0], src1_0_w);
3492 ibld.MUL(high, inst->src[0], src1_1_w);
3493 } else {
3494 fs_reg src0_0_w = inst->src[0];
3495 fs_reg src0_1_w = inst->src[0];
3496
3497 src0_0_w.type = BRW_REGISTER_TYPE_UW;
3498 src0_0_w.stride = 2;
3499
3500 src0_1_w.type = BRW_REGISTER_TYPE_UW;
3501 src0_1_w.stride = 2;
3502 src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3503
3504 ibld.MUL(low, src0_0_w, inst->src[1]);
3505 ibld.MUL(high, src0_1_w, inst->src[1]);
3506 }
3507
3508 fs_reg dst = inst->dst;
3509 dst.type = BRW_REGISTER_TYPE_UW;
3510 dst.subreg_offset = 2;
3511 dst.stride = 2;
3512
3513 high.type = BRW_REGISTER_TYPE_UW;
3514 high.stride = 2;
3515
3516 low.type = BRW_REGISTER_TYPE_UW;
3517 low.subreg_offset = 2;
3518 low.stride = 2;
3519
3520 ibld.ADD(dst, low, high);
3521
3522 if (inst->conditional_mod) {
3523 fs_reg null(retype(brw_null_reg(), inst->dst.type));
3524 set_condmod(inst->conditional_mod,
3525 ibld.MOV(null, inst->dst));
3526 }
3527 }
3528
3529 inst->remove(block);
3530 progress = true;
3531 }
3532
3533 if (progress)
3534 invalidate_live_intervals();
3535
3536 return progress;
3537 }
3538
3539 void
3540 fs_visitor::dump_instructions()
3541 {
3542 dump_instructions(NULL);
3543 }
3544
3545 void
3546 fs_visitor::dump_instructions(const char *name)
3547 {
3548 FILE *file = stderr;
3549 if (name && geteuid() != 0) {
3550 file = fopen(name, "w");
3551 if (!file)
3552 file = stderr;
3553 }
3554
3555 if (cfg) {
3556 calculate_register_pressure();
3557 int ip = 0, max_pressure = 0;
3558 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3559 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3560 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3561 dump_instruction(inst, file);
3562 ip++;
3563 }
3564 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3565 } else {
3566 int ip = 0;
3567 foreach_in_list(backend_instruction, inst, &instructions) {
3568 fprintf(file, "%4d: ", ip++);
3569 dump_instruction(inst, file);
3570 }
3571 }
3572
3573 if (file != stderr) {
3574 fclose(file);
3575 }
3576 }
3577
3578 void
3579 fs_visitor::dump_instruction(backend_instruction *be_inst)
3580 {
3581 dump_instruction(be_inst, stderr);
3582 }
3583
3584 void
3585 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3586 {
3587 fs_inst *inst = (fs_inst *)be_inst;
3588
3589 if (inst->predicate) {
3590 fprintf(file, "(%cf0.%d) ",
3591 inst->predicate_inverse ? '-' : '+',
3592 inst->flag_subreg);
3593 }
3594
3595 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3596 if (inst->saturate)
3597 fprintf(file, ".sat");
3598 if (inst->conditional_mod) {
3599 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3600 if (!inst->predicate &&
3601 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3602 inst->opcode != BRW_OPCODE_IF &&
3603 inst->opcode != BRW_OPCODE_WHILE))) {
3604 fprintf(file, ".f0.%d", inst->flag_subreg);
3605 }
3606 }
3607 fprintf(file, "(%d) ", inst->exec_size);
3608
3609 if (inst->mlen) {
3610 fprintf(file, "(mlen: %d) ", inst->mlen);
3611 }
3612
3613 switch (inst->dst.file) {
3614 case GRF:
3615 fprintf(file, "vgrf%d", inst->dst.reg);
3616 if (inst->dst.width != dispatch_width)
3617 fprintf(file, "@%d", inst->dst.width);
3618 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3619 inst->dst.subreg_offset)
3620 fprintf(file, "+%d.%d",
3621 inst->dst.reg_offset, inst->dst.subreg_offset);
3622 break;
3623 case MRF:
3624 fprintf(file, "m%d", inst->dst.reg);
3625 break;
3626 case BAD_FILE:
3627 fprintf(file, "(null)");
3628 break;
3629 case UNIFORM:
3630 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3631 break;
3632 case ATTR:
3633 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3634 break;
3635 case HW_REG:
3636 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3637 switch (inst->dst.fixed_hw_reg.nr) {
3638 case BRW_ARF_NULL:
3639 fprintf(file, "null");
3640 break;
3641 case BRW_ARF_ADDRESS:
3642 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3643 break;
3644 case BRW_ARF_ACCUMULATOR:
3645 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3646 break;
3647 case BRW_ARF_FLAG:
3648 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3649 inst->dst.fixed_hw_reg.subnr);
3650 break;
3651 default:
3652 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3653 inst->dst.fixed_hw_reg.subnr);
3654 break;
3655 }
3656 } else {
3657 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3658 }
3659 if (inst->dst.fixed_hw_reg.subnr)
3660 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3661 break;
3662 default:
3663 fprintf(file, "???");
3664 break;
3665 }
3666 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3667
3668 for (int i = 0; i < inst->sources; i++) {
3669 if (inst->src[i].negate)
3670 fprintf(file, "-");
3671 if (inst->src[i].abs)
3672 fprintf(file, "|");
3673 switch (inst->src[i].file) {
3674 case GRF:
3675 fprintf(file, "vgrf%d", inst->src[i].reg);
3676 if (inst->src[i].width != dispatch_width)
3677 fprintf(file, "@%d", inst->src[i].width);
3678 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3679 inst->src[i].subreg_offset)
3680 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3681 inst->src[i].subreg_offset);
3682 break;
3683 case MRF:
3684 fprintf(file, "***m%d***", inst->src[i].reg);
3685 break;
3686 case ATTR:
3687 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3688 break;
3689 case UNIFORM:
3690 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3691 if (inst->src[i].reladdr) {
3692 fprintf(file, "+reladdr");
3693 } else if (inst->src[i].subreg_offset) {
3694 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3695 inst->src[i].subreg_offset);
3696 }
3697 break;
3698 case BAD_FILE:
3699 fprintf(file, "(null)");
3700 break;
3701 case IMM:
3702 switch (inst->src[i].type) {
3703 case BRW_REGISTER_TYPE_F:
3704 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3705 break;
3706 case BRW_REGISTER_TYPE_W:
3707 case BRW_REGISTER_TYPE_D:
3708 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3709 break;
3710 case BRW_REGISTER_TYPE_UW:
3711 case BRW_REGISTER_TYPE_UD:
3712 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3713 break;
3714 case BRW_REGISTER_TYPE_VF:
3715 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3716 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3717 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3718 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3719 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3720 break;
3721 default:
3722 fprintf(file, "???");
3723 break;
3724 }
3725 break;
3726 case HW_REG:
3727 if (inst->src[i].fixed_hw_reg.negate)
3728 fprintf(file, "-");
3729 if (inst->src[i].fixed_hw_reg.abs)
3730 fprintf(file, "|");
3731 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3732 switch (inst->src[i].fixed_hw_reg.nr) {
3733 case BRW_ARF_NULL:
3734 fprintf(file, "null");
3735 break;
3736 case BRW_ARF_ADDRESS:
3737 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3738 break;
3739 case BRW_ARF_ACCUMULATOR:
3740 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3741 break;
3742 case BRW_ARF_FLAG:
3743 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3744 inst->src[i].fixed_hw_reg.subnr);
3745 break;
3746 default:
3747 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3748 inst->src[i].fixed_hw_reg.subnr);
3749 break;
3750 }
3751 } else {
3752 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3753 }
3754 if (inst->src[i].fixed_hw_reg.subnr)
3755 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3756 if (inst->src[i].fixed_hw_reg.abs)
3757 fprintf(file, "|");
3758 break;
3759 default:
3760 fprintf(file, "???");
3761 break;
3762 }
3763 if (inst->src[i].abs)
3764 fprintf(file, "|");
3765
3766 if (inst->src[i].file != IMM) {
3767 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3768 }
3769
3770 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3771 fprintf(file, ", ");
3772 }
3773
3774 fprintf(file, " ");
3775
3776 if (dispatch_width == 16 && inst->exec_size == 8) {
3777 if (inst->force_sechalf)
3778 fprintf(file, "2ndhalf ");
3779 else
3780 fprintf(file, "1sthalf ");
3781 }
3782
3783 fprintf(file, "\n");
3784 }
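
/* Illustrative only -- a few lines of dump_instruction() output might look
 * like the following (the register numbers are made up, but the fields appear
 * exactly in the order printed above):
 *
 *   (+f0.1) sel(16) vgrf8:F, vgrf8:F, vgrf9:F
 *   cmp.ge.f0.1(16) null:F, vgrf12:F, 0.000000f
 *   mov(8) vgrf7:F, u3:F 2ndhalf
 */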
3785
3786 /**
3787 * Possibly returns an instruction that set up @param reg.
3788 *
3789 * Sometimes we want to take the result of some expression/variable
3790 * dereference tree and rewrite the instruction generating the result
3791 * of the tree. When processing the tree, we know that the
3792 * instructions generated are all writing temporaries that are dead
3793 * outside of this tree. So, if we have some instructions that write
3794 * a temporary, we're free to point that temp write somewhere else.
3795 *
3796 * Note that this doesn't guarantee that the returned instruction wrote
3797 * only @param reg -- it might be the size=4 destination of a texture instruction.
3798 */
3799 fs_inst *
3800 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3801 fs_inst *end,
3802 const fs_reg &reg)
3803 {
3804 if (end == start ||
3805 end->is_partial_write() ||
3806 reg.reladdr ||
3807 !reg.equals(end->dst)) {
3808 return NULL;
3809 } else {
3810 return end;
3811 }
3812 }
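
/* Hypothetical usage sketch (not taken from any specific caller): after
 * emitting the instructions for an expression into a temporary, a caller can
 * retarget the final write instead of emitting an extra MOV:
 *
 *   fs_inst *write = get_instruction_generating_reg(start, end, temp);
 *   if (write)
 *      write->dst = real_dst;            // rewrite the dead temporary write
 *   else
 *      ...emit a MOV from temp to real_dst instead...
 */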
3813
3814 void
3815 fs_visitor::setup_payload_gen6()
3816 {
3817 bool uses_depth =
3818 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3819 unsigned barycentric_interp_modes =
3820 (stage == MESA_SHADER_FRAGMENT) ?
3821 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3822
3823 assert(devinfo->gen >= 6);
3824
3825 /* R0-1: masks, pixel X/Y coordinates. */
3826 payload.num_regs = 2;
3827 /* R2: only for 32-pixel dispatch. */
3828
3829 /* R3-26: barycentric interpolation coordinates. These appear in the
3830 * same order that they appear in the brw_wm_barycentric_interp_mode
3831 * enum. Each set of coordinates occupies 2 registers if dispatch width
3832 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3833 * appear if they were enabled using the "Barycentric Interpolation
3834 * Mode" bits in WM_STATE.
3835 */
3836 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3837 if (barycentric_interp_modes & (1 << i)) {
3838 payload.barycentric_coord_reg[i] = payload.num_regs;
3839 payload.num_regs += 2;
3840 if (dispatch_width == 16) {
3841 payload.num_regs += 2;
3842 }
3843 }
3844 }
3845
3846 /* R27: interpolated depth if uses source depth */
3847 if (uses_depth) {
3848 payload.source_depth_reg = payload.num_regs;
3849 payload.num_regs++;
3850 if (dispatch_width == 16) {
3851 /* R28: interpolated depth if not SIMD8. */
3852 payload.num_regs++;
3853 }
3854 }
3855 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3856 if (uses_depth) {
3857 payload.source_w_reg = payload.num_regs;
3858 payload.num_regs++;
3859 if (dispatch_width == 16) {
3860 /* R30: interpolated W if not SIMD8. */
3861 payload.num_regs++;
3862 }
3863 }
3864
3865 if (stage == MESA_SHADER_FRAGMENT) {
3866 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3867 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3868 prog_data->uses_pos_offset = key->compute_pos_offset;
3869 /* R31: MSAA position offsets. */
3870 if (prog_data->uses_pos_offset) {
3871 payload.sample_pos_reg = payload.num_regs;
3872 payload.num_regs++;
3873 }
3874 }
3875
3876 /* R32: MSAA input coverage mask */
3877 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3878 assert(devinfo->gen >= 7);
3879 payload.sample_mask_in_reg = payload.num_regs;
3880 payload.num_regs++;
3881 if (dispatch_width == 16) {
3882 /* R33: input coverage mask if not SIMD8. */
3883 payload.num_regs++;
3884 }
3885 }
3886
3887 /* R34-: bary for 32-pixel. */
3888 /* R58-59: interp W for 32-pixel. */
3889
3890 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3891 source_depth_to_render_target = true;
3892 }
3893 }
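
/* Worked example (hypothetical shader): a SIMD16 program using only
 * perspective pixel barycentrics plus source depth and W would be laid out as
 *
 *   R0-R1  masks, pixel X/Y coordinates      (payload.num_regs starts at 2)
 *   R2-R5  perspective pixel barycentrics    (4 registers in SIMD16)
 *   R6-R7  interpolated source depth
 *   R8-R9  interpolated source W
 *
 * leaving payload.num_regs == 10 when this function returns.
 */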
3894
3895 void
3896 fs_visitor::setup_vs_payload()
3897 {
3898 /* R0: thread header, R1: urb handles */
3899 payload.num_regs = 2;
3900 }
3901
3902 void
3903 fs_visitor::setup_cs_payload()
3904 {
3905 assert(brw->gen >= 7);
3906
3907 payload.num_regs = 1;
3908 }
3909
3910 void
3911 fs_visitor::assign_binding_table_offsets()
3912 {
3913 assert(stage == MESA_SHADER_FRAGMENT);
3914 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3915 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3916 uint32_t next_binding_table_offset = 0;
3917
3918 /* If there are no color regions, we still perform an FB write to a null
3919 * renderbuffer, which we place at surface index 0.
3920 */
3921 prog_data->binding_table.render_target_start = next_binding_table_offset;
3922 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3923
3924 assign_common_binding_table_offsets(next_binding_table_offset);
3925 }
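
/* For example (hypothetical key): with key->nr_color_regions == 2 the render
 * targets occupy surface indices 0 and 1 and the common entries laid out by
 * assign_common_binding_table_offsets() (textures, pull constants, etc.)
 * start at index 2; with no color regions the null render target still
 * claims index 0.
 */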
3926
3927 void
3928 fs_visitor::calculate_register_pressure()
3929 {
3930 invalidate_live_intervals();
3931 calculate_live_intervals();
3932
3933 unsigned num_instructions = 0;
3934 foreach_block(block, cfg)
3935 num_instructions += block->instructions.length();
3936
3937 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3938
3939 for (unsigned reg = 0; reg < alloc.count; reg++) {
3940 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3941 regs_live_at_ip[ip] += alloc.sizes[reg];
3942 }
3943 }
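
/* Rough sketch of the result (hypothetical numbers): a virtual GRF of size 2
 * that is live from ip 10 through ip 14 adds 2 to each of
 * regs_live_at_ip[10]..regs_live_at_ip[14]; dump_instructions() prints these
 * per-ip sums in the "{%3d}" column next to each instruction.
 */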
3944
3945 void
3946 fs_visitor::optimize()
3947 {
3948 /* bld is the common builder object pointing at the end of the program we
3949 * used to translate it into i965 IR. For the optimization and lowering
3950 * passes coming next, any code added after the end of the program without
3951 * having explicitly called fs_builder::at() clearly points at a mistake.
3952 * Ideally optimization passes wouldn't be part of the visitor so they
3953 * wouldn't have access to bld at all, but they do, so in case some pass
3954 * forgets to ask for a location explicitly, set it to NULL here to make
3955 * it trip.
3956 */
3957 bld = bld.at(NULL, NULL);
3958
3959 split_virtual_grfs();
3960
3961 move_uniform_array_access_to_pull_constants();
3962 assign_constant_locations();
3963 demote_pull_constants();
3964
3965 #define OPT(pass, args...) ({ \
3966 pass_num++; \
3967 bool this_progress = pass(args); \
3968 \
3969 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3970 char filename[64]; \
3971 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3972 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3973 \
3974 backend_shader::dump_instructions(filename); \
3975 } \
3976 \
3977 progress = progress || this_progress; \
3978 this_progress; \
3979 })
3980
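/* With the optimizer debug flag set, each pass that makes progress dumps the
 * IR to a file named by the snprintf() above; a hypothetical example would be
 * "FS16-0003-01-05-opt_cse" for a SIMD16 fragment shader of program 3,
 * iteration 1, pass 5 (the "-00-start" dump below uses the same prefix).
 */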
3981 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3982 char filename[64];
3983 snprintf(filename, 64, "%s%d-%04d-00-start",
3984 stage_abbrev, dispatch_width,
3985 shader_prog ? shader_prog->Name : 0);
3986
3987 backend_shader::dump_instructions(filename);
3988 }
3989
3990 bool progress;
3991 int iteration = 0;
3992 int pass_num = 0;
3993 do {
3994 progress = false;
3995 pass_num = 0;
3996 iteration++;
3997
3998 OPT(remove_duplicate_mrf_writes);
3999
4000 OPT(opt_algebraic);
4001 OPT(opt_cse);
4002 OPT(opt_copy_propagate);
4003 OPT(opt_peephole_predicated_break);
4004 OPT(opt_cmod_propagation);
4005 OPT(dead_code_eliminate);
4006 OPT(opt_peephole_sel);
4007 OPT(dead_control_flow_eliminate, this);
4008 OPT(opt_register_renaming);
4009 OPT(opt_redundant_discard_jumps);
4010 OPT(opt_saturate_propagation);
4011 OPT(opt_zero_samples);
4012 OPT(register_coalesce);
4013 OPT(compute_to_mrf);
4014 OPT(eliminate_find_live_channel);
4015
4016 OPT(compact_virtual_grfs);
4017 } while (progress);
4018
4019 pass_num = 0;
4020
4021 OPT(opt_sampler_eot);
4022
4023 if (OPT(lower_load_payload)) {
4024 split_virtual_grfs();
4025 OPT(register_coalesce);
4026 OPT(compute_to_mrf);
4027 OPT(dead_code_eliminate);
4028 }
4029
4030 OPT(opt_combine_constants);
4031 OPT(lower_integer_multiplication);
4032
4033 lower_uniform_pull_constant_loads();
4034 }
4035
4036 /**
4037 * Three-source instructions must have a GRF/MRF destination register.
4038 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
4039 */
4040 void
4041 fs_visitor::fixup_3src_null_dest()
4042 {
4043 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4044 if (inst->is_3src() && inst->dst.is_null()) {
4045 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4046 inst->dst.type);
4047 }
4048 }
4049 }
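
/* For instance (hypothetical case), a MAD whose result became dead but which
 * is still needed for its conditional-modifier flag write may have had its
 * destination nulled out; it gets a freshly allocated vgrf here so the
 * three-source restriction is satisfied without changing its behavior.
 */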
4050
4051 void
4052 fs_visitor::allocate_registers()
4053 {
4054 bool allocated_without_spills;
4055
4056 static const enum instruction_scheduler_mode pre_modes[] = {
4057 SCHEDULE_PRE,
4058 SCHEDULE_PRE_NON_LIFO,
4059 SCHEDULE_PRE_LIFO,
4060 };
4061
4062 /* Try each scheduling heuristic to see if it can successfully register
4063 * allocate without spilling. They should be ordered by decreasing
4064 * performance but increasing likelihood of allocating.
4065 */
4066 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4067 schedule_instructions(pre_modes[i]);
4068
4069 if (0) {
4070 assign_regs_trivial();
4071 allocated_without_spills = true;
4072 } else {
4073 allocated_without_spills = assign_regs(false);
4074 }
4075 if (allocated_without_spills)
4076 break;
4077 }
4078
4079 if (!allocated_without_spills) {
4080 /* We assume that any spilling is worse than just dropping back to
4081 * SIMD8. There's probably actually some intermediate point where
4082 * SIMD16 with a couple of spills is still better.
4083 */
4084 if (dispatch_width == 16) {
4085 fail("Failure to register allocate. Reduce number of "
4086 "live scalar values to avoid this.");
4087 } else {
4088 perf_debug("%s shader triggered register spilling. "
4089 "Try reducing the number of live scalar values to "
4090 "improve performance.\n", stage_name);
4091 }
4092
4093 /* Since we're out of heuristics, just go spill registers until we
4094 * get an allocation.
4095 */
4096 while (!assign_regs(true)) {
4097 if (failed)
4098 break;
4099 }
4100 }
4101
4102 /* This must come after all optimization and register allocation, since
4103 * it inserts dead code that happens to have side effects, and it does
4104 * so based on the actual physical registers in use.
4105 */
4106 insert_gen4_send_dependency_workarounds();
4107
4108 if (failed)
4109 return;
4110
4111 if (!allocated_without_spills)
4112 schedule_instructions(SCHEDULE_POST);
4113
4114 if (last_scratch > 0)
4115 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4116 }
4117
4118 bool
4119 fs_visitor::run_vs()
4120 {
4121 assert(stage == MESA_SHADER_VERTEX);
4122
4123 assign_common_binding_table_offsets(0);
4124 setup_vs_payload();
4125
4126 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4127 emit_shader_time_begin();
4128
4129 emit_nir_code();
4130
4131 if (failed)
4132 return false;
4133
4134 emit_urb_writes();
4135
4136 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4137 emit_shader_time_end();
4138
4139 calculate_cfg();
4140
4141 optimize();
4142
4143 assign_curb_setup();
4144 assign_vs_urb_setup();
4145
4146 fixup_3src_null_dest();
4147 allocate_registers();
4148
4149 return !failed;
4150 }
4151
4152 bool
4153 fs_visitor::run_fs()
4154 {
4155 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4156 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4157
4158 assert(stage == MESA_SHADER_FRAGMENT);
4159
4160 sanity_param_count = prog->Parameters->NumParameters;
4161
4162 assign_binding_table_offsets();
4163
4164 if (devinfo->gen >= 6)
4165 setup_payload_gen6();
4166 else
4167 setup_payload_gen4();
4168
4169 if (0) {
4170 emit_dummy_fs();
4171 } else if (brw->use_rep_send && dispatch_width == 16) {
4172 emit_repclear_shader();
4173 } else {
4174 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4175 emit_shader_time_begin();
4176
4177 calculate_urb_setup();
4178 if (prog->InputsRead > 0) {
4179 if (devinfo->gen < 6)
4180 emit_interpolation_setup_gen4();
4181 else
4182 emit_interpolation_setup_gen6();
4183 }
4184
4185 /* We handle discards by keeping track of the still-live pixels in f0.1.
4186 * Initialize it with the dispatched pixels.
4187 */
4188 if (wm_prog_data->uses_kill) {
4189 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4190 discard_init->flag_subreg = 1;
4191 }
4192
4193 /* Generate FS IR for main(). (the visitor only descends into
4194 * functions called "main").
4195 */
4196 emit_nir_code();
4197
4198 if (failed)
4199 return false;
4200
4201 if (wm_prog_data->uses_kill)
4202 emit(FS_OPCODE_PLACEHOLDER_HALT);
4203
4204 if (wm_key->alpha_test_func)
4205 emit_alpha_test();
4206
4207 emit_fb_writes();
4208
4209 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4210 emit_shader_time_end();
4211
4212 calculate_cfg();
4213
4214 optimize();
4215
4216 assign_curb_setup();
4217 assign_urb_setup();
4218
4219 fixup_3src_null_dest();
4220 allocate_registers();
4221
4222 if (failed)
4223 return false;
4224 }
4225
4226 if (dispatch_width == 8)
4227 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4228 else
4229 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4230
4231 /* If any state parameters were appended, then ParameterValues could have
4232 * been realloced, in which case the driver uniform storage set up by
4233 * _mesa_associate_uniform_storage() would point to freed memory. Make
4234 * sure that didn't happen.
4235 */
4236 assert(sanity_param_count == prog->Parameters->NumParameters);
4237
4238 return !failed;
4239 }
4240
4241 bool
4242 fs_visitor::run_cs()
4243 {
4244 assert(stage == MESA_SHADER_COMPUTE);
4245 assert(shader);
4246
4247 sanity_param_count = prog->Parameters->NumParameters;
4248
4249 assign_common_binding_table_offsets(0);
4250
4251 setup_cs_payload();
4252
4253 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4254 emit_shader_time_begin();
4255
4256 emit_nir_code();
4257
4258 if (failed)
4259 return false;
4260
4261 emit_cs_terminate();
4262
4263 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4264 emit_shader_time_end();
4265
4266 calculate_cfg();
4267
4268 optimize();
4269
4270 assign_curb_setup();
4271
4272 fixup_3src_null_dest();
4273 allocate_registers();
4274
4275 if (failed)
4276 return false;
4277
4278 /* If any state parameters were appended, then ParameterValues could have
4279 * been realloced, in which case the driver uniform storage set up by
4280 * _mesa_associate_uniform_storage() would point to freed memory. Make
4281 * sure that didn't happen.
4282 */
4283 assert(sanity_param_count == prog->Parameters->NumParameters);
4284
4285 return !failed;
4286 }
4287
4288 const unsigned *
4289 brw_wm_fs_emit(struct brw_context *brw,
4290 void *mem_ctx,
4291 const struct brw_wm_prog_key *key,
4292 struct brw_wm_prog_data *prog_data,
4293 struct gl_fragment_program *fp,
4294 struct gl_shader_program *prog,
4295 unsigned *final_assembly_size)
4296 {
4297 bool start_busy = false;
4298 double start_time = 0;
4299
4300 if (unlikely(brw->perf_debug)) {
4301 start_busy = (brw->batch.last_bo &&
4302 drm_intel_bo_busy(brw->batch.last_bo));
4303 start_time = get_time();
4304 }
4305
4306 struct brw_shader *shader = NULL;
4307 if (prog)
4308 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4309
4310 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4311 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4312
4313 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4314 */
4315 fs_visitor v(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4316 prog, &fp->Base, 8);
4317 if (!v.run_fs()) {
4318 if (prog) {
4319 prog->LinkStatus = false;
4320 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4321 }
4322
4323 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4324 v.fail_msg);
4325
4326 return NULL;
4327 }
4328
4329 cfg_t *simd16_cfg = NULL;
4330 fs_visitor v2(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4331 prog, &fp->Base, 16);
4332 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4333 if (!v.simd16_unsupported) {
4334 /* Try a SIMD16 compile */
4335 v2.import_uniforms(&v);
4336 if (!v2.run_fs()) {
4337 perf_debug("SIMD16 shader failed to compile, falling back to "
4338 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4339 } else {
4340 simd16_cfg = v2.cfg;
4341 }
4342 } else {
4343 perf_debug("SIMD16 shader unsupported, falling back to "
4344 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4345 }
4346 }
4347
4348 cfg_t *simd8_cfg;
4349 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4350 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4351 simd8_cfg = NULL;
4352 prog_data->no_8 = true;
4353 } else {
4354 simd8_cfg = v.cfg;
4355 prog_data->no_8 = false;
4356 }
4357
4358 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4359 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4360
4361 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4362 char *name;
4363 if (prog)
4364 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4365 prog->Label ? prog->Label : "unnamed",
4366 prog->Name);
4367 else
4368 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4369
4370 g.enable_debug(name);
4371 }
4372
4373 if (simd8_cfg)
4374 g.generate_code(simd8_cfg, 8);
4375 if (simd16_cfg)
4376 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4377
4378 if (unlikely(brw->perf_debug) && shader) {
4379 if (shader->compiled_once)
4380 brw_wm_debug_recompile(brw, prog, key);
4381 shader->compiled_once = true;
4382
4383 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4384 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4385 (get_time() - start_time) * 1000);
4386 }
4387 }
4388
4389 return g.get_assembly(final_assembly_size);
4390 }
4391
4392 extern "C" bool
4393 brw_fs_precompile(struct gl_context *ctx,
4394 struct gl_shader_program *shader_prog,
4395 struct gl_program *prog)
4396 {
4397 struct brw_context *brw = brw_context(ctx);
4398 struct brw_wm_prog_key key;
4399
4400 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4401 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4402 bool program_uses_dfdy = fp->UsesDFdy;
4403
4404 memset(&key, 0, sizeof(key));
4405
4406 if (brw->gen < 6) {
4407 if (fp->UsesKill)
4408 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4409
4410 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4411 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4412
4413 /* Just assume depth testing. */
4414 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4415 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4416 }
4417
4418 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4419 BRW_FS_VARYING_INPUT_MASK) > 16)
4420 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4421
4422 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4423
4424 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4425 key.drawable_height = ctx->DrawBuffer->Height;
4426 }
4427
4428 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4429 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4430 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4431
4432 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4433 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4434 key.nr_color_regions > 1;
4435 }
4436
4437 key.program_string_id = bfp->id;
4438
4439 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4440 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4441
4442 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4443
4444 brw->wm.base.prog_offset = old_prog_offset;
4445 brw->wm.prog_data = old_prog_data;
4446
4447 return success;
4448 }
4449
4450 void
4451 brw_setup_tex_for_precompile(struct brw_context *brw,
4452 struct brw_sampler_prog_key_data *tex,
4453 struct gl_program *prog)
4454 {
4455 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4456 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4457 for (unsigned i = 0; i < sampler_count; i++) {
4458 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4459 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4460 tex->swizzles[i] =
4461 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4462 } else {
4463 /* Color sampler: assume no swizzling. */
4464 tex->swizzles[i] = SWIZZLE_XYZW;
4465 }
4466 }
4467 }