i965/fs: Migrate FS framebuffer writes to the IR builder.
mesa.git: src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 using namespace brw;
53
54 void
55 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
56 const fs_reg *src, unsigned sources)
57 {
58 memset(this, 0, sizeof(*this));
59
60 this->src = new fs_reg[MAX2(sources, 3)];
61 for (unsigned i = 0; i < sources; i++)
62 this->src[i] = src[i];
63
64 this->opcode = opcode;
65 this->dst = dst;
66 this->sources = sources;
67 this->exec_size = exec_size;
68
69 assert(dst.file != IMM && dst.file != UNIFORM);
70
71 /* If exec_size == 0, try to guess it from the registers. Since all
72 * manner of things may use hardware registers, we first try to guess
73 * based on GRF registers. If this fails, we will go ahead and take the
74 * width from the destination register.
75 */
76 if (this->exec_size == 0) {
77 if (dst.file == GRF) {
78 this->exec_size = dst.width;
79 } else {
80 for (unsigned i = 0; i < sources; ++i) {
81 if (src[i].file != GRF && src[i].file != ATTR)
82 continue;
83
84 if (this->exec_size <= 1)
85 this->exec_size = src[i].width;
86 assert(src[i].width == 1 || src[i].width == this->exec_size);
87 }
88 }
89
90 if (this->exec_size == 0 && dst.file != BAD_FILE)
91 this->exec_size = dst.width;
92 }
93 assert(this->exec_size != 0);
94
95 this->conditional_mod = BRW_CONDITIONAL_NONE;
96
97 /* This will be the case for almost all instructions. */
98 switch (dst.file) {
99 case GRF:
100 case HW_REG:
101 case MRF:
102 case ATTR:
103 this->regs_written =
104 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
105 break;
106 case BAD_FILE:
107 this->regs_written = 0;
108 break;
109 case IMM:
110 case UNIFORM:
111 unreachable("Invalid destination register file");
112 default:
113 unreachable("Invalid register file");
114 }
115
116 this->writes_accumulator = false;
117 }
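/* Illustrative arithmetic for the regs_written calculation above: an 8-wide
 * float destination with stride 1 covers 8 * 1 * 4 = 32 bytes, i.e. one
 * register, while the same destination at 16-wide covers 64 bytes, i.e. two.
 */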
118
119 fs_inst::fs_inst()
120 {
121 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
122 }
123
124 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
125 {
126 init(opcode, exec_size, reg_undef, NULL, 0);
127 }
128
129 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
130 {
131 init(opcode, 0, dst, NULL, 0);
132 }
133
134 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
135 const fs_reg &src0)
136 {
137 const fs_reg src[1] = { src0 };
138 init(opcode, exec_size, dst, src, 1);
139 }
140
141 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
142 {
143 const fs_reg src[1] = { src0 };
144 init(opcode, 0, dst, src, 1);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
148 const fs_reg &src0, const fs_reg &src1)
149 {
150 const fs_reg src[2] = { src0, src1 };
151 init(opcode, exec_size, dst, src, 2);
152 }
153
154 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
155 const fs_reg &src1)
156 {
157 const fs_reg src[2] = { src0, src1 };
158 init(opcode, 0, dst, src, 2);
159 }
160
161 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
162 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
163 {
164 const fs_reg src[3] = { src0, src1, src2 };
165 init(opcode, exec_size, dst, src, 3);
166 }
167
168 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
169 const fs_reg &src1, const fs_reg &src2)
170 {
171 const fs_reg src[3] = { src0, src1, src2 };
172 init(opcode, 0, dst, src, 3);
173 }
174
175 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
176 const fs_reg src[], unsigned sources)
177 {
178 init(opcode, 0, dst, src, sources);
179 }
180
181 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
182 const fs_reg src[], unsigned sources)
183 {
184 init(opcode, exec_width, dst, src, sources);
185 }
186
187 fs_inst::fs_inst(const fs_inst &that)
188 {
189 memcpy(this, &that, sizeof(that));
190
191 this->src = new fs_reg[MAX2(that.sources, 3)];
192
193 for (unsigned i = 0; i < that.sources; i++)
194 this->src[i] = that.src[i];
195 }
196
197 fs_inst::~fs_inst()
198 {
199 delete[] this->src;
200 }
201
202 void
203 fs_inst::resize_sources(uint8_t num_sources)
204 {
205 if (this->sources != num_sources) {
206 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
207
208 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
209 src[i] = this->src[i];
210
211 delete[] this->src;
212 this->src = src;
213 this->sources = num_sources;
214 }
215 }
216
217 #define ALU1(op) \
218 fs_inst * \
219 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
220 { \
221 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
222 }
223
224 #define ALU2(op) \
225 fs_inst * \
226 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
227 const fs_reg &src1) \
228 { \
229 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
230 }
231
232 #define ALU2_ACC(op) \
233 fs_inst * \
234 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
235 const fs_reg &src1) \
236 { \
237 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
238 inst->writes_accumulator = true; \
239 return inst; \
240 }
241
242 #define ALU3(op) \
243 fs_inst * \
244 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
245 const fs_reg &src1, const fs_reg &src2) \
246 { \
247 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
248 }
249
250 ALU1(NOT)
251 ALU1(MOV)
252 ALU1(FRC)
253 ALU1(RNDD)
254 ALU1(RNDE)
255 ALU1(RNDZ)
256 ALU2(ADD)
257 ALU2(MUL)
258 ALU2_ACC(MACH)
259 ALU2(AND)
260 ALU2(OR)
261 ALU2(XOR)
262 ALU2(SHL)
263 ALU2(SHR)
264 ALU2(ASR)
265 ALU3(LRP)
266 ALU1(BFREV)
267 ALU3(BFE)
268 ALU2(BFI1)
269 ALU3(BFI2)
270 ALU1(FBH)
271 ALU1(FBL)
272 ALU1(CBIT)
273 ALU3(MAD)
274 ALU2_ACC(ADDC)
275 ALU2_ACC(SUBB)
276 ALU2(SEL)
277 ALU2(MAC)
278
279 /** Gen4 predicated IF. */
280 fs_inst *
281 fs_visitor::IF(enum brw_predicate predicate)
282 {
283 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
284 inst->predicate = predicate;
285 return inst;
286 }
287
288 /** Gen6 IF with embedded comparison. */
289 fs_inst *
290 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
291 enum brw_conditional_mod condition)
292 {
293 assert(devinfo->gen == 6);
294 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
295 reg_null_d, src0, src1);
296 inst->conditional_mod = condition;
297 return inst;
298 }
299
300 /**
301 * CMP: Sets the low bit of the destination channels with the result
302 * of the comparison, while the upper bits are undefined, and updates
303 * the flag register with the packed 16 bits of the result.
304 */
305 fs_inst *
306 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
307 enum brw_conditional_mod condition)
308 {
309 fs_inst *inst;
310
311 /* Take the instruction:
312 *
313 * CMP null<d> src0<f> src1<f>
314 *
315 * Original gen4 does type conversion to the destination type before
316 * comparison, producing garbage results for floating point comparisons.
317 *
318 * The destination type doesn't matter on newer generations, so we set the
319 * type to match src0 so we can compact the instruction.
320 */
321 dst.type = src0.type;
322 if (dst.file == HW_REG)
323 dst.fixed_hw_reg.type = dst.type;
324
325 resolve_ud_negate(&src0);
326 resolve_ud_negate(&src1);
327
328 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
329 inst->conditional_mod = condition;
330
331 return inst;
332 }
333
334 fs_inst *
335 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
336 int header_size)
337 {
338 assert(dst.width % 8 == 0);
339 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
340 dst, src, sources);
341 inst->header_size = header_size;
342
343 for (int i = 0; i < header_size; i++)
344 assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
345 inst->regs_written = header_size;
346
347 for (int i = header_size; i < sources; ++i)
348 assert(src[i].file != GRF || src[i].width == dst.width);
349 inst->regs_written += (sources - header_size) * (dst.width / 8);
350
351 return inst;
352 }
353
354 void
355 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
356 const fs_reg &dst,
357 const fs_reg &surf_index,
358 const fs_reg &varying_offset,
359 uint32_t const_offset)
360 {
361 /* We have our constant surface use a pitch of 4 bytes, so our index can
362 * be any component of a vector, and then we load 4 contiguous
363 * components starting from that.
364 *
365 * We break down the const_offset to a portion added to the variable
366 * offset and a portion done using reg_offset, which means that if you
367 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
368 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
369 * CSE can later notice that those loads are all the same and eliminate
370 * the redundant ones.
371 */
372 fs_reg vec4_offset = vgrf(glsl_type::int_type);
373 bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
374
375 int scale = 1;
376 if (devinfo->gen == 4 && dst.width == 8) {
377 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
378 * u, v, r) as parameters, or we can just use the SIMD16 message
379 * consisting of (header, u). We choose the second, at the cost of a
380 * longer return length.
381 */
382 scale = 2;
383 }
384
385 enum opcode op;
386 if (devinfo->gen >= 7)
387 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
388 else
389 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
390
391 assert(dst.width % 8 == 0);
392 int regs_written = 4 * (dst.width / 8) * scale;
393 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
394 dst.type, dst.width);
395 fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
396 inst->regs_written = regs_written;
397
398 if (devinfo->gen < 7) {
399 inst->base_mrf = 13;
400 inst->header_size = 1;
401 if (devinfo->gen == 4)
402 inst->mlen = 3;
403 else
404 inst->mlen = 1 + dispatch_width / 8;
405 }
406
407 bld.MOV(dst, offset(vec4_result, (const_offset & 3) * scale));
408 }
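/* Worked example of the const_offset split above: with const_offset == 6 and
 * scale == 1, vec4_offset becomes varying_offset + 4, and the final MOV reads
 * component (6 & 3) == 2 of vec4_result, so accesses that differ only in the
 * low two bits of const_offset share one load that CSE can merge.
 */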
409
410 /**
411 * A helper for MOV generation for fixing up broken hardware SEND dependency
412 * handling.
413 */
414 void
415 fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
416 {
417 /* The caller always wants uncompressed to emit the minimal extra
418 * dependencies, and to avoid having to deal with aligning its regs to 2.
419 */
420 const fs_builder ubld = bld.annotate("send dependency resolve")
421 .half(0);
422
423 ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
424 }
425
426 bool
427 fs_inst::equals(fs_inst *inst) const
428 {
429 return (opcode == inst->opcode &&
430 dst.equals(inst->dst) &&
431 src[0].equals(inst->src[0]) &&
432 src[1].equals(inst->src[1]) &&
433 src[2].equals(inst->src[2]) &&
434 saturate == inst->saturate &&
435 predicate == inst->predicate &&
436 conditional_mod == inst->conditional_mod &&
437 mlen == inst->mlen &&
438 base_mrf == inst->base_mrf &&
439 target == inst->target &&
440 eot == inst->eot &&
441 header_size == inst->header_size &&
442 shadow_compare == inst->shadow_compare &&
443 exec_size == inst->exec_size &&
444 offset == inst->offset);
445 }
446
447 bool
448 fs_inst::overwrites_reg(const fs_reg &reg) const
449 {
450 return reg.in_range(dst, regs_written);
451 }
452
453 bool
454 fs_inst::is_send_from_grf() const
455 {
456 switch (opcode) {
457 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
458 case SHADER_OPCODE_SHADER_TIME_ADD:
459 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
460 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
461 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
462 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
463 case SHADER_OPCODE_UNTYPED_ATOMIC:
464 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
465 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
466 case SHADER_OPCODE_TYPED_ATOMIC:
467 case SHADER_OPCODE_TYPED_SURFACE_READ:
468 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
469 case SHADER_OPCODE_URB_WRITE_SIMD8:
470 return true;
471 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
472 return src[1].file == GRF;
473 case FS_OPCODE_FB_WRITE:
474 return src[0].file == GRF;
475 default:
476 if (is_tex())
477 return src[0].file == GRF;
478
479 return false;
480 }
481 }
482
483 bool
484 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
485 {
486 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
487 return false;
488
489 fs_reg reg = this->src[0];
490 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
491 return false;
492
493 if (grf_alloc.sizes[reg.reg] != this->regs_written)
494 return false;
495
496 for (int i = 0; i < this->sources; i++) {
497 reg.type = this->src[i].type;
498 reg.width = this->src[i].width;
499 if (!this->src[i].equals(reg))
500 return false;
501 reg = ::offset(reg, 1);
502 }
503
504 return true;
505 }
506
507 bool
508 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
509 {
510 if (devinfo->gen == 6 && is_math())
511 return false;
512
513 if (is_send_from_grf())
514 return false;
515
516 if (!backend_instruction::can_do_source_mods())
517 return false;
518
519 return true;
520 }
521
522 bool
523 fs_inst::has_side_effects() const
524 {
525 return this->eot || backend_instruction::has_side_effects();
526 }
527
528 void
529 fs_reg::init()
530 {
531 memset(this, 0, sizeof(*this));
532 stride = 1;
533 }
534
535 /** Generic unset register constructor. */
536 fs_reg::fs_reg()
537 {
538 init();
539 this->file = BAD_FILE;
540 }
541
542 /** Immediate value constructor. */
543 fs_reg::fs_reg(float f)
544 {
545 init();
546 this->file = IMM;
547 this->type = BRW_REGISTER_TYPE_F;
548 this->fixed_hw_reg.dw1.f = f;
549 this->width = 1;
550 }
551
552 /** Immediate value constructor. */
553 fs_reg::fs_reg(int32_t i)
554 {
555 init();
556 this->file = IMM;
557 this->type = BRW_REGISTER_TYPE_D;
558 this->fixed_hw_reg.dw1.d = i;
559 this->width = 1;
560 }
561
562 /** Immediate value constructor. */
563 fs_reg::fs_reg(uint32_t u)
564 {
565 init();
566 this->file = IMM;
567 this->type = BRW_REGISTER_TYPE_UD;
568 this->fixed_hw_reg.dw1.ud = u;
569 this->width = 1;
570 }
571
572 /** Vector float immediate value constructor. */
573 fs_reg::fs_reg(uint8_t vf[4])
574 {
575 init();
576 this->file = IMM;
577 this->type = BRW_REGISTER_TYPE_VF;
578 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
588 (vf1 << 8) |
589 (vf2 << 16) |
590 (vf3 << 24);
591 }
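/* Example use of the component-wise VF constructor above, assuming the 8-bit
 * restricted-float encoding in which 0x00 is 0.0 and 0x30 is 1.0:
 *
 *    fs_reg unit_x(0x30, 0x00, 0x00, 0x00);   // (1.0, 0.0, 0.0, 0.0)
 */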
592
593 /** Fixed brw_reg. */
594 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
595 {
596 init();
597 this->file = HW_REG;
598 this->fixed_hw_reg = fixed_hw_reg;
599 this->type = fixed_hw_reg.type;
600 this->width = 1 << fixed_hw_reg.width;
601 }
602
603 bool
604 fs_reg::equals(const fs_reg &r) const
605 {
606 return (file == r.file &&
607 reg == r.reg &&
608 reg_offset == r.reg_offset &&
609 subreg_offset == r.subreg_offset &&
610 type == r.type &&
611 negate == r.negate &&
612 abs == r.abs &&
613 !reladdr && !r.reladdr &&
614 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
615 width == r.width &&
616 stride == r.stride);
617 }
618
619 fs_reg &
620 fs_reg::set_smear(unsigned subreg)
621 {
622 assert(file != HW_REG && file != IMM);
623 subreg_offset = subreg * type_sz(type);
624 stride = 0;
625 return *this;
626 }
627
628 bool
629 fs_reg::is_contiguous() const
630 {
631 return stride == 1;
632 }
633
634 int
635 fs_visitor::type_size(const struct glsl_type *type)
636 {
637 unsigned int size, i;
638
639 switch (type->base_type) {
640 case GLSL_TYPE_UINT:
641 case GLSL_TYPE_INT:
642 case GLSL_TYPE_FLOAT:
643 case GLSL_TYPE_BOOL:
644 return type->components();
645 case GLSL_TYPE_ARRAY:
646 return type_size(type->fields.array) * type->length;
647 case GLSL_TYPE_STRUCT:
648 size = 0;
649 for (i = 0; i < type->length; i++) {
650 size += type_size(type->fields.structure[i].type);
651 }
652 return size;
653 case GLSL_TYPE_SAMPLER:
654 /* Samplers take up no register space, since they're baked in at
655 * link time.
656 */
657 return 0;
658 case GLSL_TYPE_ATOMIC_UINT:
659 return 0;
660 case GLSL_TYPE_IMAGE:
661 case GLSL_TYPE_VOID:
662 case GLSL_TYPE_ERROR:
663 case GLSL_TYPE_INTERFACE:
664 case GLSL_TYPE_DOUBLE:
665 unreachable("not reached");
666 }
667
668 return 0;
669 }
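/* For reference, the sizes returned above are in scalar components: a float,
 * int or bool is 1, a vec4 is 4, a mat3 is 9, and "uniform vec4 a[20]" is
 * 20 * 4 = 80, while samplers and atomic counters contribute 0.
 */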
670
671 /**
672 * Create a MOV to read the timestamp register.
673 *
674  * The MOV is emitted through the given builder; the return value is
675  * its destination, with the extra parameters already set.
676 */
677 fs_reg
678 fs_visitor::get_timestamp(const fs_builder &bld)
679 {
680 assert(devinfo->gen >= 7);
681
682 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
683 BRW_ARF_TIMESTAMP,
684 0),
685 BRW_REGISTER_TYPE_UD));
686
687 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
688
689 /* We want to read the 3 fields we care about even if it's not enabled in
690 * the dispatch.
691 */
692 bld.exec_all().MOV(dst, ts);
693
694 /* The caller wants the low 32 bits of the timestamp. Since it's running
695  * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
696 * which is plenty of time for our purposes. It is identical across the
697 * EUs, but since it's tracking GPU core speed it will increment at a
698 * varying rate as render P-states change.
699 *
700 * The caller could also check if render P-states have changed (or anything
701 * else that might disrupt timing) by setting smear to 2 and checking if
702 * that field is != 0.
703 */
704 dst.set_smear(0);
705
706 return dst;
707 }
708
709 void
710 fs_visitor::emit_shader_time_begin()
711 {
712 shader_start_time = get_timestamp(bld.annotate("shader time start"));
713 }
714
715 void
716 fs_visitor::emit_shader_time_end()
717 {
718 enum shader_time_shader_type type, written_type, reset_type;
719 switch (stage) {
720 case MESA_SHADER_VERTEX:
721 type = ST_VS;
722 written_type = ST_VS_WRITTEN;
723 reset_type = ST_VS_RESET;
724 break;
725 case MESA_SHADER_GEOMETRY:
726 type = ST_GS;
727 written_type = ST_GS_WRITTEN;
728 reset_type = ST_GS_RESET;
729 break;
730 case MESA_SHADER_FRAGMENT:
731 if (dispatch_width == 8) {
732 type = ST_FS8;
733 written_type = ST_FS8_WRITTEN;
734 reset_type = ST_FS8_RESET;
735 } else {
736 assert(dispatch_width == 16);
737 type = ST_FS16;
738 written_type = ST_FS16_WRITTEN;
739 reset_type = ST_FS16_RESET;
740 }
741 break;
742 case MESA_SHADER_COMPUTE:
743 type = ST_CS;
744 written_type = ST_CS_WRITTEN;
745 reset_type = ST_CS_RESET;
746 break;
747 default:
748 unreachable("fs_visitor::emit_shader_time_end missing code");
749 }
750
751 /* Insert our code just before the final SEND with EOT. */
752 exec_node *end = this->instructions.get_tail();
753 assert(end && ((fs_inst *) end)->eot);
754 const fs_builder ibld = bld.annotate("shader time end")
755 .exec_all().at(NULL, end);
756
757 fs_reg shader_end_time = get_timestamp(ibld);
758
759 /* Check that there weren't any timestamp reset events (assuming these
760 * were the only two timestamp reads that happened).
761 */
762 fs_reg reset = shader_end_time;
763 reset.set_smear(2);
764 set_condmod(BRW_CONDITIONAL_Z,
765 ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
766 ibld.IF(BRW_PREDICATE_NORMAL);
767
768 fs_reg start = shader_start_time;
769 start.negate = true;
770 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
771 diff.set_smear(0);
772 ibld.ADD(diff, start, shader_end_time);
773
774 /* If there were no instructions between the two timestamp gets, the diff
775 * is 2 cycles. Remove that overhead, so I can forget about that when
776 * trying to determine the time taken for single instructions.
777 */
778 ibld.ADD(diff, diff, fs_reg(-2u));
779 SHADER_TIME_ADD(ibld, type, diff);
780 SHADER_TIME_ADD(ibld, written_type, fs_reg(1u));
781 ibld.emit(BRW_OPCODE_ELSE);
782 SHADER_TIME_ADD(ibld, reset_type, fs_reg(1u));
783 ibld.emit(BRW_OPCODE_ENDIF);
784 }
785
786 void
787 fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
788 enum shader_time_shader_type type, fs_reg value)
789 {
790 int shader_time_index =
791 brw_get_shader_time_index(brw, shader_prog, prog, type);
792 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
793
794 fs_reg payload;
795 if (dispatch_width == 8)
796 payload = vgrf(glsl_type::uvec2_type);
797 else
798 payload = vgrf(glsl_type::uint_type);
799
800 bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
801 }
802
803 void
804 fs_visitor::vfail(const char *format, va_list va)
805 {
806 char *msg;
807
808 if (failed)
809 return;
810
811 failed = true;
812
813 msg = ralloc_vasprintf(mem_ctx, format, va);
814 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
815
816 this->fail_msg = msg;
817
818 if (debug_enabled) {
819 fprintf(stderr, "%s", msg);
820 }
821 }
822
823 void
824 fs_visitor::fail(const char *format, ...)
825 {
826 va_list va;
827
828 va_start(va, format);
829 vfail(format, va);
830 va_end(va);
831 }
832
833 /**
834 * Mark this program as impossible to compile in SIMD16 mode.
835 *
836 * During the SIMD8 compile (which happens first), we can detect and flag
837 * things that are unsupported in SIMD16 mode, so the compiler can skip
838 * the SIMD16 compile altogether.
839 *
840 * During a SIMD16 compile (if one happens anyway), this just calls fail().
841 */
842 void
843 fs_visitor::no16(const char *format, ...)
844 {
845 va_list va;
846
847 va_start(va, format);
848
849 if (dispatch_width == 16) {
850 vfail(format, va);
851 } else {
852 simd16_unsupported = true;
853
854 if (brw->perf_debug) {
855 if (no16_msg)
856 ralloc_vasprintf_append(&no16_msg, format, va);
857 else
858 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
859 }
860 }
861
862 va_end(va);
863 }
864
865 fs_inst *
866 fs_visitor::emit(enum opcode opcode)
867 {
868 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
869 }
870
871 fs_inst *
872 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
873 {
874 return emit(new(mem_ctx) fs_inst(opcode, dst));
875 }
876
877 fs_inst *
878 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
879 {
880 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
881 }
882
883 fs_inst *
884 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
885 const fs_reg &src1)
886 {
887 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
888 }
889
890 fs_inst *
891 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
892 const fs_reg &src1, const fs_reg &src2)
893 {
894 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
895 }
896
897 fs_inst *
898 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
899 fs_reg src[], int sources)
900 {
901 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
902 }
903
904 /**
905 * Returns true if the instruction has a flag that means it won't
906 * update an entire destination register.
907 *
908 * For example, dead code elimination and live variable analysis want to know
909 * when a write to a variable screens off any preceding values that were in
910 * it.
911 */
912 bool
913 fs_inst::is_partial_write() const
914 {
915 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
916 (this->dst.width * type_sz(this->dst.type)) < 32 ||
917 !this->dst.is_contiguous());
918 }
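/* For example, a predicated write (other than SEL), an 8-wide write of a
 * 16-bit type (8 * 2 = 16 bytes, less than a full 32-byte register), or a
 * strided (non-contiguous) destination all count as partial writes here.
 */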
919
920 int
921 fs_inst::regs_read(int arg) const
922 {
923 if (is_tex() && arg == 0 && src[0].file == GRF) {
924 return mlen;
925 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
926 return mlen;
927 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
928 return mlen;
929 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
930 return mlen;
931 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
932 return mlen;
933 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
934 return mlen;
935 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
936 return mlen;
937 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
938 return mlen;
939 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
940 return mlen;
941 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
942 return mlen;
943 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
944 return exec_size / 4;
945 }
946
947 switch (src[arg].file) {
948 case BAD_FILE:
949 case UNIFORM:
950 case IMM:
951 return 1;
952 case GRF:
953 case HW_REG:
954 if (src[arg].stride == 0) {
955 return 1;
956 } else {
957 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
958 return (size + 31) / 32;
959 }
960 case MRF:
961 unreachable("MRF registers are not allowed as sources");
962 default:
963 unreachable("Invalid register file");
964 }
965 }
966
967 bool
968 fs_inst::reads_flag() const
969 {
970 return predicate;
971 }
972
973 bool
974 fs_inst::writes_flag() const
975 {
976 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
977 opcode != BRW_OPCODE_IF &&
978 opcode != BRW_OPCODE_WHILE)) ||
979 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
980 }
981
982 /**
983 * Returns how many MRFs an FS opcode will write over.
984 *
985 * Note that this is not the 0 or 1 implied writes in an actual gen
986 * instruction -- the FS opcodes often generate MOVs in addition.
987 */
988 int
989 fs_visitor::implied_mrf_writes(fs_inst *inst)
990 {
991 if (inst->mlen == 0)
992 return 0;
993
994 if (inst->base_mrf == -1)
995 return 0;
996
997 switch (inst->opcode) {
998 case SHADER_OPCODE_RCP:
999 case SHADER_OPCODE_RSQ:
1000 case SHADER_OPCODE_SQRT:
1001 case SHADER_OPCODE_EXP2:
1002 case SHADER_OPCODE_LOG2:
1003 case SHADER_OPCODE_SIN:
1004 case SHADER_OPCODE_COS:
1005 return 1 * dispatch_width / 8;
1006 case SHADER_OPCODE_POW:
1007 case SHADER_OPCODE_INT_QUOTIENT:
1008 case SHADER_OPCODE_INT_REMAINDER:
1009 return 2 * dispatch_width / 8;
1010 case SHADER_OPCODE_TEX:
1011 case FS_OPCODE_TXB:
1012 case SHADER_OPCODE_TXD:
1013 case SHADER_OPCODE_TXF:
1014 case SHADER_OPCODE_TXF_CMS:
1015 case SHADER_OPCODE_TXF_MCS:
1016 case SHADER_OPCODE_TG4:
1017 case SHADER_OPCODE_TG4_OFFSET:
1018 case SHADER_OPCODE_TXL:
1019 case SHADER_OPCODE_TXS:
1020 case SHADER_OPCODE_LOD:
1021 return 1;
1022 case FS_OPCODE_FB_WRITE:
1023 return 2;
1024 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1025 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1026 return 1;
1027 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1028 return inst->mlen;
1029 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1030 return inst->mlen;
1031 case SHADER_OPCODE_UNTYPED_ATOMIC:
1032 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1033 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1034 case SHADER_OPCODE_TYPED_ATOMIC:
1035 case SHADER_OPCODE_TYPED_SURFACE_READ:
1036 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1037 case SHADER_OPCODE_URB_WRITE_SIMD8:
1038 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1039 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1040 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1041 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1042 return 0;
1043 default:
1044 unreachable("not reached");
1045 }
1046 }
1047
1048 fs_reg
1049 fs_visitor::vgrf(const glsl_type *const type)
1050 {
1051 int reg_width = dispatch_width / 8;
1052 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1053 brw_type_for_base_type(type), dispatch_width);
1054 }
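/* e.g. vgrf(glsl_type::vec4_type) at dispatch_width == 16 allocates
 * type_size(vec4) * 2 == 8 consecutive virtual GRFs, typed F and 16 wide.
 */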
1055
1056 fs_reg
1057 fs_visitor::vgrf(int num_components)
1058 {
1059 int reg_width = dispatch_width / 8;
1060 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1061 BRW_REGISTER_TYPE_F, dispatch_width);
1062 }
1063
1064 /** Fixed HW reg constructor. */
1065 fs_reg::fs_reg(enum register_file file, int reg)
1066 {
1067 init();
1068 this->file = file;
1069 this->reg = reg;
1070 this->type = BRW_REGISTER_TYPE_F;
1071
1072 switch (file) {
1073 case UNIFORM:
1074 this->width = 1;
1075 break;
1076 default:
1077 this->width = 8;
1078 }
1079 }
1080
1081 /** Fixed HW reg constructor. */
1082 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1083 {
1084 init();
1085 this->file = file;
1086 this->reg = reg;
1087 this->type = type;
1088
1089 switch (file) {
1090 case UNIFORM:
1091 this->width = 1;
1092 break;
1093 default:
1094 this->width = 8;
1095 }
1096 }
1097
1098 /** Fixed HW reg constructor. */
1099 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1100 uint8_t width)
1101 {
1102 init();
1103 this->file = file;
1104 this->reg = reg;
1105 this->type = type;
1106 this->width = width;
1107 }
1108
1109 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1110  * This brings in those uniform definitions.
1111 */
1112 void
1113 fs_visitor::import_uniforms(fs_visitor *v)
1114 {
1115 this->push_constant_loc = v->push_constant_loc;
1116 this->pull_constant_loc = v->pull_constant_loc;
1117 this->uniforms = v->uniforms;
1118 this->param_size = v->param_size;
1119 }
1120
1121 fs_reg *
1122 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1123 bool origin_upper_left)
1124 {
1125 assert(stage == MESA_SHADER_FRAGMENT);
1126 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1127 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1128 fs_reg wpos = *reg;
1129 bool flip = !origin_upper_left ^ key->render_to_fbo;
1130
1131 /* gl_FragCoord.x */
1132 if (pixel_center_integer) {
1133 bld.MOV(wpos, this->pixel_x);
1134 } else {
1135 bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
1136 }
1137 wpos = offset(wpos, 1);
1138
1139 /* gl_FragCoord.y */
1140 if (!flip && pixel_center_integer) {
1141 bld.MOV(wpos, this->pixel_y);
1142 } else {
1143 fs_reg pixel_y = this->pixel_y;
1144 float offset = (pixel_center_integer ? 0.0 : 0.5);
1145
1146 if (flip) {
1147 pixel_y.negate = true;
1148 offset += key->drawable_height - 1.0;
1149 }
1150
1151 bld.ADD(wpos, pixel_y, fs_reg(offset));
1152 }
1153 wpos = offset(wpos, 1);
1154
1155 /* gl_FragCoord.z */
1156 if (devinfo->gen >= 6) {
1157 bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
1158 } else {
1159 bld.emit(FS_OPCODE_LINTERP, wpos,
1160 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1161 interp_reg(VARYING_SLOT_POS, 2));
1162 }
1163 wpos = offset(wpos, 1);
1164
1165 /* gl_FragCoord.w: Already set up in emit_interpolation */
1166 bld.MOV(wpos, this->wpos_w);
1167
1168 return reg;
1169 }
1170
1171 fs_inst *
1172 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1173 glsl_interp_qualifier interpolation_mode,
1174 bool is_centroid, bool is_sample)
1175 {
1176 brw_wm_barycentric_interp_mode barycoord_mode;
1177 if (devinfo->gen >= 6) {
1178 if (is_centroid) {
1179 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1180 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1181 else
1182 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1183 } else if (is_sample) {
1184 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1185 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1186 else
1187 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1188 } else {
1189 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1190 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1191 else
1192 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1193 }
1194 } else {
1195 /* On Ironlake and below, there is only one interpolation mode.
1196 * Centroid interpolation doesn't mean anything on this hardware --
1197 * there is no multisampling.
1198 */
1199 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1200 }
1201 return bld.emit(FS_OPCODE_LINTERP, attr,
1202 this->delta_xy[barycoord_mode], interp);
1203 }
1204
1205 void
1206 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1207 const glsl_type *type,
1208 glsl_interp_qualifier interpolation_mode,
1209 int location, bool mod_centroid,
1210 bool mod_sample)
1211 {
1212 attr.type = brw_type_for_base_type(type->get_scalar_type());
1213
1214 assert(stage == MESA_SHADER_FRAGMENT);
1215 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1216 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1217
1218 unsigned int array_elements;
1219
1220 if (type->is_array()) {
1221 array_elements = type->length;
1222 if (array_elements == 0) {
1223 fail("dereferenced array '%s' has length 0\n", name);
1224 }
1225 type = type->fields.array;
1226 } else {
1227 array_elements = 1;
1228 }
1229
1230 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1231 bool is_gl_Color =
1232 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1233 if (key->flat_shade && is_gl_Color) {
1234 interpolation_mode = INTERP_QUALIFIER_FLAT;
1235 } else {
1236 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1237 }
1238 }
1239
1240 for (unsigned int i = 0; i < array_elements; i++) {
1241 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1242 if (prog_data->urb_setup[location] == -1) {
1243 /* If there's no incoming setup data for this slot, don't
1244 * emit interpolation for it.
1245 */
1246 attr = offset(attr, type->vector_elements);
1247 location++;
1248 continue;
1249 }
1250
1251 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1252 /* Constant interpolation (flat shading) case. The SF has
1253 * handed us defined values in only the constant offset
1254 * field of the setup reg.
1255 */
1256 for (unsigned int k = 0; k < type->vector_elements; k++) {
1257 struct brw_reg interp = interp_reg(location, k);
1258 interp = suboffset(interp, 3);
1259 interp.type = attr.type;
1260 bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1261 attr = offset(attr, 1);
1262 }
1263 } else {
1264 /* Smooth/noperspective interpolation case. */
1265 for (unsigned int k = 0; k < type->vector_elements; k++) {
1266 struct brw_reg interp = interp_reg(location, k);
1267 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1268 /* Get the pixel/sample mask into f0 so that we know
1269 * which pixels are lit. Then, for each channel that is
1270 * unlit, replace the centroid data with non-centroid
1271 * data.
1272 */
1273 bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1274
1275 fs_inst *inst;
1276 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1277 false, false);
1278 inst->predicate = BRW_PREDICATE_NORMAL;
1279 inst->predicate_inverse = true;
1280 if (devinfo->has_pln)
1281 inst->no_dd_clear = true;
1282
1283 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1284 mod_centroid && !key->persample_shading,
1285 mod_sample || key->persample_shading);
1286 inst->predicate = BRW_PREDICATE_NORMAL;
1287 inst->predicate_inverse = false;
1288 if (devinfo->has_pln)
1289 inst->no_dd_check = true;
1290
1291 } else {
1292 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1293 mod_centroid && !key->persample_shading,
1294 mod_sample || key->persample_shading);
1295 }
1296 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1297 bld.MUL(attr, attr, this->pixel_w);
1298 }
1299 attr = offset(attr, 1);
1300 }
1301
1302 }
1303 location++;
1304 }
1305 }
1306 }
1307
1308 fs_reg *
1309 fs_visitor::emit_frontfacing_interpolation()
1310 {
1311 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1312
1313 if (devinfo->gen >= 6) {
1314 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1315 * a boolean result from this (~0/true or 0/false).
1316 *
1317 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1318 * this task in only one instruction:
1319 * - a negation source modifier will flip the bit; and
1320 * - a W -> D type conversion will sign extend the bit into the high
1321 * word of the destination.
1322 *
1323 * An ASR 15 fills the low word of the destination.
1324 */
1325 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1326 g0.negate = true;
1327
1328 bld.ASR(*reg, g0, fs_reg(15));
1329 } else {
1330 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1331 * a boolean result from this (1/true or 0/false).
1332 *
1333 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1334 * the negation source modifier to flip it. Unfortunately the SHR
1335 * instruction only operates on UD (or D with an abs source modifier)
1336 * sources without negation.
1337 *
1338 * Instead, use ASR (which will give ~0/true or 0/false).
1339 */
1340 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1341 g1_6.negate = true;
1342
1343 bld.ASR(*reg, g1_6, fs_reg(31));
1344 }
1345
1346 return reg;
1347 }
1348
1349 void
1350 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1351 {
1352 assert(stage == MESA_SHADER_FRAGMENT);
1353 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1354 assert(dst.type == BRW_REGISTER_TYPE_F);
1355
1356 if (key->compute_pos_offset) {
1357 /* Convert int_sample_pos to floating point */
1358 bld.MOV(dst, int_sample_pos);
1359 /* Scale to the range [0, 1] */
1360 bld.MUL(dst, dst, fs_reg(1 / 16.0f));
1361 }
1362 else {
1363 /* From ARB_sample_shading specification:
1364 * "When rendering to a non-multisample buffer, or if multisample
1365 * rasterization is disabled, gl_SamplePosition will always be
1366  * (0.5, 0.5)."
1367 */
1368 bld.MOV(dst, fs_reg(0.5f));
1369 }
1370 }
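/* Numerically, the MUL by 1/16 above maps an integer payload position of 8
 * to 0.5, i.e. the pixel centre, and payload values 0..15 into the [0, 1)
 * range expected for gl_SamplePosition.
 */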
1371
1372 fs_reg *
1373 fs_visitor::emit_samplepos_setup()
1374 {
1375 assert(devinfo->gen >= 6);
1376
1377 const fs_builder abld = bld.annotate("compute sample position");
1378 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1379 fs_reg pos = *reg;
1380 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1381 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1382
1383 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1384 * mode will be enabled.
1385 *
1386 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1387 * R31.1:0 Position Offset X/Y for Slot[3:0]
1388 * R31.3:2 Position Offset X/Y for Slot[7:4]
1389 * .....
1390 *
1391 * The X, Y sample positions come in as bytes in thread payload. So, read
1392 * the positions using vstride=16, width=8, hstride=2.
1393 */
1394 struct brw_reg sample_pos_reg =
1395 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1396 BRW_REGISTER_TYPE_B), 16, 8, 2);
1397
1398 if (dispatch_width == 8) {
1399 abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
1400 } else {
1401 abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
1402 abld.half(1).MOV(half(int_sample_x, 1),
1403 fs_reg(suboffset(sample_pos_reg, 16)));
1404 }
1405 /* Compute gl_SamplePosition.x */
1406 compute_sample_position(pos, int_sample_x);
1407 pos = offset(pos, 1);
1408 if (dispatch_width == 8) {
1409 abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
1410 } else {
1411 abld.half(0).MOV(half(int_sample_y, 0),
1412 fs_reg(suboffset(sample_pos_reg, 1)));
1413 abld.half(1).MOV(half(int_sample_y, 1),
1414 fs_reg(suboffset(sample_pos_reg, 17)));
1415 }
1416 /* Compute gl_SamplePosition.y */
1417 compute_sample_position(pos, int_sample_y);
1418 return reg;
1419 }
1420
1421 fs_reg *
1422 fs_visitor::emit_sampleid_setup()
1423 {
1424 assert(stage == MESA_SHADER_FRAGMENT);
1425 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1426 assert(devinfo->gen >= 6);
1427
1428 const fs_builder abld = bld.annotate("compute sample id");
1429 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1430
1431 if (key->compute_sample_id) {
1432 fs_reg t1 = vgrf(glsl_type::int_type);
1433 fs_reg t2 = vgrf(glsl_type::int_type);
1434 t2.type = BRW_REGISTER_TYPE_UW;
1435
1436 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1437 * 8x multisampling, subspan 0 will represent sample N (where N
1438 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1439 * 7. We can find the value of N by looking at R0.0 bits 7:6
1440 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1441 * (since samples are always delivered in pairs). That is, we
1442 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1443 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1444 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1445 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1446 * populating a temporary variable with the sequence (0, 1, 2, 3),
1447 * and then reading from it using vstride=1, width=4, hstride=0.
1448 * These computations hold good for 4x multisampling as well.
1449  * These computations hold for 4x multisampling as well.
1450 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1451 * the first four slots are sample 0 of subspan 0; the next four
1452 * are sample 1 of subspan 0; the third group is sample 0 of
1453 * subspan 1, and finally sample 1 of subspan 1.
1454 */
1455 abld.exec_all()
1456 .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1457 fs_reg(0xc0));
1458 abld.exec_all().SHR(t1, t1, fs_reg(5));
1459
1460 /* This works for both SIMD8 and SIMD16 */
1461 abld.exec_all()
1462 .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
1463
1464 /* This special instruction takes care of setting vstride=1,
1465 * width=4, hstride=0 of t2 during an ADD instruction.
1466 */
1467 abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1468 } else {
1469 /* As per GL_ARB_sample_shading specification:
1470 * "When rendering to a non-multisample buffer, or if multisample
1471 * rasterization is disabled, gl_SampleID will always be zero."
1472 */
1473 abld.MOV(*reg, fs_reg(0));
1474 }
1475
1476 return reg;
1477 }
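/* Worked example of the computation above: if R0.0 bits 7:6 contain 2, then
 * (R0.0 & 0xc0) >> 5 == 4 == 2 * SSPI, and adding the SIMD8 sequence
 * (0, 0, 0, 0, 1, 1, 1, 1) yields sample IDs 4, 4, 4, 4, 5, 5, 5, 5 for the
 * two subspans.
 */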
1478
1479 void
1480 fs_visitor::resolve_source_modifiers(fs_reg *src)
1481 {
1482 if (!src->abs && !src->negate)
1483 return;
1484
1485 fs_reg temp = retype(vgrf(1), src->type);
1486 emit(MOV(temp, *src));
1487 *src = temp;
1488 }
1489
1490 fs_reg
1491 fs_visitor::fix_math_operand(fs_reg src)
1492 {
1493 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1494 * might be able to do better by doing execsize = 1 math and then
1495 * expanding that result out, but we would need to be careful with
1496 * masking.
1497 *
1498 * The hardware ignores source modifiers (negate and abs) on math
1499 * instructions, so we also move to a temp to set those up.
1500 */
1501 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1502 !src.abs && !src.negate)
1503 return src;
1504
1505 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1506  * operands to math instructions.
1507 */
1508 if (devinfo->gen >= 7 && src.file != IMM)
1509 return src;
1510
1511 fs_reg expanded = vgrf(glsl_type::float_type);
1512 expanded.type = src.type;
1513 emit(BRW_OPCODE_MOV, expanded, src);
1514 return expanded;
1515 }
1516
1517 fs_inst *
1518 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1519 {
1520 switch (opcode) {
1521 case SHADER_OPCODE_RCP:
1522 case SHADER_OPCODE_RSQ:
1523 case SHADER_OPCODE_SQRT:
1524 case SHADER_OPCODE_EXP2:
1525 case SHADER_OPCODE_LOG2:
1526 case SHADER_OPCODE_SIN:
1527 case SHADER_OPCODE_COS:
1528 break;
1529 default:
1530 unreachable("not reached: bad math opcode");
1531 }
1532
1533 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1534 * might be able to do better by doing execsize = 1 math and then
1535 * expanding that result out, but we would need to be careful with
1536 * masking.
1537 *
1538 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1539 * instructions, so we also move to a temp to set those up.
1540 */
1541 if (devinfo->gen == 6 || devinfo->gen == 7)
1542 src = fix_math_operand(src);
1543
1544 fs_inst *inst = emit(opcode, dst, src);
1545
1546 if (devinfo->gen < 6) {
1547 inst->base_mrf = 2;
1548 inst->mlen = dispatch_width / 8;
1549 }
1550
1551 return inst;
1552 }
1553
1554 fs_inst *
1555 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1556 {
1557 int base_mrf = 2;
1558 fs_inst *inst;
1559
1560 if (devinfo->gen >= 8) {
1561 inst = emit(opcode, dst, src0, src1);
1562 } else if (devinfo->gen >= 6) {
1563 src0 = fix_math_operand(src0);
1564 src1 = fix_math_operand(src1);
1565
1566 inst = emit(opcode, dst, src0, src1);
1567 } else {
1568 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1569 * "Message Payload":
1570 *
1571 * "Operand0[7]. For the INT DIV functions, this operand is the
1572 * denominator."
1573 * ...
1574 * "Operand1[7]. For the INT DIV functions, this operand is the
1575 * numerator."
1576 */
1577 bool is_int_div = opcode != SHADER_OPCODE_POW;
1578 fs_reg &op0 = is_int_div ? src1 : src0;
1579 fs_reg &op1 = is_int_div ? src0 : src1;
1580
1581 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1582 inst = emit(opcode, dst, op0, reg_null_f);
1583
1584 inst->base_mrf = base_mrf;
1585 inst->mlen = 2 * dispatch_width / 8;
1586 }
1587 return inst;
1588 }
1589
1590 void
1591 fs_visitor::emit_discard_jump()
1592 {
1593 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1594
1595 /* For performance, after a discard, jump to the end of the
1596 * shader if all relevant channels have been discarded.
1597 */
1598 fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
1599 discard_jump->flag_subreg = 1;
1600
1601 discard_jump->predicate = (dispatch_width == 8)
1602 ? BRW_PREDICATE_ALIGN1_ANY8H
1603 : BRW_PREDICATE_ALIGN1_ANY16H;
1604 discard_jump->predicate_inverse = true;
1605 }
1606
1607 void
1608 fs_visitor::assign_curb_setup()
1609 {
1610 if (dispatch_width == 8) {
1611 prog_data->dispatch_grf_start_reg = payload.num_regs;
1612 } else {
1613 if (stage == MESA_SHADER_FRAGMENT) {
1614 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1615 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1616 } else if (stage == MESA_SHADER_COMPUTE) {
1617 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1618 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1619 } else {
1620 unreachable("Unsupported shader type!");
1621 }
1622 }
1623
1624 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1625
1626 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1627 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1628 for (unsigned int i = 0; i < inst->sources; i++) {
1629 if (inst->src[i].file == UNIFORM) {
1630 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1631 int constant_nr;
1632 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1633 constant_nr = push_constant_loc[uniform_nr];
1634 } else {
1635 /* Section 5.11 of the OpenGL 4.1 spec says:
1636 * "Out-of-bounds reads return undefined values, which include
1637 * values from other variables of the active program or zero."
1638 * Just return the first push constant.
1639 */
1640 constant_nr = 0;
1641 }
1642
1643 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1644 constant_nr / 8,
1645 constant_nr % 8);
1646
1647 inst->src[i].file = HW_REG;
1648 inst->src[i].fixed_hw_reg = byte_offset(
1649 retype(brw_reg, inst->src[i].type),
1650 inst->src[i].subreg_offset);
1651 }
1652 }
1653 }
1654 }
1655
1656 void
1657 fs_visitor::calculate_urb_setup()
1658 {
1659 assert(stage == MESA_SHADER_FRAGMENT);
1660 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1661 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1662
1663 memset(prog_data->urb_setup, -1,
1664 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1665
1666 int urb_next = 0;
1667 /* Figure out where each of the incoming setup attributes lands. */
1668 if (devinfo->gen >= 6) {
1669 if (_mesa_bitcount_64(prog->InputsRead &
1670 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1671 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1672 * first 16 varying inputs, so we can put them wherever we want.
1673 * Just put them in order.
1674 *
1675 * This is useful because it means that (a) inputs not used by the
1676 * fragment shader won't take up valuable register space, and (b) we
1677 * won't have to recompile the fragment shader if it gets paired with
1678 * a different vertex (or geometry) shader.
1679 */
1680 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1681 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1682 BITFIELD64_BIT(i)) {
1683 prog_data->urb_setup[i] = urb_next++;
1684 }
1685 }
1686 } else {
1687 /* We have enough input varyings that the SF/SBE pipeline stage can't
1688 * arbitrarily rearrange them to suit our whim; we have to put them
1689 * in an order that matches the output of the previous pipeline stage
1690 * (geometry or vertex shader).
1691 */
1692 struct brw_vue_map prev_stage_vue_map;
1693 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1694 key->input_slots_valid);
1695 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1696 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1697 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1698 slot++) {
1699 int varying = prev_stage_vue_map.slot_to_varying[slot];
1700 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1701 * unused.
1702 */
1703 if (varying != BRW_VARYING_SLOT_COUNT &&
1704 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1705 BITFIELD64_BIT(varying))) {
1706 prog_data->urb_setup[varying] = slot - first_slot;
1707 }
1708 }
1709 urb_next = prev_stage_vue_map.num_slots - first_slot;
1710 }
1711 } else {
1712 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1713 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1714 /* Point size is packed into the header, not as a general attribute */
1715 if (i == VARYING_SLOT_PSIZ)
1716 continue;
1717
1718 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1719 /* The back color slot is skipped when the front color is
1720 * also written to. In addition, some slots can be
1721 * written in the vertex shader and not read in the
1722 * fragment shader. So the register number must always be
1723 * incremented, mapped or not.
1724 */
1725 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1726 prog_data->urb_setup[i] = urb_next;
1727 urb_next++;
1728 }
1729 }
1730
1731 /*
1732  * It's an FS-only attribute, and we did interpolation for this attribute
1733  * in the SF thread. So, count it here, too.
1734 *
1735 * See compile_sf_prog() for more info.
1736 */
1737 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1738 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1739 }
1740
1741 prog_data->num_varying_inputs = urb_next;
1742 }
1743
1744 void
1745 fs_visitor::assign_urb_setup()
1746 {
1747 assert(stage == MESA_SHADER_FRAGMENT);
1748 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1749
1750 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1751
1752  /* Offset all the urb_setup[] indices by the actual position of the
1753 * setup regs, now that the location of the constants has been chosen.
1754 */
1755 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1756 if (inst->opcode == FS_OPCODE_LINTERP) {
1757 assert(inst->src[1].file == HW_REG);
1758 inst->src[1].fixed_hw_reg.nr += urb_start;
1759 }
1760
1761 if (inst->opcode == FS_OPCODE_CINTERP) {
1762 assert(inst->src[0].file == HW_REG);
1763 inst->src[0].fixed_hw_reg.nr += urb_start;
1764 }
1765 }
1766
1767 /* Each attribute is 4 setup channels, each of which is half a reg. */
1768 this->first_non_payload_grf =
1769 urb_start + prog_data->num_varying_inputs * 2;
1770 }
1771
1772 void
1773 fs_visitor::assign_vs_urb_setup()
1774 {
1775 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1776 int grf, count, slot, channel, attr;
1777
1778 assert(stage == MESA_SHADER_VERTEX);
1779 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1780 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1781 count++;
1782
1783 /* Each attribute is 4 regs. */
1784 this->first_non_payload_grf =
1785 payload.num_regs + prog_data->curb_read_length + count * 4;
1786
1787 unsigned vue_entries =
1788 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1789
1790 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1791 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1792
1793 assert(vs_prog_data->base.urb_read_length <= 15);
1794
1795 /* Rewrite all ATTR file references to the hw grf that they land in. */
1796 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1797 for (int i = 0; i < inst->sources; i++) {
1798 if (inst->src[i].file == ATTR) {
1799
1800 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1801 slot = count - 1;
1802 } else {
1803  /* Attributes arrive in a contiguous block, ordered by their
1804 * gl_vert_attrib value. That means we can compute the slot
1805 * number for an attribute by masking out the enabled
1806 * attributes before it and counting the bits.
1807 */
1808 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1809 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1810 BITFIELD64_MASK(attr));
1811 }
1812
1813 channel = inst->src[i].reg_offset & 3;
1814
1815 grf = payload.num_regs +
1816 prog_data->curb_read_length +
1817 slot * 4 + channel;
1818
1819 inst->src[i].file = HW_REG;
1820 inst->src[i].fixed_hw_reg =
1821 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1822 }
1823 }
1824 }
1825 }
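/* Worked example of the slot computation above: with inputs_read == 0x7
 * (attributes 0, 1 and 2 enabled), an ATTR source for attribute 2 gets
 * slot == _mesa_bitcount_64(0x7 & BITFIELD64_MASK(2)) == 2, and so reads
 * from GRF payload.num_regs + curb_read_length + 2 * 4 + channel.
 */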
1826
1827 /**
1828 * Split large virtual GRFs into separate components if we can.
1829 *
1830 * This is mostly duplicated with what brw_fs_vector_splitting does,
1831 * but that's really conservative because it's afraid of doing
1832 * splitting that doesn't result in real progress after the rest of
1833 * the optimization phases, which would cause infinite looping in
1834 * optimization. We can do it once here, safely. This also has the
1835 * opportunity to split interpolated values, or maybe even uniforms,
1836 * which we don't have at the IR level.
1837 *
1838 * We want to split, because virtual GRFs are what we register
1839 * allocate and spill (due to contiguousness requirements for some
1840 * instructions), and they're what we naturally generate in the
1841 * codegen process, but most virtual GRFs don't actually need to be
1842 * contiguous sets of GRFs. If we split, we'll end up with reduced
1843 * live intervals and better dead code elimination and coalescing.
1844 */
1845 void
1846 fs_visitor::split_virtual_grfs()
1847 {
1848 int num_vars = this->alloc.count;
1849
1850 /* Count the total number of registers */
1851 int reg_count = 0;
1852 int vgrf_to_reg[num_vars];
1853 for (int i = 0; i < num_vars; i++) {
1854 vgrf_to_reg[i] = reg_count;
1855 reg_count += alloc.sizes[i];
1856 }
1857
1858 /* An array of "split points". For each register slot, this indicates
1859 * if this slot can be separated from the previous slot. Every time an
1860 * instruction uses multiple elements of a register (as a source or
1861 * destination), we mark the used slots as inseparable. Then we go
1862 * through and split the registers into the smallest pieces we can.
1863 */
1864 bool split_points[reg_count];
1865 memset(split_points, 0, sizeof(split_points));
1866
1867 /* Mark all used registers as fully splittable */
1868 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1869 if (inst->dst.file == GRF) {
1870 int reg = vgrf_to_reg[inst->dst.reg];
1871 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1872 split_points[reg + j] = true;
1873 }
1874
1875 for (int i = 0; i < inst->sources; i++) {
1876 if (inst->src[i].file == GRF) {
1877 int reg = vgrf_to_reg[inst->src[i].reg];
1878 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1879 split_points[reg + j] = true;
1880 }
1881 }
1882 }
1883
1884 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1885 if (inst->dst.file == GRF) {
1886 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1887 for (int j = 1; j < inst->regs_written; j++)
1888 split_points[reg + j] = false;
1889 }
1890 for (int i = 0; i < inst->sources; i++) {
1891 if (inst->src[i].file == GRF) {
1892 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1893 for (int j = 1; j < inst->regs_read(i); j++)
1894 split_points[reg + j] = false;
1895 }
1896 }
1897 }
1898
1899 int new_virtual_grf[reg_count];
1900 int new_reg_offset[reg_count];
1901
1902 int reg = 0;
1903 for (int i = 0; i < num_vars; i++) {
1904 /* The first one should always be 0 as a quick sanity check. */
1905 assert(split_points[reg] == false);
1906
1907 /* j = 0 case */
1908 new_reg_offset[reg] = 0;
1909 reg++;
1910 int offset = 1;
1911
1912 /* j > 0 case */
1913 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1914 /* If this is a split point, reset the offset to 0 and allocate a
1915 * new virtual GRF for the previous offset many registers
1916 */
1917 if (split_points[reg]) {
1918 assert(offset <= MAX_VGRF_SIZE);
1919 int grf = alloc.allocate(offset);
1920 for (int k = reg - offset; k < reg; k++)
1921 new_virtual_grf[k] = grf;
1922 offset = 0;
1923 }
1924 new_reg_offset[reg] = offset;
1925 offset++;
1926 reg++;
1927 }
1928
1929 /* The last one gets the original register number */
1930 assert(offset <= MAX_VGRF_SIZE);
1931 alloc.sizes[i] = offset;
1932 for (int k = reg - offset; k < reg; k++)
1933 new_virtual_grf[k] = i;
1934 }
1935 assert(reg == reg_count);
1936
1937 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1938 if (inst->dst.file == GRF) {
1939 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1940 inst->dst.reg = new_virtual_grf[reg];
1941 inst->dst.reg_offset = new_reg_offset[reg];
1942 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1943 }
1944 for (int i = 0; i < inst->sources; i++) {
1945 if (inst->src[i].file == GRF) {
1946 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1947 inst->src[i].reg = new_virtual_grf[reg];
1948 inst->src[i].reg_offset = new_reg_offset[reg];
1949 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1950 }
1951 }
1952 }
1953 invalidate_live_intervals();
1954 }
1955
1956 /**
1957 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1958 *
1959 * During code generation, we create tons of temporary variables, many of
1960 * which get immediately killed and are never used again. Yet, in later
1961 * optimization and analysis passes, such as compute_live_intervals, we need
1962 * to loop over all the virtual GRFs. Compacting them can save a lot of
1963 * overhead.
1964 */
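/* Worked example (made-up numbers): with alloc.count == 4 and VGRF 2 never
 * referenced, the loop below builds remap_table == { 0, 1, -1, 2 }, copies
 * alloc.sizes[3] down into slot 2, and shrinks alloc.count to 3.
 */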
1965 bool
1966 fs_visitor::compact_virtual_grfs()
1967 {
1968 bool progress = false;
1969 int remap_table[this->alloc.count];
1970 memset(remap_table, -1, sizeof(remap_table));
1971
1972 /* Mark which virtual GRFs are used. */
1973 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1974 if (inst->dst.file == GRF)
1975 remap_table[inst->dst.reg] = 0;
1976
1977 for (int i = 0; i < inst->sources; i++) {
1978 if (inst->src[i].file == GRF)
1979 remap_table[inst->src[i].reg] = 0;
1980 }
1981 }
1982
1983 /* Compact the GRF arrays. */
1984 int new_index = 0;
1985 for (unsigned i = 0; i < this->alloc.count; i++) {
1986 if (remap_table[i] == -1) {
1987 /* We just found an unused register. This means that we are
1988 * actually going to compact something.
1989 */
1990 progress = true;
1991 } else {
1992 remap_table[i] = new_index;
1993 alloc.sizes[new_index] = alloc.sizes[i];
1994 invalidate_live_intervals();
1995 ++new_index;
1996 }
1997 }
1998
1999 this->alloc.count = new_index;
2000
2001 /* Patch all the instructions to use the newly renumbered registers */
2002 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2003 if (inst->dst.file == GRF)
2004 inst->dst.reg = remap_table[inst->dst.reg];
2005
2006 for (int i = 0; i < inst->sources; i++) {
2007 if (inst->src[i].file == GRF)
2008 inst->src[i].reg = remap_table[inst->src[i].reg];
2009 }
2010 }
2011
2012 /* Patch all the references to delta_xy, since they're used in register
2013 * allocation. If they're unused, switch them to BAD_FILE so we don't
2014 * think some random VGRF is delta_xy.
2015 */
2016 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2017 if (delta_xy[i].file == GRF) {
2018 if (remap_table[delta_xy[i].reg] != -1) {
2019 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2020 } else {
2021 delta_xy[i].file = BAD_FILE;
2022 }
2023 }
2024 }
2025
2026 return progress;
2027 }
2028
2029 /*
2030 * Implements array access of uniforms by inserting a
2031 * PULL_CONSTANT_LOAD instruction.
2032 *
2033  * Unlike temporary GRF array access (which we don't support, due to
2034 * the difficulty of doing relative addressing on instruction
2035 * destinations), we could potentially do array access of uniforms
2036 * that were loaded in GRF space as push constants. In real-world
2037 * usage we've seen, though, the arrays being used are always larger
2038 * than we could load as push constants, so just always move all
2039 * uniform array access out to a pull constant buffer.
2040 */
2041 void
2042 fs_visitor::move_uniform_array_access_to_pull_constants()
2043 {
2044 if (dispatch_width != 8)
2045 return;
2046
2047 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2048 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2049
2050 /* Walk through and find array access of uniforms. Put a copy of that
2051 * uniform in the pull constant buffer.
2052 *
2053 * Note that we don't move constant-indexed accesses to arrays. No
2054 * testing has been done of the performance impact of this choice.
2055 */
2056 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2057 for (int i = 0 ; i < inst->sources; i++) {
2058 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2059 continue;
2060
2061 int uniform = inst->src[i].reg;
2062
2063 /* If this array isn't already present in the pull constant buffer,
2064 * add it.
2065 */
2066 if (pull_constant_loc[uniform] == -1) {
2067 const gl_constant_value **values = &stage_prog_data->param[uniform];
2068
2069 assert(param_size[uniform]);
2070
2071 for (int j = 0; j < param_size[uniform]; j++) {
2072 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2073
2074 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2075 values[j];
2076 }
2077 }
2078 }
2079 }
2080 }
2081
2082 /**
2083 * Assign UNIFORM file registers to either push constants or pull constants.
2084 *
2085 * We allow a fragment shader to have more than the specified minimum
2086 * maximum number of fragment shader uniform components (64). If
2087  * there are too many, they'd fill up all of the register space.
2088 * So, this will push some of them out to the pull constant buffer and
2089 * update the program to load them.
2090 */
2091 void
2092 fs_visitor::assign_constant_locations()
2093 {
2094 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2095 if (dispatch_width != 8)
2096 return;
2097
2098 /* Find which UNIFORM registers are still in use. */
2099 bool is_live[uniforms];
2100 for (unsigned int i = 0; i < uniforms; i++) {
2101 is_live[i] = false;
2102 }
2103
2104 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2105 for (int i = 0; i < inst->sources; i++) {
2106 if (inst->src[i].file != UNIFORM)
2107 continue;
2108
2109 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2110 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2111 is_live[constant_nr] = true;
2112 }
2113 }
2114
2115 /* Only allow 16 registers (128 uniform components) as push constants.
2116 *
2117 * Just demote the end of the list. We could probably do better
2118 * here, demoting things that are rarely used in the program first.
2119 *
2120 * If changing this value, note the limitation about total_regs in
2121 * brw_curbe.c.
2122 */
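   /* (A GRF is 32 bytes, i.e. eight 32-bit components, hence the
    * 16 * 8 == 128 below.)
    */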
2123 unsigned int max_push_components = 16 * 8;
2124 unsigned int num_push_constants = 0;
2125
2126 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2127
2128 for (unsigned int i = 0; i < uniforms; i++) {
2129 if (!is_live[i] || pull_constant_loc[i] != -1) {
2130 /* This UNIFORM register is either dead, or has already been demoted
2131 * to a pull const. Mark it as no longer living in the param[] array.
2132 */
2133 push_constant_loc[i] = -1;
2134 continue;
2135 }
2136
2137 if (num_push_constants < max_push_components) {
2138 /* Retain as a push constant. Record the location in the params[]
2139 * array.
2140 */
2141 push_constant_loc[i] = num_push_constants++;
2142 } else {
2143 /* Demote to a pull constant. */
2144 push_constant_loc[i] = -1;
2145
2146 int pull_index = stage_prog_data->nr_pull_params++;
2147 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2148 pull_constant_loc[i] = pull_index;
2149 }
2150 }
2151
2152 stage_prog_data->nr_params = num_push_constants;
2153
2154 /* Up until now, the param[] array has been indexed by reg + reg_offset
2155 * of UNIFORM registers. Condense it to only contain the uniforms we
2156 * chose to upload as push constants.
2157 */
2158 for (unsigned int i = 0; i < uniforms; i++) {
2159 int remapped = push_constant_loc[i];
2160
2161 if (remapped == -1)
2162 continue;
2163
2164 assert(remapped <= (int)i);
2165 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2166 }
2167 }
2168
2169 /**
2170 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2171 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2172 */
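/* Worked example (hypothetical pull_index): for pull_index == 7, the
 * non-reladdr path below loads the 16-byte-aligned block at byte offset
 * (7 * 4) & ~15 == 16 and then smears component 7 & 3 == 3 of the loaded
 * vec4 into the source.
 */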
2173 void
2174 fs_visitor::demote_pull_constants()
2175 {
2176 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2177 for (int i = 0; i < inst->sources; i++) {
2178 if (inst->src[i].file != UNIFORM)
2179 continue;
2180
2181 int pull_index;
2182 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2183 if (location >= uniforms) /* Out of bounds access */
2184 pull_index = -1;
2185 else
2186 pull_index = pull_constant_loc[location];
2187
2188 if (pull_index == -1)
2189 continue;
2190
2191          /* Set up the annotation tracking for newly generated instructions. */
2192 const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
2193 .at(block, inst);
2194 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2195 fs_reg dst = vgrf(glsl_type::float_type);
2196
2197 /* Generate a pull load into dst. */
2198 if (inst->src[i].reladdr) {
2199 VARYING_PULL_CONSTANT_LOAD(ibld, dst,
2200 surf_index,
2201 *inst->src[i].reladdr,
2202 pull_index);
2203 inst->src[i].reladdr = NULL;
2204 } else {
2205 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2206 ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
2207 dst, surf_index, offset);
2208 inst->src[i].set_smear(pull_index & 3);
2209 }
2210
2211 /* Rewrite the instruction to use the temporary VGRF. */
2212 inst->src[i].file = GRF;
2213 inst->src[i].reg = dst.reg;
2214 inst->src[i].reg_offset = 0;
2215 inst->src[i].width = dispatch_width;
2216 }
2217 }
2218 invalidate_live_intervals();
2219 }
2220
2221 bool
2222 fs_visitor::opt_algebraic()
2223 {
2224 bool progress = false;
2225
2226 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2227 switch (inst->opcode) {
2228 case BRW_OPCODE_MOV:
2229 if (inst->src[0].file != IMM)
2230 break;
2231
2232 if (inst->saturate) {
2233 if (inst->dst.type != inst->src[0].type)
2234 assert(!"unimplemented: saturate mixed types");
2235
2236 if (brw_saturate_immediate(inst->dst.type,
2237 &inst->src[0].fixed_hw_reg)) {
2238 inst->saturate = false;
2239 progress = true;
2240 }
2241 }
2242 break;
2243
2244 case BRW_OPCODE_MUL:
2245 if (inst->src[1].file != IMM)
2246 continue;
2247
2248 /* a * 1.0 = a */
2249 if (inst->src[1].is_one()) {
2250 inst->opcode = BRW_OPCODE_MOV;
2251 inst->src[1] = reg_undef;
2252 progress = true;
2253 break;
2254 }
2255
2256 /* a * -1.0 = -a */
2257 if (inst->src[1].is_negative_one()) {
2258 inst->opcode = BRW_OPCODE_MOV;
2259 inst->src[0].negate = !inst->src[0].negate;
2260 inst->src[1] = reg_undef;
2261 progress = true;
2262 break;
2263 }
2264
2265 /* a * 0.0 = 0.0 */
2266 if (inst->src[1].is_zero()) {
2267 inst->opcode = BRW_OPCODE_MOV;
2268 inst->src[0] = inst->src[1];
2269 inst->src[1] = reg_undef;
2270 progress = true;
2271 break;
2272 }
2273
2274 if (inst->src[0].file == IMM) {
2275 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2276 inst->opcode = BRW_OPCODE_MOV;
2277 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2278 inst->src[1] = reg_undef;
2279 progress = true;
2280 break;
2281 }
2282 break;
2283 case BRW_OPCODE_ADD:
2284 if (inst->src[1].file != IMM)
2285 continue;
2286
2287 /* a + 0.0 = a */
2288 if (inst->src[1].is_zero()) {
2289 inst->opcode = BRW_OPCODE_MOV;
2290 inst->src[1] = reg_undef;
2291 progress = true;
2292 break;
2293 }
2294
2295 if (inst->src[0].file == IMM) {
2296 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2297 inst->opcode = BRW_OPCODE_MOV;
2298 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2299 inst->src[1] = reg_undef;
2300 progress = true;
2301 break;
2302 }
2303 break;
2304 case BRW_OPCODE_OR:
2305 if (inst->src[0].equals(inst->src[1])) {
2306 inst->opcode = BRW_OPCODE_MOV;
2307 inst->src[1] = reg_undef;
2308 progress = true;
2309 break;
2310 }
2311 break;
2312 case BRW_OPCODE_LRP:
2313 if (inst->src[1].equals(inst->src[2])) {
2314 inst->opcode = BRW_OPCODE_MOV;
2315 inst->src[0] = inst->src[1];
2316 inst->src[1] = reg_undef;
2317 inst->src[2] = reg_undef;
2318 progress = true;
2319 break;
2320 }
2321 break;
2322 case BRW_OPCODE_CMP:
2323 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2324 inst->src[0].abs &&
2325 inst->src[0].negate &&
2326 inst->src[1].is_zero()) {
2327 inst->src[0].abs = false;
2328 inst->src[0].negate = false;
2329 inst->conditional_mod = BRW_CONDITIONAL_Z;
2330 progress = true;
2331 break;
2332 }
2333 break;
2334 case BRW_OPCODE_SEL:
2335 if (inst->src[0].equals(inst->src[1])) {
2336 inst->opcode = BRW_OPCODE_MOV;
2337 inst->src[1] = reg_undef;
2338 inst->predicate = BRW_PREDICATE_NONE;
2339 inst->predicate_inverse = false;
2340 progress = true;
2341 } else if (inst->saturate && inst->src[1].file == IMM) {
2342 switch (inst->conditional_mod) {
2343 case BRW_CONDITIONAL_LE:
2344 case BRW_CONDITIONAL_L:
2345 switch (inst->src[1].type) {
2346 case BRW_REGISTER_TYPE_F:
2347 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2348 inst->opcode = BRW_OPCODE_MOV;
2349 inst->src[1] = reg_undef;
2350 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2351 progress = true;
2352 }
2353 break;
2354 default:
2355 break;
2356 }
2357 break;
2358 case BRW_CONDITIONAL_GE:
2359 case BRW_CONDITIONAL_G:
2360 switch (inst->src[1].type) {
2361 case BRW_REGISTER_TYPE_F:
2362 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2363 inst->opcode = BRW_OPCODE_MOV;
2364 inst->src[1] = reg_undef;
2365 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2366 progress = true;
2367 }
2368 break;
2369 default:
2370 break;
2371 }
2372 default:
2373 break;
2374 }
2375 }
2376 break;
2377 case BRW_OPCODE_MAD:
2378 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2379 inst->opcode = BRW_OPCODE_MOV;
2380 inst->src[1] = reg_undef;
2381 inst->src[2] = reg_undef;
2382 progress = true;
2383 } else if (inst->src[0].is_zero()) {
2384 inst->opcode = BRW_OPCODE_MUL;
2385 inst->src[0] = inst->src[2];
2386 inst->src[2] = reg_undef;
2387 progress = true;
2388 } else if (inst->src[1].is_one()) {
2389 inst->opcode = BRW_OPCODE_ADD;
2390 inst->src[1] = inst->src[2];
2391 inst->src[2] = reg_undef;
2392 progress = true;
2393 } else if (inst->src[2].is_one()) {
2394 inst->opcode = BRW_OPCODE_ADD;
2395 inst->src[2] = reg_undef;
2396 progress = true;
2397 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2398 inst->opcode = BRW_OPCODE_ADD;
2399 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2400 inst->src[2] = reg_undef;
2401 progress = true;
2402 }
2403 break;
2404 case SHADER_OPCODE_RCP: {
2405 fs_inst *prev = (fs_inst *)inst->prev;
2406 if (prev->opcode == SHADER_OPCODE_SQRT) {
2407 if (inst->src[0].equals(prev->dst)) {
2408 inst->opcode = SHADER_OPCODE_RSQ;
2409 inst->src[0] = prev->src[0];
2410 progress = true;
2411 }
2412 }
2413 break;
2414 }
2415 case SHADER_OPCODE_BROADCAST:
2416 if (is_uniform(inst->src[0])) {
2417 inst->opcode = BRW_OPCODE_MOV;
2418 inst->sources = 1;
2419 inst->force_writemask_all = true;
2420 progress = true;
2421 } else if (inst->src[1].file == IMM) {
2422 inst->opcode = BRW_OPCODE_MOV;
2423 inst->src[0] = component(inst->src[0],
2424 inst->src[1].fixed_hw_reg.dw1.ud);
2425 inst->sources = 1;
2426 inst->force_writemask_all = true;
2427 progress = true;
2428 }
2429 break;
2430
2431 default:
2432 break;
2433 }
2434
2435 /* Swap if src[0] is immediate. */
2436 if (progress && inst->is_commutative()) {
2437 if (inst->src[0].file == IMM) {
2438 fs_reg tmp = inst->src[1];
2439 inst->src[1] = inst->src[0];
2440 inst->src[0] = tmp;
2441 }
2442 }
2443 }
2444 return progress;
2445 }
2446
2447 /**
2448 * Optimize sample messages that have constant zero values for the trailing
2449 * texture coordinates. We can just reduce the message length for these
2450 * instructions instead of reserving a register for it. Trailing parameters
2451 * that aren't sent default to zero anyway. This will cause the dead code
2452 * eliminator to remove the MOV instruction that would otherwise be emitted to
2453 * set up the zero value.
2454 */
2455 bool
2456 fs_visitor::opt_zero_samples()
2457 {
2458 /* Gen4 infers the texturing opcode based on the message length so we can't
2459 * change it.
2460 */
2461 if (devinfo->gen < 5)
2462 return false;
2463
2464 bool progress = false;
2465
2466 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2467 if (!inst->is_tex())
2468 continue;
2469
2470 fs_inst *load_payload = (fs_inst *) inst->prev;
2471
2472 if (load_payload->is_head_sentinel() ||
2473 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2474 continue;
2475
2476 /* We don't want to remove the message header or the first parameter.
2477 * Removing the first parameter is not allowed, see the Haswell PRM
2478 * volume 7, page 149:
2479 *
2480 * "Parameter 0 is required except for the sampleinfo message, which
2481 * has no parameter 0"
2482 */
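      /* Worked example (illustration only): in SIMD8 with a one-register
       * header and mlen == 5, LOAD_PAYLOAD source (5 - 1) / 1 + 1 - 1 == 4
       * is the last parameter; if it is a zero immediate, mlen drops to 4,
       * and the loop repeats until a non-zero trailing parameter is found or
       * only the header plus parameter 0 remain.
       */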
2483 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2484 load_payload->src[(inst->mlen - inst->header_size) /
2485 (dispatch_width / 8) +
2486 inst->header_size - 1].is_zero()) {
2487 inst->mlen -= dispatch_width / 8;
2488 progress = true;
2489 }
2490 }
2491
2492 if (progress)
2493 invalidate_live_intervals();
2494
2495 return progress;
2496 }
2497
2498 /**
2499 * Optimize sample messages which are followed by the final RT write.
2500 *
2501  * CHV and Gen9+ can mark a texturing SEND instruction with EOT to have its
2502 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2503 * final texturing results copied to the framebuffer write payload and modify
2504 * them to write to the framebuffer directly.
2505 */
2506 bool
2507 fs_visitor::opt_sampler_eot()
2508 {
2509 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2510
2511 if (stage != MESA_SHADER_FRAGMENT)
2512 return false;
2513
2514 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2515 return false;
2516
2517 /* FINISHME: It should be possible to implement this optimization when there
2518 * are multiple drawbuffers.
2519 */
2520 if (key->nr_color_regions != 1)
2521 return false;
2522
2523 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2524 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2525 assert(fb_write->eot);
2526 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2527
2528 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2529
2530 /* There wasn't one; nothing to do. */
2531 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2532 return false;
2533
2534    /* This optimization doesn't seem to work for textureGather for some
2535     * reason. I can't find any documentation or known workaround indicating
2536     * that this is expected, but since a shader is unlikely to write
2537     * textureGather results directly to the framebuffer, we might as well
2538     * just disable it.
2539     */
2540 if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2541 tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2542 return false;
2543
2544 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2545 * It's very likely to be the previous instruction.
2546 */
2547 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2548 if (load_payload->is_head_sentinel() ||
2549 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2550 return false;
2551
2552 assert(!tex_inst->eot); /* We can't get here twice */
2553 assert((tex_inst->offset & (0xff << 24)) == 0);
2554
2555 tex_inst->offset |= fb_write->target << 24;
2556 tex_inst->eot = true;
2557 tex_inst->dst = bld.null_reg_ud();
2558 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2559
2560 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2561 * to create a new LOAD_PAYLOAD command with the same sources and a space
2562 * saved for the header. Using a new destination register not only makes sure
2563 * we have enough space, but it will make sure the dead code eliminator kills
2564 * the instruction that this will replace.
2565 */
2566 if (tex_inst->header_size != 0)
2567 return true;
2568
2569 fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
2570 load_payload->sources + 1);
2571 fs_reg *new_sources =
2572 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2573
2574 new_sources[0] = fs_reg();
2575 for (int i = 0; i < load_payload->sources; i++)
2576 new_sources[i+1] = load_payload->src[i];
2577
2578 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2579    * requires a lot of information about the sources to appropriately figure
2580    * out the number of registers needed. At this stage of optimization (after
2581    * copy propagation), the sources may no longer be laid out in the GRFs that
2582    * LOAD_PAYLOAD expects. Therefore, we need to
2583 * manually emit the instruction.
2584 */
2585 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2586 load_payload->exec_size,
2587 send_header,
2588 new_sources,
2589 load_payload->sources + 1);
2590
2591 new_load_payload->regs_written = load_payload->regs_written + 1;
2592 new_load_payload->header_size = 1;
2593 tex_inst->mlen++;
2594 tex_inst->header_size = 1;
2595 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2596 tex_inst->src[0] = send_header;
2597
2598 return true;
2599 }
2600
2601 bool
2602 fs_visitor::opt_register_renaming()
2603 {
2604 bool progress = false;
2605 int depth = 0;
2606
2607 int remap[alloc.count];
2608 memset(remap, -1, sizeof(int) * alloc.count);
2609
2610 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2611 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2612 depth++;
2613 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2614 inst->opcode == BRW_OPCODE_WHILE) {
2615 depth--;
2616 }
2617
2618 /* Rewrite instruction sources. */
2619 for (int i = 0; i < inst->sources; i++) {
2620 if (inst->src[i].file == GRF &&
2621 remap[inst->src[i].reg] != -1 &&
2622 remap[inst->src[i].reg] != inst->src[i].reg) {
2623 inst->src[i].reg = remap[inst->src[i].reg];
2624 progress = true;
2625 }
2626 }
2627
2628 const int dst = inst->dst.reg;
2629
2630 if (depth == 0 &&
2631 inst->dst.file == GRF &&
2632 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2633 !inst->is_partial_write()) {
2634 if (remap[dst] == -1) {
2635 remap[dst] = dst;
2636 } else {
2637 remap[dst] = alloc.allocate(inst->dst.width / 8);
2638 inst->dst.reg = remap[dst];
2639 progress = true;
2640 }
2641 } else if (inst->dst.file == GRF &&
2642 remap[dst] != -1 &&
2643 remap[dst] != dst) {
2644 inst->dst.reg = remap[dst];
2645 progress = true;
2646 }
2647 }
2648
2649 if (progress) {
2650 invalidate_live_intervals();
2651
2652 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2653 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2654 delta_xy[i].reg = remap[delta_xy[i].reg];
2655 }
2656 }
2657 }
2658
2659 return progress;
2660 }
2661
2662 /**
2663 * Remove redundant or useless discard jumps.
2664 *
2665 * For example, we can eliminate jumps in the following sequence:
2666 *
2667 * discard-jump (redundant with the next jump)
2668 * discard-jump (useless; jumps to the next instruction)
2669 * placeholder-halt
2670 */
2671 bool
2672 fs_visitor::opt_redundant_discard_jumps()
2673 {
2674 bool progress = false;
2675
2676 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2677
2678 fs_inst *placeholder_halt = NULL;
2679 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2680 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2681 placeholder_halt = inst;
2682 break;
2683 }
2684 }
2685
2686 if (!placeholder_halt)
2687 return false;
2688
2689 /* Delete any HALTs immediately before the placeholder halt. */
2690 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2691 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2692 prev = (fs_inst *) placeholder_halt->prev) {
2693 prev->remove(last_bblock);
2694 progress = true;
2695 }
2696
2697 if (progress)
2698 invalidate_live_intervals();
2699
2700 return progress;
2701 }
2702
2703 bool
2704 fs_visitor::compute_to_mrf()
2705 {
2706 bool progress = false;
2707 int next_ip = 0;
2708
2709 /* No MRFs on Gen >= 7. */
2710 if (devinfo->gen >= 7)
2711 return false;
2712
2713 calculate_live_intervals();
2714
2715 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2716 int ip = next_ip;
2717 next_ip++;
2718
2719 if (inst->opcode != BRW_OPCODE_MOV ||
2720 inst->is_partial_write() ||
2721 inst->dst.file != MRF || inst->src[0].file != GRF ||
2722 inst->dst.type != inst->src[0].type ||
2723 inst->src[0].abs || inst->src[0].negate ||
2724 !inst->src[0].is_contiguous() ||
2725 inst->src[0].subreg_offset)
2726 continue;
2727
2728 /* Work out which hardware MRF registers are written by this
2729 * instruction.
2730 */
2731 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2732 int mrf_high;
2733 if (inst->dst.reg & BRW_MRF_COMPR4) {
2734 mrf_high = mrf_low + 4;
2735 } else if (inst->exec_size == 16) {
2736 mrf_high = mrf_low + 1;
2737 } else {
2738 mrf_high = mrf_low;
2739 }
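      /* For example (hypothetical register numbers): a SIMD16 COMPR4 write
       * to m2 lands in m2 and m6, so mrf_low == 2 and mrf_high == 6; the
       * same write without COMPR4 lands in m2 and m3, giving mrf_high == 3.
       */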
2740
2741 /* Can't compute-to-MRF this GRF if someone else was going to
2742 * read it later.
2743 */
2744 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2745 continue;
2746
2747 /* Found a move of a GRF to a MRF. Let's see if we can go
2748 * rewrite the thing that made this GRF to write into the MRF.
2749 */
2750 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2751 if (scan_inst->dst.file == GRF &&
2752 scan_inst->dst.reg == inst->src[0].reg) {
2753 /* Found the last thing to write our reg we want to turn
2754 * into a compute-to-MRF.
2755 */
2756
2757 /* If this one instruction didn't populate all the
2758 * channels, bail. We might be able to rewrite everything
2759 * that writes that reg, but it would require smarter
2760 * tracking to delay the rewriting until complete success.
2761 */
2762 if (scan_inst->is_partial_write())
2763 break;
2764
2765 /* Things returning more than one register would need us to
2766 * understand coalescing out more than one MOV at a time.
2767 */
2768 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2769 break;
2770
2771 /* SEND instructions can't have MRF as a destination. */
2772 if (scan_inst->mlen)
2773 break;
2774
2775 if (devinfo->gen == 6) {
2776 /* gen6 math instructions must have the destination be
2777 * GRF, so no compute-to-MRF for them.
2778 */
2779 if (scan_inst->is_math()) {
2780 break;
2781 }
2782 }
2783
2784 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2785 /* Found the creator of our MRF's source value. */
2786 scan_inst->dst.file = MRF;
2787 scan_inst->dst.reg = inst->dst.reg;
2788 scan_inst->saturate |= inst->saturate;
2789 inst->remove(block);
2790 progress = true;
2791 }
2792 break;
2793 }
2794
2795 /* We don't handle control flow here. Most computation of
2796 * values that end up in MRFs are shortly before the MRF
2797 * write anyway.
2798 */
2799 if (block->start() == scan_inst)
2800 break;
2801
2802 /* You can't read from an MRF, so if someone else reads our
2803 * MRF's source GRF that we wanted to rewrite, that stops us.
2804 */
2805 bool interfered = false;
2806 for (int i = 0; i < scan_inst->sources; i++) {
2807 if (scan_inst->src[i].file == GRF &&
2808 scan_inst->src[i].reg == inst->src[0].reg &&
2809 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2810 interfered = true;
2811 }
2812 }
2813 if (interfered)
2814 break;
2815
2816 if (scan_inst->dst.file == MRF) {
2817 /* If somebody else writes our MRF here, we can't
2818 * compute-to-MRF before that.
2819 */
2820 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2821 int scan_mrf_high;
2822
2823 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2824 scan_mrf_high = scan_mrf_low + 4;
2825 } else if (scan_inst->exec_size == 16) {
2826 scan_mrf_high = scan_mrf_low + 1;
2827 } else {
2828 scan_mrf_high = scan_mrf_low;
2829 }
2830
2831 if (mrf_low == scan_mrf_low ||
2832 mrf_low == scan_mrf_high ||
2833 mrf_high == scan_mrf_low ||
2834 mrf_high == scan_mrf_high) {
2835 break;
2836 }
2837 }
2838
2839 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2840 /* Found a SEND instruction, which means that there are
2841 * live values in MRFs from base_mrf to base_mrf +
2842 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2843 * above it.
2844 */
2845 if (mrf_low >= scan_inst->base_mrf &&
2846 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2847 break;
2848 }
2849 if (mrf_high >= scan_inst->base_mrf &&
2850 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2851 break;
2852 }
2853 }
2854 }
2855 }
2856
2857 if (progress)
2858 invalidate_live_intervals();
2859
2860 return progress;
2861 }
2862
2863 /**
2864 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2865 * flow. We could probably do better here with some form of divergence
2866 * analysis.
2867 */
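/* Illustrative transformation (hypothetical register number):
 *
 *    find_live_channel(8) vgrf7:UD    ->    mov(8) vgrf7:UD 0u
 *
 * with force_writemask_all set on the MOV, on the assumption that channel 0
 * is enabled whenever execution is uniform.
 */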
2868 bool
2869 fs_visitor::eliminate_find_live_channel()
2870 {
2871 bool progress = false;
2872 unsigned depth = 0;
2873
2874 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2875 switch (inst->opcode) {
2876 case BRW_OPCODE_IF:
2877 case BRW_OPCODE_DO:
2878 depth++;
2879 break;
2880
2881 case BRW_OPCODE_ENDIF:
2882 case BRW_OPCODE_WHILE:
2883 depth--;
2884 break;
2885
2886 case FS_OPCODE_DISCARD_JUMP:
2887 /* This can potentially make control flow non-uniform until the end
2888 * of the program.
2889 */
2890 return progress;
2891
2892 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2893 if (depth == 0) {
2894 inst->opcode = BRW_OPCODE_MOV;
2895 inst->src[0] = fs_reg(0);
2896 inst->sources = 1;
2897 inst->force_writemask_all = true;
2898 progress = true;
2899 }
2900 break;
2901
2902 default:
2903 break;
2904 }
2905 }
2906
2907 return progress;
2908 }
2909
2910 /**
2911 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2912 * instructions to FS_OPCODE_REP_FB_WRITE.
2913 */
2914 void
2915 fs_visitor::emit_repclear_shader()
2916 {
2917 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2918 int base_mrf = 1;
2919 int color_mrf = base_mrf + 2;
2920
2921 fs_inst *mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)),
2922 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
2923
2924 fs_inst *write;
2925 if (key->nr_color_regions == 1) {
2926 write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2927 write->saturate = key->clamp_fragment_color;
2928 write->base_mrf = color_mrf;
2929 write->target = 0;
2930 write->header_size = 0;
2931 write->mlen = 1;
2932 } else {
2933 assume(key->nr_color_regions > 0);
2934 for (int i = 0; i < key->nr_color_regions; ++i) {
2935 write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2936 write->saturate = key->clamp_fragment_color;
2937 write->base_mrf = base_mrf;
2938 write->target = i;
2939 write->header_size = 2;
2940 write->mlen = 3;
2941 }
2942 }
2943 write->eot = true;
2944
2945 calculate_cfg();
2946
2947 assign_constant_locations();
2948 assign_curb_setup();
2949
2950 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2951 assert(mov->src[0].file == HW_REG);
2952 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2953 }
2954
2955 /**
2956 * Walks through basic blocks, looking for repeated MRF writes and
2957 * removing the later ones.
2958 */
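/* Illustrative example (made-up registers): given
 *
 *    mov(8) m3 vgrf5
 *    mov(8) m3 vgrf5
 *
 * the second MOV is removed, provided nothing wrote vgrf5 or m3 in between
 * and no control flow separates the two.
 */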
2959 bool
2960 fs_visitor::remove_duplicate_mrf_writes()
2961 {
2962 fs_inst *last_mrf_move[16];
2963 bool progress = false;
2964
2965 /* Need to update the MRF tracking for compressed instructions. */
2966 if (dispatch_width == 16)
2967 return false;
2968
2969 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2970
2971 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2972 if (inst->is_control_flow()) {
2973 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2974 }
2975
2976 if (inst->opcode == BRW_OPCODE_MOV &&
2977 inst->dst.file == MRF) {
2978 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2979 if (prev_inst && inst->equals(prev_inst)) {
2980 inst->remove(block);
2981 progress = true;
2982 continue;
2983 }
2984 }
2985
2986 /* Clear out the last-write records for MRFs that were overwritten. */
2987 if (inst->dst.file == MRF) {
2988 last_mrf_move[inst->dst.reg] = NULL;
2989 }
2990
2991 if (inst->mlen > 0 && inst->base_mrf != -1) {
2992 /* Found a SEND instruction, which will include two or fewer
2993 * implied MRF writes. We could do better here.
2994 */
2995 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2996 last_mrf_move[inst->base_mrf + i] = NULL;
2997 }
2998 }
2999
3000 /* Clear out any MRF move records whose sources got overwritten. */
3001 if (inst->dst.file == GRF) {
3002 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3003 if (last_mrf_move[i] &&
3004 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3005 last_mrf_move[i] = NULL;
3006 }
3007 }
3008 }
3009
3010 if (inst->opcode == BRW_OPCODE_MOV &&
3011 inst->dst.file == MRF &&
3012 inst->src[0].file == GRF &&
3013 !inst->is_partial_write()) {
3014 last_mrf_move[inst->dst.reg] = inst;
3015 }
3016 }
3017
3018 if (progress)
3019 invalidate_live_intervals();
3020
3021 return progress;
3022 }
3023
3024 static void
3025 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3026 {
3027 /* Clear the flag for registers that actually got read (as expected). */
3028 for (int i = 0; i < inst->sources; i++) {
3029 int grf;
3030 if (inst->src[i].file == GRF) {
3031 grf = inst->src[i].reg;
3032 } else if (inst->src[i].file == HW_REG &&
3033 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3034 grf = inst->src[i].fixed_hw_reg.nr;
3035 } else {
3036 continue;
3037 }
3038
3039 if (grf >= first_grf &&
3040 grf < first_grf + grf_len) {
3041 deps[grf - first_grf] = false;
3042 if (inst->exec_size == 16)
3043 deps[grf - first_grf + 1] = false;
3044 }
3045 }
3046 }
3047
3048 /**
3049 * Implements this workaround for the original 965:
3050 *
3051 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3052 * check for post destination dependencies on this instruction, software
3053 * must ensure that there is no destination hazard for the case of ‘write
3054 * followed by a posted write’ shown in the following example.
3055 *
3056 * 1. mov r3 0
3057 * 2. send r3.xy <rest of send instruction>
3058 * 3. mov r2 r3
3059 *
3060 * Due to no post-destination dependency check on the ‘send’, the above
3061 * code sequence could have two instructions (1 and 2) in flight at the
3062 * same time that both consider ‘r3’ as the target of their final writes.
3063 */
3064 void
3065 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3066 fs_inst *inst)
3067 {
3068 int write_len = inst->regs_written;
3069 int first_write_grf = inst->dst.reg;
3070 bool needs_dep[BRW_MAX_MRF];
3071 assert(write_len < (int)sizeof(needs_dep) - 1);
3072
3073 memset(needs_dep, false, sizeof(needs_dep));
3074 memset(needs_dep, true, write_len);
3075
3076 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3077
3078 /* Walk backwards looking for writes to registers we're writing which
3079 * aren't read since being written. If we hit the start of the program,
3080 * we assume that there are no outstanding dependencies on entry to the
3081 * program.
3082 */
3083 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3084 /* If we hit control flow, assume that there *are* outstanding
3085 * dependencies, and force their cleanup before our instruction.
3086 */
3087 if (block->start() == scan_inst) {
3088 for (int i = 0; i < write_len; i++) {
3089 if (needs_dep[i])
3090 DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
3091 }
3092 return;
3093 }
3094
3095 /* We insert our reads as late as possible on the assumption that any
3096 * instruction but a MOV that might have left us an outstanding
3097 * dependency has more latency than a MOV.
3098 */
3099 if (scan_inst->dst.file == GRF) {
3100 for (int i = 0; i < scan_inst->regs_written; i++) {
3101 int reg = scan_inst->dst.reg + i;
3102
3103 if (reg >= first_write_grf &&
3104 reg < first_write_grf + write_len &&
3105 needs_dep[reg - first_write_grf]) {
3106 DEP_RESOLVE_MOV(bld.at(block, inst), reg);
3107 needs_dep[reg - first_write_grf] = false;
3108 if (scan_inst->exec_size == 16)
3109 needs_dep[reg - first_write_grf + 1] = false;
3110 }
3111 }
3112 }
3113
3114 /* Clear the flag for registers that actually got read (as expected). */
3115 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3116
3117 /* Continue the loop only if we haven't resolved all the dependencies */
3118 int i;
3119 for (i = 0; i < write_len; i++) {
3120 if (needs_dep[i])
3121 break;
3122 }
3123 if (i == write_len)
3124 return;
3125 }
3126 }
3127
3128 /**
3129 * Implements this workaround for the original 965:
3130 *
3131 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3132 * used as a destination register until after it has been sourced by an
3133 * instruction with a different destination register.
3134 */
3135 void
3136 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3137 {
3138 int write_len = inst->regs_written;
3139 int first_write_grf = inst->dst.reg;
3140 bool needs_dep[BRW_MAX_MRF];
3141 assert(write_len < (int)sizeof(needs_dep) - 1);
3142
3143 memset(needs_dep, false, sizeof(needs_dep));
3144 memset(needs_dep, true, write_len);
3145 /* Walk forwards looking for writes to registers we're writing which aren't
3146 * read before being written.
3147 */
3148 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3149 /* If we hit control flow, force resolve all remaining dependencies. */
3150 if (block->end() == scan_inst) {
3151 for (int i = 0; i < write_len; i++) {
3152 if (needs_dep[i])
3153 DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
3154 }
3155 return;
3156 }
3157
3158 /* Clear the flag for registers that actually got read (as expected). */
3159 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3160
3161 /* We insert our reads as late as possible since they're reading the
3162 * result of a SEND, which has massive latency.
3163 */
3164 if (scan_inst->dst.file == GRF &&
3165 scan_inst->dst.reg >= first_write_grf &&
3166 scan_inst->dst.reg < first_write_grf + write_len &&
3167 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3168 DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
3169 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3170 }
3171
3172 /* Continue the loop only if we haven't resolved all the dependencies */
3173 int i;
3174 for (i = 0; i < write_len; i++) {
3175 if (needs_dep[i])
3176 break;
3177 }
3178 if (i == write_len)
3179 return;
3180 }
3181 }
3182
3183 void
3184 fs_visitor::insert_gen4_send_dependency_workarounds()
3185 {
3186 if (devinfo->gen != 4 || devinfo->is_g4x)
3187 return;
3188
3189 bool progress = false;
3190
3191 /* Note that we're done with register allocation, so GRF fs_regs always
3192 * have a .reg_offset of 0.
3193 */
3194
3195 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3196 if (inst->mlen != 0 && inst->dst.file == GRF) {
3197 insert_gen4_pre_send_dependency_workarounds(block, inst);
3198 insert_gen4_post_send_dependency_workarounds(block, inst);
3199 progress = true;
3200 }
3201 }
3202
3203 if (progress)
3204 invalidate_live_intervals();
3205 }
3206
3207 /**
3208 * Turns the generic expression-style uniform pull constant load instruction
3209 * into a hardware-specific series of instructions for loading a pull
3210 * constant.
3211 *
3212 * The expression style allows the CSE pass before this to optimize out
3213 * repeated loads from the same offset, and gives the pre-register-allocation
3214 * scheduling full flexibility, while the conversion to native instructions
3215 * allows the post-register-allocation scheduler the best information
3216 * possible.
3217 *
3218 * Note that execution masking for setting up pull constant loads is special:
3219 * the channels that need to be written are unrelated to the current execution
3220 * mask, since a later instruction will use one of the result channels as a
3221 * source operand for all 8 or 16 of its channels.
3222 */
3223 void
3224 fs_visitor::lower_uniform_pull_constant_loads()
3225 {
3226 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3227 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3228 continue;
3229
3230 if (devinfo->gen >= 7) {
3231          /* Up to this point, the offset arg has been a vec4-aligned byte
3232           * offset. We need to turn it into a dword offset.
3233 */
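         /* Worked example: a vec4-aligned byte offset of 48 becomes dword
          * offset 48 / 4 == 12.
          */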
3234 fs_reg const_offset_reg = inst->src[1];
3235 assert(const_offset_reg.file == IMM &&
3236 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3237 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3238 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3239
3240 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3241 * Reserve space for the register.
3242 */
3243 if (devinfo->gen >= 9) {
3244 payload.reg_offset++;
3245 alloc.sizes[payload.reg] = 2;
3246 }
3247
3248 /* This is actually going to be a MOV, but since only the first dword
3249 * is accessed, we have a special opcode to do just that one. Note
3250 * that this needs to be an operation that will be considered a def
3251 * by live variable analysis, or register allocation will explode.
3252 */
3253 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3254 8, payload, const_offset_reg);
3255 setup->force_writemask_all = true;
3256
3257 setup->ir = inst->ir;
3258 setup->annotation = inst->annotation;
3259 inst->insert_before(block, setup);
3260
3261 /* Similarly, this will only populate the first 4 channels of the
3262 * result register (since we only use smear values from 0-3), but we
3263 * don't tell the optimizer.
3264 */
3265 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3266 inst->src[1] = payload;
3267
3268 invalidate_live_intervals();
3269 } else {
3270 /* Before register allocation, we didn't tell the scheduler about the
3271 * MRF we use. We know it's safe to use this MRF because nothing
3272 * else does except for register spill/unspill, which generates and
3273 * uses its MRF within a single IR instruction.
3274 */
3275 inst->base_mrf = 14;
3276 inst->mlen = 1;
3277 }
3278 }
3279 }
3280
3281 bool
3282 fs_visitor::lower_load_payload()
3283 {
3284 bool progress = false;
3285
3286 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3287 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3288 continue;
3289
3290 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3291 assert(inst->saturate == false);
3292
3293 const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
3294 .exec_all(inst->force_writemask_all)
3295 .at(block, inst);
3296 fs_reg dst = inst->dst;
3297
3298 /* Get rid of COMPR4. We'll add it back in if we need it */
3299 if (dst.file == MRF)
3300 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
3301
3302 dst.width = 8;
3303 for (uint8_t i = 0; i < inst->header_size; i++) {
3304 if (inst->src[i].file != BAD_FILE) {
3305 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3306 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3307 mov_src.width = 8;
3308 ibld.exec_all().MOV(mov_dst, mov_src);
3309 }
3310 dst = offset(dst, 1);
3311 }
3312
3313 dst.width = inst->exec_size;
3314 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
3315 inst->exec_size > 8) {
3316 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3317 * a straightforward copy. Instead, the result of the
3318 * LOAD_PAYLOAD is treated as interleaved and the first four
3319 * non-header sources are unpacked as:
3320 *
3321 * m + 0: r0
3322 * m + 1: g0
3323 * m + 2: b0
3324 * m + 3: a0
3325 * m + 4: r1
3326 * m + 5: g1
3327 * m + 6: b1
3328 * m + 7: a1
3329 *
3330 * This is used for gen <= 5 fb writes.
3331 */
3332 assert(inst->exec_size == 16);
3333 assert(inst->header_size + 4 <= inst->sources);
3334 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3335 if (inst->src[i].file != BAD_FILE) {
3336 if (devinfo->has_compr4) {
3337 fs_reg compr4_dst = retype(dst, inst->src[i].type);
3338 compr4_dst.reg |= BRW_MRF_COMPR4;
3339 ibld.MOV(compr4_dst, inst->src[i]);
3340 } else {
3341 /* Platform doesn't have COMPR4. We have to fake it */
3342 fs_reg mov_dst = retype(dst, inst->src[i].type);
3343 mov_dst.width = 8;
3344 ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
3345 ibld.half(1).MOV(offset(mov_dst, 4), half(inst->src[i], 1));
3346 }
3347 }
3348
3349 dst.reg++;
3350 }
3351
3352 /* The loop above only ever incremented us through the first set
3353 * of 4 registers. However, thanks to the magic of COMPR4, we
3354 * actually wrote to the first 8 registers, so we need to take
3355 * that into account now.
3356 */
3357 dst.reg += 4;
3358
3359 /* The COMPR4 code took care of the first 4 sources. We'll let
3360 * the regular path handle any remaining sources. Yes, we are
3361 * modifying the instruction but we're about to delete it so
3362 * this really doesn't hurt anything.
3363 */
3364 inst->header_size += 4;
3365 }
3366
3367 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3368 if (inst->src[i].file != BAD_FILE)
3369 ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
3370 dst = offset(dst, 1);
3371 }
3372
3373 inst->remove(block);
3374 progress = true;
3375 }
3376
3377 if (progress)
3378 invalidate_live_intervals();
3379
3380 return progress;
3381 }
3382
3383 bool
3384 fs_visitor::lower_integer_multiplication()
3385 {
3386 bool progress = false;
3387
3388 /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3389 * directly, but Cherryview cannot.
3390 */
3391 if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3392 return false;
3393
3394 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3395 if (inst->opcode != BRW_OPCODE_MUL ||
3396 inst->dst.is_accumulator() ||
3397 (inst->dst.type != BRW_REGISTER_TYPE_D &&
3398 inst->dst.type != BRW_REGISTER_TYPE_UD))
3399 continue;
3400
3401 const fs_builder ibld = bld.at(block, inst);
3402
3403 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3404 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3405 * src1 are used.
3406 *
3407 * If multiplying by an immediate value that fits in 16-bits, do a
3408 * single MUL instruction with that value in the proper location.
3409 */
3410 if (inst->src[1].file == IMM &&
3411 inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3412 if (devinfo->gen < 7) {
3413 fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3414 inst->dst.type, dispatch_width);
3415 ibld.MOV(imm, inst->src[1]);
3416 ibld.MUL(inst->dst, imm, inst->src[0]);
3417 } else {
3418 ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
3419 }
3420 } else {
3421 /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3422 * do 32-bit integer multiplication in one instruction, but instead
3423 * must do a sequence (which actually calculates a 64-bit result):
3424 *
3425 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3426 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3427 * mov(8) g2<1>D acc0<8,8,1>D
3428 *
3429        * But on Gen > 6, the ability to use the second accumulator register
3430 * (acc1) for non-float data types was removed, preventing a simple
3431 * implementation in SIMD16. A 16-channel result can be calculated by
3432 * executing the three instructions twice in SIMD8, once with quarter
3433 * control of 1Q for the first eight channels and again with 2Q for
3434 * the second eight channels.
3435 *
3436 * Which accumulator register is implicitly accessed (by AccWrEnable
3437 * for instance) is determined by the quarter control. Unfortunately
3438 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3439 * implicit accumulator access by an instruction with 2Q will access
3440 * acc1 regardless of whether the data type is usable in acc1.
3441 *
3442 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3443 * integer data types.
3444 *
3445 * Since we only want the low 32-bits of the result, we can do two
3446 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3447 * adjust the high result and add them (like the mach is doing):
3448 *
3449 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3450 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3451 * shl(8) g9<1>D g8<8,8,1>D 16D
3452 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3453 *
3454 * We avoid the shl instruction by realizing that we only want to add
3455 * the low 16-bits of the "high" result to the high 16-bits of the
3456 * "low" result and using proper regioning on the add:
3457 *
3458 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3459 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3460 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3461 *
3462 * Since it does not use the (single) accumulator register, we can
3463 * schedule multi-component multiplications much better.
3464 */
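         /* A rough numeric check (not from the PRM): writing one operand as
          * hi * 0x10000 + lo, the low 32 bits of the product equal
          * other * lo + ((other * hi) << 16) mod 2^32, and only the low 16
          * bits of other * hi survive the shift, which is what the two
          * word-wise MULs and the strided UW ADD below compute.
          */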
3465
3466 if (inst->conditional_mod && inst->dst.is_null()) {
3467 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3468 inst->dst.type, dispatch_width);
3469 }
3470 fs_reg low = inst->dst;
3471 fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3472 inst->dst.type, dispatch_width);
3473
3474          if (devinfo->gen >= 7) {
3475 fs_reg src1_0_w = inst->src[1];
3476 fs_reg src1_1_w = inst->src[1];
3477
3478 if (inst->src[1].file == IMM) {
3479 src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3480 src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3481 } else {
3482 src1_0_w.type = BRW_REGISTER_TYPE_UW;
3483 src1_0_w.stride = 2;
3484
3485 src1_1_w.type = BRW_REGISTER_TYPE_UW;
3486 src1_1_w.stride = 2;
3487 src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3488 }
3489 ibld.MUL(low, inst->src[0], src1_0_w);
3490 ibld.MUL(high, inst->src[0], src1_1_w);
3491 } else {
3492 fs_reg src0_0_w = inst->src[0];
3493 fs_reg src0_1_w = inst->src[0];
3494
3495 src0_0_w.type = BRW_REGISTER_TYPE_UW;
3496 src0_0_w.stride = 2;
3497
3498 src0_1_w.type = BRW_REGISTER_TYPE_UW;
3499 src0_1_w.stride = 2;
3500 src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3501
3502 ibld.MUL(low, src0_0_w, inst->src[1]);
3503 ibld.MUL(high, src0_1_w, inst->src[1]);
3504 }
3505
3506 fs_reg dst = inst->dst;
3507 dst.type = BRW_REGISTER_TYPE_UW;
3508 dst.subreg_offset = 2;
3509 dst.stride = 2;
3510
3511 high.type = BRW_REGISTER_TYPE_UW;
3512 high.stride = 2;
3513
3514 low.type = BRW_REGISTER_TYPE_UW;
3515 low.subreg_offset = 2;
3516 low.stride = 2;
3517
3518 ibld.ADD(dst, low, high);
3519
3520 if (inst->conditional_mod) {
3521 fs_reg null(retype(brw_null_reg(), inst->dst.type));
3522 set_condmod(inst->conditional_mod,
3523 ibld.MOV(null, inst->dst));
3524 }
3525 }
3526
3527 inst->remove(block);
3528 progress = true;
3529 }
3530
3531 if (progress)
3532 invalidate_live_intervals();
3533
3534 return progress;
3535 }
3536
3537 void
3538 fs_visitor::dump_instructions()
3539 {
3540 dump_instructions(NULL);
3541 }
3542
3543 void
3544 fs_visitor::dump_instructions(const char *name)
3545 {
3546 FILE *file = stderr;
3547 if (name && geteuid() != 0) {
3548 file = fopen(name, "w");
3549 if (!file)
3550 file = stderr;
3551 }
3552
3553 if (cfg) {
3554 calculate_register_pressure();
3555 int ip = 0, max_pressure = 0;
3556 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3557 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3558 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3559 dump_instruction(inst, file);
3560 ip++;
3561 }
3562 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3563 } else {
3564 int ip = 0;
3565 foreach_in_list(backend_instruction, inst, &instructions) {
3566 fprintf(file, "%4d: ", ip++);
3567 dump_instruction(inst, file);
3568 }
3569 }
3570
3571 if (file != stderr) {
3572 fclose(file);
3573 }
3574 }
3575
3576 void
3577 fs_visitor::dump_instruction(backend_instruction *be_inst)
3578 {
3579 dump_instruction(be_inst, stderr);
3580 }
3581
3582 void
3583 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3584 {
3585 fs_inst *inst = (fs_inst *)be_inst;
3586
3587 if (inst->predicate) {
3588 fprintf(file, "(%cf0.%d) ",
3589 inst->predicate_inverse ? '-' : '+',
3590 inst->flag_subreg);
3591 }
3592
3593 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3594 if (inst->saturate)
3595 fprintf(file, ".sat");
3596 if (inst->conditional_mod) {
3597 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3598 if (!inst->predicate &&
3599 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3600 inst->opcode != BRW_OPCODE_IF &&
3601 inst->opcode != BRW_OPCODE_WHILE))) {
3602 fprintf(file, ".f0.%d", inst->flag_subreg);
3603 }
3604 }
3605 fprintf(file, "(%d) ", inst->exec_size);
3606
3607 if (inst->mlen) {
3608 fprintf(file, "(mlen: %d) ", inst->mlen);
3609 }
3610
3611 switch (inst->dst.file) {
3612 case GRF:
3613 fprintf(file, "vgrf%d", inst->dst.reg);
3614 if (inst->dst.width != dispatch_width)
3615 fprintf(file, "@%d", inst->dst.width);
3616 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3617 inst->dst.subreg_offset)
3618 fprintf(file, "+%d.%d",
3619 inst->dst.reg_offset, inst->dst.subreg_offset);
3620 break;
3621 case MRF:
3622 fprintf(file, "m%d", inst->dst.reg);
3623 break;
3624 case BAD_FILE:
3625 fprintf(file, "(null)");
3626 break;
3627 case UNIFORM:
3628 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3629 break;
3630 case ATTR:
3631 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3632 break;
3633 case HW_REG:
3634 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3635 switch (inst->dst.fixed_hw_reg.nr) {
3636 case BRW_ARF_NULL:
3637 fprintf(file, "null");
3638 break;
3639 case BRW_ARF_ADDRESS:
3640 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3641 break;
3642 case BRW_ARF_ACCUMULATOR:
3643 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3644 break;
3645 case BRW_ARF_FLAG:
3646 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3647 inst->dst.fixed_hw_reg.subnr);
3648 break;
3649 default:
3650 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3651 inst->dst.fixed_hw_reg.subnr);
3652 break;
3653 }
3654 } else {
3655 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3656 }
3657 if (inst->dst.fixed_hw_reg.subnr)
3658 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3659 break;
3660 default:
3661 fprintf(file, "???");
3662 break;
3663 }
3664 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3665
3666 for (int i = 0; i < inst->sources; i++) {
3667 if (inst->src[i].negate)
3668 fprintf(file, "-");
3669 if (inst->src[i].abs)
3670 fprintf(file, "|");
3671 switch (inst->src[i].file) {
3672 case GRF:
3673 fprintf(file, "vgrf%d", inst->src[i].reg);
3674 if (inst->src[i].width != dispatch_width)
3675 fprintf(file, "@%d", inst->src[i].width);
3676 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3677 inst->src[i].subreg_offset)
3678 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3679 inst->src[i].subreg_offset);
3680 break;
3681 case MRF:
3682 fprintf(file, "***m%d***", inst->src[i].reg);
3683 break;
3684 case ATTR:
3685 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3686 break;
3687 case UNIFORM:
3688 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3689 if (inst->src[i].reladdr) {
3690 fprintf(file, "+reladdr");
3691 } else if (inst->src[i].subreg_offset) {
3692 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3693 inst->src[i].subreg_offset);
3694 }
3695 break;
3696 case BAD_FILE:
3697 fprintf(file, "(null)");
3698 break;
3699 case IMM:
3700 switch (inst->src[i].type) {
3701 case BRW_REGISTER_TYPE_F:
3702 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3703 break;
3704 case BRW_REGISTER_TYPE_W:
3705 case BRW_REGISTER_TYPE_D:
3706 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3707 break;
3708 case BRW_REGISTER_TYPE_UW:
3709 case BRW_REGISTER_TYPE_UD:
3710 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3711 break;
3712 case BRW_REGISTER_TYPE_VF:
3713 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3714 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3715 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3716 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3717 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3718 break;
3719 default:
3720 fprintf(file, "???");
3721 break;
3722 }
3723 break;
3724 case HW_REG:
3725 if (inst->src[i].fixed_hw_reg.negate)
3726 fprintf(file, "-");
3727 if (inst->src[i].fixed_hw_reg.abs)
3728 fprintf(file, "|");
3729 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3730 switch (inst->src[i].fixed_hw_reg.nr) {
3731 case BRW_ARF_NULL:
3732 fprintf(file, "null");
3733 break;
3734 case BRW_ARF_ADDRESS:
3735 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3736 break;
3737 case BRW_ARF_ACCUMULATOR:
3738 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3739 break;
3740 case BRW_ARF_FLAG:
3741 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3742 inst->src[i].fixed_hw_reg.subnr);
3743 break;
3744 default:
3745 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3746 inst->src[i].fixed_hw_reg.subnr);
3747 break;
3748 }
3749 } else {
3750 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3751 }
3752 if (inst->src[i].fixed_hw_reg.subnr)
3753 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3754 if (inst->src[i].fixed_hw_reg.abs)
3755 fprintf(file, "|");
3756 break;
3757 default:
3758 fprintf(file, "???");
3759 break;
3760 }
3761 if (inst->src[i].abs)
3762 fprintf(file, "|");
3763
3764 if (inst->src[i].file != IMM) {
3765 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3766 }
3767
3768 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3769 fprintf(file, ", ");
3770 }
3771
3772 fprintf(file, " ");
3773
3774 if (dispatch_width == 16 && inst->exec_size == 8) {
3775 if (inst->force_sechalf)
3776 fprintf(file, "2ndhalf ");
3777 else
3778 fprintf(file, "1sthalf ");
3779 }
3780
3781 fprintf(file, "\n");
3782 }
3783
3784 /**
3785 * Possibly returns an instruction that set up @param reg.
3786 *
3787 * Sometimes we want to take the result of some expression/variable
3788 * dereference tree and rewrite the instruction generating the result
3789 * of the tree. When processing the tree, we know that the
3790 * instructions generated are all writing temporaries that are dead
3791 * outside of this tree. So, if we have some instructions that write
3792 * a temporary, we're free to point that temp write somewhere else.
3793 *
3794  * Note that this doesn't guarantee that the returned instruction wrote
3795  * only reg -- it might be the size=4 destination of a texture instruction.
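 *
 * A hypothetical caller (names invented for illustration) would use it as:
 *
 *    fs_inst *mod = get_instruction_generating_reg(start, end, result);
 *    if (mod)
 *       mod->dst = final_dst;        /* retarget the temporary write */
 *    else
 *       bld.MOV(final_dst, result);  /* otherwise emit an extra MOV */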
3796 */
3797 fs_inst *
3798 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3799 fs_inst *end,
3800 const fs_reg &reg)
3801 {
3802 if (end == start ||
3803 end->is_partial_write() ||
3804 reg.reladdr ||
3805 !reg.equals(end->dst)) {
3806 return NULL;
3807 } else {
3808 return end;
3809 }
3810 }
3811
3812 void
3813 fs_visitor::setup_payload_gen6()
3814 {
3815 bool uses_depth =
3816 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3817 unsigned barycentric_interp_modes =
3818 (stage == MESA_SHADER_FRAGMENT) ?
3819 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3820
3821 assert(devinfo->gen >= 6);
3822
3823 /* R0-1: masks, pixel X/Y coordinates. */
3824 payload.num_regs = 2;
3825    /* R2: only for 32-pixel dispatch. */
3826
3827 /* R3-26: barycentric interpolation coordinates. These appear in the
3828 * same order that they appear in the brw_wm_barycentric_interp_mode
3829 * enum. Each set of coordinates occupies 2 registers if dispatch width
3830 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3831 * appear if they were enabled using the "Barycentric Interpolation
3832 * Mode" bits in WM_STATE.
3833 */
3834 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3835 if (barycentric_interp_modes & (1 << i)) {
3836 payload.barycentric_coord_reg[i] = payload.num_regs;
3837 payload.num_regs += 2;
3838 if (dispatch_width == 16) {
3839 payload.num_regs += 2;
3840 }
3841 }
3842 }
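   /* For example (illustrative): a SIMD16 shader that only uses the
    * perspective pixel barycentrics ends up with
    * payload.barycentric_coord_reg[0] == 2 and payload.num_regs == 6 here.
    */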
3843
3844    /* R27: interpolated depth if the shader uses source depth. */
3845 if (uses_depth) {
3846 payload.source_depth_reg = payload.num_regs;
3847 payload.num_regs++;
3848 if (dispatch_width == 16) {
3849 /* R28: interpolated depth if not SIMD8. */
3850 payload.num_regs++;
3851 }
3852 }
3853 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3854 if (uses_depth) {
3855 payload.source_w_reg = payload.num_regs;
3856 payload.num_regs++;
3857 if (dispatch_width == 16) {
3858 /* R30: interpolated W if not SIMD8. */
3859 payload.num_regs++;
3860 }
3861 }
3862
3863 if (stage == MESA_SHADER_FRAGMENT) {
3864 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3865 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3866 prog_data->uses_pos_offset = key->compute_pos_offset;
3867 /* R31: MSAA position offsets. */
3868 if (prog_data->uses_pos_offset) {
3869 payload.sample_pos_reg = payload.num_regs;
3870 payload.num_regs++;
3871 }
3872 }
3873
3874 /* R32: MSAA input coverage mask */
3875 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3876 assert(devinfo->gen >= 7);
3877 payload.sample_mask_in_reg = payload.num_regs;
3878 payload.num_regs++;
3879 if (dispatch_width == 16) {
3880 /* R33: input coverage mask if not SIMD8. */
3881 payload.num_regs++;
3882 }
3883 }
3884
3885 /* R34-: bary for 32-pixel. */
3886 /* R58-59: interp W for 32-pixel. */
3887
3888 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3889 source_depth_to_render_target = true;
3890 }
3891 }
3892
3893 void
3894 fs_visitor::setup_vs_payload()
3895 {
3896 /* R0: thread header, R1: urb handles */
3897 payload.num_regs = 2;
3898 }
3899
3900 void
3901 fs_visitor::setup_cs_payload()
3902 {
3903 assert(brw->gen >= 7);
3904
3905 payload.num_regs = 1;
3906 }
3907
3908 void
3909 fs_visitor::assign_binding_table_offsets()
3910 {
3911 assert(stage == MESA_SHADER_FRAGMENT);
3912 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3913 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3914 uint32_t next_binding_table_offset = 0;
3915
3916 /* If there are no color regions, we still perform an FB write to a null
3917 * renderbuffer, which we place at surface index 0.
3918 */
3919 prog_data->binding_table.render_target_start = next_binding_table_offset;
3920 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3921
3922 assign_common_binding_table_offsets(next_binding_table_offset);
3923 }
3924
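/**
 * Compute an estimate of register pressure at each instruction: for every
 * virtual GRF, its allocated size is added to regs_live_at_ip[] for all
 * instruction positions covered by its live interval.
 */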
3925 void
3926 fs_visitor::calculate_register_pressure()
3927 {
3928 invalidate_live_intervals();
3929 calculate_live_intervals();
3930
3931 unsigned num_instructions = 0;
3932 foreach_block(block, cfg)
3933 num_instructions += block->instructions.length();
3934
3935 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3936
3937 for (unsigned reg = 0; reg < alloc.count; reg++) {
3938 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3939 regs_live_at_ip[ip] += alloc.sizes[reg];
3940 }
3941 }
3942
3943 void
3944 fs_visitor::optimize()
3945 {
3946    /* bld is the common builder object we used while translating the program
3947     * into i965 IR; it still points at the end of the program.  For the
3948     * optimization and lowering passes coming next, any code added after the
3949     * end of the program without explicitly calling fs_builder::at() clearly
3950     * points at a mistake.  Ideally optimization passes wouldn't be part of
3951     * the visitor so they wouldn't have access to bld at all, but they do, so
3952     * just in case some pass forgets to ask for a location explicitly, set it
3953     * to NULL here to make it trip.
3954 */
3955 bld = bld.at(NULL, NULL);
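   /* A pass that does need to emit code is expected to re-point the builder
    * itself first, e.g. (sketch only):
    *
    *    const fs_builder ibld = bld.at(block, inst);
    *    ibld.MOV(tmp, src);
    */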
3956
3957 split_virtual_grfs();
3958
3959 move_uniform_array_access_to_pull_constants();
3960 assign_constant_locations();
3961 demote_pull_constants();
3962
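/* Run one optimization pass.  When the DEBUG_OPTIMIZER flag is set and the
 * pass reported progress, dump the instruction stream to a file named after
 * the stage, dispatch width, program name, iteration and pass.  Evaluates to
 * whether the pass made progress.
 */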
3963 #define OPT(pass, args...) ({ \
3964 pass_num++; \
3965 bool this_progress = pass(args); \
3966 \
3967 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3968 char filename[64]; \
3969 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3970 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3971 \
3972 backend_shader::dump_instructions(filename); \
3973 } \
3974 \
3975 progress = progress || this_progress; \
3976 this_progress; \
3977 })
3978
3979 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3980 char filename[64];
3981 snprintf(filename, 64, "%s%d-%04d-00-start",
3982 stage_abbrev, dispatch_width,
3983 shader_prog ? shader_prog->Name : 0);
3984
3985 backend_shader::dump_instructions(filename);
3986 }
3987
3988 bool progress;
3989 int iteration = 0;
3990 int pass_num = 0;
3991 do {
3992 progress = false;
3993 pass_num = 0;
3994 iteration++;
3995
3996 OPT(remove_duplicate_mrf_writes);
3997
3998 OPT(opt_algebraic);
3999 OPT(opt_cse);
4000 OPT(opt_copy_propagate);
4001 OPT(opt_peephole_predicated_break);
4002 OPT(opt_cmod_propagation);
4003 OPT(dead_code_eliminate);
4004 OPT(opt_peephole_sel);
4005 OPT(dead_control_flow_eliminate, this);
4006 OPT(opt_register_renaming);
4007 OPT(opt_redundant_discard_jumps);
4008 OPT(opt_saturate_propagation);
4009 OPT(opt_zero_samples);
4010 OPT(register_coalesce);
4011 OPT(compute_to_mrf);
4012 OPT(eliminate_find_live_channel);
4013
4014 OPT(compact_virtual_grfs);
4015 } while (progress);
4016
4017 pass_num = 0;
4018
4019 OPT(opt_sampler_eot);
4020
4021 if (OPT(lower_load_payload)) {
4022 split_virtual_grfs();
4023 OPT(register_coalesce);
4024 OPT(compute_to_mrf);
4025 OPT(dead_code_eliminate);
4026 }
4027
4028 OPT(opt_combine_constants);
4029 OPT(lower_integer_multiplication);
4030
4031 lower_uniform_pull_constant_loads();
4032 }
4033
4034 /**
4035  * Three-source instructions must have a GRF/MRF destination register;
4036  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
4037 */
4038 void
4039 fs_visitor::fixup_3src_null_dest()
4040 {
4041 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4042 if (inst->is_3src() && inst->dst.is_null()) {
4043 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4044 inst->dst.type);
4045 }
4046 }
4047 }
4048
4049 void
4050 fs_visitor::allocate_registers()
4051 {
4052 bool allocated_without_spills;
4053
4054 static const enum instruction_scheduler_mode pre_modes[] = {
4055 SCHEDULE_PRE,
4056 SCHEDULE_PRE_NON_LIFO,
4057 SCHEDULE_PRE_LIFO,
4058 };
4059
4060 /* Try each scheduling heuristic to see if it can successfully register
4061 * allocate without spilling. They should be ordered by decreasing
4062 * performance but increasing likelihood of allocating.
4063 */
4064 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4065 schedule_instructions(pre_modes[i]);
4066
4067 if (0) {
4068 assign_regs_trivial();
4069 allocated_without_spills = true;
4070 } else {
4071 allocated_without_spills = assign_regs(false);
4072 }
4073 if (allocated_without_spills)
4074 break;
4075 }
4076
4077 if (!allocated_without_spills) {
4078 /* We assume that any spilling is worse than just dropping back to
4079 * SIMD8. There's probably actually some intermediate point where
4080 * SIMD16 with a couple of spills is still better.
4081 */
4082 if (dispatch_width == 16) {
4083 fail("Failure to register allocate. Reduce number of "
4084 "live scalar values to avoid this.");
4085 } else {
4086 perf_debug("%s shader triggered register spilling. "
4087 "Try reducing the number of live scalar values to "
4088 "improve performance.\n", stage_name);
4089 }
4090
4091 /* Since we're out of heuristics, just go spill registers until we
4092 * get an allocation.
4093 */
4094 while (!assign_regs(true)) {
4095 if (failed)
4096 break;
4097 }
4098 }
4099
4100 /* This must come after all optimization and register allocation, since
4101 * it inserts dead code that happens to have side effects, and it does
4102 * so based on the actual physical registers in use.
4103 */
4104 insert_gen4_send_dependency_workarounds();
4105
4106 if (failed)
4107 return;
4108
4109 if (!allocated_without_spills)
4110 schedule_instructions(SCHEDULE_POST);
4111
4112 if (last_scratch > 0)
4113 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4114 }
4115
4116 bool
4117 fs_visitor::run_vs()
4118 {
4119 assert(stage == MESA_SHADER_VERTEX);
4120
4121 assign_common_binding_table_offsets(0);
4122 setup_vs_payload();
4123
4124 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4125 emit_shader_time_begin();
4126
4127 emit_nir_code();
4128
4129 if (failed)
4130 return false;
4131
4132 emit_urb_writes();
4133
4134 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4135 emit_shader_time_end();
4136
4137 calculate_cfg();
4138
4139 optimize();
4140
4141 assign_curb_setup();
4142 assign_vs_urb_setup();
4143
4144 fixup_3src_null_dest();
4145 allocate_registers();
4146
4147 return !failed;
4148 }
4149
4150 bool
4151 fs_visitor::run_fs()
4152 {
4153 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4154 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4155
4156 assert(stage == MESA_SHADER_FRAGMENT);
4157
4158 sanity_param_count = prog->Parameters->NumParameters;
4159
4160 assign_binding_table_offsets();
4161
4162 if (devinfo->gen >= 6)
4163 setup_payload_gen6();
4164 else
4165 setup_payload_gen4();
4166
4167 if (0) {
4168 emit_dummy_fs();
4169 } else if (brw->use_rep_send && dispatch_width == 16) {
4170 emit_repclear_shader();
4171 } else {
4172 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4173 emit_shader_time_begin();
4174
4175 calculate_urb_setup();
4176 if (prog->InputsRead > 0) {
4177 if (devinfo->gen < 6)
4178 emit_interpolation_setup_gen4();
4179 else
4180 emit_interpolation_setup_gen6();
4181 }
4182
4183 /* We handle discards by keeping track of the still-live pixels in f0.1.
4184 * Initialize it with the dispatched pixels.
4185 */
4186 if (wm_prog_data->uses_kill) {
4187 fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4188 discard_init->flag_subreg = 1;
4189 }
4190
4191 /* Generate FS IR for main(). (the visitor only descends into
4192 * functions called "main").
4193 */
4194 emit_nir_code();
4195
4196 if (failed)
4197 return false;
4198
4199 if (wm_prog_data->uses_kill)
4200 bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
4201
4202 if (wm_key->alpha_test_func)
4203 emit_alpha_test();
4204
4205 emit_fb_writes();
4206
4207 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4208 emit_shader_time_end();
4209
4210 calculate_cfg();
4211
4212 optimize();
4213
4214 assign_curb_setup();
4215 assign_urb_setup();
4216
4217 fixup_3src_null_dest();
4218 allocate_registers();
4219
4220 if (failed)
4221 return false;
4222 }
4223
4224 if (dispatch_width == 8)
4225 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4226 else
4227 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4228
4229 /* If any state parameters were appended, then ParameterValues could have
4230 * been realloced, in which case the driver uniform storage set up by
4231 * _mesa_associate_uniform_storage() would point to freed memory. Make
4232 * sure that didn't happen.
4233 */
4234 assert(sanity_param_count == prog->Parameters->NumParameters);
4235
4236 return !failed;
4237 }
4238
4239 bool
4240 fs_visitor::run_cs()
4241 {
4242 assert(stage == MESA_SHADER_COMPUTE);
4243 assert(shader);
4244
4245 sanity_param_count = prog->Parameters->NumParameters;
4246
4247 assign_common_binding_table_offsets(0);
4248
4249 setup_cs_payload();
4250
4251 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4252 emit_shader_time_begin();
4253
4254 emit_nir_code();
4255
4256 if (failed)
4257 return false;
4258
4259 emit_cs_terminate();
4260
4261 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4262 emit_shader_time_end();
4263
4264 calculate_cfg();
4265
4266 optimize();
4267
4268 assign_curb_setup();
4269
4270 fixup_3src_null_dest();
4271 allocate_registers();
4272
4273 if (failed)
4274 return false;
4275
4276 /* If any state parameters were appended, then ParameterValues could have
4277 * been realloced, in which case the driver uniform storage set up by
4278 * _mesa_associate_uniform_storage() would point to freed memory. Make
4279 * sure that didn't happen.
4280 */
4281 assert(sanity_param_count == prog->Parameters->NumParameters);
4282
4283 return !failed;
4284 }
4285
4286 const unsigned *
4287 brw_wm_fs_emit(struct brw_context *brw,
4288 void *mem_ctx,
4289 const struct brw_wm_prog_key *key,
4290 struct brw_wm_prog_data *prog_data,
4291 struct gl_fragment_program *fp,
4292 struct gl_shader_program *prog,
4293 unsigned *final_assembly_size)
4294 {
4295 bool start_busy = false;
4296 double start_time = 0;
4297
4298 if (unlikely(brw->perf_debug)) {
4299 start_busy = (brw->batch.last_bo &&
4300 drm_intel_bo_busy(brw->batch.last_bo));
4301 start_time = get_time();
4302 }
4303
4304 struct brw_shader *shader = NULL;
4305 if (prog)
4306 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4307
4308 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4309 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4310
4311 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4312 */
4313 fs_visitor v(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4314 prog, &fp->Base, 8);
4315 if (!v.run_fs()) {
4316 if (prog) {
4317 prog->LinkStatus = false;
4318 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4319 }
4320
4321 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4322 v.fail_msg);
4323
4324 return NULL;
4325 }
4326
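   /* Also try a SIMD16 compile of the same shader, unless SIMD16 has been
    * disabled via debug flags or the SIMD8 visitor already flagged it as
    * unsupported; on failure we simply keep the SIMD8 program.
    */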
4327 cfg_t *simd16_cfg = NULL;
4328 fs_visitor v2(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4329 prog, &fp->Base, 16);
4330 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4331 if (!v.simd16_unsupported) {
4332 /* Try a SIMD16 compile */
4333 v2.import_uniforms(&v);
4334 if (!v2.run_fs()) {
4335 perf_debug("SIMD16 shader failed to compile, falling back to "
4336 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4337 } else {
4338 simd16_cfg = v2.cfg;
4339 }
4340 } else {
4341 perf_debug("SIMD16 shader unsupported, falling back to "
4342 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4343 }
4344 }
4345
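   /* Decide whether to ship the SIMD8 program at all: when the check below
    * says SIMD8 isn't wanted and a SIMD16 program is available, drop it and
    * record that in prog_data->no_8.
    */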
4346 cfg_t *simd8_cfg;
4347 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4348 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4349 simd8_cfg = NULL;
4350 prog_data->no_8 = true;
4351 } else {
4352 simd8_cfg = v.cfg;
4353 prog_data->no_8 = false;
4354 }
4355
4356 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4357 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4358
4359 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4360 char *name;
4361 if (prog)
4362 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4363 prog->Label ? prog->Label : "unnamed",
4364 prog->Name);
4365 else
4366 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4367
4368 g.enable_debug(name);
4369 }
4370
4371 if (simd8_cfg)
4372 g.generate_code(simd8_cfg, 8);
4373 if (simd16_cfg)
4374 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4375
4376 if (unlikely(brw->perf_debug) && shader) {
4377 if (shader->compiled_once)
4378 brw_wm_debug_recompile(brw, prog, key);
4379 shader->compiled_once = true;
4380
4381 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4382 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4383 (get_time() - start_time) * 1000);
4384 }
4385 }
4386
4387 return g.get_assembly(final_assembly_size);
4388 }
4389
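/**
 * Precompile the fragment program with a guessed, most-likely program key so
 * a compiled variant is ready before the first draw.  The previously bound
 * WM program state is restored afterwards.
 */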
4390 extern "C" bool
4391 brw_fs_precompile(struct gl_context *ctx,
4392 struct gl_shader_program *shader_prog,
4393 struct gl_program *prog)
4394 {
4395 struct brw_context *brw = brw_context(ctx);
4396 struct brw_wm_prog_key key;
4397
4398 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4399 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4400 bool program_uses_dfdy = fp->UsesDFdy;
4401
4402 memset(&key, 0, sizeof(key));
4403
4404 if (brw->gen < 6) {
4405 if (fp->UsesKill)
4406 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4407
4408 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4409 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4410
4411 /* Just assume depth testing. */
4412 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4413 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4414 }
4415
4416 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4417 BRW_FS_VARYING_INPUT_MASK) > 16)
4418 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4419
4420 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4421
4422 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4423 key.drawable_height = ctx->DrawBuffer->Height;
4424 }
4425
4426 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4427 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4428 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4429
4430 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4431 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4432 key.nr_color_regions > 1;
4433 }
4434
4435 key.program_string_id = bfp->id;
4436
4437 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4438 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4439
4440 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4441
4442 brw->wm.base.prog_offset = old_prog_offset;
4443 brw->wm.prog_data = old_prog_data;
4444
4445 return success;
4446 }
4447
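/**
 * Guess the texture-swizzle portion of a precompile key: shadow samplers get
 * an XXX1 swizzle on hardware without shader channel select (pre-Haswell),
 * everything else is assumed to be unswizzled RGBA.
 */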
4448 void
4449 brw_setup_tex_for_precompile(struct brw_context *brw,
4450 struct brw_sampler_prog_key_data *tex,
4451 struct gl_program *prog)
4452 {
4453 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4454 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4455 for (unsigned i = 0; i < sampler_count; i++) {
4456 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4457 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4458 tex->swizzles[i] =
4459 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4460 } else {
4461 /* Color sampler: assume no swizzling. */
4462 tex->swizzles[i] = SWIZZLE_XYZW;
4463 }
4464 }
4465 }