i965/fs: Migrate FS gl_SamplePosition/ID computation code to the IR builder.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 using namespace brw;
53
54 void
55 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
56 const fs_reg *src, unsigned sources)
57 {
58 memset(this, 0, sizeof(*this));
59
60 this->src = new fs_reg[MAX2(sources, 3)];
61 for (unsigned i = 0; i < sources; i++)
62 this->src[i] = src[i];
63
64 this->opcode = opcode;
65 this->dst = dst;
66 this->sources = sources;
67 this->exec_size = exec_size;
68
69 assert(dst.file != IMM && dst.file != UNIFORM);
70
71 /* If exec_size == 0, try to guess it from the registers. Since all
72 * manner of things may use hardware registers, we first try to guess
73 * based on GRF registers. If this fails, we will go ahead and take the
74 * width from the destination register.
75 */
76 if (this->exec_size == 0) {
77 if (dst.file == GRF) {
78 this->exec_size = dst.width;
79 } else {
80 for (unsigned i = 0; i < sources; ++i) {
81 if (src[i].file != GRF && src[i].file != ATTR)
82 continue;
83
84 if (this->exec_size <= 1)
85 this->exec_size = src[i].width;
86 assert(src[i].width == 1 || src[i].width == this->exec_size);
87 }
88 }
89
90 if (this->exec_size == 0 && dst.file != BAD_FILE)
91 this->exec_size = dst.width;
92 }
93 assert(this->exec_size != 0);
94
95 this->conditional_mod = BRW_CONDITIONAL_NONE;
96
97 /* This will be the case for almost all instructions. */
98 switch (dst.file) {
99 case GRF:
100 case HW_REG:
101 case MRF:
102 case ATTR:
103 this->regs_written =
104 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
105 break;
106 case BAD_FILE:
107 this->regs_written = 0;
108 break;
109 case IMM:
110 case UNIFORM:
111 unreachable("Invalid destination register file");
112 default:
113 unreachable("Invalid register file");
114 }
115
116 this->writes_accumulator = false;
117 }
118
119 fs_inst::fs_inst()
120 {
121 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
122 }
123
124 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
125 {
126 init(opcode, exec_size, reg_undef, NULL, 0);
127 }
128
129 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
130 {
131 init(opcode, 0, dst, NULL, 0);
132 }
133
134 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
135 const fs_reg &src0)
136 {
137 const fs_reg src[1] = { src0 };
138 init(opcode, exec_size, dst, src, 1);
139 }
140
141 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
142 {
143 const fs_reg src[1] = { src0 };
144 init(opcode, 0, dst, src, 1);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
148 const fs_reg &src0, const fs_reg &src1)
149 {
150 const fs_reg src[2] = { src0, src1 };
151 init(opcode, exec_size, dst, src, 2);
152 }
153
154 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
155 const fs_reg &src1)
156 {
157 const fs_reg src[2] = { src0, src1 };
158 init(opcode, 0, dst, src, 2);
159 }
160
161 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
162 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
163 {
164 const fs_reg src[3] = { src0, src1, src2 };
165 init(opcode, exec_size, dst, src, 3);
166 }
167
168 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
169 const fs_reg &src1, const fs_reg &src2)
170 {
171 const fs_reg src[3] = { src0, src1, src2 };
172 init(opcode, 0, dst, src, 3);
173 }
174
175 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
176 const fs_reg src[], unsigned sources)
177 {
178 init(opcode, 0, dst, src, sources);
179 }
180
181 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
182 const fs_reg src[], unsigned sources)
183 {
184 init(opcode, exec_width, dst, src, sources);
185 }
186
187 fs_inst::fs_inst(const fs_inst &that)
188 {
189 memcpy(this, &that, sizeof(that));
190
191 this->src = new fs_reg[MAX2(that.sources, 3)];
192
193 for (unsigned i = 0; i < that.sources; i++)
194 this->src[i] = that.src[i];
195 }
196
197 fs_inst::~fs_inst()
198 {
199 delete[] this->src;
200 }
201
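/**
 * Resize the source array of this instruction to \p num_sources, preserving
 * as many of the existing sources as fit in the new count.
 */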
202 void
203 fs_inst::resize_sources(uint8_t num_sources)
204 {
205 if (this->sources != num_sources) {
206 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
207
208 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
209 src[i] = this->src[i];
210
211 delete[] this->src;
212 this->src = src;
213 this->sources = num_sources;
214 }
215 }
216
217 #define ALU1(op) \
218 fs_inst * \
219 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
220 { \
221 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
222 }
223
224 #define ALU2(op) \
225 fs_inst * \
226 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
227 const fs_reg &src1) \
228 { \
229 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
230 }
231
232 #define ALU2_ACC(op) \
233 fs_inst * \
234 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
235 const fs_reg &src1) \
236 { \
237 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
238 inst->writes_accumulator = true; \
239 return inst; \
240 }
241
242 #define ALU3(op) \
243 fs_inst * \
244 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
245 const fs_reg &src1, const fs_reg &src2) \
246 { \
247 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
248 }
249
250 ALU1(NOT)
251 ALU1(MOV)
252 ALU1(FRC)
253 ALU1(RNDD)
254 ALU1(RNDE)
255 ALU1(RNDZ)
256 ALU2(ADD)
257 ALU2(MUL)
258 ALU2_ACC(MACH)
259 ALU2(AND)
260 ALU2(OR)
261 ALU2(XOR)
262 ALU2(SHL)
263 ALU2(SHR)
264 ALU2(ASR)
265 ALU3(LRP)
266 ALU1(BFREV)
267 ALU3(BFE)
268 ALU2(BFI1)
269 ALU3(BFI2)
270 ALU1(FBH)
271 ALU1(FBL)
272 ALU1(CBIT)
273 ALU3(MAD)
274 ALU2_ACC(ADDC)
275 ALU2_ACC(SUBB)
276 ALU2(SEL)
277 ALU2(MAC)
278
279 /** Gen4 predicated IF. */
280 fs_inst *
281 fs_visitor::IF(enum brw_predicate predicate)
282 {
283 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
284 inst->predicate = predicate;
285 return inst;
286 }
287
288 /** Gen6 IF with embedded comparison. */
289 fs_inst *
290 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
291 enum brw_conditional_mod condition)
292 {
293 assert(devinfo->gen == 6);
294 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
295 reg_null_d, src0, src1);
296 inst->conditional_mod = condition;
297 return inst;
298 }
299
300 /**
301 * CMP: Sets the low bit of the destination channels with the result
302 * of the comparison, while the upper bits are undefined, and updates
303 * the flag register with the packed 16 bits of the result.
304 */
305 fs_inst *
306 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
307 enum brw_conditional_mod condition)
308 {
309 fs_inst *inst;
310
311 /* Take the instruction:
312 *
313 * CMP null<d> src0<f> src1<f>
314 *
315 * Original gen4 does type conversion to the destination type before
316 * comparison, producing garbage results for floating point comparisons.
317 *
318 * The destination type doesn't matter on newer generations, so we set the
319 * type to match src0 so we can compact the instruction.
320 */
321 dst.type = src0.type;
322 if (dst.file == HW_REG)
323 dst.fixed_hw_reg.type = dst.type;
324
325 resolve_ud_negate(&src0);
326 resolve_ud_negate(&src1);
327
328 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
329 inst->conditional_mod = condition;
330
331 return inst;
332 }
333
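/**
 * Collect a set of registers into a single message payload.
 *
 * The first \p header_size sources are header registers and each take one
 * full register regardless of the destination width; every remaining source
 * occupies dst.width / 8 registers of the payload.
 */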
334 fs_inst *
335 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
336 int header_size)
337 {
338 assert(dst.width % 8 == 0);
339 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
340 dst, src, sources);
341 inst->header_size = header_size;
342
343 for (int i = 0; i < header_size; i++)
344 assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
345 inst->regs_written = header_size;
346
347 for (int i = header_size; i < sources; ++i)
348 assert(src[i].file != GRF || src[i].width == dst.width);
349 inst->regs_written += (sources - header_size) * (dst.width / 8);
350
351 return inst;
352 }
353
354 void
355 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
356 const fs_reg &dst,
357 const fs_reg &surf_index,
358 const fs_reg &varying_offset,
359 uint32_t const_offset)
360 {
361 /* We have our constant surface use a pitch of 4 bytes, so our index can
362 * be any component of a vector, and then we load 4 contiguous
363 * components starting from that.
364 *
365 * We break down the const_offset to a portion added to the variable
366 * offset and a portion done using reg_offset, which means that if you
367 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
368 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
369 * CSE can later notice that those loads are all the same and eliminate
370 * the redundant ones.
371 */
372 fs_reg vec4_offset = vgrf(glsl_type::int_type);
373 bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
374
375 int scale = 1;
376 if (devinfo->gen == 4 && dst.width == 8) {
377 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
378 * u, v, r) as parameters, or we can just use the SIMD16 message
379 * consisting of (header, u). We choose the second, at the cost of a
380 * longer return length.
381 */
382 scale = 2;
383 }
384
385 enum opcode op;
386 if (devinfo->gen >= 7)
387 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
388 else
389 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
390
391 assert(dst.width % 8 == 0);
392 int regs_written = 4 * (dst.width / 8) * scale;
393 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
394 dst.type, dst.width);
395 fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
396 inst->regs_written = regs_written;
397
398 if (devinfo->gen < 7) {
399 inst->base_mrf = 13;
400 inst->header_size = 1;
401 if (devinfo->gen == 4)
402 inst->mlen = 3;
403 else
404 inst->mlen = 1 + dispatch_width / 8;
405 }
406
407 bld.MOV(dst, offset(vec4_result, (const_offset & 3) * scale));
408 }
409
410 /**
411 * A helper for MOV generation for fixing up broken hardware SEND dependency
412 * handling.
413 */
414 void
415 fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
416 {
417 /* The caller always wants uncompressed to emit the minimal extra
418 * dependencies, and to avoid having to deal with aligning its regs to 2.
419 */
420 const fs_builder ubld = bld.annotate("send dependency resolve")
421 .half(0);
422
423 ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
424 }
425
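/**
 * Field-by-field comparison of two instructions.  Note that only the first
 * three sources are compared, so the result is only meaningful for
 * instructions with at most three sources.
 */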
426 bool
427 fs_inst::equals(fs_inst *inst) const
428 {
429 return (opcode == inst->opcode &&
430 dst.equals(inst->dst) &&
431 src[0].equals(inst->src[0]) &&
432 src[1].equals(inst->src[1]) &&
433 src[2].equals(inst->src[2]) &&
434 saturate == inst->saturate &&
435 predicate == inst->predicate &&
436 conditional_mod == inst->conditional_mod &&
437 mlen == inst->mlen &&
438 base_mrf == inst->base_mrf &&
439 target == inst->target &&
440 eot == inst->eot &&
441 header_size == inst->header_size &&
442 shadow_compare == inst->shadow_compare &&
443 exec_size == inst->exec_size &&
444 offset == inst->offset);
445 }
446
447 bool
448 fs_inst::overwrites_reg(const fs_reg &reg) const
449 {
450 return reg.in_range(dst, regs_written);
451 }
452
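/**
 * Returns true if this instruction is a SEND-like message whose payload is
 * sourced from the GRF rather than from MRFs.
 */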
453 bool
454 fs_inst::is_send_from_grf() const
455 {
456 switch (opcode) {
457 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
458 case SHADER_OPCODE_SHADER_TIME_ADD:
459 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
460 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
461 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
462 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
463 case SHADER_OPCODE_UNTYPED_ATOMIC:
464 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
465 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
466 case SHADER_OPCODE_TYPED_ATOMIC:
467 case SHADER_OPCODE_TYPED_SURFACE_READ:
468 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
469 case SHADER_OPCODE_URB_WRITE_SIMD8:
470 return true;
471 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
472 return src[1].file == GRF;
473 case FS_OPCODE_FB_WRITE:
474 return src[0].file == GRF;
475 default:
476 if (is_tex())
477 return src[0].file == GRF;
478
479 return false;
480 }
481 }
482
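/**
 * Returns true if this LOAD_PAYLOAD is a plain copy of a single contiguous
 * virtual GRF, i.e. its sources are consecutive offsets of one register and
 * together cover the whole allocation.
 */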
483 bool
484 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
485 {
486 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
487 return false;
488
489 fs_reg reg = this->src[0];
490 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
491 return false;
492
493 if (grf_alloc.sizes[reg.reg] != this->regs_written)
494 return false;
495
496 for (int i = 0; i < this->sources; i++) {
497 reg.type = this->src[i].type;
498 reg.width = this->src[i].width;
499 if (!this->src[i].equals(reg))
500 return false;
501 reg = ::offset(reg, 1);
502 }
503
504 return true;
505 }
506
507 bool
508 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
509 {
510 if (devinfo->gen == 6 && is_math())
511 return false;
512
513 if (is_send_from_grf())
514 return false;
515
516 if (!backend_instruction::can_do_source_mods())
517 return false;
518
519 return true;
520 }
521
522 bool
523 fs_inst::has_side_effects() const
524 {
525 return this->eot || backend_instruction::has_side_effects();
526 }
527
528 void
529 fs_reg::init()
530 {
531 memset(this, 0, sizeof(*this));
532 stride = 1;
533 }
534
535 /** Generic unset register constructor. */
536 fs_reg::fs_reg()
537 {
538 init();
539 this->file = BAD_FILE;
540 }
541
542 /** Immediate value constructor. */
543 fs_reg::fs_reg(float f)
544 {
545 init();
546 this->file = IMM;
547 this->type = BRW_REGISTER_TYPE_F;
548 this->fixed_hw_reg.dw1.f = f;
549 this->width = 1;
550 }
551
552 /** Immediate value constructor. */
553 fs_reg::fs_reg(int32_t i)
554 {
555 init();
556 this->file = IMM;
557 this->type = BRW_REGISTER_TYPE_D;
558 this->fixed_hw_reg.dw1.d = i;
559 this->width = 1;
560 }
561
562 /** Immediate value constructor. */
563 fs_reg::fs_reg(uint32_t u)
564 {
565 init();
566 this->file = IMM;
567 this->type = BRW_REGISTER_TYPE_UD;
568 this->fixed_hw_reg.dw1.ud = u;
569 this->width = 1;
570 }
571
572 /** Vector float immediate value constructor. */
573 fs_reg::fs_reg(uint8_t vf[4])
574 {
575 init();
576 this->file = IMM;
577 this->type = BRW_REGISTER_TYPE_VF;
578 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
588 (vf1 << 8) |
589 (vf2 << 16) |
590 (vf3 << 24);
591 }
592
593 /** Fixed brw_reg. */
594 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
595 {
596 init();
597 this->file = HW_REG;
598 this->fixed_hw_reg = fixed_hw_reg;
599 this->type = fixed_hw_reg.type;
600 this->width = 1 << fixed_hw_reg.width;
601 }
602
603 bool
604 fs_reg::equals(const fs_reg &r) const
605 {
606 return (file == r.file &&
607 reg == r.reg &&
608 reg_offset == r.reg_offset &&
609 subreg_offset == r.subreg_offset &&
610 type == r.type &&
611 negate == r.negate &&
612 abs == r.abs &&
613 !reladdr && !r.reladdr &&
614 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
615 width == r.width &&
616 stride == r.stride);
617 }
618
619 fs_reg &
620 fs_reg::set_smear(unsigned subreg)
621 {
622 assert(file != HW_REG && file != IMM);
623 subreg_offset = subreg * type_sz(type);
624 stride = 0;
625 return *this;
626 }
627
628 bool
629 fs_reg::is_contiguous() const
630 {
631 return stride == 1;
632 }
633
634 int
635 fs_visitor::type_size(const struct glsl_type *type)
636 {
637 unsigned int size, i;
638
639 switch (type->base_type) {
640 case GLSL_TYPE_UINT:
641 case GLSL_TYPE_INT:
642 case GLSL_TYPE_FLOAT:
643 case GLSL_TYPE_BOOL:
644 return type->components();
645 case GLSL_TYPE_ARRAY:
646 return type_size(type->fields.array) * type->length;
647 case GLSL_TYPE_STRUCT:
648 size = 0;
649 for (i = 0; i < type->length; i++) {
650 size += type_size(type->fields.structure[i].type);
651 }
652 return size;
653 case GLSL_TYPE_SAMPLER:
654 /* Samplers take up no register space, since they're baked in at
655 * link time.
656 */
657 return 0;
658 case GLSL_TYPE_ATOMIC_UINT:
659 return 0;
660 case GLSL_TYPE_IMAGE:
661 case GLSL_TYPE_VOID:
662 case GLSL_TYPE_ERROR:
663 case GLSL_TYPE_INTERFACE:
664 case GLSL_TYPE_DOUBLE:
665 unreachable("not reached");
666 }
667
668 return 0;
669 }
670
671 /**
672 * Create a MOV to read the timestamp register.
673 *
674 * The caller is responsible for emitting the MOV. The return value is
675 * the destination of the MOV, with extra parameters set.
676 */
677 fs_reg
678 fs_visitor::get_timestamp(const fs_builder &bld)
679 {
680 assert(devinfo->gen >= 7);
681
682 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
683 BRW_ARF_TIMESTAMP,
684 0),
685 BRW_REGISTER_TYPE_UD));
686
687 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
688
689    /* We want to read the 3 fields we care about even if they're not enabled
690     * in the dispatch.
691 */
692 bld.exec_all().MOV(dst, ts);
693
694 /* The caller wants the low 32 bits of the timestamp. Since it's running
695     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
696 * which is plenty of time for our purposes. It is identical across the
697 * EUs, but since it's tracking GPU core speed it will increment at a
698 * varying rate as render P-states change.
699 *
700 * The caller could also check if render P-states have changed (or anything
701 * else that might disrupt timing) by setting smear to 2 and checking if
702 * that field is != 0.
703 */
704 dst.set_smear(0);
705
706 return dst;
707 }
708
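/**
 * Read the timestamp register at the start of the shader so the total
 * execution time can be computed in emit_shader_time_end().
 */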
709 void
710 fs_visitor::emit_shader_time_begin()
711 {
712 shader_start_time = get_timestamp(bld.annotate("shader time start"));
713 }
714
715 void
716 fs_visitor::emit_shader_time_end()
717 {
718 enum shader_time_shader_type type, written_type, reset_type;
719 switch (stage) {
720 case MESA_SHADER_VERTEX:
721 type = ST_VS;
722 written_type = ST_VS_WRITTEN;
723 reset_type = ST_VS_RESET;
724 break;
725 case MESA_SHADER_GEOMETRY:
726 type = ST_GS;
727 written_type = ST_GS_WRITTEN;
728 reset_type = ST_GS_RESET;
729 break;
730 case MESA_SHADER_FRAGMENT:
731 if (dispatch_width == 8) {
732 type = ST_FS8;
733 written_type = ST_FS8_WRITTEN;
734 reset_type = ST_FS8_RESET;
735 } else {
736 assert(dispatch_width == 16);
737 type = ST_FS16;
738 written_type = ST_FS16_WRITTEN;
739 reset_type = ST_FS16_RESET;
740 }
741 break;
742 case MESA_SHADER_COMPUTE:
743 type = ST_CS;
744 written_type = ST_CS_WRITTEN;
745 reset_type = ST_CS_RESET;
746 break;
747 default:
748 unreachable("fs_visitor::emit_shader_time_end missing code");
749 }
750
751 /* Insert our code just before the final SEND with EOT. */
752 exec_node *end = this->instructions.get_tail();
753 assert(end && ((fs_inst *) end)->eot);
754 const fs_builder ibld = bld.annotate("shader time end")
755 .exec_all().at(NULL, end);
756
757 fs_reg shader_end_time = get_timestamp(ibld);
758
759 /* Check that there weren't any timestamp reset events (assuming these
760 * were the only two timestamp reads that happened).
761 */
762 fs_reg reset = shader_end_time;
763 reset.set_smear(2);
764 set_condmod(BRW_CONDITIONAL_Z,
765 ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
766 ibld.IF(BRW_PREDICATE_NORMAL);
767
768 fs_reg start = shader_start_time;
769 start.negate = true;
770 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
771 diff.set_smear(0);
772 ibld.ADD(diff, start, shader_end_time);
773
774 /* If there were no instructions between the two timestamp gets, the diff
775     * is 2 cycles.  Remove that overhead so it doesn't get counted when
776     * measuring the time taken by individual instructions.
777 */
778 ibld.ADD(diff, diff, fs_reg(-2u));
779 SHADER_TIME_ADD(ibld, type, diff);
780 SHADER_TIME_ADD(ibld, written_type, fs_reg(1u));
781 ibld.emit(BRW_OPCODE_ELSE);
782 SHADER_TIME_ADD(ibld, reset_type, fs_reg(1u));
783 ibld.emit(BRW_OPCODE_ENDIF);
784 }
785
786 void
787 fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
788 enum shader_time_shader_type type, fs_reg value)
789 {
790 int shader_time_index =
791 brw_get_shader_time_index(brw, shader_prog, prog, type);
792 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
793
794 fs_reg payload;
795 if (dispatch_width == 8)
796 payload = vgrf(glsl_type::uvec2_type);
797 else
798 payload = vgrf(glsl_type::uint_type);
799
800 bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
801 }
802
803 void
804 fs_visitor::vfail(const char *format, va_list va)
805 {
806 char *msg;
807
808 if (failed)
809 return;
810
811 failed = true;
812
813 msg = ralloc_vasprintf(mem_ctx, format, va);
814 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
815
816 this->fail_msg = msg;
817
818 if (debug_enabled) {
819 fprintf(stderr, "%s", msg);
820 }
821 }
822
823 void
824 fs_visitor::fail(const char *format, ...)
825 {
826 va_list va;
827
828 va_start(va, format);
829 vfail(format, va);
830 va_end(va);
831 }
832
833 /**
834 * Mark this program as impossible to compile in SIMD16 mode.
835 *
836 * During the SIMD8 compile (which happens first), we can detect and flag
837 * things that are unsupported in SIMD16 mode, so the compiler can skip
838 * the SIMD16 compile altogether.
839 *
840 * During a SIMD16 compile (if one happens anyway), this just calls fail().
841 */
842 void
843 fs_visitor::no16(const char *format, ...)
844 {
845 va_list va;
846
847 va_start(va, format);
848
849 if (dispatch_width == 16) {
850 vfail(format, va);
851 } else {
852 simd16_unsupported = true;
853
854 if (brw->perf_debug) {
855 if (no16_msg)
856 ralloc_vasprintf_append(&no16_msg, format, va);
857 else
858 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
859 }
860 }
861
862 va_end(va);
863 }
864
865 fs_inst *
866 fs_visitor::emit(enum opcode opcode)
867 {
868 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
869 }
870
871 fs_inst *
872 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
873 {
874 return emit(new(mem_ctx) fs_inst(opcode, dst));
875 }
876
877 fs_inst *
878 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
879 {
880 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
881 }
882
883 fs_inst *
884 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
885 const fs_reg &src1)
886 {
887 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
888 }
889
890 fs_inst *
891 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
892 const fs_reg &src1, const fs_reg &src2)
893 {
894 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
895 }
896
897 fs_inst *
898 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
899 fs_reg src[], int sources)
900 {
901 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
902 }
903
904 /**
905 * Returns true if the instruction has a flag that means it won't
906 * update an entire destination register.
907 *
908 * For example, dead code elimination and live variable analysis want to know
909 * when a write to a variable screens off any preceding values that were in
910 * it.
911 */
912 bool
913 fs_inst::is_partial_write() const
914 {
915 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
916 (this->dst.width * type_sz(this->dst.type)) < 32 ||
917 !this->dst.is_contiguous());
918 }
919
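/**
 * Returns the number of registers read by source \p arg.
 *
 * Message-style opcodes that take their whole payload from the first source
 * report the message length; other sources are sized from their width,
 * stride and type.
 */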
920 int
921 fs_inst::regs_read(int arg) const
922 {
923 if (is_tex() && arg == 0 && src[0].file == GRF) {
924 return mlen;
925 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
926 return mlen;
927 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
928 return mlen;
929 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
930 return mlen;
931 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
932 return mlen;
933 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
934 return mlen;
935 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
936 return mlen;
937 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
938 return mlen;
939 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
940 return mlen;
941 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
942 return mlen;
943 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
944 return exec_size / 4;
945 }
946
947 switch (src[arg].file) {
948 case BAD_FILE:
949 case UNIFORM:
950 case IMM:
951 return 1;
952 case GRF:
953 case HW_REG:
954 if (src[arg].stride == 0) {
955 return 1;
956 } else {
957 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
958 return (size + 31) / 32;
959 }
960 case MRF:
961 unreachable("MRF registers are not allowed as sources");
962 default:
963 unreachable("Invalid register file");
964 }
965 }
966
967 bool
968 fs_inst::reads_flag() const
969 {
970 return predicate;
971 }
972
973 bool
974 fs_inst::writes_flag() const
975 {
976 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
977 opcode != BRW_OPCODE_IF &&
978 opcode != BRW_OPCODE_WHILE)) ||
979 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
980 }
981
982 /**
983 * Returns how many MRFs an FS opcode will write over.
984 *
985 * Note that this is not the 0 or 1 implied writes in an actual gen
986 * instruction -- the FS opcodes often generate MOVs in addition.
987 */
988 int
989 fs_visitor::implied_mrf_writes(fs_inst *inst)
990 {
991 if (inst->mlen == 0)
992 return 0;
993
994 if (inst->base_mrf == -1)
995 return 0;
996
997 switch (inst->opcode) {
998 case SHADER_OPCODE_RCP:
999 case SHADER_OPCODE_RSQ:
1000 case SHADER_OPCODE_SQRT:
1001 case SHADER_OPCODE_EXP2:
1002 case SHADER_OPCODE_LOG2:
1003 case SHADER_OPCODE_SIN:
1004 case SHADER_OPCODE_COS:
1005 return 1 * dispatch_width / 8;
1006 case SHADER_OPCODE_POW:
1007 case SHADER_OPCODE_INT_QUOTIENT:
1008 case SHADER_OPCODE_INT_REMAINDER:
1009 return 2 * dispatch_width / 8;
1010 case SHADER_OPCODE_TEX:
1011 case FS_OPCODE_TXB:
1012 case SHADER_OPCODE_TXD:
1013 case SHADER_OPCODE_TXF:
1014 case SHADER_OPCODE_TXF_CMS:
1015 case SHADER_OPCODE_TXF_MCS:
1016 case SHADER_OPCODE_TG4:
1017 case SHADER_OPCODE_TG4_OFFSET:
1018 case SHADER_OPCODE_TXL:
1019 case SHADER_OPCODE_TXS:
1020 case SHADER_OPCODE_LOD:
1021 return 1;
1022 case FS_OPCODE_FB_WRITE:
1023 return 2;
1024 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1025 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1026 return 1;
1027 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1028 return inst->mlen;
1029 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1030 return inst->mlen;
1031 case SHADER_OPCODE_UNTYPED_ATOMIC:
1032 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1033 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1034 case SHADER_OPCODE_TYPED_ATOMIC:
1035 case SHADER_OPCODE_TYPED_SURFACE_READ:
1036 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1037 case SHADER_OPCODE_URB_WRITE_SIMD8:
1038 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1039 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1040 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1041 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1042 return 0;
1043 default:
1044 unreachable("not reached");
1045 }
1046 }
1047
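/**
 * Allocate a virtual GRF large enough to hold one value of the given GLSL
 * type (or the given number of float components) per channel at the current
 * dispatch width.
 */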
1048 fs_reg
1049 fs_visitor::vgrf(const glsl_type *const type)
1050 {
1051 int reg_width = dispatch_width / 8;
1052 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1053 brw_type_for_base_type(type), dispatch_width);
1054 }
1055
1056 fs_reg
1057 fs_visitor::vgrf(int num_components)
1058 {
1059 int reg_width = dispatch_width / 8;
1060 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1061 BRW_REGISTER_TYPE_F, dispatch_width);
1062 }
1063
1064 /** Fixed HW reg constructor. */
1065 fs_reg::fs_reg(enum register_file file, int reg)
1066 {
1067 init();
1068 this->file = file;
1069 this->reg = reg;
1070 this->type = BRW_REGISTER_TYPE_F;
1071
1072 switch (file) {
1073 case UNIFORM:
1074 this->width = 1;
1075 break;
1076 default:
1077 this->width = 8;
1078 }
1079 }
1080
1081 /** Fixed HW reg constructor. */
1082 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1083 {
1084 init();
1085 this->file = file;
1086 this->reg = reg;
1087 this->type = type;
1088
1089 switch (file) {
1090 case UNIFORM:
1091 this->width = 1;
1092 break;
1093 default:
1094 this->width = 8;
1095 }
1096 }
1097
1098 /** Fixed HW reg constructor. */
1099 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1100 uint8_t width)
1101 {
1102 init();
1103 this->file = file;
1104 this->reg = reg;
1105 this->type = type;
1106 this->width = width;
1107 }
1108
1109 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1110  * This brings in those uniform definitions.
1111 */
1112 void
1113 fs_visitor::import_uniforms(fs_visitor *v)
1114 {
1115 this->push_constant_loc = v->push_constant_loc;
1116 this->pull_constant_loc = v->pull_constant_loc;
1117 this->uniforms = v->uniforms;
1118 this->param_size = v->param_size;
1119 }
1120
1121 fs_reg *
1122 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1123 bool origin_upper_left)
1124 {
1125 assert(stage == MESA_SHADER_FRAGMENT);
1126 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1127 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1128 fs_reg wpos = *reg;
1129 bool flip = !origin_upper_left ^ key->render_to_fbo;
1130
1131 /* gl_FragCoord.x */
1132 if (pixel_center_integer) {
1133 bld.MOV(wpos, this->pixel_x);
1134 } else {
1135 bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
1136 }
1137 wpos = offset(wpos, 1);
1138
1139 /* gl_FragCoord.y */
1140 if (!flip && pixel_center_integer) {
1141 bld.MOV(wpos, this->pixel_y);
1142 } else {
1143 fs_reg pixel_y = this->pixel_y;
1144 float offset = (pixel_center_integer ? 0.0 : 0.5);
1145
1146 if (flip) {
1147 pixel_y.negate = true;
1148 offset += key->drawable_height - 1.0;
1149 }
1150
1151 bld.ADD(wpos, pixel_y, fs_reg(offset));
1152 }
1153 wpos = offset(wpos, 1);
1154
1155 /* gl_FragCoord.z */
1156 if (devinfo->gen >= 6) {
1157 bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
1158 } else {
1159 bld.emit(FS_OPCODE_LINTERP, wpos,
1160 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1161 interp_reg(VARYING_SLOT_POS, 2));
1162 }
1163 wpos = offset(wpos, 1);
1164
1165 /* gl_FragCoord.w: Already set up in emit_interpolation */
1166 bld.MOV(wpos, this->wpos_w);
1167
1168 return reg;
1169 }
1170
1171 fs_inst *
1172 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1173 glsl_interp_qualifier interpolation_mode,
1174 bool is_centroid, bool is_sample)
1175 {
1176 brw_wm_barycentric_interp_mode barycoord_mode;
1177 if (devinfo->gen >= 6) {
1178 if (is_centroid) {
1179 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1180 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1181 else
1182 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1183 } else if (is_sample) {
1184 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1185 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1186 else
1187 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1188 } else {
1189 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1190 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1191 else
1192 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1193 }
1194 } else {
1195 /* On Ironlake and below, there is only one interpolation mode.
1196 * Centroid interpolation doesn't mean anything on this hardware --
1197 * there is no multisampling.
1198 */
1199 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1200 }
1201 return bld.emit(FS_OPCODE_LINTERP, attr,
1202 this->delta_xy[barycoord_mode], interp);
1203 }
1204
1205 void
1206 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1207 const glsl_type *type,
1208 glsl_interp_qualifier interpolation_mode,
1209 int location, bool mod_centroid,
1210 bool mod_sample)
1211 {
1212 attr.type = brw_type_for_base_type(type->get_scalar_type());
1213
1214 assert(stage == MESA_SHADER_FRAGMENT);
1215 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1216 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1217
1218 unsigned int array_elements;
1219
1220 if (type->is_array()) {
1221 array_elements = type->length;
1222 if (array_elements == 0) {
1223 fail("dereferenced array '%s' has length 0\n", name);
1224 }
1225 type = type->fields.array;
1226 } else {
1227 array_elements = 1;
1228 }
1229
1230 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1231 bool is_gl_Color =
1232 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1233 if (key->flat_shade && is_gl_Color) {
1234 interpolation_mode = INTERP_QUALIFIER_FLAT;
1235 } else {
1236 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1237 }
1238 }
1239
1240 for (unsigned int i = 0; i < array_elements; i++) {
1241 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1242 if (prog_data->urb_setup[location] == -1) {
1243 /* If there's no incoming setup data for this slot, don't
1244 * emit interpolation for it.
1245 */
1246 attr = offset(attr, type->vector_elements);
1247 location++;
1248 continue;
1249 }
1250
1251 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1252 /* Constant interpolation (flat shading) case. The SF has
1253 * handed us defined values in only the constant offset
1254 * field of the setup reg.
1255 */
1256 for (unsigned int k = 0; k < type->vector_elements; k++) {
1257 struct brw_reg interp = interp_reg(location, k);
1258 interp = suboffset(interp, 3);
1259 interp.type = attr.type;
1260 bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1261 attr = offset(attr, 1);
1262 }
1263 } else {
1264 /* Smooth/noperspective interpolation case. */
1265 for (unsigned int k = 0; k < type->vector_elements; k++) {
1266 struct brw_reg interp = interp_reg(location, k);
1267 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1268 /* Get the pixel/sample mask into f0 so that we know
1269 * which pixels are lit. Then, for each channel that is
1270 * unlit, replace the centroid data with non-centroid
1271 * data.
1272 */
1273 bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1274
1275 fs_inst *inst;
1276 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1277 false, false);
1278 inst->predicate = BRW_PREDICATE_NORMAL;
1279 inst->predicate_inverse = true;
1280 if (devinfo->has_pln)
1281 inst->no_dd_clear = true;
1282
1283 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1284 mod_centroid && !key->persample_shading,
1285 mod_sample || key->persample_shading);
1286 inst->predicate = BRW_PREDICATE_NORMAL;
1287 inst->predicate_inverse = false;
1288 if (devinfo->has_pln)
1289 inst->no_dd_check = true;
1290
1291 } else {
1292 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1293 mod_centroid && !key->persample_shading,
1294 mod_sample || key->persample_shading);
1295 }
1296 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1297 bld.MUL(attr, attr, this->pixel_w);
1298 }
1299 attr = offset(attr, 1);
1300 }
1301
1302 }
1303 location++;
1304 }
1305 }
1306 }
1307
1308 fs_reg *
1309 fs_visitor::emit_frontfacing_interpolation()
1310 {
1311 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1312
1313 if (devinfo->gen >= 6) {
1314 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1315 * a boolean result from this (~0/true or 0/false).
1316 *
1317 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1318 * this task in only one instruction:
1319 * - a negation source modifier will flip the bit; and
1320 * - a W -> D type conversion will sign extend the bit into the high
1321 * word of the destination.
1322 *
1323 * An ASR 15 fills the low word of the destination.
1324 */
1325 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1326 g0.negate = true;
1327
1328 bld.ASR(*reg, g0, fs_reg(15));
1329 } else {
1330 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1331 * a boolean result from this (1/true or 0/false).
1332 *
1333 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1334 * the negation source modifier to flip it. Unfortunately the SHR
1335 * instruction only operates on UD (or D with an abs source modifier)
1336 * sources without negation.
1337 *
1338 * Instead, use ASR (which will give ~0/true or 0/false).
1339 */
1340 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1341 g1_6.negate = true;
1342
1343 bld.ASR(*reg, g1_6, fs_reg(31));
1344 }
1345
1346 return reg;
1347 }
1348
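/**
 * Convert the integer sample position payload into the floating-point
 * gl_SamplePosition value in the [0, 1] range, or 0.5 when per-sample
 * position offsets aren't being computed.
 */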
1349 void
1350 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1351 {
1352 assert(stage == MESA_SHADER_FRAGMENT);
1353 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1354 assert(dst.type == BRW_REGISTER_TYPE_F);
1355
1356 if (key->compute_pos_offset) {
1357 /* Convert int_sample_pos to floating point */
1358 bld.MOV(dst, int_sample_pos);
1359 /* Scale to the range [0, 1] */
1360 bld.MUL(dst, dst, fs_reg(1 / 16.0f));
1361 }
1362 else {
1363       /* From the ARB_sample_shading specification:
1364 * "When rendering to a non-multisample buffer, or if multisample
1365 * rasterization is disabled, gl_SamplePosition will always be
1366        *  (0.5, 0.5)."
1367 */
1368 bld.MOV(dst, fs_reg(0.5f));
1369 }
1370 }
1371
1372 fs_reg *
1373 fs_visitor::emit_samplepos_setup()
1374 {
1375 assert(devinfo->gen >= 6);
1376
1377 const fs_builder abld = bld.annotate("compute sample position");
1378 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1379 fs_reg pos = *reg;
1380 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1381 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1382
1383 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1384 * mode will be enabled.
1385 *
1386 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1387 * R31.1:0 Position Offset X/Y for Slot[3:0]
1388 * R31.3:2 Position Offset X/Y for Slot[7:4]
1389 * .....
1390 *
1391 * The X, Y sample positions come in as bytes in thread payload. So, read
1392 * the positions using vstride=16, width=8, hstride=2.
1393 */
1394 struct brw_reg sample_pos_reg =
1395 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1396 BRW_REGISTER_TYPE_B), 16, 8, 2);
1397
1398 if (dispatch_width == 8) {
1399 abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
1400 } else {
1401 abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
1402 abld.half(1).MOV(half(int_sample_x, 1),
1403 fs_reg(suboffset(sample_pos_reg, 16)));
1404 }
1405 /* Compute gl_SamplePosition.x */
1406 compute_sample_position(pos, int_sample_x);
1407 pos = offset(pos, 1);
1408 if (dispatch_width == 8) {
1409 abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
1410 } else {
1411 abld.half(0).MOV(half(int_sample_y, 0),
1412 fs_reg(suboffset(sample_pos_reg, 1)));
1413 abld.half(1).MOV(half(int_sample_y, 1),
1414 fs_reg(suboffset(sample_pos_reg, 17)));
1415 }
1416 /* Compute gl_SamplePosition.y */
1417 compute_sample_position(pos, int_sample_y);
1418 return reg;
1419 }
1420
1421 fs_reg *
1422 fs_visitor::emit_sampleid_setup()
1423 {
1424 assert(stage == MESA_SHADER_FRAGMENT);
1425 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1426 assert(devinfo->gen >= 6);
1427
1428 const fs_builder abld = bld.annotate("compute sample id");
1429 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1430
1431 if (key->compute_sample_id) {
1432 fs_reg t1 = vgrf(glsl_type::int_type);
1433 fs_reg t2 = vgrf(glsl_type::int_type);
1434 t2.type = BRW_REGISTER_TYPE_UW;
1435
1436 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1437 * 8x multisampling, subspan 0 will represent sample N (where N
1438 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1439 * 7. We can find the value of N by looking at R0.0 bits 7:6
1440 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1441 * (since samples are always delivered in pairs). That is, we
1442 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1443 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1444 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1445 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1446 * populating a temporary variable with the sequence (0, 1, 2, 3),
1447 * and then reading from it using vstride=1, width=4, hstride=0.
1448 * These computations hold good for 4x multisampling as well.
1449 *
1450 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1451 * the first four slots are sample 0 of subspan 0; the next four
1452 * are sample 1 of subspan 0; the third group is sample 0 of
1453 * subspan 1, and finally sample 1 of subspan 1.
1454 */
1455 abld.exec_all()
1456 .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1457 fs_reg(0xc0));
1458 abld.exec_all().SHR(t1, t1, fs_reg(5));
1459
1460 /* This works for both SIMD8 and SIMD16 */
1461 abld.exec_all()
1462 .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
1463
1464 /* This special instruction takes care of setting vstride=1,
1465 * width=4, hstride=0 of t2 during an ADD instruction.
1466 */
1467 abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1468 } else {
1469 /* As per GL_ARB_sample_shading specification:
1470 * "When rendering to a non-multisample buffer, or if multisample
1471 * rasterization is disabled, gl_SampleID will always be zero."
1472 */
1473 abld.MOV(*reg, fs_reg(0));
1474 }
1475
1476 return reg;
1477 }
1478
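/**
 * Resolve any abs/negate source modifiers on \p src by copying it into a
 * fresh temporary, leaving \p src modifier-free.
 */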
1479 void
1480 fs_visitor::resolve_source_modifiers(fs_reg *src)
1481 {
1482 if (!src->abs && !src->negate)
1483 return;
1484
1485 fs_reg temp = retype(vgrf(1), src->type);
1486 emit(MOV(temp, *src));
1487 *src = temp;
1488 }
1489
1490 fs_reg
1491 fs_visitor::fix_math_operand(fs_reg src)
1492 {
1493 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1494 * might be able to do better by doing execsize = 1 math and then
1495 * expanding that result out, but we would need to be careful with
1496 * masking.
1497 *
1498 * The hardware ignores source modifiers (negate and abs) on math
1499 * instructions, so we also move to a temp to set those up.
1500 */
1501 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1502 !src.abs && !src.negate)
1503 return src;
1504
1505 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1506 * operands to math
1507 */
1508 if (devinfo->gen >= 7 && src.file != IMM)
1509 return src;
1510
1511 fs_reg expanded = vgrf(glsl_type::float_type);
1512 expanded.type = src.type;
1513 emit(BRW_OPCODE_MOV, expanded, src);
1514 return expanded;
1515 }
1516
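/**
 * Emit a single-operand math instruction, applying the operand restrictions
 * and message setup required on older hardware generations.
 */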
1517 fs_inst *
1518 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1519 {
1520 switch (opcode) {
1521 case SHADER_OPCODE_RCP:
1522 case SHADER_OPCODE_RSQ:
1523 case SHADER_OPCODE_SQRT:
1524 case SHADER_OPCODE_EXP2:
1525 case SHADER_OPCODE_LOG2:
1526 case SHADER_OPCODE_SIN:
1527 case SHADER_OPCODE_COS:
1528 break;
1529 default:
1530 unreachable("not reached: bad math opcode");
1531 }
1532
1533 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1534 * might be able to do better by doing execsize = 1 math and then
1535 * expanding that result out, but we would need to be careful with
1536 * masking.
1537 *
1538 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1539 * instructions, so we also move to a temp to set those up.
1540 */
1541 if (devinfo->gen == 6 || devinfo->gen == 7)
1542 src = fix_math_operand(src);
1543
1544 fs_inst *inst = emit(opcode, dst, src);
1545
1546 if (devinfo->gen < 6) {
1547 inst->base_mrf = 2;
1548 inst->mlen = dispatch_width / 8;
1549 }
1550
1551 return inst;
1552 }
1553
1554 fs_inst *
1555 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1556 {
1557 int base_mrf = 2;
1558 fs_inst *inst;
1559
1560 if (devinfo->gen >= 8) {
1561 inst = emit(opcode, dst, src0, src1);
1562 } else if (devinfo->gen >= 6) {
1563 src0 = fix_math_operand(src0);
1564 src1 = fix_math_operand(src1);
1565
1566 inst = emit(opcode, dst, src0, src1);
1567 } else {
1568 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1569 * "Message Payload":
1570 *
1571 * "Operand0[7]. For the INT DIV functions, this operand is the
1572 * denominator."
1573 * ...
1574 * "Operand1[7]. For the INT DIV functions, this operand is the
1575 * numerator."
1576 */
1577 bool is_int_div = opcode != SHADER_OPCODE_POW;
1578 fs_reg &op0 = is_int_div ? src1 : src0;
1579 fs_reg &op1 = is_int_div ? src0 : src1;
1580
1581 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1582 inst = emit(opcode, dst, op0, reg_null_f);
1583
1584 inst->base_mrf = base_mrf;
1585 inst->mlen = 2 * dispatch_width / 8;
1586 }
1587 return inst;
1588 }
1589
1590 void
1591 fs_visitor::emit_discard_jump()
1592 {
1593 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1594
1595 /* For performance, after a discard, jump to the end of the
1596 * shader if all relevant channels have been discarded.
1597 */
1598 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1599 discard_jump->flag_subreg = 1;
1600
1601 discard_jump->predicate = (dispatch_width == 8)
1602 ? BRW_PREDICATE_ALIGN1_ANY8H
1603 : BRW_PREDICATE_ALIGN1_ANY16H;
1604 discard_jump->predicate_inverse = true;
1605 }
1606
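/**
 * Assign UNIFORM file registers to their fixed push constant (CURB)
 * locations in the thread payload.
 */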
1607 void
1608 fs_visitor::assign_curb_setup()
1609 {
1610 if (dispatch_width == 8) {
1611 prog_data->dispatch_grf_start_reg = payload.num_regs;
1612 } else {
1613 if (stage == MESA_SHADER_FRAGMENT) {
1614 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1615 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1616 } else if (stage == MESA_SHADER_COMPUTE) {
1617 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1618 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1619 } else {
1620 unreachable("Unsupported shader type!");
1621 }
1622 }
1623
1624 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1625
1626 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1627 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1628 for (unsigned int i = 0; i < inst->sources; i++) {
1629 if (inst->src[i].file == UNIFORM) {
1630 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1631 int constant_nr;
1632 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1633 constant_nr = push_constant_loc[uniform_nr];
1634 } else {
1635 /* Section 5.11 of the OpenGL 4.1 spec says:
1636 * "Out-of-bounds reads return undefined values, which include
1637 * values from other variables of the active program or zero."
1638 * Just return the first push constant.
1639 */
1640 constant_nr = 0;
1641 }
1642
1643 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1644 constant_nr / 8,
1645 constant_nr % 8);
1646
1647 inst->src[i].file = HW_REG;
1648 inst->src[i].fixed_hw_reg = byte_offset(
1649 retype(brw_reg, inst->src[i].type),
1650 inst->src[i].subreg_offset);
1651 }
1652 }
1653 }
1654 }
1655
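/**
 * Decide which URB slot each varying input is read from and record the
 * mapping in prog_data->urb_setup[].
 */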
1656 void
1657 fs_visitor::calculate_urb_setup()
1658 {
1659 assert(stage == MESA_SHADER_FRAGMENT);
1660 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1661 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1662
1663 memset(prog_data->urb_setup, -1,
1664 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1665
1666 int urb_next = 0;
1667 /* Figure out where each of the incoming setup attributes lands. */
1668 if (devinfo->gen >= 6) {
1669 if (_mesa_bitcount_64(prog->InputsRead &
1670 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1671 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1672 * first 16 varying inputs, so we can put them wherever we want.
1673 * Just put them in order.
1674 *
1675 * This is useful because it means that (a) inputs not used by the
1676 * fragment shader won't take up valuable register space, and (b) we
1677 * won't have to recompile the fragment shader if it gets paired with
1678 * a different vertex (or geometry) shader.
1679 */
1680 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1681 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1682 BITFIELD64_BIT(i)) {
1683 prog_data->urb_setup[i] = urb_next++;
1684 }
1685 }
1686 } else {
1687 /* We have enough input varyings that the SF/SBE pipeline stage can't
1688 * arbitrarily rearrange them to suit our whim; we have to put them
1689 * in an order that matches the output of the previous pipeline stage
1690 * (geometry or vertex shader).
1691 */
1692 struct brw_vue_map prev_stage_vue_map;
1693 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1694 key->input_slots_valid);
1695 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1696 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1697 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1698 slot++) {
1699 int varying = prev_stage_vue_map.slot_to_varying[slot];
1700 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1701 * unused.
1702 */
1703 if (varying != BRW_VARYING_SLOT_COUNT &&
1704 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1705 BITFIELD64_BIT(varying))) {
1706 prog_data->urb_setup[varying] = slot - first_slot;
1707 }
1708 }
1709 urb_next = prev_stage_vue_map.num_slots - first_slot;
1710 }
1711 } else {
1712 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1713 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1714 /* Point size is packed into the header, not as a general attribute */
1715 if (i == VARYING_SLOT_PSIZ)
1716 continue;
1717
1718 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1719 /* The back color slot is skipped when the front color is
1720 * also written to. In addition, some slots can be
1721 * written in the vertex shader and not read in the
1722 * fragment shader. So the register number must always be
1723 * incremented, mapped or not.
1724 */
1725 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1726 prog_data->urb_setup[i] = urb_next;
1727 urb_next++;
1728 }
1729 }
1730
1731 /*
1732     * It's an FS-only attribute, and we did interpolation for this attribute
1733     * in the SF thread, so count it here, too.
1734 *
1735 * See compile_sf_prog() for more info.
1736 */
1737 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1738 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1739 }
1740
1741 prog_data->num_varying_inputs = urb_next;
1742 }
1743
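/**
 * Now that the location of the push constants is known, rewrite the fixed
 * HW register numbers used by the interpolation instructions to point at
 * the actual URB setup data.
 */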
1744 void
1745 fs_visitor::assign_urb_setup()
1746 {
1747 assert(stage == MESA_SHADER_FRAGMENT);
1748 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1749
1750 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1751
1752 /* Offset all the urb_setup[] index by the actual position of the
1753 * setup regs, now that the location of the constants has been chosen.
1754 */
1755 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1756 if (inst->opcode == FS_OPCODE_LINTERP) {
1757 assert(inst->src[1].file == HW_REG);
1758 inst->src[1].fixed_hw_reg.nr += urb_start;
1759 }
1760
1761 if (inst->opcode == FS_OPCODE_CINTERP) {
1762 assert(inst->src[0].file == HW_REG);
1763 inst->src[0].fixed_hw_reg.nr += urb_start;
1764 }
1765 }
1766
1767 /* Each attribute is 4 setup channels, each of which is half a reg. */
1768 this->first_non_payload_grf =
1769 urb_start + prog_data->num_varying_inputs * 2;
1770 }
1771
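/**
 * Map ATTR file registers to the hardware GRFs where the vertex attributes
 * are delivered in the thread payload.
 */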
1772 void
1773 fs_visitor::assign_vs_urb_setup()
1774 {
1775 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1776 int grf, count, slot, channel, attr;
1777
1778 assert(stage == MESA_SHADER_VERTEX);
1779 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1780 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1781 count++;
1782
1783 /* Each attribute is 4 regs. */
1784 this->first_non_payload_grf =
1785 payload.num_regs + prog_data->curb_read_length + count * 4;
1786
1787 unsigned vue_entries =
1788 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1789
1790 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1791 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1792
1793 assert(vs_prog_data->base.urb_read_length <= 15);
1794
1795 /* Rewrite all ATTR file references to the hw grf that they land in. */
1796 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1797 for (int i = 0; i < inst->sources; i++) {
1798 if (inst->src[i].file == ATTR) {
1799
1800 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1801 slot = count - 1;
1802 } else {
1803              /* Attributes arrive in a contiguous block, ordered by their
1804 * gl_vert_attrib value. That means we can compute the slot
1805 * number for an attribute by masking out the enabled
1806 * attributes before it and counting the bits.
1807 */
1808 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1809 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1810 BITFIELD64_MASK(attr));
1811 }
1812
1813 channel = inst->src[i].reg_offset & 3;
1814
1815 grf = payload.num_regs +
1816 prog_data->curb_read_length +
1817 slot * 4 + channel;
1818
1819 inst->src[i].file = HW_REG;
1820 inst->src[i].fixed_hw_reg =
1821 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1822 }
1823 }
1824 }
1825 }
1826
1827 /**
1828 * Split large virtual GRFs into separate components if we can.
1829 *
1830 * This is mostly duplicated with what brw_fs_vector_splitting does,
1831 * but that's really conservative because it's afraid of doing
1832 * splitting that doesn't result in real progress after the rest of
1833 * the optimization phases, which would cause infinite looping in
1834 * optimization. We can do it once here, safely. This also has the
1835 * opportunity to split interpolated values, or maybe even uniforms,
1836 * which we don't have at the IR level.
1837 *
1838 * We want to split, because virtual GRFs are what we register
1839 * allocate and spill (due to contiguousness requirements for some
1840 * instructions), and they're what we naturally generate in the
1841 * codegen process, but most virtual GRFs don't actually need to be
1842 * contiguous sets of GRFs. If we split, we'll end up with reduced
1843 * live intervals and better dead code elimination and coalescing.
1844 */
1845 void
1846 fs_visitor::split_virtual_grfs()
1847 {
1848 int num_vars = this->alloc.count;
1849
1850 /* Count the total number of registers */
1851 int reg_count = 0;
1852 int vgrf_to_reg[num_vars];
1853 for (int i = 0; i < num_vars; i++) {
1854 vgrf_to_reg[i] = reg_count;
1855 reg_count += alloc.sizes[i];
1856 }
1857
1858 /* An array of "split points". For each register slot, this indicates
1859 * if this slot can be separated from the previous slot. Every time an
1860 * instruction uses multiple elements of a register (as a source or
1861 * destination), we mark the used slots as inseparable. Then we go
1862 * through and split the registers into the smallest pieces we can.
1863 */
1864 bool split_points[reg_count];
1865 memset(split_points, 0, sizeof(split_points));
1866
1867 /* Mark all used registers as fully splittable */
1868 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1869 if (inst->dst.file == GRF) {
1870 int reg = vgrf_to_reg[inst->dst.reg];
1871 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1872 split_points[reg + j] = true;
1873 }
1874
1875 for (int i = 0; i < inst->sources; i++) {
1876 if (inst->src[i].file == GRF) {
1877 int reg = vgrf_to_reg[inst->src[i].reg];
1878 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1879 split_points[reg + j] = true;
1880 }
1881 }
1882 }
1883
1884 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1885 if (inst->dst.file == GRF) {
1886 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1887 for (int j = 1; j < inst->regs_written; j++)
1888 split_points[reg + j] = false;
1889 }
1890 for (int i = 0; i < inst->sources; i++) {
1891 if (inst->src[i].file == GRF) {
1892 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1893 for (int j = 1; j < inst->regs_read(i); j++)
1894 split_points[reg + j] = false;
1895 }
1896 }
1897 }
1898
1899 int new_virtual_grf[reg_count];
1900 int new_reg_offset[reg_count];
1901
1902 int reg = 0;
1903 for (int i = 0; i < num_vars; i++) {
1904 /* The first one should always be 0 as a quick sanity check. */
1905 assert(split_points[reg] == false);
1906
1907 /* j = 0 case */
1908 new_reg_offset[reg] = 0;
1909 reg++;
1910 int offset = 1;
1911
1912 /* j > 0 case */
1913 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1914 /* If this is a split point, allocate a new virtual GRF for the
1915 * previous 'offset' registers and reset the offset to 0.
1916 */
1917 if (split_points[reg]) {
1918 assert(offset <= MAX_VGRF_SIZE);
1919 int grf = alloc.allocate(offset);
1920 for (int k = reg - offset; k < reg; k++)
1921 new_virtual_grf[k] = grf;
1922 offset = 0;
1923 }
1924 new_reg_offset[reg] = offset;
1925 offset++;
1926 reg++;
1927 }
1928
1929 /* The last one gets the original register number */
1930 assert(offset <= MAX_VGRF_SIZE);
1931 alloc.sizes[i] = offset;
1932 for (int k = reg - offset; k < reg; k++)
1933 new_virtual_grf[k] = i;
1934 }
1935 assert(reg == reg_count);
1936
1937 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1938 if (inst->dst.file == GRF) {
1939 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1940 inst->dst.reg = new_virtual_grf[reg];
1941 inst->dst.reg_offset = new_reg_offset[reg];
1942 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1943 }
1944 for (int i = 0; i < inst->sources; i++) {
1945 if (inst->src[i].file == GRF) {
1946 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1947 inst->src[i].reg = new_virtual_grf[reg];
1948 inst->src[i].reg_offset = new_reg_offset[reg];
1949 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1950 }
1951 }
1952 }
1953 invalidate_live_intervals();
1954 }
1955
1956 /**
1957 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1958 *
1959 * During code generation, we create tons of temporary variables, many of
1960 * which get immediately killed and are never used again. Yet, in later
1961 * optimization and analysis passes, such as compute_live_intervals, we need
1962 * to loop over all the virtual GRFs. Compacting them can save a lot of
1963 * overhead.
1964 */
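/* Small worked example (hypothetical numbering): if only vgrf0, vgrf2 and
 * vgrf5 out of six VGRFs are still referenced, the remap table built below
 * ends up as 0->0, 2->1, 5->2 with the unused entries left at -1,
 * alloc.count drops from 6 to 3, and later passes only iterate over the
 * three live VGRFs.
 */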
1965 bool
1966 fs_visitor::compact_virtual_grfs()
1967 {
1968 bool progress = false;
1969 int remap_table[this->alloc.count];
1970 memset(remap_table, -1, sizeof(remap_table));
1971
1972 /* Mark which virtual GRFs are used. */
1973 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1974 if (inst->dst.file == GRF)
1975 remap_table[inst->dst.reg] = 0;
1976
1977 for (int i = 0; i < inst->sources; i++) {
1978 if (inst->src[i].file == GRF)
1979 remap_table[inst->src[i].reg] = 0;
1980 }
1981 }
1982
1983 /* Compact the GRF arrays. */
1984 int new_index = 0;
1985 for (unsigned i = 0; i < this->alloc.count; i++) {
1986 if (remap_table[i] == -1) {
1987 /* We just found an unused register. This means that we are
1988 * actually going to compact something.
1989 */
1990 progress = true;
1991 } else {
1992 remap_table[i] = new_index;
1993 alloc.sizes[new_index] = alloc.sizes[i];
1994 invalidate_live_intervals();
1995 ++new_index;
1996 }
1997 }
1998
1999 this->alloc.count = new_index;
2000
2001 /* Patch all the instructions to use the newly renumbered registers */
2002 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2003 if (inst->dst.file == GRF)
2004 inst->dst.reg = remap_table[inst->dst.reg];
2005
2006 for (int i = 0; i < inst->sources; i++) {
2007 if (inst->src[i].file == GRF)
2008 inst->src[i].reg = remap_table[inst->src[i].reg];
2009 }
2010 }
2011
2012 /* Patch all the references to delta_xy, since they're used in register
2013 * allocation. If they're unused, switch them to BAD_FILE so we don't
2014 * think some random VGRF is delta_xy.
2015 */
2016 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2017 if (delta_xy[i].file == GRF) {
2018 if (remap_table[delta_xy[i].reg] != -1) {
2019 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2020 } else {
2021 delta_xy[i].file = BAD_FILE;
2022 }
2023 }
2024 }
2025
2026 return progress;
2027 }
2028
2029 /*
2030 * Implements array access of uniforms by inserting a
2031 * PULL_CONSTANT_LOAD instruction.
2032 *
2033 * Unlike temporary GRF array access (where we don't support it due to
2034 * the difficulty of doing relative addressing on instruction
2035 * destinations), we could potentially do array access of uniforms
2036 * that were loaded in GRF space as push constants. In real-world
2037 * usage we've seen, though, the arrays being used are always larger
2038 * than we could load as push constants, so just always move all
2039 * uniform array access out to a pull constant buffer.
2040 */
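/* Illustrative example (hypothetical GLSL, not from a real application): for
 * "uniform vec4 arr[64]; ... arr[i]", the variably-indexed UNIFORM source
 * (reg + reladdr) causes a copy of the whole array to be appended to
 * pull_param[], with pull_constant_loc[] recording where each element
 * landed so that demote_pull_constants() can later turn the access into a
 * VARYING_PULL_CONSTANT_LOAD.
 */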
2041 void
2042 fs_visitor::move_uniform_array_access_to_pull_constants()
2043 {
2044 if (dispatch_width != 8)
2045 return;
2046
2047 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2048 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2049
2050 /* Walk through and find array access of uniforms. Put a copy of that
2051 * uniform in the pull constant buffer.
2052 *
2053 * Note that we don't move constant-indexed accesses to arrays. No
2054 * testing has been done of the performance impact of this choice.
2055 */
2056 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2057 for (int i = 0 ; i < inst->sources; i++) {
2058 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2059 continue;
2060
2061 int uniform = inst->src[i].reg;
2062
2063 /* If this array isn't already present in the pull constant buffer,
2064 * add it.
2065 */
2066 if (pull_constant_loc[uniform] == -1) {
2067 const gl_constant_value **values = &stage_prog_data->param[uniform];
2068
2069 assert(param_size[uniform]);
2070
2071 for (int j = 0; j < param_size[uniform]; j++) {
2072 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2073
2074 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2075 values[j];
2076 }
2077 }
2078 }
2079 }
2080 }
2081
2082 /**
2083 * Assign UNIFORM file registers to either push constants or pull constants.
2084 *
2085 * We allow a fragment shader to have more than the minimum required
2086 * maximum number of fragment shader uniform components (64). If
2087 * there are too many of these, they'd fill up all of the register space.
2088 * So, this will push some of them out to the pull constant buffer and
2089 * update the program to load them.
2090 */
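/* Worked example under assumed numbers: with the limit of 16 registers * 8
 * components = 128 used below, a shader with 200 live uniform components
 * (none already demoted) would keep the first 128 as push constants
 * (push_constant_loc[i] = 0..127) and demote the remaining 72 to the pull
 * constant buffer (pull_constant_loc[i] set, push_constant_loc[i] = -1).
 */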
2091 void
2092 fs_visitor::assign_constant_locations()
2093 {
2094 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2095 if (dispatch_width != 8)
2096 return;
2097
2098 /* Find which UNIFORM registers are still in use. */
2099 bool is_live[uniforms];
2100 for (unsigned int i = 0; i < uniforms; i++) {
2101 is_live[i] = false;
2102 }
2103
2104 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2105 for (int i = 0; i < inst->sources; i++) {
2106 if (inst->src[i].file != UNIFORM)
2107 continue;
2108
2109 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2110 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2111 is_live[constant_nr] = true;
2112 }
2113 }
2114
2115 /* Only allow 16 registers (128 uniform components) as push constants.
2116 *
2117 * Just demote the end of the list. We could probably do better
2118 * here, demoting things that are rarely used in the program first.
2119 *
2120 * If changing this value, note the limitation about total_regs in
2121 * brw_curbe.c.
2122 */
2123 unsigned int max_push_components = 16 * 8;
2124 unsigned int num_push_constants = 0;
2125
2126 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2127
2128 for (unsigned int i = 0; i < uniforms; i++) {
2129 if (!is_live[i] || pull_constant_loc[i] != -1) {
2130 /* This UNIFORM register is either dead, or has already been demoted
2131 * to a pull const. Mark it as no longer living in the param[] array.
2132 */
2133 push_constant_loc[i] = -1;
2134 continue;
2135 }
2136
2137 if (num_push_constants < max_push_components) {
2138 /* Retain as a push constant. Record the location in the param[]
2139 * array.
2140 */
2141 push_constant_loc[i] = num_push_constants++;
2142 } else {
2143 /* Demote to a pull constant. */
2144 push_constant_loc[i] = -1;
2145
2146 int pull_index = stage_prog_data->nr_pull_params++;
2147 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2148 pull_constant_loc[i] = pull_index;
2149 }
2150 }
2151
2152 stage_prog_data->nr_params = num_push_constants;
2153
2154 /* Up until now, the param[] array has been indexed by reg + reg_offset
2155 * of UNIFORM registers. Condense it to only contain the uniforms we
2156 * chose to upload as push constants.
2157 */
2158 for (unsigned int i = 0; i < uniforms; i++) {
2159 int remapped = push_constant_loc[i];
2160
2161 if (remapped == -1)
2162 continue;
2163
2164 assert(remapped <= (int)i);
2165 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2166 }
2167 }
2168
2169 /**
2170 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2171 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2172 */
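/* Rough before/after sketch (hypothetical registers):
 *
 *    add(8) vgrf7, vgrf6, u18
 *
 * where u18 was assigned to the pull buffer becomes approximately
 *
 *    uniform_pull_constant_load(8) vgrf8, surf_index, offset
 *    add(8) vgrf7, vgrf6, vgrf8.<smear>
 *
 * i.e. the UNIFORM operand is rewritten to read the freshly loaded VGRF.
 */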
2173 void
2174 fs_visitor::demote_pull_constants()
2175 {
2176 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2177 for (int i = 0; i < inst->sources; i++) {
2178 if (inst->src[i].file != UNIFORM)
2179 continue;
2180
2181 int pull_index;
2182 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2183 if (location >= uniforms) /* Out of bounds access */
2184 pull_index = -1;
2185 else
2186 pull_index = pull_constant_loc[location];
2187
2188 if (pull_index == -1)
2189 continue;
2190
2191 /* Set up the annotation tracking for newly generated instructions. */
2192 const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
2193 .at(block, inst);
2194 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2195 fs_reg dst = vgrf(glsl_type::float_type);
2196
2197 /* Generate a pull load into dst. */
2198 if (inst->src[i].reladdr) {
2199 VARYING_PULL_CONSTANT_LOAD(ibld, dst,
2200 surf_index,
2201 *inst->src[i].reladdr,
2202 pull_index);
2203 inst->src[i].reladdr = NULL;
2204 } else {
2205 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2206 ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
2207 dst, surf_index, offset);
2208 inst->src[i].set_smear(pull_index & 3);
2209 }
2210
2211 /* Rewrite the instruction to use the temporary VGRF. */
2212 inst->src[i].file = GRF;
2213 inst->src[i].reg = dst.reg;
2214 inst->src[i].reg_offset = 0;
2215 inst->src[i].width = dispatch_width;
2216 }
2217 }
2218 invalidate_live_intervals();
2219 }
2220
2221 bool
2222 fs_visitor::opt_algebraic()
2223 {
2224 bool progress = false;
2225
2226 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2227 switch (inst->opcode) {
2228 case BRW_OPCODE_MOV:
2229 if (inst->src[0].file != IMM)
2230 break;
2231
2232 if (inst->saturate) {
2233 if (inst->dst.type != inst->src[0].type)
2234 assert(!"unimplemented: saturate mixed types");
2235
2236 if (brw_saturate_immediate(inst->dst.type,
2237 &inst->src[0].fixed_hw_reg)) {
2238 inst->saturate = false;
2239 progress = true;
2240 }
2241 }
2242 break;
2243
2244 case BRW_OPCODE_MUL:
2245 if (inst->src[1].file != IMM)
2246 continue;
2247
2248 /* a * 1.0 = a */
2249 if (inst->src[1].is_one()) {
2250 inst->opcode = BRW_OPCODE_MOV;
2251 inst->src[1] = reg_undef;
2252 progress = true;
2253 break;
2254 }
2255
2256 /* a * -1.0 = -a */
2257 if (inst->src[1].is_negative_one()) {
2258 inst->opcode = BRW_OPCODE_MOV;
2259 inst->src[0].negate = !inst->src[0].negate;
2260 inst->src[1] = reg_undef;
2261 progress = true;
2262 break;
2263 }
2264
2265 /* a * 0.0 = 0.0 */
2266 if (inst->src[1].is_zero()) {
2267 inst->opcode = BRW_OPCODE_MOV;
2268 inst->src[0] = inst->src[1];
2269 inst->src[1] = reg_undef;
2270 progress = true;
2271 break;
2272 }
2273
2274 if (inst->src[0].file == IMM) {
2275 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2276 inst->opcode = BRW_OPCODE_MOV;
2277 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2278 inst->src[1] = reg_undef;
2279 progress = true;
2280 break;
2281 }
2282 break;
2283 case BRW_OPCODE_ADD:
2284 if (inst->src[1].file != IMM)
2285 continue;
2286
2287 /* a + 0.0 = a */
2288 if (inst->src[1].is_zero()) {
2289 inst->opcode = BRW_OPCODE_MOV;
2290 inst->src[1] = reg_undef;
2291 progress = true;
2292 break;
2293 }
2294
2295 if (inst->src[0].file == IMM) {
2296 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2297 inst->opcode = BRW_OPCODE_MOV;
2298 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2299 inst->src[1] = reg_undef;
2300 progress = true;
2301 break;
2302 }
2303 break;
2304 case BRW_OPCODE_OR:
2305 if (inst->src[0].equals(inst->src[1])) {
2306 inst->opcode = BRW_OPCODE_MOV;
2307 inst->src[1] = reg_undef;
2308 progress = true;
2309 break;
2310 }
2311 break;
2312 case BRW_OPCODE_LRP:
2313 if (inst->src[1].equals(inst->src[2])) {
2314 inst->opcode = BRW_OPCODE_MOV;
2315 inst->src[0] = inst->src[1];
2316 inst->src[1] = reg_undef;
2317 inst->src[2] = reg_undef;
2318 progress = true;
2319 break;
2320 }
2321 break;
2322 case BRW_OPCODE_CMP:
2323 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2324 inst->src[0].abs &&
2325 inst->src[0].negate &&
2326 inst->src[1].is_zero()) {
2327 inst->src[0].abs = false;
2328 inst->src[0].negate = false;
2329 inst->conditional_mod = BRW_CONDITIONAL_Z;
2330 progress = true;
2331 break;
2332 }
2333 break;
2334 case BRW_OPCODE_SEL:
2335 if (inst->src[0].equals(inst->src[1])) {
2336 inst->opcode = BRW_OPCODE_MOV;
2337 inst->src[1] = reg_undef;
2338 inst->predicate = BRW_PREDICATE_NONE;
2339 inst->predicate_inverse = false;
2340 progress = true;
2341 } else if (inst->saturate && inst->src[1].file == IMM) {
2342 switch (inst->conditional_mod) {
2343 case BRW_CONDITIONAL_LE:
2344 case BRW_CONDITIONAL_L:
2345 switch (inst->src[1].type) {
2346 case BRW_REGISTER_TYPE_F:
2347 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2348 inst->opcode = BRW_OPCODE_MOV;
2349 inst->src[1] = reg_undef;
2350 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2351 progress = true;
2352 }
2353 break;
2354 default:
2355 break;
2356 }
2357 break;
2358 case BRW_CONDITIONAL_GE:
2359 case BRW_CONDITIONAL_G:
2360 switch (inst->src[1].type) {
2361 case BRW_REGISTER_TYPE_F:
2362 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2363 inst->opcode = BRW_OPCODE_MOV;
2364 inst->src[1] = reg_undef;
2365 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2366 progress = true;
2367 }
2368 break;
2369 default:
2370 break;
2371 }
2372 default:
2373 break;
2374 }
2375 }
2376 break;
2377 case BRW_OPCODE_MAD:
2378 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2379 inst->opcode = BRW_OPCODE_MOV;
2380 inst->src[1] = reg_undef;
2381 inst->src[2] = reg_undef;
2382 progress = true;
2383 } else if (inst->src[0].is_zero()) {
2384 inst->opcode = BRW_OPCODE_MUL;
2385 inst->src[0] = inst->src[2];
2386 inst->src[2] = reg_undef;
2387 progress = true;
2388 } else if (inst->src[1].is_one()) {
2389 inst->opcode = BRW_OPCODE_ADD;
2390 inst->src[1] = inst->src[2];
2391 inst->src[2] = reg_undef;
2392 progress = true;
2393 } else if (inst->src[2].is_one()) {
2394 inst->opcode = BRW_OPCODE_ADD;
2395 inst->src[2] = reg_undef;
2396 progress = true;
2397 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2398 inst->opcode = BRW_OPCODE_ADD;
2399 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2400 inst->src[2] = reg_undef;
2401 progress = true;
2402 }
2403 break;
2404 case SHADER_OPCODE_RCP: {
2405 fs_inst *prev = (fs_inst *)inst->prev;
2406 if (prev->opcode == SHADER_OPCODE_SQRT) {
2407 if (inst->src[0].equals(prev->dst)) {
2408 inst->opcode = SHADER_OPCODE_RSQ;
2409 inst->src[0] = prev->src[0];
2410 progress = true;
2411 }
2412 }
2413 break;
2414 }
2415 case SHADER_OPCODE_BROADCAST:
2416 if (is_uniform(inst->src[0])) {
2417 inst->opcode = BRW_OPCODE_MOV;
2418 inst->sources = 1;
2419 inst->force_writemask_all = true;
2420 progress = true;
2421 } else if (inst->src[1].file == IMM) {
2422 inst->opcode = BRW_OPCODE_MOV;
2423 inst->src[0] = component(inst->src[0],
2424 inst->src[1].fixed_hw_reg.dw1.ud);
2425 inst->sources = 1;
2426 inst->force_writemask_all = true;
2427 progress = true;
2428 }
2429 break;
2430
2431 default:
2432 break;
2433 }
2434
2435 /* Swap if src[0] is immediate. */
2436 if (progress && inst->is_commutative()) {
2437 if (inst->src[0].file == IMM) {
2438 fs_reg tmp = inst->src[1];
2439 inst->src[1] = inst->src[0];
2440 inst->src[0] = tmp;
2441 }
2442 }
2443 }
2444 return progress;
2445 }
2446
2447 /**
2448 * Optimize sample messages that have constant zero values for the trailing
2449 * texture coordinates. We can just reduce the message length for these
2450 * instructions instead of reserving a register for it. Trailing parameters
2451 * that aren't sent default to zero anyway. Shortening the message also lets
2452 * the dead code eliminator remove the MOV instruction that was used to
2453 * set up the zero value.
2454 */
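/* Illustrative example (hypothetical payload layout): a SIMD8 txl message
 * whose trailing LOD argument is a constant 0.0f can have its mlen reduced
 * by one register; the hardware supplies zero for the missing parameter and
 * the MOV that built that payload register is left for dead code
 * elimination.
 */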
2455 bool
2456 fs_visitor::opt_zero_samples()
2457 {
2458 /* Gen4 infers the texturing opcode based on the message length so we can't
2459 * change it.
2460 */
2461 if (devinfo->gen < 5)
2462 return false;
2463
2464 bool progress = false;
2465
2466 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2467 if (!inst->is_tex())
2468 continue;
2469
2470 fs_inst *load_payload = (fs_inst *) inst->prev;
2471
2472 if (load_payload->is_head_sentinel() ||
2473 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2474 continue;
2475
2476 /* We don't want to remove the message header or the first parameter.
2477 * Removing the first parameter is not allowed; see the Haswell PRM
2478 * volume 7, page 149:
2479 *
2480 * "Parameter 0 is required except for the sampleinfo message, which
2481 * has no parameter 0"
2482 */
2483 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2484 load_payload->src[(inst->mlen - inst->header_size) /
2485 (dispatch_width / 8) +
2486 inst->header_size - 1].is_zero()) {
2487 inst->mlen -= dispatch_width / 8;
2488 progress = true;
2489 }
2490 }
2491
2492 if (progress)
2493 invalidate_live_intervals();
2494
2495 return progress;
2496 }
2497
2498 /**
2499 * Optimize sample messages which are followed by the final RT write.
2500 *
2501 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2502 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2503 * final texturing results copied to the framebuffer write payload and modify
2504 * them to write to the framebuffer directly.
2505 */
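/* Rough before/after sketch (hypothetical registers):
 *
 *    tex(16) vgrf10, ...
 *    load_payload(16) vgrf12, ..., vgrf10, ...
 *    fb_write(16) vgrf12 (EOT)
 *
 * becomes a single texturing SEND marked EOT with the render target encoded
 * in its message offset, and the trailing fb_write is removed.
 */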
2506 bool
2507 fs_visitor::opt_sampler_eot()
2508 {
2509 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2510
2511 if (stage != MESA_SHADER_FRAGMENT)
2512 return false;
2513
2514 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2515 return false;
2516
2517 /* FINISHME: It should be possible to implement this optimization when there
2518 * are multiple drawbuffers.
2519 */
2520 if (key->nr_color_regions != 1)
2521 return false;
2522
2523 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2524 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2525 assert(fb_write->eot);
2526 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2527
2528 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2529
2530 /* There wasn't one; nothing to do. */
2531 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2532 return false;
2533
2534 /* This optimization doesn't seem to work for textureGather for some
2535 * reason. I can't find any documentation or known workarounds to indicate
2536 * that this is expected, but considering that it is probably pretty
2537 * unlikely that a shader would directly write out the results from
2538 * textureGather, we might as well just disable it.
2539 */
2540 if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2541 tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2542 return false;
2543
2544 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2545 * It's very likely to be the previous instruction.
2546 */
2547 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2548 if (load_payload->is_head_sentinel() ||
2549 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2550 return false;
2551
2552 assert(!tex_inst->eot); /* We can't get here twice */
2553 assert((tex_inst->offset & (0xff << 24)) == 0);
2554
2555 tex_inst->offset |= fb_write->target << 24;
2556 tex_inst->eot = true;
2557 tex_inst->dst = bld.null_reg_ud();
2558 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2559
2560 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2561 * to create a new LOAD_PAYLOAD command with the same sources and a space
2562 * saved for the header. Using a new destination register not only makes sure
2563 * we have enough space, but it will make sure the dead code eliminator kills
2564 * the instruction that this will replace.
2565 */
2566 if (tex_inst->header_size != 0)
2567 return true;
2568
2569 fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
2570 load_payload->sources + 1);
2571 fs_reg *new_sources =
2572 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2573
2574 new_sources[0] = fs_reg();
2575 for (int i = 0; i < load_payload->sources; i++)
2576 new_sources[i+1] = load_payload->src[i];
2577
2578 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2579 * requires a lot of information about the sources to figure out how many
2580 * registers need to be used. At this stage in our
2581 * optimization, we may not have the appropriate GRFs required by
2582 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2583 * emit the instruction manually.
2584 */
2585 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2586 load_payload->exec_size,
2587 send_header,
2588 new_sources,
2589 load_payload->sources + 1);
2590
2591 new_load_payload->regs_written = load_payload->regs_written + 1;
2592 new_load_payload->header_size = 1;
2593 tex_inst->mlen++;
2594 tex_inst->header_size = 1;
2595 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2596 tex_inst->src[0] = send_header;
2597
2598 return true;
2599 }
2600
2601 bool
2602 fs_visitor::opt_register_renaming()
2603 {
2604 bool progress = false;
2605 int depth = 0;
2606
2607 int remap[alloc.count];
2608 memset(remap, -1, sizeof(int) * alloc.count);
2609
2610 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2611 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2612 depth++;
2613 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2614 inst->opcode == BRW_OPCODE_WHILE) {
2615 depth--;
2616 }
2617
2618 /* Rewrite instruction sources. */
2619 for (int i = 0; i < inst->sources; i++) {
2620 if (inst->src[i].file == GRF &&
2621 remap[inst->src[i].reg] != -1 &&
2622 remap[inst->src[i].reg] != inst->src[i].reg) {
2623 inst->src[i].reg = remap[inst->src[i].reg];
2624 progress = true;
2625 }
2626 }
2627
2628 const int dst = inst->dst.reg;
2629
2630 if (depth == 0 &&
2631 inst->dst.file == GRF &&
2632 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2633 !inst->is_partial_write()) {
2634 if (remap[dst] == -1) {
2635 remap[dst] = dst;
2636 } else {
2637 remap[dst] = alloc.allocate(inst->dst.width / 8);
2638 inst->dst.reg = remap[dst];
2639 progress = true;
2640 }
2641 } else if (inst->dst.file == GRF &&
2642 remap[dst] != -1 &&
2643 remap[dst] != dst) {
2644 inst->dst.reg = remap[dst];
2645 progress = true;
2646 }
2647 }
2648
2649 if (progress) {
2650 invalidate_live_intervals();
2651
2652 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2653 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2654 delta_xy[i].reg = remap[delta_xy[i].reg];
2655 }
2656 }
2657 }
2658
2659 return progress;
2660 }
2661
2662 /**
2663 * Remove redundant or useless discard jumps.
2664 *
2665 * For example, we can eliminate jumps in the following sequence:
2666 *
2667 * discard-jump (redundant with the next jump)
2668 * discard-jump (useless; jumps to the next instruction)
2669 * placeholder-halt
2670 */
2671 bool
2672 fs_visitor::opt_redundant_discard_jumps()
2673 {
2674 bool progress = false;
2675
2676 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2677
2678 fs_inst *placeholder_halt = NULL;
2679 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2680 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2681 placeholder_halt = inst;
2682 break;
2683 }
2684 }
2685
2686 if (!placeholder_halt)
2687 return false;
2688
2689 /* Delete any discard jumps (HALTs) immediately before the placeholder halt. */
2690 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2691 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2692 prev = (fs_inst *) placeholder_halt->prev) {
2693 prev->remove(last_bblock);
2694 progress = true;
2695 }
2696
2697 if (progress)
2698 invalidate_live_intervals();
2699
2700 return progress;
2701 }
2702
2703 bool
2704 fs_visitor::compute_to_mrf()
2705 {
2706 bool progress = false;
2707 int next_ip = 0;
2708
2709 /* No MRFs on Gen >= 7. */
2710 if (devinfo->gen >= 7)
2711 return false;
2712
2713 calculate_live_intervals();
2714
2715 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2716 int ip = next_ip;
2717 next_ip++;
2718
2719 if (inst->opcode != BRW_OPCODE_MOV ||
2720 inst->is_partial_write() ||
2721 inst->dst.file != MRF || inst->src[0].file != GRF ||
2722 inst->dst.type != inst->src[0].type ||
2723 inst->src[0].abs || inst->src[0].negate ||
2724 !inst->src[0].is_contiguous() ||
2725 inst->src[0].subreg_offset)
2726 continue;
2727
2728 /* Work out which hardware MRF registers are written by this
2729 * instruction.
2730 */
2731 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2732 int mrf_high;
2733 if (inst->dst.reg & BRW_MRF_COMPR4) {
2734 mrf_high = mrf_low + 4;
2735 } else if (inst->exec_size == 16) {
2736 mrf_high = mrf_low + 1;
2737 } else {
2738 mrf_high = mrf_low;
2739 }
2740
2741 /* Can't compute-to-MRF this GRF if someone else was going to
2742 * read it later.
2743 */
2744 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2745 continue;
2746
2747 /* Found a move of a GRF to a MRF. Let's see if we can go
2748 * rewrite the thing that made this GRF to write into the MRF.
2749 */
2750 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2751 if (scan_inst->dst.file == GRF &&
2752 scan_inst->dst.reg == inst->src[0].reg) {
2753 /* Found the last thing to write our reg we want to turn
2754 * into a compute-to-MRF.
2755 */
2756
2757 /* If this one instruction didn't populate all the
2758 * channels, bail. We might be able to rewrite everything
2759 * that writes that reg, but it would require smarter
2760 * tracking to delay the rewriting until complete success.
2761 */
2762 if (scan_inst->is_partial_write())
2763 break;
2764
2765 /* Things returning more than one register would need us to
2766 * understand coalescing out more than one MOV at a time.
2767 */
2768 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2769 break;
2770
2771 /* SEND instructions can't have MRF as a destination. */
2772 if (scan_inst->mlen)
2773 break;
2774
2775 if (devinfo->gen == 6) {
2776 /* gen6 math instructions must have the destination be
2777 * GRF, so no compute-to-MRF for them.
2778 */
2779 if (scan_inst->is_math()) {
2780 break;
2781 }
2782 }
2783
2784 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2785 /* Found the creator of our MRF's source value. */
2786 scan_inst->dst.file = MRF;
2787 scan_inst->dst.reg = inst->dst.reg;
2788 scan_inst->saturate |= inst->saturate;
2789 inst->remove(block);
2790 progress = true;
2791 }
2792 break;
2793 }
2794
2795 /* We don't handle control flow here. Most computation of
2796 * values that end up in MRFs happens shortly before the MRF
2797 * write anyway.
2798 */
2799 if (block->start() == scan_inst)
2800 break;
2801
2802 /* You can't read from an MRF, so if someone else reads our
2803 * MRF's source GRF that we wanted to rewrite, that stops us.
2804 */
2805 bool interfered = false;
2806 for (int i = 0; i < scan_inst->sources; i++) {
2807 if (scan_inst->src[i].file == GRF &&
2808 scan_inst->src[i].reg == inst->src[0].reg &&
2809 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2810 interfered = true;
2811 }
2812 }
2813 if (interfered)
2814 break;
2815
2816 if (scan_inst->dst.file == MRF) {
2817 /* If somebody else writes our MRF here, we can't
2818 * compute-to-MRF before that.
2819 */
2820 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2821 int scan_mrf_high;
2822
2823 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2824 scan_mrf_high = scan_mrf_low + 4;
2825 } else if (scan_inst->exec_size == 16) {
2826 scan_mrf_high = scan_mrf_low + 1;
2827 } else {
2828 scan_mrf_high = scan_mrf_low;
2829 }
2830
2831 if (mrf_low == scan_mrf_low ||
2832 mrf_low == scan_mrf_high ||
2833 mrf_high == scan_mrf_low ||
2834 mrf_high == scan_mrf_high) {
2835 break;
2836 }
2837 }
2838
2839 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2840 /* Found a SEND instruction, which means that there are
2841 * live values in MRFs from base_mrf to base_mrf +
2842 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2843 * above it.
2844 */
2845 if (mrf_low >= scan_inst->base_mrf &&
2846 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2847 break;
2848 }
2849 if (mrf_high >= scan_inst->base_mrf &&
2850 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2851 break;
2852 }
2853 }
2854 }
2855 }
2856
2857 if (progress)
2858 invalidate_live_intervals();
2859
2860 return progress;
2861 }
2862
2863 /**
2864 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2865 * flow. We could probably do better here with some form of divergence
2866 * analysis.
2867 */
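/* Illustrative example: outside of control flow all channels are live, so
 * channel 0 is always a valid answer and
 *
 *    find_live_channel(8) vgrf3
 *
 * can simply become
 *
 *    mov(8) vgrf3, 0d
 *
 * with force_writemask_all set, as done below.
 */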
2868 bool
2869 fs_visitor::eliminate_find_live_channel()
2870 {
2871 bool progress = false;
2872 unsigned depth = 0;
2873
2874 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2875 switch (inst->opcode) {
2876 case BRW_OPCODE_IF:
2877 case BRW_OPCODE_DO:
2878 depth++;
2879 break;
2880
2881 case BRW_OPCODE_ENDIF:
2882 case BRW_OPCODE_WHILE:
2883 depth--;
2884 break;
2885
2886 case FS_OPCODE_DISCARD_JUMP:
2887 /* This can potentially make control flow non-uniform until the end
2888 * of the program.
2889 */
2890 return progress;
2891
2892 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2893 if (depth == 0) {
2894 inst->opcode = BRW_OPCODE_MOV;
2895 inst->src[0] = fs_reg(0);
2896 inst->sources = 1;
2897 inst->force_writemask_all = true;
2898 progress = true;
2899 }
2900 break;
2901
2902 default:
2903 break;
2904 }
2905 }
2906
2907 return progress;
2908 }
2909
2910 /**
2911 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2912 * instructions to FS_OPCODE_REP_FB_WRITE.
2913 */
2914 void
2915 fs_visitor::emit_repclear_shader()
2916 {
2917 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2918 int base_mrf = 1;
2919 int color_mrf = base_mrf + 2;
2920
2921 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2922 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2923 mov->force_writemask_all = true;
2924
2925 fs_inst *write;
2926 if (key->nr_color_regions == 1) {
2927 write = emit(FS_OPCODE_REP_FB_WRITE);
2928 write->saturate = key->clamp_fragment_color;
2929 write->base_mrf = color_mrf;
2930 write->target = 0;
2931 write->header_size = 0;
2932 write->mlen = 1;
2933 } else {
2934 assume(key->nr_color_regions > 0);
2935 for (int i = 0; i < key->nr_color_regions; ++i) {
2936 write = emit(FS_OPCODE_REP_FB_WRITE);
2937 write->saturate = key->clamp_fragment_color;
2938 write->base_mrf = base_mrf;
2939 write->target = i;
2940 write->header_size = 2;
2941 write->mlen = 3;
2942 }
2943 }
2944 write->eot = true;
2945
2946 calculate_cfg();
2947
2948 assign_constant_locations();
2949 assign_curb_setup();
2950
2951 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2952 assert(mov->src[0].file == HW_REG);
2953 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2954 }
2955
2956 /**
2957 * Walks through basic blocks, looking for repeated MRF writes and
2958 * removing the later ones.
2959 */
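/* Illustrative example (hypothetical registers): given
 *
 *    mov(8) m3, vgrf4
 *    ...
 *    mov(8) m3, vgrf4
 *
 * with no intervening write to m3 or vgrf4 and no intervening control flow
 * or SEND, the second MOV can be removed since the MRF already holds the
 * value.
 */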
2960 bool
2961 fs_visitor::remove_duplicate_mrf_writes()
2962 {
2963 fs_inst *last_mrf_move[16];
2964 bool progress = false;
2965
2966 /* We'd need to update the MRF tracking to handle compressed instructions. */
2967 if (dispatch_width == 16)
2968 return false;
2969
2970 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2971
2972 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2973 if (inst->is_control_flow()) {
2974 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2975 }
2976
2977 if (inst->opcode == BRW_OPCODE_MOV &&
2978 inst->dst.file == MRF) {
2979 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2980 if (prev_inst && inst->equals(prev_inst)) {
2981 inst->remove(block);
2982 progress = true;
2983 continue;
2984 }
2985 }
2986
2987 /* Clear out the last-write records for MRFs that were overwritten. */
2988 if (inst->dst.file == MRF) {
2989 last_mrf_move[inst->dst.reg] = NULL;
2990 }
2991
2992 if (inst->mlen > 0 && inst->base_mrf != -1) {
2993 /* Found a SEND instruction, which will include two or fewer
2994 * implied MRF writes. We could do better here.
2995 */
2996 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2997 last_mrf_move[inst->base_mrf + i] = NULL;
2998 }
2999 }
3000
3001 /* Clear out any MRF move records whose sources got overwritten. */
3002 if (inst->dst.file == GRF) {
3003 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3004 if (last_mrf_move[i] &&
3005 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3006 last_mrf_move[i] = NULL;
3007 }
3008 }
3009 }
3010
3011 if (inst->opcode == BRW_OPCODE_MOV &&
3012 inst->dst.file == MRF &&
3013 inst->src[0].file == GRF &&
3014 !inst->is_partial_write()) {
3015 last_mrf_move[inst->dst.reg] = inst;
3016 }
3017 }
3018
3019 if (progress)
3020 invalidate_live_intervals();
3021
3022 return progress;
3023 }
3024
3025 static void
3026 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3027 {
3028 /* Clear the flag for registers that actually got read (as expected). */
3029 for (int i = 0; i < inst->sources; i++) {
3030 int grf;
3031 if (inst->src[i].file == GRF) {
3032 grf = inst->src[i].reg;
3033 } else if (inst->src[i].file == HW_REG &&
3034 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3035 grf = inst->src[i].fixed_hw_reg.nr;
3036 } else {
3037 continue;
3038 }
3039
3040 if (grf >= first_grf &&
3041 grf < first_grf + grf_len) {
3042 deps[grf - first_grf] = false;
3043 if (inst->exec_size == 16)
3044 deps[grf - first_grf + 1] = false;
3045 }
3046 }
3047 }
3048
3049 /**
3050 * Implements this workaround for the original 965:
3051 *
3052 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3053 * check for post destination dependencies on this instruction, software
3054 * must ensure that there is no destination hazard for the case of ‘write
3055 * followed by a posted write’ shown in the following example.
3056 *
3057 * 1. mov r3 0
3058 * 2. send r3.xy <rest of send instruction>
3059 * 3. mov r2 r3
3060 *
3061 * Due to no post-destination dependency check on the ‘send’, the above
3062 * code sequence could have two instructions (1 and 2) in flight at the
3063 * same time that both consider ‘r3’ as the target of their final writes."
3064 */
3065 void
3066 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3067 fs_inst *inst)
3068 {
3069 int write_len = inst->regs_written;
3070 int first_write_grf = inst->dst.reg;
3071 bool needs_dep[BRW_MAX_MRF];
3072 assert(write_len < (int)sizeof(needs_dep) - 1);
3073
3074 memset(needs_dep, false, sizeof(needs_dep));
3075 memset(needs_dep, true, write_len);
3076
3077 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3078
3079 /* Walk backwards looking for writes to registers we're writing which
3080 * aren't read since being written. If we hit the start of the program,
3081 * we assume that there are no outstanding dependencies on entry to the
3082 * program.
3083 */
3084 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3085 /* If we hit control flow, assume that there *are* outstanding
3086 * dependencies, and force their cleanup before our instruction.
3087 */
3088 if (block->start() == scan_inst) {
3089 for (int i = 0; i < write_len; i++) {
3090 if (needs_dep[i])
3091 DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
3092 }
3093 return;
3094 }
3095
3096 /* We insert our reads as late as possible on the assumption that any
3097 * instruction but a MOV that might have left us an outstanding
3098 * dependency has more latency than a MOV.
3099 */
3100 if (scan_inst->dst.file == GRF) {
3101 for (int i = 0; i < scan_inst->regs_written; i++) {
3102 int reg = scan_inst->dst.reg + i;
3103
3104 if (reg >= first_write_grf &&
3105 reg < first_write_grf + write_len &&
3106 needs_dep[reg - first_write_grf]) {
3107 DEP_RESOLVE_MOV(bld.at(block, inst), reg);
3108 needs_dep[reg - first_write_grf] = false;
3109 if (scan_inst->exec_size == 16)
3110 needs_dep[reg - first_write_grf + 1] = false;
3111 }
3112 }
3113 }
3114
3115 /* Clear the flag for registers that actually got read (as expected). */
3116 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3117
3118 /* Continue the loop only if we haven't resolved all the dependencies */
3119 int i;
3120 for (i = 0; i < write_len; i++) {
3121 if (needs_dep[i])
3122 break;
3123 }
3124 if (i == write_len)
3125 return;
3126 }
3127 }
3128
3129 /**
3130 * Implements this workaround for the original 965:
3131 *
3132 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3133 * used as a destination register until after it has been sourced by an
3134 * instruction with a different destination register."
3135 */
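/* Sketch of the fix (hypothetical registers): if a SEND writes g14..g17 and
 * a later instruction wants to write g14 before anything has read it, the
 * pass inserts a dependency-resolving MOV (DEP_RESOLVE_MOV) that sources g14
 * first, so the send's destination has been "sourced by an instruction with
 * a different destination register" as the erratum requires.
 */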
3136 void
3137 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3138 {
3139 int write_len = inst->regs_written;
3140 int first_write_grf = inst->dst.reg;
3141 bool needs_dep[BRW_MAX_MRF];
3142 assert(write_len < (int)sizeof(needs_dep) - 1);
3143
3144 memset(needs_dep, false, sizeof(needs_dep));
3145 memset(needs_dep, true, write_len);
3146 /* Walk forwards looking for writes to registers we're writing which aren't
3147 * read before being written.
3148 */
3149 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3150 /* If we hit control flow, force resolve all remaining dependencies. */
3151 if (block->end() == scan_inst) {
3152 for (int i = 0; i < write_len; i++) {
3153 if (needs_dep[i])
3154 DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
3155 }
3156 return;
3157 }
3158
3159 /* Clear the flag for registers that actually got read (as expected). */
3160 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3161
3162 /* We insert our reads as late as possible since they're reading the
3163 * result of a SEND, which has massive latency.
3164 */
3165 if (scan_inst->dst.file == GRF &&
3166 scan_inst->dst.reg >= first_write_grf &&
3167 scan_inst->dst.reg < first_write_grf + write_len &&
3168 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3169 DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
3170 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3171 }
3172
3173 /* Continue the loop only if we haven't resolved all the dependencies */
3174 int i;
3175 for (i = 0; i < write_len; i++) {
3176 if (needs_dep[i])
3177 break;
3178 }
3179 if (i == write_len)
3180 return;
3181 }
3182 }
3183
3184 void
3185 fs_visitor::insert_gen4_send_dependency_workarounds()
3186 {
3187 if (devinfo->gen != 4 || devinfo->is_g4x)
3188 return;
3189
3190 bool progress = false;
3191
3192 /* Note that we're done with register allocation, so GRF fs_regs always
3193 * have a .reg_offset of 0.
3194 */
3195
3196 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3197 if (inst->mlen != 0 && inst->dst.file == GRF) {
3198 insert_gen4_pre_send_dependency_workarounds(block, inst);
3199 insert_gen4_post_send_dependency_workarounds(block, inst);
3200 progress = true;
3201 }
3202 }
3203
3204 if (progress)
3205 invalidate_live_intervals();
3206 }
3207
3208 /**
3209 * Turns the generic expression-style uniform pull constant load instruction
3210 * into a hardware-specific series of instructions for loading a pull
3211 * constant.
3212 *
3213 * The expression style allows the CSE pass before this to optimize out
3214 * repeated loads from the same offset, and gives the pre-register-allocation
3215 * scheduling full flexibility, while the conversion to native instructions
3216 * allows the post-register-allocation scheduler the best information
3217 * possible.
3218 *
3219 * Note that execution masking for setting up pull constant loads is special:
3220 * the channels that need to be written are unrelated to the current execution
3221 * mask, since a later instruction will use one of the result channels as a
3222 * source operand for all 8 or 16 of its channels.
3223 */
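/* Illustrative Gen7+ lowering (hypothetical registers and offset):
 *
 *    uniform_pull_constant_load(8) vgrf6, surf_index, 48u
 *
 * becomes approximately
 *
 *    set_simd4x2_offset(8) vgrf9, 12u            (byte offset / 4)
 *    uniform_pull_constant_load_gen7(8) vgrf6, surf_index, vgrf9
 *
 * with an extra register reserved in the payload for a message header on
 * Gen9+.
 */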
3224 void
3225 fs_visitor::lower_uniform_pull_constant_loads()
3226 {
3227 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3228 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3229 continue;
3230
3231 if (devinfo->gen >= 7) {
3232 /* The offset arg before was a vec4-aligned byte offset. We need to
3233 * turn it into a dword offset.
3234 */
3235 fs_reg const_offset_reg = inst->src[1];
3236 assert(const_offset_reg.file == IMM &&
3237 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3238 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3239 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3240
3241 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3242 * Reserve space for the register.
3243 */
3244 if (devinfo->gen >= 9) {
3245 payload.reg_offset++;
3246 alloc.sizes[payload.reg] = 2;
3247 }
3248
3249 /* This is actually going to be a MOV, but since only the first dword
3250 * is accessed, we have a special opcode to do just that one. Note
3251 * that this needs to be an operation that will be considered a def
3252 * by live variable analysis, or register allocation will explode.
3253 */
3254 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3255 8, payload, const_offset_reg);
3256 setup->force_writemask_all = true;
3257
3258 setup->ir = inst->ir;
3259 setup->annotation = inst->annotation;
3260 inst->insert_before(block, setup);
3261
3262 /* Similarly, this will only populate the first 4 channels of the
3263 * result register (since we only use smear values from 0-3), but we
3264 * don't tell the optimizer.
3265 */
3266 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3267 inst->src[1] = payload;
3268
3269 invalidate_live_intervals();
3270 } else {
3271 /* Before register allocation, we didn't tell the scheduler about the
3272 * MRF we use. We know it's safe to use this MRF because nothing
3273 * else does except for register spill/unspill, which generates and
3274 * uses its MRF within a single IR instruction.
3275 */
3276 inst->base_mrf = 14;
3277 inst->mlen = 1;
3278 }
3279 }
3280 }
3281
3282 bool
3283 fs_visitor::lower_load_payload()
3284 {
3285 bool progress = false;
3286
3287 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3288 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3289 continue;
3290
3291 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3292 assert(inst->saturate == false);
3293
3294 const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
3295 .exec_all(inst->force_writemask_all)
3296 .at(block, inst);
3297 fs_reg dst = inst->dst;
3298
3299 /* Get rid of COMPR4. We'll add it back in if we need it */
3300 if (dst.file == MRF)
3301 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
3302
3303 dst.width = 8;
3304 for (uint8_t i = 0; i < inst->header_size; i++) {
3305 if (inst->src[i].file != BAD_FILE) {
3306 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3307 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3308 mov_src.width = 8;
3309 ibld.exec_all().MOV(mov_dst, mov_src);
3310 }
3311 dst = offset(dst, 1);
3312 }
3313
3314 dst.width = inst->exec_size;
3315 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
3316 inst->exec_size > 8) {
3317 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3318 * a straightforward copy. Instead, the result of the
3319 * LOAD_PAYLOAD is treated as interleaved and the first four
3320 * non-header sources are unpacked as:
3321 *
3322 * m + 0: r0
3323 * m + 1: g0
3324 * m + 2: b0
3325 * m + 3: a0
3326 * m + 4: r1
3327 * m + 5: g1
3328 * m + 6: b1
3329 * m + 7: a1
3330 *
3331 * This is used for gen <= 5 fb writes.
3332 */
3333 assert(inst->exec_size == 16);
3334 assert(inst->header_size + 4 <= inst->sources);
3335 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3336 if (inst->src[i].file != BAD_FILE) {
3337 if (devinfo->has_compr4) {
3338 fs_reg compr4_dst = retype(dst, inst->src[i].type);
3339 compr4_dst.reg |= BRW_MRF_COMPR4;
3340 ibld.MOV(compr4_dst, inst->src[i]);
3341 } else {
3342 /* Platform doesn't have COMPR4. We have to fake it */
3343 fs_reg mov_dst = retype(dst, inst->src[i].type);
3344 mov_dst.width = 8;
3345 ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
3346 ibld.half(1).MOV(offset(mov_dst, 4), half(inst->src[i], 1));
3347 }
3348 }
3349
3350 dst.reg++;
3351 }
3352
3353 /* The loop above only ever incremented us through the first set
3354 * of 4 registers. However, thanks to the magic of COMPR4, we
3355 * actually wrote to the first 8 registers, so we need to take
3356 * that into account now.
3357 */
3358 dst.reg += 4;
3359
3360 /* The COMPR4 code took care of the first 4 sources. We'll let
3361 * the regular path handle any remaining sources. Yes, we are
3362 * modifying the instruction but we're about to delete it so
3363 * this really doesn't hurt anything.
3364 */
3365 inst->header_size += 4;
3366 }
3367
3368 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3369 if (inst->src[i].file != BAD_FILE)
3370 ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
3371 dst = offset(dst, 1);
3372 }
3373
3374 inst->remove(block);
3375 progress = true;
3376 }
3377
3378 if (progress)
3379 invalidate_live_intervals();
3380
3381 return progress;
3382 }
3383
3384 bool
3385 fs_visitor::lower_integer_multiplication()
3386 {
3387 bool progress = false;
3388
3389 /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3390 * directly, but Cherryview cannot.
3391 */
3392 if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3393 return false;
3394
3395 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3396 if (inst->opcode != BRW_OPCODE_MUL ||
3397 inst->dst.is_accumulator() ||
3398 (inst->dst.type != BRW_REGISTER_TYPE_D &&
3399 inst->dst.type != BRW_REGISTER_TYPE_UD))
3400 continue;
3401
3402 const fs_builder ibld = bld.at(block, inst);
3403
3404 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3405 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3406 * src1 are used.
3407 *
3408 * If multiplying by an immediate value that fits in 16-bits, do a
3409 * single MUL instruction with that value in the proper location.
3410 */
3411 if (inst->src[1].file == IMM &&
3412 inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3413 if (devinfo->gen < 7) {
3414 fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3415 inst->dst.type, dispatch_width);
3416 ibld.MOV(imm, inst->src[1]);
3417 ibld.MUL(inst->dst, imm, inst->src[0]);
3418 } else {
3419 ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
3420 }
3421 } else {
3422 /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3423 * do 32-bit integer multiplication in one instruction, but instead
3424 * must do a sequence (which actually calculates a 64-bit result):
3425 *
3426 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3427 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3428 * mov(8) g2<1>D acc0<8,8,1>D
3429 *
3430 * But on Gen > 6, the ability to use the second accumulator register
3431 * (acc1) for non-float data types was removed, preventing a simple
3432 * implementation in SIMD16. A 16-channel result can be calculated by
3433 * executing the three instructions twice in SIMD8, once with quarter
3434 * control of 1Q for the first eight channels and again with 2Q for
3435 * the second eight channels.
3436 *
3437 * Which accumulator register is implicitly accessed (by AccWrEnable
3438 * for instance) is determined by the quarter control. Unfortunately
3439 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3440 * implicit accumulator access by an instruction with 2Q will access
3441 * acc1 regardless of whether the data type is usable in acc1.
3442 *
3443 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3444 * integer data types.
3445 *
3446 * Since we only want the low 32-bits of the result, we can do two
3447 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3448 * adjust the high result and add them (like the mach is doing):
3449 *
3450 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3451 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3452 * shl(8) g9<1>D g8<8,8,1>D 16D
3453 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3454 *
3455 * We avoid the shl instruction by realizing that we only want to add
3456 * the low 16-bits of the "high" result to the high 16-bits of the
3457 * "low" result and using proper regioning on the add:
3458 *
3459 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3460 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3461 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3462 *
3463 * Since it does not use the (single) accumulator register, we can
3464 * schedule multi-component multiplications much better.
3465 */
3466
3467 if (inst->conditional_mod && inst->dst.is_null()) {
3468 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3469 inst->dst.type, dispatch_width);
3470 }
3471 fs_reg low = inst->dst;
3472 fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3473 inst->dst.type, dispatch_width);
3474
3475 if (devinfo->gen >= 7) {
3476 fs_reg src1_0_w = inst->src[1];
3477 fs_reg src1_1_w = inst->src[1];
3478
3479 if (inst->src[1].file == IMM) {
3480 src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3481 src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3482 } else {
3483 src1_0_w.type = BRW_REGISTER_TYPE_UW;
3484 src1_0_w.stride = 2;
3485
3486 src1_1_w.type = BRW_REGISTER_TYPE_UW;
3487 src1_1_w.stride = 2;
3488 src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3489 }
3490 ibld.MUL(low, inst->src[0], src1_0_w);
3491 ibld.MUL(high, inst->src[0], src1_1_w);
3492 } else {
3493 fs_reg src0_0_w = inst->src[0];
3494 fs_reg src0_1_w = inst->src[0];
3495
3496 src0_0_w.type = BRW_REGISTER_TYPE_UW;
3497 src0_0_w.stride = 2;
3498
3499 src0_1_w.type = BRW_REGISTER_TYPE_UW;
3500 src0_1_w.stride = 2;
3501 src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3502
3503 ibld.MUL(low, src0_0_w, inst->src[1]);
3504 ibld.MUL(high, src0_1_w, inst->src[1]);
3505 }
3506
3507 fs_reg dst = inst->dst;
3508 dst.type = BRW_REGISTER_TYPE_UW;
3509 dst.subreg_offset = 2;
3510 dst.stride = 2;
3511
3512 high.type = BRW_REGISTER_TYPE_UW;
3513 high.stride = 2;
3514
3515 low.type = BRW_REGISTER_TYPE_UW;
3516 low.subreg_offset = 2;
3517 low.stride = 2;
3518
3519 ibld.ADD(dst, low, high);
3520
3521 if (inst->conditional_mod) {
3522 fs_reg null(retype(brw_null_reg(), inst->dst.type));
3523 set_condmod(inst->conditional_mod,
3524 ibld.MOV(null, inst->dst));
3525 }
3526 }
3527
3528 inst->remove(block);
3529 progress = true;
3530 }
3531
3532 if (progress)
3533 invalidate_live_intervals();
3534
3535 return progress;
3536 }
3537
3538 void
3539 fs_visitor::dump_instructions()
3540 {
3541 dump_instructions(NULL);
3542 }
3543
3544 void
3545 fs_visitor::dump_instructions(const char *name)
3546 {
3547 FILE *file = stderr;
3548 if (name && geteuid() != 0) {
3549 file = fopen(name, "w");
3550 if (!file)
3551 file = stderr;
3552 }
3553
3554 if (cfg) {
3555 calculate_register_pressure();
3556 int ip = 0, max_pressure = 0;
3557 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3558 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3559 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3560 dump_instruction(inst, file);
3561 ip++;
3562 }
3563 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3564 } else {
3565 int ip = 0;
3566 foreach_in_list(backend_instruction, inst, &instructions) {
3567 fprintf(file, "%4d: ", ip++);
3568 dump_instruction(inst, file);
3569 }
3570 }
3571
3572 if (file != stderr) {
3573 fclose(file);
3574 }
3575 }
3576
3577 void
3578 fs_visitor::dump_instruction(backend_instruction *be_inst)
3579 {
3580 dump_instruction(be_inst, stderr);
3581 }
3582
3583 void
3584 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3585 {
3586 fs_inst *inst = (fs_inst *)be_inst;
3587
3588 if (inst->predicate) {
3589 fprintf(file, "(%cf0.%d) ",
3590 inst->predicate_inverse ? '-' : '+',
3591 inst->flag_subreg);
3592 }
3593
3594 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3595 if (inst->saturate)
3596 fprintf(file, ".sat");
3597 if (inst->conditional_mod) {
3598 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3599 if (!inst->predicate &&
3600 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3601 inst->opcode != BRW_OPCODE_IF &&
3602 inst->opcode != BRW_OPCODE_WHILE))) {
3603 fprintf(file, ".f0.%d", inst->flag_subreg);
3604 }
3605 }
3606 fprintf(file, "(%d) ", inst->exec_size);
3607
3608 if (inst->mlen) {
3609 fprintf(file, "(mlen: %d) ", inst->mlen);
3610 }
3611
3612 switch (inst->dst.file) {
3613 case GRF:
3614 fprintf(file, "vgrf%d", inst->dst.reg);
3615 if (inst->dst.width != dispatch_width)
3616 fprintf(file, "@%d", inst->dst.width);
3617 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3618 inst->dst.subreg_offset)
3619 fprintf(file, "+%d.%d",
3620 inst->dst.reg_offset, inst->dst.subreg_offset);
3621 break;
3622 case MRF:
3623 fprintf(file, "m%d", inst->dst.reg);
3624 break;
3625 case BAD_FILE:
3626 fprintf(file, "(null)");
3627 break;
3628 case UNIFORM:
3629 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3630 break;
3631 case ATTR:
3632 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3633 break;
3634 case HW_REG:
3635 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3636 switch (inst->dst.fixed_hw_reg.nr) {
3637 case BRW_ARF_NULL:
3638 fprintf(file, "null");
3639 break;
3640 case BRW_ARF_ADDRESS:
3641 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3642 break;
3643 case BRW_ARF_ACCUMULATOR:
3644 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3645 break;
3646 case BRW_ARF_FLAG:
3647 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3648 inst->dst.fixed_hw_reg.subnr);
3649 break;
3650 default:
3651 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3652 inst->dst.fixed_hw_reg.subnr);
3653 break;
3654 }
3655 } else {
3656 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3657 }
3658 if (inst->dst.fixed_hw_reg.subnr)
3659 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3660 break;
3661 default:
3662 fprintf(file, "???");
3663 break;
3664 }
3665 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3666
3667 for (int i = 0; i < inst->sources; i++) {
3668 if (inst->src[i].negate)
3669 fprintf(file, "-");
3670 if (inst->src[i].abs)
3671 fprintf(file, "|");
3672 switch (inst->src[i].file) {
3673 case GRF:
3674 fprintf(file, "vgrf%d", inst->src[i].reg);
3675 if (inst->src[i].width != dispatch_width)
3676 fprintf(file, "@%d", inst->src[i].width);
3677 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3678 inst->src[i].subreg_offset)
3679 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3680 inst->src[i].subreg_offset);
3681 break;
3682 case MRF:
3683 fprintf(file, "***m%d***", inst->src[i].reg);
3684 break;
3685 case ATTR:
3686 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3687 break;
3688 case UNIFORM:
3689 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3690 if (inst->src[i].reladdr) {
3691 fprintf(file, "+reladdr");
3692 } else if (inst->src[i].subreg_offset) {
3693 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3694 inst->src[i].subreg_offset);
3695 }
3696 break;
3697 case BAD_FILE:
3698 fprintf(file, "(null)");
3699 break;
3700 case IMM:
3701 switch (inst->src[i].type) {
3702 case BRW_REGISTER_TYPE_F:
3703 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3704 break;
3705 case BRW_REGISTER_TYPE_W:
3706 case BRW_REGISTER_TYPE_D:
3707 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3708 break;
3709 case BRW_REGISTER_TYPE_UW:
3710 case BRW_REGISTER_TYPE_UD:
3711 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3712 break;
3713 case BRW_REGISTER_TYPE_VF:
3714 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3715 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3716 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3717 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3718 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3719 break;
3720 default:
3721 fprintf(file, "???");
3722 break;
3723 }
3724 break;
3725 case HW_REG:
3726 if (inst->src[i].fixed_hw_reg.negate)
3727 fprintf(file, "-");
3728 if (inst->src[i].fixed_hw_reg.abs)
3729 fprintf(file, "|");
3730 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3731 switch (inst->src[i].fixed_hw_reg.nr) {
3732 case BRW_ARF_NULL:
3733 fprintf(file, "null");
3734 break;
3735 case BRW_ARF_ADDRESS:
3736 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3737 break;
3738 case BRW_ARF_ACCUMULATOR:
3739 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3740 break;
3741 case BRW_ARF_FLAG:
3742 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3743 inst->src[i].fixed_hw_reg.subnr);
3744 break;
3745 default:
3746 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3747 inst->src[i].fixed_hw_reg.subnr);
3748 break;
3749 }
3750 } else {
3751 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3752 }
3753 if (inst->src[i].fixed_hw_reg.subnr)
3754 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3755 if (inst->src[i].fixed_hw_reg.abs)
3756 fprintf(file, "|");
3757 break;
3758 default:
3759 fprintf(file, "???");
3760 break;
3761 }
3762 if (inst->src[i].abs)
3763 fprintf(file, "|");
3764
3765 if (inst->src[i].file != IMM) {
3766 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3767 }
3768
3769 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3770 fprintf(file, ", ");
3771 }
3772
3773 fprintf(file, " ");
3774
3775 if (dispatch_width == 16 && inst->exec_size == 8) {
3776 if (inst->force_sechalf)
3777 fprintf(file, "2ndhalf ");
3778 else
3779 fprintf(file, "1sthalf ");
3780 }
3781
3782 fprintf(file, "\n");
3783 }
3784
3785 /**
3786 * Possibly returns an instruction that set up @param reg.
3787 *
3788 * Sometimes we want to take the result of some expression/variable
3789 * dereference tree and rewrite the instruction generating the result
3790 * of the tree. When processing the tree, we know that the
3791 * instructions generated are all writing temporaries that are dead
3792 * outside of this tree. So, if we have some instructions that write
3793 * a temporary, we're free to point that temp write somewhere else.
3794 *
3795 * Note that this doesn't guarantee that the instruction wrote only reg --
3796 * it might be the size=4 destination of a texture instruction.
3797 */
3798 fs_inst *
3799 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3800 fs_inst *end,
3801 const fs_reg &reg)
3802 {
3803 if (end == start ||
3804 end->is_partial_write() ||
3805 reg.reladdr ||
3806 !reg.equals(end->dst)) {
3807 return NULL;
3808 } else {
3809 return end;
3810 }
3811 }
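/* Illustrative use only (not a verbatim caller from this file): after
 * emitting an expression tree into a temporary, a visitor can check whether
 * the last instruction emitted is the one producing the temporary and, if
 * so, retarget its write at the real destination instead of emitting an
 * extra MOV:
 *
 *    fs_inst *end = (fs_inst *) instructions.get_tail();
 *    fs_inst *gen = get_instruction_generating_reg(start, end, tmp);
 *    if (gen)
 *       gen->dst = dst;
 *    else
 *       bld.MOV(dst, tmp);
 */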
3812
3813 void
3814 fs_visitor::setup_payload_gen6()
3815 {
3816 bool uses_depth =
3817 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3818 unsigned barycentric_interp_modes =
3819 (stage == MESA_SHADER_FRAGMENT) ?
3820 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3821
3822 assert(devinfo->gen >= 6);
3823
3824 /* R0-1: masks, pixel X/Y coordinates. */
3825 payload.num_regs = 2;
3826 /* R2: only for 32-pixel dispatch. */
3827
3828 /* R3-26: barycentric interpolation coordinates. These appear in the
3829 * same order that they appear in the brw_wm_barycentric_interp_mode
3830 * enum. Each set of coordinates occupies 2 registers if dispatch width
3831 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3832 * appear if they were enabled using the "Barycentric Interpolation
3833 * Mode" bits in WM_STATE.
3834 */
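/* A worked example (hypothetical mode mask): with two barycentric modes
 * enabled, say perspective pixel and perspective centroid, a SIMD16 shader
 * gets barycentric_coord_reg[] entries of 2 and 6 and ends up with
 * payload.num_regs == 10 after this loop; in SIMD8 each set takes only two
 * registers, giving entries of 2 and 4 and a final count of 6.
 */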
3835 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3836 if (barycentric_interp_modes & (1 << i)) {
3837 payload.barycentric_coord_reg[i] = payload.num_regs;
3838 payload.num_regs += 2;
3839 if (dispatch_width == 16) {
3840 payload.num_regs += 2;
3841 }
3842 }
3843 }
3844
3845 /* R27: interpolated depth if uses source depth */
3846 if (uses_depth) {
3847 payload.source_depth_reg = payload.num_regs;
3848 payload.num_regs++;
3849 if (dispatch_width == 16) {
3850 /* R28: interpolated depth if not SIMD8. */
3851 payload.num_regs++;
3852 }
3853 }
3854 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3855 if (uses_depth) {
3856 payload.source_w_reg = payload.num_regs;
3857 payload.num_regs++;
3858 if (dispatch_width == 16) {
3859 /* R30: interpolated W if not SIMD8. */
3860 payload.num_regs++;
3861 }
3862 }
3863
3864 if (stage == MESA_SHADER_FRAGMENT) {
3865 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3866 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3867 prog_data->uses_pos_offset = key->compute_pos_offset;
3868 /* R31: MSAA position offsets. */
3869 if (prog_data->uses_pos_offset) {
3870 payload.sample_pos_reg = payload.num_regs;
3871 payload.num_regs++;
3872 }
3873 }
3874
3875 /* R32: MSAA input coverage mask */
3876 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3877 assert(devinfo->gen >= 7);
3878 payload.sample_mask_in_reg = payload.num_regs;
3879 payload.num_regs++;
3880 if (dispatch_width == 16) {
3881 /* R33: input coverage mask if not SIMD8. */
3882 payload.num_regs++;
3883 }
3884 }
3885
3886 /* R34-: bary for 32-pixel. */
3887 /* R58-59: interp W for 32-pixel. */
3888
3889 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3890 source_depth_to_render_target = true;
3891 }
3892 }
3893
3894 void
3895 fs_visitor::setup_vs_payload()
3896 {
3897 /* R0: thread header, R1: urb handles */
3898 payload.num_regs = 2;
3899 }
3900
3901 void
3902 fs_visitor::setup_cs_payload()
3903 {
3904 assert(devinfo->gen >= 7);
3905
3906 payload.num_regs = 1;
3907 }
3908
3909 void
3910 fs_visitor::assign_binding_table_offsets()
3911 {
3912 assert(stage == MESA_SHADER_FRAGMENT);
3913 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3914 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3915 uint32_t next_binding_table_offset = 0;
3916
3917 /* If there are no color regions, we still perform an FB write to a null
3918 * renderbuffer, which we place at surface index 0.
3919 */
3920 prog_data->binding_table.render_target_start = next_binding_table_offset;
3921 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3922
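/* For example (hypothetical key): with nr_color_regions == 2 the render
 * targets occupy surface indices 0 and 1 and the common entries set up
 * below (textures, UBOs, etc.) start at 2; with no color regions the null
 * render target sits alone at index 0 and the common entries start at 1.
 */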
3923 assign_common_binding_table_offsets(next_binding_table_offset);
3924 }
3925
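/**
 * Estimate register pressure by summing, at each instruction IP, the sizes
 * of all virtual GRFs whose live interval covers that IP.
 *
 * For example (hypothetical live range): a two-register VGRF live from
 * ip 4 through ip 9 adds 2 to each of regs_live_at_ip[4] .. [9].
 */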
3926 void
3927 fs_visitor::calculate_register_pressure()
3928 {
3929 invalidate_live_intervals();
3930 calculate_live_intervals();
3931
3932 unsigned num_instructions = 0;
3933 foreach_block(block, cfg)
3934 num_instructions += block->instructions.length();
3935
3936 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3937
3938 for (unsigned reg = 0; reg < alloc.count; reg++) {
3939 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3940 regs_live_at_ip[ip] += alloc.sizes[reg];
3941 }
3942 }
3943
3944 void
3945 fs_visitor::optimize()
3946 {
3947 /* bld is the common builder object we used to translate the program into
3948 * i965 IR; at this point it points at the end of the program. For the
3949 * optimization and lowering passes coming next, any code added after the
3950 * end of the program without having explicitly called fs_builder::at()
3951 * clearly points at a mistake. Ideally optimization passes wouldn't be
3952 * part of the visitor so they wouldn't have access to bld at all, but
3953 * they do, so, just in case some pass forgets to ask for a location
3954 * explicitly, set it to NULL here to make it trip.
3955 */
3956 bld = bld.at(NULL, NULL);
3957
3958 split_virtual_grfs();
3959
3960 move_uniform_array_access_to_pull_constants();
3961 assign_constant_locations();
3962 demote_pull_constants();
3963
3964 #define OPT(pass, args...) ({ \
3965 pass_num++; \
3966 bool this_progress = pass(args); \
3967 \
3968 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3969 char filename[64]; \
3970 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3971 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3972 \
3973 backend_shader::dump_instructions(filename); \
3974 } \
3975 \
3976 progress = progress || this_progress; \
3977 this_progress; \
3978 })
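/* When INTEL_DEBUG & DEBUG_OPTIMIZER is set, every pass that reports
 * progress dumps the instruction list to a file named by the snprintf()
 * above, e.g. (hypothetical values) something like
 * "FS8-0003-01-05-opt_cse" for a SIMD8 shader with program name 3,
 * iteration 1, pass 5, where the leading "FS" stands for whatever
 * stage_abbrev holds.
 */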
3979
3980 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3981 char filename[64];
3982 snprintf(filename, 64, "%s%d-%04d-00-start",
3983 stage_abbrev, dispatch_width,
3984 shader_prog ? shader_prog->Name : 0);
3985
3986 backend_shader::dump_instructions(filename);
3987 }
3988
3989 bool progress;
3990 int iteration = 0;
3991 int pass_num = 0;
3992 do {
3993 progress = false;
3994 pass_num = 0;
3995 iteration++;
3996
3997 OPT(remove_duplicate_mrf_writes);
3998
3999 OPT(opt_algebraic);
4000 OPT(opt_cse);
4001 OPT(opt_copy_propagate);
4002 OPT(opt_peephole_predicated_break);
4003 OPT(opt_cmod_propagation);
4004 OPT(dead_code_eliminate);
4005 OPT(opt_peephole_sel);
4006 OPT(dead_control_flow_eliminate, this);
4007 OPT(opt_register_renaming);
4008 OPT(opt_redundant_discard_jumps);
4009 OPT(opt_saturate_propagation);
4010 OPT(opt_zero_samples);
4011 OPT(register_coalesce);
4012 OPT(compute_to_mrf);
4013 OPT(eliminate_find_live_channel);
4014
4015 OPT(compact_virtual_grfs);
4016 } while (progress);
4017
4018 pass_num = 0;
4019
4020 OPT(opt_sampler_eot);
4021
4022 if (OPT(lower_load_payload)) {
4023 split_virtual_grfs();
4024 OPT(register_coalesce);
4025 OPT(compute_to_mrf);
4026 OPT(dead_code_eliminate);
4027 }
4028
4029 OPT(opt_combine_constants);
4030 OPT(lower_integer_multiplication);
4031
4032 lower_uniform_pull_constant_loads();
4033 }
4034
4035 /**
4036 * Three-source instructions must have a GRF/MRF destination register.
4037 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
4038 */
4039 void
4040 fs_visitor::fixup_3src_null_dest()
4041 {
4042 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4043 if (inst->is_3src() && inst->dst.is_null()) {
4044 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4045 inst->dst.type);
4046 }
4047 }
4048 }
4049
4050 void
4051 fs_visitor::allocate_registers()
4052 {
4053 bool allocated_without_spills;
4054
4055 static const enum instruction_scheduler_mode pre_modes[] = {
4056 SCHEDULE_PRE,
4057 SCHEDULE_PRE_NON_LIFO,
4058 SCHEDULE_PRE_LIFO,
4059 };
4060
4061 /* Try each scheduling heuristic to see if it can successfully register
4062 * allocate without spilling. They should be ordered by decreasing
4063 * performance but increasing likelihood of allocating.
4064 */
4065 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4066 schedule_instructions(pre_modes[i]);
4067
4068 if (0) {
4069 assign_regs_trivial();
4070 allocated_without_spills = true;
4071 } else {
4072 allocated_without_spills = assign_regs(false);
4073 }
4074 if (allocated_without_spills)
4075 break;
4076 }
4077
4078 if (!allocated_without_spills) {
4079 /* We assume that any spilling is worse than just dropping back to
4080 * SIMD8. There's probably actually some intermediate point where
4081 * SIMD16 with a couple of spills is still better.
4082 */
4083 if (dispatch_width == 16) {
4084 fail("Failure to register allocate. Reduce number of "
4085 "live scalar values to avoid this.");
4086 } else {
4087 perf_debug("%s shader triggered register spilling. "
4088 "Try reducing the number of live scalar values to "
4089 "improve performance.\n", stage_name);
4090 }
4091
4092 /* Since we're out of heuristics, just go spill registers until we
4093 * get an allocation.
4094 */
4095 while (!assign_regs(true)) {
4096 if (failed)
4097 break;
4098 }
4099 }
4100
4101 /* This must come after all optimization and register allocation, since
4102 * it inserts dead code that happens to have side effects, and it does
4103 * so based on the actual physical registers in use.
4104 */
4105 insert_gen4_send_dependency_workarounds();
4106
4107 if (failed)
4108 return;
4109
4110 if (!allocated_without_spills)
4111 schedule_instructions(SCHEDULE_POST);
4112
4113 if (last_scratch > 0)
4114 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4115 }
4116
4117 bool
4118 fs_visitor::run_vs()
4119 {
4120 assert(stage == MESA_SHADER_VERTEX);
4121
4122 assign_common_binding_table_offsets(0);
4123 setup_vs_payload();
4124
4125 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4126 emit_shader_time_begin();
4127
4128 emit_nir_code();
4129
4130 if (failed)
4131 return false;
4132
4133 emit_urb_writes();
4134
4135 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4136 emit_shader_time_end();
4137
4138 calculate_cfg();
4139
4140 optimize();
4141
4142 assign_curb_setup();
4143 assign_vs_urb_setup();
4144
4145 fixup_3src_null_dest();
4146 allocate_registers();
4147
4148 return !failed;
4149 }
4150
4151 bool
4152 fs_visitor::run_fs()
4153 {
4154 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4155 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4156
4157 assert(stage == MESA_SHADER_FRAGMENT);
4158
4159 sanity_param_count = prog->Parameters->NumParameters;
4160
4161 assign_binding_table_offsets();
4162
4163 if (devinfo->gen >= 6)
4164 setup_payload_gen6();
4165 else
4166 setup_payload_gen4();
4167
4168 if (0) {
4169 emit_dummy_fs();
4170 } else if (brw->use_rep_send && dispatch_width == 16) {
4171 emit_repclear_shader();
4172 } else {
4173 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4174 emit_shader_time_begin();
4175
4176 calculate_urb_setup();
4177 if (prog->InputsRead > 0) {
4178 if (devinfo->gen < 6)
4179 emit_interpolation_setup_gen4();
4180 else
4181 emit_interpolation_setup_gen6();
4182 }
4183
4184 /* We handle discards by keeping track of the still-live pixels in f0.1.
4185 * Initialize it with the dispatched pixels.
4186 */
4187 if (wm_prog_data->uses_kill) {
4188 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4189 discard_init->flag_subreg = 1;
4190 }
4191
4192 /* Generate FS IR for main(). (the visitor only descends into
4193 * functions called "main").
4194 */
4195 emit_nir_code();
4196
4197 if (failed)
4198 return false;
4199
4200 if (wm_prog_data->uses_kill)
4201 emit(FS_OPCODE_PLACEHOLDER_HALT);
4202
4203 if (wm_key->alpha_test_func)
4204 emit_alpha_test();
4205
4206 emit_fb_writes();
4207
4208 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4209 emit_shader_time_end();
4210
4211 calculate_cfg();
4212
4213 optimize();
4214
4215 assign_curb_setup();
4216 assign_urb_setup();
4217
4218 fixup_3src_null_dest();
4219 allocate_registers();
4220
4221 if (failed)
4222 return false;
4223 }
4224
4225 if (dispatch_width == 8)
4226 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4227 else
4228 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4229
4230 /* If any state parameters were appended, then ParameterValues could have
4231 * been realloced, in which case the driver uniform storage set up by
4232 * _mesa_associate_uniform_storage() would point to freed memory. Make
4233 * sure that didn't happen.
4234 */
4235 assert(sanity_param_count == prog->Parameters->NumParameters);
4236
4237 return !failed;
4238 }
4239
4240 bool
4241 fs_visitor::run_cs()
4242 {
4243 assert(stage == MESA_SHADER_COMPUTE);
4244 assert(shader);
4245
4246 sanity_param_count = prog->Parameters->NumParameters;
4247
4248 assign_common_binding_table_offsets(0);
4249
4250 setup_cs_payload();
4251
4252 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4253 emit_shader_time_begin();
4254
4255 emit_nir_code();
4256
4257 if (failed)
4258 return false;
4259
4260 emit_cs_terminate();
4261
4262 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4263 emit_shader_time_end();
4264
4265 calculate_cfg();
4266
4267 optimize();
4268
4269 assign_curb_setup();
4270
4271 fixup_3src_null_dest();
4272 allocate_registers();
4273
4274 if (failed)
4275 return false;
4276
4277 /* If any state parameters were appended, then ParameterValues could have
4278 * been realloced, in which case the driver uniform storage set up by
4279 * _mesa_associate_uniform_storage() would point to freed memory. Make
4280 * sure that didn't happen.
4281 */
4282 assert(sanity_param_count == prog->Parameters->NumParameters);
4283
4284 return !failed;
4285 }
4286
4287 const unsigned *
4288 brw_wm_fs_emit(struct brw_context *brw,
4289 void *mem_ctx,
4290 const struct brw_wm_prog_key *key,
4291 struct brw_wm_prog_data *prog_data,
4292 struct gl_fragment_program *fp,
4293 struct gl_shader_program *prog,
4294 unsigned *final_assembly_size)
4295 {
4296 bool start_busy = false;
4297 double start_time = 0;
4298
4299 if (unlikely(brw->perf_debug)) {
4300 start_busy = (brw->batch.last_bo &&
4301 drm_intel_bo_busy(brw->batch.last_bo));
4302 start_time = get_time();
4303 }
4304
4305 struct brw_shader *shader = NULL;
4306 if (prog)
4307 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4308
4309 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4310 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4311
4312 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4313 */
4314 fs_visitor v(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4315 prog, &fp->Base, 8);
4316 if (!v.run_fs()) {
4317 if (prog) {
4318 prog->LinkStatus = false;
4319 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4320 }
4321
4322 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4323 v.fail_msg);
4324
4325 return NULL;
4326 }
4327
4328 cfg_t *simd16_cfg = NULL;
4329 fs_visitor v2(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4330 prog, &fp->Base, 16);
4331 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4332 if (!v.simd16_unsupported) {
4333 /* Try a SIMD16 compile */
4334 v2.import_uniforms(&v);
4335 if (!v2.run_fs()) {
4336 perf_debug("SIMD16 shader failed to compile, falling back to "
4337 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4338 } else {
4339 simd16_cfg = v2.cfg;
4340 }
4341 } else {
4342 perf_debug("SIMD16 shader unsupported, falling back to "
4343 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4344 }
4345 }
4346
4347 cfg_t *simd8_cfg;
4348 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4349 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4350 simd8_cfg = NULL;
4351 prog_data->no_8 = true;
4352 } else {
4353 simd8_cfg = v.cfg;
4354 prog_data->no_8 = false;
4355 }
4356
4357 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4358 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4359
4360 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4361 char *name;
4362 if (prog)
4363 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4364 prog->Label ? prog->Label : "unnamed",
4365 prog->Name);
4366 else
4367 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4368
4369 g.enable_debug(name);
4370 }
4371
4372 if (simd8_cfg)
4373 g.generate_code(simd8_cfg, 8);
4374 if (simd16_cfg)
4375 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4376
4377 if (unlikely(brw->perf_debug) && shader) {
4378 if (shader->compiled_once)
4379 brw_wm_debug_recompile(brw, prog, key);
4380 shader->compiled_once = true;
4381
4382 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4383 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4384 (get_time() - start_time) * 1000);
4385 }
4386 }
4387
4388 return g.get_assembly(final_assembly_size);
4389 }
4390
4391 extern "C" bool
4392 brw_fs_precompile(struct gl_context *ctx,
4393 struct gl_shader_program *shader_prog,
4394 struct gl_program *prog)
4395 {
4396 struct brw_context *brw = brw_context(ctx);
4397 struct brw_wm_prog_key key;
4398
4399 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4400 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4401 bool program_uses_dfdy = fp->UsesDFdy;
4402
4403 memset(&key, 0, sizeof(key));
4404
4405 if (brw->gen < 6) {
4406 if (fp->UsesKill)
4407 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4408
4409 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4410 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4411
4412 /* Just assume depth testing. */
4413 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4414 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4415 }
4416
4417 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4418 BRW_FS_VARYING_INPUT_MASK) > 16)
4419 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4420
4421 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4422
4423 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4424 key.drawable_height = ctx->DrawBuffer->Height;
4425 }
4426
4427 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4428 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4429 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4430
4431 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4432 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4433 key.nr_color_regions > 1;
4434 }
4435
4436 key.program_string_id = bfp->id;
4437
4438 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4439 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4440
4441 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4442
4443 brw->wm.base.prog_offset = old_prog_offset;
4444 brw->wm.prog_data = old_prog_data;
4445
4446 return success;
4447 }
4448
4449 void
4450 brw_setup_tex_for_precompile(struct brw_context *brw,
4451 struct brw_sampler_prog_key_data *tex,
4452 struct gl_program *prog)
4453 {
4454 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4455 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4456 for (unsigned i = 0; i < sampler_count; i++) {
4457 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4458 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4459 tex->swizzles[i] =
4460 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4461 } else {
4462 /* Color sampler: assume no swizzling. */
4463 tex->swizzles[i] = SWIZZLE_XYZW;
4464 }
4465 }
4466 }