1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
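   /* For example, a MOV to a width-16 GRF destination gets exec_size = 16
    * directly from the destination, while a CMP to the null register (a
    * HW_REG destination) falls through to the source loop and picks up,
    * say, exec_size = 8 from a width-8 GRF source.
    */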
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 this->conditional_mod = BRW_CONDITIONAL_NONE;
94
95 /* This will be the case for almost all instructions. */
96 switch (dst.file) {
97 case GRF:
98 case HW_REG:
99 case MRF:
100 case ATTR:
101 this->regs_written =
102 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
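      /* For example, a width-16 destination of type F with stride 1 covers
       * MAX2(16 * 1, 1) * 4 = 64 bytes, so regs_written = 2, while a
       * stride-0 scalar destination still counts as one register.
       */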
103 break;
104 case BAD_FILE:
105 this->regs_written = 0;
106 break;
107 case IMM:
108 case UNIFORM:
109 unreachable("Invalid destination register file");
110 default:
111 unreachable("Invalid register file");
112 }
113
114 this->writes_accumulator = false;
115 }
116
117 fs_inst::fs_inst()
118 {
119 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
120 }
121
122 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
123 {
124 init(opcode, exec_size, reg_undef, NULL, 0);
125 }
126
127 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
128 {
129 init(opcode, 0, dst, NULL, 0);
130 }
131
132 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
133 const fs_reg &src0)
134 {
135 const fs_reg src[1] = { src0 };
136 init(opcode, exec_size, dst, src, 1);
137 }
138
139 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
140 {
141 const fs_reg src[1] = { src0 };
142 init(opcode, 0, dst, src, 1);
143 }
144
145 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
146 const fs_reg &src0, const fs_reg &src1)
147 {
148 const fs_reg src[2] = { src0, src1 };
149 init(opcode, exec_size, dst, src, 2);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
153 const fs_reg &src1)
154 {
155 const fs_reg src[2] = { src0, src1 };
156 init(opcode, 0, dst, src, 2);
157 }
158
159 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
160 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
161 {
162 const fs_reg src[3] = { src0, src1, src2 };
163 init(opcode, exec_size, dst, src, 3);
164 }
165
166 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
167 const fs_reg &src1, const fs_reg &src2)
168 {
169 const fs_reg src[3] = { src0, src1, src2 };
170 init(opcode, 0, dst, src, 3);
171 }
172
173 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
174 const fs_reg src[], unsigned sources)
175 {
176 init(opcode, 0, dst, src, sources);
177 }
178
179 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
180 const fs_reg src[], unsigned sources)
181 {
182 init(opcode, exec_width, dst, src, sources);
183 }
184
185 fs_inst::fs_inst(const fs_inst &that)
186 {
187 memcpy(this, &that, sizeof(that));
188
189 this->src = new fs_reg[MAX2(that.sources, 3)];
190
191 for (unsigned i = 0; i < that.sources; i++)
192 this->src[i] = that.src[i];
193 }
194
195 fs_inst::~fs_inst()
196 {
197 delete[] this->src;
198 }
199
200 void
201 fs_inst::resize_sources(uint8_t num_sources)
202 {
203 if (this->sources != num_sources) {
204 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
205
206 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
207 src[i] = this->src[i];
208
209 delete[] this->src;
210 this->src = src;
211 this->sources = num_sources;
212 }
213 }
214
215 #define ALU1(op) \
216 fs_inst * \
217 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
218 { \
219 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
220 }
221
222 #define ALU2(op) \
223 fs_inst * \
224 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
225 const fs_reg &src1) \
226 { \
227 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
228 }
229
230 #define ALU2_ACC(op) \
231 fs_inst * \
232 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
233 const fs_reg &src1) \
234 { \
235 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
236 inst->writes_accumulator = true; \
237 return inst; \
238 }
239
240 #define ALU3(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
243 const fs_reg &src1, const fs_reg &src2) \
244 { \
245 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
246 }
247
248 ALU1(NOT)
249 ALU1(MOV)
250 ALU1(FRC)
251 ALU1(RNDD)
252 ALU1(RNDE)
253 ALU1(RNDZ)
254 ALU2(ADD)
255 ALU2(MUL)
256 ALU2_ACC(MACH)
257 ALU2(AND)
258 ALU2(OR)
259 ALU2(XOR)
260 ALU2(SHL)
261 ALU2(SHR)
262 ALU2(ASR)
263 ALU3(LRP)
264 ALU1(BFREV)
265 ALU3(BFE)
266 ALU2(BFI1)
267 ALU3(BFI2)
268 ALU1(FBH)
269 ALU1(FBL)
270 ALU1(CBIT)
271 ALU3(MAD)
272 ALU2_ACC(ADDC)
273 ALU2_ACC(SUBB)
274 ALU2(SEL)
275 ALU2(MAC)
276
277 /** Gen4 predicated IF. */
278 fs_inst *
279 fs_visitor::IF(enum brw_predicate predicate)
280 {
281 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
282 inst->predicate = predicate;
283 return inst;
284 }
285
286 /** Gen6 IF with embedded comparison. */
287 fs_inst *
288 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
289 enum brw_conditional_mod condition)
290 {
291 assert(devinfo->gen == 6);
292 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
293 reg_null_d, src0, src1);
294 inst->conditional_mod = condition;
295 return inst;
296 }
297
298 /**
299 * CMP: Sets the low bit of the destination channels with the result
300 * of the comparison, while the upper bits are undefined, and updates
301 * the flag register with the packed 16 bits of the result.
302 */
303 fs_inst *
304 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
305 enum brw_conditional_mod condition)
306 {
307 fs_inst *inst;
308
309 /* Take the instruction:
310 *
311 * CMP null<d> src0<f> src1<f>
312 *
313 * Original gen4 does type conversion to the destination type before
314 * comparison, producing garbage results for floating point comparisons.
315 *
316 * The destination type doesn't matter on newer generations, so we set the
317 * type to match src0 so we can compact the instruction.
318 */
319 dst.type = src0.type;
320 if (dst.file == HW_REG)
321 dst.fixed_hw_reg.type = dst.type;
322
323 resolve_ud_negate(&src0);
324 resolve_ud_negate(&src1);
325
326 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
327 inst->conditional_mod = condition;
328
329 return inst;
330 }
331
332 fs_inst *
333 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
334 int header_size)
335 {
336 assert(dst.width % 8 == 0);
337 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
338 dst, src, sources);
339 inst->header_size = header_size;
340
341 for (int i = 0; i < header_size; i++)
342 assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
343 inst->regs_written = header_size;
344
345 for (int i = header_size; i < sources; ++i)
346 assert(src[i].file != GRF || src[i].width == dst.width);
347 inst->regs_written += (sources - header_size) * (dst.width / 8);
348
349 return inst;
350 }
351
352 exec_list
353 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
354 const fs_reg &surf_index,
355 const fs_reg &varying_offset,
356 uint32_t const_offset)
357 {
358 exec_list instructions;
359 fs_inst *inst;
360
361 /* We have our constant surface use a pitch of 4 bytes, so our index can
362 * be any component of a vector, and then we load 4 contiguous
363 * components starting from that.
364 *
365 * We break down the const_offset to a portion added to the variable
366 * offset and a portion done using reg_offset, which means that if you
367 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
368 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
369 * CSE can later notice that those loads are all the same and eliminate
370 * the redundant ones.
371 */
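   /* As a worked example: with const_offset = 6, the ADD below folds
    * (6 & ~3) = 4 into vec4_offset, and the MOV at the end picks component
    * (6 & 3) = 2 of the loaded vec4 (times the scale factor, which is 1 on
    * gen5 and later).
    */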
372 fs_reg vec4_offset = vgrf(glsl_type::int_type);
373 instructions.push_tail(ADD(vec4_offset,
374 varying_offset, fs_reg(const_offset & ~3)));
375
376 int scale = 1;
377 if (devinfo->gen == 4 && dst.width == 8) {
378 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
379 * u, v, r) as parameters, or we can just use the SIMD16 message
380 * consisting of (header, u). We choose the second, at the cost of a
381 * longer return length.
382 */
383 scale = 2;
384 }
385
386 enum opcode op;
387 if (devinfo->gen >= 7)
388 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
389 else
390 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
391
392 assert(dst.width % 8 == 0);
393 int regs_written = 4 * (dst.width / 8) * scale;
394 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
395 dst.type, dst.width);
396 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
397 inst->regs_written = regs_written;
398 instructions.push_tail(inst);
399
400 if (devinfo->gen < 7) {
401 inst->base_mrf = 13;
402 inst->header_size = 1;
403 if (devinfo->gen == 4)
404 inst->mlen = 3;
405 else
406 inst->mlen = 1 + dispatch_width / 8;
407 }
408
409 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
410 instructions.push_tail(MOV(dst, result));
411
412 return instructions;
413 }
414
415 /**
416 * A helper for MOV generation for fixing up broken hardware SEND dependency
417 * handling.
418 */
419 fs_inst *
420 fs_visitor::DEP_RESOLVE_MOV(int grf)
421 {
422 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
423
424 inst->ir = NULL;
425 inst->annotation = "send dependency resolve";
426
427    /* The caller always wants an uncompressed instruction, to emit minimal
428     * extra dependencies and to avoid having to deal with aligning its regs to 2.
429 */
430 inst->exec_size = 8;
431
432 return inst;
433 }
434
435 bool
436 fs_inst::equals(fs_inst *inst) const
437 {
438 return (opcode == inst->opcode &&
439 dst.equals(inst->dst) &&
440 src[0].equals(inst->src[0]) &&
441 src[1].equals(inst->src[1]) &&
442 src[2].equals(inst->src[2]) &&
443 saturate == inst->saturate &&
444 predicate == inst->predicate &&
445 conditional_mod == inst->conditional_mod &&
446 mlen == inst->mlen &&
447 base_mrf == inst->base_mrf &&
448 target == inst->target &&
449 eot == inst->eot &&
450 header_size == inst->header_size &&
451 shadow_compare == inst->shadow_compare &&
452 exec_size == inst->exec_size &&
453 offset == inst->offset);
454 }
455
456 bool
457 fs_inst::overwrites_reg(const fs_reg &reg) const
458 {
459 return reg.in_range(dst, regs_written);
460 }
461
462 bool
463 fs_inst::is_send_from_grf() const
464 {
465 switch (opcode) {
466 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
467 case SHADER_OPCODE_SHADER_TIME_ADD:
468 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
469 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
470 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
471 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
472 case SHADER_OPCODE_UNTYPED_ATOMIC:
473 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
474 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
475 case SHADER_OPCODE_TYPED_ATOMIC:
476 case SHADER_OPCODE_TYPED_SURFACE_READ:
477 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
478 case SHADER_OPCODE_URB_WRITE_SIMD8:
479 return true;
480 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
481 return src[1].file == GRF;
482 case FS_OPCODE_FB_WRITE:
483 return src[0].file == GRF;
484 default:
485 if (is_tex())
486 return src[0].file == GRF;
487
488 return false;
489 }
490 }
491
492 bool
493 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
494 {
495 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
496 return false;
497
498 fs_reg reg = this->src[0];
499 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
500 return false;
501
502 if (grf_alloc.sizes[reg.reg] != this->regs_written)
503 return false;
504
505 for (int i = 0; i < this->sources; i++) {
506 reg.type = this->src[i].type;
507 reg.width = this->src[i].width;
508 if (!this->src[i].equals(reg))
509 return false;
510 reg = ::offset(reg, 1);
511 }
512
513 return true;
514 }
515
516 bool
517 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
518 {
519 if (devinfo->gen == 6 && is_math())
520 return false;
521
522 if (is_send_from_grf())
523 return false;
524
525 if (!backend_instruction::can_do_source_mods())
526 return false;
527
528 return true;
529 }
530
531 bool
532 fs_inst::has_side_effects() const
533 {
534 return this->eot || backend_instruction::has_side_effects();
535 }
536
537 void
538 fs_reg::init()
539 {
540 memset(this, 0, sizeof(*this));
541 stride = 1;
542 }
543
544 /** Generic unset register constructor. */
545 fs_reg::fs_reg()
546 {
547 init();
548 this->file = BAD_FILE;
549 }
550
551 /** Immediate value constructor. */
552 fs_reg::fs_reg(float f)
553 {
554 init();
555 this->file = IMM;
556 this->type = BRW_REGISTER_TYPE_F;
557 this->fixed_hw_reg.dw1.f = f;
558 this->width = 1;
559 }
560
561 /** Immediate value constructor. */
562 fs_reg::fs_reg(int32_t i)
563 {
564 init();
565 this->file = IMM;
566 this->type = BRW_REGISTER_TYPE_D;
567 this->fixed_hw_reg.dw1.d = i;
568 this->width = 1;
569 }
570
571 /** Immediate value constructor. */
572 fs_reg::fs_reg(uint32_t u)
573 {
574 init();
575 this->file = IMM;
576 this->type = BRW_REGISTER_TYPE_UD;
577 this->fixed_hw_reg.dw1.ud = u;
578 this->width = 1;
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf[4])
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
588 }
589
590 /** Vector float immediate value constructor. */
591 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
592 {
593 init();
594 this->file = IMM;
595 this->type = BRW_REGISTER_TYPE_VF;
596 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
597 (vf1 << 8) |
598 (vf2 << 16) |
599 (vf3 << 24);
600 }
601
602 /** Fixed brw_reg. */
603 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
604 {
605 init();
606 this->file = HW_REG;
607 this->fixed_hw_reg = fixed_hw_reg;
608 this->type = fixed_hw_reg.type;
609 this->width = 1 << fixed_hw_reg.width;
610 }
611
612 bool
613 fs_reg::equals(const fs_reg &r) const
614 {
615 return (file == r.file &&
616 reg == r.reg &&
617 reg_offset == r.reg_offset &&
618 subreg_offset == r.subreg_offset &&
619 type == r.type &&
620 negate == r.negate &&
621 abs == r.abs &&
622 !reladdr && !r.reladdr &&
623 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
624 width == r.width &&
625 stride == r.stride);
626 }
627
628 fs_reg &
629 fs_reg::set_smear(unsigned subreg)
630 {
631 assert(file != HW_REG && file != IMM);
632 subreg_offset = subreg * type_sz(type);
633 stride = 0;
634 return *this;
635 }
636
637 bool
638 fs_reg::is_contiguous() const
639 {
640 return stride == 1;
641 }
642
643 int
644 fs_visitor::type_size(const struct glsl_type *type)
645 {
646 unsigned int size, i;
647
648 switch (type->base_type) {
649 case GLSL_TYPE_UINT:
650 case GLSL_TYPE_INT:
651 case GLSL_TYPE_FLOAT:
652 case GLSL_TYPE_BOOL:
653 return type->components();
654 case GLSL_TYPE_ARRAY:
655 return type_size(type->fields.array) * type->length;
656 case GLSL_TYPE_STRUCT:
657 size = 0;
658 for (i = 0; i < type->length; i++) {
659 size += type_size(type->fields.structure[i].type);
660 }
661 return size;
662 case GLSL_TYPE_SAMPLER:
663 /* Samplers take up no register space, since they're baked in at
664 * link time.
665 */
666 return 0;
667 case GLSL_TYPE_ATOMIC_UINT:
668 return 0;
669 case GLSL_TYPE_IMAGE:
670 case GLSL_TYPE_VOID:
671 case GLSL_TYPE_ERROR:
672 case GLSL_TYPE_INTERFACE:
673 case GLSL_TYPE_DOUBLE:
674 case GLSL_TYPE_FUNCTION:
675 unreachable("not reached");
676 }
677
678 return 0;
679 }
680
681 /**
682 * Create a MOV to read the timestamp register.
683 *
684 * The caller is responsible for emitting the MOV. The return value is
685 * the destination of the MOV, with extra parameters set.
686 */
687 fs_reg
688 fs_visitor::get_timestamp(fs_inst **out_mov)
689 {
690 assert(devinfo->gen >= 7);
691
692 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
693 BRW_ARF_TIMESTAMP,
694 0),
695 BRW_REGISTER_TYPE_UD));
696
697 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
698
699 fs_inst *mov = MOV(dst, ts);
700    /* We want to read the 3 fields we care about regardless of whether the
701     * channels are enabled in the dispatch.
702 */
703 mov->force_writemask_all = true;
704
705 /* The caller wants the low 32 bits of the timestamp. Since it's running
706     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
707 * which is plenty of time for our purposes. It is identical across the
708 * EUs, but since it's tracking GPU core speed it will increment at a
709 * varying rate as render P-states change.
710 *
711 * The caller could also check if render P-states have changed (or anything
712 * else that might disrupt timing) by setting smear to 2 and checking if
713 * that field is != 0.
714 */
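   /* Roughly: 2^32 cycles / ~1.2e9 cycles per second is about 3.6 seconds
    * between rollovers of the low 32 bits, assuming the clock stays near
    * 1.2 GHz.
    */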
715 dst.set_smear(0);
716
717 *out_mov = mov;
718 return dst;
719 }
720
721 void
722 fs_visitor::emit_shader_time_begin()
723 {
724 current_annotation = "shader time start";
725 fs_inst *mov;
726 shader_start_time = get_timestamp(&mov);
727 emit(mov);
728 }
729
730 void
731 fs_visitor::emit_shader_time_end()
732 {
733 current_annotation = "shader time end";
734
735 enum shader_time_shader_type type, written_type, reset_type;
736 switch (stage) {
737 case MESA_SHADER_VERTEX:
738 type = ST_VS;
739 written_type = ST_VS_WRITTEN;
740 reset_type = ST_VS_RESET;
741 break;
742 case MESA_SHADER_GEOMETRY:
743 type = ST_GS;
744 written_type = ST_GS_WRITTEN;
745 reset_type = ST_GS_RESET;
746 break;
747 case MESA_SHADER_FRAGMENT:
748 if (dispatch_width == 8) {
749 type = ST_FS8;
750 written_type = ST_FS8_WRITTEN;
751 reset_type = ST_FS8_RESET;
752 } else {
753 assert(dispatch_width == 16);
754 type = ST_FS16;
755 written_type = ST_FS16_WRITTEN;
756 reset_type = ST_FS16_RESET;
757 }
758 break;
759 case MESA_SHADER_COMPUTE:
760 type = ST_CS;
761 written_type = ST_CS_WRITTEN;
762 reset_type = ST_CS_RESET;
763 break;
764 default:
765 unreachable("fs_visitor::emit_shader_time_end missing code");
766 }
767
768 /* Insert our code just before the final SEND with EOT. */
769 exec_node *end = this->instructions.get_tail();
770 assert(end && ((fs_inst *) end)->eot);
771
772 fs_inst *tm_read;
773 fs_reg shader_end_time = get_timestamp(&tm_read);
774 end->insert_before(tm_read);
775
776 /* Check that there weren't any timestamp reset events (assuming these
777 * were the only two timestamp reads that happened).
778 */
779 fs_reg reset = shader_end_time;
780 reset.set_smear(2);
781 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
782 test->conditional_mod = BRW_CONDITIONAL_Z;
783 test->force_writemask_all = true;
784 end->insert_before(test);
785 end->insert_before(IF(BRW_PREDICATE_NORMAL));
786
787 fs_reg start = shader_start_time;
788 start.negate = true;
789 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
790 diff.set_smear(0);
791 fs_inst *add = ADD(diff, start, shader_end_time);
792 add->force_writemask_all = true;
793 end->insert_before(add);
794
795 /* If there were no instructions between the two timestamp gets, the diff
796     * is 2 cycles. Remove that overhead so it can be ignored when trying to
797     * determine the time taken by individual instructions.
798 */
799 add = ADD(diff, diff, fs_reg(-2u));
800 add->force_writemask_all = true;
801 end->insert_before(add);
802
803 end->insert_before(SHADER_TIME_ADD(type, diff));
804 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
805 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
806 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
807 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
808 }
809
810 fs_inst *
811 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
812 {
813 int shader_time_index =
814 brw_get_shader_time_index(brw, shader_prog, prog, type);
815 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
816
817 fs_reg payload;
818 if (dispatch_width == 8)
819 payload = vgrf(glsl_type::uvec2_type);
820 else
821 payload = vgrf(glsl_type::uint_type);
822
823 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
824 fs_reg(), payload, offset, value);
825 }
826
827 void
828 fs_visitor::vfail(const char *format, va_list va)
829 {
830 char *msg;
831
832 if (failed)
833 return;
834
835 failed = true;
836
837 msg = ralloc_vasprintf(mem_ctx, format, va);
838 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
839
840 this->fail_msg = msg;
841
842 if (debug_enabled) {
843 fprintf(stderr, "%s", msg);
844 }
845 }
846
847 void
848 fs_visitor::fail(const char *format, ...)
849 {
850 va_list va;
851
852 va_start(va, format);
853 vfail(format, va);
854 va_end(va);
855 }
856
857 /**
858 * Mark this program as impossible to compile in SIMD16 mode.
859 *
860 * During the SIMD8 compile (which happens first), we can detect and flag
861 * things that are unsupported in SIMD16 mode, so the compiler can skip
862 * the SIMD16 compile altogether.
863 *
864 * During a SIMD16 compile (if one happens anyway), this just calls fail().
865 */
866 void
867 fs_visitor::no16(const char *format, ...)
868 {
869 va_list va;
870
871 va_start(va, format);
872
873 if (dispatch_width == 16) {
874 vfail(format, va);
875 } else {
876 simd16_unsupported = true;
877
878 if (brw->perf_debug) {
879 if (no16_msg)
880 ralloc_vasprintf_append(&no16_msg, format, va);
881 else
882 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
883 }
884 }
885
886 va_end(va);
887 }
888
889 fs_inst *
890 fs_visitor::emit(enum opcode opcode)
891 {
892 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
893 }
894
895 fs_inst *
896 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
897 {
898 return emit(new(mem_ctx) fs_inst(opcode, dst));
899 }
900
901 fs_inst *
902 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
903 {
904 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
905 }
906
907 fs_inst *
908 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
909 const fs_reg &src1)
910 {
911 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
912 }
913
914 fs_inst *
915 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
916 const fs_reg &src1, const fs_reg &src2)
917 {
918 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
919 }
920
921 fs_inst *
922 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
923 fs_reg src[], int sources)
924 {
925 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
926 }
927
928 /**
929 * Returns true if the instruction has a flag that means it won't
930 * update an entire destination register.
931 *
932 * For example, dead code elimination and live variable analysis want to know
933 * when a write to a variable screens off any preceding values that were in
934 * it.
935 */
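/* For example, a SIMD8 float destination covers 8 * 4 = 32 bytes and is a
 * full write, while a SIMD8 word-sized (UW) destination covers only 16
 * bytes of the register and so counts as a partial write.
 */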
936 bool
937 fs_inst::is_partial_write() const
938 {
939 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
940 (this->dst.width * type_sz(this->dst.type)) < 32 ||
941 !this->dst.is_contiguous());
942 }
943
944 int
945 fs_inst::regs_read(int arg) const
946 {
947 if (is_tex() && arg == 0 && src[0].file == GRF) {
948 return mlen;
949 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
950 return mlen;
951 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
952 return mlen;
953 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
954 return mlen;
955 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
956 return mlen;
957 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
958 return mlen;
959 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
960 return mlen;
961 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
962 return mlen;
963 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
964 return mlen;
965 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
966 return mlen;
967 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
968 return exec_size / 4;
969 }
970
971 switch (src[arg].file) {
972 case BAD_FILE:
973 case UNIFORM:
974 case IMM:
975 return 1;
976 case GRF:
977 case HW_REG:
978 if (src[arg].stride == 0) {
979 return 1;
980 } else {
981 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
982 return (size + 31) / 32;
983 }
984 case MRF:
985 unreachable("MRF registers are not allowed as sources");
986 default:
987 unreachable("Invalid register file");
988 }
989 }
990
991 bool
992 fs_inst::reads_flag() const
993 {
994 return predicate;
995 }
996
997 bool
998 fs_inst::writes_flag() const
999 {
1000 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
1001 opcode != BRW_OPCODE_IF &&
1002 opcode != BRW_OPCODE_WHILE)) ||
1003 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
1004 }
1005
1006 /**
1007 * Returns how many MRFs an FS opcode will write over.
1008 *
1009  * Note that this is not just the 0 or 1 implied writes of an actual gen
1010  * instruction -- the FS opcodes often generate additional MOVs as well.
1011 */
1012 int
1013 fs_visitor::implied_mrf_writes(fs_inst *inst)
1014 {
1015 if (inst->mlen == 0)
1016 return 0;
1017
1018 if (inst->base_mrf == -1)
1019 return 0;
1020
1021 switch (inst->opcode) {
1022 case SHADER_OPCODE_RCP:
1023 case SHADER_OPCODE_RSQ:
1024 case SHADER_OPCODE_SQRT:
1025 case SHADER_OPCODE_EXP2:
1026 case SHADER_OPCODE_LOG2:
1027 case SHADER_OPCODE_SIN:
1028 case SHADER_OPCODE_COS:
1029 return 1 * dispatch_width / 8;
1030 case SHADER_OPCODE_POW:
1031 case SHADER_OPCODE_INT_QUOTIENT:
1032 case SHADER_OPCODE_INT_REMAINDER:
1033 return 2 * dispatch_width / 8;
1034 case SHADER_OPCODE_TEX:
1035 case FS_OPCODE_TXB:
1036 case SHADER_OPCODE_TXD:
1037 case SHADER_OPCODE_TXF:
1038 case SHADER_OPCODE_TXF_CMS:
1039 case SHADER_OPCODE_TXF_MCS:
1040 case SHADER_OPCODE_TG4:
1041 case SHADER_OPCODE_TG4_OFFSET:
1042 case SHADER_OPCODE_TXL:
1043 case SHADER_OPCODE_TXS:
1044 case SHADER_OPCODE_LOD:
1045 return 1;
1046 case FS_OPCODE_FB_WRITE:
1047 return 2;
1048 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1049 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1050 return 1;
1051 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1052 return inst->mlen;
1053 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1054 return 2;
1055 case SHADER_OPCODE_UNTYPED_ATOMIC:
1056 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1057 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1058 case SHADER_OPCODE_TYPED_ATOMIC:
1059 case SHADER_OPCODE_TYPED_SURFACE_READ:
1060 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1061 case SHADER_OPCODE_URB_WRITE_SIMD8:
1062 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1063 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1064 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1065 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1066 return 0;
1067 default:
1068 unreachable("not reached");
1069 }
1070 }
1071
1072 fs_reg
1073 fs_visitor::vgrf(const glsl_type *const type)
1074 {
1075 int reg_width = dispatch_width / 8;
1076 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1077 brw_type_for_base_type(type), dispatch_width);
1078 }
1079
1080 fs_reg
1081 fs_visitor::vgrf(int num_components)
1082 {
1083 int reg_width = dispatch_width / 8;
1084 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1085 BRW_REGISTER_TYPE_F, dispatch_width);
1086 }
1087
1088 /** Fixed HW reg constructor. */
1089 fs_reg::fs_reg(enum register_file file, int reg)
1090 {
1091 init();
1092 this->file = file;
1093 this->reg = reg;
1094 this->type = BRW_REGISTER_TYPE_F;
1095
1096 switch (file) {
1097 case UNIFORM:
1098 this->width = 1;
1099 break;
1100 default:
1101 this->width = 8;
1102 }
1103 }
1104
1105 /** Fixed HW reg constructor. */
1106 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1107 {
1108 init();
1109 this->file = file;
1110 this->reg = reg;
1111 this->type = type;
1112
1113 switch (file) {
1114 case UNIFORM:
1115 this->width = 1;
1116 break;
1117 default:
1118 this->width = 8;
1119 }
1120 }
1121
1122 /** Fixed HW reg constructor. */
1123 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1124 uint8_t width)
1125 {
1126 init();
1127 this->file = file;
1128 this->reg = reg;
1129 this->type = type;
1130 this->width = width;
1131 }
1132
1133 fs_reg *
1134 fs_visitor::variable_storage(ir_variable *var)
1135 {
1136 return (fs_reg *)hash_table_find(this->variable_ht, var);
1137 }
1138
1139 void
1140 import_uniforms_callback(const void *key,
1141 void *data,
1142 void *closure)
1143 {
1144 struct hash_table *dst_ht = (struct hash_table *)closure;
1145 const fs_reg *reg = (const fs_reg *)data;
1146
1147 if (reg->file != UNIFORM)
1148 return;
1149
1150 hash_table_insert(dst_ht, data, key);
1151 }
1152
1153 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1154  * This brings in those uniform definitions.
1155 */
1156 void
1157 fs_visitor::import_uniforms(fs_visitor *v)
1158 {
1159 hash_table_call_foreach(v->variable_ht,
1160 import_uniforms_callback,
1161 variable_ht);
1162 this->push_constant_loc = v->push_constant_loc;
1163 this->pull_constant_loc = v->pull_constant_loc;
1164 this->uniforms = v->uniforms;
1165 this->param_size = v->param_size;
1166 }
1167
1168 /* Our support for uniforms is piggy-backed on the struct
1169 * gl_fragment_program, because that's where the values actually
1170 * get stored, rather than in some global gl_shader_program uniform
1171 * store.
1172 */
1173 void
1174 fs_visitor::setup_uniform_values(ir_variable *ir)
1175 {
1176 int namelen = strlen(ir->name);
1177
1178 /* The data for our (non-builtin) uniforms is stored in a series of
1179 * gl_uniform_driver_storage structs for each subcomponent that
1180 * glGetUniformLocation() could name. We know it's been set up in the same
1181 * order we'd walk the type, so walk the list of storage and find anything
1182 * with our name, or the prefix of a component that starts with our name.
1183 */
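   /* For example, a struct uniform declared as "uniform Light light;" is
    * stored as separate entries such as "light.position" and "light.color";
    * both share the "light" prefix followed by '.', so each contributes its
    * component_slots() worth of param pointers here.
    */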
1184 unsigned params_before = uniforms;
1185 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1186 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1187
1188 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1189 (storage->name[namelen] != 0 &&
1190 storage->name[namelen] != '.' &&
1191 storage->name[namelen] != '[')) {
1192 continue;
1193 }
1194
1195 unsigned slots = storage->type->component_slots();
1196 if (storage->array_elements)
1197 slots *= storage->array_elements;
1198
1199 for (unsigned i = 0; i < slots; i++) {
1200 stage_prog_data->param[uniforms++] = &storage->storage[i];
1201 }
1202 }
1203
1204 /* Make sure we actually initialized the right amount of stuff here. */
1205 assert(params_before + ir->type->component_slots() == uniforms);
1206 (void)params_before;
1207 }
1208
1209
1210 /* Our support for builtin uniforms is even scarier than non-builtin.
1211 * It sits on top of the PROG_STATE_VAR parameters that are
1212 * automatically updated from GL context state.
1213 */
1214 void
1215 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1216 {
1217 const ir_state_slot *const slots = ir->get_state_slots();
1218 assert(slots != NULL);
1219
1220 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1221 /* This state reference has already been setup by ir_to_mesa, but we'll
1222 * get the same index back here.
1223 */
1224 int index = _mesa_add_state_reference(this->prog->Parameters,
1225 (gl_state_index *)slots[i].tokens);
1226
1227 /* Add each of the unique swizzles of the element as a parameter.
1228 * This'll end up matching the expected layout of the
1229 * array/matrix/structure we're trying to fill in.
1230 */
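      /* For example, a scalar state value is typically referenced with a
       * replicated swizzle like (X, X, X, X); the loop below adds the X
       * component once and then stops when it sees the swizzle repeat,
       * whereas a vec4 with swizzle (X, Y, Z, W) adds all four components.
       */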
1231 int last_swiz = -1;
1232 for (unsigned int j = 0; j < 4; j++) {
1233 int swiz = GET_SWZ(slots[i].swizzle, j);
1234 if (swiz == last_swiz)
1235 break;
1236 last_swiz = swiz;
1237
1238 stage_prog_data->param[uniforms++] =
1239 &prog->Parameters->ParameterValues[index][swiz];
1240 }
1241 }
1242 }
1243
1244 fs_reg *
1245 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1246 bool origin_upper_left)
1247 {
1248 assert(stage == MESA_SHADER_FRAGMENT);
1249 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1250 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1251 fs_reg wpos = *reg;
1252 bool flip = !origin_upper_left ^ key->render_to_fbo;
1253
1254 /* gl_FragCoord.x */
1255 if (pixel_center_integer) {
1256 emit(MOV(wpos, this->pixel_x));
1257 } else {
1258 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1259 }
1260 wpos = offset(wpos, 1);
1261
1262 /* gl_FragCoord.y */
1263 if (!flip && pixel_center_integer) {
1264 emit(MOV(wpos, this->pixel_y));
1265 } else {
1266 fs_reg pixel_y = this->pixel_y;
1267 float offset = (pixel_center_integer ? 0.0 : 0.5);
1268
1269 if (flip) {
1270 pixel_y.negate = true;
1271 offset += key->drawable_height - 1.0;
1272 }
1273
1274 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1275 }
1276 wpos = offset(wpos, 1);
1277
1278 /* gl_FragCoord.z */
1279 if (devinfo->gen >= 6) {
1280 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1281 } else {
1282 emit(FS_OPCODE_LINTERP, wpos,
1283 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1284 interp_reg(VARYING_SLOT_POS, 2));
1285 }
1286 wpos = offset(wpos, 1);
1287
1288 /* gl_FragCoord.w: Already set up in emit_interpolation */
1289 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1290
1291 return reg;
1292 }
1293
1294 fs_inst *
1295 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1296 glsl_interp_qualifier interpolation_mode,
1297 bool is_centroid, bool is_sample)
1298 {
1299 brw_wm_barycentric_interp_mode barycoord_mode;
1300 if (devinfo->gen >= 6) {
1301 if (is_centroid) {
1302 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1303 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1304 else
1305 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1306 } else if (is_sample) {
1307 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1308 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1309 else
1310 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1311 } else {
1312 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1313 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1314 else
1315 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1316 }
1317 } else {
1318 /* On Ironlake and below, there is only one interpolation mode.
1319 * Centroid interpolation doesn't mean anything on this hardware --
1320 * there is no multisampling.
1321 */
1322 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1323 }
1324 return emit(FS_OPCODE_LINTERP, attr,
1325 this->delta_xy[barycoord_mode], interp);
1326 }
1327
1328 void
1329 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1330 const glsl_type *type,
1331 glsl_interp_qualifier interpolation_mode,
1332 int location, bool mod_centroid,
1333 bool mod_sample)
1334 {
1335 attr.type = brw_type_for_base_type(type->get_scalar_type());
1336
1337 assert(stage == MESA_SHADER_FRAGMENT);
1338 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1339 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1340
1341 unsigned int array_elements;
1342
1343 if (type->is_array()) {
1344 array_elements = type->length;
1345 if (array_elements == 0) {
1346 fail("dereferenced array '%s' has length 0\n", name);
1347 }
1348 type = type->fields.array;
1349 } else {
1350 array_elements = 1;
1351 }
1352
1353 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1354 bool is_gl_Color =
1355 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1356 if (key->flat_shade && is_gl_Color) {
1357 interpolation_mode = INTERP_QUALIFIER_FLAT;
1358 } else {
1359 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1360 }
1361 }
1362
1363 for (unsigned int i = 0; i < array_elements; i++) {
1364 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1365 if (prog_data->urb_setup[location] == -1) {
1366 /* If there's no incoming setup data for this slot, don't
1367 * emit interpolation for it.
1368 */
1369 attr = offset(attr, type->vector_elements);
1370 location++;
1371 continue;
1372 }
1373
1374 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1375 /* Constant interpolation (flat shading) case. The SF has
1376 * handed us defined values in only the constant offset
1377 * field of the setup reg.
1378 */
1379 for (unsigned int k = 0; k < type->vector_elements; k++) {
1380 struct brw_reg interp = interp_reg(location, k);
1381 interp = suboffset(interp, 3);
1382 interp.type = attr.type;
1383 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1384 attr = offset(attr, 1);
1385 }
1386 } else {
1387 /* Smooth/noperspective interpolation case. */
1388 for (unsigned int k = 0; k < type->vector_elements; k++) {
1389 struct brw_reg interp = interp_reg(location, k);
1390 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1391 /* Get the pixel/sample mask into f0 so that we know
1392 * which pixels are lit. Then, for each channel that is
1393 * unlit, replace the centroid data with non-centroid
1394 * data.
1395 */
1396 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1397
1398 fs_inst *inst;
1399 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1400 false, false);
1401 inst->predicate = BRW_PREDICATE_NORMAL;
1402 inst->predicate_inverse = true;
1403 if (devinfo->has_pln)
1404 inst->no_dd_clear = true;
1405
1406 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1407 mod_centroid && !key->persample_shading,
1408 mod_sample || key->persample_shading);
1409 inst->predicate = BRW_PREDICATE_NORMAL;
1410 inst->predicate_inverse = false;
1411 if (devinfo->has_pln)
1412 inst->no_dd_check = true;
1413
1414 } else {
1415 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1416 mod_centroid && !key->persample_shading,
1417 mod_sample || key->persample_shading);
1418 }
1419 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1420 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1421 }
1422 attr = offset(attr, 1);
1423 }
1424
1425 }
1426 location++;
1427 }
1428 }
1429 }
1430
1431 fs_reg *
1432 fs_visitor::emit_frontfacing_interpolation()
1433 {
1434 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1435
1436 if (devinfo->gen >= 6) {
1437 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1438 * a boolean result from this (~0/true or 0/false).
1439 *
1440 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1441 * this task in only one instruction:
1442 * - a negation source modifier will flip the bit; and
1443 * - a W -> D type conversion will sign extend the bit into the high
1444 * word of the destination.
1445 *
1446 * An ASR 15 fills the low word of the destination.
1447 */
1448 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1449 g0.negate = true;
1450
1451 emit(ASR(*reg, g0, fs_reg(15)));
1452 } else {
1453 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1454 * a boolean result from this (1/true or 0/false).
1455 *
1456 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1457 * the negation source modifier to flip it. Unfortunately the SHR
1458 * instruction only operates on UD (or D with an abs source modifier)
1459 * sources without negation.
1460 *
1461 * Instead, use ASR (which will give ~0/true or 0/false).
1462 */
1463 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1464 g1_6.negate = true;
1465
1466 emit(ASR(*reg, g1_6, fs_reg(31)));
1467 }
1468
1469 return reg;
1470 }
1471
1472 void
1473 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1474 {
1475 assert(stage == MESA_SHADER_FRAGMENT);
1476 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1477 assert(dst.type == BRW_REGISTER_TYPE_F);
1478
1479 if (key->compute_pos_offset) {
1480 /* Convert int_sample_pos to floating point */
1481 emit(MOV(dst, int_sample_pos));
1482 /* Scale to the range [0, 1] */
1483 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1484 }
1485 else {
1486 /* From ARB_sample_shading specification:
1487 * "When rendering to a non-multisample buffer, or if multisample
1488 * rasterization is disabled, gl_SamplePosition will always be
1489        *  (0.5, 0.5)."
1490 */
1491 emit(MOV(dst, fs_reg(0.5f)));
1492 }
1493 }
1494
1495 fs_reg *
1496 fs_visitor::emit_samplepos_setup()
1497 {
1498 assert(devinfo->gen >= 6);
1499
1500 this->current_annotation = "compute sample position";
1501 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1502 fs_reg pos = *reg;
1503 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1504 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1505
1506 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1507 * mode will be enabled.
1508 *
1509 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1510 * R31.1:0 Position Offset X/Y for Slot[3:0]
1511 * R31.3:2 Position Offset X/Y for Slot[7:4]
1512 * .....
1513 *
1514 * The X, Y sample positions come in as bytes in thread payload. So, read
1515 * the positions using vstride=16, width=8, hstride=2.
1516 */
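   /* With that region, the eight X offsets for a SIMD8 half live at byte
    * offsets 0, 2, 4, ..., 14 of the payload register, and suboffset(..., 1)
    * below starts at byte 1 to pick up the interleaved Y offsets.
    */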
1517 struct brw_reg sample_pos_reg =
1518 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1519 BRW_REGISTER_TYPE_B), 16, 8, 2);
1520
1521 if (dispatch_width == 8) {
1522 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1523 } else {
1524 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1525 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1526 ->force_sechalf = true;
1527 }
1528 /* Compute gl_SamplePosition.x */
1529 compute_sample_position(pos, int_sample_x);
1530 pos = offset(pos, 1);
1531 if (dispatch_width == 8) {
1532 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1533 } else {
1534 emit(MOV(half(int_sample_y, 0),
1535 fs_reg(suboffset(sample_pos_reg, 1))));
1536 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1537 ->force_sechalf = true;
1538 }
1539 /* Compute gl_SamplePosition.y */
1540 compute_sample_position(pos, int_sample_y);
1541 return reg;
1542 }
1543
1544 fs_reg *
1545 fs_visitor::emit_sampleid_setup()
1546 {
1547 assert(stage == MESA_SHADER_FRAGMENT);
1548 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1549 assert(devinfo->gen >= 6);
1550
1551 this->current_annotation = "compute sample id";
1552 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1553
1554 if (key->compute_sample_id) {
1555 fs_reg t1 = vgrf(glsl_type::int_type);
1556 fs_reg t2 = vgrf(glsl_type::int_type);
1557 t2.type = BRW_REGISTER_TYPE_UW;
1558
1559 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1560 * 8x multisampling, subspan 0 will represent sample N (where N
1561 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1562 * 7. We can find the value of N by looking at R0.0 bits 7:6
1563 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1564 * (since samples are always delivered in pairs). That is, we
1565 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1566 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1567 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1568 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1569 * populating a temporary variable with the sequence (0, 1, 2, 3),
1570 * and then reading from it using vstride=1, width=4, hstride=0.
1571       * These computations also hold for 4x multisampling.
1572 *
1573 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1574 * the first four slots are sample 0 of subspan 0; the next four
1575 * are sample 1 of subspan 0; the third group is sample 0 of
1576 * subspan 1, and finally sample 1 of subspan 1.
1577 */
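      /* As a worked example: if R0.0 bits 7:6 read back as 0b10, then
       * (R0.0 & 0xc0) >> 5 = 0x80 >> 5 = 4, and adding the (0, 0, 0, 0,
       * 1, 1, 1, 1) sequence yields sample IDs 4, 4, 4, 4, 5, 5, 5, 5
       * across a SIMD8 dispatch.
       */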
1578 fs_inst *inst;
1579 inst = emit(BRW_OPCODE_AND, t1,
1580 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1581 fs_reg(0xc0));
1582 inst->force_writemask_all = true;
1583 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1584 inst->force_writemask_all = true;
1585 /* This works for both SIMD8 and SIMD16 */
1586 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1587 inst->force_writemask_all = true;
1588 /* This special instruction takes care of setting vstride=1,
1589 * width=4, hstride=0 of t2 during an ADD instruction.
1590 */
1591 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1592 } else {
1593 /* As per GL_ARB_sample_shading specification:
1594 * "When rendering to a non-multisample buffer, or if multisample
1595 * rasterization is disabled, gl_SampleID will always be zero."
1596 */
1597 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1598 }
1599
1600 return reg;
1601 }
1602
1603 void
1604 fs_visitor::resolve_source_modifiers(fs_reg *src)
1605 {
1606 if (!src->abs && !src->negate)
1607 return;
1608
1609 fs_reg temp = retype(vgrf(1), src->type);
1610 emit(MOV(temp, *src));
1611 *src = temp;
1612 }
1613
1614 fs_reg
1615 fs_visitor::fix_math_operand(fs_reg src)
1616 {
1617 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1618 * might be able to do better by doing execsize = 1 math and then
1619 * expanding that result out, but we would need to be careful with
1620 * masking.
1621 *
1622 * The hardware ignores source modifiers (negate and abs) on math
1623 * instructions, so we also move to a temp to set those up.
1624 */
1625 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1626 !src.abs && !src.negate)
1627 return src;
1628
1629 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1630 * operands to math
1631 */
1632 if (devinfo->gen >= 7 && src.file != IMM)
1633 return src;
1634
1635 fs_reg expanded = vgrf(glsl_type::float_type);
1636 expanded.type = src.type;
1637 emit(BRW_OPCODE_MOV, expanded, src);
1638 return expanded;
1639 }
1640
1641 fs_inst *
1642 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1643 {
1644 switch (opcode) {
1645 case SHADER_OPCODE_RCP:
1646 case SHADER_OPCODE_RSQ:
1647 case SHADER_OPCODE_SQRT:
1648 case SHADER_OPCODE_EXP2:
1649 case SHADER_OPCODE_LOG2:
1650 case SHADER_OPCODE_SIN:
1651 case SHADER_OPCODE_COS:
1652 break;
1653 default:
1654 unreachable("not reached: bad math opcode");
1655 }
1656
1657 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1658 * might be able to do better by doing execsize = 1 math and then
1659 * expanding that result out, but we would need to be careful with
1660 * masking.
1661 *
1662 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1663 * instructions, so we also move to a temp to set those up.
1664 */
1665 if (devinfo->gen == 6 || devinfo->gen == 7)
1666 src = fix_math_operand(src);
1667
1668 fs_inst *inst = emit(opcode, dst, src);
1669
1670 if (devinfo->gen < 6) {
1671 inst->base_mrf = 2;
1672 inst->mlen = dispatch_width / 8;
1673 }
1674
1675 return inst;
1676 }
1677
1678 fs_inst *
1679 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1680 {
1681 int base_mrf = 2;
1682 fs_inst *inst;
1683
1684 if (devinfo->gen >= 8) {
1685 inst = emit(opcode, dst, src0, src1);
1686 } else if (devinfo->gen >= 6) {
1687 src0 = fix_math_operand(src0);
1688 src1 = fix_math_operand(src1);
1689
1690 inst = emit(opcode, dst, src0, src1);
1691 } else {
1692 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1693 * "Message Payload":
1694 *
1695 * "Operand0[7]. For the INT DIV functions, this operand is the
1696 * denominator."
1697 * ...
1698 * "Operand1[7]. For the INT DIV functions, this operand is the
1699 * numerator."
1700 */
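      /* Concretely: for SHADER_OPCODE_INT_QUOTIENT computing src0 / src1,
       * the numerator src0 is copied into m(base_mrf + 1) as Operand1 and
       * the denominator src1 is passed as the instruction's direct source
       * (Operand0), matching the PRM text quoted above.
       */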
1701 bool is_int_div = opcode != SHADER_OPCODE_POW;
1702 fs_reg &op0 = is_int_div ? src1 : src0;
1703 fs_reg &op1 = is_int_div ? src0 : src1;
1704
1705 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1706 inst = emit(opcode, dst, op0, reg_null_f);
1707
1708 inst->base_mrf = base_mrf;
1709 inst->mlen = 2 * dispatch_width / 8;
1710 }
1711 return inst;
1712 }
1713
1714 void
1715 fs_visitor::emit_discard_jump()
1716 {
1717 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1718
1719 /* For performance, after a discard, jump to the end of the
1720 * shader if all relevant channels have been discarded.
1721 */
1722 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1723 discard_jump->flag_subreg = 1;
1724
1725 discard_jump->predicate = (dispatch_width == 8)
1726 ? BRW_PREDICATE_ALIGN1_ANY8H
1727 : BRW_PREDICATE_ALIGN1_ANY16H;
1728 discard_jump->predicate_inverse = true;
1729 }
1730
1731 void
1732 fs_visitor::assign_curb_setup()
1733 {
1734 if (dispatch_width == 8) {
1735 prog_data->dispatch_grf_start_reg = payload.num_regs;
1736 } else {
1737 if (stage == MESA_SHADER_FRAGMENT) {
1738 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1739 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1740 } else if (stage == MESA_SHADER_COMPUTE) {
1741 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1742 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1743 } else {
1744 unreachable("Unsupported shader type!");
1745 }
1746 }
1747
1748 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1749
1750 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1751 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1752 for (unsigned int i = 0; i < inst->sources; i++) {
1753 if (inst->src[i].file == UNIFORM) {
1754 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1755 int constant_nr;
1756 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1757 constant_nr = push_constant_loc[uniform_nr];
1758 } else {
1759 /* Section 5.11 of the OpenGL 4.1 spec says:
1760 * "Out-of-bounds reads return undefined values, which include
1761 * values from other variables of the active program or zero."
1762 * Just return the first push constant.
1763 */
1764 constant_nr = 0;
1765 }
1766
1767 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1768 constant_nr / 8,
1769 constant_nr % 8);
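            /* For example, constant_nr = 10 lands in the second payload
             * register (payload.num_regs + 10 / 8) at subregister 10 % 8 = 2.
             */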
1770
1771 inst->src[i].file = HW_REG;
1772 inst->src[i].fixed_hw_reg = byte_offset(
1773 retype(brw_reg, inst->src[i].type),
1774 inst->src[i].subreg_offset);
1775 }
1776 }
1777 }
1778 }
1779
1780 void
1781 fs_visitor::calculate_urb_setup()
1782 {
1783 assert(stage == MESA_SHADER_FRAGMENT);
1784 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1785 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1786
1787 memset(prog_data->urb_setup, -1,
1788 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1789
1790 int urb_next = 0;
1791 /* Figure out where each of the incoming setup attributes lands. */
1792 if (devinfo->gen >= 6) {
1793 if (_mesa_bitcount_64(prog->InputsRead &
1794 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1795 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1796 * first 16 varying inputs, so we can put them wherever we want.
1797 * Just put them in order.
1798 *
1799 * This is useful because it means that (a) inputs not used by the
1800 * fragment shader won't take up valuable register space, and (b) we
1801 * won't have to recompile the fragment shader if it gets paired with
1802 * a different vertex (or geometry) shader.
1803 */
1804 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1805 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1806 BITFIELD64_BIT(i)) {
1807 prog_data->urb_setup[i] = urb_next++;
1808 }
1809 }
1810 } else {
1811 /* We have enough input varyings that the SF/SBE pipeline stage can't
1812 * arbitrarily rearrange them to suit our whim; we have to put them
1813 * in an order that matches the output of the previous pipeline stage
1814 * (geometry or vertex shader).
1815 */
1816 struct brw_vue_map prev_stage_vue_map;
1817 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1818 key->input_slots_valid);
1819 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1820 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1821 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1822 slot++) {
1823 int varying = prev_stage_vue_map.slot_to_varying[slot];
1824 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1825 * unused.
1826 */
1827 if (varying != BRW_VARYING_SLOT_COUNT &&
1828 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1829 BITFIELD64_BIT(varying))) {
1830 prog_data->urb_setup[varying] = slot - first_slot;
1831 }
1832 }
1833 urb_next = prev_stage_vue_map.num_slots - first_slot;
1834 }
1835 } else {
1836 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1837 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1838 /* Point size is packed into the header, not as a general attribute */
1839 if (i == VARYING_SLOT_PSIZ)
1840 continue;
1841
1842 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1843 /* The back color slot is skipped when the front color is
1844 * also written to. In addition, some slots can be
1845 * written in the vertex shader and not read in the
1846 * fragment shader. So the register number must always be
1847 * incremented, mapped or not.
1848 */
1849 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1850 prog_data->urb_setup[i] = urb_next;
1851 urb_next++;
1852 }
1853 }
1854
1855 /*
1856    * It's an FS-only attribute, and we did interpolation for this attribute
1857    * in the SF thread. So, count it here, too.
1858 *
1859 * See compile_sf_prog() for more info.
1860 */
1861 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1862 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1863 }
1864
1865 prog_data->num_varying_inputs = urb_next;
1866 }
1867
1868 void
1869 fs_visitor::assign_urb_setup()
1870 {
1871 assert(stage == MESA_SHADER_FRAGMENT);
1872 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1873
1874 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1875
1876 /* Offset all the urb_setup[] indices by the actual position of the
1877 * setup regs, now that the location of the constants has been chosen.
1878 */
1879 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1880 if (inst->opcode == FS_OPCODE_LINTERP) {
1881 assert(inst->src[1].file == HW_REG);
1882 inst->src[1].fixed_hw_reg.nr += urb_start;
1883 }
1884
1885 if (inst->opcode == FS_OPCODE_CINTERP) {
1886 assert(inst->src[0].file == HW_REG);
1887 inst->src[0].fixed_hw_reg.nr += urb_start;
1888 }
1889 }
1890
1891 /* Each attribute is 4 setup channels, each of which is half a reg. */
1892 this->first_non_payload_grf =
1893 urb_start + prog_data->num_varying_inputs * 2;
1894 }
1895
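/**
* Map the VS's ATTR file registers onto the hardware GRFs where the vertex
* data actually arrives.
*
* Vertex attributes land in the thread payload as a contiguous block right
* after the push constants, four GRFs (one per component) per enabled
* attribute. This computes the resulting URB entry size and read length
* and rewrites every ATTR source to the HW_REG it corresponds to.
*/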
1896 void
1897 fs_visitor::assign_vs_urb_setup()
1898 {
1899 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1900 int grf, count, slot, channel, attr;
1901
1902 assert(stage == MESA_SHADER_VERTEX);
1903 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1904 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1905 count++;
1906
1907 /* Each attribute is 4 regs. */
1908 this->first_non_payload_grf =
1909 payload.num_regs + prog_data->curb_read_length + count * 4;
1910
1911 unsigned vue_entries =
1912 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1913
1914 /* URB entry size is counted in units of 64 bytes (for the 3DSTATE_URB_VS
1915 * command). Each attribute is 16 bytes (4 floats/dwords), so each unit
1916 * fits four attributes.
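*
* For example, with 6 occupied VUE slots this gives ALIGN(6, 4) / 4 == 2
* URB allocation units, i.e. a 128-byte entry.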
1917 */
1918 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1919 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1920
1921 assert(vs_prog_data->base.urb_read_length <= 15);
1922
1923 /* Rewrite all ATTR file references to the hw grf that they land in. */
1924 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1925 for (int i = 0; i < inst->sources; i++) {
1926 if (inst->src[i].file == ATTR) {
1927
1928 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1929 slot = count - 1;
1930 } else {
1931 /* Attributes come in in a contiguous block, ordered by their
1932 * gl_vert_attrib value. That means we can compute the slot
1933 * number for an attribute by masking out the enabled
1934 * attributes before it and counting the bits.
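*
* For example, if inputs_read has only bits 0, 3 and 5 set, attribute 5
* masks down to bits 0 and 3, so _mesa_bitcount_64() yields slot 2.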
1935 */
1936 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1937 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1938 BITFIELD64_MASK(attr));
1939 }
1940
1941 channel = inst->src[i].reg_offset & 3;
1942
1943 grf = payload.num_regs +
1944 prog_data->curb_read_length +
1945 slot * 4 + channel;
1946
1947 inst->src[i].file = HW_REG;
1948 inst->src[i].fixed_hw_reg =
1949 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1950 }
1951 }
1952 }
1953 }
1954
1955 /**
1956 * Split large virtual GRFs into separate components if we can.
1957 *
1958 * This is mostly duplicated with what brw_fs_vector_splitting does,
1959 * but that's really conservative because it's afraid of doing
1960 * splitting that doesn't result in real progress after the rest of
1961 * the optimization phases, which would cause infinite looping in
1962 * optimization. We can do it once here, safely. This also has the
1963 * opportunity to split interpolated values, or maybe even uniforms,
1964 * which we don't have at the IR level.
1965 *
1966 * We want to split, because virtual GRFs are what we register
1967 * allocate and spill (due to contiguousness requirements for some
1968 * instructions), and they're what we naturally generate in the
1969 * codegen process, but most virtual GRFs don't actually need to be
1970 * contiguous sets of GRFs. If we split, we'll end up with reduced
1971 * live intervals and better dead code elimination and coalescing.
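*
* As a sketch of the pass below: every slot of every used VGRF starts out
* as a potential split point, any slot that an instruction accesses
* together with the previous slot has its split point cleared, and the
* runs of slots that remain become separate VGRFs. So a 4-register VGRF
* that is only ever touched two registers at a time splits into two
* 2-register VGRFs, while one that a single SEND reads whole stays intact.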
1972 */
1973 void
1974 fs_visitor::split_virtual_grfs()
1975 {
1976 int num_vars = this->alloc.count;
1977
1978 /* Count the total number of registers */
1979 int reg_count = 0;
1980 int vgrf_to_reg[num_vars];
1981 for (int i = 0; i < num_vars; i++) {
1982 vgrf_to_reg[i] = reg_count;
1983 reg_count += alloc.sizes[i];
1984 }
1985
1986 /* An array of "split points". For each register slot, this indicates
1987 * if this slot can be separated from the previous slot. Every time an
1988 * instruction uses multiple elements of a register (as a source or
1989 * destination), we mark the used slots as inseparable. Then we go
1990 * through and split the registers into the smallest pieces we can.
1991 */
1992 bool split_points[reg_count];
1993 memset(split_points, 0, sizeof(split_points));
1994
1995 /* Mark all used registers as fully splittable */
1996 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1997 if (inst->dst.file == GRF) {
1998 int reg = vgrf_to_reg[inst->dst.reg];
1999 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
2000 split_points[reg + j] = true;
2001 }
2002
2003 for (int i = 0; i < inst->sources; i++) {
2004 if (inst->src[i].file == GRF) {
2005 int reg = vgrf_to_reg[inst->src[i].reg];
2006 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
2007 split_points[reg + j] = true;
2008 }
2009 }
2010 }
2011
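/* Then mark the slots that some instruction accesses together with the
* previous slot (multi-register reads and writes) as inseparable.
*/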
2012 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2013 if (inst->dst.file == GRF) {
2014 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2015 for (int j = 1; j < inst->regs_written; j++)
2016 split_points[reg + j] = false;
2017 }
2018 for (int i = 0; i < inst->sources; i++) {
2019 if (inst->src[i].file == GRF) {
2020 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2021 for (int j = 1; j < inst->regs_read(i); j++)
2022 split_points[reg + j] = false;
2023 }
2024 }
2025 }
2026
2027 int new_virtual_grf[reg_count];
2028 int new_reg_offset[reg_count];
2029
2030 int reg = 0;
2031 for (int i = 0; i < num_vars; i++) {
2032 /* The first one should always be 0 as a quick sanity check. */
2033 assert(split_points[reg] == false);
2034
2035 /* j = 0 case */
2036 new_reg_offset[reg] = 0;
2037 reg++;
2038 int offset = 1;
2039
2040 /* j > 0 case */
2041 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2042 /* If this is a split point, reset the offset to 0 and allocate a
2043 * new virtual GRF covering the previous `offset' registers.
2044 */
2045 if (split_points[reg]) {
2046 assert(offset <= MAX_VGRF_SIZE);
2047 int grf = alloc.allocate(offset);
2048 for (int k = reg - offset; k < reg; k++)
2049 new_virtual_grf[k] = grf;
2050 offset = 0;
2051 }
2052 new_reg_offset[reg] = offset;
2053 offset++;
2054 reg++;
2055 }
2056
2057 /* The last one gets the original register number */
2058 assert(offset <= MAX_VGRF_SIZE);
2059 alloc.sizes[i] = offset;
2060 for (int k = reg - offset; k < reg; k++)
2061 new_virtual_grf[k] = i;
2062 }
2063 assert(reg == reg_count);
2064
2065 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2066 if (inst->dst.file == GRF) {
2067 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2068 inst->dst.reg = new_virtual_grf[reg];
2069 inst->dst.reg_offset = new_reg_offset[reg];
2070 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2071 }
2072 for (int i = 0; i < inst->sources; i++) {
2073 if (inst->src[i].file == GRF) {
2074 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2075 inst->src[i].reg = new_virtual_grf[reg];
2076 inst->src[i].reg_offset = new_reg_offset[reg];
2077 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2078 }
2079 }
2080 }
2081 invalidate_live_intervals();
2082 }
2083
2084 /**
2085 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2086 *
2087 * During code generation, we create tons of temporary variables, many of
2088 * which get immediately killed and are never used again. Yet, in later
2089 * optimization and analysis passes, such as compute_live_intervals, we need
2090 * to loop over all the virtual GRFs. Compacting them can save a lot of
2091 * overhead.
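*
* For example, if alloc.count is 5 and VGRFs 1 and 3 are never referenced,
* the remap table ends up as {0, -1, 1, -1, 2} and alloc.count drops to 3.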
2092 */
2093 bool
2094 fs_visitor::compact_virtual_grfs()
2095 {
2096 bool progress = false;
2097 int remap_table[this->alloc.count];
2098 memset(remap_table, -1, sizeof(remap_table));
2099
2100 /* Mark which virtual GRFs are used. */
2101 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2102 if (inst->dst.file == GRF)
2103 remap_table[inst->dst.reg] = 0;
2104
2105 for (int i = 0; i < inst->sources; i++) {
2106 if (inst->src[i].file == GRF)
2107 remap_table[inst->src[i].reg] = 0;
2108 }
2109 }
2110
2111 /* Compact the GRF arrays. */
2112 int new_index = 0;
2113 for (unsigned i = 0; i < this->alloc.count; i++) {
2114 if (remap_table[i] == -1) {
2115 /* We just found an unused register. This means that we are
2116 * actually going to compact something.
2117 */
2118 progress = true;
2119 } else {
2120 remap_table[i] = new_index;
2121 alloc.sizes[new_index] = alloc.sizes[i];
2122 invalidate_live_intervals();
2123 ++new_index;
2124 }
2125 }
2126
2127 this->alloc.count = new_index;
2128
2129 /* Patch all the instructions to use the newly renumbered registers */
2130 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2131 if (inst->dst.file == GRF)
2132 inst->dst.reg = remap_table[inst->dst.reg];
2133
2134 for (int i = 0; i < inst->sources; i++) {
2135 if (inst->src[i].file == GRF)
2136 inst->src[i].reg = remap_table[inst->src[i].reg];
2137 }
2138 }
2139
2140 /* Patch all the references to delta_xy, since they're used in register
2141 * allocation. If they're unused, switch them to BAD_FILE so we don't
2142 * think some random VGRF is delta_xy.
2143 */
2144 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2145 if (delta_xy[i].file == GRF) {
2146 if (remap_table[delta_xy[i].reg] != -1) {
2147 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2148 } else {
2149 delta_xy[i].file = BAD_FILE;
2150 }
2151 }
2152 }
2153
2154 return progress;
2155 }
2156
2157 /*
2158 * Implements array access of uniforms by inserting a
2159 * PULL_CONSTANT_LOAD instruction.
2160 *
2161 * Unlike temporary GRF array access (which we don't support, due to
2162 * the difficulty of doing relative addressing on instruction
2163 * destinations), we could potentially do array access of uniforms
2164 * that were loaded in GRF space as push constants. In real-world
2165 * usage we've seen, though, the arrays being used are always larger
2166 * than we could load as push constants, so just always move all
2167 * uniform array access out to a pull constant buffer.
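*
* For example (illustrative GLSL, not taken from any particular app), a
* shader indexing "uniform vec4 palette[256]" with a non-constant index
* gets every element of palette copied into the pull constant buffer here;
* the reladdr access itself is rewritten later by demote_pull_constants().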
2168 */
2169 void
2170 fs_visitor::move_uniform_array_access_to_pull_constants()
2171 {
2172 if (dispatch_width != 8)
2173 return;
2174
2175 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2176 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2177
2178 /* Walk through and find array access of uniforms. Put a copy of that
2179 * uniform in the pull constant buffer.
2180 *
2181 * Note that we don't move constant-indexed accesses to arrays. No
2182 * testing has been done of the performance impact of this choice.
2183 */
2184 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2185 for (int i = 0 ; i < inst->sources; i++) {
2186 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2187 continue;
2188
2189 int uniform = inst->src[i].reg;
2190
2191 /* If this array isn't already present in the pull constant buffer,
2192 * add it.
2193 */
2194 if (pull_constant_loc[uniform] == -1) {
2195 const gl_constant_value **values = &stage_prog_data->param[uniform];
2196
2197 assert(param_size[uniform]);
2198
2199 for (int j = 0; j < param_size[uniform]; j++) {
2200 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2201
2202 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2203 values[j];
2204 }
2205 }
2206 }
2207 }
2208 }
2209
2210 /**
2211 * Assign UNIFORM file registers to either push constants or pull constants.
2212 *
2213 * We allow a fragment shader to have more than the specified minimum
2214 * maximum number of fragment shader uniform components (64). If there
2215 * are too many of these, they would fill up all of the register space.
2216 * So, this pass pushes some of them out to the pull constant buffer and
2217 * updates the program to load them from there.
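*
* For example, a shader with 200 live uniform components keeps the first
* 128 (16 registers) as push constants and demotes the remaining 72 to the
* pull constant buffer.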
2218 */
2219 void
2220 fs_visitor::assign_constant_locations()
2221 {
2222 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2223 if (dispatch_width != 8)
2224 return;
2225
2226 /* Find which UNIFORM registers are still in use. */
2227 bool is_live[uniforms];
2228 for (unsigned int i = 0; i < uniforms; i++) {
2229 is_live[i] = false;
2230 }
2231
2232 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2233 for (int i = 0; i < inst->sources; i++) {
2234 if (inst->src[i].file != UNIFORM)
2235 continue;
2236
2237 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2238 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2239 is_live[constant_nr] = true;
2240 }
2241 }
2242
2243 /* Only allow 16 registers (128 uniform components) as push constants.
2244 *
2245 * Just demote the end of the list. We could probably do better
2246 * here, demoting things that are rarely used in the program first.
2247 *
2248 * If changing this value, note the limitation about total_regs in
2249 * brw_curbe.c.
2250 */
2251 unsigned int max_push_components = 16 * 8;
2252 unsigned int num_push_constants = 0;
2253
2254 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2255
2256 for (unsigned int i = 0; i < uniforms; i++) {
2257 if (!is_live[i] || pull_constant_loc[i] != -1) {
2258 /* This UNIFORM register is either dead, or has already been demoted
2259 * to a pull const. Mark it as no longer living in the param[] array.
2260 */
2261 push_constant_loc[i] = -1;
2262 continue;
2263 }
2264
2265 if (num_push_constants < max_push_components) {
2266 /* Retain as a push constant. Record the location in the params[]
2267 * array.
2268 */
2269 push_constant_loc[i] = num_push_constants++;
2270 } else {
2271 /* Demote to a pull constant. */
2272 push_constant_loc[i] = -1;
2273
2274 int pull_index = stage_prog_data->nr_pull_params++;
2275 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2276 pull_constant_loc[i] = pull_index;
2277 }
2278 }
2279
2280 stage_prog_data->nr_params = num_push_constants;
2281
2282 /* Up until now, the param[] array has been indexed by reg + reg_offset
2283 * of UNIFORM registers. Condense it to only contain the uniforms we
2284 * chose to upload as push constants.
2285 */
2286 for (unsigned int i = 0; i < uniforms; i++) {
2287 int remapped = push_constant_loc[i];
2288
2289 if (remapped == -1)
2290 continue;
2291
2292 assert(remapped <= (int)i);
2293 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2294 }
2295 }
2296
2297 /**
2298 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2299 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
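*
* For a constant index, the load is a UNIFORM_PULL_CONSTANT_LOAD of the
* 16-byte-aligned block containing the value, with set_smear() picking out
* the right dword; indirect (reladdr) accesses go through
* VARYING_PULL_CONSTANT_LOAD instead.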
2300 */
2301 void
2302 fs_visitor::demote_pull_constants()
2303 {
2304 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2305 for (int i = 0; i < inst->sources; i++) {
2306 if (inst->src[i].file != UNIFORM)
2307 continue;
2308
2309 int pull_index;
2310 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2311 if (location >= uniforms) /* Out of bounds access */
2312 pull_index = -1;
2313 else
2314 pull_index = pull_constant_loc[location];
2315
2316 if (pull_index == -1)
2317 continue;
2318
2319 /* Set up the annotation tracking for new generated instructions. */
2320 base_ir = inst->ir;
2321 current_annotation = inst->annotation;
2322
2323 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2324 fs_reg dst = vgrf(glsl_type::float_type);
2325
2326 /* Generate a pull load into dst. */
2327 if (inst->src[i].reladdr) {
2328 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2329 surf_index,
2330 *inst->src[i].reladdr,
2331 pull_index);
2332 inst->insert_before(block, &list);
2333 inst->src[i].reladdr = NULL;
2334 } else {
2335 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2336 fs_inst *pull =
2337 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2338 dst, surf_index, offset);
2339 inst->insert_before(block, pull);
2340 inst->src[i].set_smear(pull_index & 3);
2341 }
2342
2343 /* Rewrite the instruction to use the temporary VGRF. */
2344 inst->src[i].file = GRF;
2345 inst->src[i].reg = dst.reg;
2346 inst->src[i].reg_offset = 0;
2347 inst->src[i].width = dispatch_width;
2348 }
2349 }
2350 invalidate_live_intervals();
2351 }
2352
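/**
* Apply simple local algebraic simplifications to individual instructions.
* A few of the patterns handled below, with illustrative operand names:
*
*    mul dst, a, 1.0f       -> mov dst, a
*    add dst, a, 0.0f       -> mov dst, a
*    lrp dst, t, a, a       -> mov dst, a
*    sel.sat.l dst, a, 2.0f -> mov.sat dst, a
*/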
2353 bool
2354 fs_visitor::opt_algebraic()
2355 {
2356 bool progress = false;
2357
2358 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2359 switch (inst->opcode) {
2360 case BRW_OPCODE_MOV:
2361 if (inst->src[0].file != IMM)
2362 break;
2363
2364 if (inst->saturate) {
2365 if (inst->dst.type != inst->src[0].type)
2366 assert(!"unimplemented: saturate mixed types");
2367
2368 if (brw_saturate_immediate(inst->dst.type,
2369 &inst->src[0].fixed_hw_reg)) {
2370 inst->saturate = false;
2371 progress = true;
2372 }
2373 }
2374 break;
2375
2376 case BRW_OPCODE_MUL:
2377 if (inst->src[1].file != IMM)
2378 continue;
2379
2380 /* a * 1.0 = a */
2381 if (inst->src[1].is_one()) {
2382 inst->opcode = BRW_OPCODE_MOV;
2383 inst->src[1] = reg_undef;
2384 progress = true;
2385 break;
2386 }
2387
2388 /* a * -1.0 = -a */
2389 if (inst->src[1].is_negative_one()) {
2390 inst->opcode = BRW_OPCODE_MOV;
2391 inst->src[0].negate = !inst->src[0].negate;
2392 inst->src[1] = reg_undef;
2393 progress = true;
2394 break;
2395 }
2396
2397 /* a * 0.0 = 0.0 */
2398 if (inst->src[1].is_zero()) {
2399 inst->opcode = BRW_OPCODE_MOV;
2400 inst->src[0] = inst->src[1];
2401 inst->src[1] = reg_undef;
2402 progress = true;
2403 break;
2404 }
2405
2406 if (inst->src[0].file == IMM) {
2407 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2408 inst->opcode = BRW_OPCODE_MOV;
2409 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2410 inst->src[1] = reg_undef;
2411 progress = true;
2412 break;
2413 }
2414 break;
2415 case BRW_OPCODE_ADD:
2416 if (inst->src[1].file != IMM)
2417 continue;
2418
2419 /* a + 0.0 = a */
2420 if (inst->src[1].is_zero()) {
2421 inst->opcode = BRW_OPCODE_MOV;
2422 inst->src[1] = reg_undef;
2423 progress = true;
2424 break;
2425 }
2426
2427 if (inst->src[0].file == IMM) {
2428 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2429 inst->opcode = BRW_OPCODE_MOV;
2430 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2431 inst->src[1] = reg_undef;
2432 progress = true;
2433 break;
2434 }
2435 break;
2436 case BRW_OPCODE_OR:
2437 if (inst->src[0].equals(inst->src[1])) {
2438 inst->opcode = BRW_OPCODE_MOV;
2439 inst->src[1] = reg_undef;
2440 progress = true;
2441 break;
2442 }
2443 break;
2444 case BRW_OPCODE_LRP:
2445 if (inst->src[1].equals(inst->src[2])) {
2446 inst->opcode = BRW_OPCODE_MOV;
2447 inst->src[0] = inst->src[1];
2448 inst->src[1] = reg_undef;
2449 inst->src[2] = reg_undef;
2450 progress = true;
2451 break;
2452 }
2453 break;
2454 case BRW_OPCODE_CMP:
2455 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2456 inst->src[0].abs &&
2457 inst->src[0].negate &&
2458 inst->src[1].is_zero()) {
2459 inst->src[0].abs = false;
2460 inst->src[0].negate = false;
2461 inst->conditional_mod = BRW_CONDITIONAL_Z;
2462 progress = true;
2463 break;
2464 }
2465 break;
2466 case BRW_OPCODE_SEL:
2467 if (inst->src[0].equals(inst->src[1])) {
2468 inst->opcode = BRW_OPCODE_MOV;
2469 inst->src[1] = reg_undef;
2470 inst->predicate = BRW_PREDICATE_NONE;
2471 inst->predicate_inverse = false;
2472 progress = true;
2473 } else if (inst->saturate && inst->src[1].file == IMM) {
2474 switch (inst->conditional_mod) {
2475 case BRW_CONDITIONAL_LE:
2476 case BRW_CONDITIONAL_L:
2477 switch (inst->src[1].type) {
2478 case BRW_REGISTER_TYPE_F:
2479 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2480 inst->opcode = BRW_OPCODE_MOV;
2481 inst->src[1] = reg_undef;
2482 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2483 progress = true;
2484 }
2485 break;
2486 default:
2487 break;
2488 }
2489 break;
2490 case BRW_CONDITIONAL_GE:
2491 case BRW_CONDITIONAL_G:
2492 switch (inst->src[1].type) {
2493 case BRW_REGISTER_TYPE_F:
2494 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2495 inst->opcode = BRW_OPCODE_MOV;
2496 inst->src[1] = reg_undef;
2497 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2498 progress = true;
2499 }
2500 break;
2501 default:
2502 break;
2503 }
2504 default:
2505 break;
2506 }
2507 }
2508 break;
2509 case BRW_OPCODE_MAD:
2510 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2511 inst->opcode = BRW_OPCODE_MOV;
2512 inst->src[1] = reg_undef;
2513 inst->src[2] = reg_undef;
2514 progress = true;
2515 } else if (inst->src[0].is_zero()) {
2516 inst->opcode = BRW_OPCODE_MUL;
2517 inst->src[0] = inst->src[2];
2518 inst->src[2] = reg_undef;
2519 progress = true;
2520 } else if (inst->src[1].is_one()) {
2521 inst->opcode = BRW_OPCODE_ADD;
2522 inst->src[1] = inst->src[2];
2523 inst->src[2] = reg_undef;
2524 progress = true;
2525 } else if (inst->src[2].is_one()) {
2526 inst->opcode = BRW_OPCODE_ADD;
2527 inst->src[2] = reg_undef;
2528 progress = true;
2529 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2530 inst->opcode = BRW_OPCODE_ADD;
2531 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2532 inst->src[2] = reg_undef;
2533 progress = true;
2534 }
2535 break;
2536 case SHADER_OPCODE_RCP: {
2537 fs_inst *prev = (fs_inst *)inst->prev;
2538 if (prev->opcode == SHADER_OPCODE_SQRT) {
2539 if (inst->src[0].equals(prev->dst)) {
2540 inst->opcode = SHADER_OPCODE_RSQ;
2541 inst->src[0] = prev->src[0];
2542 progress = true;
2543 }
2544 }
2545 break;
2546 }
2547 case SHADER_OPCODE_BROADCAST:
2548 if (is_uniform(inst->src[0])) {
2549 inst->opcode = BRW_OPCODE_MOV;
2550 inst->sources = 1;
2551 inst->force_writemask_all = true;
2552 progress = true;
2553 } else if (inst->src[1].file == IMM) {
2554 inst->opcode = BRW_OPCODE_MOV;
2555 inst->src[0] = component(inst->src[0],
2556 inst->src[1].fixed_hw_reg.dw1.ud);
2557 inst->sources = 1;
2558 inst->force_writemask_all = true;
2559 progress = true;
2560 }
2561 break;
2562
2563 default:
2564 break;
2565 }
2566
2567 /* Swap if src[0] is immediate. */
2568 if (progress && inst->is_commutative()) {
2569 if (inst->src[0].file == IMM) {
2570 fs_reg tmp = inst->src[1];
2571 inst->src[1] = inst->src[0];
2572 inst->src[0] = tmp;
2573 }
2574 }
2575 }
2576 return progress;
2577 }
2578
2579 /**
2580 * Optimize sample messages that have constant zero values for the trailing
2581 * texture coordinates. We can just reduce the message length for these
2582 * instructions instead of reserving a register for the zero value. Trailing
2583 * parameters that aren't sent default to zero anyway. This will cause the dead code
2584 * eliminator to remove the MOV instruction that would otherwise be emitted to
2585 * set up the zero value.
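*
* For example, a SIMD8 sample message whose last payload register was
* loaded with 0.0f (say, an unused LOD parameter) can have its mlen
* reduced by one; the now-unread MOV of the zero is then cleaned up by
* dead code elimination.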
2586 */
2587 bool
2588 fs_visitor::opt_zero_samples()
2589 {
2590 /* Gen4 infers the texturing opcode based on the message length so we can't
2591 * change it.
2592 */
2593 if (devinfo->gen < 5)
2594 return false;
2595
2596 bool progress = false;
2597
2598 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2599 if (!inst->is_tex())
2600 continue;
2601
2602 fs_inst *load_payload = (fs_inst *) inst->prev;
2603
2604 if (load_payload->is_head_sentinel() ||
2605 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2606 continue;
2607
2608 /* We don't want to remove the message header or the first parameter.
2609 * Removing the first parameter is not allowed, see the Haswell PRM
2610 * volume 7, page 149:
2611 *
2612 * "Parameter 0 is required except for the sampleinfo message, which
2613 * has no parameter 0"
2614 */
2615 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2616 load_payload->src[(inst->mlen - inst->header_size) /
2617 (dispatch_width / 8) +
2618 inst->header_size - 1].is_zero()) {
2619 inst->mlen -= dispatch_width / 8;
2620 progress = true;
2621 }
2622 }
2623
2624 if (progress)
2625 invalidate_live_intervals();
2626
2627 return progress;
2628 }
2629
2630 /**
2631 * Optimize sample messages which are followed by the final RT write.
2632 *
2633 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2634 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2635 * final texturing results copied to the framebuffer write payload and modify
2636 * them to write to the framebuffer directly.
2637 */
2638 bool
2639 fs_visitor::opt_sampler_eot()
2640 {
2641 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2642
2643 if (stage != MESA_SHADER_FRAGMENT)
2644 return false;
2645
2646 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2647 return false;
2648
2649 /* FINISHME: It should be possible to implement this optimization when there
2650 * are multiple drawbuffers.
2651 */
2652 if (key->nr_color_regions != 1)
2653 return false;
2654
2655 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2656 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2657 assert(fb_write->eot);
2658 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2659
2660 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2661
2662 /* There wasn't one; nothing to do. */
2663 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2664 return false;
2665
2666 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2667 * It's very likely to be the previous instruction.
2668 */
2669 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2670 if (load_payload->is_head_sentinel() ||
2671 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2672 return false;
2673
2674 assert(!tex_inst->eot); /* We can't get here twice */
2675 assert((tex_inst->offset & (0xff << 24)) == 0);
2676
2677 tex_inst->offset |= fb_write->target << 24;
2678 tex_inst->eot = true;
2679 tex_inst->dst = reg_null_ud;
2680 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2681
2682 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2683 * to create a new LOAD_PAYLOAD command with the same sources and a space
2684 * saved for the header. Using a new destination register not only makes sure
2685 * we have enough space, but it will make sure the dead code eliminator kills
2686 * the instruction that this will replace.
2687 */
2688 if (tex_inst->header_size != 0)
2689 return true;
2690
2691 fs_reg send_header = vgrf(load_payload->sources + 1);
2692 fs_reg *new_sources =
2693 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2694
2695 new_sources[0] = fs_reg();
2696 for (int i = 0; i < load_payload->sources; i++)
2697 new_sources[i+1] = load_payload->src[i];
2698
2699 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2700 * requires a lot of information about the sources to appropriately figure
2701 * out the number of registers that need to be used. Given this stage in our
2702 * optimization, we may not have the appropriate GRFs required by
2703 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2704 * manually emit the instruction.
2705 */
2706 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2707 load_payload->exec_size,
2708 send_header,
2709 new_sources,
2710 load_payload->sources + 1);
2711
2712 new_load_payload->regs_written = load_payload->regs_written + 1;
2713 new_load_payload->header_size = 1;
2714 tex_inst->mlen++;
2715 tex_inst->header_size = 1;
2716 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2717 tex_inst->src[0] = send_header;
2718
2719 return true;
2720 }
2721
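/**
* Rename VGRFs that are completely re-written outside of control flow.
*
* The first instruction that overwrites an entire VGRF keeps its register
* number, but each later complete overwrite (outside of any IF/DO block)
* gets a brand new VGRF. This splits one long live range into several
* shorter ones, which gives later passes and the register allocator more
* freedom.
*/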
2722 bool
2723 fs_visitor::opt_register_renaming()
2724 {
2725 bool progress = false;
2726 int depth = 0;
2727
2728 int remap[alloc.count];
2729 memset(remap, -1, sizeof(int) * alloc.count);
2730
2731 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2732 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2733 depth++;
2734 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2735 inst->opcode == BRW_OPCODE_WHILE) {
2736 depth--;
2737 }
2738
2739 /* Rewrite instruction sources. */
2740 for (int i = 0; i < inst->sources; i++) {
2741 if (inst->src[i].file == GRF &&
2742 remap[inst->src[i].reg] != -1 &&
2743 remap[inst->src[i].reg] != inst->src[i].reg) {
2744 inst->src[i].reg = remap[inst->src[i].reg];
2745 progress = true;
2746 }
2747 }
2748
2749 const int dst = inst->dst.reg;
2750
2751 if (depth == 0 &&
2752 inst->dst.file == GRF &&
2753 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2754 !inst->is_partial_write()) {
2755 if (remap[dst] == -1) {
2756 remap[dst] = dst;
2757 } else {
2758 remap[dst] = alloc.allocate(inst->dst.width / 8);
2759 inst->dst.reg = remap[dst];
2760 progress = true;
2761 }
2762 } else if (inst->dst.file == GRF &&
2763 remap[dst] != -1 &&
2764 remap[dst] != dst) {
2765 inst->dst.reg = remap[dst];
2766 progress = true;
2767 }
2768 }
2769
2770 if (progress) {
2771 invalidate_live_intervals();
2772
2773 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2774 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2775 delta_xy[i].reg = remap[delta_xy[i].reg];
2776 }
2777 }
2778 }
2779
2780 return progress;
2781 }
2782
2783 /**
2784 * Remove redundant or useless discard jumps.
2785 *
2786 * For example, we can eliminate jumps in the following sequence:
2787 *
2788 * discard-jump (redundant with the next jump)
2789 * discard-jump (useless; jumps to the next instruction)
2790 * placeholder-halt
2791 */
2792 bool
2793 fs_visitor::opt_redundant_discard_jumps()
2794 {
2795 bool progress = false;
2796
2797 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2798
2799 fs_inst *placeholder_halt = NULL;
2800 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2801 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2802 placeholder_halt = inst;
2803 break;
2804 }
2805 }
2806
2807 if (!placeholder_halt)
2808 return false;
2809
2810 /* Delete any HALTs immediately before the placeholder halt. */
2811 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2812 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2813 prev = (fs_inst *) placeholder_halt->prev) {
2814 prev->remove(last_bblock);
2815 progress = true;
2816 }
2817
2818 if (progress)
2819 invalidate_live_intervals();
2820
2821 return progress;
2822 }
2823
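/**
* Fold a MOV from a GRF into an MRF back into the instruction that
* computed the GRF, when it is safe to do so (Gen < 7 only, since later
* hardware has no MRFs). Roughly, with illustrative vgrf/m numbering:
*
*    add vgrf7, vgrf3, vgrf4
*    mov m2, vgrf7
*
* becomes
*
*    add m2, vgrf3, vgrf4
*/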
2824 bool
2825 fs_visitor::compute_to_mrf()
2826 {
2827 bool progress = false;
2828 int next_ip = 0;
2829
2830 /* No MRFs on Gen >= 7. */
2831 if (devinfo->gen >= 7)
2832 return false;
2833
2834 calculate_live_intervals();
2835
2836 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2837 int ip = next_ip;
2838 next_ip++;
2839
2840 if (inst->opcode != BRW_OPCODE_MOV ||
2841 inst->is_partial_write() ||
2842 inst->dst.file != MRF || inst->src[0].file != GRF ||
2843 inst->dst.type != inst->src[0].type ||
2844 inst->src[0].abs || inst->src[0].negate ||
2845 !inst->src[0].is_contiguous() ||
2846 inst->src[0].subreg_offset)
2847 continue;
2848
2849 /* Work out which hardware MRF registers are written by this
2850 * instruction.
2851 */
2852 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2853 int mrf_high;
2854 if (inst->dst.reg & BRW_MRF_COMPR4) {
2855 mrf_high = mrf_low + 4;
2856 } else if (inst->exec_size == 16) {
2857 mrf_high = mrf_low + 1;
2858 } else {
2859 mrf_high = mrf_low;
2860 }
2861
2862 /* Can't compute-to-MRF this GRF if someone else was going to
2863 * read it later.
2864 */
2865 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2866 continue;
2867
2868 /* Found a move of a GRF to an MRF. Let's see if we can rewrite
2869 * the instruction that produced this GRF to write into the MRF instead.
2870 */
2871 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2872 if (scan_inst->dst.file == GRF &&
2873 scan_inst->dst.reg == inst->src[0].reg) {
2874 /* Found the last thing to write our reg we want to turn
2875 * into a compute-to-MRF.
2876 */
2877
2878 /* If this one instruction didn't populate all the
2879 * channels, bail. We might be able to rewrite everything
2880 * that writes that reg, but it would require smarter
2881 * tracking to delay the rewriting until complete success.
2882 */
2883 if (scan_inst->is_partial_write())
2884 break;
2885
2886 /* Instructions writing more than one register would require us to
2887 * understand how to coalesce out more than one MOV at a time.
2888 */
2889 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2890 break;
2891
2892 /* SEND instructions can't have MRF as a destination. */
2893 if (scan_inst->mlen)
2894 break;
2895
2896 if (devinfo->gen == 6) {
2897 /* gen6 math instructions must have the destination be
2898 * GRF, so no compute-to-MRF for them.
2899 */
2900 if (scan_inst->is_math()) {
2901 break;
2902 }
2903 }
2904
2905 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2906 /* Found the creator of our MRF's source value. */
2907 scan_inst->dst.file = MRF;
2908 scan_inst->dst.reg = inst->dst.reg;
2909 scan_inst->saturate |= inst->saturate;
2910 inst->remove(block);
2911 progress = true;
2912 }
2913 break;
2914 }
2915
2916 /* We don't handle control flow here. Most computation of
2917 * values that end up in MRFs happens shortly before the MRF
2918 * write anyway.
2919 */
2920 if (block->start() == scan_inst)
2921 break;
2922
2923 /* You can't read from an MRF, so if someone else reads our
2924 * MRF's source GRF that we wanted to rewrite, that stops us.
2925 */
2926 bool interfered = false;
2927 for (int i = 0; i < scan_inst->sources; i++) {
2928 if (scan_inst->src[i].file == GRF &&
2929 scan_inst->src[i].reg == inst->src[0].reg &&
2930 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2931 interfered = true;
2932 }
2933 }
2934 if (interfered)
2935 break;
2936
2937 if (scan_inst->dst.file == MRF) {
2938 /* If somebody else writes our MRF here, we can't
2939 * compute-to-MRF before that.
2940 */
2941 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2942 int scan_mrf_high;
2943
2944 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2945 scan_mrf_high = scan_mrf_low + 4;
2946 } else if (scan_inst->exec_size == 16) {
2947 scan_mrf_high = scan_mrf_low + 1;
2948 } else {
2949 scan_mrf_high = scan_mrf_low;
2950 }
2951
2952 if (mrf_low == scan_mrf_low ||
2953 mrf_low == scan_mrf_high ||
2954 mrf_high == scan_mrf_low ||
2955 mrf_high == scan_mrf_high) {
2956 break;
2957 }
2958 }
2959
2960 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2961 /* Found a SEND instruction, which means that there are
2962 * live values in MRFs from base_mrf to base_mrf +
2963 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2964 * above it.
2965 */
2966 if (mrf_low >= scan_inst->base_mrf &&
2967 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2968 break;
2969 }
2970 if (mrf_high >= scan_inst->base_mrf &&
2971 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2972 break;
2973 }
2974 }
2975 }
2976 }
2977
2978 if (progress)
2979 invalidate_live_intervals();
2980
2981 return progress;
2982 }
2983
2984 /**
2985 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2986 * flow. We could probably do better here with some form of divergence
2987 * analysis.
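*
* Outside of control flow (and before any DISCARD_JUMP) every channel is
* still live, so channel 0 is trivially a live channel and the instruction
* can simply be replaced with "mov dst, 0".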
2988 */
2989 bool
2990 fs_visitor::eliminate_find_live_channel()
2991 {
2992 bool progress = false;
2993 unsigned depth = 0;
2994
2995 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2996 switch (inst->opcode) {
2997 case BRW_OPCODE_IF:
2998 case BRW_OPCODE_DO:
2999 depth++;
3000 break;
3001
3002 case BRW_OPCODE_ENDIF:
3003 case BRW_OPCODE_WHILE:
3004 depth--;
3005 break;
3006
3007 case FS_OPCODE_DISCARD_JUMP:
3008 /* This can potentially make control flow non-uniform until the end
3009 * of the program.
3010 */
3011 return progress;
3012
3013 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
3014 if (depth == 0) {
3015 inst->opcode = BRW_OPCODE_MOV;
3016 inst->src[0] = fs_reg(0);
3017 inst->sources = 1;
3018 inst->force_writemask_all = true;
3019 progress = true;
3020 }
3021 break;
3022
3023 default:
3024 break;
3025 }
3026 }
3027
3028 return progress;
3029 }
3030
3031 /**
3032 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
3033 * instructions to FS_OPCODE_REP_FB_WRITE.
3034 */
3035 void
3036 fs_visitor::emit_repclear_shader()
3037 {
3038 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3039 int base_mrf = 1;
3040 int color_mrf = base_mrf + 2;
3041 fs_inst *mov;
3042
3043 if (uniforms == 1) {
3044 mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
3045 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
3046 } else {
3047 struct brw_reg reg =
3048 brw_reg(BRW_GENERAL_REGISTER_FILE,
3049 2, 3, 0, 0, BRW_REGISTER_TYPE_F,
3050 BRW_VERTICAL_STRIDE_8,
3051 BRW_WIDTH_2,
3052 BRW_HORIZONTAL_STRIDE_4, BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
3053
3054 mov = emit(MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg)));
3055 }
3056
3057 mov->force_writemask_all = true;
3058
3059 fs_inst *write;
3060 if (key->nr_color_regions == 1) {
3061 write = emit(FS_OPCODE_REP_FB_WRITE);
3062 write->saturate = key->clamp_fragment_color;
3063 write->base_mrf = color_mrf;
3064 write->target = 0;
3065 write->header_size = 0;
3066 write->mlen = 1;
3067 } else {
3068 assume(key->nr_color_regions > 0);
3069 for (int i = 0; i < key->nr_color_regions; ++i) {
3070 write = emit(FS_OPCODE_REP_FB_WRITE);
3071 write->saturate = key->clamp_fragment_color;
3072 write->base_mrf = base_mrf;
3073 write->target = i;
3074 write->header_size = 2;
3075 write->mlen = 3;
3076 }
3077 }
3078 write->eot = true;
3079
3080 calculate_cfg();
3081
3082 assign_constant_locations();
3083 assign_curb_setup();
3084
3085 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
3086 if (uniforms == 1) {
3087 assert(mov->src[0].file == HW_REG);
3088 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
3089 }
3090 }
3091
3092 /**
3093 * Walks through basic blocks, looking for repeated MRF writes and
3094 * removing the later ones.
3095 */
3096 bool
3097 fs_visitor::remove_duplicate_mrf_writes()
3098 {
3099 fs_inst *last_mrf_move[16];
3100 bool progress = false;
3101
3102 /* Need to update the MRF tracking for compressed instructions. */
3103 if (dispatch_width == 16)
3104 return false;
3105
3106 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3107
3108 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3109 if (inst->is_control_flow()) {
3110 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3111 }
3112
3113 if (inst->opcode == BRW_OPCODE_MOV &&
3114 inst->dst.file == MRF) {
3115 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3116 if (prev_inst && inst->equals(prev_inst)) {
3117 inst->remove(block);
3118 progress = true;
3119 continue;
3120 }
3121 }
3122
3123 /* Clear out the last-write records for MRFs that were overwritten. */
3124 if (inst->dst.file == MRF) {
3125 last_mrf_move[inst->dst.reg] = NULL;
3126 }
3127
3128 if (inst->mlen > 0 && inst->base_mrf != -1) {
3129 /* Found a SEND instruction, which will include two or fewer
3130 * implied MRF writes. We could do better here.
3131 */
3132 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3133 last_mrf_move[inst->base_mrf + i] = NULL;
3134 }
3135 }
3136
3137 /* Clear out any MRF move records whose sources got overwritten. */
3138 if (inst->dst.file == GRF) {
3139 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3140 if (last_mrf_move[i] &&
3141 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3142 last_mrf_move[i] = NULL;
3143 }
3144 }
3145 }
3146
3147 if (inst->opcode == BRW_OPCODE_MOV &&
3148 inst->dst.file == MRF &&
3149 inst->src[0].file == GRF &&
3150 !inst->is_partial_write()) {
3151 last_mrf_move[inst->dst.reg] = inst;
3152 }
3153 }
3154
3155 if (progress)
3156 invalidate_live_intervals();
3157
3158 return progress;
3159 }
3160
3161 static void
3162 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3163 {
3164 /* Clear the flag for registers that actually got read (as expected). */
3165 for (int i = 0; i < inst->sources; i++) {
3166 int grf;
3167 if (inst->src[i].file == GRF) {
3168 grf = inst->src[i].reg;
3169 } else if (inst->src[i].file == HW_REG &&
3170 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3171 grf = inst->src[i].fixed_hw_reg.nr;
3172 } else {
3173 continue;
3174 }
3175
3176 if (grf >= first_grf &&
3177 grf < first_grf + grf_len) {
3178 deps[grf - first_grf] = false;
3179 if (inst->exec_size == 16)
3180 deps[grf - first_grf + 1] = false;
3181 }
3182 }
3183 }
3184
3185 /**
3186 * Implements this workaround for the original 965:
3187 *
3188 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3189 * check for post destination dependencies on this instruction, software
3190 * must ensure that there is no destination hazard for the case of ‘write
3191 * followed by a posted write’ shown in the following example.
3192 *
3193 * 1. mov r3 0
3194 * 2. send r3.xy <rest of send instruction>
3195 * 3. mov r2 r3
3196 *
3197 * Due to no post-destination dependency check on the ‘send’, the above
3198 * code sequence could have two instructions (1 and 2) in flight at the
3199 * same time that both consider ‘r3’ as the target of their final writes.
3200 */
3201 void
3202 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3203 fs_inst *inst)
3204 {
3205 int write_len = inst->regs_written;
3206 int first_write_grf = inst->dst.reg;
3207 bool needs_dep[BRW_MAX_MRF];
3208 assert(write_len < (int)sizeof(needs_dep) - 1);
3209
3210 memset(needs_dep, false, sizeof(needs_dep));
3211 memset(needs_dep, true, write_len);
3212
3213 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3214
3215 /* Walk backwards looking for writes to registers we're writing which
3216 * aren't read since being written. If we hit the start of the program,
3217 * we assume that there are no outstanding dependencies on entry to the
3218 * program.
3219 */
3220 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3221 /* If we hit control flow, assume that there *are* outstanding
3222 * dependencies, and force their cleanup before our instruction.
3223 */
3224 if (block->start() == scan_inst) {
3225 for (int i = 0; i < write_len; i++) {
3226 if (needs_dep[i]) {
3227 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3228 }
3229 }
3230 return;
3231 }
3232
3233 /* We insert our reads as late as possible on the assumption that any
3234 * instruction other than a MOV that might have left us an outstanding
3235 * dependency has more latency than a MOV.
3236 */
3237 if (scan_inst->dst.file == GRF) {
3238 for (int i = 0; i < scan_inst->regs_written; i++) {
3239 int reg = scan_inst->dst.reg + i;
3240
3241 if (reg >= first_write_grf &&
3242 reg < first_write_grf + write_len &&
3243 needs_dep[reg - first_write_grf]) {
3244 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3245 needs_dep[reg - first_write_grf] = false;
3246 if (scan_inst->exec_size == 16)
3247 needs_dep[reg - first_write_grf + 1] = false;
3248 }
3249 }
3250 }
3251
3252 /* Clear the flag for registers that actually got read (as expected). */
3253 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3254
3255 /* Continue the loop only if we haven't resolved all the dependencies */
3256 int i;
3257 for (i = 0; i < write_len; i++) {
3258 if (needs_dep[i])
3259 break;
3260 }
3261 if (i == write_len)
3262 return;
3263 }
3264 }
3265
3266 /**
3267 * Implements this workaround for the original 965:
3268 *
3269 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3270 * used as a destination register until after it has been sourced by an
3271 * instruction with a different destination register.
3272 */
3273 void
3274 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3275 {
3276 int write_len = inst->regs_written;
3277 int first_write_grf = inst->dst.reg;
3278 bool needs_dep[BRW_MAX_MRF];
3279 assert(write_len < (int)sizeof(needs_dep) - 1);
3280
3281 memset(needs_dep, false, sizeof(needs_dep));
3282 memset(needs_dep, true, write_len);
3283 /* Walk forwards looking for writes to registers we're writing which aren't
3284 * read before being written.
3285 */
3286 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3287 /* If we hit control flow, force resolve all remaining dependencies. */
3288 if (block->end() == scan_inst) {
3289 for (int i = 0; i < write_len; i++) {
3290 if (needs_dep[i])
3291 scan_inst->insert_before(block,
3292 DEP_RESOLVE_MOV(first_write_grf + i));
3293 }
3294 return;
3295 }
3296
3297 /* Clear the flag for registers that actually got read (as expected). */
3298 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3299
3300 /* We insert our reads as late as possible since they're reading the
3301 * result of a SEND, which has massive latency.
3302 */
3303 if (scan_inst->dst.file == GRF &&
3304 scan_inst->dst.reg >= first_write_grf &&
3305 scan_inst->dst.reg < first_write_grf + write_len &&
3306 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3307 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3308 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3309 }
3310
3311 /* Continue the loop only if we haven't resolved all the dependencies */
3312 int i;
3313 for (i = 0; i < write_len; i++) {
3314 if (needs_dep[i])
3315 break;
3316 }
3317 if (i == write_len)
3318 return;
3319 }
3320 }
3321
3322 void
3323 fs_visitor::insert_gen4_send_dependency_workarounds()
3324 {
3325 if (devinfo->gen != 4 || devinfo->is_g4x)
3326 return;
3327
3328 bool progress = false;
3329
3330 /* Note that we're done with register allocation, so GRF fs_regs always
3331 * have a .reg_offset of 0.
3332 */
3333
3334 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3335 if (inst->mlen != 0 && inst->dst.file == GRF) {
3336 insert_gen4_pre_send_dependency_workarounds(block, inst);
3337 insert_gen4_post_send_dependency_workarounds(block, inst);
3338 progress = true;
3339 }
3340 }
3341
3342 if (progress)
3343 invalidate_live_intervals();
3344 }
3345
3346 /**
3347 * Turns the generic expression-style uniform pull constant load instruction
3348 * into a hardware-specific series of instructions for loading a pull
3349 * constant.
3350 *
3351 * The expression style allows the CSE pass before this to optimize out
3352 * repeated loads from the same offset, and gives the pre-register-allocation
3353 * scheduling full flexibility, while the conversion to native instructions
3354 * allows the post-register-allocation scheduler the best information
3355 * possible.
3356 *
3357 * Note that execution masking for setting up pull constant loads is special:
3358 * the channels that need to be written are unrelated to the current execution
3359 * mask, since a later instruction will use one of the result channels as a
3360 * source operand for all 8 or 16 of its channels.
3361 */
3362 void
3363 fs_visitor::lower_uniform_pull_constant_loads()
3364 {
3365 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3366 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3367 continue;
3368
3369 if (devinfo->gen >= 7) {
3370 /* The offset arg before was a vec4-aligned byte offset. We need to
3371 * turn it into a dword offset.
3372 */
3373 fs_reg const_offset_reg = inst->src[1];
3374 assert(const_offset_reg.file == IMM &&
3375 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3376 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3377 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3378
3379 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3380 * Reserve space for the register.
3381 */
3382 if (devinfo->gen >= 9) {
3383 payload.reg_offset++;
3384 alloc.sizes[payload.reg] = 2;
3385 }
3386
3387 /* This is actually going to be a MOV, but since only the first dword
3388 * is accessed, we have a special opcode to do just that one. Note
3389 * that this needs to be an operation that will be considered a def
3390 * by live variable analysis, or register allocation will explode.
3391 */
3392 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3393 8, payload, const_offset_reg);
3394 setup->force_writemask_all = true;
3395
3396 setup->ir = inst->ir;
3397 setup->annotation = inst->annotation;
3398 inst->insert_before(block, setup);
3399
3400 /* Similarly, this will only populate the first 4 channels of the
3401 * result register (since we only use smear values from 0-3), but we
3402 * don't tell the optimizer.
3403 */
3404 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3405 inst->src[1] = payload;
3406
3407 invalidate_live_intervals();
3408 } else {
3409 /* Before register allocation, we didn't tell the scheduler about the
3410 * MRF we use. We know it's safe to use this MRF because nothing
3411 * else does except for register spill/unspill, which generates and
3412 * uses its MRF within a single IR instruction.
3413 */
3414 inst->base_mrf = 14;
3415 inst->mlen = 1;
3416 }
3417 }
3418 }
3419
3420 bool
3421 fs_visitor::lower_load_payload()
3422 {
3423 bool progress = false;
3424
3425 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3426 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3427 continue;
3428
3429 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3430 assert(inst->saturate == false);
3431
3432 fs_reg dst = inst->dst;
3433
3434 /* Get rid of COMPR4. We'll add it back in if we need it */
3435 if (dst.file == MRF)
3436 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
3437
3438 dst.width = 8;
3439 for (uint8_t i = 0; i < inst->header_size; i++) {
3440 if (inst->src[i].file != BAD_FILE) {
3441 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3442 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3443 mov_src.width = 8;
3444 fs_inst *mov = MOV(mov_dst, mov_src);
3445 mov->force_writemask_all = true;
3446 inst->insert_before(block, mov);
3447 }
3448 dst = offset(dst, 1);
3449 }
3450
3451 dst.width = inst->exec_size;
3452 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
3453 inst->exec_size > 8) {
3454 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3455 * a straightforward copy. Instead, the result of the
3456 * LOAD_PAYLOAD is treated as interleaved and the first four
3457 * non-header sources are unpacked as:
3458 *
3459 * m + 0: r0
3460 * m + 1: g0
3461 * m + 2: b0
3462 * m + 3: a0
3463 * m + 4: r1
3464 * m + 5: g1
3465 * m + 6: b1
3466 * m + 7: a1
3467 *
3468 * This is used for gen <= 5 fb writes.
3469 */
3470 assert(inst->exec_size == 16);
3471 assert(inst->header_size + 4 <= inst->sources);
3472 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3473 if (inst->src[i].file != BAD_FILE) {
3474 if (devinfo->has_compr4) {
3475 fs_reg compr4_dst = retype(dst, inst->src[i].type);
3476 compr4_dst.reg |= BRW_MRF_COMPR4;
3477
3478 fs_inst *mov = MOV(compr4_dst, inst->src[i]);
3479 mov->force_writemask_all = inst->force_writemask_all;
3480 inst->insert_before(block, mov);
3481 } else {
3482 /* Platform doesn't have COMPR4. We have to fake it */
3483 fs_reg mov_dst = retype(dst, inst->src[i].type);
3484 mov_dst.width = 8;
3485
3486 fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0));
3487 mov->force_writemask_all = inst->force_writemask_all;
3488 inst->insert_before(block, mov);
3489
3490 mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1));
3491 mov->force_writemask_all = inst->force_writemask_all;
3492 mov->force_sechalf = true;
3493 inst->insert_before(block, mov);
3494 }
3495 }
3496
3497 dst.reg++;
3498 }
3499
3500 /* The loop above only ever incremented us through the first set
3501 * of 4 registers. However, thanks to the magic of COMPR4, we
3502 * actually wrote to the first 8 registers, so we need to take
3503 * that into account now.
3504 */
3505 dst.reg += 4;
3506
3507 /* The COMPR4 code took care of the first 4 sources. We'll let
3508 * the regular path handle any remaining sources. Yes, we are
3509 * modifying the instruction but we're about to delete it so
3510 * this really doesn't hurt anything.
3511 */
3512 inst->header_size += 4;
3513 }
3514
3515 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3516 if (inst->src[i].file != BAD_FILE) {
3517 fs_inst *mov = MOV(retype(dst, inst->src[i].type),
3518 inst->src[i]);
3519 mov->force_writemask_all = inst->force_writemask_all;
3520 inst->insert_before(block, mov);
3521 }
3522 dst = offset(dst, 1);
3523 }
3524
3525 inst->remove(block);
3526 progress = true;
3527 }
3528
3529 if (progress)
3530 invalidate_live_intervals();
3531
3532 return progress;
3533 }
3534
3535 void
3536 fs_visitor::dump_instructions()
3537 {
3538 dump_instructions(NULL);
3539 }
3540
3541 void
3542 fs_visitor::dump_instructions(const char *name)
3543 {
3544 FILE *file = stderr;
3545 if (name && geteuid() != 0) {
3546 file = fopen(name, "w");
3547 if (!file)
3548 file = stderr;
3549 }
3550
3551 if (cfg) {
3552 calculate_register_pressure();
3553 int ip = 0, max_pressure = 0;
3554 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3555 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3556 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3557 dump_instruction(inst, file);
3558 ip++;
3559 }
3560 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3561 } else {
3562 int ip = 0;
3563 foreach_in_list(backend_instruction, inst, &instructions) {
3564 fprintf(file, "%4d: ", ip++);
3565 dump_instruction(inst, file);
3566 }
3567 }
3568
3569 if (file != stderr) {
3570 fclose(file);
3571 }
3572 }
3573
3574 void
3575 fs_visitor::dump_instruction(backend_instruction *be_inst)
3576 {
3577 dump_instruction(be_inst, stderr);
3578 }
3579
3580 void
3581 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3582 {
3583 fs_inst *inst = (fs_inst *)be_inst;
3584
3585 if (inst->predicate) {
3586 fprintf(file, "(%cf0.%d) ",
3587 inst->predicate_inverse ? '-' : '+',
3588 inst->flag_subreg);
3589 }
3590
3591 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3592 if (inst->saturate)
3593 fprintf(file, ".sat");
3594 if (inst->conditional_mod) {
3595 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3596 if (!inst->predicate &&
3597 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3598 inst->opcode != BRW_OPCODE_IF &&
3599 inst->opcode != BRW_OPCODE_WHILE))) {
3600 fprintf(file, ".f0.%d", inst->flag_subreg);
3601 }
3602 }
3603 fprintf(file, "(%d) ", inst->exec_size);
3604
3605
3606 switch (inst->dst.file) {
3607 case GRF:
3608 fprintf(file, "vgrf%d", inst->dst.reg);
3609 if (inst->dst.width != dispatch_width)
3610 fprintf(file, "@%d", inst->dst.width);
3611 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3612 inst->dst.subreg_offset)
3613 fprintf(file, "+%d.%d",
3614 inst->dst.reg_offset, inst->dst.subreg_offset);
3615 break;
3616 case MRF:
3617 fprintf(file, "m%d", inst->dst.reg);
3618 break;
3619 case BAD_FILE:
3620 fprintf(file, "(null)");
3621 break;
3622 case UNIFORM:
3623 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3624 break;
3625 case ATTR:
3626 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3627 break;
3628 case HW_REG:
3629 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3630 switch (inst->dst.fixed_hw_reg.nr) {
3631 case BRW_ARF_NULL:
3632 fprintf(file, "null");
3633 break;
3634 case BRW_ARF_ADDRESS:
3635 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3636 break;
3637 case BRW_ARF_ACCUMULATOR:
3638 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3639 break;
3640 case BRW_ARF_FLAG:
3641 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3642 inst->dst.fixed_hw_reg.subnr);
3643 break;
3644 default:
3645 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3646 inst->dst.fixed_hw_reg.subnr);
3647 break;
3648 }
3649 } else {
3650 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3651 }
3652 if (inst->dst.fixed_hw_reg.subnr)
3653 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3654 break;
3655 default:
3656 fprintf(file, "???");
3657 break;
3658 }
3659 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3660
3661 for (int i = 0; i < inst->sources; i++) {
3662 if (inst->src[i].negate)
3663 fprintf(file, "-");
3664 if (inst->src[i].abs)
3665 fprintf(file, "|");
3666 switch (inst->src[i].file) {
3667 case GRF:
3668 fprintf(file, "vgrf%d", inst->src[i].reg);
3669 if (inst->src[i].width != dispatch_width)
3670 fprintf(file, "@%d", inst->src[i].width);
3671 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3672 inst->src[i].subreg_offset)
3673 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3674 inst->src[i].subreg_offset);
3675 break;
3676 case MRF:
3677 fprintf(file, "***m%d***", inst->src[i].reg);
3678 break;
3679 case ATTR:
3680 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3681 break;
3682 case UNIFORM:
3683 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3684 if (inst->src[i].reladdr) {
3685 fprintf(file, "+reladdr");
3686 } else if (inst->src[i].subreg_offset) {
3687 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3688 inst->src[i].subreg_offset);
3689 }
3690 break;
3691 case BAD_FILE:
3692 fprintf(file, "(null)");
3693 break;
3694 case IMM:
3695 switch (inst->src[i].type) {
3696 case BRW_REGISTER_TYPE_F:
3697 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3698 break;
3699 case BRW_REGISTER_TYPE_W:
3700 case BRW_REGISTER_TYPE_D:
3701 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3702 break;
3703 case BRW_REGISTER_TYPE_UW:
3704 case BRW_REGISTER_TYPE_UD:
3705 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3706 break;
3707 case BRW_REGISTER_TYPE_VF:
3708 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3709 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3710 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3711 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3712 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3713 break;
3714 default:
3715 fprintf(file, "???");
3716 break;
3717 }
3718 break;
3719 case HW_REG:
3720 if (inst->src[i].fixed_hw_reg.negate)
3721 fprintf(file, "-");
3722 if (inst->src[i].fixed_hw_reg.abs)
3723 fprintf(file, "|");
3724 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3725 switch (inst->src[i].fixed_hw_reg.nr) {
3726 case BRW_ARF_NULL:
3727 fprintf(file, "null");
3728 break;
3729 case BRW_ARF_ADDRESS:
3730 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3731 break;
3732 case BRW_ARF_ACCUMULATOR:
3733 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3734 break;
3735 case BRW_ARF_FLAG:
3736 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3737 inst->src[i].fixed_hw_reg.subnr);
3738 break;
3739 default:
3740 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3741 inst->src[i].fixed_hw_reg.subnr);
3742 break;
3743 }
3744 } else {
3745 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3746 }
3747 if (inst->src[i].fixed_hw_reg.subnr)
3748 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3749 if (inst->src[i].fixed_hw_reg.abs)
3750 fprintf(file, "|");
3751 break;
3752 default:
3753 fprintf(file, "???");
3754 break;
3755 }
3756 if (inst->src[i].abs)
3757 fprintf(file, "|");
3758
3759 if (inst->src[i].file != IMM) {
3760 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3761 }
3762
3763 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3764 fprintf(file, ", ");
3765 }
3766
3767 fprintf(file, " ");
3768
3769 if (dispatch_width == 16 && inst->exec_size == 8) {
3770 if (inst->force_sechalf)
3771 fprintf(file, "2ndhalf ");
3772 else
3773 fprintf(file, "1sthalf ");
3774 }
3775
3776 fprintf(file, "\n");
3777 }
3778
3779 /**
3780 * Possibly returns an instruction that set up @param reg.
3781 *
3782 * Sometimes we want to take the result of some expression/variable
3783 * dereference tree and rewrite the instruction generating the result
3784 * of the tree. When processing the tree, we know that the
3785 * instructions generated are all writing temporaries that are dead
3786 * outside of this tree. So, if we have some instructions that write
3787 * a temporary, we're free to point that temp write somewhere else.
3788 *
3789 * Note that this doesn't guarantee that the returned instruction wrote
3790 * only reg -- it might be the size=4 destination of a texture instruction.
3791 */
3792 fs_inst *
3793 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3794 fs_inst *end,
3795 const fs_reg &reg)
3796 {
3797 if (end == start ||
3798 end->is_partial_write() ||
3799 reg.reladdr ||
3800 !reg.equals(end->dst)) {
3801 return NULL;
3802 } else {
3803 return end;
3804 }
3805 }
3806
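/**
 * Lay out the Gen6+ fragment shader thread payload.
 *
 * Walks the fixed payload sections in register order (masks and pixel X/Y,
 * barycentric coordinates, source depth and W, MSAA position offsets and
 * input coverage mask), records the starting register of each section that
 * is actually enabled, and accumulates the total in payload.num_regs.
 */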
3807 void
3808 fs_visitor::setup_payload_gen6()
3809 {
3810 bool uses_depth =
3811 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3812 unsigned barycentric_interp_modes =
3813 (stage == MESA_SHADER_FRAGMENT) ?
3814 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3815
3816 assert(devinfo->gen >= 6);
3817
3818 /* R0-1: masks, pixel X/Y coordinates. */
3819 payload.num_regs = 2;
3820 /* R2: only for 32-pixel dispatch.*/
3821
3822 /* R3-26: barycentric interpolation coordinates. These appear in the
3823 * same order that they appear in the brw_wm_barycentric_interp_mode
3824 * enum. Each set of coordinates occupies 2 registers if dispatch width
3825 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3826 * appear if they were enabled using the "Barycentric Interpolation
3827 * Mode" bits in WM_STATE.
3828 */
3829 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3830 if (barycentric_interp_modes & (1 << i)) {
3831 payload.barycentric_coord_reg[i] = payload.num_regs;
3832 payload.num_regs += 2;
3833 if (dispatch_width == 16) {
3834 payload.num_regs += 2;
3835 }
3836 }
3837 }
3838
3839 /* R27: interpolated depth, if the shader uses source depth. */
3840 if (uses_depth) {
3841 payload.source_depth_reg = payload.num_regs;
3842 payload.num_regs++;
3843 if (dispatch_width == 16) {
3844 /* R28: interpolated depth if not SIMD8. */
3845 payload.num_regs++;
3846 }
3847 }
3848 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3849 if (uses_depth) {
3850 payload.source_w_reg = payload.num_regs;
3851 payload.num_regs++;
3852 if (dispatch_width == 16) {
3853 /* R30: interpolated W if not SIMD8. */
3854 payload.num_regs++;
3855 }
3856 }
3857
3858 if (stage == MESA_SHADER_FRAGMENT) {
3859 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3860 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3861 prog_data->uses_pos_offset = key->compute_pos_offset;
3862 /* R31: MSAA position offsets. */
3863 if (prog_data->uses_pos_offset) {
3864 payload.sample_pos_reg = payload.num_regs;
3865 payload.num_regs++;
3866 }
3867 }
3868
3869 /* R32: MSAA input coverage mask */
3870 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3871 assert(devinfo->gen >= 7);
3872 payload.sample_mask_in_reg = payload.num_regs;
3873 payload.num_regs++;
3874 if (dispatch_width == 16) {
3875 /* R33: input coverage mask if not SIMD8. */
3876 payload.num_regs++;
3877 }
3878 }
3879
3880 /* R34-: bary for 32-pixel. */
3881 /* R58-59: interp W for 32-pixel. */
3882
3883 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3884 source_depth_to_render_target = true;
3885 }
3886 }
3887
3888 void
3889 fs_visitor::setup_vs_payload()
3890 {
3891 /* R0: thread header, R1: urb handles */
3892 payload.num_regs = 2;
3893 }
3894
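/**
 * Lay out the compute shader thread payload: only the R0 thread header is
 * reserved here.
 */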
3895 void
3896 fs_visitor::setup_cs_payload()
3897 {
3898 assert(brw->gen >= 7);
3899
3900 payload.num_regs = 1;
3901 }
3902
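/**
 * Lay out the fragment shader binding table.
 *
 * Render targets come first, reserving at least one slot even when there
 * are no color regions so the null-renderbuffer FB write still has a
 * surface to target, followed by the common per-stage entries assigned by
 * assign_common_binding_table_offsets().
 */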
3903 void
3904 fs_visitor::assign_binding_table_offsets()
3905 {
3906 assert(stage == MESA_SHADER_FRAGMENT);
3907 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3908 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3909 uint32_t next_binding_table_offset = 0;
3910
3911 /* If there are no color regions, we still perform an FB write to a null
3912 * renderbuffer, which we place at surface index 0.
3913 */
3914 prog_data->binding_table.render_target_start = next_binding_table_offset;
3915 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3916
3917 assign_common_binding_table_offsets(next_binding_table_offset);
3918 }
3919
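/**
 * Estimate register pressure at each instruction.
 *
 * For every virtual GRF, its size in registers is added to
 * regs_live_at_ip[] across the instruction range where the register is
 * live, giving the total number of GRFs live at each IP.
 */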
3920 void
3921 fs_visitor::calculate_register_pressure()
3922 {
3923 invalidate_live_intervals();
3924 calculate_live_intervals();
3925
3926 unsigned num_instructions = 0;
3927 foreach_block(block, cfg)
3928 num_instructions += block->instructions.length();
3929
3930 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3931
3932 for (unsigned reg = 0; reg < alloc.count; reg++) {
3933 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3934 regs_live_at_ip[ip] += alloc.sizes[reg];
3935 }
3936 }
3937
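/**
 * Run the FS IR optimization loop.
 *
 * After the initial lowering (GRF splitting and pull-constant setup), the
 * passes wrapped in OPT() are repeated until none of them reports progress.
 * When INTEL_DEBUG & DEBUG_OPTIMIZER is set, OPT() also dumps the
 * instruction list to a per-pass file after every pass that made progress.
 * A few passes that only need to run once (opt_sampler_eot,
 * lower_load_payload, opt_combine_constants) follow the loop.
 */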
3938 void
3939 fs_visitor::optimize()
3940 {
3941 split_virtual_grfs();
3942
3943 move_uniform_array_access_to_pull_constants();
3944 assign_constant_locations();
3945 demote_pull_constants();
3946
3947 #define OPT(pass, args...) ({ \
3948 pass_num++; \
3949 bool this_progress = pass(args); \
3950 \
3951 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3952 char filename[64]; \
3953 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3954 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3955 \
3956 backend_visitor::dump_instructions(filename); \
3957 } \
3958 \
3959 progress = progress || this_progress; \
3960 this_progress; \
3961 })
3962
3963 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3964 char filename[64];
3965 snprintf(filename, 64, "%s%d-%04d-00-start",
3966 stage_abbrev, dispatch_width,
3967 shader_prog ? shader_prog->Name : 0);
3968
3969 backend_visitor::dump_instructions(filename);
3970 }
3971
3972 bool progress;
3973 int iteration = 0;
3974 int pass_num = 0;
3975 do {
3976 progress = false;
3977 pass_num = 0;
3978 iteration++;
3979
3980 OPT(remove_duplicate_mrf_writes);
3981
3982 OPT(opt_algebraic);
3983 OPT(opt_cse);
3984 OPT(opt_copy_propagate);
3985 OPT(opt_peephole_predicated_break);
3986 OPT(opt_cmod_propagation);
3987 OPT(dead_code_eliminate);
3988 OPT(opt_peephole_sel);
3989 OPT(dead_control_flow_eliminate, this);
3990 OPT(opt_register_renaming);
3991 OPT(opt_redundant_discard_jumps);
3992 OPT(opt_saturate_propagation);
3993 OPT(opt_zero_samples);
3994 OPT(register_coalesce);
3995 OPT(compute_to_mrf);
3996 OPT(eliminate_find_live_channel);
3997
3998 OPT(compact_virtual_grfs);
3999 } while (progress);
4000
4001 pass_num = 0;
4002
4003 OPT(opt_sampler_eot);
4004
4005 if (OPT(lower_load_payload)) {
4006 split_virtual_grfs();
4007 OPT(register_coalesce);
4008 OPT(compute_to_mrf);
4009 OPT(dead_code_eliminate);
4010 }
4011
4012 OPT(opt_combine_constants);
4013
4014 lower_uniform_pull_constant_loads();
4015 }
4016
4017 /**
4018 * Three-source instructions must have a GRF/MRF destination register.
4019 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
4020 */
4021 void
4022 fs_visitor::fixup_3src_null_dest()
4023 {
4024 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4025 if (inst->is_3src() && inst->dst.is_null()) {
4026 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4027 inst->dst.type);
4028 }
4029 }
4030 }
4031
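/**
 * Assign hardware registers to the virtual GRFs.
 *
 * Each pre-RA scheduling heuristic is tried in turn until one allows
 * allocation without spilling.  If none does, SIMD16 compiles fail outright
 * and SIMD8 compiles fall back to spilling until allocation succeeds;
 * spilled programs are then rescheduled with SCHEDULE_POST and the final
 * scratch requirement is recorded in prog_data->total_scratch.
 */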
4032 void
4033 fs_visitor::allocate_registers()
4034 {
4035 bool allocated_without_spills;
4036
4037 static const enum instruction_scheduler_mode pre_modes[] = {
4038 SCHEDULE_PRE,
4039 SCHEDULE_PRE_NON_LIFO,
4040 SCHEDULE_PRE_LIFO,
4041 };
4042
4043 /* Try each scheduling heuristic to see if it can successfully register
4044 * allocate without spilling. They should be ordered by decreasing
4045 * performance but increasing likelihood of allocating.
4046 */
4047 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4048 schedule_instructions(pre_modes[i]);
4049
4050 if (0) {
4051 assign_regs_trivial();
4052 allocated_without_spills = true;
4053 } else {
4054 allocated_without_spills = assign_regs(false);
4055 }
4056 if (allocated_without_spills)
4057 break;
4058 }
4059
4060 if (!allocated_without_spills) {
4061 /* We assume that any spilling is worse than just dropping back to
4062 * SIMD8. There's probably actually some intermediate point where
4063 * SIMD16 with a couple of spills is still better.
4064 */
4065 if (dispatch_width == 16) {
4066 fail("Failure to register allocate. Reduce number of "
4067 "live scalar values to avoid this.");
4068 } else {
4069 perf_debug("%s shader triggered register spilling. "
4070 "Try reducing the number of live scalar values to "
4071 "improve performance.\n", stage_name);
4072 }
4073
4074 /* Since we're out of heuristics, just go spill registers until we
4075 * get an allocation.
4076 */
4077 while (!assign_regs(true)) {
4078 if (failed)
4079 break;
4080 }
4081 }
4082
4083 /* This must come after all optimization and register allocation, since
4084 * it inserts dead code that happens to have side effects, and it does
4085 * so based on the actual physical registers in use.
4086 */
4087 insert_gen4_send_dependency_workarounds();
4088
4089 if (failed)
4090 return;
4091
4092 if (!allocated_without_spills)
4093 schedule_instructions(SCHEDULE_POST);
4094
4095 if (last_scratch > 0)
4096 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4097 }
4098
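/**
 * Compile the vertex shader.
 *
 * Sets up the payload, emits the shader IR (from NIR or the GLSL IR
 * visitor), appends the URB writes, then runs the optimizer, CURB/URB
 * setup, and register allocation.  Returns false on failure.
 */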
4099 bool
4100 fs_visitor::run_vs()
4101 {
4102 assert(stage == MESA_SHADER_VERTEX);
4103
4104 if (prog_data->map_entries == NULL)
4105 assign_common_binding_table_offsets(0);
4106 setup_vs_payload();
4107
4108 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4109 emit_shader_time_begin();
4110
4111 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
4112 emit_nir_code();
4113 } else {
4114 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4115 base_ir = ir;
4116 this->result = reg_undef;
4117 ir->accept(this);
4118 }
4119 base_ir = NULL;
4120 }
4121
4122 if (failed)
4123 return false;
4124
4125 emit_urb_writes();
4126
4127 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4128 emit_shader_time_end();
4129
4130 calculate_cfg();
4131
4132 optimize();
4133
4134 assign_curb_setup();
4135 assign_vs_urb_setup();
4136
4137 fixup_3src_null_dest();
4138 allocate_registers();
4139
4140 return !failed;
4141 }
4142
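/**
 * Compile the fragment shader at the current dispatch width.
 *
 * Sets up the payload and interpolation inputs, emits the shader IR (from
 * NIR, the GLSL IR visitor, or the ARB fragment program path), adds
 * discard/alpha-test handling and the FB writes, then runs the optimizer
 * and register allocation.  Returns false on failure, e.g. when SIMD16
 * register allocation fails.
 */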
4143 bool
4144 fs_visitor::run_fs()
4145 {
4146 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4147 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4148
4149 assert(stage == MESA_SHADER_FRAGMENT);
4150
4151 sanity_param_count = prog->Parameters->NumParameters;
4152
4153 if (prog_data->map_entries == NULL)
4154 assign_binding_table_offsets();
4155
4156 if (devinfo->gen >= 6)
4157 setup_payload_gen6();
4158 else
4159 setup_payload_gen4();
4160
4161 if (0) {
4162 emit_dummy_fs();
4163 } else if (brw->use_rep_send && dispatch_width == 16) {
4164 emit_repclear_shader();
4165 } else {
4166 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4167 emit_shader_time_begin();
4168
4169 calculate_urb_setup();
4170 if (prog->InputsRead > 0) {
4171 if (devinfo->gen < 6)
4172 emit_interpolation_setup_gen4();
4173 else
4174 emit_interpolation_setup_gen6();
4175 }
4176
4177 /* We handle discards by keeping track of the still-live pixels in f0.1.
4178 * Initialize it with the dispatched pixels.
4179 */
4180 if (wm_prog_data->uses_kill) {
4181 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4182 discard_init->flag_subreg = 1;
4183 }
4184
4185 /* Generate FS IR for main(). (the visitor only descends into
4186 * functions called "main").
4187 */
4188 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
4189 emit_nir_code();
4190 } else if (shader) {
4191 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4192 base_ir = ir;
4193 this->result = reg_undef;
4194 ir->accept(this);
4195 }
4196 } else {
4197 emit_fragment_program_code();
4198 }
4199 base_ir = NULL;
4200 if (failed)
4201 return false;
4202
4203 if (wm_prog_data->uses_kill)
4204 emit(FS_OPCODE_PLACEHOLDER_HALT);
4205
4206 if (wm_key->alpha_test_func)
4207 emit_alpha_test();
4208
4209 emit_fb_writes();
4210
4211 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4212 emit_shader_time_end();
4213
4214 calculate_cfg();
4215
4216 optimize();
4217
4218 assign_curb_setup();
4219 assign_urb_setup();
4220
4221 fixup_3src_null_dest();
4222 allocate_registers();
4223
4224 if (failed)
4225 return false;
4226 }
4227
4228 if (dispatch_width == 8)
4229 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4230 else
4231 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4232
4233 /* If any state parameters were appended, then ParameterValues could have
4234 * been realloced, in which case the driver uniform storage set up by
4235 * _mesa_associate_uniform_storage() would point to freed memory. Make
4236 * sure that didn't happen.
4237 */
4238 assert(sanity_param_count == prog->Parameters->NumParameters);
4239
4240 return !failed;
4241 }
4242
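/**
 * Compile the compute shader: emits the NIR-based IR and the terminating
 * message, then runs the optimizer, CURB setup, and register allocation.
 * Returns false on failure.
 */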
4243 bool
4244 fs_visitor::run_cs()
4245 {
4246 assert(stage == MESA_SHADER_COMPUTE);
4247 assert(shader);
4248
4249 sanity_param_count = prog->Parameters->NumParameters;
4250
4251 assign_common_binding_table_offsets(0);
4252
4253 setup_cs_payload();
4254
4255 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4256 emit_shader_time_begin();
4257
4258 emit_nir_code();
4259
4260 if (failed)
4261 return false;
4262
4263 emit_cs_terminate();
4264
4265 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4266 emit_shader_time_end();
4267
4268 calculate_cfg();
4269
4270 optimize();
4271
4272 assign_curb_setup();
4273
4274 fixup_3src_null_dest();
4275 allocate_registers();
4276
4277 if (failed)
4278 return false;
4279
4280 /* If any state parameters were appended, then ParameterValues could have
4281 * been realloced, in which case the driver uniform storage set up by
4282 * _mesa_associate_uniform_storage() would point to freed memory. Make
4283 * sure that didn't happen.
4284 */
4285 assert(sanity_param_count == prog->Parameters->NumParameters);
4286
4287 return !failed;
4288 }
4289
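/**
 * Top-level fragment shader compile entry point.
 *
 * Always compiles a SIMD8 program and, unless disabled or unsupported,
 * also tries a SIMD16 program; the selected variants are handed to
 * fs_generator to produce the native code that is returned.
 */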
4290 const unsigned *
4291 brw_wm_fs_emit(struct brw_context *brw,
4292 void *mem_ctx,
4293 const struct brw_wm_prog_key *key,
4294 struct brw_wm_prog_data *prog_data,
4295 struct gl_fragment_program *fp,
4296 struct gl_shader_program *prog,
4297 unsigned *final_assembly_size)
4298 {
4299 bool start_busy = false;
4300 double start_time = 0;
4301
4302 if (unlikely(brw->perf_debug)) {
4303 start_busy = (brw->batch.last_bo &&
4304 drm_intel_bo_busy(brw->batch.last_bo));
4305 start_time = get_time();
4306 }
4307
4308 struct brw_shader *shader = NULL;
4309 if (prog)
4310 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4311
4312 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4313 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4314
4315 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4316 */
4317 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4318 if (!v.run_fs()) {
4319 if (prog) {
4320 prog->LinkStatus = false;
4321 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4322 }
4323
4324 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4325 v.fail_msg);
4326
4327 return NULL;
4328 }
4329
4330 cfg_t *simd16_cfg = NULL;
4331 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4332 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4333 if (!v.simd16_unsupported) {
4334 /* Try a SIMD16 compile */
4335 v2.import_uniforms(&v);
4336 if (!v2.run_fs()) {
4337 perf_debug("SIMD16 shader failed to compile, falling back to "
4338 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4339 } else {
4340 simd16_cfg = v2.cfg;
4341 }
4342 } else {
4343 perf_debug("SIMD16 shader unsupported, falling back to "
4344 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4345 }
4346 }
4347
4348 cfg_t *simd8_cfg;
4349 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4350 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4351 simd8_cfg = NULL;
4352 prog_data->no_8 = true;
4353 } else {
4354 simd8_cfg = v.cfg;
4355 prog_data->no_8 = false;
4356 }
4357
4358 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4359 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4360
4361 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4362 char *name;
4363 if (prog)
4364 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4365 prog->Label ? prog->Label : "unnamed",
4366 prog->Name);
4367 else
4368 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4369
4370 g.enable_debug(name);
4371 }
4372
4373 if (simd8_cfg)
4374 g.generate_code(simd8_cfg, 8);
4375 if (simd16_cfg)
4376 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4377
4378 if (unlikely(brw->perf_debug) && shader) {
4379 if (shader->compiled_once)
4380 brw_wm_debug_recompile(brw, prog, key);
4381 shader->compiled_once = true;
4382
4383 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4384 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4385 (get_time() - start_time) * 1000);
4386 }
4387 }
4388
4389 return g.get_assembly(final_assembly_size);
4390 }
4391
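/**
 * Precompile the fragment shader at link time.
 *
 * Builds a guessed brw_wm_prog_key from the program's static properties
 * (kill/depth writes, inputs read, color outputs, default sampler
 * swizzles) and runs the normal codegen path, restoring the previous WM
 * program state afterwards so the precompile does not disturb it.
 */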
4392 extern "C" bool
4393 brw_fs_precompile(struct gl_context *ctx,
4394 struct gl_shader_program *shader_prog,
4395 struct gl_program *prog)
4396 {
4397 struct brw_context *brw = brw_context(ctx);
4398 struct brw_wm_prog_key key;
4399
4400 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4401 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4402 bool program_uses_dfdy = fp->UsesDFdy;
4403
4404 memset(&key, 0, sizeof(key));
4405
4406 if (brw->gen < 6) {
4407 if (fp->UsesKill)
4408 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4409
4410 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4411 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4412
4413 /* Just assume depth testing. */
4414 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4415 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4416 }
4417
4418 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4419 BRW_FS_VARYING_INPUT_MASK) > 16)
4420 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4421
4422 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4423
4424 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4425 key.drawable_height = ctx->DrawBuffer->Height;
4426 }
4427
4428 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4429 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4430 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4431
4432 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4433 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4434 key.nr_color_regions > 1;
4435 }
4436
4437 key.program_string_id = bfp->id;
4438
4439 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4440 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4441
4442 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4443
4444 brw->wm.base.prog_offset = old_prog_offset;
4445 brw->wm.prog_data = old_prog_data;
4446
4447 return success;
4448 }
4449
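/**
 * Choose default texture swizzles for a precompile key.
 *
 * Shadow samplers on hardware without shader channel select get the
 * DEPTH_TEXTURE_MODE default swizzle (X, X, X, 1); all other samplers are
 * assumed to be unswizzled XYZW color samplers.
 */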
4450 void
4451 brw_setup_tex_for_precompile(struct brw_context *brw,
4452 struct brw_sampler_prog_key_data *tex,
4453 struct gl_program *prog)
4454 {
4455 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4456 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4457 for (unsigned i = 0; i < sampler_count; i++) {
4458 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4459 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4460 tex->swizzles[i] =
4461 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4462 } else {
4463 /* Color sampler: assume no swizzling. */
4464 tex->swizzles[i] = SWIZZLE_XYZW;
4465 }
4466 }
4467 }