i965: Add typed surface access opcodes.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
138
139 this->writes_accumulator = false;
140 }
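/* Illustrative note (not part of the original source): with the rules above,
 * an instruction constructed with exec_size == 0 and a width-16 GRF
 * destination ends up with exec_size == 16, and a width-1 source (a scalar
 * GRF, an immediate or a uniform) gets effective_width == 16, i.e. it is
 * treated as if replicated across all 16 channels.
 */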
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(devinfo->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
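/* Minimal usage sketch (illustrative; a, b and dst are hypothetical
 * registers): the CMP updates the flag register, and a following
 * instruction can then be predicated on the result:
 *
 *    emit(CMP(reg_null_d, a, b, BRW_CONDITIONAL_GE));
 *    fs_inst *sel = emit(SEL(dst, a, b));
 *    sel->predicate = BRW_PREDICATE_NORMAL;
 */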
356
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
359 {
360 uint8_t exec_size = dst.width;
361 for (int i = 0; i < sources; ++i) {
362 assert(src[i].width % dst.width == 0);
363 if (src[i].width > exec_size)
364 exec_size = src[i].width;
365 }
366
367 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
368 dst, src, sources);
369 inst->regs_written = 0;
370 for (int i = 0; i < sources; ++i) {
371 /* The LOAD_PAYLOAD instruction only really makes sense if we are
372 * dealing with whole registers. If this ever changes, we can deal
373 * with it later.
374 */
375 int size = inst->src[i].effective_width * type_sz(src[i].type);
376 assert(size % 32 == 0);
377 inst->regs_written += (size + 31) / 32;
378 }
379
380 return inst;
381 }
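/* Worked example (illustrative): a LOAD_PAYLOAD with a SIMD16 destination
 * and two SIMD16 float sources gets exec_size == 16, and each source covers
 * 16 * 4 = 64 bytes = 2 registers, so regs_written ends up as 4.
 */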
382
383 exec_list
384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
385 const fs_reg &surf_index,
386 const fs_reg &varying_offset,
387 uint32_t const_offset)
388 {
389 exec_list instructions;
390 fs_inst *inst;
391
392 /* We have our constant surface use a pitch of 4 bytes, so our index can
393 * be any component of a vector, and then we load 4 contiguous
394 * components starting from that.
395 *
396 * We break down the const_offset to a portion added to the variable
397 * offset and a portion done using reg_offset, which means that if you
398 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
399 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
400 * CSE can later notice that those loads are all the same and eliminate
401 * the redundant ones.
402 */
403 fs_reg vec4_offset = vgrf(glsl_type::int_type);
404 instructions.push_tail(ADD(vec4_offset,
405 varying_offset, fs_reg(const_offset & ~3)));
406
407 int scale = 1;
408 if (devinfo->gen == 4 && dst.width == 8) {
409 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
410 * u, v, r) as parameters, or we can just use the SIMD16 message
411 * consisting of (header, u). We choose the second, at the cost of a
412 * longer return length.
413 */
414 scale = 2;
415 }
416
417 enum opcode op;
418 if (devinfo->gen >= 7)
419 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
420 else
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
422
423 assert(dst.width % 8 == 0);
424 int regs_written = 4 * (dst.width / 8) * scale;
425 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
426 dst.type, dst.width);
427 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
428 inst->regs_written = regs_written;
429 instructions.push_tail(inst);
430
431 if (devinfo->gen < 7) {
432 inst->base_mrf = 13;
433 inst->header_present = true;
434 if (devinfo->gen == 4)
435 inst->mlen = 3;
436 else
437 inst->mlen = 1 + dispatch_width / 8;
438 }
439
440 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
441 instructions.push_tail(MOV(dst, result));
442
443 return instructions;
444 }
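/* Worked example (illustrative): on gen7 with const_offset == 14, scale == 1
 * and a SIMD8 destination, the code above emits roughly
 *
 *    ADD vec4_offset, varying_offset, 12            (14 & ~3)
 *    FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7      (writes 4 registers)
 *    MOV dst, offset(vec4_result, 2)                (component 14 & 3)
 *
 * so several loads that share the same vec4_offset can later be CSE'd.
 */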
445
446 /**
447 * A helper for MOV generation for fixing up broken hardware SEND dependency
448 * handling.
449 */
450 fs_inst *
451 fs_visitor::DEP_RESOLVE_MOV(int grf)
452 {
453 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
454
455 inst->ir = NULL;
456 inst->annotation = "send dependency resolve";
457
 458    /* The caller always wants this uncompressed, to emit the minimal extra
 459     * dependencies and to avoid having to deal with aligning its regs to 2.
460 */
461 inst->exec_size = 8;
462
463 return inst;
464 }
465
466 bool
467 fs_inst::equals(fs_inst *inst) const
468 {
469 return (opcode == inst->opcode &&
470 dst.equals(inst->dst) &&
471 src[0].equals(inst->src[0]) &&
472 src[1].equals(inst->src[1]) &&
473 src[2].equals(inst->src[2]) &&
474 saturate == inst->saturate &&
475 predicate == inst->predicate &&
476 conditional_mod == inst->conditional_mod &&
477 mlen == inst->mlen &&
478 base_mrf == inst->base_mrf &&
479 target == inst->target &&
480 eot == inst->eot &&
481 header_present == inst->header_present &&
482 shadow_compare == inst->shadow_compare &&
483 exec_size == inst->exec_size &&
484 offset == inst->offset);
485 }
486
487 bool
488 fs_inst::overwrites_reg(const fs_reg &reg) const
489 {
490 return reg.in_range(dst, regs_written);
491 }
492
493 bool
494 fs_inst::is_send_from_grf() const
495 {
496 switch (opcode) {
497 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
498 case SHADER_OPCODE_SHADER_TIME_ADD:
499 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
500 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
501 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
502 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
503 case SHADER_OPCODE_UNTYPED_ATOMIC:
504 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
505 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
506 case SHADER_OPCODE_TYPED_ATOMIC:
507 case SHADER_OPCODE_TYPED_SURFACE_READ:
508 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
509 case SHADER_OPCODE_URB_WRITE_SIMD8:
510 return true;
511 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
512 return src[1].file == GRF;
513 case FS_OPCODE_FB_WRITE:
514 return src[0].file == GRF;
515 default:
516 if (is_tex())
517 return src[0].file == GRF;
518
519 return false;
520 }
521 }
522
523 bool
524 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
525 {
526 if (devinfo->gen == 6 && is_math())
527 return false;
528
529 if (is_send_from_grf())
530 return false;
531
532 if (!backend_instruction::can_do_source_mods())
533 return false;
534
535 return true;
536 }
537
538 bool
539 fs_inst::has_side_effects() const
540 {
541 return this->eot || backend_instruction::has_side_effects();
542 }
543
544 void
545 fs_reg::init()
546 {
547 memset(this, 0, sizeof(*this));
548 stride = 1;
549 }
550
551 /** Generic unset register constructor. */
552 fs_reg::fs_reg()
553 {
554 init();
555 this->file = BAD_FILE;
556 }
557
558 /** Immediate value constructor. */
559 fs_reg::fs_reg(float f)
560 {
561 init();
562 this->file = IMM;
563 this->type = BRW_REGISTER_TYPE_F;
564 this->fixed_hw_reg.dw1.f = f;
565 this->width = 1;
566 }
567
568 /** Immediate value constructor. */
569 fs_reg::fs_reg(int32_t i)
570 {
571 init();
572 this->file = IMM;
573 this->type = BRW_REGISTER_TYPE_D;
574 this->fixed_hw_reg.dw1.d = i;
575 this->width = 1;
576 }
577
578 /** Immediate value constructor. */
579 fs_reg::fs_reg(uint32_t u)
580 {
581 init();
582 this->file = IMM;
583 this->type = BRW_REGISTER_TYPE_UD;
584 this->fixed_hw_reg.dw1.ud = u;
585 this->width = 1;
586 }
587
588 /** Vector float immediate value constructor. */
589 fs_reg::fs_reg(uint8_t vf[4])
590 {
591 init();
592 this->file = IMM;
593 this->type = BRW_REGISTER_TYPE_VF;
594 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
595 }
596
597 /** Vector float immediate value constructor. */
598 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
599 {
600 init();
601 this->file = IMM;
602 this->type = BRW_REGISTER_TYPE_VF;
603 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
604 (vf1 << 8) |
605 (vf2 << 16) |
606 (vf3 << 24);
607 }
608
609 /** Fixed brw_reg. */
610 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
611 {
612 init();
613 this->file = HW_REG;
614 this->fixed_hw_reg = fixed_hw_reg;
615 this->type = fixed_hw_reg.type;
616 this->width = 1 << fixed_hw_reg.width;
617 }
618
619 bool
620 fs_reg::equals(const fs_reg &r) const
621 {
622 return (file == r.file &&
623 reg == r.reg &&
624 reg_offset == r.reg_offset &&
625 subreg_offset == r.subreg_offset &&
626 type == r.type &&
627 negate == r.negate &&
628 abs == r.abs &&
629 !reladdr && !r.reladdr &&
630 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
631 width == r.width &&
632 stride == r.stride);
633 }
634
635 fs_reg &
636 fs_reg::set_smear(unsigned subreg)
637 {
638 assert(file != HW_REG && file != IMM);
639 subreg_offset = subreg * type_sz(type);
640 stride = 0;
641 return *this;
642 }
643
644 bool
645 fs_reg::is_contiguous() const
646 {
647 return stride == 1;
648 }
649
650 int
651 fs_visitor::type_size(const struct glsl_type *type)
652 {
653 unsigned int size, i;
654
655 switch (type->base_type) {
656 case GLSL_TYPE_UINT:
657 case GLSL_TYPE_INT:
658 case GLSL_TYPE_FLOAT:
659 case GLSL_TYPE_BOOL:
660 return type->components();
661 case GLSL_TYPE_ARRAY:
662 return type_size(type->fields.array) * type->length;
663 case GLSL_TYPE_STRUCT:
664 size = 0;
665 for (i = 0; i < type->length; i++) {
666 size += type_size(type->fields.structure[i].type);
667 }
668 return size;
669 case GLSL_TYPE_SAMPLER:
670 /* Samplers take up no register space, since they're baked in at
671 * link time.
672 */
673 return 0;
674 case GLSL_TYPE_ATOMIC_UINT:
675 return 0;
676 case GLSL_TYPE_IMAGE:
677 case GLSL_TYPE_VOID:
678 case GLSL_TYPE_ERROR:
679 case GLSL_TYPE_INTERFACE:
680 case GLSL_TYPE_DOUBLE:
681 unreachable("not reached");
682 }
683
684 return 0;
685 }
686
687 /**
688 * Create a MOV to read the timestamp register.
689 *
690 * The caller is responsible for emitting the MOV. The return value is
691 * the destination of the MOV, with extra parameters set.
692 */
693 fs_reg
694 fs_visitor::get_timestamp(fs_inst **out_mov)
695 {
696 assert(devinfo->gen >= 7);
697
698 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
699 BRW_ARF_TIMESTAMP,
700 0),
701 BRW_REGISTER_TYPE_UD));
702
703 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
704
705 fs_inst *mov = MOV(dst, ts);
706 /* We want to read the 3 fields we care about even if it's not enabled in
707 * the dispatch.
708 */
709 mov->force_writemask_all = true;
710
711 /* The caller wants the low 32 bits of the timestamp. Since it's running
 712    * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
713 * which is plenty of time for our purposes. It is identical across the
714 * EUs, but since it's tracking GPU core speed it will increment at a
715 * varying rate as render P-states change.
716 *
717 * The caller could also check if render P-states have changed (or anything
718 * else that might disrupt timing) by setting smear to 2 and checking if
719 * that field is != 0.
720 */
721 dst.set_smear(0);
722
723 *out_mov = mov;
724 return dst;
725 }
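/* Back-of-the-envelope check (illustrative): a 32-bit counter ticking at
 * roughly 1.2 GHz wraps after 2^32 / 1.2e9 ≈ 3.6 seconds, which is where
 * the "roll over every ~3 seconds" figure above comes from.
 */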
726
727 void
728 fs_visitor::emit_shader_time_begin()
729 {
730 current_annotation = "shader time start";
731 fs_inst *mov;
732 shader_start_time = get_timestamp(&mov);
733 emit(mov);
734 }
735
736 void
737 fs_visitor::emit_shader_time_end()
738 {
739 current_annotation = "shader time end";
740
741 enum shader_time_shader_type type, written_type, reset_type;
742 switch (stage) {
743 case MESA_SHADER_VERTEX:
744 type = ST_VS;
745 written_type = ST_VS_WRITTEN;
746 reset_type = ST_VS_RESET;
747 break;
748 case MESA_SHADER_GEOMETRY:
749 type = ST_GS;
750 written_type = ST_GS_WRITTEN;
751 reset_type = ST_GS_RESET;
752 break;
753 case MESA_SHADER_FRAGMENT:
754 if (dispatch_width == 8) {
755 type = ST_FS8;
756 written_type = ST_FS8_WRITTEN;
757 reset_type = ST_FS8_RESET;
758 } else {
759 assert(dispatch_width == 16);
760 type = ST_FS16;
761 written_type = ST_FS16_WRITTEN;
762 reset_type = ST_FS16_RESET;
763 }
764 break;
765 case MESA_SHADER_COMPUTE:
766 type = ST_CS;
767 written_type = ST_CS_WRITTEN;
768 reset_type = ST_CS_RESET;
769 break;
770 default:
771 unreachable("fs_visitor::emit_shader_time_end missing code");
772 }
773
774 /* Insert our code just before the final SEND with EOT. */
775 exec_node *end = this->instructions.get_tail();
776 assert(end && ((fs_inst *) end)->eot);
777
778 fs_inst *tm_read;
779 fs_reg shader_end_time = get_timestamp(&tm_read);
780 end->insert_before(tm_read);
781
782 /* Check that there weren't any timestamp reset events (assuming these
783 * were the only two timestamp reads that happened).
784 */
785 fs_reg reset = shader_end_time;
786 reset.set_smear(2);
787 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
788 test->conditional_mod = BRW_CONDITIONAL_Z;
789 test->force_writemask_all = true;
790 end->insert_before(test);
791 end->insert_before(IF(BRW_PREDICATE_NORMAL));
792
793 fs_reg start = shader_start_time;
794 start.negate = true;
795 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
796 diff.set_smear(0);
797 fs_inst *add = ADD(diff, start, shader_end_time);
798 add->force_writemask_all = true;
799 end->insert_before(add);
800
801 /* If there were no instructions between the two timestamp gets, the diff
802 * is 2 cycles. Remove that overhead, so I can forget about that when
803 * trying to determine the time taken for single instructions.
804 */
805 add = ADD(diff, diff, fs_reg(-2u));
806 add->force_writemask_all = true;
807 end->insert_before(add);
808
809 end->insert_before(SHADER_TIME_ADD(type, diff));
810 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
811 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
812 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
813 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
814 }
815
816 fs_inst *
817 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
818 {
819 int shader_time_index =
820 brw_get_shader_time_index(brw, shader_prog, prog, type);
821 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
822
823 fs_reg payload;
824 if (dispatch_width == 8)
825 payload = vgrf(glsl_type::uvec2_type);
826 else
827 payload = vgrf(glsl_type::uint_type);
828
829 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
830 fs_reg(), payload, offset, value);
831 }
832
833 void
834 fs_visitor::vfail(const char *format, va_list va)
835 {
836 char *msg;
837
838 if (failed)
839 return;
840
841 failed = true;
842
843 msg = ralloc_vasprintf(mem_ctx, format, va);
844 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
845
846 this->fail_msg = msg;
847
848 if (debug_enabled) {
849 fprintf(stderr, "%s", msg);
850 }
851 }
852
853 void
854 fs_visitor::fail(const char *format, ...)
855 {
856 va_list va;
857
858 va_start(va, format);
859 vfail(format, va);
860 va_end(va);
861 }
862
863 /**
864 * Mark this program as impossible to compile in SIMD16 mode.
865 *
866 * During the SIMD8 compile (which happens first), we can detect and flag
867 * things that are unsupported in SIMD16 mode, so the compiler can skip
868 * the SIMD16 compile altogether.
869 *
870 * During a SIMD16 compile (if one happens anyway), this just calls fail().
871 */
872 void
873 fs_visitor::no16(const char *format, ...)
874 {
875 va_list va;
876
877 va_start(va, format);
878
879 if (dispatch_width == 16) {
880 vfail(format, va);
881 } else {
882 simd16_unsupported = true;
883
884 if (brw->perf_debug) {
885 if (no16_msg)
886 ralloc_vasprintf_append(&no16_msg, format, va);
887 else
888 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
889 }
890 }
891
892 va_end(va);
893 }
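/* Usage sketch (the message text is hypothetical): during the SIMD8 compile
 * a pass can call
 *
 *    no16("SIMD16 unsupported for this texturing message\n");
 *
 * which sets simd16_unsupported so the SIMD16 compile is skipped entirely.
 */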
894
895 fs_inst *
896 fs_visitor::emit(enum opcode opcode)
897 {
898 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
899 }
900
901 fs_inst *
902 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
903 {
904 return emit(new(mem_ctx) fs_inst(opcode, dst));
905 }
906
907 fs_inst *
908 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
909 {
910 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
911 }
912
913 fs_inst *
914 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
915 const fs_reg &src1)
916 {
917 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
918 }
919
920 fs_inst *
921 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
922 const fs_reg &src1, const fs_reg &src2)
923 {
924 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
925 }
926
927 fs_inst *
928 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
929 fs_reg src[], int sources)
930 {
931 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
932 }
933
934 /**
935 * Returns true if the instruction has a flag that means it won't
936 * update an entire destination register.
937 *
938 * For example, dead code elimination and live variable analysis want to know
939 * when a write to a variable screens off any preceding values that were in
940 * it.
941 */
942 bool
943 fs_inst::is_partial_write() const
944 {
945 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
946 (this->dst.width * type_sz(this->dst.type)) < 32 ||
947 !this->dst.is_contiguous());
948 }
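/* Examples (illustrative): a predicated MOV (unlike a predicated SEL), a
 * destination narrower than one register (e.g. a width-4 float destination,
 * 4 * 4 = 16 bytes), or a strided destination all count as partial writes,
 * so earlier values in the register must be assumed to survive.
 */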
949
950 int
951 fs_inst::regs_read(int arg) const
952 {
953 if (is_tex() && arg == 0 && src[0].file == GRF) {
954 return mlen;
955 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
956 return mlen;
957 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
958 return mlen;
959 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
960 return mlen;
961 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
962 return mlen;
963 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
964 return mlen;
965 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
966 return mlen;
967 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
968 return mlen;
969 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
970 return mlen;
971 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
972 return mlen;
973 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
974 return exec_size / 4;
975 }
976
977 switch (src[arg].file) {
978 case BAD_FILE:
979 case UNIFORM:
980 case IMM:
981 return 1;
982 case GRF:
983 case HW_REG:
984 if (src[arg].stride == 0) {
985 return 1;
986 } else {
987 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
988 return (size + 31) / 32;
989 }
990 case MRF:
991 unreachable("MRF registers are not allowed as sources");
992 default:
993 unreachable("Invalid register file");
994 }
995 }
996
997 bool
998 fs_inst::reads_flag() const
999 {
1000 return predicate;
1001 }
1002
1003 bool
1004 fs_inst::writes_flag() const
1005 {
1006 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
1007 opcode != BRW_OPCODE_IF &&
1008 opcode != BRW_OPCODE_WHILE)) ||
1009 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
1010 }
1011
1012 /**
1013 * Returns how many MRFs an FS opcode will write over.
1014 *
1015 * Note that this is not the 0 or 1 implied writes in an actual gen
1016 * instruction -- the FS opcodes often generate MOVs in addition.
1017 */
1018 int
1019 fs_visitor::implied_mrf_writes(fs_inst *inst)
1020 {
1021 if (inst->mlen == 0)
1022 return 0;
1023
1024 if (inst->base_mrf == -1)
1025 return 0;
1026
1027 switch (inst->opcode) {
1028 case SHADER_OPCODE_RCP:
1029 case SHADER_OPCODE_RSQ:
1030 case SHADER_OPCODE_SQRT:
1031 case SHADER_OPCODE_EXP2:
1032 case SHADER_OPCODE_LOG2:
1033 case SHADER_OPCODE_SIN:
1034 case SHADER_OPCODE_COS:
1035 return 1 * dispatch_width / 8;
1036 case SHADER_OPCODE_POW:
1037 case SHADER_OPCODE_INT_QUOTIENT:
1038 case SHADER_OPCODE_INT_REMAINDER:
1039 return 2 * dispatch_width / 8;
1040 case SHADER_OPCODE_TEX:
1041 case FS_OPCODE_TXB:
1042 case SHADER_OPCODE_TXD:
1043 case SHADER_OPCODE_TXF:
1044 case SHADER_OPCODE_TXF_CMS:
1045 case SHADER_OPCODE_TXF_MCS:
1046 case SHADER_OPCODE_TG4:
1047 case SHADER_OPCODE_TG4_OFFSET:
1048 case SHADER_OPCODE_TXL:
1049 case SHADER_OPCODE_TXS:
1050 case SHADER_OPCODE_LOD:
1051 return 1;
1052 case FS_OPCODE_FB_WRITE:
1053 return 2;
1054 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1055 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1056 return 1;
1057 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1058 return inst->mlen;
1059 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1060 return 2;
1061 case SHADER_OPCODE_UNTYPED_ATOMIC:
1062 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1063 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1064 case SHADER_OPCODE_TYPED_ATOMIC:
1065 case SHADER_OPCODE_TYPED_SURFACE_READ:
1066 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1067 case SHADER_OPCODE_URB_WRITE_SIMD8:
1068 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1069 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1070 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1071 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1072 return 0;
1073 default:
1074 unreachable("not reached");
1075 }
1076 }
1077
1078 fs_reg
1079 fs_visitor::vgrf(const glsl_type *const type)
1080 {
1081 int reg_width = dispatch_width / 8;
1082 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1083 brw_type_for_base_type(type), dispatch_width);
1084 }
1085
1086 fs_reg
1087 fs_visitor::vgrf(int num_components)
1088 {
1089 int reg_width = dispatch_width / 8;
1090 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1091 BRW_REGISTER_TYPE_F, dispatch_width);
1092 }
1093
1094 /** Fixed HW reg constructor. */
1095 fs_reg::fs_reg(enum register_file file, int reg)
1096 {
1097 init();
1098 this->file = file;
1099 this->reg = reg;
1100 this->type = BRW_REGISTER_TYPE_F;
1101
1102 switch (file) {
1103 case UNIFORM:
1104 this->width = 1;
1105 break;
1106 default:
1107 this->width = 8;
1108 }
1109 }
1110
1111 /** Fixed HW reg constructor. */
1112 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1113 {
1114 init();
1115 this->file = file;
1116 this->reg = reg;
1117 this->type = type;
1118
1119 switch (file) {
1120 case UNIFORM:
1121 this->width = 1;
1122 break;
1123 default:
1124 this->width = 8;
1125 }
1126 }
1127
1128 /** Fixed HW reg constructor. */
1129 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1130 uint8_t width)
1131 {
1132 init();
1133 this->file = file;
1134 this->reg = reg;
1135 this->type = type;
1136 this->width = width;
1137 }
1138
1139 fs_reg *
1140 fs_visitor::variable_storage(ir_variable *var)
1141 {
1142 return (fs_reg *)hash_table_find(this->variable_ht, var);
1143 }
1144
1145 void
1146 import_uniforms_callback(const void *key,
1147 void *data,
1148 void *closure)
1149 {
1150 struct hash_table *dst_ht = (struct hash_table *)closure;
1151 const fs_reg *reg = (const fs_reg *)data;
1152
1153 if (reg->file != UNIFORM)
1154 return;
1155
1156 hash_table_insert(dst_ht, data, key);
1157 }
1158
1159 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1160  * This brings in those uniform definitions.
1161 */
1162 void
1163 fs_visitor::import_uniforms(fs_visitor *v)
1164 {
1165 hash_table_call_foreach(v->variable_ht,
1166 import_uniforms_callback,
1167 variable_ht);
1168 this->push_constant_loc = v->push_constant_loc;
1169 this->pull_constant_loc = v->pull_constant_loc;
1170 this->uniforms = v->uniforms;
1171 this->param_size = v->param_size;
1172 }
1173
1174 /* Our support for uniforms is piggy-backed on the struct
1175 * gl_fragment_program, because that's where the values actually
1176 * get stored, rather than in some global gl_shader_program uniform
1177 * store.
1178 */
1179 void
1180 fs_visitor::setup_uniform_values(ir_variable *ir)
1181 {
1182 int namelen = strlen(ir->name);
1183
1184 /* The data for our (non-builtin) uniforms is stored in a series of
1185 * gl_uniform_driver_storage structs for each subcomponent that
1186 * glGetUniformLocation() could name. We know it's been set up in the same
1187 * order we'd walk the type, so walk the list of storage and find anything
1188 * with our name, or the prefix of a component that starts with our name.
1189 */
1190 unsigned params_before = uniforms;
1191 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1192 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1193
1194 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1195 (storage->name[namelen] != 0 &&
1196 storage->name[namelen] != '.' &&
1197 storage->name[namelen] != '[')) {
1198 continue;
1199 }
1200
1201 unsigned slots = storage->type->component_slots();
1202 if (storage->array_elements)
1203 slots *= storage->array_elements;
1204
1205 for (unsigned i = 0; i < slots; i++) {
1206 stage_prog_data->param[uniforms++] = &storage->storage[i];
1207 }
1208 }
1209
1210 /* Make sure we actually initialized the right amount of stuff here. */
1211 assert(params_before + ir->type->component_slots() == uniforms);
1212 (void)params_before;
1213 }
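/* Example (illustrative): for "uniform vec4 a[2]" the storage entry named
 * "a" passes the prefix test above; component_slots() is 4 and
 * array_elements is 2, so 8 consecutive pointers into storage->storage[]
 * are appended to stage_prog_data->param.
 */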
1214
1215
1216 /* Our support for builtin uniforms is even scarier than non-builtin.
1217 * It sits on top of the PROG_STATE_VAR parameters that are
1218 * automatically updated from GL context state.
1219 */
1220 void
1221 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1222 {
1223 const ir_state_slot *const slots = ir->get_state_slots();
1224 assert(slots != NULL);
1225
1226 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1227 /* This state reference has already been setup by ir_to_mesa, but we'll
1228 * get the same index back here.
1229 */
1230 int index = _mesa_add_state_reference(this->prog->Parameters,
1231 (gl_state_index *)slots[i].tokens);
1232
1233 /* Add each of the unique swizzles of the element as a parameter.
1234 * This'll end up matching the expected layout of the
1235 * array/matrix/structure we're trying to fill in.
1236 */
1237 int last_swiz = -1;
1238 for (unsigned int j = 0; j < 4; j++) {
1239 int swiz = GET_SWZ(slots[i].swizzle, j);
1240 if (swiz == last_swiz)
1241 break;
1242 last_swiz = swiz;
1243
1244 stage_prog_data->param[uniforms++] =
1245 &prog->Parameters->ParameterValues[index][swiz];
1246 }
1247 }
1248 }
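/* Example (illustrative): a state slot swizzled XYZW contributes four
 * parameters, one per component, while a scalar slot swizzled XXXX stops
 * after the first component because the second swizzle repeats the previous
 * one (swiz == last_swiz).
 */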
1249
1250 fs_reg *
1251 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1252 bool origin_upper_left)
1253 {
1254 assert(stage == MESA_SHADER_FRAGMENT);
1255 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1256 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1257 fs_reg wpos = *reg;
1258 bool flip = !origin_upper_left ^ key->render_to_fbo;
1259
1260 /* gl_FragCoord.x */
1261 if (pixel_center_integer) {
1262 emit(MOV(wpos, this->pixel_x));
1263 } else {
1264 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1265 }
1266 wpos = offset(wpos, 1);
1267
1268 /* gl_FragCoord.y */
1269 if (!flip && pixel_center_integer) {
1270 emit(MOV(wpos, this->pixel_y));
1271 } else {
1272 fs_reg pixel_y = this->pixel_y;
1273 float offset = (pixel_center_integer ? 0.0 : 0.5);
1274
1275 if (flip) {
1276 pixel_y.negate = true;
1277 offset += key->drawable_height - 1.0;
1278 }
1279
1280 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1281 }
1282 wpos = offset(wpos, 1);
1283
1284 /* gl_FragCoord.z */
1285 if (devinfo->gen >= 6) {
1286 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1287 } else {
1288 emit(FS_OPCODE_LINTERP, wpos,
1289 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1290 interp_reg(VARYING_SLOT_POS, 2));
1291 }
1292 wpos = offset(wpos, 1);
1293
1294 /* gl_FragCoord.w: Already set up in emit_interpolation */
1295 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1296
1297 return reg;
1298 }
1299
1300 fs_inst *
1301 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1302 glsl_interp_qualifier interpolation_mode,
1303 bool is_centroid, bool is_sample)
1304 {
1305 brw_wm_barycentric_interp_mode barycoord_mode;
1306 if (devinfo->gen >= 6) {
1307 if (is_centroid) {
1308 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1309 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1310 else
1311 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1312 } else if (is_sample) {
1313 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1314 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1315 else
1316 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1317 } else {
1318 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1319 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1320 else
1321 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1322 }
1323 } else {
1324 /* On Ironlake and below, there is only one interpolation mode.
1325 * Centroid interpolation doesn't mean anything on this hardware --
1326 * there is no multisampling.
1327 */
1328 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1329 }
1330 return emit(FS_OPCODE_LINTERP, attr,
1331 this->delta_xy[barycoord_mode], interp);
1332 }
1333
1334 void
1335 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1336 const glsl_type *type,
1337 glsl_interp_qualifier interpolation_mode,
1338 int location, bool mod_centroid,
1339 bool mod_sample)
1340 {
1341 attr.type = brw_type_for_base_type(type->get_scalar_type());
1342
1343 assert(stage == MESA_SHADER_FRAGMENT);
1344 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1345 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1346
1347 unsigned int array_elements;
1348
1349 if (type->is_array()) {
1350 array_elements = type->length;
1351 if (array_elements == 0) {
1352 fail("dereferenced array '%s' has length 0\n", name);
1353 }
1354 type = type->fields.array;
1355 } else {
1356 array_elements = 1;
1357 }
1358
1359 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1360 bool is_gl_Color =
1361 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1362 if (key->flat_shade && is_gl_Color) {
1363 interpolation_mode = INTERP_QUALIFIER_FLAT;
1364 } else {
1365 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1366 }
1367 }
1368
1369 for (unsigned int i = 0; i < array_elements; i++) {
1370 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1371 if (prog_data->urb_setup[location] == -1) {
1372 /* If there's no incoming setup data for this slot, don't
1373 * emit interpolation for it.
1374 */
1375 attr = offset(attr, type->vector_elements);
1376 location++;
1377 continue;
1378 }
1379
1380 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1381 /* Constant interpolation (flat shading) case. The SF has
1382 * handed us defined values in only the constant offset
1383 * field of the setup reg.
1384 */
1385 for (unsigned int k = 0; k < type->vector_elements; k++) {
1386 struct brw_reg interp = interp_reg(location, k);
1387 interp = suboffset(interp, 3);
1388 interp.type = attr.type;
1389 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1390 attr = offset(attr, 1);
1391 }
1392 } else {
1393 /* Smooth/noperspective interpolation case. */
1394 for (unsigned int k = 0; k < type->vector_elements; k++) {
1395 struct brw_reg interp = interp_reg(location, k);
1396 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1397 /* Get the pixel/sample mask into f0 so that we know
1398 * which pixels are lit. Then, for each channel that is
1399 * unlit, replace the centroid data with non-centroid
1400 * data.
1401 */
1402 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1403
1404 fs_inst *inst;
1405 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1406 false, false);
1407 inst->predicate = BRW_PREDICATE_NORMAL;
1408 inst->predicate_inverse = true;
1409 if (devinfo->has_pln)
1410 inst->no_dd_clear = true;
1411
1412 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1413 mod_centroid && !key->persample_shading,
1414 mod_sample || key->persample_shading);
1415 inst->predicate = BRW_PREDICATE_NORMAL;
1416 inst->predicate_inverse = false;
1417 if (devinfo->has_pln)
1418 inst->no_dd_check = true;
1419
1420 } else {
1421 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1422 mod_centroid && !key->persample_shading,
1423 mod_sample || key->persample_shading);
1424 }
1425 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1426 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1427 }
1428 attr = offset(attr, 1);
1429 }
1430
1431 }
1432 location++;
1433 }
1434 }
1435 }
1436
1437 fs_reg *
1438 fs_visitor::emit_frontfacing_interpolation()
1439 {
1440 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1441
1442 if (devinfo->gen >= 6) {
1443 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1444 * a boolean result from this (~0/true or 0/false).
1445 *
1446 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1447 * this task in only one instruction:
1448 * - a negation source modifier will flip the bit; and
1449 * - a W -> D type conversion will sign extend the bit into the high
1450 * word of the destination.
1451 *
1452 * An ASR 15 fills the low word of the destination.
1453 */
1454 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1455 g0.negate = true;
1456
1457 emit(ASR(*reg, g0, fs_reg(15)));
1458 } else {
1459 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1460 * a boolean result from this (1/true or 0/false).
1461 *
1462 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1463 * the negation source modifier to flip it. Unfortunately the SHR
1464 * instruction only operates on UD (or D with an abs source modifier)
1465 * sources without negation.
1466 *
1467 * Instead, use ASR (which will give ~0/true or 0/false).
1468 */
1469 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1470 g1_6.negate = true;
1471
1472 emit(ASR(*reg, g1_6, fs_reg(31)));
1473 }
1474
1475 return reg;
1476 }
1477
1478 void
1479 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1480 {
1481 assert(stage == MESA_SHADER_FRAGMENT);
1482 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1483 assert(dst.type == BRW_REGISTER_TYPE_F);
1484
1485 if (key->compute_pos_offset) {
1486 /* Convert int_sample_pos to floating point */
1487 emit(MOV(dst, int_sample_pos));
1488 /* Scale to the range [0, 1] */
1489 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1490 }
1491 else {
1492 /* From ARB_sample_shading specification:
1493 * "When rendering to a non-multisample buffer, or if multisample
1494 * rasterization is disabled, gl_SamplePosition will always be
1495        *  (0.5, 0.5)."
1496 */
1497 emit(MOV(dst, fs_reg(0.5f)));
1498 }
1499 }
1500
1501 fs_reg *
1502 fs_visitor::emit_samplepos_setup()
1503 {
1504 assert(devinfo->gen >= 6);
1505
1506 this->current_annotation = "compute sample position";
1507 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1508 fs_reg pos = *reg;
1509 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1510 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1511
1512 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1513 * mode will be enabled.
1514 *
1515 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1516 * R31.1:0 Position Offset X/Y for Slot[3:0]
1517 * R31.3:2 Position Offset X/Y for Slot[7:4]
1518 * .....
1519 *
1520 * The X, Y sample positions come in as bytes in thread payload. So, read
1521 * the positions using vstride=16, width=8, hstride=2.
1522 */
1523 struct brw_reg sample_pos_reg =
1524 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1525 BRW_REGISTER_TYPE_B), 16, 8, 2);
1526
1527 if (dispatch_width == 8) {
1528 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1529 } else {
1530 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1531 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1532 ->force_sechalf = true;
1533 }
1534 /* Compute gl_SamplePosition.x */
1535 compute_sample_position(pos, int_sample_x);
1536 pos = offset(pos, 1);
1537 if (dispatch_width == 8) {
1538 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1539 } else {
1540 emit(MOV(half(int_sample_y, 0),
1541 fs_reg(suboffset(sample_pos_reg, 1))));
1542 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1543 ->force_sechalf = true;
1544 }
1545 /* Compute gl_SamplePosition.y */
1546 compute_sample_position(pos, int_sample_y);
1547 return reg;
1548 }
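/* Layout sketch (illustrative): the payload register holds interleaved byte
 * pairs [x0 y0 x1 y1 ... x15 y15].  The <16;8,2>:B region above reads every
 * other byte, so starting at byte 0 it yields x0..x7 and starting at byte 1
 * (suboffset 1) it yields y0..y7; the SIMD16 second halves start at bytes 16
 * and 17.
 */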
1549
1550 fs_reg *
1551 fs_visitor::emit_sampleid_setup()
1552 {
1553 assert(stage == MESA_SHADER_FRAGMENT);
1554 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1555 assert(devinfo->gen >= 6);
1556
1557 this->current_annotation = "compute sample id";
1558 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1559
1560 if (key->compute_sample_id) {
1561 fs_reg t1 = vgrf(glsl_type::int_type);
1562 fs_reg t2 = vgrf(glsl_type::int_type);
1563 t2.type = BRW_REGISTER_TYPE_UW;
1564
1565 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1566 * 8x multisampling, subspan 0 will represent sample N (where N
1567 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1568 * 7. We can find the value of N by looking at R0.0 bits 7:6
1569 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1570 * (since samples are always delivered in pairs). That is, we
1571 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1572 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1573 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1574 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1575 * populating a temporary variable with the sequence (0, 1, 2, 3),
1576 * and then reading from it using vstride=1, width=4, hstride=0.
1577 * These computations hold good for 4x multisampling as well.
1578 *
1579 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1580 * the first four slots are sample 0 of subspan 0; the next four
1581 * are sample 1 of subspan 0; the third group is sample 0 of
1582 * subspan 1, and finally sample 1 of subspan 1.
1583 */
1584 fs_inst *inst;
1585 inst = emit(BRW_OPCODE_AND, t1,
1586 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1587 fs_reg(0xc0));
1588 inst->force_writemask_all = true;
1589 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1590 inst->force_writemask_all = true;
1591 /* This works for both SIMD8 and SIMD16 */
1592 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1593 inst->force_writemask_all = true;
1594 /* This special instruction takes care of setting vstride=1,
1595 * width=4, hstride=0 of t2 during an ADD instruction.
1596 */
1597 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1598 } else {
1599 /* As per GL_ARB_sample_shading specification:
1600 * "When rendering to a non-multisample buffer, or if multisample
1601 * rasterization is disabled, gl_SampleID will always be zero."
1602 */
1603 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1604 }
1605
1606 return reg;
1607 }
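/* Worked example (illustrative): with 8x MSAA and SSPI == 1 in R0.0 bits
 * 7:6, t1 = (R0.0 & 0xc0) >> 5 = 2, and adding the repeating (0, 1, 2, 3)
 * vector held in t2 gives per-channel sample IDs 2,2,2,2,3,3,3,3 in SIMD8
 * (continuing with 4,4,4,4,5,5,5,5 in SIMD16).
 */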
1608
1609 void
1610 fs_visitor::resolve_source_modifiers(fs_reg *src)
1611 {
1612 if (!src->abs && !src->negate)
1613 return;
1614
1615 fs_reg temp = retype(vgrf(1), src->type);
1616 emit(MOV(temp, *src));
1617 *src = temp;
1618 }
1619
1620 fs_reg
1621 fs_visitor::fix_math_operand(fs_reg src)
1622 {
1623 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1624 * might be able to do better by doing execsize = 1 math and then
1625 * expanding that result out, but we would need to be careful with
1626 * masking.
1627 *
1628 * The hardware ignores source modifiers (negate and abs) on math
1629 * instructions, so we also move to a temp to set those up.
1630 */
1631 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1632 !src.abs && !src.negate)
1633 return src;
1634
1635 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1636 * operands to math
1637 */
1638 if (devinfo->gen >= 7 && src.file != IMM)
1639 return src;
1640
1641 fs_reg expanded = vgrf(glsl_type::float_type);
1642 expanded.type = src.type;
1643 emit(BRW_OPCODE_MOV, expanded, src);
1644 return expanded;
1645 }
1646
1647 fs_inst *
1648 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1649 {
1650 switch (opcode) {
1651 case SHADER_OPCODE_RCP:
1652 case SHADER_OPCODE_RSQ:
1653 case SHADER_OPCODE_SQRT:
1654 case SHADER_OPCODE_EXP2:
1655 case SHADER_OPCODE_LOG2:
1656 case SHADER_OPCODE_SIN:
1657 case SHADER_OPCODE_COS:
1658 break;
1659 default:
1660 unreachable("not reached: bad math opcode");
1661 }
1662
1663 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1664 * might be able to do better by doing execsize = 1 math and then
1665 * expanding that result out, but we would need to be careful with
1666 * masking.
1667 *
1668 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1669 * instructions, so we also move to a temp to set those up.
1670 */
1671 if (devinfo->gen == 6 || devinfo->gen == 7)
1672 src = fix_math_operand(src);
1673
1674 fs_inst *inst = emit(opcode, dst, src);
1675
1676 if (devinfo->gen < 6) {
1677 inst->base_mrf = 2;
1678 inst->mlen = dispatch_width / 8;
1679 }
1680
1681 return inst;
1682 }
1683
1684 fs_inst *
1685 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1686 {
1687 int base_mrf = 2;
1688 fs_inst *inst;
1689
1690 if (devinfo->gen >= 8) {
1691 inst = emit(opcode, dst, src0, src1);
1692 } else if (devinfo->gen >= 6) {
1693 src0 = fix_math_operand(src0);
1694 src1 = fix_math_operand(src1);
1695
1696 inst = emit(opcode, dst, src0, src1);
1697 } else {
1698 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1699 * "Message Payload":
1700 *
1701 * "Operand0[7]. For the INT DIV functions, this operand is the
1702 * denominator."
1703 * ...
1704 * "Operand1[7]. For the INT DIV functions, this operand is the
1705 * numerator."
1706 */
1707 bool is_int_div = opcode != SHADER_OPCODE_POW;
1708 fs_reg &op0 = is_int_div ? src1 : src0;
1709 fs_reg &op1 = is_int_div ? src0 : src1;
1710
1711 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1712 inst = emit(opcode, dst, op0, reg_null_f);
1713
1714 inst->base_mrf = base_mrf;
1715 inst->mlen = 2 * dispatch_width / 8;
1716 }
1717 return inst;
1718 }
1719
1720 void
1721 fs_visitor::emit_discard_jump()
1722 {
1723 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1724
1725 /* For performance, after a discard, jump to the end of the
1726 * shader if all relevant channels have been discarded.
1727 */
1728 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1729 discard_jump->flag_subreg = 1;
1730
1731 discard_jump->predicate = (dispatch_width == 8)
1732 ? BRW_PREDICATE_ALIGN1_ANY8H
1733 : BRW_PREDICATE_ALIGN1_ANY16H;
1734 discard_jump->predicate_inverse = true;
1735 }
1736
1737 void
1738 fs_visitor::assign_curb_setup()
1739 {
1740 if (dispatch_width == 8) {
1741 prog_data->dispatch_grf_start_reg = payload.num_regs;
1742 } else {
1743 if (stage == MESA_SHADER_FRAGMENT) {
1744 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1745 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1746 } else if (stage == MESA_SHADER_COMPUTE) {
1747 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1748 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1749 } else {
1750 unreachable("Unsupported shader type!");
1751 }
1752 }
1753
1754 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1755
1756 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1757 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1758 for (unsigned int i = 0; i < inst->sources; i++) {
1759 if (inst->src[i].file == UNIFORM) {
1760 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1761 int constant_nr;
1762 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1763 constant_nr = push_constant_loc[uniform_nr];
1764 } else {
1765 /* Section 5.11 of the OpenGL 4.1 spec says:
1766 * "Out-of-bounds reads return undefined values, which include
1767 * values from other variables of the active program or zero."
1768 * Just return the first push constant.
1769 */
1770 constant_nr = 0;
1771 }
1772
1773 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1774 constant_nr / 8,
1775 constant_nr % 8);
1776
1777 inst->src[i].file = HW_REG;
1778 inst->src[i].fixed_hw_reg = byte_offset(
1779 retype(brw_reg, inst->src[i].type),
1780 inst->src[i].subreg_offset);
1781 }
1782 }
1783 }
1784 }
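/* Worked example (illustrative): with payload.num_regs == 2 and
 * push_constant_loc mapping a uniform to constant_nr == 13, the UNIFORM
 * source is rewritten to the fixed register g3.5 (2 + 13 / 8 = 3,
 * 13 % 8 = 5).
 */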
1785
1786 void
1787 fs_visitor::calculate_urb_setup()
1788 {
1789 assert(stage == MESA_SHADER_FRAGMENT);
1790 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1791 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1792
1793 memset(prog_data->urb_setup, -1,
1794 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1795
1796 int urb_next = 0;
1797 /* Figure out where each of the incoming setup attributes lands. */
1798 if (devinfo->gen >= 6) {
1799 if (_mesa_bitcount_64(prog->InputsRead &
1800 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1801 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1802 * first 16 varying inputs, so we can put them wherever we want.
1803 * Just put them in order.
1804 *
1805 * This is useful because it means that (a) inputs not used by the
1806 * fragment shader won't take up valuable register space, and (b) we
1807 * won't have to recompile the fragment shader if it gets paired with
1808 * a different vertex (or geometry) shader.
1809 */
1810 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1811 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1812 BITFIELD64_BIT(i)) {
1813 prog_data->urb_setup[i] = urb_next++;
1814 }
1815 }
1816 } else {
1817 /* We have enough input varyings that the SF/SBE pipeline stage can't
1818 * arbitrarily rearrange them to suit our whim; we have to put them
1819 * in an order that matches the output of the previous pipeline stage
1820 * (geometry or vertex shader).
1821 */
1822 struct brw_vue_map prev_stage_vue_map;
1823 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1824 key->input_slots_valid);
1825 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1826 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1827 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1828 slot++) {
1829 int varying = prev_stage_vue_map.slot_to_varying[slot];
1830 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1831 * unused.
1832 */
1833 if (varying != BRW_VARYING_SLOT_COUNT &&
1834 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1835 BITFIELD64_BIT(varying))) {
1836 prog_data->urb_setup[varying] = slot - first_slot;
1837 }
1838 }
1839 urb_next = prev_stage_vue_map.num_slots - first_slot;
1840 }
1841 } else {
1842 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1843 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1844 /* Point size is packed into the header, not as a general attribute */
1845 if (i == VARYING_SLOT_PSIZ)
1846 continue;
1847
1848 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1849 /* The back color slot is skipped when the front color is
1850 * also written to. In addition, some slots can be
1851 * written in the vertex shader and not read in the
1852 * fragment shader. So the register number must always be
1853 * incremented, mapped or not.
1854 */
1855 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1856 prog_data->urb_setup[i] = urb_next;
1857 urb_next++;
1858 }
1859 }
1860
1861 /*
1862    * It's an FS-only attribute, and we did interpolation for this attribute
1863    * in the SF thread.  So, count it here, too.
1864 *
1865 * See compile_sf_prog() for more info.
1866 */
1867 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1868 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1869 }
1870
1871 prog_data->num_varying_inputs = urb_next;
1872 }
1873
1874 void
1875 fs_visitor::assign_urb_setup()
1876 {
1877 assert(stage == MESA_SHADER_FRAGMENT);
1878 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1879
1880 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1881
1882 /* Offset all the urb_setup[] indices by the actual position of the
1883 * setup regs, now that the location of the constants has been chosen.
1884 */
1885 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1886 if (inst->opcode == FS_OPCODE_LINTERP) {
1887 assert(inst->src[1].file == HW_REG);
1888 inst->src[1].fixed_hw_reg.nr += urb_start;
1889 }
1890
1891 if (inst->opcode == FS_OPCODE_CINTERP) {
1892 assert(inst->src[0].file == HW_REG);
1893 inst->src[0].fixed_hw_reg.nr += urb_start;
1894 }
1895 }
1896
1897 /* Each attribute is 4 setup channels, each of which is half a reg. */
1898 this->first_non_payload_grf =
1899 urb_start + prog_data->num_varying_inputs * 2;
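/* Worked example (hypothetical numbers): with urb_start == 6 and three
 * varying inputs, the setup data occupies six registers (two per input)
 * and first_non_payload_grf becomes 6 + 3 * 2 == 12.
 */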
1900 }
1901
1902 void
1903 fs_visitor::assign_vs_urb_setup()
1904 {
1905 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1906 int grf, count, slot, channel, attr;
1907
1908 assert(stage == MESA_SHADER_VERTEX);
1909 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1910 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1911 count++;
1912
1913 /* Each attribute is 4 regs. */
1914 this->first_non_payload_grf =
1915 payload.num_regs + prog_data->curb_read_length + count * 4;
1916
1917 unsigned vue_entries =
1918 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1919
1920 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1921 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1922
1923 assert(vs_prog_data->base.urb_read_length <= 15);
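/* Worked example (hypothetical counts): with count == 5 input attributes,
 * urb_read_length is (5 + 1) / 2 == 3; and if the VUE map has 7 slots,
 * vue_entries == 7 and urb_entry_size == ALIGN(7, 4) / 4 == 2.
 */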
1924
1925 /* Rewrite all ATTR file references to the hw grf that they land in. */
1926 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1927 for (int i = 0; i < inst->sources; i++) {
1928 if (inst->src[i].file == ATTR) {
1929
1930 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1931 slot = count - 1;
1932 } else {
1933 /* Attributes arrive in a contiguous block, ordered by their
1934 * gl_vert_attrib value. That means we can compute the slot
1935 * number for an attribute by counting the enabled
1936 * attributes below it.
1937 */
1938 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1939 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1940 BITFIELD64_MASK(attr));
1941 }
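/* Worked example (hypothetical mask): if inputs_read has bits 0, 3 and 5
 * set, a source reading attribute 5 gets slot == 2, because two enabled
 * attributes (0 and 3) precede it in the contiguous payload block.
 */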
1942
1943 channel = inst->src[i].reg_offset & 3;
1944
1945 grf = payload.num_regs +
1946 prog_data->curb_read_length +
1947 slot * 4 + channel;
1948
1949 inst->src[i].file = HW_REG;
1950 inst->src[i].fixed_hw_reg =
1951 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1952 }
1953 }
1954 }
1955 }
1956
1957 /**
1958 * Split large virtual GRFs into separate components if we can.
1959 *
1960 * This mostly duplicates what brw_fs_vector_splitting does,
1961 * but that's really conservative because it's afraid of doing
1962 * splitting that doesn't result in real progress after the rest of
1963 * the optimization phases, which would cause infinite looping in
1964 * optimization. We can do it once here, safely. This also has the
1965 * opportunity to split interpolated values, or maybe even uniforms,
1966 * which we don't have at the IR level.
1967 *
1968 * We want to split, because virtual GRFs are what we register
1969 * allocate and spill (due to contiguousness requirements for some
1970 * instructions), and they're what we naturally generate in the
1971 * codegen process, but most virtual GRFs don't actually need to be
1972 * contiguous sets of GRFs. If we split, we'll end up with reduced
1973 * live intervals and better dead code elimination and coalescing.
1974 */
1975 void
1976 fs_visitor::split_virtual_grfs()
1977 {
1978 int num_vars = this->alloc.count;
1979
1980 /* Count the total number of registers */
1981 int reg_count = 0;
1982 int vgrf_to_reg[num_vars];
1983 for (int i = 0; i < num_vars; i++) {
1984 vgrf_to_reg[i] = reg_count;
1985 reg_count += alloc.sizes[i];
1986 }
1987
1988 /* An array of "split points". For each register slot, this indicates
1989 * if this slot can be separated from the previous slot. Every time an
1990 * instruction uses multiple elements of a register (as a source or
1991 * destination), we mark the used slots as inseparable. Then we go
1992 * through and split the registers into the smallest pieces we can.
1993 */
1994 bool split_points[reg_count];
1995 memset(split_points, 0, sizeof(split_points));
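/* Illustrative sketch (hypothetical VGRF, not part of the pass): for a
 * 4-register VGRF that is only ever accessed one register at a time
 * except for a single 2-register write at offset 2, the loops below
 * first mark slots 1..3 as splittable, then clear the split point at
 * slot 3 (the second half of that write), so the VGRF ends up split
 * into the pieces {0}, {1} and {2, 3}.
 */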
1996
1997 /* Mark all used registers as fully splittable */
1998 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1999 if (inst->dst.file == GRF) {
2000 int reg = vgrf_to_reg[inst->dst.reg];
2001 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
2002 split_points[reg + j] = true;
2003 }
2004
2005 for (int i = 0; i < inst->sources; i++) {
2006 if (inst->src[i].file == GRF) {
2007 int reg = vgrf_to_reg[inst->src[i].reg];
2008 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
2009 split_points[reg + j] = true;
2010 }
2011 }
2012 }
2013
2014 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2015 if (inst->dst.file == GRF) {
2016 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2017 for (int j = 1; j < inst->regs_written; j++)
2018 split_points[reg + j] = false;
2019 }
2020 for (int i = 0; i < inst->sources; i++) {
2021 if (inst->src[i].file == GRF) {
2022 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2023 for (int j = 1; j < inst->regs_read(i); j++)
2024 split_points[reg + j] = false;
2025 }
2026 }
2027 }
2028
2029 int new_virtual_grf[reg_count];
2030 int new_reg_offset[reg_count];
2031
2032 int reg = 0;
2033 for (int i = 0; i < num_vars; i++) {
2034 /* The first one should always be 0 as a quick sanity check. */
2035 assert(split_points[reg] == false);
2036
2037 /* j = 0 case */
2038 new_reg_offset[reg] = 0;
2039 reg++;
2040 int offset = 1;
2041
2042 /* j > 0 case */
2043 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2044 /* If this is a split point, reset the offset to 0 and allocate a
2045 * new virtual GRF for the preceding `offset` registers.
2046 */
2047 if (split_points[reg]) {
2048 assert(offset <= MAX_VGRF_SIZE);
2049 int grf = alloc.allocate(offset);
2050 for (int k = reg - offset; k < reg; k++)
2051 new_virtual_grf[k] = grf;
2052 offset = 0;
2053 }
2054 new_reg_offset[reg] = offset;
2055 offset++;
2056 reg++;
2057 }
2058
2059 /* The last one gets the original register number */
2060 assert(offset <= MAX_VGRF_SIZE);
2061 alloc.sizes[i] = offset;
2062 for (int k = reg - offset; k < reg; k++)
2063 new_virtual_grf[k] = i;
2064 }
2065 assert(reg == reg_count);
2066
2067 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2068 if (inst->dst.file == GRF) {
2069 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2070 inst->dst.reg = new_virtual_grf[reg];
2071 inst->dst.reg_offset = new_reg_offset[reg];
2072 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2073 }
2074 for (int i = 0; i < inst->sources; i++) {
2075 if (inst->src[i].file == GRF) {
2076 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2077 inst->src[i].reg = new_virtual_grf[reg];
2078 inst->src[i].reg_offset = new_reg_offset[reg];
2079 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2080 }
2081 }
2082 }
2083 invalidate_live_intervals();
2084 }
2085
2086 /**
2087 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2088 *
2089 * During code generation, we create tons of temporary variables, many of
2090 * which get immediately killed and are never used again. Yet, in later
2091 * optimization and analysis passes, such as compute_live_intervals, we need
2092 * to loop over all the virtual GRFs. Compacting them can save a lot of
2093 * overhead.
2094 */
2095 bool
2096 fs_visitor::compact_virtual_grfs()
2097 {
2098 bool progress = false;
2099 int remap_table[this->alloc.count];
2100 memset(remap_table, -1, sizeof(remap_table));
2101
2102 /* Mark which virtual GRFs are used. */
2103 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2104 if (inst->dst.file == GRF)
2105 remap_table[inst->dst.reg] = 0;
2106
2107 for (int i = 0; i < inst->sources; i++) {
2108 if (inst->src[i].file == GRF)
2109 remap_table[inst->src[i].reg] = 0;
2110 }
2111 }
2112
2113 /* Compact the GRF arrays. */
2114 int new_index = 0;
2115 for (unsigned i = 0; i < this->alloc.count; i++) {
2116 if (remap_table[i] == -1) {
2117 /* We just found an unused register. This means that we are
2118 * actually going to compact something.
2119 */
2120 progress = true;
2121 } else {
2122 remap_table[i] = new_index;
2123 alloc.sizes[new_index] = alloc.sizes[i];
2124 invalidate_live_intervals();
2125 ++new_index;
2126 }
2127 }
2128
2129 this->alloc.count = new_index;
2130
2131 /* Patch all the instructions to use the newly renumbered registers */
2132 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2133 if (inst->dst.file == GRF)
2134 inst->dst.reg = remap_table[inst->dst.reg];
2135
2136 for (int i = 0; i < inst->sources; i++) {
2137 if (inst->src[i].file == GRF)
2138 inst->src[i].reg = remap_table[inst->src[i].reg];
2139 }
2140 }
2141
2142 /* Patch all the references to delta_xy, since they're used in register
2143 * allocation. If they're unused, switch them to BAD_FILE so we don't
2144 * think some random VGRF is delta_xy.
2145 */
2146 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2147 if (delta_xy[i].file == GRF) {
2148 if (remap_table[delta_xy[i].reg] != -1) {
2149 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2150 } else {
2151 delta_xy[i].file = BAD_FILE;
2152 }
2153 }
2154 }
2155
2156 return progress;
2157 }
2158
2159 /*
2160 * Implements array access of uniforms by inserting a
2161 * PULL_CONSTANT_LOAD instruction.
2162 *
2163 * Unlike temporary GRF array access (which we don't support, due to
2164 * the difficulty of doing relative addressing on instruction
2165 * destinations), we could potentially do array access of uniforms
2166 * that were loaded in GRF space as push constants. In real-world
2167 * usage we've seen, though, the arrays being used are always larger
2168 * than we could load as push constants, so just always move all
2169 * uniform array access out to a pull constant buffer.
2170 */
2171 void
2172 fs_visitor::move_uniform_array_access_to_pull_constants()
2173 {
2174 if (dispatch_width != 8)
2175 return;
2176
2177 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2178 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2179
2180 /* Walk through and find array access of uniforms. Put a copy of that
2181 * uniform in the pull constant buffer.
2182 *
2183 * Note that we don't move constant-indexed accesses to arrays. No
2184 * testing has been done of the performance impact of this choice.
2185 */
2186 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2187 for (int i = 0 ; i < inst->sources; i++) {
2188 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2189 continue;
2190
2191 int uniform = inst->src[i].reg;
2192
2193 /* If this array isn't already present in the pull constant buffer,
2194 * add it.
2195 */
2196 if (pull_constant_loc[uniform] == -1) {
2197 const gl_constant_value **values = &stage_prog_data->param[uniform];
2198
2199 assert(param_size[uniform]);
2200
2201 for (int j = 0; j < param_size[uniform]; j++) {
2202 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2203
2204 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2205 values[j];
2206 }
2207 }
2208 }
2209 }
2210 }
2211
2212 /**
2213 * Assign UNIFORM file registers to either push constants or pull constants.
2214 *
2215 * We allow a fragment shader to have more than the specified minimum
2216 * maximum number of fragment shader uniform components (64). If
2217 * there are too many of these, they'd fill up all of the register space.
2218 * So, this will push some of them out to the pull constant buffer and
2219 * update the program to load them.
2220 */
2221 void
2222 fs_visitor::assign_constant_locations()
2223 {
2224 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2225 if (dispatch_width != 8)
2226 return;
2227
2228 /* Find which UNIFORM registers are still in use. */
2229 bool is_live[uniforms];
2230 for (unsigned int i = 0; i < uniforms; i++) {
2231 is_live[i] = false;
2232 }
2233
2234 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2235 for (int i = 0; i < inst->sources; i++) {
2236 if (inst->src[i].file != UNIFORM)
2237 continue;
2238
2239 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2240 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2241 is_live[constant_nr] = true;
2242 }
2243 }
2244
2245 /* Only allow 16 registers (128 uniform components) as push constants.
2246 *
2247 * Just demote the end of the list. We could probably do better
2248 * here, demoting things that are rarely used in the program first.
2249 *
2250 * If changing this value, note the limitation about total_regs in
2251 * brw_curbe.c.
2252 */
2253 unsigned int max_push_components = 16 * 8;
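/* In other words, 16 GRFs * 8 dwords per 32-byte GRF == 128 scalar
 * uniform components retained as push constants; everything beyond
 * that is demoted to pull constants below.
 */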
2254 unsigned int num_push_constants = 0;
2255
2256 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2257
2258 for (unsigned int i = 0; i < uniforms; i++) {
2259 if (!is_live[i] || pull_constant_loc[i] != -1) {
2260 /* This UNIFORM register is either dead, or has already been demoted
2261 * to a pull const. Mark it as no longer living in the param[] array.
2262 */
2263 push_constant_loc[i] = -1;
2264 continue;
2265 }
2266
2267 if (num_push_constants < max_push_components) {
2268 /* Retain as a push constant. Record the location in the param[]
2269 * array.
2270 */
2271 push_constant_loc[i] = num_push_constants++;
2272 } else {
2273 /* Demote to a pull constant. */
2274 push_constant_loc[i] = -1;
2275
2276 int pull_index = stage_prog_data->nr_pull_params++;
2277 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2278 pull_constant_loc[i] = pull_index;
2279 }
2280 }
2281
2282 stage_prog_data->nr_params = num_push_constants;
2283
2284 /* Up until now, the param[] array has been indexed by reg + reg_offset
2285 * of UNIFORM registers. Condense it to only contain the uniforms we
2286 * chose to upload as push constants.
2287 */
2288 for (unsigned int i = 0; i < uniforms; i++) {
2289 int remapped = push_constant_loc[i];
2290
2291 if (remapped == -1)
2292 continue;
2293
2294 assert(remapped <= (int)i);
2295 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2296 }
2297 }
2298
2299 /**
2300 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2301 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2302 */
2303 void
2304 fs_visitor::demote_pull_constants()
2305 {
2306 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2307 for (int i = 0; i < inst->sources; i++) {
2308 if (inst->src[i].file != UNIFORM)
2309 continue;
2310
2311 int pull_index;
2312 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2313 if (location >= uniforms) /* Out of bounds access */
2314 pull_index = -1;
2315 else
2316 pull_index = pull_constant_loc[location];
2317
2318 if (pull_index == -1)
2319 continue;
2320
2321 /* Set up the annotation tracking for newly generated instructions. */
2322 base_ir = inst->ir;
2323 current_annotation = inst->annotation;
2324
2325 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2326 fs_reg dst = vgrf(glsl_type::float_type);
2327
2328 /* Generate a pull load into dst. */
2329 if (inst->src[i].reladdr) {
2330 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2331 surf_index,
2332 *inst->src[i].reladdr,
2333 pull_index);
2334 inst->insert_before(block, &list);
2335 inst->src[i].reladdr = NULL;
2336 } else {
2337 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2338 fs_inst *pull =
2339 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2340 dst, surf_index, offset);
2341 inst->insert_before(block, pull);
2342 inst->src[i].set_smear(pull_index & 3);
2343 }
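/* Worked example (hypothetical index): for pull_index == 6 the constant
 * lives at byte offset 24, so the load above fetches the enclosing vec4
 * at offset 16 and set_smear(6 & 3) selects the third dword of it.
 */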
2344
2345 /* Rewrite the instruction to use the temporary VGRF. */
2346 inst->src[i].file = GRF;
2347 inst->src[i].reg = dst.reg;
2348 inst->src[i].reg_offset = 0;
2349 inst->src[i].width = dispatch_width;
2350 }
2351 }
2352 invalidate_live_intervals();
2353 }
2354
2355 bool
2356 fs_visitor::opt_algebraic()
2357 {
2358 bool progress = false;
2359
2360 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2361 switch (inst->opcode) {
2362 case BRW_OPCODE_MOV:
2363 if (inst->src[0].file != IMM)
2364 break;
2365
2366 if (inst->saturate) {
2367 if (inst->dst.type != inst->src[0].type)
2368 assert(!"unimplemented: saturate mixed types");
2369
2370 if (brw_saturate_immediate(inst->dst.type,
2371 &inst->src[0].fixed_hw_reg)) {
2372 inst->saturate = false;
2373 progress = true;
2374 }
2375 }
2376 break;
2377
2378 case BRW_OPCODE_MUL:
2379 if (inst->src[1].file != IMM)
2380 continue;
2381
2382 /* a * 1.0 = a */
2383 if (inst->src[1].is_one()) {
2384 inst->opcode = BRW_OPCODE_MOV;
2385 inst->src[1] = reg_undef;
2386 progress = true;
2387 break;
2388 }
2389
2390 /* a * -1.0 = -a */
2391 if (inst->src[1].is_negative_one()) {
2392 inst->opcode = BRW_OPCODE_MOV;
2393 inst->src[0].negate = !inst->src[0].negate;
2394 inst->src[1] = reg_undef;
2395 progress = true;
2396 break;
2397 }
2398
2399 /* a * 0.0 = 0.0 */
2400 if (inst->src[1].is_zero()) {
2401 inst->opcode = BRW_OPCODE_MOV;
2402 inst->src[0] = inst->src[1];
2403 inst->src[1] = reg_undef;
2404 progress = true;
2405 break;
2406 }
2407
2408 if (inst->src[0].file == IMM) {
2409 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2410 inst->opcode = BRW_OPCODE_MOV;
2411 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2412 inst->src[1] = reg_undef;
2413 progress = true;
2414 break;
2415 }
2416 break;
2417 case BRW_OPCODE_ADD:
2418 if (inst->src[1].file != IMM)
2419 continue;
2420
2421 /* a + 0.0 = a */
2422 if (inst->src[1].is_zero()) {
2423 inst->opcode = BRW_OPCODE_MOV;
2424 inst->src[1] = reg_undef;
2425 progress = true;
2426 break;
2427 }
2428
2429 if (inst->src[0].file == IMM) {
2430 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2431 inst->opcode = BRW_OPCODE_MOV;
2432 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2433 inst->src[1] = reg_undef;
2434 progress = true;
2435 break;
2436 }
2437 break;
2438 case BRW_OPCODE_OR:
2439 if (inst->src[0].equals(inst->src[1])) {
2440 inst->opcode = BRW_OPCODE_MOV;
2441 inst->src[1] = reg_undef;
2442 progress = true;
2443 break;
2444 }
2445 break;
2446 case BRW_OPCODE_LRP:
2447 if (inst->src[1].equals(inst->src[2])) {
2448 inst->opcode = BRW_OPCODE_MOV;
2449 inst->src[0] = inst->src[1];
2450 inst->src[1] = reg_undef;
2451 inst->src[2] = reg_undef;
2452 progress = true;
2453 break;
2454 }
2455 break;
2456 case BRW_OPCODE_CMP:
2457 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2458 inst->src[0].abs &&
2459 inst->src[0].negate &&
2460 inst->src[1].is_zero()) {
2461 inst->src[0].abs = false;
2462 inst->src[0].negate = false;
2463 inst->conditional_mod = BRW_CONDITIONAL_Z;
2464 progress = true;
2465 break;
2466 }
2467 break;
2468 case BRW_OPCODE_SEL:
2469 if (inst->src[0].equals(inst->src[1])) {
2470 inst->opcode = BRW_OPCODE_MOV;
2471 inst->src[1] = reg_undef;
2472 inst->predicate = BRW_PREDICATE_NONE;
2473 inst->predicate_inverse = false;
2474 progress = true;
2475 } else if (inst->saturate && inst->src[1].file == IMM) {
2476 switch (inst->conditional_mod) {
2477 case BRW_CONDITIONAL_LE:
2478 case BRW_CONDITIONAL_L:
2479 switch (inst->src[1].type) {
2480 case BRW_REGISTER_TYPE_F:
2481 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2482 inst->opcode = BRW_OPCODE_MOV;
2483 inst->src[1] = reg_undef;
2484 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2485 progress = true;
2486 }
2487 break;
2488 default:
2489 break;
2490 }
2491 break;
2492 case BRW_CONDITIONAL_GE:
2493 case BRW_CONDITIONAL_G:
2494 switch (inst->src[1].type) {
2495 case BRW_REGISTER_TYPE_F:
2496 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2497 inst->opcode = BRW_OPCODE_MOV;
2498 inst->src[1] = reg_undef;
2499 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2500 progress = true;
2501 }
2502 break;
2503 default:
2504 break;
2505 }
2506 default:
2507 break;
2508 }
2509 }
2510 break;
2511 case BRW_OPCODE_MAD:
2512 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2513 inst->opcode = BRW_OPCODE_MOV;
2514 inst->src[1] = reg_undef;
2515 inst->src[2] = reg_undef;
2516 progress = true;
2517 } else if (inst->src[0].is_zero()) {
2518 inst->opcode = BRW_OPCODE_MUL;
2519 inst->src[0] = inst->src[2];
2520 inst->src[2] = reg_undef;
2521 progress = true;
2522 } else if (inst->src[1].is_one()) {
2523 inst->opcode = BRW_OPCODE_ADD;
2524 inst->src[1] = inst->src[2];
2525 inst->src[2] = reg_undef;
2526 progress = true;
2527 } else if (inst->src[2].is_one()) {
2528 inst->opcode = BRW_OPCODE_ADD;
2529 inst->src[2] = reg_undef;
2530 progress = true;
2531 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2532 inst->opcode = BRW_OPCODE_ADD;
2533 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2534 inst->src[2] = reg_undef;
2535 progress = true;
2536 }
2537 break;
2538 case SHADER_OPCODE_RCP: {
2539 fs_inst *prev = (fs_inst *)inst->prev;
2540 if (prev->opcode == SHADER_OPCODE_SQRT) {
2541 if (inst->src[0].equals(prev->dst)) {
2542 inst->opcode = SHADER_OPCODE_RSQ;
2543 inst->src[0] = prev->src[0];
2544 progress = true;
2545 }
2546 }
2547 break;
2548 }
2549 default:
2550 break;
2551 }
2552
2553 /* If src[0] is an immediate, swap it into src[1] (the op is commutative). */
2554 if (progress && inst->is_commutative()) {
2555 if (inst->src[0].file == IMM) {
2556 fs_reg tmp = inst->src[1];
2557 inst->src[1] = inst->src[0];
2558 inst->src[0] = tmp;
2559 }
2560 }
2561 }
2562 return progress;
2563 }
2564
2565 /**
2566 * Optimize sample messages that have constant zero values for the trailing
2567 * texture coordinates. We can just reduce the message length for these
2568 * instructions instead of reserving a register for it. Trailing parameters
2569 * that aren't sent default to zero anyway. This will cause the dead code
2570 * eliminator to remove the MOV instruction that would otherwise be emitted to
2571 * set up the zero value.
2572 */
2573 bool
2574 fs_visitor::opt_zero_samples()
2575 {
2576 /* Gen4 infers the texturing opcode based on the message length so we can't
2577 * change it.
2578 */
2579 if (devinfo->gen < 5)
2580 return false;
2581
2582 bool progress = false;
2583
2584 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2585 if (!inst->is_tex())
2586 continue;
2587
2588 fs_inst *load_payload = (fs_inst *) inst->prev;
2589
2590 if (load_payload->is_head_sentinel() ||
2591 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2592 continue;
2593
2594 /* We don't want to remove the message header. Removing all of the
2595 * parameters is avoided because it seems to cause a GPU hang, though I
2596 * can't find any documentation indicating that this is expected.
2597 */
2598 while (inst->mlen > inst->header_present + dispatch_width / 8 &&
2599 load_payload->src[(inst->mlen - inst->header_present) /
2600 (dispatch_width / 8) +
2601 inst->header_present - 1].is_zero()) {
2602 inst->mlen -= dispatch_width / 8;
2603 progress = true;
2604 }
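/* Illustration (hypothetical message): in SIMD16 with a header, a sample
 * message with four parameters has mlen == 1 + 4 * 2 == 9; if the last
 * parameter in the LOAD_PAYLOAD is an immediate zero, the loop above
 * drops mlen to 7 and that trailing parameter is simply never sent.
 */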
2605 }
2606
2607 if (progress)
2608 invalidate_live_intervals();
2609
2610 return progress;
2611 }
2612
2613 /**
2614 * Optimize sample messages which are followed by the final RT write.
2615 *
2616 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2617 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2618 * final texturing results copied to the framebuffer write payload and modify
2619 * them to write to the framebuffer directly.
2620 */
2621 bool
2622 fs_visitor::opt_sampler_eot()
2623 {
2624 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2625
2626 if (stage != MESA_SHADER_FRAGMENT)
2627 return false;
2628
2629 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2630 return false;
2631
2632 /* FINISHME: It should be possible to implement this optimization when there
2633 * are multiple drawbuffers.
2634 */
2635 if (key->nr_color_regions != 1)
2636 return false;
2637
2638 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2639 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2640 assert(fb_write->eot);
2641 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2642
2643 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2644
2645 /* There wasn't one; nothing to do. */
2646 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2647 return false;
2648
2649 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2650 * It's very likely to be the previous instruction.
2651 */
2652 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2653 if (load_payload->is_head_sentinel() ||
2654 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2655 return false;
2656
2657 assert(!tex_inst->eot); /* We can't get here twice */
2658 assert((tex_inst->offset & (0xff << 24)) == 0);
2659
2660 tex_inst->offset |= fb_write->target << 24;
2661 tex_inst->eot = true;
2662 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2663
2664 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2665 * to create a new LOAD_PAYLOAD command with the same sources and a space
2666 * saved for the header. Using a new destination register not only makes sure
2667 * we have enough space, but also ensures that the dead code eliminator kills
2668 * the instruction that this one replaces.
2669 */
2670 if (tex_inst->header_present)
2671 return true;
2672
2673 fs_reg send_header = vgrf(load_payload->sources + 1);
2674 fs_reg *new_sources =
2675 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2676
2677 new_sources[0] = fs_reg();
2678 for (int i = 0; i < load_payload->sources; i++)
2679 new_sources[i+1] = load_payload->src[i];
2680
2681 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2682 * requires a lot of information about the sources to appropriately figure
2683 * out the number of registers that need to be used. Given this stage in our
2684 * optimization, we may not have the appropriate GRFs required by
2685 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2686 * manually emit the instruction.
2687 */
2688 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2689 load_payload->exec_size,
2690 send_header,
2691 new_sources,
2692 load_payload->sources + 1);
2693
2694 new_load_payload->regs_written = load_payload->regs_written + 1;
2695 tex_inst->mlen++;
2696 tex_inst->header_present = true;
2697 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2698 tex_inst->src[0] = send_header;
2699 tex_inst->dst = reg_null_ud;
2700
2701 return true;
2702 }
2703
2704 bool
2705 fs_visitor::opt_register_renaming()
2706 {
2707 bool progress = false;
2708 int depth = 0;
2709
2710 int remap[alloc.count];
2711 memset(remap, -1, sizeof(int) * alloc.count);
2712
2713 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2714 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2715 depth++;
2716 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2717 inst->opcode == BRW_OPCODE_WHILE) {
2718 depth--;
2719 }
2720
2721 /* Rewrite instruction sources. */
2722 for (int i = 0; i < inst->sources; i++) {
2723 if (inst->src[i].file == GRF &&
2724 remap[inst->src[i].reg] != -1 &&
2725 remap[inst->src[i].reg] != inst->src[i].reg) {
2726 inst->src[i].reg = remap[inst->src[i].reg];
2727 progress = true;
2728 }
2729 }
2730
2731 const int dst = inst->dst.reg;
2732
2733 if (depth == 0 &&
2734 inst->dst.file == GRF &&
2735 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2736 !inst->is_partial_write()) {
2737 if (remap[dst] == -1) {
2738 remap[dst] = dst;
2739 } else {
2740 remap[dst] = alloc.allocate(inst->dst.width / 8);
2741 inst->dst.reg = remap[dst];
2742 progress = true;
2743 }
2744 } else if (inst->dst.file == GRF &&
2745 remap[dst] != -1 &&
2746 remap[dst] != dst) {
2747 inst->dst.reg = remap[dst];
2748 progress = true;
2749 }
2750 }
2751
2752 if (progress) {
2753 invalidate_live_intervals();
2754
2755 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2756 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2757 delta_xy[i].reg = remap[delta_xy[i].reg];
2758 }
2759 }
2760 }
2761
2762 return progress;
2763 }
2764
2765 /**
2766 * Remove redundant or useless discard jumps.
2767 *
2768 * For example, we can eliminate jumps in the following sequence:
2769 *
2770 * discard-jump (redundant with the next jump)
2771 * discard-jump (useless; jumps to the next instruction)
2772 * placeholder-halt
2773 */
2774 bool
2775 fs_visitor::opt_redundant_discard_jumps()
2776 {
2777 bool progress = false;
2778
2779 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2780
2781 fs_inst *placeholder_halt = NULL;
2782 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2783 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2784 placeholder_halt = inst;
2785 break;
2786 }
2787 }
2788
2789 if (!placeholder_halt)
2790 return false;
2791
2792 /* Delete any discard jumps immediately before the placeholder halt. */
2793 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2794 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2795 prev = (fs_inst *) placeholder_halt->prev) {
2796 prev->remove(last_bblock);
2797 progress = true;
2798 }
2799
2800 if (progress)
2801 invalidate_live_intervals();
2802
2803 return progress;
2804 }
2805
2806 bool
2807 fs_visitor::compute_to_mrf()
2808 {
2809 bool progress = false;
2810 int next_ip = 0;
2811
2812 /* No MRFs on Gen >= 7. */
2813 if (devinfo->gen >= 7)
2814 return false;
2815
2816 calculate_live_intervals();
2817
2818 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2819 int ip = next_ip;
2820 next_ip++;
2821
2822 if (inst->opcode != BRW_OPCODE_MOV ||
2823 inst->is_partial_write() ||
2824 inst->dst.file != MRF || inst->src[0].file != GRF ||
2825 inst->dst.type != inst->src[0].type ||
2826 inst->src[0].abs || inst->src[0].negate ||
2827 !inst->src[0].is_contiguous() ||
2828 inst->src[0].subreg_offset)
2829 continue;
2830
2831 /* Work out which hardware MRF registers are written by this
2832 * instruction.
2833 */
2834 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2835 int mrf_high;
2836 if (inst->dst.reg & BRW_MRF_COMPR4) {
2837 mrf_high = mrf_low + 4;
2838 } else if (inst->exec_size == 16) {
2839 mrf_high = mrf_low + 1;
2840 } else {
2841 mrf_high = mrf_low;
2842 }
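/* Worked example (hypothetical destinations): a SIMD16 MOV to m2 also
 * writes m3, so mrf_high == 3; a COMPR4 write to m2 writes m2 and m6,
 * so mrf_high is taken as 6 to record the second half.
 */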
2843
2844 /* Can't compute-to-MRF this GRF if someone else was going to
2845 * read it later.
2846 */
2847 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2848 continue;
2849
2850 /* Found a move of a GRF to an MRF. Let's see if we can rewrite
2851 * the instruction that produced this GRF to write into the MRF instead.
2852 */
2853 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2854 if (scan_inst->dst.file == GRF &&
2855 scan_inst->dst.reg == inst->src[0].reg) {
2856 /* Found the last instruction to write the register we want to turn
2857 * into a compute-to-MRF.
2858 */
2859
2860 /* If this one instruction didn't populate all the
2861 * channels, bail. We might be able to rewrite everything
2862 * that writes that reg, but it would require smarter
2863 * tracking to delay the rewriting until complete success.
2864 */
2865 if (scan_inst->is_partial_write())
2866 break;
2867
2868 /* Things returning more than one register would need us to
2869 * understand coalescing out more than one MOV at a time.
2870 */
2871 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2872 break;
2873
2874 /* SEND instructions can't have an MRF as a destination. */
2875 if (scan_inst->mlen)
2876 break;
2877
2878 if (devinfo->gen == 6) {
2879 /* gen6 math instructions must have the destination be a
2880 * GRF, so no compute-to-MRF for them.
2881 */
2882 if (scan_inst->is_math()) {
2883 break;
2884 }
2885 }
2886
2887 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2888 /* Found the creator of our MRF's source value. */
2889 scan_inst->dst.file = MRF;
2890 scan_inst->dst.reg = inst->dst.reg;
2891 scan_inst->saturate |= inst->saturate;
2892 inst->remove(block);
2893 progress = true;
2894 }
2895 break;
2896 }
2897
2898 /* We don't handle control flow here. Most values that end
2899 * up in MRFs are computed shortly before the MRF write
2900 * anyway.
2901 */
2902 if (block->start() == scan_inst)
2903 break;
2904
2905 /* You can't read from an MRF, so if someone else reads our
2906 * MRF's source GRF that we wanted to rewrite, that stops us.
2907 */
2908 bool interfered = false;
2909 for (int i = 0; i < scan_inst->sources; i++) {
2910 if (scan_inst->src[i].file == GRF &&
2911 scan_inst->src[i].reg == inst->src[0].reg &&
2912 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2913 interfered = true;
2914 }
2915 }
2916 if (interfered)
2917 break;
2918
2919 if (scan_inst->dst.file == MRF) {
2920 /* If somebody else writes our MRF here, we can't
2921 * compute-to-MRF before that.
2922 */
2923 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2924 int scan_mrf_high;
2925
2926 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2927 scan_mrf_high = scan_mrf_low + 4;
2928 } else if (scan_inst->exec_size == 16) {
2929 scan_mrf_high = scan_mrf_low + 1;
2930 } else {
2931 scan_mrf_high = scan_mrf_low;
2932 }
2933
2934 if (mrf_low == scan_mrf_low ||
2935 mrf_low == scan_mrf_high ||
2936 mrf_high == scan_mrf_low ||
2937 mrf_high == scan_mrf_high) {
2938 break;
2939 }
2940 }
2941
2942 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2943 /* Found a SEND instruction, which means that there are
2944 * live values in MRFs from base_mrf to base_mrf +
2945 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2946 * above it.
2947 */
2948 if (mrf_low >= scan_inst->base_mrf &&
2949 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2950 break;
2951 }
2952 if (mrf_high >= scan_inst->base_mrf &&
2953 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2954 break;
2955 }
2956 }
2957 }
2958 }
2959
2960 if (progress)
2961 invalidate_live_intervals();
2962
2963 return progress;
2964 }
2965
2966 /**
2967 * Emit a simple shader for the replicated-data clear: copy the clear color
2968 * from uniform 0 into a message register and send FS_OPCODE_REP_FB_WRITEs.
2969 */
2970 void
2971 fs_visitor::emit_repclear_shader()
2972 {
2973 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2974 int base_mrf = 1;
2975 int color_mrf = base_mrf + 2;
2976
2977 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2978 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2979 mov->force_writemask_all = true;
2980
2981 fs_inst *write;
2982 if (key->nr_color_regions == 1) {
2983 write = emit(FS_OPCODE_REP_FB_WRITE);
2984 write->saturate = key->clamp_fragment_color;
2985 write->base_mrf = color_mrf;
2986 write->target = 0;
2987 write->header_present = false;
2988 write->mlen = 1;
2989 } else {
2990 assume(key->nr_color_regions > 0);
2991 for (int i = 0; i < key->nr_color_regions; ++i) {
2992 write = emit(FS_OPCODE_REP_FB_WRITE);
2993 write->saturate = key->clamp_fragment_color;
2994 write->base_mrf = base_mrf;
2995 write->target = i;
2996 write->header_present = true;
2997 write->mlen = 3;
2998 }
2999 }
3000 write->eot = true;
3001
3002 calculate_cfg();
3003
3004 assign_constant_locations();
3005 assign_curb_setup();
3006
3007 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
3008 assert(mov->src[0].file == HW_REG);
3009 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
3010 }
3011
3012 /**
3013 * Walks through basic blocks, looking for repeated MRF writes and
3014 * removing the later ones.
3015 */
3016 bool
3017 fs_visitor::remove_duplicate_mrf_writes()
3018 {
3019 fs_inst *last_mrf_move[16];
3020 bool progress = false;
3021
3022 /* We'd need to update the MRF tracking for compressed instructions, so skip SIMD16. */
3023 if (dispatch_width == 16)
3024 return false;
3025
3026 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3027
3028 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3029 if (inst->is_control_flow()) {
3030 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3031 }
3032
3033 if (inst->opcode == BRW_OPCODE_MOV &&
3034 inst->dst.file == MRF) {
3035 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3036 if (prev_inst && inst->equals(prev_inst)) {
3037 inst->remove(block);
3038 progress = true;
3039 continue;
3040 }
3041 }
3042
3043 /* Clear out the last-write records for MRFs that were overwritten. */
3044 if (inst->dst.file == MRF) {
3045 last_mrf_move[inst->dst.reg] = NULL;
3046 }
3047
3048 if (inst->mlen > 0 && inst->base_mrf != -1) {
3049 /* Found a SEND instruction, which will include two or fewer
3050 * implied MRF writes. We could do better here.
3051 */
3052 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3053 last_mrf_move[inst->base_mrf + i] = NULL;
3054 }
3055 }
3056
3057 /* Clear out any MRF move records whose sources got overwritten. */
3058 if (inst->dst.file == GRF) {
3059 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3060 if (last_mrf_move[i] &&
3061 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3062 last_mrf_move[i] = NULL;
3063 }
3064 }
3065 }
3066
3067 if (inst->opcode == BRW_OPCODE_MOV &&
3068 inst->dst.file == MRF &&
3069 inst->src[0].file == GRF &&
3070 !inst->is_partial_write()) {
3071 last_mrf_move[inst->dst.reg] = inst;
3072 }
3073 }
3074
3075 if (progress)
3076 invalidate_live_intervals();
3077
3078 return progress;
3079 }
3080
3081 static void
3082 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3083 {
3084 /* Clear the flag for registers that actually got read (as expected). */
3085 for (int i = 0; i < inst->sources; i++) {
3086 int grf;
3087 if (inst->src[i].file == GRF) {
3088 grf = inst->src[i].reg;
3089 } else if (inst->src[i].file == HW_REG &&
3090 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3091 grf = inst->src[i].fixed_hw_reg.nr;
3092 } else {
3093 continue;
3094 }
3095
3096 if (grf >= first_grf &&
3097 grf < first_grf + grf_len) {
3098 deps[grf - first_grf] = false;
3099 if (inst->exec_size == 16)
3100 deps[grf - first_grf + 1] = false;
3101 }
3102 }
3103 }
3104
3105 /**
3106 * Implements this workaround for the original 965:
3107 *
3108 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3109 * check for post destination dependencies on this instruction, software
3110 * must ensure that there is no destination hazard for the case of ‘write
3111 * followed by a posted write’ shown in the following example.
3112 *
3113 * 1. mov r3 0
3114 * 2. send r3.xy <rest of send instruction>
3115 * 3. mov r2 r3
3116 *
3117 * Due to no post-destination dependency check on the ‘send’, the above
3118 * code sequence could have two instructions (1 and 2) in flight at the
3119 * same time that both consider ‘r3’ as the target of their final writes.
3120 */
3121 void
3122 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3123 fs_inst *inst)
3124 {
3125 int write_len = inst->regs_written;
3126 int first_write_grf = inst->dst.reg;
3127 bool needs_dep[BRW_MAX_MRF];
3128 assert(write_len < (int)sizeof(needs_dep) - 1);
3129
3130 memset(needs_dep, false, sizeof(needs_dep));
3131 memset(needs_dep, true, write_len);
3132
3133 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3134
3135 /* Walk backwards looking for writes to registers we're writing which
3136 * aren't read since being written. If we hit the start of the program,
3137 * we assume that there are no outstanding dependencies on entry to the
3138 * program.
3139 */
3140 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3141 /* If we hit control flow, assume that there *are* outstanding
3142 * dependencies, and force their cleanup before our instruction.
3143 */
3144 if (block->start() == scan_inst) {
3145 for (int i = 0; i < write_len; i++) {
3146 if (needs_dep[i]) {
3147 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3148 }
3149 }
3150 return;
3151 }
3152
3153 /* We insert our reads as late as possible on the assumption that any
3154 * instruction (other than a MOV) that might have left us an outstanding
3155 * dependency has more latency than a MOV.
3156 */
3157 if (scan_inst->dst.file == GRF) {
3158 for (int i = 0; i < scan_inst->regs_written; i++) {
3159 int reg = scan_inst->dst.reg + i;
3160
3161 if (reg >= first_write_grf &&
3162 reg < first_write_grf + write_len &&
3163 needs_dep[reg - first_write_grf]) {
3164 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3165 needs_dep[reg - first_write_grf] = false;
3166 if (scan_inst->exec_size == 16)
3167 needs_dep[reg - first_write_grf + 1] = false;
3168 }
3169 }
3170 }
3171
3172 /* Clear the flag for registers that actually got read (as expected). */
3173 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3174
3175 /* Continue the loop only if we haven't resolved all the dependencies */
3176 int i;
3177 for (i = 0; i < write_len; i++) {
3178 if (needs_dep[i])
3179 break;
3180 }
3181 if (i == write_len)
3182 return;
3183 }
3184 }
3185
3186 /**
3187 * Implements this workaround for the original 965:
3188 *
3189 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3190 * used as a destination register until after it has been sourced by an
3191 * instruction with a different destination register.
3192 */
3193 void
3194 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3195 {
3196 int write_len = inst->regs_written;
3197 int first_write_grf = inst->dst.reg;
3198 bool needs_dep[BRW_MAX_MRF];
3199 assert(write_len < (int)sizeof(needs_dep) - 1);
3200
3201 memset(needs_dep, false, sizeof(needs_dep));
3202 memset(needs_dep, true, write_len);
3203 /* Walk forwards looking for writes to registers we're writing which aren't
3204 * read before being written.
3205 */
3206 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3207 /* If we hit control flow, force resolve all remaining dependencies. */
3208 if (block->end() == scan_inst) {
3209 for (int i = 0; i < write_len; i++) {
3210 if (needs_dep[i])
3211 scan_inst->insert_before(block,
3212 DEP_RESOLVE_MOV(first_write_grf + i));
3213 }
3214 return;
3215 }
3216
3217 /* Clear the flag for registers that actually got read (as expected). */
3218 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3219
3220 /* We insert our reads as late as possible since they're reading the
3221 * result of a SEND, which has massive latency.
3222 */
3223 if (scan_inst->dst.file == GRF &&
3224 scan_inst->dst.reg >= first_write_grf &&
3225 scan_inst->dst.reg < first_write_grf + write_len &&
3226 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3227 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3228 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3229 }
3230
3231 /* Continue the loop only if we haven't resolved all the dependencies */
3232 int i;
3233 for (i = 0; i < write_len; i++) {
3234 if (needs_dep[i])
3235 break;
3236 }
3237 if (i == write_len)
3238 return;
3239 }
3240 }
3241
3242 void
3243 fs_visitor::insert_gen4_send_dependency_workarounds()
3244 {
3245 if (devinfo->gen != 4 || devinfo->is_g4x)
3246 return;
3247
3248 bool progress = false;
3249
3250 /* Note that we're done with register allocation, so GRF fs_regs always
3251 * have a .reg_offset of 0.
3252 */
3253
3254 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3255 if (inst->mlen != 0 && inst->dst.file == GRF) {
3256 insert_gen4_pre_send_dependency_workarounds(block, inst);
3257 insert_gen4_post_send_dependency_workarounds(block, inst);
3258 progress = true;
3259 }
3260 }
3261
3262 if (progress)
3263 invalidate_live_intervals();
3264 }
3265
3266 /**
3267 * Turns the generic expression-style uniform pull constant load instruction
3268 * into a hardware-specific series of instructions for loading a pull
3269 * constant.
3270 *
3271 * The expression style allows the CSE pass before this to optimize out
3272 * repeated loads from the same offset, and gives the pre-register-allocation
3273 * scheduling full flexibility, while the conversion to native instructions
3274 * allows the post-register-allocation scheduler the best information
3275 * possible.
3276 *
3277 * Note that execution masking for setting up pull constant loads is special:
3278 * the channels that need to be written are unrelated to the current execution
3279 * mask, since a later instruction will use one of the result channels as a
3280 * source operand for all 8 or 16 of its channels.
3281 */
3282 void
3283 fs_visitor::lower_uniform_pull_constant_loads()
3284 {
3285 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3286 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3287 continue;
3288
3289 if (devinfo->gen >= 7) {
3290 /* So far the offset arg has been a vec4-aligned byte offset. We need
3291 * to turn it into a dword offset.
3292 */
3293 fs_reg const_offset_reg = inst->src[1];
3294 assert(const_offset_reg.file == IMM &&
3295 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3296 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3297 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3298
3299 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3300 * Reserve space for the register.
3301 */
3302 if (devinfo->gen >= 9) {
3303 payload.reg_offset++;
3304 alloc.sizes[payload.reg] = 2;
3305 }
3306
3307 /* This is actually going to be a MOV, but since only the first dword
3308 * is accessed, we have a special opcode to do just that one. Note
3309 * that this needs to be an operation that will be considered a def
3310 * by live variable analysis, or register allocation will explode.
3311 */
3312 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3313 8, payload, const_offset_reg);
3314 setup->force_writemask_all = true;
3315
3316 setup->ir = inst->ir;
3317 setup->annotation = inst->annotation;
3318 inst->insert_before(block, setup);
3319
3320 /* Similarly, this will only populate the first 4 channels of the
3321 * result register (since we only use smear values from 0-3), but we
3322 * don't tell the optimizer.
3323 */
3324 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3325 inst->src[1] = payload;
3326
3327 invalidate_live_intervals();
3328 } else {
3329 /* Before register allocation, we didn't tell the scheduler about the
3330 * MRF we use. We know it's safe to use this MRF because nothing
3331 * else does except for register spill/unspill, which generates and
3332 * uses its MRF within a single IR instruction.
3333 */
3334 inst->base_mrf = 14;
3335 inst->mlen = 1;
3336 }
3337 }
3338 }
3339
3340 bool
3341 fs_visitor::lower_load_payload()
3342 {
3343 bool progress = false;
3344
3345 int vgrf_to_reg[alloc.count];
3346 int reg_count = 0;
3347 for (unsigned i = 0; i < alloc.count; ++i) {
3348 vgrf_to_reg[i] = reg_count;
3349 reg_count += alloc.sizes[i];
3350 }
3351
3352 struct {
3353 bool written:1; /* Whether this register has ever been written */
3354 bool force_writemask_all:1;
3355 bool force_sechalf:1;
3356 } metadata[reg_count];
3357 memset(metadata, 0, sizeof(metadata));
3358
3359 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3360 if (inst->dst.file == GRF) {
3361 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3362 bool force_sechalf = inst->force_sechalf &&
3363 !inst->force_writemask_all;
3364 bool toggle_sechalf = inst->dst.width == 16 &&
3365 type_sz(inst->dst.type) == 4 &&
3366 !inst->force_writemask_all;
3367 for (int i = 0; i < inst->regs_written; ++i) {
3368 metadata[dst_reg + i].written = true;
3369 metadata[dst_reg + i].force_sechalf = force_sechalf;
3370 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3371 force_sechalf = (toggle_sechalf != force_sechalf);
3372 }
3373 }
3374
3375 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3376 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3377 fs_reg dst = inst->dst;
3378
3379 for (int i = 0; i < inst->sources; i++) {
3380 dst.width = inst->src[i].effective_width;
3381 dst.type = inst->src[i].type;
3382
3383 if (inst->src[i].file == BAD_FILE) {
3384 /* Do nothing but otherwise increment as normal */
3385 } else if (dst.file == MRF &&
3386 dst.width == 8 &&
3387 devinfo->has_compr4 &&
3388 i + 4 < inst->sources &&
3389 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3390 fs_reg compr4_dst = dst;
3391 compr4_dst.reg += BRW_MRF_COMPR4;
3392 compr4_dst.width = 16;
3393 fs_reg compr4_src = inst->src[i];
3394 compr4_src.width = 16;
3395 fs_inst *mov = MOV(compr4_dst, compr4_src);
3396 mov->force_writemask_all = true;
3397 inst->insert_before(block, mov);
3398 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3399 inst->src[i + 4].file = BAD_FILE;
3400 } else {
3401 fs_inst *mov = MOV(dst, inst->src[i]);
3402 if (inst->src[i].file == GRF) {
3403 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3404 inst->src[i].reg_offset;
3405 mov->force_sechalf = metadata[src_reg].force_sechalf;
3406 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3407 } else {
3408 /* We don't have any useful metadata for immediates or
3409 * uniforms. Assume that any of the channels of the
3410 * destination may be used.
3411 */
3412 assert(inst->src[i].file == IMM ||
3413 inst->src[i].file == UNIFORM);
3414 mov->force_writemask_all = true;
3415 }
3416
3417 if (dst.file == GRF) {
3418 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3419 const bool force_writemask = mov->force_writemask_all;
3420 metadata[dst_reg].force_writemask_all = force_writemask;
3421 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3422 if (dst.width * type_sz(dst.type) > 32) {
3423 assert(!mov->force_sechalf);
3424 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3425 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3426 }
3427 }
3428
3429 inst->insert_before(block, mov);
3430 }
3431
3432 dst = offset(dst, 1);
3433 }
3434
3435 inst->remove(block);
3436 progress = true;
3437 }
3438 }
3439
3440 if (progress)
3441 invalidate_live_intervals();
3442
3443 return progress;
3444 }
3445
3446 void
3447 fs_visitor::dump_instructions()
3448 {
3449 dump_instructions(NULL);
3450 }
3451
3452 void
3453 fs_visitor::dump_instructions(const char *name)
3454 {
3455 FILE *file = stderr;
3456 if (name && geteuid() != 0) {
3457 file = fopen(name, "w");
3458 if (!file)
3459 file = stderr;
3460 }
3461
3462 if (cfg) {
3463 calculate_register_pressure();
3464 int ip = 0, max_pressure = 0;
3465 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3466 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3467 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3468 dump_instruction(inst, file);
3469 ip++;
3470 }
3471 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3472 } else {
3473 int ip = 0;
3474 foreach_in_list(backend_instruction, inst, &instructions) {
3475 fprintf(file, "%4d: ", ip++);
3476 dump_instruction(inst, file);
3477 }
3478 }
3479
3480 if (file != stderr) {
3481 fclose(file);
3482 }
3483 }
3484
3485 void
3486 fs_visitor::dump_instruction(backend_instruction *be_inst)
3487 {
3488 dump_instruction(be_inst, stderr);
3489 }
3490
3491 void
3492 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3493 {
3494 fs_inst *inst = (fs_inst *)be_inst;
3495
3496 if (inst->predicate) {
3497 fprintf(file, "(%cf0.%d) ",
3498 inst->predicate_inverse ? '-' : '+',
3499 inst->flag_subreg);
3500 }
3501
3502 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3503 if (inst->saturate)
3504 fprintf(file, ".sat");
3505 if (inst->conditional_mod) {
3506 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3507 if (!inst->predicate &&
3508 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3509 inst->opcode != BRW_OPCODE_IF &&
3510 inst->opcode != BRW_OPCODE_WHILE))) {
3511 fprintf(file, ".f0.%d", inst->flag_subreg);
3512 }
3513 }
3514 fprintf(file, "(%d) ", inst->exec_size);
3515
3516
3517 switch (inst->dst.file) {
3518 case GRF:
3519 fprintf(file, "vgrf%d", inst->dst.reg);
3520 if (inst->dst.width != dispatch_width)
3521 fprintf(file, "@%d", inst->dst.width);
3522 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3523 inst->dst.subreg_offset)
3524 fprintf(file, "+%d.%d",
3525 inst->dst.reg_offset, inst->dst.subreg_offset);
3526 break;
3527 case MRF:
3528 fprintf(file, "m%d", inst->dst.reg);
3529 break;
3530 case BAD_FILE:
3531 fprintf(file, "(null)");
3532 break;
3533 case UNIFORM:
3534 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3535 break;
3536 case ATTR:
3537 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3538 break;
3539 case HW_REG:
3540 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3541 switch (inst->dst.fixed_hw_reg.nr) {
3542 case BRW_ARF_NULL:
3543 fprintf(file, "null");
3544 break;
3545 case BRW_ARF_ADDRESS:
3546 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3547 break;
3548 case BRW_ARF_ACCUMULATOR:
3549 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3550 break;
3551 case BRW_ARF_FLAG:
3552 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3553 inst->dst.fixed_hw_reg.subnr);
3554 break;
3555 default:
3556 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3557 inst->dst.fixed_hw_reg.subnr);
3558 break;
3559 }
3560 } else {
3561 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3562 }
3563 if (inst->dst.fixed_hw_reg.subnr)
3564 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3565 break;
3566 default:
3567 fprintf(file, "???");
3568 break;
3569 }
3570 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3571
3572 for (int i = 0; i < inst->sources; i++) {
3573 if (inst->src[i].negate)
3574 fprintf(file, "-");
3575 if (inst->src[i].abs)
3576 fprintf(file, "|");
3577 switch (inst->src[i].file) {
3578 case GRF:
3579 fprintf(file, "vgrf%d", inst->src[i].reg);
3580 if (inst->src[i].width != dispatch_width)
3581 fprintf(file, "@%d", inst->src[i].width);
3582 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3583 inst->src[i].subreg_offset)
3584 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3585 inst->src[i].subreg_offset);
3586 break;
3587 case MRF:
3588 fprintf(file, "***m%d***", inst->src[i].reg);
3589 break;
3590 case ATTR:
3591 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3592 break;
3593 case UNIFORM:
3594 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3595 if (inst->src[i].reladdr) {
3596 fprintf(file, "+reladdr");
3597 } else if (inst->src[i].subreg_offset) {
3598 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3599 inst->src[i].subreg_offset);
3600 }
3601 break;
3602 case BAD_FILE:
3603 fprintf(file, "(null)");
3604 break;
3605 case IMM:
3606 switch (inst->src[i].type) {
3607 case BRW_REGISTER_TYPE_F:
3608 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3609 break;
3610 case BRW_REGISTER_TYPE_W:
3611 case BRW_REGISTER_TYPE_D:
3612 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3613 break;
3614 case BRW_REGISTER_TYPE_UW:
3615 case BRW_REGISTER_TYPE_UD:
3616 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3617 break;
3618 case BRW_REGISTER_TYPE_VF:
3619 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3620 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3621 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3622 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3623 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3624 break;
3625 default:
3626 fprintf(file, "???");
3627 break;
3628 }
3629 break;
3630 case HW_REG:
3631 if (inst->src[i].fixed_hw_reg.negate)
3632 fprintf(file, "-");
3633 if (inst->src[i].fixed_hw_reg.abs)
3634 fprintf(file, "|");
3635 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3636 switch (inst->src[i].fixed_hw_reg.nr) {
3637 case BRW_ARF_NULL:
3638 fprintf(file, "null");
3639 break;
3640 case BRW_ARF_ADDRESS:
3641 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3642 break;
3643 case BRW_ARF_ACCUMULATOR:
3644 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3645 break;
3646 case BRW_ARF_FLAG:
3647 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3648 inst->src[i].fixed_hw_reg.subnr);
3649 break;
3650 default:
3651 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3652 inst->src[i].fixed_hw_reg.subnr);
3653 break;
3654 }
3655 } else {
3656 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3657 }
3658 if (inst->src[i].fixed_hw_reg.subnr)
3659 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3660 if (inst->src[i].fixed_hw_reg.abs)
3661 fprintf(file, "|");
3662 break;
3663 default:
3664 fprintf(file, "???");
3665 break;
3666 }
3667 if (inst->src[i].abs)
3668 fprintf(file, "|");
3669
3670 if (inst->src[i].file != IMM) {
3671 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3672 }
3673
3674 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3675 fprintf(file, ", ");
3676 }
3677
3678 fprintf(file, " ");
3679
3680 if (dispatch_width == 16 && inst->exec_size == 8) {
3681 if (inst->force_sechalf)
3682 fprintf(file, "2ndhalf ");
3683 else
3684 fprintf(file, "1sthalf ");
3685 }
3686
3687 fprintf(file, "\n");
3688 }
3689
3690 /**
3691 * Possibly returns an instruction that set up @param reg.
3692 *
3693 * Sometimes we want to take the result of some expression/variable
3694 * dereference tree and rewrite the instruction generating the result
3695 * of the tree. When processing the tree, we know that the
3696 * instructions generated are all writing temporaries that are dead
3697 * outside of this tree. So, if we have some instructions that write
3698 * a temporary, we're free to point that temp write somewhere else.
3699 *
3700  * Note that this doesn't guarantee that the returned instruction wrote
3701  * only reg -- it might be the size=4 destination of a texture instruction.
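 *
 * A hedged usage sketch (the names below are hypothetical, not the actual
 * callers): when this returns non-NULL, a caller may retarget the write that
 * produced a temporary instead of emitting a copy:
 *
 *    fs_inst *producer = get_instruction_generating_reg(start, end, temp);
 *    if (producer)
 *       producer->dst = final_dst;   // rewrite the temporary's write
 *    else
 *       emit(MOV(final_dst, temp));  // fall back to a plain copy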
3702 */
3703 fs_inst *
3704 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3705 fs_inst *end,
3706 const fs_reg &reg)
3707 {
3708 if (end == start ||
3709 end->is_partial_write() ||
3710 reg.reladdr ||
3711 !reg.equals(end->dst)) {
3712 return NULL;
3713 } else {
3714 return end;
3715 }
3716 }
3717
3718 void
3719 fs_visitor::setup_payload_gen6()
3720 {
3721 bool uses_depth =
3722 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3723 unsigned barycentric_interp_modes =
3724 (stage == MESA_SHADER_FRAGMENT) ?
3725 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3726
3727 assert(devinfo->gen >= 6);
3728
3729 /* R0-1: masks, pixel X/Y coordinates. */
3730 payload.num_regs = 2;
3731    /* R2: only for 32-pixel dispatch. */
3732
3733 /* R3-26: barycentric interpolation coordinates. These appear in the
3734 * same order that they appear in the brw_wm_barycentric_interp_mode
3735 * enum. Each set of coordinates occupies 2 registers if dispatch width
3736 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3737 * appear if they were enabled using the "Barycentric Interpolation
3738 * Mode" bits in WM_STATE.
3739 */
3740 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3741 if (barycentric_interp_modes & (1 << i)) {
3742 payload.barycentric_coord_reg[i] = payload.num_regs;
3743 payload.num_regs += 2;
3744 if (dispatch_width == 16) {
3745 payload.num_regs += 2;
3746 }
3747 }
3748 }
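   /* For example, with two barycentric modes enabled in SIMD16, the loop
    * above advances payload.num_regs by 8 in total (4 registers per enabled
    * mode).
    */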
3749
3750    /* R27: interpolated depth if the shader uses source depth. */
3751 if (uses_depth) {
3752 payload.source_depth_reg = payload.num_regs;
3753 payload.num_regs++;
3754 if (dispatch_width == 16) {
3755 /* R28: interpolated depth if not SIMD8. */
3756 payload.num_regs++;
3757 }
3758 }
3759 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3760 if (uses_depth) {
3761 payload.source_w_reg = payload.num_regs;
3762 payload.num_regs++;
3763 if (dispatch_width == 16) {
3764 /* R30: interpolated W if not SIMD8. */
3765 payload.num_regs++;
3766 }
3767 }
3768
3769 if (stage == MESA_SHADER_FRAGMENT) {
3770 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3771 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3772 prog_data->uses_pos_offset = key->compute_pos_offset;
3773 /* R31: MSAA position offsets. */
3774 if (prog_data->uses_pos_offset) {
3775 payload.sample_pos_reg = payload.num_regs;
3776 payload.num_regs++;
3777 }
3778 }
3779
3780 /* R32: MSAA input coverage mask */
3781 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3782 assert(devinfo->gen >= 7);
3783 payload.sample_mask_in_reg = payload.num_regs;
3784 payload.num_regs++;
3785 if (dispatch_width == 16) {
3786 /* R33: input coverage mask if not SIMD8. */
3787 payload.num_regs++;
3788 }
3789 }
3790
3791 /* R34-: bary for 32-pixel. */
3792 /* R58-59: interp W for 32-pixel. */
3793
3794 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3795 source_depth_to_render_target = true;
3796 }
3797 }
3798
3799 void
3800 fs_visitor::setup_vs_payload()
3801 {
3802 /* R0: thread header, R1: urb handles */
3803 payload.num_regs = 2;
3804 }
3805
3806 void
3807 fs_visitor::setup_cs_payload()
3808 {
3809 assert(brw->gen >= 7);
3810
3811 payload.num_regs = 1;
3812 }
3813
3814 void
3815 fs_visitor::assign_binding_table_offsets()
3816 {
3817 assert(stage == MESA_SHADER_FRAGMENT);
3818 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3819 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3820 uint32_t next_binding_table_offset = 0;
3821
3822 /* If there are no color regions, we still perform an FB write to a null
3823 * renderbuffer, which we place at surface index 0.
3824 */
3825 prog_data->binding_table.render_target_start = next_binding_table_offset;
3826 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3827
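   /* For example (hypothetical key): with nr_color_regions == 2, the render
    * targets occupy surface indices 0 and 1, so the common binding table
    * entries set up below start at index 2.
    */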
3828 assign_common_binding_table_offsets(next_binding_table_offset);
3829 }
3830
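/**
 * Compute a simple per-instruction register pressure estimate:
 * regs_live_at_ip[ip] is the total size, in GRFs, of the virtual registers
 * whose live ranges contain instruction ip.
 */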
3831 void
3832 fs_visitor::calculate_register_pressure()
3833 {
3834 invalidate_live_intervals();
3835 calculate_live_intervals();
3836
3837 unsigned num_instructions = 0;
3838 foreach_block(block, cfg)
3839 num_instructions += block->instructions.length();
3840
3841 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3842
3843 for (unsigned reg = 0; reg < alloc.count; reg++) {
3844 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3845 regs_live_at_ip[ip] += alloc.sizes[reg];
3846 }
3847 }
3848
3849 void
3850 fs_visitor::optimize()
3851 {
3852 split_virtual_grfs();
3853
3854 move_uniform_array_access_to_pull_constants();
3855 assign_constant_locations();
3856 demote_pull_constants();
3857
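   /* OPT() runs a single optimization pass; when the DEBUG_OPTIMIZER flag is
    * set and the pass made progress, it dumps the instruction list to a file
    * named after the stage, dispatch width, program, iteration and pass
    * number.  It accumulates the pass's result into `progress` and evaluates
    * to that result.
    */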
3858 #define OPT(pass, args...) ({ \
3859 pass_num++; \
3860 bool this_progress = pass(args); \
3861 \
3862 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3863 char filename[64]; \
3864 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3865 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3866 \
3867 backend_visitor::dump_instructions(filename); \
3868 } \
3869 \
3870 progress = progress || this_progress; \
3871 this_progress; \
3872 })
3873
3874 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3875 char filename[64];
3876 snprintf(filename, 64, "%s%d-%04d-00-start",
3877 stage_abbrev, dispatch_width,
3878 shader_prog ? shader_prog->Name : 0);
3879
3880 backend_visitor::dump_instructions(filename);
3881 }
3882
3883 bool progress;
3884 int iteration = 0;
3885 int pass_num = 0;
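   /* Run the main pass list to a fixed point: keep iterating as long as at
    * least one pass reports progress.
    */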
3886 do {
3887 progress = false;
3888 pass_num = 0;
3889 iteration++;
3890
3891 OPT(remove_duplicate_mrf_writes);
3892
3893 OPT(opt_algebraic);
3894 OPT(opt_cse);
3895 OPT(opt_copy_propagate);
3896 OPT(opt_peephole_predicated_break);
3897 OPT(opt_cmod_propagation);
3898 OPT(dead_code_eliminate);
3899 OPT(opt_peephole_sel);
3900 OPT(dead_control_flow_eliminate, this);
3901 OPT(opt_register_renaming);
3902 OPT(opt_redundant_discard_jumps);
3903 OPT(opt_saturate_propagation);
3904 OPT(opt_zero_samples);
3905 OPT(register_coalesce);
3906 OPT(compute_to_mrf);
3907
3908 OPT(compact_virtual_grfs);
3909 } while (progress);
3910
3911 pass_num = 0;
3912
3913 OPT(opt_sampler_eot);
3914
3915 if (OPT(lower_load_payload)) {
3916 split_virtual_grfs();
3917 OPT(register_coalesce);
3918 OPT(compute_to_mrf);
3919 OPT(dead_code_eliminate);
3920 }
3921
3922 OPT(opt_combine_constants);
3923
3924 lower_uniform_pull_constant_loads();
3925 }
3926
3927 /**
3928  * Three-source instructions must have a GRF/MRF destination register.
3929 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
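 *
 * A sketch of the rewrite (the register name is made up, not real
 * disassembly):
 *
 *    mad(8) null:F, a:F, b:F, c:F   ->   mad(8) vgrf42:F, a:F, b:F, c:F
 *
 * where vgrf42 is a freshly allocated temporary sized for the dispatch width.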
3930 */
3931 void
3932 fs_visitor::fixup_3src_null_dest()
3933 {
3934 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3935 if (inst->is_3src() && inst->dst.is_null()) {
3936 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3937 inst->dst.type);
3938 }
3939 }
3940 }
3941
3942 void
3943 fs_visitor::allocate_registers()
3944 {
3945 bool allocated_without_spills;
3946
3947 static const enum instruction_scheduler_mode pre_modes[] = {
3948 SCHEDULE_PRE,
3949 SCHEDULE_PRE_NON_LIFO,
3950 SCHEDULE_PRE_LIFO,
3951 };
3952
3953    /* Try each scheduling heuristic to see if it can successfully register
3954     * allocate without spilling.  They should be ordered from best expected
3955     * code performance to highest likelihood of allocating without spills.
3956 */
3957 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3958 schedule_instructions(pre_modes[i]);
3959
3960 if (0) {
3961 assign_regs_trivial();
3962 allocated_without_spills = true;
3963 } else {
3964 allocated_without_spills = assign_regs(false);
3965 }
3966 if (allocated_without_spills)
3967 break;
3968 }
3969
3970 if (!allocated_without_spills) {
3971 /* We assume that any spilling is worse than just dropping back to
3972        * SIMD8.  There is probably some intermediate point where SIMD16 with
3973        * a couple of spills is still better.
3974 */
3975 if (dispatch_width == 16) {
3976 fail("Failure to register allocate. Reduce number of "
3977 "live scalar values to avoid this.");
3978 } else {
3979 perf_debug("%s shader triggered register spilling. "
3980 "Try reducing the number of live scalar values to "
3981 "improve performance.\n", stage_name);
3982 }
3983
3984 /* Since we're out of heuristics, just go spill registers until we
3985 * get an allocation.
3986 */
3987 while (!assign_regs(true)) {
3988 if (failed)
3989 break;
3990 }
3991 }
3992
3993 /* This must come after all optimization and register allocation, since
3994 * it inserts dead code that happens to have side effects, and it does
3995 * so based on the actual physical registers in use.
3996 */
3997 insert_gen4_send_dependency_workarounds();
3998
3999 if (failed)
4000 return;
4001
4002 if (!allocated_without_spills)
4003 schedule_instructions(SCHEDULE_POST);
4004
4005 if (last_scratch > 0)
4006 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4007 }
4008
4009 bool
4010 fs_visitor::run_vs()
4011 {
4012 assert(stage == MESA_SHADER_VERTEX);
4013
4014 assign_common_binding_table_offsets(0);
4015 setup_vs_payload();
4016
4017 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4018 emit_shader_time_begin();
4019
4020 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
4021 emit_nir_code();
4022 } else {
4023 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4024 base_ir = ir;
4025 this->result = reg_undef;
4026 ir->accept(this);
4027 }
4028 base_ir = NULL;
4029 }
4030
4031 if (failed)
4032 return false;
4033
4034 emit_urb_writes();
4035
4036 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4037 emit_shader_time_end();
4038
4039 calculate_cfg();
4040
4041 optimize();
4042
4043 assign_curb_setup();
4044 assign_vs_urb_setup();
4045
4046 fixup_3src_null_dest();
4047 allocate_registers();
4048
4049 return !failed;
4050 }
4051
4052 bool
4053 fs_visitor::run_fs()
4054 {
4055 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4056 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4057
4058 assert(stage == MESA_SHADER_FRAGMENT);
4059
4060 sanity_param_count = prog->Parameters->NumParameters;
4061
4062 assign_binding_table_offsets();
4063
4064 if (devinfo->gen >= 6)
4065 setup_payload_gen6();
4066 else
4067 setup_payload_gen4();
4068
4069 if (0) {
4070 emit_dummy_fs();
4071 } else if (brw->use_rep_send && dispatch_width == 16) {
4072 emit_repclear_shader();
4073 } else {
4074 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4075 emit_shader_time_begin();
4076
4077 calculate_urb_setup();
4078 if (prog->InputsRead > 0) {
4079 if (devinfo->gen < 6)
4080 emit_interpolation_setup_gen4();
4081 else
4082 emit_interpolation_setup_gen6();
4083 }
4084
4085 /* We handle discards by keeping track of the still-live pixels in f0.1.
4086 * Initialize it with the dispatched pixels.
4087 */
4088 if (wm_prog_data->uses_kill) {
4089 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4090 discard_init->flag_subreg = 1;
4091 }
4092
4093       /* Generate FS IR for main().  (The visitor only descends into
4094        * functions called "main".)
4095 */
4096 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
4097 emit_nir_code();
4098 } else if (shader) {
4099 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4100 base_ir = ir;
4101 this->result = reg_undef;
4102 ir->accept(this);
4103 }
4104 } else {
4105 emit_fragment_program_code();
4106 }
4107 base_ir = NULL;
4108 if (failed)
4109 return false;
4110
4111 if (wm_prog_data->uses_kill)
4112 emit(FS_OPCODE_PLACEHOLDER_HALT);
4113
4114 if (wm_key->alpha_test_func)
4115 emit_alpha_test();
4116
4117 emit_fb_writes();
4118
4119 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4120 emit_shader_time_end();
4121
4122 calculate_cfg();
4123
4124 optimize();
4125
4126 assign_curb_setup();
4127 assign_urb_setup();
4128
4129 fixup_3src_null_dest();
4130 allocate_registers();
4131
4132 if (failed)
4133 return false;
4134 }
4135
4136 if (dispatch_width == 8)
4137 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4138 else
4139 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4140
4141 /* If any state parameters were appended, then ParameterValues could have
4142 * been realloced, in which case the driver uniform storage set up by
4143 * _mesa_associate_uniform_storage() would point to freed memory. Make
4144 * sure that didn't happen.
4145 */
4146 assert(sanity_param_count == prog->Parameters->NumParameters);
4147
4148 return !failed;
4149 }
4150
4151 bool
4152 fs_visitor::run_cs()
4153 {
4154 assert(stage == MESA_SHADER_COMPUTE);
4155 assert(shader);
4156
4157 sanity_param_count = prog->Parameters->NumParameters;
4158
4159 assign_common_binding_table_offsets(0);
4160
4161 setup_cs_payload();
4162
4163 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4164 emit_shader_time_begin();
4165
4166 emit_nir_code();
4167
4168 if (failed)
4169 return false;
4170
4171 emit_cs_terminate();
4172
4173 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4174 emit_shader_time_end();
4175
4176 calculate_cfg();
4177
4178 optimize();
4179
4180 assign_curb_setup();
4181
4182 fixup_3src_null_dest();
4183 allocate_registers();
4184
4185 if (failed)
4186 return false;
4187
4188 /* If any state parameters were appended, then ParameterValues could have
4189 * been realloced, in which case the driver uniform storage set up by
4190 * _mesa_associate_uniform_storage() would point to freed memory. Make
4191 * sure that didn't happen.
4192 */
4193 assert(sanity_param_count == prog->Parameters->NumParameters);
4194
4195 return !failed;
4196 }
4197
4198 const unsigned *
4199 brw_wm_fs_emit(struct brw_context *brw,
4200 void *mem_ctx,
4201 const struct brw_wm_prog_key *key,
4202 struct brw_wm_prog_data *prog_data,
4203 struct gl_fragment_program *fp,
4204 struct gl_shader_program *prog,
4205 unsigned *final_assembly_size)
4206 {
4207 bool start_busy = false;
4208 double start_time = 0;
4209
4210 if (unlikely(brw->perf_debug)) {
4211 start_busy = (brw->batch.last_bo &&
4212 drm_intel_bo_busy(brw->batch.last_bo));
4213 start_time = get_time();
4214 }
4215
4216 struct brw_shader *shader = NULL;
4217 if (prog)
4218 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4219
4220 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4221 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4222
4223 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4224 */
4225 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4226 if (!v.run_fs()) {
4227 if (prog) {
4228 prog->LinkStatus = false;
4229 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4230 }
4231
4232 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4233 v.fail_msg);
4234
4235 return NULL;
4236 }
4237
4238 cfg_t *simd16_cfg = NULL;
4239 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4240 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4241 if (!v.simd16_unsupported) {
4242 /* Try a SIMD16 compile */
4243 v2.import_uniforms(&v);
4244 if (!v2.run_fs()) {
4245 perf_debug("SIMD16 shader failed to compile, falling back to "
4246 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4247 } else {
4248 simd16_cfg = v2.cfg;
4249 }
4250 } else {
4251 perf_debug("SIMD16 shader unsupported, falling back to "
4252 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4253 }
4254 }
4255
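   /* Decide which programs to hand to the generator: if SIMD8 has been
    * disabled (the DEBUG_NO8 flag or brw->no_simd8), or we are on gen < 5,
    * and a SIMD16 program is available, skip the SIMD8 program entirely.
    */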
4256 cfg_t *simd8_cfg;
4257 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4258 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4259 simd8_cfg = NULL;
4260 prog_data->no_8 = true;
4261 } else {
4262 simd8_cfg = v.cfg;
4263 prog_data->no_8 = false;
4264 }
4265
4266 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4267 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4268
4269 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4270 char *name;
4271 if (prog)
4272 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4273 prog->Label ? prog->Label : "unnamed",
4274 prog->Name);
4275 else
4276 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4277
4278 g.enable_debug(name);
4279 }
4280
4281 if (simd8_cfg)
4282 g.generate_code(simd8_cfg, 8);
4283 if (simd16_cfg)
4284 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4285
4286 if (unlikely(brw->perf_debug) && shader) {
4287 if (shader->compiled_once)
4288 brw_wm_debug_recompile(brw, prog, key);
4289 shader->compiled_once = true;
4290
4291 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4292 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4293 (get_time() - start_time) * 1000);
4294 }
4295 }
4296
4297 return g.get_assembly(final_assembly_size);
4298 }
4299
4300 extern "C" bool
4301 brw_fs_precompile(struct gl_context *ctx,
4302 struct gl_shader_program *shader_prog,
4303 struct gl_program *prog)
4304 {
4305 struct brw_context *brw = brw_context(ctx);
4306 struct brw_wm_prog_key key;
4307
4308 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4309 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4310 bool program_uses_dfdy = fp->UsesDFdy;
4311
4312 memset(&key, 0, sizeof(key));
4313
4314 if (brw->gen < 6) {
4315 if (fp->UsesKill)
4316 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4317
4318 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4319 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4320
4321 /* Just assume depth testing. */
4322 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4323 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4324 }
4325
4326 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4327 BRW_FS_VARYING_INPUT_MASK) > 16)
4328 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4329
4330 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4331
4332 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4333 key.drawable_height = ctx->DrawBuffer->Height;
4334 }
4335
4336 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4337 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4338 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4339
4340 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4341 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4342 key.nr_color_regions > 1;
4343 }
4344
4345 key.program_string_id = bfp->id;
4346
4347 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4348 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4349
4350 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4351
4352 brw->wm.base.prog_offset = old_prog_offset;
4353 brw->wm.prog_data = old_prog_data;
4354
4355 return success;
4356 }
4357
4358 void
4359 brw_setup_tex_for_precompile(struct brw_context *brw,
4360 struct brw_sampler_prog_key_data *tex,
4361 struct gl_program *prog)
4362 {
4363 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4364 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4365 for (unsigned i = 0; i < sampler_count; i++) {
4366 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4367 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4368 tex->swizzles[i] =
4369 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4370 } else {
4371 /* Color sampler: assume no swizzling. */
4372 tex->swizzles[i] = SWIZZLE_XYZW;
4373 }
4374 }
4375 }