[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
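   /* E.g. for the computation above, a SIMD16 float destination with
    * stride 1 covers 16 * 1 * 4 = 64 bytes and therefore writes
    * DIV_ROUND_UP(64, 32) = 2 registers.
    */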
138
139 this->writes_accumulator = false;
140 }
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(brw->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
356
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
359 {
360 uint8_t exec_size = dst.width;
361 for (int i = 0; i < sources; ++i) {
362 assert(src[i].width % dst.width == 0);
363 if (src[i].width > exec_size)
364 exec_size = src[i].width;
365 }
366
367 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
368 dst, src, sources);
369 inst->regs_written = 0;
370 for (int i = 0; i < sources; ++i) {
371 /* The LOAD_PAYLOAD instruction only really makes sense if we are
372 * dealing with whole registers. If this ever changes, we can deal
373 * with it later.
374 */
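         /* For instance, a SIMD8 float source has an effective width of 8
          * and a type size of 4, i.e. exactly 32 bytes, so it contributes
          * one register to regs_written.
          */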
375 int size = inst->src[i].effective_width * type_sz(src[i].type);
376 assert(size % 32 == 0);
377 inst->regs_written += (size + 31) / 32;
378 }
379
380 return inst;
381 }
382
383 exec_list
384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
385 const fs_reg &surf_index,
386 const fs_reg &varying_offset,
387 uint32_t const_offset)
388 {
389 exec_list instructions;
390 fs_inst *inst;
391
392 /* We have our constant surface use a pitch of 4 bytes, so our index can
393 * be any component of a vector, and then we load 4 contiguous
394 * components starting from that.
395 *
396 * We break down the const_offset to a portion added to the variable
397 * offset and a portion done using reg_offset, which means that if you
398 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
399 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
400 * CSE can later notice that those loads are all the same and eliminate
401 * the redundant ones.
402 */
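   /* As an illustration, const_offset == 14 splits into 14 & ~3 == 12,
    * which is added to the variable offset here, and 14 & 3 == 2, which
    * (times the scale chosen below) picks the component out of the loaded
    * vec4 at the end of this function.
    */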
403 fs_reg vec4_offset = vgrf(glsl_type::int_type);
404 instructions.push_tail(ADD(vec4_offset,
405 varying_offset, fs_reg(const_offset & ~3)));
406
407 int scale = 1;
408 if (brw->gen == 4 && dst.width == 8) {
409 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
410 * u, v, r) as parameters, or we can just use the SIMD16 message
411 * consisting of (header, u). We choose the second, at the cost of a
412 * longer return length.
413 */
414 scale = 2;
415 }
416
417 enum opcode op;
418 if (brw->gen >= 7)
419 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
420 else
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
422
423 assert(dst.width % 8 == 0);
424 int regs_written = 4 * (dst.width / 8) * scale;
425 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
426 dst.type, dst.width);
427 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
428 inst->regs_written = regs_written;
429 instructions.push_tail(inst);
430
431 if (brw->gen < 7) {
432 inst->base_mrf = 13;
433 inst->header_present = true;
434 if (brw->gen == 4)
435 inst->mlen = 3;
436 else
437 inst->mlen = 1 + dispatch_width / 8;
438 }
439
440 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
441 instructions.push_tail(MOV(dst, result));
442
443 return instructions;
444 }
445
446 /**
447 * A helper for MOV generation for fixing up broken hardware SEND dependency
448 * handling.
449 */
450 fs_inst *
451 fs_visitor::DEP_RESOLVE_MOV(int grf)
452 {
453 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
454
455 inst->ir = NULL;
456 inst->annotation = "send dependency resolve";
457
458 /* The caller always wants uncompressed to emit the minimal extra
459 * dependencies, and to avoid having to deal with aligning its regs to 2.
460 */
461 inst->exec_size = 8;
462
463 return inst;
464 }
465
466 bool
467 fs_inst::equals(fs_inst *inst) const
468 {
469 return (opcode == inst->opcode &&
470 dst.equals(inst->dst) &&
471 src[0].equals(inst->src[0]) &&
472 src[1].equals(inst->src[1]) &&
473 src[2].equals(inst->src[2]) &&
474 saturate == inst->saturate &&
475 predicate == inst->predicate &&
476 conditional_mod == inst->conditional_mod &&
477 mlen == inst->mlen &&
478 base_mrf == inst->base_mrf &&
479 target == inst->target &&
480 eot == inst->eot &&
481 header_present == inst->header_present &&
482 shadow_compare == inst->shadow_compare &&
483 exec_size == inst->exec_size &&
484 offset == inst->offset);
485 }
486
487 bool
488 fs_inst::overwrites_reg(const fs_reg &reg) const
489 {
490 return (reg.file == dst.file &&
491 reg.reg == dst.reg &&
492 reg.reg_offset >= dst.reg_offset &&
493 reg.reg_offset < dst.reg_offset + regs_written);
494 }
495
496 bool
497 fs_inst::is_send_from_grf() const
498 {
499 switch (opcode) {
500 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
501 case SHADER_OPCODE_SHADER_TIME_ADD:
502 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
503 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
504 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
505 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
506 case SHADER_OPCODE_UNTYPED_ATOMIC:
507 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
508 case SHADER_OPCODE_URB_WRITE_SIMD8:
509 return true;
510 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
511 return src[1].file == GRF;
512 case FS_OPCODE_FB_WRITE:
513 return src[0].file == GRF;
514 default:
515 if (is_tex())
516 return src[0].file == GRF;
517
518 return false;
519 }
520 }
521
522 bool
523 fs_inst::can_do_source_mods(struct brw_context *brw)
524 {
525 if (brw->gen == 6 && is_math())
526 return false;
527
528 if (is_send_from_grf())
529 return false;
530
531 if (!backend_instruction::can_do_source_mods())
532 return false;
533
534 return true;
535 }
536
537 void
538 fs_reg::init()
539 {
540 memset(this, 0, sizeof(*this));
541 stride = 1;
542 }
543
544 /** Generic unset register constructor. */
545 fs_reg::fs_reg()
546 {
547 init();
548 this->file = BAD_FILE;
549 }
550
551 /** Immediate value constructor. */
552 fs_reg::fs_reg(float f)
553 {
554 init();
555 this->file = IMM;
556 this->type = BRW_REGISTER_TYPE_F;
557 this->fixed_hw_reg.dw1.f = f;
558 this->width = 1;
559 }
560
561 /** Immediate value constructor. */
562 fs_reg::fs_reg(int32_t i)
563 {
564 init();
565 this->file = IMM;
566 this->type = BRW_REGISTER_TYPE_D;
567 this->fixed_hw_reg.dw1.d = i;
568 this->width = 1;
569 }
570
571 /** Immediate value constructor. */
572 fs_reg::fs_reg(uint32_t u)
573 {
574 init();
575 this->file = IMM;
576 this->type = BRW_REGISTER_TYPE_UD;
577 this->fixed_hw_reg.dw1.ud = u;
578 this->width = 1;
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf[4])
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
588 }
589
590 /** Vector float immediate value constructor. */
591 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
592 {
593 init();
594 this->file = IMM;
595 this->type = BRW_REGISTER_TYPE_VF;
596 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
597 (vf1 << 8) |
598 (vf2 << 16) |
599 (vf3 << 24);
600 }
601
602 /** Fixed brw_reg. */
603 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
604 {
605 init();
606 this->file = HW_REG;
607 this->fixed_hw_reg = fixed_hw_reg;
608 this->type = fixed_hw_reg.type;
609 this->width = 1 << fixed_hw_reg.width;
610 }
611
612 bool
613 fs_reg::equals(const fs_reg &r) const
614 {
615 return (file == r.file &&
616 reg == r.reg &&
617 reg_offset == r.reg_offset &&
618 subreg_offset == r.subreg_offset &&
619 type == r.type &&
620 negate == r.negate &&
621 abs == r.abs &&
622 !reladdr && !r.reladdr &&
623 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
624 width == r.width &&
625 stride == r.stride);
626 }
627
628 fs_reg &
629 fs_reg::set_smear(unsigned subreg)
630 {
631 assert(file != HW_REG && file != IMM);
632 subreg_offset = subreg * type_sz(type);
633 stride = 0;
634 return *this;
635 }
636
637 bool
638 fs_reg::is_contiguous() const
639 {
640 return stride == 1;
641 }
642
643 int
644 fs_visitor::type_size(const struct glsl_type *type)
645 {
646 unsigned int size, i;
647
648 switch (type->base_type) {
649 case GLSL_TYPE_UINT:
650 case GLSL_TYPE_INT:
651 case GLSL_TYPE_FLOAT:
652 case GLSL_TYPE_BOOL:
653 return type->components();
654 case GLSL_TYPE_ARRAY:
655 return type_size(type->fields.array) * type->length;
656 case GLSL_TYPE_STRUCT:
657 size = 0;
658 for (i = 0; i < type->length; i++) {
659 size += type_size(type->fields.structure[i].type);
660 }
661 return size;
662 case GLSL_TYPE_SAMPLER:
663 /* Samplers take up no register space, since they're baked in at
664 * link time.
665 */
666 return 0;
667 case GLSL_TYPE_ATOMIC_UINT:
668 return 0;
669 case GLSL_TYPE_IMAGE:
670 case GLSL_TYPE_VOID:
671 case GLSL_TYPE_ERROR:
672 case GLSL_TYPE_INTERFACE:
673 case GLSL_TYPE_DOUBLE:
674 unreachable("not reached");
675 }
676
677 return 0;
678 }
679
680 /**
681 * Create a MOV to read the timestamp register.
682 *
683 * The caller is responsible for emitting the MOV. The return value is
684 * the destination of the MOV, with extra parameters set.
685 */
686 fs_reg
687 fs_visitor::get_timestamp(fs_inst **out_mov)
688 {
689 assert(brw->gen >= 7);
690
691 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
692 BRW_ARF_TIMESTAMP,
693 0),
694 BRW_REGISTER_TYPE_UD));
695
696 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
697
698 fs_inst *mov = MOV(dst, ts);
699 /* We want to read the 3 fields we care about even if it's not enabled in
700 * the dispatch.
701 */
702 mov->force_writemask_all = true;
703
704 /* The caller wants the low 32 bits of the timestamp. Since it's running
705    * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
706 * which is plenty of time for our purposes. It is identical across the
707 * EUs, but since it's tracking GPU core speed it will increment at a
708 * varying rate as render P-states change.
709 *
710 * The caller could also check if render P-states have changed (or anything
711 * else that might disrupt timing) by setting smear to 2 and checking if
712 * that field is != 0.
713 */
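   /* Roughly: a 32-bit counter ticking at ~1.2e9 Hz wraps after
    * 2^32 / 1.2e9, i.e. about 3.6 seconds.
    */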
714 dst.set_smear(0);
715
716 *out_mov = mov;
717 return dst;
718 }
719
720 void
721 fs_visitor::emit_shader_time_begin()
722 {
723 current_annotation = "shader time start";
724 fs_inst *mov;
725 shader_start_time = get_timestamp(&mov);
726 emit(mov);
727 }
728
729 void
730 fs_visitor::emit_shader_time_end()
731 {
732 current_annotation = "shader time end";
733
734 enum shader_time_shader_type type, written_type, reset_type;
735 switch (stage) {
736 case MESA_SHADER_VERTEX:
737 type = ST_VS;
738 written_type = ST_VS_WRITTEN;
739 reset_type = ST_VS_RESET;
740 break;
741 case MESA_SHADER_GEOMETRY:
742 type = ST_GS;
743 written_type = ST_GS_WRITTEN;
744 reset_type = ST_GS_RESET;
745 break;
746 case MESA_SHADER_FRAGMENT:
747 if (dispatch_width == 8) {
748 type = ST_FS8;
749 written_type = ST_FS8_WRITTEN;
750 reset_type = ST_FS8_RESET;
751 } else {
752 assert(dispatch_width == 16);
753 type = ST_FS16;
754 written_type = ST_FS16_WRITTEN;
755 reset_type = ST_FS16_RESET;
756 }
757 break;
758 default:
759 unreachable("fs_visitor::emit_shader_time_end missing code");
760 }
761
762 /* Insert our code just before the final SEND with EOT. */
763 exec_node *end = this->instructions.get_tail();
764 assert(end && ((fs_inst *) end)->eot);
765
766 fs_inst *tm_read;
767 fs_reg shader_end_time = get_timestamp(&tm_read);
768 end->insert_before(tm_read);
769
770 /* Check that there weren't any timestamp reset events (assuming these
771 * were the only two timestamp reads that happened).
772 */
773 fs_reg reset = shader_end_time;
774 reset.set_smear(2);
775 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
776 test->conditional_mod = BRW_CONDITIONAL_Z;
777 test->force_writemask_all = true;
778 end->insert_before(test);
779 end->insert_before(IF(BRW_PREDICATE_NORMAL));
780
781 fs_reg start = shader_start_time;
782 start.negate = true;
783 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
784 diff.set_smear(0);
785 fs_inst *add = ADD(diff, start, shader_end_time);
786 add->force_writemask_all = true;
787 end->insert_before(add);
788
789 /* If there were no instructions between the two timestamp gets, the diff
790 * is 2 cycles. Remove that overhead, so I can forget about that when
791 * trying to determine the time taken for single instructions.
792 */
793 add = ADD(diff, diff, fs_reg(-2u));
794 add->force_writemask_all = true;
795 end->insert_before(add);
796
797 end->insert_before(SHADER_TIME_ADD(type, diff));
798 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
799 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
800 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
801 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
802 }
803
804 fs_inst *
805 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
806 {
807 int shader_time_index =
808 brw_get_shader_time_index(brw, shader_prog, prog, type);
809 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
810
811 fs_reg payload;
812 if (dispatch_width == 8)
813 payload = vgrf(glsl_type::uvec2_type);
814 else
815 payload = vgrf(glsl_type::uint_type);
816
817 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
818 fs_reg(), payload, offset, value);
819 }
820
821 void
822 fs_visitor::vfail(const char *format, va_list va)
823 {
824 char *msg;
825
826 if (failed)
827 return;
828
829 failed = true;
830
831 msg = ralloc_vasprintf(mem_ctx, format, va);
832 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
833
834 this->fail_msg = msg;
835
836 if (debug_enabled) {
837 fprintf(stderr, "%s", msg);
838 }
839 }
840
841 void
842 fs_visitor::fail(const char *format, ...)
843 {
844 va_list va;
845
846 va_start(va, format);
847 vfail(format, va);
848 va_end(va);
849 }
850
851 /**
852 * Mark this program as impossible to compile in SIMD16 mode.
853 *
854 * During the SIMD8 compile (which happens first), we can detect and flag
855 * things that are unsupported in SIMD16 mode, so the compiler can skip
856 * the SIMD16 compile altogether.
857 *
858 * During a SIMD16 compile (if one happens anyway), this just calls fail().
859 */
860 void
861 fs_visitor::no16(const char *format, ...)
862 {
863 va_list va;
864
865 va_start(va, format);
866
867 if (dispatch_width == 16) {
868 vfail(format, va);
869 } else {
870 simd16_unsupported = true;
871
872 if (brw->perf_debug) {
873 if (no16_msg)
874 ralloc_vasprintf_append(&no16_msg, format, va);
875 else
876 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
877 }
878 }
879
880 va_end(va);
881 }
882
883 fs_inst *
884 fs_visitor::emit(enum opcode opcode)
885 {
886 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
887 }
888
889 fs_inst *
890 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
891 {
892 return emit(new(mem_ctx) fs_inst(opcode, dst));
893 }
894
895 fs_inst *
896 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
897 {
898 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
899 }
900
901 fs_inst *
902 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
903 const fs_reg &src1)
904 {
905 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
906 }
907
908 fs_inst *
909 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
910 const fs_reg &src1, const fs_reg &src2)
911 {
912 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
913 }
914
915 fs_inst *
916 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
917 fs_reg src[], int sources)
918 {
919 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
920 }
921
922 /**
923 * Returns true if the instruction has a flag that means it won't
924 * update an entire destination register.
925 *
926 * For example, dead code elimination and live variable analysis want to know
927 * when a write to a variable screens off any preceding values that were in
928 * it.
929 */
930 bool
931 fs_inst::is_partial_write() const
932 {
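   /* For example, a SIMD8 write to a 16-bit (W/UW) destination covers only
    * 8 * 2 = 16 bytes of the 32-byte register, so it counts as a partial
    * write.
    */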
933 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
934 (this->dst.width * type_sz(this->dst.type)) < 32 ||
935 !this->dst.is_contiguous());
936 }
937
938 int
939 fs_inst::regs_read(int arg) const
940 {
941 if (is_tex() && arg == 0 && src[0].file == GRF) {
942 return mlen;
943 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
944 return mlen;
945 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
946 return mlen;
947 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
948 return mlen;
949 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
950 return mlen;
951 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
952 return mlen;
953 }
954
955 switch (src[arg].file) {
956 case BAD_FILE:
957 case UNIFORM:
958 case IMM:
959 return 1;
960 case GRF:
961 case HW_REG:
962 if (src[arg].stride == 0) {
963 return 1;
964 } else {
965 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
966 return (size + 31) / 32;
967 }
968 case MRF:
969 unreachable("MRF registers are not allowed as sources");
970 default:
971 unreachable("Invalid register file");
972 }
973 }
974
975 bool
976 fs_inst::reads_flag() const
977 {
978 return predicate;
979 }
980
981 bool
982 fs_inst::writes_flag() const
983 {
984 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
985 opcode != BRW_OPCODE_IF &&
986 opcode != BRW_OPCODE_WHILE)) ||
987 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
988 }
989
990 /**
991 * Returns how many MRFs an FS opcode will write over.
992 *
993 * Note that this is not the 0 or 1 implied writes in an actual gen
994 * instruction -- the FS opcodes often generate MOVs in addition.
995 */
996 int
997 fs_visitor::implied_mrf_writes(fs_inst *inst)
998 {
999 if (inst->mlen == 0)
1000 return 0;
1001
1002 if (inst->base_mrf == -1)
1003 return 0;
1004
1005 switch (inst->opcode) {
1006 case SHADER_OPCODE_RCP:
1007 case SHADER_OPCODE_RSQ:
1008 case SHADER_OPCODE_SQRT:
1009 case SHADER_OPCODE_EXP2:
1010 case SHADER_OPCODE_LOG2:
1011 case SHADER_OPCODE_SIN:
1012 case SHADER_OPCODE_COS:
1013 return 1 * dispatch_width / 8;
1014 case SHADER_OPCODE_POW:
1015 case SHADER_OPCODE_INT_QUOTIENT:
1016 case SHADER_OPCODE_INT_REMAINDER:
1017 return 2 * dispatch_width / 8;
1018 case SHADER_OPCODE_TEX:
1019 case FS_OPCODE_TXB:
1020 case SHADER_OPCODE_TXD:
1021 case SHADER_OPCODE_TXF:
1022 case SHADER_OPCODE_TXF_CMS:
1023 case SHADER_OPCODE_TXF_MCS:
1024 case SHADER_OPCODE_TG4:
1025 case SHADER_OPCODE_TG4_OFFSET:
1026 case SHADER_OPCODE_TXL:
1027 case SHADER_OPCODE_TXS:
1028 case SHADER_OPCODE_LOD:
1029 return 1;
1030 case FS_OPCODE_FB_WRITE:
1031 return 2;
1032 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1033 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1034 return 1;
1035 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1036 return inst->mlen;
1037 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1038 return 2;
1039 case SHADER_OPCODE_UNTYPED_ATOMIC:
1040 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1041 case SHADER_OPCODE_URB_WRITE_SIMD8:
1042 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1043 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1044 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1045 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1046 return 0;
1047 default:
1048 unreachable("not reached");
1049 }
1050 }
1051
1052 fs_reg
1053 fs_visitor::vgrf(const glsl_type *const type)
1054 {
1055 int reg_width = dispatch_width / 8;
1056 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1057 brw_type_for_base_type(type), dispatch_width);
1058 }
1059
1060 fs_reg
1061 fs_visitor::vgrf(int num_components)
1062 {
1063 int reg_width = dispatch_width / 8;
1064 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1065 BRW_REGISTER_TYPE_F, dispatch_width);
1066 }
1067
1068 /** Fixed HW reg constructor. */
1069 fs_reg::fs_reg(enum register_file file, int reg)
1070 {
1071 init();
1072 this->file = file;
1073 this->reg = reg;
1074 this->type = BRW_REGISTER_TYPE_F;
1075
1076 switch (file) {
1077 case UNIFORM:
1078 this->width = 1;
1079 break;
1080 default:
1081 this->width = 8;
1082 }
1083 }
1084
1085 /** Fixed HW reg constructor. */
1086 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1087 {
1088 init();
1089 this->file = file;
1090 this->reg = reg;
1091 this->type = type;
1092
1093 switch (file) {
1094 case UNIFORM:
1095 this->width = 1;
1096 break;
1097 default:
1098 this->width = 8;
1099 }
1100 }
1101
1102 /** Fixed HW reg constructor. */
1103 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1104 uint8_t width)
1105 {
1106 init();
1107 this->file = file;
1108 this->reg = reg;
1109 this->type = type;
1110 this->width = width;
1111 }
1112
1113 fs_reg *
1114 fs_visitor::variable_storage(ir_variable *var)
1115 {
1116 return (fs_reg *)hash_table_find(this->variable_ht, var);
1117 }
1118
1119 void
1120 import_uniforms_callback(const void *key,
1121 void *data,
1122 void *closure)
1123 {
1124 struct hash_table *dst_ht = (struct hash_table *)closure;
1125 const fs_reg *reg = (const fs_reg *)data;
1126
1127 if (reg->file != UNIFORM)
1128 return;
1129
1130 hash_table_insert(dst_ht, data, key);
1131 }
1132
1133 /* For SIMD16, we need to carry over the uniform setup from the SIMD8
1134  * dispatch.  This brings in those uniform definitions.
1135 */
1136 void
1137 fs_visitor::import_uniforms(fs_visitor *v)
1138 {
1139 hash_table_call_foreach(v->variable_ht,
1140 import_uniforms_callback,
1141 variable_ht);
1142 this->push_constant_loc = v->push_constant_loc;
1143 this->pull_constant_loc = v->pull_constant_loc;
1144 this->uniforms = v->uniforms;
1145 this->param_size = v->param_size;
1146 }
1147
1148 /* Our support for uniforms is piggy-backed on the struct
1149 * gl_fragment_program, because that's where the values actually
1150 * get stored, rather than in some global gl_shader_program uniform
1151 * store.
1152 */
1153 void
1154 fs_visitor::setup_uniform_values(ir_variable *ir)
1155 {
1156 int namelen = strlen(ir->name);
1157
1158 /* The data for our (non-builtin) uniforms is stored in a series of
1159 * gl_uniform_driver_storage structs for each subcomponent that
1160 * glGetUniformLocation() could name. We know it's been set up in the same
1161 * order we'd walk the type, so walk the list of storage and find anything
1162 * with our name, or the prefix of a component that starts with our name.
1163 */
1164 unsigned params_before = uniforms;
1165 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1166 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1167
1168 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1169 (storage->name[namelen] != 0 &&
1170 storage->name[namelen] != '.' &&
1171 storage->name[namelen] != '[')) {
1172 continue;
1173 }
1174
1175 unsigned slots = storage->type->component_slots();
1176 if (storage->array_elements)
1177 slots *= storage->array_elements;
1178
1179 for (unsigned i = 0; i < slots; i++) {
1180 stage_prog_data->param[uniforms++] = &storage->storage[i];
1181 }
1182 }
1183
1184 /* Make sure we actually initialized the right amount of stuff here. */
1185 assert(params_before + ir->type->component_slots() == uniforms);
1186 (void)params_before;
1187 }
1188
1189
1190 /* Our support for builtin uniforms is even scarier than non-builtin.
1191 * It sits on top of the PROG_STATE_VAR parameters that are
1192 * automatically updated from GL context state.
1193 */
1194 void
1195 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1196 {
1197 const ir_state_slot *const slots = ir->get_state_slots();
1198 assert(slots != NULL);
1199
1200 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1201 /* This state reference has already been setup by ir_to_mesa, but we'll
1202 * get the same index back here.
1203 */
1204 int index = _mesa_add_state_reference(this->prog->Parameters,
1205 (gl_state_index *)slots[i].tokens);
1206
1207 /* Add each of the unique swizzles of the element as a parameter.
1208 * This'll end up matching the expected layout of the
1209 * array/matrix/structure we're trying to fill in.
1210 */
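      /* E.g. a slot with swizzle XXXX adds only one parameter, since the
       * repeated swizzle component terminates the loop below after the
       * first iteration.
       */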
1211 int last_swiz = -1;
1212 for (unsigned int j = 0; j < 4; j++) {
1213 int swiz = GET_SWZ(slots[i].swizzle, j);
1214 if (swiz == last_swiz)
1215 break;
1216 last_swiz = swiz;
1217
1218 stage_prog_data->param[uniforms++] =
1219 &prog->Parameters->ParameterValues[index][swiz];
1220 }
1221 }
1222 }
1223
1224 fs_reg *
1225 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1226 bool origin_upper_left)
1227 {
1228 assert(stage == MESA_SHADER_FRAGMENT);
1229 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1230 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1231 fs_reg wpos = *reg;
1232 bool flip = !origin_upper_left ^ key->render_to_fbo;
1233
1234 /* gl_FragCoord.x */
1235 if (pixel_center_integer) {
1236 emit(MOV(wpos, this->pixel_x));
1237 } else {
1238 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1239 }
1240 wpos = offset(wpos, 1);
1241
1242 /* gl_FragCoord.y */
1243 if (!flip && pixel_center_integer) {
1244 emit(MOV(wpos, this->pixel_y));
1245 } else {
1246 fs_reg pixel_y = this->pixel_y;
1247 float offset = (pixel_center_integer ? 0.0 : 0.5);
1248
1249 if (flip) {
1250 pixel_y.negate = true;
1251 offset += key->drawable_height - 1.0;
1252 }
1253
1254 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1255 }
1256 wpos = offset(wpos, 1);
1257
1258 /* gl_FragCoord.z */
1259 if (brw->gen >= 6) {
1260 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1261 } else {
1262 emit(FS_OPCODE_LINTERP, wpos,
1263 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1264 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1265 interp_reg(VARYING_SLOT_POS, 2));
1266 }
1267 wpos = offset(wpos, 1);
1268
1269 /* gl_FragCoord.w: Already set up in emit_interpolation */
1270 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1271
1272 return reg;
1273 }
1274
1275 fs_inst *
1276 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1277 glsl_interp_qualifier interpolation_mode,
1278 bool is_centroid, bool is_sample)
1279 {
1280 brw_wm_barycentric_interp_mode barycoord_mode;
1281 if (brw->gen >= 6) {
1282 if (is_centroid) {
1283 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1284 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1285 else
1286 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1287 } else if (is_sample) {
1288 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1289 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1290 else
1291 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1292 } else {
1293 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1294 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1295 else
1296 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1297 }
1298 } else {
1299 /* On Ironlake and below, there is only one interpolation mode.
1300 * Centroid interpolation doesn't mean anything on this hardware --
1301 * there is no multisampling.
1302 */
1303 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1304 }
1305 return emit(FS_OPCODE_LINTERP, attr,
1306 this->delta_x[barycoord_mode],
1307 this->delta_y[barycoord_mode], interp);
1308 }
1309
1310 void
1311 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1312 const glsl_type *type,
1313 glsl_interp_qualifier interpolation_mode,
1314 int location, bool mod_centroid,
1315 bool mod_sample)
1316 {
1317 attr.type = brw_type_for_base_type(type->get_scalar_type());
1318
1319 assert(stage == MESA_SHADER_FRAGMENT);
1320 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1321 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1322
1323 unsigned int array_elements;
1324
1325 if (type->is_array()) {
1326 array_elements = type->length;
1327 if (array_elements == 0) {
1328 fail("dereferenced array '%s' has length 0\n", name);
1329 }
1330 type = type->fields.array;
1331 } else {
1332 array_elements = 1;
1333 }
1334
1335 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1336 bool is_gl_Color =
1337 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1338 if (key->flat_shade && is_gl_Color) {
1339 interpolation_mode = INTERP_QUALIFIER_FLAT;
1340 } else {
1341 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1342 }
1343 }
1344
1345 for (unsigned int i = 0; i < array_elements; i++) {
1346 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1347 if (prog_data->urb_setup[location] == -1) {
1348 /* If there's no incoming setup data for this slot, don't
1349 * emit interpolation for it.
1350 */
1351 attr = offset(attr, type->vector_elements);
1352 location++;
1353 continue;
1354 }
1355
1356 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1357 /* Constant interpolation (flat shading) case. The SF has
1358 * handed us defined values in only the constant offset
1359 * field of the setup reg.
1360 */
1361 for (unsigned int k = 0; k < type->vector_elements; k++) {
1362 struct brw_reg interp = interp_reg(location, k);
1363 interp = suboffset(interp, 3);
1364 interp.type = attr.type;
1365 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1366 attr = offset(attr, 1);
1367 }
1368 } else {
1369 /* Smooth/noperspective interpolation case. */
1370 for (unsigned int k = 0; k < type->vector_elements; k++) {
1371 struct brw_reg interp = interp_reg(location, k);
1372 if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1373 /* Get the pixel/sample mask into f0 so that we know
1374 * which pixels are lit. Then, for each channel that is
1375 * unlit, replace the centroid data with non-centroid
1376 * data.
1377 */
1378 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1379
1380 fs_inst *inst;
1381 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1382 false, false);
1383 inst->predicate = BRW_PREDICATE_NORMAL;
1384 inst->predicate_inverse = true;
1385 if (brw->has_pln)
1386 inst->no_dd_clear = true;
1387
1388 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1389 mod_centroid && !key->persample_shading,
1390 mod_sample || key->persample_shading);
1391 inst->predicate = BRW_PREDICATE_NORMAL;
1392 inst->predicate_inverse = false;
1393 if (brw->has_pln)
1394 inst->no_dd_check = true;
1395
1396 } else {
1397 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1398 mod_centroid && !key->persample_shading,
1399 mod_sample || key->persample_shading);
1400 }
1401 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1402 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1403 }
1404 attr = offset(attr, 1);
1405 }
1406
1407 }
1408 location++;
1409 }
1410 }
1411 }
1412
1413 fs_reg *
1414 fs_visitor::emit_frontfacing_interpolation()
1415 {
1416 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1417
1418 if (brw->gen >= 6) {
1419 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1420 * a boolean result from this (~0/true or 0/false).
1421 *
1422 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1423 * this task in only one instruction:
1424 * - a negation source modifier will flip the bit; and
1425 * - a W -> D type conversion will sign extend the bit into the high
1426 * word of the destination.
1427 *
1428 * An ASR 15 fills the low word of the destination.
1429 */
1430 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1431 g0.negate = true;
1432
1433 emit(ASR(*reg, g0, fs_reg(15)));
1434 } else {
1435 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1436 * a boolean result from this (1/true or 0/false).
1437 *
1438 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1439 * the negation source modifier to flip it. Unfortunately the SHR
1440 * instruction only operates on UD (or D with an abs source modifier)
1441 * sources without negation.
1442 *
1443 * Instead, use ASR (which will give ~0/true or 0/false).
1444 */
1445 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1446 g1_6.negate = true;
1447
1448 emit(ASR(*reg, g1_6, fs_reg(31)));
1449 }
1450
1451 return reg;
1452 }
1453
1454 void
1455 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1456 {
1457 assert(stage == MESA_SHADER_FRAGMENT);
1458 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1459 assert(dst.type == BRW_REGISTER_TYPE_F);
1460
1461 if (key->compute_pos_offset) {
1462 /* Convert int_sample_pos to floating point */
1463 emit(MOV(dst, int_sample_pos));
1464 /* Scale to the range [0, 1] */
1465 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1466 }
1467 else {
1468 /* From ARB_sample_shading specification:
1469 * "When rendering to a non-multisample buffer, or if multisample
1470 * rasterization is disabled, gl_SamplePosition will always be
1471    *     (0.5, 0.5)."
1472 */
1473 emit(MOV(dst, fs_reg(0.5f)));
1474 }
1475 }
1476
1477 fs_reg *
1478 fs_visitor::emit_samplepos_setup()
1479 {
1480 assert(brw->gen >= 6);
1481
1482 this->current_annotation = "compute sample position";
1483 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1484 fs_reg pos = *reg;
1485 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1486 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1487
1488 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1489 * mode will be enabled.
1490 *
1491 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1492 * R31.1:0 Position Offset X/Y for Slot[3:0]
1493 * R31.3:2 Position Offset X/Y for Slot[7:4]
1494 * .....
1495 *
1496 * The X, Y sample positions come in as bytes in thread payload. So, read
1497 * the positions using vstride=16, width=8, hstride=2.
1498 */
1499 struct brw_reg sample_pos_reg =
1500 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1501 BRW_REGISTER_TYPE_B), 16, 8, 2);
1502
1503 if (dispatch_width == 8) {
1504 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1505 } else {
1506 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1507 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1508 ->force_sechalf = true;
1509 }
1510 /* Compute gl_SamplePosition.x */
1511 compute_sample_position(pos, int_sample_x);
1512 pos = offset(pos, 1);
1513 if (dispatch_width == 8) {
1514 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1515 } else {
1516 emit(MOV(half(int_sample_y, 0),
1517 fs_reg(suboffset(sample_pos_reg, 1))));
1518 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1519 ->force_sechalf = true;
1520 }
1521 /* Compute gl_SamplePosition.y */
1522 compute_sample_position(pos, int_sample_y);
1523 return reg;
1524 }
1525
1526 fs_reg *
1527 fs_visitor::emit_sampleid_setup()
1528 {
1529 assert(stage == MESA_SHADER_FRAGMENT);
1530 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1531 assert(brw->gen >= 6);
1532
1533 this->current_annotation = "compute sample id";
1534 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1535
1536 if (key->compute_sample_id) {
1537 fs_reg t1 = vgrf(glsl_type::int_type);
1538 fs_reg t2 = vgrf(glsl_type::int_type);
1539 t2.type = BRW_REGISTER_TYPE_UW;
1540
1541 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1542 * 8x multisampling, subspan 0 will represent sample N (where N
1543 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1544 * 7. We can find the value of N by looking at R0.0 bits 7:6
1545 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1546 * (since samples are always delivered in pairs). That is, we
1547 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1548 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1549 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1550 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1551 * populating a temporary variable with the sequence (0, 1, 2, 3),
1552 * and then reading from it using vstride=1, width=4, hstride=0.
1553 * These computations hold good for 4x multisampling as well.
1554 *
1555 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1556 * the first four slots are sample 0 of subspan 0; the next four
1557 * are sample 1 of subspan 0; the third group is sample 0 of
1558 * subspan 1, and finally sample 1 of subspan 1.
1559 */
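      /* Worked example: if R0.0 bits 7:6 read 10b, then
       * (R0.0 & 0xc0) >> 5 == 0x80 >> 5 == 4, so in SIMD8 subspan 0 gets
       * sample 4 and subspan 1 gets sample 5 once the (0, 0, 0, 0, 1, 1,
       * 1, 1) sequence is added.
       */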
1560 fs_inst *inst;
1561 inst = emit(BRW_OPCODE_AND, t1,
1562 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1563 fs_reg(0xc0));
1564 inst->force_writemask_all = true;
1565 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1566 inst->force_writemask_all = true;
1567 /* This works for both SIMD8 and SIMD16 */
1568 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1569 inst->force_writemask_all = true;
1570 /* This special instruction takes care of setting vstride=1,
1571 * width=4, hstride=0 of t2 during an ADD instruction.
1572 */
1573 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1574 } else {
1575 /* As per GL_ARB_sample_shading specification:
1576 * "When rendering to a non-multisample buffer, or if multisample
1577 * rasterization is disabled, gl_SampleID will always be zero."
1578 */
1579 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1580 }
1581
1582 return reg;
1583 }
1584
1585 void
1586 fs_visitor::resolve_source_modifiers(fs_reg *src)
1587 {
1588 if (!src->abs && !src->negate)
1589 return;
1590
1591 fs_reg temp = retype(vgrf(1), src->type);
1592 emit(MOV(temp, *src));
1593 *src = temp;
1594 }
1595
1596 fs_reg
1597 fs_visitor::fix_math_operand(fs_reg src)
1598 {
1599 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1600 * might be able to do better by doing execsize = 1 math and then
1601 * expanding that result out, but we would need to be careful with
1602 * masking.
1603 *
1604 * The hardware ignores source modifiers (negate and abs) on math
1605 * instructions, so we also move to a temp to set those up.
1606 */
1607 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1608 !src.abs && !src.negate)
1609 return src;
1610
1611 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1612 * operands to math
1613 */
1614 if (brw->gen >= 7 && src.file != IMM)
1615 return src;
1616
1617 fs_reg expanded = vgrf(glsl_type::float_type);
1618 expanded.type = src.type;
1619 emit(BRW_OPCODE_MOV, expanded, src);
1620 return expanded;
1621 }
1622
1623 fs_inst *
1624 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1625 {
1626 switch (opcode) {
1627 case SHADER_OPCODE_RCP:
1628 case SHADER_OPCODE_RSQ:
1629 case SHADER_OPCODE_SQRT:
1630 case SHADER_OPCODE_EXP2:
1631 case SHADER_OPCODE_LOG2:
1632 case SHADER_OPCODE_SIN:
1633 case SHADER_OPCODE_COS:
1634 break;
1635 default:
1636 unreachable("not reached: bad math opcode");
1637 }
1638
1639 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1640 * might be able to do better by doing execsize = 1 math and then
1641 * expanding that result out, but we would need to be careful with
1642 * masking.
1643 *
1644 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1645 * instructions, so we also move to a temp to set those up.
1646 */
1647 if (brw->gen == 6 || brw->gen == 7)
1648 src = fix_math_operand(src);
1649
1650 fs_inst *inst = emit(opcode, dst, src);
1651
1652 if (brw->gen < 6) {
1653 inst->base_mrf = 2;
1654 inst->mlen = dispatch_width / 8;
1655 }
1656
1657 return inst;
1658 }
1659
1660 fs_inst *
1661 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1662 {
1663 int base_mrf = 2;
1664 fs_inst *inst;
1665
1666 if (brw->gen >= 8) {
1667 inst = emit(opcode, dst, src0, src1);
1668 } else if (brw->gen >= 6) {
1669 src0 = fix_math_operand(src0);
1670 src1 = fix_math_operand(src1);
1671
1672 inst = emit(opcode, dst, src0, src1);
1673 } else {
1674 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1675 * "Message Payload":
1676 *
1677 * "Operand0[7]. For the INT DIV functions, this operand is the
1678 * denominator."
1679 * ...
1680 * "Operand1[7]. For the INT DIV functions, this operand is the
1681 * numerator."
1682 */
1683 bool is_int_div = opcode != SHADER_OPCODE_POW;
1684 fs_reg &op0 = is_int_div ? src1 : src0;
1685 fs_reg &op1 = is_int_div ? src0 : src1;
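      /* I.e. for the INT DIV opcodes the operands are swapped so that src1
       * ends up as Operand0 (the denominator per the quote above) and src0
       * is sent as Operand1 (the numerator) in the second MRF below; POW
       * keeps the original order.
       */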
1686
1687 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1688 inst = emit(opcode, dst, op0, reg_null_f);
1689
1690 inst->base_mrf = base_mrf;
1691 inst->mlen = 2 * dispatch_width / 8;
1692 }
1693 return inst;
1694 }
1695
1696 void
1697 fs_visitor::emit_discard_jump()
1698 {
1699 /* For performance, after a discard, jump to the end of the
1700 * shader if all relevant channels have been discarded.
1701 */
1702 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1703 discard_jump->flag_subreg = 1;
1704
1705 discard_jump->predicate = (dispatch_width == 8)
1706 ? BRW_PREDICATE_ALIGN1_ANY8H
1707 : BRW_PREDICATE_ALIGN1_ANY16H;
1708 discard_jump->predicate_inverse = true;
1709 }
1710
1711 void
1712 fs_visitor::assign_curb_setup()
1713 {
1714 if (dispatch_width == 8) {
1715 prog_data->dispatch_grf_start_reg = payload.num_regs;
1716 } else {
1717 assert(stage == MESA_SHADER_FRAGMENT);
1718 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1719 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1720 }
1721
1722 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1723
1724 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1725 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1726 for (unsigned int i = 0; i < inst->sources; i++) {
1727 if (inst->src[i].file == UNIFORM) {
1728 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1729 int constant_nr;
1730 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1731 constant_nr = push_constant_loc[uniform_nr];
1732 } else {
1733 /* Section 5.11 of the OpenGL 4.1 spec says:
1734 * "Out-of-bounds reads return undefined values, which include
1735 * values from other variables of the active program or zero."
1736 * Just return the first push constant.
1737 */
1738 constant_nr = 0;
1739 }
1740
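            /* E.g. push constant slot 10 lands in GRF payload.num_regs + 1,
             * subregister 2 (10 / 8 == 1, 10 % 8 == 2).
             */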
1741 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1742 constant_nr / 8,
1743 constant_nr % 8);
1744
1745 inst->src[i].file = HW_REG;
1746 inst->src[i].fixed_hw_reg = byte_offset(
1747 retype(brw_reg, inst->src[i].type),
1748 inst->src[i].subreg_offset);
1749 }
1750 }
1751 }
1752 }
1753
1754 void
1755 fs_visitor::calculate_urb_setup()
1756 {
1757 assert(stage == MESA_SHADER_FRAGMENT);
1758 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1759 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1760
1761 memset(prog_data->urb_setup, -1,
1762 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1763
1764 int urb_next = 0;
1765 /* Figure out where each of the incoming setup attributes lands. */
1766 if (brw->gen >= 6) {
1767 if (_mesa_bitcount_64(prog->InputsRead &
1768 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1769 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1770 * first 16 varying inputs, so we can put them wherever we want.
1771 * Just put them in order.
1772 *
1773 * This is useful because it means that (a) inputs not used by the
1774 * fragment shader won't take up valuable register space, and (b) we
1775 * won't have to recompile the fragment shader if it gets paired with
1776 * a different vertex (or geometry) shader.
1777 */
1778 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1779 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1780 BITFIELD64_BIT(i)) {
1781 prog_data->urb_setup[i] = urb_next++;
1782 }
1783 }
1784 } else {
1785 /* We have enough input varyings that the SF/SBE pipeline stage can't
1786 * arbitrarily rearrange them to suit our whim; we have to put them
1787 * in an order that matches the output of the previous pipeline stage
1788 * (geometry or vertex shader).
1789 */
1790 struct brw_vue_map prev_stage_vue_map;
1791 brw_compute_vue_map(brw, &prev_stage_vue_map,
1792 key->input_slots_valid);
1793 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1794 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1795 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1796 slot++) {
1797 int varying = prev_stage_vue_map.slot_to_varying[slot];
1798 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1799 * unused.
1800 */
1801 if (varying != BRW_VARYING_SLOT_COUNT &&
1802 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1803 BITFIELD64_BIT(varying))) {
1804 prog_data->urb_setup[varying] = slot - first_slot;
1805 }
1806 }
1807 urb_next = prev_stage_vue_map.num_slots - first_slot;
1808 }
1809 } else {
1810 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1811 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1812 /* Point size is packed into the header, not as a general attribute */
1813 if (i == VARYING_SLOT_PSIZ)
1814 continue;
1815
1816 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1817 /* The back color slot is skipped when the front color is
1818 * also written to. In addition, some slots can be
1819 * written in the vertex shader and not read in the
1820 * fragment shader. So the register number must always be
1821 * incremented, mapped or not.
1822 */
1823 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1824 prog_data->urb_setup[i] = urb_next;
1825 urb_next++;
1826 }
1827 }
1828
1829 /*
1830    * It's an FS-only attribute, and we did interpolation for this attribute
1831    * in the SF thread.  So, count it here, too.
1832 *
1833 * See compile_sf_prog() for more info.
1834 */
1835 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1836 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1837 }
1838
1839 prog_data->num_varying_inputs = urb_next;
1840 }
1841
1842 void
1843 fs_visitor::assign_urb_setup()
1844 {
1845 assert(stage == MESA_SHADER_FRAGMENT);
1846 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1847
1848 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1849
1850 /* Offset all the urb_setup[] index by the actual position of the
1851 * setup regs, now that the location of the constants has been chosen.
1852 */
1853 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1854 if (inst->opcode == FS_OPCODE_LINTERP) {
1855 assert(inst->src[2].file == HW_REG);
1856 inst->src[2].fixed_hw_reg.nr += urb_start;
1857 }
1858
1859 if (inst->opcode == FS_OPCODE_CINTERP) {
1860 assert(inst->src[0].file == HW_REG);
1861 inst->src[0].fixed_hw_reg.nr += urb_start;
1862 }
1863 }
1864
1865 /* Each attribute is 4 setup channels, each of which is half a reg. */
1866 this->first_non_payload_grf =
1867 urb_start + prog_data->num_varying_inputs * 2;
1868 }
1869
1870 void
1871 fs_visitor::assign_vs_urb_setup()
1872 {
1873 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1874 int grf, count, slot, channel, attr;
1875
1876 assert(stage == MESA_SHADER_VERTEX);
1877 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1878 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1879 count++;
1880
1881 /* Each attribute is 4 regs. */
1882 this->first_non_payload_grf =
1883 payload.num_regs + prog_data->curb_read_length + count * 4;
1884
1885 unsigned vue_entries =
1886 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1887
1888 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1889 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1890
1891 assert(vs_prog_data->base.urb_read_length <= 15);
1892
1893 /* Rewrite all ATTR file references to the hw grf that they land in. */
1894 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1895 for (int i = 0; i < inst->sources; i++) {
1896 if (inst->src[i].file == ATTR) {
1897
1898 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1899 slot = count - 1;
1900 } else {
1901                /* Attributes arrive in a contiguous block, ordered by their
1902 * gl_vert_attrib value. That means we can compute the slot
1903 * number for an attribute by masking out the enabled
1904 * attributes before it and counting the bits.
1905 */
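               /* Illustrative example: if inputs_read has bits 0, 3 and 7
                * set and attr == 7, BITFIELD64_MASK(7) keeps bits 0 and 3,
                * so _mesa_bitcount_64() returns 2 and the attribute lands
                * in slot 2.
                */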
1906 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1907 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1908 BITFIELD64_MASK(attr));
1909 }
1910
1911 channel = inst->src[i].reg_offset & 3;
1912
1913 grf = payload.num_regs +
1914 prog_data->curb_read_length +
1915 slot * 4 + channel;
1916
1917 inst->src[i].file = HW_REG;
1918 inst->src[i].fixed_hw_reg =
1919 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1920 }
1921 }
1922 }
1923 }
1924
1925 /**
1926 * Split large virtual GRFs into separate components if we can.
1927 *
1928 * This is mostly duplicated with what brw_fs_vector_splitting does,
1929 * but that's really conservative because it's afraid of doing
1930 * splitting that doesn't result in real progress after the rest of
1931 * the optimization phases, which would cause infinite looping in
1932 * optimization. We can do it once here, safely. This also has the
1933 * opportunity to split interpolated values, or maybe even uniforms,
1934 * which we don't have at the IR level.
1935 *
1936 * We want to split, because virtual GRFs are what we register
1937 * allocate and spill (due to contiguousness requirements for some
1938 * instructions), and they're what we naturally generate in the
1939 * codegen process, but most virtual GRFs don't actually need to be
1940 * contiguous sets of GRFs. If we split, we'll end up with reduced
1941 * live intervals and better dead code elimination and coalescing.
1942 */
1943 void
1944 fs_visitor::split_virtual_grfs()
1945 {
1946 int num_vars = this->alloc.count;
1947
1948 /* Count the total number of registers */
1949 int reg_count = 0;
1950 int vgrf_to_reg[num_vars];
1951 for (int i = 0; i < num_vars; i++) {
1952 vgrf_to_reg[i] = reg_count;
1953 reg_count += alloc.sizes[i];
1954 }
1955
1956 /* An array of "split points". For each register slot, this indicates
1957 * if this slot can be separated from the previous slot. Every time an
1958 * instruction uses multiple elements of a register (as a source or
1959 * destination), we mark the used slots as inseparable. Then we go
1960 * through and split the registers into the smallest pieces we can.
1961 */
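   /* Worked example (illustrative): for a 4-slot VGRF whose middle two
    * slots are only ever accessed together, split_points ends up as
    * {false, true, false, true}, which splits the VGRF into pieces of
    * sizes 1, 2 and 1.
    */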
1962 bool split_points[reg_count];
1963 memset(split_points, 0, sizeof(split_points));
1964
1965 /* Mark all used registers as fully splittable */
1966 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1967 if (inst->dst.file == GRF) {
1968 int reg = vgrf_to_reg[inst->dst.reg];
1969 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1970 split_points[reg + j] = true;
1971 }
1972
1973 for (int i = 0; i < inst->sources; i++) {
1974 if (inst->src[i].file == GRF) {
1975 int reg = vgrf_to_reg[inst->src[i].reg];
1976 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1977 split_points[reg + j] = true;
1978 }
1979 }
1980 }
1981
1982 if (brw->has_pln &&
1983 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1984 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1985 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1986 * Gen6, that was the only supported interpolation mode, and since Gen6,
1987 * delta_x and delta_y are in fixed hardware registers.
1988 */
1989 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1990 split_points[vgrf_to_reg[vgrf] + 1] = false;
1991 }
1992
1993 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1994 if (inst->dst.file == GRF) {
1995 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1996 for (int j = 1; j < inst->regs_written; j++)
1997 split_points[reg + j] = false;
1998 }
1999 for (int i = 0; i < inst->sources; i++) {
2000 if (inst->src[i].file == GRF) {
2001 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2002 for (int j = 1; j < inst->regs_read(i); j++)
2003 split_points[reg + j] = false;
2004 }
2005 }
2006 }
2007
2008 int new_virtual_grf[reg_count];
2009 int new_reg_offset[reg_count];
2010
2011 int reg = 0;
2012 for (int i = 0; i < num_vars; i++) {
2013       /* As a quick sanity check, the first slot is never a split point. */
2014 assert(split_points[reg] == false);
2015
2016 /* j = 0 case */
2017 new_reg_offset[reg] = 0;
2018 reg++;
2019 int offset = 1;
2020
2021 /* j > 0 case */
2022 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2023          /* If this is a split point, allocate a new virtual GRF for the
2024           * previous 'offset' registers and reset the offset to 0.
2025 */
2026 if (split_points[reg]) {
2027 assert(offset <= MAX_VGRF_SIZE);
2028 int grf = alloc.allocate(offset);
2029 for (int k = reg - offset; k < reg; k++)
2030 new_virtual_grf[k] = grf;
2031 offset = 0;
2032 }
2033 new_reg_offset[reg] = offset;
2034 offset++;
2035 reg++;
2036 }
2037
2038 /* The last one gets the original register number */
2039 assert(offset <= MAX_VGRF_SIZE);
2040 alloc.sizes[i] = offset;
2041 for (int k = reg - offset; k < reg; k++)
2042 new_virtual_grf[k] = i;
2043 }
2044 assert(reg == reg_count);
2045
2046 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2047 if (inst->dst.file == GRF) {
2048 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2049 inst->dst.reg = new_virtual_grf[reg];
2050 inst->dst.reg_offset = new_reg_offset[reg];
2051 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2052 }
2053 for (int i = 0; i < inst->sources; i++) {
2054 if (inst->src[i].file == GRF) {
2055 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2056 inst->src[i].reg = new_virtual_grf[reg];
2057 inst->src[i].reg_offset = new_reg_offset[reg];
2058 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2059 }
2060 }
2061 }
2062 invalidate_live_intervals();
2063 }
2064
2065 /**
2066 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2067 *
2068 * During code generation, we create tons of temporary variables, many of
2069 * which get immediately killed and are never used again. Yet, in later
2070 * optimization and analysis passes, such as compute_live_intervals, we need
2071 * to loop over all the virtual GRFs. Compacting them can save a lot of
2072 * overhead.
2073 */
2074 bool
2075 fs_visitor::compact_virtual_grfs()
2076 {
2077 bool progress = false;
2078 int remap_table[this->alloc.count];
2079 memset(remap_table, -1, sizeof(remap_table));
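   /* remap_table[i] == -1 means VGRF i is unused; otherwise it will hold
    * the compacted index.  For example, with four VGRFs of which only the
    * second is dead, the table ends up as {0, -1, 1, 2}.
    */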
2080
2081 /* Mark which virtual GRFs are used. */
2082 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2083 if (inst->dst.file == GRF)
2084 remap_table[inst->dst.reg] = 0;
2085
2086 for (int i = 0; i < inst->sources; i++) {
2087 if (inst->src[i].file == GRF)
2088 remap_table[inst->src[i].reg] = 0;
2089 }
2090 }
2091
2092 /* Compact the GRF arrays. */
2093 int new_index = 0;
2094 for (unsigned i = 0; i < this->alloc.count; i++) {
2095 if (remap_table[i] == -1) {
2096 /* We just found an unused register. This means that we are
2097 * actually going to compact something.
2098 */
2099 progress = true;
2100 } else {
2101 remap_table[i] = new_index;
2102 alloc.sizes[new_index] = alloc.sizes[i];
2103 invalidate_live_intervals();
2104 ++new_index;
2105 }
2106 }
2107
2108 this->alloc.count = new_index;
2109
2110 /* Patch all the instructions to use the newly renumbered registers */
2111 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2112 if (inst->dst.file == GRF)
2113 inst->dst.reg = remap_table[inst->dst.reg];
2114
2115 for (int i = 0; i < inst->sources; i++) {
2116 if (inst->src[i].file == GRF)
2117 inst->src[i].reg = remap_table[inst->src[i].reg];
2118 }
2119 }
2120
2121 /* Patch all the references to delta_x/delta_y, since they're used in
2122 * register allocation. If they're unused, switch them to BAD_FILE so
2123 * we don't think some random VGRF is delta_x/delta_y.
2124 */
2125 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2126 if (delta_x[i].file == GRF) {
2127 if (remap_table[delta_x[i].reg] != -1) {
2128 delta_x[i].reg = remap_table[delta_x[i].reg];
2129 } else {
2130 delta_x[i].file = BAD_FILE;
2131 }
2132 }
2133 }
2134 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2135 if (delta_y[i].file == GRF) {
2136 if (remap_table[delta_y[i].reg] != -1) {
2137 delta_y[i].reg = remap_table[delta_y[i].reg];
2138 } else {
2139 delta_y[i].file = BAD_FILE;
2140 }
2141 }
2142 }
2143
2144 return progress;
2145 }
2146
2147 /*
2148 * Implements array access of uniforms by inserting a
2149 * PULL_CONSTANT_LOAD instruction.
2150 *
2151  * Unlike temporary GRF array access (which we don't support, due to
2152  * the difficulty of doing relative addressing on instruction
2153 * destinations), we could potentially do array access of uniforms
2154 * that were loaded in GRF space as push constants. In real-world
2155 * usage we've seen, though, the arrays being used are always larger
2156 * than we could load as push constants, so just always move all
2157 * uniform array access out to a pull constant buffer.
2158 */
2159 void
2160 fs_visitor::move_uniform_array_access_to_pull_constants()
2161 {
2162 if (dispatch_width != 8)
2163 return;
2164
2165 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2166 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2167
2168 /* Walk through and find array access of uniforms. Put a copy of that
2169 * uniform in the pull constant buffer.
2170 *
2171 * Note that we don't move constant-indexed accesses to arrays. No
2172 * testing has been done of the performance impact of this choice.
2173 */
2174 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2175 for (int i = 0 ; i < inst->sources; i++) {
2176 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2177 continue;
2178
2179 int uniform = inst->src[i].reg;
2180
2181 /* If this array isn't already present in the pull constant buffer,
2182 * add it.
2183 */
2184 if (pull_constant_loc[uniform] == -1) {
2185 const gl_constant_value **values = &stage_prog_data->param[uniform];
2186
2187 assert(param_size[uniform]);
2188
2189 for (int j = 0; j < param_size[uniform]; j++) {
2190 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2191
2192 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2193 values[j];
2194 }
2195 }
2196 }
2197 }
2198 }
2199
2200 /**
2201 * Assign UNIFORM file registers to either push constants or pull constants.
2202 *
2203  * We allow a fragment shader to have more than the specified minimum
2204  * maximum number of fragment shader uniform components (64).  If there
2205  * are too many of these, they would fill up all of the register space,
2206  * so this pass pushes some of them out to the pull constant buffer and
2207  * updates the program to load them from there.
2208 */
2209 void
2210 fs_visitor::assign_constant_locations()
2211 {
2212 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2213 if (dispatch_width != 8)
2214 return;
2215
2216 /* Find which UNIFORM registers are still in use. */
2217 bool is_live[uniforms];
2218 for (unsigned int i = 0; i < uniforms; i++) {
2219 is_live[i] = false;
2220 }
2221
2222 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2223 for (int i = 0; i < inst->sources; i++) {
2224 if (inst->src[i].file != UNIFORM)
2225 continue;
2226
2227 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2228 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2229 is_live[constant_nr] = true;
2230 }
2231 }
2232
2233 /* Only allow 16 registers (128 uniform components) as push constants.
2234 *
2235 * Just demote the end of the list. We could probably do better
2236 * here, demoting things that are rarely used in the program first.
2237 *
2238 * If changing this value, note the limitation about total_regs in
2239 * brw_curbe.c.
2240 */
2241 unsigned int max_push_components = 16 * 8;
2242 unsigned int num_push_constants = 0;
2243
2244 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2245
2246 for (unsigned int i = 0; i < uniforms; i++) {
2247 if (!is_live[i] || pull_constant_loc[i] != -1) {
2248 /* This UNIFORM register is either dead, or has already been demoted
2249 * to a pull const. Mark it as no longer living in the param[] array.
2250 */
2251 push_constant_loc[i] = -1;
2252 continue;
2253 }
2254
2255 if (num_push_constants < max_push_components) {
2256          /* Retain as a push constant.  Record the location in the param[]
2257 * array.
2258 */
2259 push_constant_loc[i] = num_push_constants++;
2260 } else {
2261 /* Demote to a pull constant. */
2262 push_constant_loc[i] = -1;
2263
2264 int pull_index = stage_prog_data->nr_pull_params++;
2265 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2266 pull_constant_loc[i] = pull_index;
2267 }
2268 }
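   /* At this point every live uniform has exactly one home: either
    * push_constant_loc[i] >= 0 (uploaded as a push constant) or
    * pull_constant_loc[i] >= 0 (loaded from the pull constant buffer).
    */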
2269
2270 stage_prog_data->nr_params = num_push_constants;
2271
2272 /* Up until now, the param[] array has been indexed by reg + reg_offset
2273 * of UNIFORM registers. Condense it to only contain the uniforms we
2274 * chose to upload as push constants.
2275 */
2276 for (unsigned int i = 0; i < uniforms; i++) {
2277 int remapped = push_constant_loc[i];
2278
2279 if (remapped == -1)
2280 continue;
2281
2282 assert(remapped <= (int)i);
2283 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2284 }
2285 }
2286
2287 /**
2288 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2289 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2290 */
2291 void
2292 fs_visitor::demote_pull_constants()
2293 {
2294 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2295 for (int i = 0; i < inst->sources; i++) {
2296 if (inst->src[i].file != UNIFORM)
2297 continue;
2298
2299 int pull_index;
2300 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2301 if (location >= uniforms) /* Out of bounds access */
2302 pull_index = -1;
2303 else
2304 pull_index = pull_constant_loc[location];
2305
2306 if (pull_index == -1)
2307 continue;
2308
2309          /* Set up the annotation tracking for newly generated instructions. */
2310 base_ir = inst->ir;
2311 current_annotation = inst->annotation;
2312
2313 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2314 fs_reg dst = vgrf(glsl_type::float_type);
2315
2316 /* Generate a pull load into dst. */
2317 if (inst->src[i].reladdr) {
2318 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2319 surf_index,
2320 *inst->src[i].reladdr,
2321 pull_index);
2322 inst->insert_before(block, &list);
2323 inst->src[i].reladdr = NULL;
2324 } else {
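            /* The byte offset is rounded down to a vec4 (16-byte) boundary;
             * set_smear() below then picks the matching component out of
             * the loaded vec4.
             */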
2325 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2326 fs_inst *pull =
2327 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2328 dst, surf_index, offset);
2329 inst->insert_before(block, pull);
2330 inst->src[i].set_smear(pull_index & 3);
2331 }
2332
2333 /* Rewrite the instruction to use the temporary VGRF. */
2334 inst->src[i].file = GRF;
2335 inst->src[i].reg = dst.reg;
2336 inst->src[i].reg_offset = 0;
2337 inst->src[i].width = dispatch_width;
2338 }
2339 }
2340 invalidate_live_intervals();
2341 }
2342
2343 bool
2344 fs_visitor::opt_algebraic()
2345 {
2346 bool progress = false;
2347
2348 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2349 switch (inst->opcode) {
2350 case BRW_OPCODE_MOV:
2351 if (inst->src[0].file != IMM)
2352 break;
2353
2354 if (inst->saturate) {
2355 if (inst->dst.type != inst->src[0].type)
2356 assert(!"unimplemented: saturate mixed types");
2357
2358 if (brw_saturate_immediate(inst->dst.type,
2359 &inst->src[0].fixed_hw_reg)) {
2360 inst->saturate = false;
2361 progress = true;
2362 }
2363 }
2364 break;
2365
2366 case BRW_OPCODE_MUL:
2367 if (inst->src[1].file != IMM)
2368 continue;
2369
2370 /* a * 1.0 = a */
2371 if (inst->src[1].is_one()) {
2372 inst->opcode = BRW_OPCODE_MOV;
2373 inst->src[1] = reg_undef;
2374 progress = true;
2375 break;
2376 }
2377
2378 /* a * -1.0 = -a */
2379 if (inst->src[1].is_negative_one()) {
2380 inst->opcode = BRW_OPCODE_MOV;
2381 inst->src[0].negate = !inst->src[0].negate;
2382 inst->src[1] = reg_undef;
2383 progress = true;
2384 break;
2385 }
2386
2387 /* a * 0.0 = 0.0 */
2388 if (inst->src[1].is_zero()) {
2389 inst->opcode = BRW_OPCODE_MOV;
2390 inst->src[0] = inst->src[1];
2391 inst->src[1] = reg_undef;
2392 progress = true;
2393 break;
2394 }
2395
2396 if (inst->src[0].file == IMM) {
2397 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2398 inst->opcode = BRW_OPCODE_MOV;
2399 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2400 inst->src[1] = reg_undef;
2401 progress = true;
2402 break;
2403 }
2404 break;
2405 case BRW_OPCODE_ADD:
2406 if (inst->src[1].file != IMM)
2407 continue;
2408
2409 /* a + 0.0 = a */
2410 if (inst->src[1].is_zero()) {
2411 inst->opcode = BRW_OPCODE_MOV;
2412 inst->src[1] = reg_undef;
2413 progress = true;
2414 break;
2415 }
2416
2417 if (inst->src[0].file == IMM) {
2418 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2419 inst->opcode = BRW_OPCODE_MOV;
2420 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2421 inst->src[1] = reg_undef;
2422 progress = true;
2423 break;
2424 }
2425 break;
2426 case BRW_OPCODE_OR:
2427 if (inst->src[0].equals(inst->src[1])) {
2428 inst->opcode = BRW_OPCODE_MOV;
2429 inst->src[1] = reg_undef;
2430 progress = true;
2431 break;
2432 }
2433 break;
2434 case BRW_OPCODE_LRP:
2435 if (inst->src[1].equals(inst->src[2])) {
2436 inst->opcode = BRW_OPCODE_MOV;
2437 inst->src[0] = inst->src[1];
2438 inst->src[1] = reg_undef;
2439 inst->src[2] = reg_undef;
2440 progress = true;
2441 break;
2442 }
2443 break;
2444 case BRW_OPCODE_CMP:
2445 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2446 inst->src[0].abs &&
2447 inst->src[0].negate &&
2448 inst->src[1].is_zero()) {
2449 inst->src[0].abs = false;
2450 inst->src[0].negate = false;
2451 inst->conditional_mod = BRW_CONDITIONAL_Z;
2452 progress = true;
2453 break;
2454 }
2455 break;
2456 case BRW_OPCODE_SEL:
2457 if (inst->src[0].equals(inst->src[1])) {
2458 inst->opcode = BRW_OPCODE_MOV;
2459 inst->src[1] = reg_undef;
2460 inst->predicate = BRW_PREDICATE_NONE;
2461 inst->predicate_inverse = false;
2462 progress = true;
2463 } else if (inst->saturate && inst->src[1].file == IMM) {
2464 switch (inst->conditional_mod) {
2465 case BRW_CONDITIONAL_LE:
2466 case BRW_CONDITIONAL_L:
2467 switch (inst->src[1].type) {
2468 case BRW_REGISTER_TYPE_F:
2469 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2470 inst->opcode = BRW_OPCODE_MOV;
2471 inst->src[1] = reg_undef;
2472 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2473 progress = true;
2474 }
2475 break;
2476 default:
2477 break;
2478 }
2479 break;
2480 case BRW_CONDITIONAL_GE:
2481 case BRW_CONDITIONAL_G:
2482 switch (inst->src[1].type) {
2483 case BRW_REGISTER_TYPE_F:
2484 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2485 inst->opcode = BRW_OPCODE_MOV;
2486 inst->src[1] = reg_undef;
2487 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2488 progress = true;
2489 }
2490 break;
2491 default:
2492 break;
2493 }
2494 default:
2495 break;
2496 }
2497 }
2498 break;
2499 case BRW_OPCODE_MAD:
2500 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2501 inst->opcode = BRW_OPCODE_MOV;
2502 inst->src[1] = reg_undef;
2503 inst->src[2] = reg_undef;
2504 progress = true;
2505 } else if (inst->src[0].is_zero()) {
2506 inst->opcode = BRW_OPCODE_MUL;
2507 inst->src[0] = inst->src[2];
2508 inst->src[2] = reg_undef;
2509 progress = true;
2510 } else if (inst->src[1].is_one()) {
2511 inst->opcode = BRW_OPCODE_ADD;
2512 inst->src[1] = inst->src[2];
2513 inst->src[2] = reg_undef;
2514 progress = true;
2515 } else if (inst->src[2].is_one()) {
2516 inst->opcode = BRW_OPCODE_ADD;
2517 inst->src[2] = reg_undef;
2518 progress = true;
2519 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2520 inst->opcode = BRW_OPCODE_ADD;
2521 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2522 inst->src[2] = reg_undef;
2523 progress = true;
2524 }
2525 break;
2526 case SHADER_OPCODE_RCP: {
2527 fs_inst *prev = (fs_inst *)inst->prev;
2528 if (prev->opcode == SHADER_OPCODE_SQRT) {
2529 if (inst->src[0].equals(prev->dst)) {
2530 inst->opcode = SHADER_OPCODE_RSQ;
2531 inst->src[0] = prev->src[0];
2532 progress = true;
2533 }
2534 }
2535 break;
2536 }
2537 default:
2538 break;
2539 }
2540
2541 /* Swap if src[0] is immediate. */
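      /* For commutative operations this moves the immediate into src[1],
       * so the src[1]-based simplifications above can match on a later
       * pass.
       */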
2542 if (progress && inst->is_commutative()) {
2543 if (inst->src[0].file == IMM) {
2544 fs_reg tmp = inst->src[1];
2545 inst->src[1] = inst->src[0];
2546 inst->src[0] = tmp;
2547 }
2548 }
2549 }
2550 return progress;
2551 }
2552
2553 bool
2554 fs_visitor::opt_register_renaming()
2555 {
2556 bool progress = false;
2557 int depth = 0;
2558
2559 int remap[alloc.count];
2560 memset(remap, -1, sizeof(int) * alloc.count);
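   /* remap[r] == -1: VGRF r hasn't seen a complete write yet;
    * remap[r] == r: its first complete write was seen and kept;
    * any other value: later complete writes were renamed to a fresh VGRF.
    */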
2561
2562 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2563 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2564 depth++;
2565 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2566 inst->opcode == BRW_OPCODE_WHILE) {
2567 depth--;
2568 }
2569
2570 /* Rewrite instruction sources. */
2571 for (int i = 0; i < inst->sources; i++) {
2572 if (inst->src[i].file == GRF &&
2573 remap[inst->src[i].reg] != -1 &&
2574 remap[inst->src[i].reg] != inst->src[i].reg) {
2575 inst->src[i].reg = remap[inst->src[i].reg];
2576 progress = true;
2577 }
2578 }
2579
2580 const int dst = inst->dst.reg;
2581
2582 if (depth == 0 &&
2583 inst->dst.file == GRF &&
2584 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2585 !inst->is_partial_write()) {
2586 if (remap[dst] == -1) {
2587 remap[dst] = dst;
2588 } else {
2589 remap[dst] = alloc.allocate(inst->dst.width / 8);
2590 inst->dst.reg = remap[dst];
2591 progress = true;
2592 }
2593 } else if (inst->dst.file == GRF &&
2594 remap[dst] != -1 &&
2595 remap[dst] != dst) {
2596 inst->dst.reg = remap[dst];
2597 progress = true;
2598 }
2599 }
2600
2601 if (progress) {
2602 invalidate_live_intervals();
2603
2604 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2605 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2606 delta_x[i].reg = remap[delta_x[i].reg];
2607 }
2608 }
2609 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2610 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2611 delta_y[i].reg = remap[delta_y[i].reg];
2612 }
2613 }
2614 }
2615
2616 return progress;
2617 }
2618
2619 /**
2620 * Remove redundant or useless discard jumps.
2621 *
2622 * For example, we can eliminate jumps in the following sequence:
2623 *
2624 * discard-jump (redundant with the next jump)
2625 * discard-jump (useless; jumps to the next instruction)
2626 * placeholder-halt
2627 */
2628 bool
2629 fs_visitor::opt_redundant_discard_jumps()
2630 {
2631 bool progress = false;
2632
2633 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2634
2635 fs_inst *placeholder_halt = NULL;
2636 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2637 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2638 placeholder_halt = inst;
2639 break;
2640 }
2641 }
2642
2643 if (!placeholder_halt)
2644 return false;
2645
2646 /* Delete any HALTs immediately before the placeholder halt. */
2647 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2648 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2649 prev = (fs_inst *) placeholder_halt->prev) {
2650 prev->remove(last_bblock);
2651 progress = true;
2652 }
2653
2654 if (progress)
2655 invalidate_live_intervals();
2656
2657 return progress;
2658 }
2659
2660 bool
2661 fs_visitor::compute_to_mrf()
2662 {
2663 bool progress = false;
2664 int next_ip = 0;
2665
2666 /* No MRFs on Gen >= 7. */
2667 if (brw->gen >= 7)
2668 return false;
2669
2670 calculate_live_intervals();
2671
2672 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2673 int ip = next_ip;
2674 next_ip++;
2675
2676 if (inst->opcode != BRW_OPCODE_MOV ||
2677 inst->is_partial_write() ||
2678 inst->dst.file != MRF || inst->src[0].file != GRF ||
2679 inst->dst.type != inst->src[0].type ||
2680 inst->src[0].abs || inst->src[0].negate ||
2681 !inst->src[0].is_contiguous() ||
2682 inst->src[0].subreg_offset)
2683 continue;
2684
2685 /* Work out which hardware MRF registers are written by this
2686 * instruction.
2687 */
2688 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2689 int mrf_high;
2690 if (inst->dst.reg & BRW_MRF_COMPR4) {
2691 mrf_high = mrf_low + 4;
2692 } else if (inst->exec_size == 16) {
2693 mrf_high = mrf_low + 1;
2694 } else {
2695 mrf_high = mrf_low;
2696 }
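      /* Assumption for illustration: with BRW_MRF_COMPR4 a SIMD16 write is
       * split across MRFs m and m+4 rather than m and m+1, which is why
       * mrf_high is mrf_low + 4 above.
       */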
2697
2698 /* Can't compute-to-MRF this GRF if someone else was going to
2699 * read it later.
2700 */
2701 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2702 continue;
2703
2704 /* Found a move of a GRF to a MRF. Let's see if we can go
2705 * rewrite the thing that made this GRF to write into the MRF.
2706 */
2707 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2708 if (scan_inst->dst.file == GRF &&
2709 scan_inst->dst.reg == inst->src[0].reg) {
2710 /* Found the last thing to write our reg we want to turn
2711 * into a compute-to-MRF.
2712 */
2713
2714 /* If this one instruction didn't populate all the
2715 * channels, bail. We might be able to rewrite everything
2716 * that writes that reg, but it would require smarter
2717 * tracking to delay the rewriting until complete success.
2718 */
2719 if (scan_inst->is_partial_write())
2720 break;
2721
2722 /* Things returning more than one register would need us to
2723 * understand coalescing out more than one MOV at a time.
2724 */
2725 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2726 break;
2727
2728 /* SEND instructions can't have MRF as a destination. */
2729 if (scan_inst->mlen)
2730 break;
2731
2732 if (brw->gen == 6) {
2733 /* gen6 math instructions must have the destination be
2734 * GRF, so no compute-to-MRF for them.
2735 */
2736 if (scan_inst->is_math()) {
2737 break;
2738 }
2739 }
2740
2741 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2742 /* Found the creator of our MRF's source value. */
2743 scan_inst->dst.file = MRF;
2744 scan_inst->dst.reg = inst->dst.reg;
2745 scan_inst->saturate |= inst->saturate;
2746 inst->remove(block);
2747 progress = true;
2748 }
2749 break;
2750 }
2751
2752       /* We don't handle control flow here.  Most computation of
2753        * values that end up in MRFs happens shortly before the MRF
2754        * write anyway.
2755 */
2756 if (block->start() == scan_inst)
2757 break;
2758
2759 /* You can't read from an MRF, so if someone else reads our
2760 * MRF's source GRF that we wanted to rewrite, that stops us.
2761 */
2762 bool interfered = false;
2763 for (int i = 0; i < scan_inst->sources; i++) {
2764 if (scan_inst->src[i].file == GRF &&
2765 scan_inst->src[i].reg == inst->src[0].reg &&
2766 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2767 interfered = true;
2768 }
2769 }
2770 if (interfered)
2771 break;
2772
2773 if (scan_inst->dst.file == MRF) {
2774 /* If somebody else writes our MRF here, we can't
2775 * compute-to-MRF before that.
2776 */
2777 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2778 int scan_mrf_high;
2779
2780 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2781 scan_mrf_high = scan_mrf_low + 4;
2782 } else if (scan_inst->exec_size == 16) {
2783 scan_mrf_high = scan_mrf_low + 1;
2784 } else {
2785 scan_mrf_high = scan_mrf_low;
2786 }
2787
2788 if (mrf_low == scan_mrf_low ||
2789 mrf_low == scan_mrf_high ||
2790 mrf_high == scan_mrf_low ||
2791 mrf_high == scan_mrf_high) {
2792 break;
2793 }
2794 }
2795
2796 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2797 /* Found a SEND instruction, which means that there are
2798 * live values in MRFs from base_mrf to base_mrf +
2799 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2800 * above it.
2801 */
2802 if (mrf_low >= scan_inst->base_mrf &&
2803 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2804 break;
2805 }
2806 if (mrf_high >= scan_inst->base_mrf &&
2807 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2808 break;
2809 }
2810 }
2811 }
2812 }
2813
2814 if (progress)
2815 invalidate_live_intervals();
2816
2817 return progress;
2818 }
2819
2820 /**
2821 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2822 * instructions to FS_OPCODE_REP_FB_WRITE.
2823 */
2824 void
2825 fs_visitor::emit_repclear_shader()
2826 {
2827 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2828 int base_mrf = 1;
2829 int color_mrf = base_mrf + 2;
2830
2831 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2832 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2833 mov->force_writemask_all = true;
2834
2835 fs_inst *write;
2836 if (key->nr_color_regions == 1) {
2837 write = emit(FS_OPCODE_REP_FB_WRITE);
2838 write->saturate = key->clamp_fragment_color;
2839 write->base_mrf = color_mrf;
2840 write->target = 0;
2841 write->header_present = false;
2842 write->mlen = 1;
2843 } else {
2844 assume(key->nr_color_regions > 0);
2845 for (int i = 0; i < key->nr_color_regions; ++i) {
2846 write = emit(FS_OPCODE_REP_FB_WRITE);
2847 write->saturate = key->clamp_fragment_color;
2848 write->base_mrf = base_mrf;
2849 write->target = i;
2850 write->header_present = true;
2851 write->mlen = 3;
2852 }
2853 }
2854 write->eot = true;
2855
2856 calculate_cfg();
2857
2858 assign_constant_locations();
2859 assign_curb_setup();
2860
2861 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2862 assert(mov->src[0].file == HW_REG);
2863 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2864 }
2865
2866 /**
2867 * Walks through basic blocks, looking for repeated MRF writes and
2868 * removing the later ones.
2869 */
2870 bool
2871 fs_visitor::remove_duplicate_mrf_writes()
2872 {
2873 fs_inst *last_mrf_move[16];
2874 bool progress = false;
2875
2876 /* Need to update the MRF tracking for compressed instructions. */
2877 if (dispatch_width == 16)
2878 return false;
2879
2880 memset(last_mrf_move, 0, sizeof(last_mrf_move));
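   /* last_mrf_move[m] caches the most recent full (non-partial) MOV of a
    * GRF into MRF m, or NULL once that value can no longer be trusted
    * (control flow, the MRF or its source GRF being overwritten, or
    * implied SEND writes).
    */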
2881
2882 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2883 if (inst->is_control_flow()) {
2884 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2885 }
2886
2887 if (inst->opcode == BRW_OPCODE_MOV &&
2888 inst->dst.file == MRF) {
2889 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2890 if (prev_inst && inst->equals(prev_inst)) {
2891 inst->remove(block);
2892 progress = true;
2893 continue;
2894 }
2895 }
2896
2897 /* Clear out the last-write records for MRFs that were overwritten. */
2898 if (inst->dst.file == MRF) {
2899 last_mrf_move[inst->dst.reg] = NULL;
2900 }
2901
2902 if (inst->mlen > 0 && inst->base_mrf != -1) {
2903 /* Found a SEND instruction, which will include two or fewer
2904 * implied MRF writes. We could do better here.
2905 */
2906 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2907 last_mrf_move[inst->base_mrf + i] = NULL;
2908 }
2909 }
2910
2911 /* Clear out any MRF move records whose sources got overwritten. */
2912 if (inst->dst.file == GRF) {
2913 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2914 if (last_mrf_move[i] &&
2915 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2916 last_mrf_move[i] = NULL;
2917 }
2918 }
2919 }
2920
2921 if (inst->opcode == BRW_OPCODE_MOV &&
2922 inst->dst.file == MRF &&
2923 inst->src[0].file == GRF &&
2924 !inst->is_partial_write()) {
2925 last_mrf_move[inst->dst.reg] = inst;
2926 }
2927 }
2928
2929 if (progress)
2930 invalidate_live_intervals();
2931
2932 return progress;
2933 }
2934
2935 static void
2936 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2937 {
2938 /* Clear the flag for registers that actually got read (as expected). */
2939 for (int i = 0; i < inst->sources; i++) {
2940 int grf;
2941 if (inst->src[i].file == GRF) {
2942 grf = inst->src[i].reg;
2943 } else if (inst->src[i].file == HW_REG &&
2944 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2945 grf = inst->src[i].fixed_hw_reg.nr;
2946 } else {
2947 continue;
2948 }
2949
2950 if (grf >= first_grf &&
2951 grf < first_grf + grf_len) {
2952 deps[grf - first_grf] = false;
2953 if (inst->exec_size == 16)
2954 deps[grf - first_grf + 1] = false;
2955 }
2956 }
2957 }
2958
2959 /**
2960 * Implements this workaround for the original 965:
2961 *
2962 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2963 * check for post destination dependencies on this instruction, software
2964 * must ensure that there is no destination hazard for the case of ‘write
2965 * followed by a posted write’ shown in the following example.
2966 *
2967 * 1. mov r3 0
2968 * 2. send r3.xy <rest of send instruction>
2969 * 3. mov r2 r3
2970 *
2971 * Due to no post-destination dependency check on the ‘send’, the above
2972 * code sequence could have two instructions (1 and 2) in flight at the
2973 * same time that both consider ‘r3’ as the target of their final writes.
2974 */
2975 void
2976 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2977 fs_inst *inst)
2978 {
2979 int write_len = inst->regs_written;
2980 int first_write_grf = inst->dst.reg;
2981 bool needs_dep[BRW_MAX_MRF];
2982 assert(write_len < (int)sizeof(needs_dep) - 1);
2983
2984 memset(needs_dep, false, sizeof(needs_dep));
2985 memset(needs_dep, true, write_len);
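   /* needs_dep[i] tracks whether GRF first_write_grf + i still needs a
    * dependency-resolving read inserted before inst.
    */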
2986
2987 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2988
2989 /* Walk backwards looking for writes to registers we're writing which
2990 * aren't read since being written. If we hit the start of the program,
2991 * we assume that there are no outstanding dependencies on entry to the
2992 * program.
2993 */
2994 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2995 /* If we hit control flow, assume that there *are* outstanding
2996 * dependencies, and force their cleanup before our instruction.
2997 */
2998 if (block->start() == scan_inst) {
2999 for (int i = 0; i < write_len; i++) {
3000 if (needs_dep[i]) {
3001 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3002 }
3003 }
3004 return;
3005 }
3006
3007 /* We insert our reads as late as possible on the assumption that any
3008 * instruction but a MOV that might have left us an outstanding
3009 * dependency has more latency than a MOV.
3010 */
3011 if (scan_inst->dst.file == GRF) {
3012 for (int i = 0; i < scan_inst->regs_written; i++) {
3013 int reg = scan_inst->dst.reg + i;
3014
3015 if (reg >= first_write_grf &&
3016 reg < first_write_grf + write_len &&
3017 needs_dep[reg - first_write_grf]) {
3018 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3019 needs_dep[reg - first_write_grf] = false;
3020 if (scan_inst->exec_size == 16)
3021 needs_dep[reg - first_write_grf + 1] = false;
3022 }
3023 }
3024 }
3025
3026 /* Clear the flag for registers that actually got read (as expected). */
3027 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3028
3029 /* Continue the loop only if we haven't resolved all the dependencies */
3030 int i;
3031 for (i = 0; i < write_len; i++) {
3032 if (needs_dep[i])
3033 break;
3034 }
3035 if (i == write_len)
3036 return;
3037 }
3038 }
3039
3040 /**
3041 * Implements this workaround for the original 965:
3042 *
3043 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3044 * used as a destination register until after it has been sourced by an
3045  *    instruction with a different destination register."
3046 */
3047 void
3048 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3049 {
3050 int write_len = inst->regs_written;
3051 int first_write_grf = inst->dst.reg;
3052 bool needs_dep[BRW_MAX_MRF];
3053 assert(write_len < (int)sizeof(needs_dep) - 1);
3054
3055 memset(needs_dep, false, sizeof(needs_dep));
3056 memset(needs_dep, true, write_len);
3057 /* Walk forwards looking for writes to registers we're writing which aren't
3058 * read before being written.
3059 */
3060 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3061 /* If we hit control flow, force resolve all remaining dependencies. */
3062 if (block->end() == scan_inst) {
3063 for (int i = 0; i < write_len; i++) {
3064 if (needs_dep[i])
3065 scan_inst->insert_before(block,
3066 DEP_RESOLVE_MOV(first_write_grf + i));
3067 }
3068 return;
3069 }
3070
3071 /* Clear the flag for registers that actually got read (as expected). */
3072 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3073
3074 /* We insert our reads as late as possible since they're reading the
3075 * result of a SEND, which has massive latency.
3076 */
3077 if (scan_inst->dst.file == GRF &&
3078 scan_inst->dst.reg >= first_write_grf &&
3079 scan_inst->dst.reg < first_write_grf + write_len &&
3080 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3081 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3082 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3083 }
3084
3085 /* Continue the loop only if we haven't resolved all the dependencies */
3086 int i;
3087 for (i = 0; i < write_len; i++) {
3088 if (needs_dep[i])
3089 break;
3090 }
3091 if (i == write_len)
3092 return;
3093 }
3094 }
3095
3096 void
3097 fs_visitor::insert_gen4_send_dependency_workarounds()
3098 {
3099 if (brw->gen != 4 || brw->is_g4x)
3100 return;
3101
3102 bool progress = false;
3103
3104 /* Note that we're done with register allocation, so GRF fs_regs always
3105 * have a .reg_offset of 0.
3106 */
3107
3108 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3109 if (inst->mlen != 0 && inst->dst.file == GRF) {
3110 insert_gen4_pre_send_dependency_workarounds(block, inst);
3111 insert_gen4_post_send_dependency_workarounds(block, inst);
3112 progress = true;
3113 }
3114 }
3115
3116 if (progress)
3117 invalidate_live_intervals();
3118 }
3119
3120 /**
3121 * Turns the generic expression-style uniform pull constant load instruction
3122 * into a hardware-specific series of instructions for loading a pull
3123 * constant.
3124 *
3125 * The expression style allows the CSE pass before this to optimize out
3126 * repeated loads from the same offset, and gives the pre-register-allocation
3127 * scheduling full flexibility, while the conversion to native instructions
3128 * allows the post-register-allocation scheduler the best information
3129 * possible.
3130 *
3131 * Note that execution masking for setting up pull constant loads is special:
3132 * the channels that need to be written are unrelated to the current execution
3133 * mask, since a later instruction will use one of the result channels as a
3134 * source operand for all 8 or 16 of its channels.
3135 */
3136 void
3137 fs_visitor::lower_uniform_pull_constant_loads()
3138 {
3139 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3140 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3141 continue;
3142
3143 if (brw->gen >= 7) {
3144 /* The offset arg before was a vec4-aligned byte offset. We need to
3145 * turn it into a dword offset.
3146 */
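         /* For example, a byte offset of 32 becomes a dword offset of 8. */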
3147 fs_reg const_offset_reg = inst->src[1];
3148 assert(const_offset_reg.file == IMM &&
3149 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3150 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3151 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3152
3153 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3154 * Reserve space for the register.
3155 */
3156 if (brw->gen >= 9) {
3157 payload.reg_offset++;
3158 alloc.sizes[payload.reg] = 2;
3159 }
3160
3161 /* This is actually going to be a MOV, but since only the first dword
3162 * is accessed, we have a special opcode to do just that one. Note
3163 * that this needs to be an operation that will be considered a def
3164 * by live variable analysis, or register allocation will explode.
3165 */
3166 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3167 8, payload, const_offset_reg);
3168 setup->force_writemask_all = true;
3169
3170 setup->ir = inst->ir;
3171 setup->annotation = inst->annotation;
3172 inst->insert_before(block, setup);
3173
3174 /* Similarly, this will only populate the first 4 channels of the
3175 * result register (since we only use smear values from 0-3), but we
3176 * don't tell the optimizer.
3177 */
3178 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3179 inst->src[1] = payload;
3180
3181 invalidate_live_intervals();
3182 } else {
3183 /* Before register allocation, we didn't tell the scheduler about the
3184 * MRF we use. We know it's safe to use this MRF because nothing
3185 * else does except for register spill/unspill, which generates and
3186 * uses its MRF within a single IR instruction.
3187 */
3188 inst->base_mrf = 14;
3189 inst->mlen = 1;
3190 }
3191 }
3192 }
3193
3194 bool
3195 fs_visitor::lower_load_payload()
3196 {
3197 bool progress = false;
3198
3199 int vgrf_to_reg[alloc.count];
3200 int reg_count = 0;
3201 for (unsigned i = 0; i < alloc.count; ++i) {
3202 vgrf_to_reg[i] = reg_count;
3203 reg_count += alloc.sizes[i];
3204 }
3205
3206 struct {
3207 bool written:1; /* Whether this register has ever been written */
3208 bool force_writemask_all:1;
3209 bool force_sechalf:1;
3210 } metadata[reg_count];
3211 memset(metadata, 0, sizeof(metadata));
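   /* metadata[] records, per physical VGRF slot, whether it has been
    * written and with which execution controls, so the MOVs emitted below
    * can inherit matching force_sechalf / force_writemask_all flags from
    * their sources.
    */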
3212
3213 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3214 if (inst->dst.file == GRF) {
3215 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3216 bool force_sechalf = inst->force_sechalf &&
3217 !inst->force_writemask_all;
3218 bool toggle_sechalf = inst->dst.width == 16 &&
3219 type_sz(inst->dst.type) == 4 &&
3220 !inst->force_writemask_all;
3221 for (int i = 0; i < inst->regs_written; ++i) {
3222 metadata[dst_reg + i].written = true;
3223 metadata[dst_reg + i].force_sechalf = force_sechalf;
3224 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3225 force_sechalf = (toggle_sechalf != force_sechalf);
3226 }
3227 }
3228
3229 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3230 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3231 fs_reg dst = inst->dst;
3232
3233 for (int i = 0; i < inst->sources; i++) {
3234 dst.width = inst->src[i].effective_width;
3235 dst.type = inst->src[i].type;
3236
3237 if (inst->src[i].file == BAD_FILE) {
3238 /* Do nothing but otherwise increment as normal */
3239 } else if (dst.file == MRF &&
3240 dst.width == 8 &&
3241 brw->has_compr4 &&
3242 i + 4 < inst->sources &&
3243 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3244 fs_reg compr4_dst = dst;
3245 compr4_dst.reg += BRW_MRF_COMPR4;
3246 compr4_dst.width = 16;
3247 fs_reg compr4_src = inst->src[i];
3248 compr4_src.width = 16;
3249 fs_inst *mov = MOV(compr4_dst, compr4_src);
3250 mov->force_writemask_all = true;
3251 inst->insert_before(block, mov);
3252 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3253 inst->src[i + 4].file = BAD_FILE;
3254 } else {
3255 fs_inst *mov = MOV(dst, inst->src[i]);
3256 if (inst->src[i].file == GRF) {
3257 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3258 inst->src[i].reg_offset;
3259 mov->force_sechalf = metadata[src_reg].force_sechalf;
3260 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3261 } else {
3262 /* We don't have any useful metadata for immediates or
3263 * uniforms. Assume that any of the channels of the
3264 * destination may be used.
3265 */
3266 assert(inst->src[i].file == IMM ||
3267 inst->src[i].file == UNIFORM);
3268 mov->force_writemask_all = true;
3269 }
3270
3271 if (dst.file == GRF) {
3272 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3273 const bool force_writemask = mov->force_writemask_all;
3274 metadata[dst_reg].force_writemask_all = force_writemask;
3275 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3276 if (dst.width * type_sz(dst.type) > 32) {
3277 assert(!mov->force_sechalf);
3278 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3279 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3280 }
3281 }
3282
3283 inst->insert_before(block, mov);
3284 }
3285
3286 dst = offset(dst, 1);
3287 }
3288
3289 inst->remove(block);
3290 progress = true;
3291 }
3292 }
3293
3294 if (progress)
3295 invalidate_live_intervals();
3296
3297 return progress;
3298 }
3299
3300 void
3301 fs_visitor::dump_instructions()
3302 {
3303 dump_instructions(NULL);
3304 }
3305
3306 void
3307 fs_visitor::dump_instructions(const char *name)
3308 {
3309 FILE *file = stderr;
3310 if (name && geteuid() != 0) {
3311 file = fopen(name, "w");
3312 if (!file)
3313 file = stderr;
3314 }
3315
3316 if (cfg) {
3317 calculate_register_pressure();
3318 int ip = 0, max_pressure = 0;
3319 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3320 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3321 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3322 dump_instruction(inst, file);
3323 ip++;
3324 }
3325 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3326 } else {
3327 int ip = 0;
3328 foreach_in_list(backend_instruction, inst, &instructions) {
3329 fprintf(file, "%4d: ", ip++);
3330 dump_instruction(inst, file);
3331 }
3332 }
3333
3334 if (file != stderr) {
3335 fclose(file);
3336 }
3337 }
3338
3339 void
3340 fs_visitor::dump_instruction(backend_instruction *be_inst)
3341 {
3342 dump_instruction(be_inst, stderr);
3343 }
3344
3345 void
3346 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3347 {
3348 fs_inst *inst = (fs_inst *)be_inst;
3349
3350 if (inst->predicate) {
3351 fprintf(file, "(%cf0.%d) ",
3352 inst->predicate_inverse ? '-' : '+',
3353 inst->flag_subreg);
3354 }
3355
3356 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3357 if (inst->saturate)
3358 fprintf(file, ".sat");
3359 if (inst->conditional_mod) {
3360 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3361 if (!inst->predicate &&
3362 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3363 inst->opcode != BRW_OPCODE_IF &&
3364 inst->opcode != BRW_OPCODE_WHILE))) {
3365 fprintf(file, ".f0.%d", inst->flag_subreg);
3366 }
3367 }
3368 fprintf(file, "(%d) ", inst->exec_size);
3369
3370
3371 switch (inst->dst.file) {
3372 case GRF:
3373 fprintf(file, "vgrf%d", inst->dst.reg);
3374 if (inst->dst.width != dispatch_width)
3375 fprintf(file, "@%d", inst->dst.width);
3376 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3377 inst->dst.subreg_offset)
3378 fprintf(file, "+%d.%d",
3379 inst->dst.reg_offset, inst->dst.subreg_offset);
3380 break;
3381 case MRF:
3382 fprintf(file, "m%d", inst->dst.reg);
3383 break;
3384 case BAD_FILE:
3385 fprintf(file, "(null)");
3386 break;
3387 case UNIFORM:
3388 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3389 break;
3390 case ATTR:
3391 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3392 break;
3393 case HW_REG:
3394 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3395 switch (inst->dst.fixed_hw_reg.nr) {
3396 case BRW_ARF_NULL:
3397 fprintf(file, "null");
3398 break;
3399 case BRW_ARF_ADDRESS:
3400 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3401 break;
3402 case BRW_ARF_ACCUMULATOR:
3403 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3404 break;
3405 case BRW_ARF_FLAG:
3406 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3407 inst->dst.fixed_hw_reg.subnr);
3408 break;
3409 default:
3410 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3411 inst->dst.fixed_hw_reg.subnr);
3412 break;
3413 }
3414 } else {
3415 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3416 }
3417 if (inst->dst.fixed_hw_reg.subnr)
3418 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3419 break;
3420 default:
3421 fprintf(file, "???");
3422 break;
3423 }
3424 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3425
3426 for (int i = 0; i < inst->sources; i++) {
3427 if (inst->src[i].negate)
3428 fprintf(file, "-");
3429 if (inst->src[i].abs)
3430 fprintf(file, "|");
3431 switch (inst->src[i].file) {
3432 case GRF:
3433 fprintf(file, "vgrf%d", inst->src[i].reg);
3434 if (inst->src[i].width != dispatch_width)
3435 fprintf(file, "@%d", inst->src[i].width);
3436 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3437 inst->src[i].subreg_offset)
3438 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3439 inst->src[i].subreg_offset);
3440 break;
3441 case MRF:
3442 fprintf(file, "***m%d***", inst->src[i].reg);
3443 break;
3444 case ATTR:
3445 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3446 break;
3447 case UNIFORM:
3448 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3449 if (inst->src[i].reladdr) {
3450 fprintf(file, "+reladdr");
3451 } else if (inst->src[i].subreg_offset) {
3452 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3453 inst->src[i].subreg_offset);
3454 }
3455 break;
3456 case BAD_FILE:
3457 fprintf(file, "(null)");
3458 break;
3459 case IMM:
3460 switch (inst->src[i].type) {
3461 case BRW_REGISTER_TYPE_F:
3462 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3463 break;
3464 case BRW_REGISTER_TYPE_W:
3465 case BRW_REGISTER_TYPE_D:
3466 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3467 break;
3468 case BRW_REGISTER_TYPE_UW:
3469 case BRW_REGISTER_TYPE_UD:
3470 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3471 break;
3472 case BRW_REGISTER_TYPE_VF:
3473 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3474 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3475 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3476 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3477 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3478 break;
3479 default:
3480 fprintf(file, "???");
3481 break;
3482 }
3483 break;
3484 case HW_REG:
3485 if (inst->src[i].fixed_hw_reg.negate)
3486 fprintf(file, "-");
3487 if (inst->src[i].fixed_hw_reg.abs)
3488 fprintf(file, "|");
3489 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3490 switch (inst->src[i].fixed_hw_reg.nr) {
3491 case BRW_ARF_NULL:
3492 fprintf(file, "null");
3493 break;
3494 case BRW_ARF_ADDRESS:
3495 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3496 break;
3497 case BRW_ARF_ACCUMULATOR:
3498 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3499 break;
3500 case BRW_ARF_FLAG:
3501 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3502 inst->src[i].fixed_hw_reg.subnr);
3503 break;
3504 default:
3505 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3506 inst->src[i].fixed_hw_reg.subnr);
3507 break;
3508 }
3509 } else {
3510 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3511 }
3512 if (inst->src[i].fixed_hw_reg.subnr)
3513 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3514 if (inst->src[i].fixed_hw_reg.abs)
3515 fprintf(file, "|");
3516 break;
3517 default:
3518 fprintf(file, "???");
3519 break;
3520 }
3521 if (inst->src[i].abs)
3522 fprintf(file, "|");
3523
3524 if (inst->src[i].file != IMM) {
3525 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3526 }
3527
3528 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3529 fprintf(file, ", ");
3530 }
3531
3532 fprintf(file, " ");
3533
3534 if (dispatch_width == 16 && inst->exec_size == 8) {
3535 if (inst->force_sechalf)
3536 fprintf(file, "2ndhalf ");
3537 else
3538 fprintf(file, "1sthalf ");
3539 }
3540
3541 fprintf(file, "\n");
3542 }
3543
3544 /**
3545 * Possibly returns an instruction that set up @param reg.
3546 *
3547 * Sometimes we want to take the result of some expression/variable
3548 * dereference tree and rewrite the instruction generating the result
3549 * of the tree. When processing the tree, we know that the
3550 * instructions generated are all writing temporaries that are dead
3551 * outside of this tree. So, if we have some instructions that write
3552 * a temporary, we're free to point that temp write somewhere else.
3553 *
3554  * Note that this doesn't guarantee that the returned instruction only
3555  * generated reg -- it might be the size=4 destination of a texture instruction.
3556 */
3557 fs_inst *
3558 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3559 fs_inst *end,
3560 const fs_reg &reg)
3561 {
3562 if (end == start ||
3563 end->is_partial_write() ||
3564 reg.reladdr ||
3565 !reg.equals(end->dst)) {
3566 return NULL;
3567 } else {
3568 return end;
3569 }
3570 }
3571
3572 void
3573 fs_visitor::setup_payload_gen6()
3574 {
3575 bool uses_depth =
3576 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3577 unsigned barycentric_interp_modes =
3578 (stage == MESA_SHADER_FRAGMENT) ?
3579 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3580
3581 assert(brw->gen >= 6);
3582
3583 /* R0-1: masks, pixel X/Y coordinates. */
3584 payload.num_regs = 2;
3585    /* R2: only for 32-pixel dispatch. */
3586
3587 /* R3-26: barycentric interpolation coordinates. These appear in the
3588 * same order that they appear in the brw_wm_barycentric_interp_mode
3589 * enum. Each set of coordinates occupies 2 registers if dispatch width
3590 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3591 * appear if they were enabled using the "Barycentric Interpolation
3592 * Mode" bits in WM_STATE.
3593 */
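   /* For example, with two barycentric modes enabled in SIMD16 this loop
    * advances payload.num_regs by 8 (4 registers per mode).
    */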
3594 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3595 if (barycentric_interp_modes & (1 << i)) {
3596 payload.barycentric_coord_reg[i] = payload.num_regs;
3597 payload.num_regs += 2;
3598 if (dispatch_width == 16) {
3599 payload.num_regs += 2;
3600 }
3601 }
3602 }
3603
3604 /* R27: interpolated depth if uses source depth */
3605 if (uses_depth) {
3606 payload.source_depth_reg = payload.num_regs;
3607 payload.num_regs++;
3608 if (dispatch_width == 16) {
3609 /* R28: interpolated depth if not SIMD8. */
3610 payload.num_regs++;
3611 }
3612 }
3613 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3614 if (uses_depth) {
3615 payload.source_w_reg = payload.num_regs;
3616 payload.num_regs++;
3617 if (dispatch_width == 16) {
3618 /* R30: interpolated W if not SIMD8. */
3619 payload.num_regs++;
3620 }
3621 }
3622
3623 if (stage == MESA_SHADER_FRAGMENT) {
3624 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3625 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3626 prog_data->uses_pos_offset = key->compute_pos_offset;
3627 /* R31: MSAA position offsets. */
3628 if (prog_data->uses_pos_offset) {
3629 payload.sample_pos_reg = payload.num_regs;
3630 payload.num_regs++;
3631 }
3632 }
3633
3634 /* R32: MSAA input coverage mask */
3635 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3636 assert(brw->gen >= 7);
3637 payload.sample_mask_in_reg = payload.num_regs;
3638 payload.num_regs++;
3639 if (dispatch_width == 16) {
3640 /* R33: input coverage mask if not SIMD8. */
3641 payload.num_regs++;
3642 }
3643 }
3644
3645 /* R34-: bary for 32-pixel. */
3646 /* R58-59: interp W for 32-pixel. */
3647
3648 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3649 source_depth_to_render_target = true;
3650 }
3651 }
3652
3653 void
3654 fs_visitor::setup_vs_payload()
3655 {
3656 /* R0: thread header, R1: urb handles */
3657 payload.num_regs = 2;
3658 }
3659
3660 void
3661 fs_visitor::assign_binding_table_offsets()
3662 {
3663 assert(stage == MESA_SHADER_FRAGMENT);
3664 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3665 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3666 uint32_t next_binding_table_offset = 0;
3667
3668 /* If there are no color regions, we still perform an FB write to a null
3669 * renderbuffer, which we place at surface index 0.
3670 */
3671 prog_data->binding_table.render_target_start = next_binding_table_offset;
3672 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3673
3674 assign_common_binding_table_offsets(next_binding_table_offset);
3675 }
3676
3677 void
3678 fs_visitor::calculate_register_pressure()
3679 {
3680 invalidate_live_intervals();
3681 calculate_live_intervals();
3682
3683 unsigned num_instructions = 0;
3684 foreach_block(block, cfg)
3685 num_instructions += block->instructions.length();
3686
3687 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3688
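   /* regs_live_at_ip[ip] accumulates the total VGRF size (in registers)
    * live at instruction ip; e.g. two live 2-register VGRFs give a
    * pressure of 4.
    */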
3689 for (unsigned reg = 0; reg < alloc.count; reg++) {
3690 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3691 regs_live_at_ip[ip] += alloc.sizes[reg];
3692 }
3693 }
3694
3695 void
3696 fs_visitor::optimize()
3697 {
3698 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3699
3700 split_virtual_grfs();
3701
3702 move_uniform_array_access_to_pull_constants();
3703 assign_constant_locations();
3704 demote_pull_constants();
3705
3706 #define OPT(pass, args...) ({ \
3707 pass_num++; \
3708 bool this_progress = pass(args); \
3709 \
3710 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3711 char filename[64]; \
3712 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3713 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3714 \
3715 backend_visitor::dump_instructions(filename); \
3716 } \
3717 \
3718 progress = progress || this_progress; \
3719 this_progress; \
3720 })
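/* Each OPT() invocation bumps pass_num and, when DEBUG_OPTIMIZER is set in
 * INTEL_DEBUG and the pass made progress, dumps the instruction list to a
 * file named from the format above, e.g. something like
 * "fs8-0003-01-03-opt_cse" (stage, dispatch width, program name, iteration,
 * pass number). The macro evaluates to this_progress, so it can also gate
 * follow-up work, as in the lower_load_payload block below.
 */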
3721
3722 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3723 char filename[64];
3724 snprintf(filename, 64, "%s%d-%04d-00-start",
3725 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3726
3727 backend_visitor::dump_instructions(filename);
3728 }
3729
3730 bool progress;
3731 int iteration = 0;
3732 int pass_num = 0;
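/* Iterate to a fixed point: one pass can expose work for another (e.g. copy
 * propagation can leave a MOV with an unread result that dead code
 * elimination then removes, which may in turn unlock further CSE), so the
 * whole list is re-run until an iteration reports no progress.
 */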
3733 do {
3734 progress = false;
3735 pass_num = 0;
3736 iteration++;
3737
3738 OPT(remove_duplicate_mrf_writes);
3739
3740 OPT(opt_algebraic);
3741 OPT(opt_cse);
3742 OPT(opt_copy_propagate);
3743 OPT(opt_peephole_predicated_break);
3744 OPT(opt_cmod_propagation);
3745 OPT(dead_code_eliminate);
3746 OPT(opt_peephole_sel);
3747 OPT(dead_control_flow_eliminate, this);
3748 OPT(opt_register_renaming);
3749 OPT(opt_redundant_discard_jumps);
3750 OPT(opt_saturate_propagation);
3751 OPT(register_coalesce);
3752 OPT(compute_to_mrf);
3753
3754 OPT(compact_virtual_grfs);
3755 } while (progress);
3756
3757 pass_num = 0;
3758
3759 if (OPT(lower_load_payload)) {
3760 split_virtual_grfs();
3761 OPT(register_coalesce);
3762 OPT(compute_to_mrf);
3763 OPT(dead_code_eliminate);
3764 }
3765
3766 OPT(opt_combine_constants);
3767
3768 lower_uniform_pull_constant_loads();
3769 }
3770
3771 /**
3772 * Three-source instructions must have a GRF/MRF destination register.
3773 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3774 */
3775 void
3776 fs_visitor::fixup_3src_null_dest()
3777 {
3778 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3779 if (inst->is_3src() && inst->dst.is_null()) {
3780 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3781 inst->dst.type);
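/* dispatch_width / 8 allocates one GRF at SIMD8 and two at SIMD16: just
 * enough for a full-width scratch destination that nothing ever reads.
 */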
3782 }
3783 }
3784 }
3785
3786 void
3787 fs_visitor::allocate_registers()
3788 {
3789 bool allocated_without_spills;
3790
3791 static const enum instruction_scheduler_mode pre_modes[] = {
3792 SCHEDULE_PRE,
3793 SCHEDULE_PRE_NON_LIFO,
3794 SCHEDULE_PRE_LIFO,
3795 };
3796
3797 /* Try each scheduling heuristic to see if it lets register allocation
3798 * succeed without spilling. The heuristics are ordered by decreasing
3799 * expected performance but increasing likelihood of allocating successfully.
3800 */
3801 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3802 schedule_instructions(pre_modes[i]);
3803
3804 if (0) {
3805 assign_regs_trivial();
3806 allocated_without_spills = true;
3807 } else {
3808 allocated_without_spills = assign_regs(false);
3809 }
3810 if (allocated_without_spills)
3811 break;
3812 }
3813
3814 if (!allocated_without_spills) {
3815 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3816 "Vertex" : "Fragment";
3817
3818 /* We assume that any spilling is worse than just dropping back to
3819 * SIMD8. In practice there is probably some intermediate point where
3820 * SIMD16 with a couple of spills is still better.
3821 */
3822 if (dispatch_width == 16) {
3823 fail("Failure to register allocate. Reduce number of "
3824 "live scalar values to avoid this.");
3825 } else {
3826 perf_debug("%s shader triggered register spilling. "
3827 "Try reducing the number of live scalar values to "
3828 "improve performance.\n", stage_name);
3829 }
3830
3831 /* Since we're out of heuristics, just go spill registers until we
3832 * get an allocation.
3833 */
3834 while (!assign_regs(true)) {
3835 if (failed)
3836 break;
3837 }
3838 }
3839
3840 /* This must come after all optimization and register allocation, since
3841 * it inserts dead code that happens to have side effects, and it does
3842 * so based on the actual physical registers in use.
3843 */
3844 insert_gen4_send_dependency_workarounds();
3845
3846 if (failed)
3847 return;
3848
3849 if (!allocated_without_spills)
3850 schedule_instructions(SCHEDULE_POST);
3851
3852 if (last_scratch > 0)
3853 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3854 }
3855
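/* Helper for boolean environment toggles such as INTEL_USE_NIR below:
 * "1", "true" and "yes" (the latter two case-insensitively) enable, "0",
 * "false" and "no" disable, and anything else, or an unset variable, falls
 * back to default_value.
 */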
3856 static bool
3857 env_var_as_boolean(const char *var_name, bool default_value)
3858 {
3859 const char *str = getenv(var_name);
3860 if (str == NULL)
3861 return default_value;
3862
3863 if (strcmp(str, "1") == 0 ||
3864 strcasecmp(str, "true") == 0 ||
3865 strcasecmp(str, "yes") == 0) {
3866 return true;
3867 } else if (strcmp(str, "0") == 0 ||
3868 strcasecmp(str, "false") == 0 ||
3869 strcasecmp(str, "no") == 0) {
3870 return false;
3871 } else {
3872 return default_value;
3873 }
3874 }
3875
3876 bool
3877 fs_visitor::run_vs()
3878 {
3879 assert(stage == MESA_SHADER_VERTEX);
3880
3881 assign_common_binding_table_offsets(0);
3882 setup_vs_payload();
3883
3884 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3885 emit_shader_time_begin();
3886
3887 if (env_var_as_boolean("INTEL_USE_NIR", false)) {
3888 emit_nir_code();
3889 } else {
3890 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3891 base_ir = ir;
3892 this->result = reg_undef;
3893 ir->accept(this);
3894 }
3895 base_ir = NULL;
3896 }
3897
3898 if (failed)
3899 return false;
3900
3901 emit_urb_writes();
3902
3903 calculate_cfg();
3904
3905 optimize();
3906
3907 assign_curb_setup();
3908 assign_vs_urb_setup();
3909
3910 fixup_3src_null_dest();
3911 allocate_registers();
3912
3913 return !failed;
3914 }
3915
3916 bool
3917 fs_visitor::run_fs()
3918 {
3919 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3920 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3921
3922 assert(stage == MESA_SHADER_FRAGMENT);
3923
3924 sanity_param_count = prog->Parameters->NumParameters;
3925
3926 assign_binding_table_offsets();
3927
3928 if (brw->gen >= 6)
3929 setup_payload_gen6();
3930 else
3931 setup_payload_gen4();
3932
3933 if (0) {
3934 emit_dummy_fs();
3935 } else if (brw->use_rep_send && dispatch_width == 16) {
3936 emit_repclear_shader();
3937 } else {
3938 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3939 emit_shader_time_begin();
3940
3941 calculate_urb_setup();
3942 if (prog->InputsRead > 0) {
3943 if (brw->gen < 6)
3944 emit_interpolation_setup_gen4();
3945 else
3946 emit_interpolation_setup_gen6();
3947 }
3948
3949 /* We handle discards by keeping track of the still-live pixels in f0.1.
3950 * Initialize it with the dispatched pixels.
3951 */
3952 if (wm_prog_data->uses_kill) {
3953 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3954 discard_init->flag_subreg = 1;
3955 }
3956
3957 /* Generate FS IR for main(). (The visitor only descends into
3958 * functions called "main".)
3959 */
3960 if (shader) {
3961 if (env_var_as_boolean("INTEL_USE_NIR", false)) {
3962 emit_nir_code();
3963 } else {
3964 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3965 base_ir = ir;
3966 this->result = reg_undef;
3967 ir->accept(this);
3968 }
3969 }
3970 } else {
3971 emit_fragment_program_code();
3972 }
3973 base_ir = NULL;
3974 if (failed)
3975 return false;
3976
3977 emit(FS_OPCODE_PLACEHOLDER_HALT);
3978
3979 if (wm_key->alpha_test_func)
3980 emit_alpha_test();
3981
3982 emit_fb_writes();
3983
3984 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3985 emit_shader_time_end();
3986
3987 calculate_cfg();
3988
3989 optimize();
3990
3991 assign_curb_setup();
3992 assign_urb_setup();
3993
3994 fixup_3src_null_dest();
3995 allocate_registers();
3996
3997 if (failed)
3998 return false;
3999 }
4000
4001 if (dispatch_width == 8)
4002 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4003 else
4004 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4005
4006 /* If any state parameters were appended, then ParameterValues could have
4007 * been realloced, in which case the driver uniform storage set up by
4008 * _mesa_associate_uniform_storage() would point to freed memory. Make
4009 * sure that didn't happen.
4010 */
4011 assert(sanity_param_count == prog->Parameters->NumParameters);
4012
4013 return !failed;
4014 }
4015
4016 const unsigned *
4017 brw_wm_fs_emit(struct brw_context *brw,
4018 void *mem_ctx,
4019 const struct brw_wm_prog_key *key,
4020 struct brw_wm_prog_data *prog_data,
4021 struct gl_fragment_program *fp,
4022 struct gl_shader_program *prog,
4023 unsigned *final_assembly_size)
4024 {
4025 bool start_busy = false;
4026 double start_time = 0;
4027
4028 if (unlikely(brw->perf_debug)) {
4029 start_busy = (brw->batch.last_bo &&
4030 drm_intel_bo_busy(brw->batch.last_bo));
4031 start_time = get_time();
4032 }
4033
4034 struct brw_shader *shader = NULL;
4035 if (prog)
4036 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4037
4038 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4039 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4040
4041 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4042 */
4043 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4044 if (!v.run_fs()) {
4045 if (prog) {
4046 prog->LinkStatus = false;
4047 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4048 }
4049
4050 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4051 v.fail_msg);
4052
4053 return NULL;
4054 }
4055
4056 cfg_t *simd16_cfg = NULL;
4057 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4058 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
4059 brw->use_rep_send)) {
4060 if (!v.simd16_unsupported) {
4061 /* Try a SIMD16 compile */
4062 v2.import_uniforms(&v);
4063 if (!v2.run_fs()) {
4064 perf_debug("SIMD16 shader failed to compile, falling back to "
4065 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4066 } else {
4067 simd16_cfg = v2.cfg;
4068 }
4069 } else {
4070 perf_debug("SIMD16 shader unsupported, falling back to "
4071 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4072 }
4073 }
4074
4075 cfg_t *simd8_cfg;
4076 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4077 if (no_simd8 && simd16_cfg) {
4078 simd8_cfg = NULL;
4079 prog_data->no_8 = true;
4080 } else {
4081 simd8_cfg = v.cfg;
4082 prog_data->no_8 = false;
4083 }
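/* For example, with DEBUG_NO8 set in INTEL_DEBUG (or brw->no_simd8) and a
 * successful SIMD16 compile, only the 16-wide program is emitted and
 * prog_data->no_8 records that, so later state setup can skip the missing
 * 8-wide variant.
 */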
4084
4085 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4086 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4087
4088 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4089 char *name;
4090 if (prog)
4091 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4092 prog->Label ? prog->Label : "unnamed",
4093 prog->Name);
4094 else
4095 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4096
4097 g.enable_debug(name);
4098 }
4099
4100 if (simd8_cfg)
4101 g.generate_code(simd8_cfg, 8);
4102 if (simd16_cfg)
4103 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4104
4105 if (unlikely(brw->perf_debug) && shader) {
4106 if (shader->compiled_once)
4107 brw_wm_debug_recompile(brw, prog, key);
4108 shader->compiled_once = true;
4109
4110 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4111 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4112 (get_time() - start_time) * 1000);
4113 }
4114 }
4115
4116 return g.get_assembly(final_assembly_size);
4117 }
4118
4119 extern "C" bool
4120 brw_fs_precompile(struct gl_context *ctx,
4121 struct gl_shader_program *shader_prog,
4122 struct gl_program *prog)
4123 {
4124 struct brw_context *brw = brw_context(ctx);
4125 struct brw_wm_prog_key key;
4126
4127 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4128 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4129 bool program_uses_dfdy = fp->UsesDFdy;
4130
4131 memset(&key, 0, sizeof(key));
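/* No draw-time state is available here, so fill the key with the most
 * likely values; if the real state differs when the program is actually
 * used, a different variant is compiled then (and brw_wm_debug_recompile
 * can report why).
 */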
4132
4133 if (brw->gen < 6) {
4134 if (fp->UsesKill)
4135 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4136
4137 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4138 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4139
4140 /* Just assume depth testing. */
4141 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4142 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4143 }
4144
4145 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4146 BRW_FS_VARYING_INPUT_MASK) > 16)
4147 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4148
4149 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4150 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4151 for (unsigned i = 0; i < sampler_count; i++) {
4152 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4153 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4154 key.tex.swizzles[i] =
4155 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4156 } else {
4157 /* Color sampler: assume no swizzling. */
4158 key.tex.swizzles[i] = SWIZZLE_XYZW;
4159 }
4160 }
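/* E.g. a shadow sampler on a pre-Haswell part gets swizzle XXX1, so the
 * depth-comparison result lands in RGB with alpha forced to 1, matching the
 * default DEPTH_TEXTURE_MODE behaviour.
 */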
4161
4162 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4163 key.drawable_height = ctx->DrawBuffer->Height;
4164 }
4165
4166 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4167 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4168 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4169
4170 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4171 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4172 key.nr_color_regions > 1;
4173 }
4174
4175 key.program_string_id = bfp->id;
4176
4177 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4178 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4179
4180 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
4181
4182 brw->wm.base.prog_offset = old_prog_offset;
4183 brw->wm.prog_data = old_prog_data;
4184
4185 return success;
4186 }