i965: Fix out-of-bounds accesses into pull_constant_loc array
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
138
139 this->writes_accumulator = false;
140 }
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(brw->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
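/* Illustrative usage sketch (not from the upstream file; register names here
 * are made up): the usual pattern is to CMP into the null register so only
 * the flag register is written, then predicate a following instruction on it:
 *
 *    emit(CMP(reg_null_d, value, fs_reg(0.0f), BRW_CONDITIONAL_GE));
 *    fs_inst *sel = emit(SEL(dst, a, b));
 *    sel->predicate = BRW_PREDICATE_NORMAL;
 */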
356
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
359 {
360 uint8_t exec_size = dst.width;
361 for (int i = 0; i < sources; ++i) {
362 assert(src[i].width % dst.width == 0);
363 if (src[i].width > exec_size)
364 exec_size = src[i].width;
365 }
366
367 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
368 dst, src, sources);
369 inst->regs_written = 0;
370 for (int i = 0; i < sources; ++i) {
371 /* The LOAD_PAYLOAD instruction only really makes sense if we are
372 * dealing with whole registers. If this ever changes, we can deal
373 * with it later.
374 */
375 int size = inst->src[i].effective_width * type_sz(src[i].type);
376 assert(size % 32 == 0);
377 inst->regs_written += (size + 31) / 32;
378 }
379
380 return inst;
381 }
382
383 exec_list
384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
385 const fs_reg &surf_index,
386 const fs_reg &varying_offset,
387 uint32_t const_offset)
388 {
389 exec_list instructions;
390 fs_inst *inst;
391
392 /* We have our constant surface use a pitch of 4 bytes, so our index can
393 * be any component of a vector, and then we load 4 contiguous
394 * components starting from that.
395 *
396 * We break down the const_offset to a portion added to the variable
397 * offset and a portion done using reg_offset, which means that if you
398 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
399 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
400 * CSE can later notice that those loads are all the same and eliminate
401 * the redundant ones.
402 */
403 fs_reg vec4_offset = vgrf(glsl_type::int_type);
404 instructions.push_tail(ADD(vec4_offset,
405 varying_offset, fs_reg(const_offset & ~3)));
406
407 int scale = 1;
408 if (brw->gen == 4 && dst.width == 8) {
409 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
410 * u, v, r) as parameters, or we can just use the SIMD16 message
411 * consisting of (header, u). We choose the second, at the cost of a
412 * longer return length.
413 */
414 scale = 2;
415 }
416
417 enum opcode op;
418 if (brw->gen >= 7)
419 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
420 else
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
422
423 assert(dst.width % 8 == 0);
424 int regs_written = 4 * (dst.width / 8) * scale;
425 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
426 dst.type, dst.width);
427 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
428 inst->regs_written = regs_written;
429 instructions.push_tail(inst);
430
431 if (brw->gen < 7) {
432 inst->base_mrf = 13;
433 inst->header_present = true;
434 if (brw->gen == 4)
435 inst->mlen = 3;
436 else
437 inst->mlen = 1 + dispatch_width / 8;
438 }
439
440 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
441 instructions.push_tail(MOV(dst, result));
442
443 return instructions;
444 }
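/* Worked example of the offset split above (illustrative values): loading
 * component const_offset == 6 of a uniform array (say a[1].z with the pitch-4
 * layout described above) emits
 *
 *    vec4_offset = varying_offset + (6 & ~3)             = varying_offset + 4
 *    result      = offset(vec4_result, (6 & 3) * scale)  = component 2 * scale
 *
 * so the pull load fetches the aligned vec4 and the final MOV picks the .z
 * channel out of it.
 */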
445
446 /**
447 * A helper for MOV generation for fixing up broken hardware SEND dependency
448 * handling.
449 */
450 fs_inst *
451 fs_visitor::DEP_RESOLVE_MOV(int grf)
452 {
453 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
454
455 inst->ir = NULL;
456 inst->annotation = "send dependency resolve";
457
458    /* The caller always wants an uncompressed (SIMD8) MOV, both to emit the
459     * minimal extra dependencies and to avoid having to align its regs to 2.
460 */
461 inst->exec_size = 8;
462
463 return inst;
464 }
465
466 bool
467 fs_inst::equals(fs_inst *inst) const
468 {
469 return (opcode == inst->opcode &&
470 dst.equals(inst->dst) &&
471 src[0].equals(inst->src[0]) &&
472 src[1].equals(inst->src[1]) &&
473 src[2].equals(inst->src[2]) &&
474 saturate == inst->saturate &&
475 predicate == inst->predicate &&
476 conditional_mod == inst->conditional_mod &&
477 mlen == inst->mlen &&
478 base_mrf == inst->base_mrf &&
479 target == inst->target &&
480 eot == inst->eot &&
481 header_present == inst->header_present &&
482 shadow_compare == inst->shadow_compare &&
483 exec_size == inst->exec_size &&
484 offset == inst->offset);
485 }
486
487 bool
488 fs_inst::overwrites_reg(const fs_reg &reg) const
489 {
490 return (reg.file == dst.file &&
491 reg.reg == dst.reg &&
492 reg.reg_offset >= dst.reg_offset &&
493 reg.reg_offset < dst.reg_offset + regs_written);
494 }
495
496 bool
497 fs_inst::is_send_from_grf() const
498 {
499 switch (opcode) {
500 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
501 case SHADER_OPCODE_SHADER_TIME_ADD:
502 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
503 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
504 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
505 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
506 case SHADER_OPCODE_UNTYPED_ATOMIC:
507 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
508 case SHADER_OPCODE_URB_WRITE_SIMD8:
509 return true;
510 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
511 return src[1].file == GRF;
512 case FS_OPCODE_FB_WRITE:
513 return src[0].file == GRF;
514 default:
515 if (is_tex())
516 return src[0].file == GRF;
517
518 return false;
519 }
520 }
521
522 bool
523 fs_inst::can_do_source_mods(struct brw_context *brw)
524 {
525 if (brw->gen == 6 && is_math())
526 return false;
527
528 if (is_send_from_grf())
529 return false;
530
531 if (!backend_instruction::can_do_source_mods())
532 return false;
533
534 return true;
535 }
536
537 void
538 fs_reg::init()
539 {
540 memset(this, 0, sizeof(*this));
541 stride = 1;
542 }
543
544 /** Generic unset register constructor. */
545 fs_reg::fs_reg()
546 {
547 init();
548 this->file = BAD_FILE;
549 }
550
551 /** Immediate value constructor. */
552 fs_reg::fs_reg(float f)
553 {
554 init();
555 this->file = IMM;
556 this->type = BRW_REGISTER_TYPE_F;
557 this->fixed_hw_reg.dw1.f = f;
558 this->width = 1;
559 }
560
561 /** Immediate value constructor. */
562 fs_reg::fs_reg(int32_t i)
563 {
564 init();
565 this->file = IMM;
566 this->type = BRW_REGISTER_TYPE_D;
567 this->fixed_hw_reg.dw1.d = i;
568 this->width = 1;
569 }
570
571 /** Immediate value constructor. */
572 fs_reg::fs_reg(uint32_t u)
573 {
574 init();
575 this->file = IMM;
576 this->type = BRW_REGISTER_TYPE_UD;
577 this->fixed_hw_reg.dw1.ud = u;
578 this->width = 1;
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf[4])
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
588 }
589
590 /** Vector float immediate value constructor. */
591 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
592 {
593 init();
594 this->file = IMM;
595 this->type = BRW_REGISTER_TYPE_VF;
596 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
597 (vf1 << 8) |
598 (vf2 << 16) |
599 (vf3 << 24);
600 }
601
602 /** Fixed brw_reg. */
603 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
604 {
605 init();
606 this->file = HW_REG;
607 this->fixed_hw_reg = fixed_hw_reg;
608 this->type = fixed_hw_reg.type;
609 this->width = 1 << fixed_hw_reg.width;
610 }
611
612 bool
613 fs_reg::equals(const fs_reg &r) const
614 {
615 return (file == r.file &&
616 reg == r.reg &&
617 reg_offset == r.reg_offset &&
618 subreg_offset == r.subreg_offset &&
619 type == r.type &&
620 negate == r.negate &&
621 abs == r.abs &&
622 !reladdr && !r.reladdr &&
623 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
624 width == r.width &&
625 stride == r.stride);
626 }
627
628 fs_reg &
629 fs_reg::set_smear(unsigned subreg)
630 {
631 assert(file != HW_REG && file != IMM);
632 subreg_offset = subreg * type_sz(type);
633 stride = 0;
634 return *this;
635 }
636
637 bool
638 fs_reg::is_contiguous() const
639 {
640 return stride == 1;
641 }
642
643 int
644 fs_visitor::type_size(const struct glsl_type *type)
645 {
646 unsigned int size, i;
647
648 switch (type->base_type) {
649 case GLSL_TYPE_UINT:
650 case GLSL_TYPE_INT:
651 case GLSL_TYPE_FLOAT:
652 case GLSL_TYPE_BOOL:
653 return type->components();
654 case GLSL_TYPE_ARRAY:
655 return type_size(type->fields.array) * type->length;
656 case GLSL_TYPE_STRUCT:
657 size = 0;
658 for (i = 0; i < type->length; i++) {
659 size += type_size(type->fields.structure[i].type);
660 }
661 return size;
662 case GLSL_TYPE_SAMPLER:
663 /* Samplers take up no register space, since they're baked in at
664 * link time.
665 */
666 return 0;
667 case GLSL_TYPE_ATOMIC_UINT:
668 return 0;
669 case GLSL_TYPE_IMAGE:
670 case GLSL_TYPE_VOID:
671 case GLSL_TYPE_ERROR:
672 case GLSL_TYPE_INTERFACE:
673 case GLSL_TYPE_DOUBLE:
674 unreachable("not reached");
675 }
676
677 return 0;
678 }
679
680 /**
681 * Create a MOV to read the timestamp register.
682 *
683 * The caller is responsible for emitting the MOV. The return value is
684 * the destination of the MOV, with extra parameters set.
685 */
686 fs_reg
687 fs_visitor::get_timestamp(fs_inst **out_mov)
688 {
689 assert(brw->gen >= 7);
690
691 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
692 BRW_ARF_TIMESTAMP,
693 0),
694 BRW_REGISTER_TYPE_UD));
695
696 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
697
698 fs_inst *mov = MOV(dst, ts);
699    /* We want to read the 3 fields we care about even if the corresponding
700     * channels aren't enabled in the dispatch.
701     */
702 mov->force_writemask_all = true;
703
704 /* The caller wants the low 32 bits of the timestamp. Since it's running
705     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
706 * which is plenty of time for our purposes. It is identical across the
707 * EUs, but since it's tracking GPU core speed it will increment at a
708 * varying rate as render P-states change.
709 *
710 * The caller could also check if render P-states have changed (or anything
711 * else that might disrupt timing) by setting smear to 2 and checking if
712 * that field is != 0.
713 */
714 dst.set_smear(0);
715
716 *out_mov = mov;
717 return dst;
718 }
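/* Back-of-the-envelope check of the rollover claim above (using the ~1.2 GHz
 * figure from the comment): the low 32 bits of the timestamp wrap after about
 *
 *    2^32 cycles / 1.2e9 cycles/sec  ~=  3.6 seconds
 *
 * so intervals measured within a single shader invocation fit well before a
 * wrap.
 */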
719
720 void
721 fs_visitor::emit_shader_time_begin()
722 {
723 current_annotation = "shader time start";
724 fs_inst *mov;
725 shader_start_time = get_timestamp(&mov);
726 emit(mov);
727 }
728
729 void
730 fs_visitor::emit_shader_time_end()
731 {
732 current_annotation = "shader time end";
733
734 enum shader_time_shader_type type, written_type, reset_type;
735 switch (stage) {
736 case MESA_SHADER_VERTEX:
737 type = ST_VS;
738 written_type = ST_VS_WRITTEN;
739 reset_type = ST_VS_RESET;
740 break;
741 case MESA_SHADER_GEOMETRY:
742 type = ST_GS;
743 written_type = ST_GS_WRITTEN;
744 reset_type = ST_GS_RESET;
745 break;
746 case MESA_SHADER_FRAGMENT:
747 if (dispatch_width == 8) {
748 type = ST_FS8;
749 written_type = ST_FS8_WRITTEN;
750 reset_type = ST_FS8_RESET;
751 } else {
752 assert(dispatch_width == 16);
753 type = ST_FS16;
754 written_type = ST_FS16_WRITTEN;
755 reset_type = ST_FS16_RESET;
756 }
757 break;
758 default:
759 unreachable("fs_visitor::emit_shader_time_end missing code");
760 }
761
762 /* Insert our code just before the final SEND with EOT. */
763 exec_node *end = this->instructions.get_tail();
764 assert(end && ((fs_inst *) end)->eot);
765
766 fs_inst *tm_read;
767 fs_reg shader_end_time = get_timestamp(&tm_read);
768 end->insert_before(tm_read);
769
770 /* Check that there weren't any timestamp reset events (assuming these
771 * were the only two timestamp reads that happened).
772 */
773 fs_reg reset = shader_end_time;
774 reset.set_smear(2);
775 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
776 test->conditional_mod = BRW_CONDITIONAL_Z;
777 test->force_writemask_all = true;
778 end->insert_before(test);
779 end->insert_before(IF(BRW_PREDICATE_NORMAL));
780
781 fs_reg start = shader_start_time;
782 start.negate = true;
783 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
784 diff.set_smear(0);
785 fs_inst *add = ADD(diff, start, shader_end_time);
786 add->force_writemask_all = true;
787 end->insert_before(add);
788
789 /* If there were no instructions between the two timestamp gets, the diff
790 * is 2 cycles. Remove that overhead, so I can forget about that when
791 * trying to determine the time taken for single instructions.
792 */
793 add = ADD(diff, diff, fs_reg(-2u));
794 add->force_writemask_all = true;
795 end->insert_before(add);
796
797 end->insert_before(SHADER_TIME_ADD(type, diff));
798 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
799 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
800 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
801 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
802 }
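/* Rough shape of the stream inserted before the final EOT send by the code
 * above (pseudo-assembly, for illustration only):
 *
 *    and.z.f0  null, end_time.2<0>, 1        ; were there any reset events?
 *    (+f0) if
 *      add     diff, -start_time, end_time
 *      add     diff, diff, -2                ; drop the 2-cycle read overhead
 *      SHADER_TIME_ADD  type, diff
 *      SHADER_TIME_ADD  written_type, 1
 *    else
 *      SHADER_TIME_ADD  reset_type, 1
 *    endif
 */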
803
804 fs_inst *
805 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
806 {
807 int shader_time_index =
808 brw_get_shader_time_index(brw, shader_prog, prog, type);
809 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
810
811 fs_reg payload;
812 if (dispatch_width == 8)
813 payload = vgrf(glsl_type::uvec2_type);
814 else
815 payload = vgrf(glsl_type::uint_type);
816
817 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
818 fs_reg(), payload, offset, value);
819 }
820
821 void
822 fs_visitor::vfail(const char *format, va_list va)
823 {
824 char *msg;
825
826 if (failed)
827 return;
828
829 failed = true;
830
831 msg = ralloc_vasprintf(mem_ctx, format, va);
832 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
833
834 this->fail_msg = msg;
835
836 if (debug_enabled) {
837 fprintf(stderr, "%s", msg);
838 }
839 }
840
841 void
842 fs_visitor::fail(const char *format, ...)
843 {
844 va_list va;
845
846 va_start(va, format);
847 vfail(format, va);
848 va_end(va);
849 }
850
851 /**
852 * Mark this program as impossible to compile in SIMD16 mode.
853 *
854 * During the SIMD8 compile (which happens first), we can detect and flag
855 * things that are unsupported in SIMD16 mode, so the compiler can skip
856 * the SIMD16 compile altogether.
857 *
858 * During a SIMD16 compile (if one happens anyway), this just calls fail().
859 */
860 void
861 fs_visitor::no16(const char *format, ...)
862 {
863 va_list va;
864
865 va_start(va, format);
866
867 if (dispatch_width == 16) {
868 vfail(format, va);
869 } else {
870 simd16_unsupported = true;
871
872 if (brw->perf_debug) {
873 if (no16_msg)
874 ralloc_vasprintf_append(&no16_msg, format, va);
875 else
876 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
877 }
878 }
879
880 va_end(va);
881 }
882
883 fs_inst *
884 fs_visitor::emit(enum opcode opcode)
885 {
886 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
887 }
888
889 fs_inst *
890 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
891 {
892 return emit(new(mem_ctx) fs_inst(opcode, dst));
893 }
894
895 fs_inst *
896 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
897 {
898 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
899 }
900
901 fs_inst *
902 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
903 const fs_reg &src1)
904 {
905 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
906 }
907
908 fs_inst *
909 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
910 const fs_reg &src1, const fs_reg &src2)
911 {
912 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
913 }
914
915 fs_inst *
916 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
917 fs_reg src[], int sources)
918 {
919 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
920 }
921
922 /**
923 * Returns true if the instruction has a flag that means it won't
924 * update an entire destination register.
925 *
926 * For example, dead code elimination and live variable analysis want to know
927 * when a write to a variable screens off any preceding values that were in
928 * it.
929 */
930 bool
931 fs_inst::is_partial_write() const
932 {
933 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
934 (this->dst.width * type_sz(this->dst.type)) < 32 ||
935 !this->dst.is_contiguous());
936 }
937
938 int
939 fs_inst::regs_read(int arg) const
940 {
941 if (is_tex() && arg == 0 && src[0].file == GRF) {
942 return mlen;
943 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
944 return mlen;
945 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
946 return mlen;
947 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
948 return mlen;
949 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
950 return mlen;
951 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
952 return mlen;
953 }
954
955 switch (src[arg].file) {
956 case BAD_FILE:
957 case UNIFORM:
958 case IMM:
959 return 1;
960 case GRF:
961 case HW_REG:
962 if (src[arg].stride == 0) {
963 return 1;
964 } else {
965 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
966 return (size + 31) / 32;
967 }
968 case MRF:
969 unreachable("MRF registers are not allowed as sources");
970 default:
971 unreachable("Invalid register file");
972 }
973 }
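/* Illustrative arithmetic for the GRF/HW_REG case above: a SIMD16 float
 * source with stride 1 covers 16 * 1 * 4 = 64 bytes, so regs_read() reports
 * (64 + 31) / 32 = 2 registers, while a stride-0 (replicated scalar) source
 * always counts as a single register.
 */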
974
975 bool
976 fs_inst::reads_flag() const
977 {
978 return predicate;
979 }
980
981 bool
982 fs_inst::writes_flag() const
983 {
984 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
985 opcode != BRW_OPCODE_IF &&
986 opcode != BRW_OPCODE_WHILE)) ||
987 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
988 }
989
990 /**
991 * Returns how many MRFs an FS opcode will write over.
992 *
993 * Note that this is not the 0 or 1 implied writes in an actual gen
994 * instruction -- the FS opcodes often generate MOVs in addition.
995 */
996 int
997 fs_visitor::implied_mrf_writes(fs_inst *inst)
998 {
999 if (inst->mlen == 0)
1000 return 0;
1001
1002 if (inst->base_mrf == -1)
1003 return 0;
1004
1005 switch (inst->opcode) {
1006 case SHADER_OPCODE_RCP:
1007 case SHADER_OPCODE_RSQ:
1008 case SHADER_OPCODE_SQRT:
1009 case SHADER_OPCODE_EXP2:
1010 case SHADER_OPCODE_LOG2:
1011 case SHADER_OPCODE_SIN:
1012 case SHADER_OPCODE_COS:
1013 return 1 * dispatch_width / 8;
1014 case SHADER_OPCODE_POW:
1015 case SHADER_OPCODE_INT_QUOTIENT:
1016 case SHADER_OPCODE_INT_REMAINDER:
1017 return 2 * dispatch_width / 8;
1018 case SHADER_OPCODE_TEX:
1019 case FS_OPCODE_TXB:
1020 case SHADER_OPCODE_TXD:
1021 case SHADER_OPCODE_TXF:
1022 case SHADER_OPCODE_TXF_CMS:
1023 case SHADER_OPCODE_TXF_MCS:
1024 case SHADER_OPCODE_TG4:
1025 case SHADER_OPCODE_TG4_OFFSET:
1026 case SHADER_OPCODE_TXL:
1027 case SHADER_OPCODE_TXS:
1028 case SHADER_OPCODE_LOD:
1029 return 1;
1030 case FS_OPCODE_FB_WRITE:
1031 return 2;
1032 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1033 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1034 return 1;
1035 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1036 return inst->mlen;
1037 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1038 return 2;
1039 case SHADER_OPCODE_UNTYPED_ATOMIC:
1040 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1041 case SHADER_OPCODE_URB_WRITE_SIMD8:
1042 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1043 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1044 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1045 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1046 return 0;
1047 default:
1048 unreachable("not reached");
1049 }
1050 }
1051
1052 fs_reg
1053 fs_visitor::vgrf(const glsl_type *const type)
1054 {
1055 int reg_width = dispatch_width / 8;
1056 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1057 brw_type_for_base_type(type), dispatch_width);
1058 }
1059
1060 fs_reg
1061 fs_visitor::vgrf(int num_components)
1062 {
1063 int reg_width = dispatch_width / 8;
1064 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1065 BRW_REGISTER_TYPE_F, dispatch_width);
1066 }
1067
1068 /** Constructor for a register in the given file, with default float type and width. */
1069 fs_reg::fs_reg(enum register_file file, int reg)
1070 {
1071 init();
1072 this->file = file;
1073 this->reg = reg;
1074 this->type = BRW_REGISTER_TYPE_F;
1075
1076 switch (file) {
1077 case UNIFORM:
1078 this->width = 1;
1079 break;
1080 default:
1081 this->width = 8;
1082 }
1083 }
1084
1085 /** Constructor for a register in the given file, with an explicit type. */
1086 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1087 {
1088 init();
1089 this->file = file;
1090 this->reg = reg;
1091 this->type = type;
1092
1093 switch (file) {
1094 case UNIFORM:
1095 this->width = 1;
1096 break;
1097 default:
1098 this->width = 8;
1099 }
1100 }
1101
1102 /** Constructor for a register in the given file, with explicit type and width. */
1103 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1104 uint8_t width)
1105 {
1106 init();
1107 this->file = file;
1108 this->reg = reg;
1109 this->type = type;
1110 this->width = width;
1111 }
1112
1113 fs_reg *
1114 fs_visitor::variable_storage(ir_variable *var)
1115 {
1116 return (fs_reg *)hash_table_find(this->variable_ht, var);
1117 }
1118
1119 void
1120 import_uniforms_callback(const void *key,
1121 void *data,
1122 void *closure)
1123 {
1124 struct hash_table *dst_ht = (struct hash_table *)closure;
1125 const fs_reg *reg = (const fs_reg *)data;
1126
1127 if (reg->file != UNIFORM)
1128 return;
1129
1130 hash_table_insert(dst_ht, data, key);
1131 }
1132
1133 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1134 * This brings in those uniform definitions
1135 */
1136 void
1137 fs_visitor::import_uniforms(fs_visitor *v)
1138 {
1139 hash_table_call_foreach(v->variable_ht,
1140 import_uniforms_callback,
1141 variable_ht);
1142 this->push_constant_loc = v->push_constant_loc;
1143 this->pull_constant_loc = v->pull_constant_loc;
1144 this->uniforms = v->uniforms;
1145 this->param_size = v->param_size;
1146 }
1147
1148 /* Our support for uniforms is piggy-backed on the struct
1149 * gl_fragment_program, because that's where the values actually
1150 * get stored, rather than in some global gl_shader_program uniform
1151 * store.
1152 */
1153 void
1154 fs_visitor::setup_uniform_values(ir_variable *ir)
1155 {
1156 int namelen = strlen(ir->name);
1157
1158 /* The data for our (non-builtin) uniforms is stored in a series of
1159 * gl_uniform_driver_storage structs for each subcomponent that
1160 * glGetUniformLocation() could name. We know it's been set up in the same
1161  * order we'd walk the type, so walk the list of storage and find anything
1162  * whose name is ours, or whose name starts with ours followed by '.' or '['.
1163 */
1164 unsigned params_before = uniforms;
1165 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1166 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1167
1168 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1169 (storage->name[namelen] != 0 &&
1170 storage->name[namelen] != '.' &&
1171 storage->name[namelen] != '[')) {
1172 continue;
1173 }
1174
1175 unsigned slots = storage->type->component_slots();
1176 if (storage->array_elements)
1177 slots *= storage->array_elements;
1178
1179 for (unsigned i = 0; i < slots; i++) {
1180 stage_prog_data->param[uniforms++] = &storage->storage[i];
1181 }
1182 }
1183
1184 /* Make sure we actually initialized the right amount of stuff here. */
1185 assert(params_before + ir->type->component_slots() == uniforms);
1186 (void)params_before;
1187 }
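/* Example of the name matching above (hypothetical uniform names, for
 * illustration only): with ir->name == "material", storage entries named
 * "material", "material.diffuse" and "material[3]" all match, since the
 * character after the prefix is '\0', '.' or '[', while "materialX" fails
 * the test and its slots are skipped.
 */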
1188
1189
1190 /* Our support for builtin uniforms is even scarier than non-builtin.
1191 * It sits on top of the PROG_STATE_VAR parameters that are
1192 * automatically updated from GL context state.
1193 */
1194 void
1195 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1196 {
1197 const ir_state_slot *const slots = ir->get_state_slots();
1198 assert(slots != NULL);
1199
1200 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1201 /* This state reference has already been setup by ir_to_mesa, but we'll
1202 * get the same index back here.
1203 */
1204 int index = _mesa_add_state_reference(this->prog->Parameters,
1205 (gl_state_index *)slots[i].tokens);
1206
1207 /* Add each of the unique swizzles of the element as a parameter.
1208 * This'll end up matching the expected layout of the
1209 * array/matrix/structure we're trying to fill in.
1210 */
1211 int last_swiz = -1;
1212 for (unsigned int j = 0; j < 4; j++) {
1213 int swiz = GET_SWZ(slots[i].swizzle, j);
1214 if (swiz == last_swiz)
1215 break;
1216 last_swiz = swiz;
1217
1218 stage_prog_data->param[uniforms++] =
1219 &prog->Parameters->ParameterValues[index][swiz];
1220 }
1221 }
1222 }
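/* Worked example of the swizzle loop above (illustrative): a state slot
 * swizzled XYZW adds four parameters, one per unique component, while a slot
 * swizzled XXXX stops after the first iteration because the second component
 * repeats the previous one, so only a single parameter is added.
 */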
1223
1224 fs_reg *
1225 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1226 bool origin_upper_left)
1227 {
1228 assert(stage == MESA_SHADER_FRAGMENT);
1229 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1230 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1231 fs_reg wpos = *reg;
1232 bool flip = !origin_upper_left ^ key->render_to_fbo;
1233
1234 /* gl_FragCoord.x */
1235 if (pixel_center_integer) {
1236 emit(MOV(wpos, this->pixel_x));
1237 } else {
1238 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1239 }
1240 wpos = offset(wpos, 1);
1241
1242 /* gl_FragCoord.y */
1243 if (!flip && pixel_center_integer) {
1244 emit(MOV(wpos, this->pixel_y));
1245 } else {
1246 fs_reg pixel_y = this->pixel_y;
1247 float offset = (pixel_center_integer ? 0.0 : 0.5);
1248
1249 if (flip) {
1250 pixel_y.negate = true;
1251 offset += key->drawable_height - 1.0;
1252 }
1253
1254 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1255 }
1256 wpos = offset(wpos, 1);
1257
1258 /* gl_FragCoord.z */
1259 if (brw->gen >= 6) {
1260 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1261 } else {
1262 emit(FS_OPCODE_LINTERP, wpos,
1263 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1264 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1265 interp_reg(VARYING_SLOT_POS, 2));
1266 }
1267 wpos = offset(wpos, 1);
1268
1269 /* gl_FragCoord.w: Already set up in emit_interpolation */
1270 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1271
1272 return reg;
1273 }
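/* A worked instance of the y-flip above (illustrative numbers): with the GLSL
 * defaults (origin_upper_left == false, pixel_center_integer == false) and
 * rendering to the window system (render_to_fbo == false), flip is
 * !false ^ false == true, so we emit
 *
 *    gl_FragCoord.y = (drawable_height - 1.0 + 0.5) - pixel_y
 *
 * e.g. for a 600-pixel-tall drawable the hardware's top row (pixel_y == 0)
 * becomes 599.5, the expected lower-left-origin value.
 */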
1274
1275 fs_inst *
1276 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1277 glsl_interp_qualifier interpolation_mode,
1278 bool is_centroid, bool is_sample)
1279 {
1280 brw_wm_barycentric_interp_mode barycoord_mode;
1281 if (brw->gen >= 6) {
1282 if (is_centroid) {
1283 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1284 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1285 else
1286 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1287 } else if (is_sample) {
1288 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1289 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1290 else
1291 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1292 } else {
1293 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1294 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1295 else
1296 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1297 }
1298 } else {
1299 /* On Ironlake and below, there is only one interpolation mode.
1300 * Centroid interpolation doesn't mean anything on this hardware --
1301 * there is no multisampling.
1302 */
1303 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1304 }
1305 return emit(FS_OPCODE_LINTERP, attr,
1306 this->delta_x[barycoord_mode],
1307 this->delta_y[barycoord_mode], interp);
1308 }
1309
1310 void
1311 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1312 const glsl_type *type,
1313 glsl_interp_qualifier interpolation_mode,
1314 int location, bool mod_centroid,
1315 bool mod_sample)
1316 {
1317 attr.type = brw_type_for_base_type(type->get_scalar_type());
1318
1319 assert(stage == MESA_SHADER_FRAGMENT);
1320 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1321 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1322
1323 unsigned int array_elements;
1324
1325 if (type->is_array()) {
1326 array_elements = type->length;
1327 if (array_elements == 0) {
1328 fail("dereferenced array '%s' has length 0\n", name);
1329 }
1330 type = type->fields.array;
1331 } else {
1332 array_elements = 1;
1333 }
1334
1335 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1336 bool is_gl_Color =
1337 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1338 if (key->flat_shade && is_gl_Color) {
1339 interpolation_mode = INTERP_QUALIFIER_FLAT;
1340 } else {
1341 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1342 }
1343 }
1344
1345 for (unsigned int i = 0; i < array_elements; i++) {
1346 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1347 if (prog_data->urb_setup[location] == -1) {
1348 /* If there's no incoming setup data for this slot, don't
1349 * emit interpolation for it.
1350 */
1351 attr = offset(attr, type->vector_elements);
1352 location++;
1353 continue;
1354 }
1355
1356 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1357 /* Constant interpolation (flat shading) case. The SF has
1358 * handed us defined values in only the constant offset
1359 * field of the setup reg.
1360 */
1361 for (unsigned int k = 0; k < type->vector_elements; k++) {
1362 struct brw_reg interp = interp_reg(location, k);
1363 interp = suboffset(interp, 3);
1364 interp.type = attr.type;
1365 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1366 attr = offset(attr, 1);
1367 }
1368 } else {
1369 /* Smooth/noperspective interpolation case. */
1370 for (unsigned int k = 0; k < type->vector_elements; k++) {
1371 struct brw_reg interp = interp_reg(location, k);
1372 if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1373 /* Get the pixel/sample mask into f0 so that we know
1374 * which pixels are lit. Then, for each channel that is
1375 * unlit, replace the centroid data with non-centroid
1376 * data.
1377 */
1378 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1379
1380 fs_inst *inst;
1381 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1382 false, false);
1383 inst->predicate = BRW_PREDICATE_NORMAL;
1384 inst->predicate_inverse = true;
1385 if (brw->has_pln)
1386 inst->no_dd_clear = true;
1387
1388 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1389 mod_centroid && !key->persample_shading,
1390 mod_sample || key->persample_shading);
1391 inst->predicate = BRW_PREDICATE_NORMAL;
1392 inst->predicate_inverse = false;
1393 if (brw->has_pln)
1394 inst->no_dd_check = true;
1395
1396 } else {
1397 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1398 mod_centroid && !key->persample_shading,
1399 mod_sample || key->persample_shading);
1400 }
1401 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1402 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1403 }
1404 attr = offset(attr, 1);
1405 }
1406
1407 }
1408 location++;
1409 }
1410 }
1411 }
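/* Shape of the unlit-centroid workaround emitted above (pseudo-code, for
 * illustration only): for each centroid-interpolated component we emit
 *
 *    MOV_DISPATCH_TO_FLAGS                 ; pixel/sample mask -> f0
 *    (-f0) LINTERP attr, pixel deltas      ; unlit channels: non-centroid data
 *    (+f0) LINTERP attr, centroid deltas   ; lit channels: centroid data
 *
 * so channels that weren't lit never consume the undefined centroid
 * barycentrics.
 */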
1412
1413 fs_reg *
1414 fs_visitor::emit_frontfacing_interpolation()
1415 {
1416 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1417
1418 if (brw->gen >= 6) {
1419 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1420 * a boolean result from this (~0/true or 0/false).
1421 *
1422 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1423 * this task in only one instruction:
1424 * - a negation source modifier will flip the bit; and
1425 * - a W -> D type conversion will sign extend the bit into the high
1426 * word of the destination.
1427 *
1428 * An ASR 15 fills the low word of the destination.
1429 */
1430 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1431 g0.negate = true;
1432
1433 emit(ASR(*reg, g0, fs_reg(15)));
1434 } else {
1435 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1436 * a boolean result from this (1/true or 0/false).
1437 *
1438 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1439 * the negation source modifier to flip it. Unfortunately the SHR
1440 * instruction only operates on UD (or D with an abs source modifier)
1441 * sources without negation.
1442 *
1443 * Instead, use ASR (which will give ~0/true or 0/false).
1444 */
1445 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1446 g1_6.negate = true;
1447
1448 emit(ASR(*reg, g1_6, fs_reg(31)));
1449 }
1450
1451 return reg;
1452 }
1453
1454 void
1455 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1456 {
1457 assert(stage == MESA_SHADER_FRAGMENT);
1458 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1459 assert(dst.type == BRW_REGISTER_TYPE_F);
1460
1461 if (key->compute_pos_offset) {
1462 /* Convert int_sample_pos to floating point */
1463 emit(MOV(dst, int_sample_pos));
1464 /* Scale to the range [0, 1] */
1465 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1466 }
1467 else {
1468 /* From ARB_sample_shading specification:
1469 * "When rendering to a non-multisample buffer, or if multisample
1470 * rasterization is disabled, gl_SamplePosition will always be
1471        *  (0.5, 0.5)."
1472 */
1473 emit(MOV(dst, fs_reg(0.5f)));
1474 }
1475 }
1476
1477 fs_reg *
1478 fs_visitor::emit_samplepos_setup()
1479 {
1480 assert(brw->gen >= 6);
1481
1482 this->current_annotation = "compute sample position";
1483 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1484 fs_reg pos = *reg;
1485 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1486 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1487
1488 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1489 * mode will be enabled.
1490 *
1491 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1492 * R31.1:0 Position Offset X/Y for Slot[3:0]
1493 * R31.3:2 Position Offset X/Y for Slot[7:4]
1494 * .....
1495 *
1496 * The X, Y sample positions come in as bytes in thread payload. So, read
1497 * the positions using vstride=16, width=8, hstride=2.
1498 */
1499 struct brw_reg sample_pos_reg =
1500 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1501 BRW_REGISTER_TYPE_B), 16, 8, 2);
1502
1503 if (dispatch_width == 8) {
1504 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1505 } else {
1506 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1507 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1508 ->force_sechalf = true;
1509 }
1510 /* Compute gl_SamplePosition.x */
1511 compute_sample_position(pos, int_sample_x);
1512 pos = offset(pos, 1);
1513 if (dispatch_width == 8) {
1514 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1515 } else {
1516 emit(MOV(half(int_sample_y, 0),
1517 fs_reg(suboffset(sample_pos_reg, 1))));
1518 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1519 ->force_sechalf = true;
1520 }
1521 /* Compute gl_SamplePosition.y */
1522 compute_sample_position(pos, int_sample_y);
1523 return reg;
1524 }
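/* Illustration of the register region used above (assuming the payload layout
 * quoted from the PRM): with a byte type and <vstride=16; width=8, hstride=2>,
 * the MOV for int_sample_x reads bytes 0, 2, 4, ... 14 of the sample position
 * register (the per-slot X offsets), and starting the same region one byte
 * later picks up bytes 1, 3, 5, ... 15 for int_sample_y (the Y offsets).
 */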
1525
1526 fs_reg *
1527 fs_visitor::emit_sampleid_setup()
1528 {
1529 assert(stage == MESA_SHADER_FRAGMENT);
1530 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1531 assert(brw->gen >= 6);
1532
1533 this->current_annotation = "compute sample id";
1534 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1535
1536 if (key->compute_sample_id) {
1537 fs_reg t1 = vgrf(glsl_type::int_type);
1538 fs_reg t2 = vgrf(glsl_type::int_type);
1539 t2.type = BRW_REGISTER_TYPE_UW;
1540
1541 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1542 * 8x multisampling, subspan 0 will represent sample N (where N
1543 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1544 * 7. We can find the value of N by looking at R0.0 bits 7:6
1545 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1546 * (since samples are always delivered in pairs). That is, we
1547 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1548 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1549 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1550 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1551 * populating a temporary variable with the sequence (0, 1, 2, 3),
1552 * and then reading from it using vstride=1, width=4, hstride=0.
1553 * These computations hold good for 4x multisampling as well.
1554 *
1555 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1556 * the first four slots are sample 0 of subspan 0; the next four
1557 * are sample 1 of subspan 0; the third group is sample 0 of
1558 * subspan 1, and finally sample 1 of subspan 1.
1559 */
1560 fs_inst *inst;
1561 inst = emit(BRW_OPCODE_AND, t1,
1562 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1563 fs_reg(0xc0));
1564 inst->force_writemask_all = true;
1565 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1566 inst->force_writemask_all = true;
1567 /* This works for both SIMD8 and SIMD16 */
1568 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1569 inst->force_writemask_all = true;
1570 /* This special instruction takes care of setting vstride=1,
1571 * width=4, hstride=0 of t2 during an ADD instruction.
1572 */
1573 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1574 } else {
1575 /* As per GL_ARB_sample_shading specification:
1576 * "When rendering to a non-multisample buffer, or if multisample
1577 * rasterization is disabled, gl_SampleID will always be zero."
1578 */
1579 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1580 }
1581
1582 return reg;
1583 }
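/* Worked example of the SSPI math above (illustrative value): if R0.0 bits
 * 7:6 read back as 0b10, then (R0.0 & 0xc0) >> 5 == 0x80 >> 5 == 4, i.e. the
 * starting sample N is 4.  Adding the SIMD8 sequence (0, 0, 0, 0, 1, 1, 1, 1)
 * then yields gl_SampleID values of 4, 4, 4, 4, 5, 5, 5, 5 across the two
 * subspans.
 */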
1584
1585 void
1586 fs_visitor::resolve_source_modifiers(fs_reg *src)
1587 {
1588 if (!src->abs && !src->negate)
1589 return;
1590
1591 fs_reg temp = retype(vgrf(1), src->type);
1592 emit(MOV(temp, *src));
1593 *src = temp;
1594 }
1595
1596 fs_reg
1597 fs_visitor::fix_math_operand(fs_reg src)
1598 {
1599 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1600 * might be able to do better by doing execsize = 1 math and then
1601 * expanding that result out, but we would need to be careful with
1602 * masking.
1603 *
1604 * The hardware ignores source modifiers (negate and abs) on math
1605 * instructions, so we also move to a temp to set those up.
1606 */
1607 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1608 !src.abs && !src.negate)
1609 return src;
1610
1611 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1612 * operands to math
1613 */
1614 if (brw->gen >= 7 && src.file != IMM)
1615 return src;
1616
1617 fs_reg expanded = vgrf(glsl_type::float_type);
1618 expanded.type = src.type;
1619 emit(BRW_OPCODE_MOV, expanded, src);
1620 return expanded;
1621 }
1622
1623 fs_inst *
1624 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1625 {
1626 switch (opcode) {
1627 case SHADER_OPCODE_RCP:
1628 case SHADER_OPCODE_RSQ:
1629 case SHADER_OPCODE_SQRT:
1630 case SHADER_OPCODE_EXP2:
1631 case SHADER_OPCODE_LOG2:
1632 case SHADER_OPCODE_SIN:
1633 case SHADER_OPCODE_COS:
1634 break;
1635 default:
1636 unreachable("not reached: bad math opcode");
1637 }
1638
1639 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1640 * might be able to do better by doing execsize = 1 math and then
1641 * expanding that result out, but we would need to be careful with
1642 * masking.
1643 *
1644 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1645 * instructions, so we also move to a temp to set those up.
1646 */
1647 if (brw->gen == 6 || brw->gen == 7)
1648 src = fix_math_operand(src);
1649
1650 fs_inst *inst = emit(opcode, dst, src);
1651
1652 if (brw->gen < 6) {
1653 inst->base_mrf = 2;
1654 inst->mlen = dispatch_width / 8;
1655 }
1656
1657 return inst;
1658 }
1659
1660 fs_inst *
1661 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1662 {
1663 int base_mrf = 2;
1664 fs_inst *inst;
1665
1666 if (brw->gen >= 8) {
1667 inst = emit(opcode, dst, src0, src1);
1668 } else if (brw->gen >= 6) {
1669 src0 = fix_math_operand(src0);
1670 src1 = fix_math_operand(src1);
1671
1672 inst = emit(opcode, dst, src0, src1);
1673 } else {
1674 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1675 * "Message Payload":
1676 *
1677 * "Operand0[7]. For the INT DIV functions, this operand is the
1678 * denominator."
1679 * ...
1680 * "Operand1[7]. For the INT DIV functions, this operand is the
1681 * numerator."
1682 */
1683 bool is_int_div = opcode != SHADER_OPCODE_POW;
1684 fs_reg &op0 = is_int_div ? src1 : src0;
1685 fs_reg &op1 = is_int_div ? src0 : src1;
1686
1687 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1688 inst = emit(opcode, dst, op0, reg_null_f);
1689
1690 inst->base_mrf = base_mrf;
1691 inst->mlen = 2 * dispatch_width / 8;
1692 }
1693 return inst;
1694 }
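/* Concrete reading of the operand swap above (illustrative): for
 * SHADER_OPCODE_INT_QUOTIENT the callers pass src0 as the numerator and src1
 * as the denominator, so is_int_div selects op0 = src1 (the denominator) and
 * op1 = src0 (the numerator, MOVed into MRF base_mrf + 1), matching the
 * Operand0/Operand1 assignment quoted from the PRM above.
 */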
1695
1696 void
1697 fs_visitor::assign_curb_setup()
1698 {
1699 if (dispatch_width == 8) {
1700 prog_data->dispatch_grf_start_reg = payload.num_regs;
1701 } else {
1702 assert(stage == MESA_SHADER_FRAGMENT);
1703 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1704 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1705 }
1706
1707 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1708
1709 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1710 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1711 for (unsigned int i = 0; i < inst->sources; i++) {
1712 if (inst->src[i].file == UNIFORM) {
1713 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1714 int constant_nr;
1715 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1716 constant_nr = push_constant_loc[uniform_nr];
1717 } else {
1718 /* Section 5.11 of the OpenGL 4.1 spec says:
1719 * "Out-of-bounds reads return undefined values, which include
1720 * values from other variables of the active program or zero."
1721 * Just return the first push constant.
1722 */
1723 constant_nr = 0;
1724 }
1725
1726 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1727 constant_nr / 8,
1728 constant_nr % 8);
1729
1730 inst->src[i].file = HW_REG;
1731 inst->src[i].fixed_hw_reg = byte_offset(
1732 retype(brw_reg, inst->src[i].type),
1733 inst->src[i].subreg_offset);
1734 }
1735 }
1736 }
1737 }
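/* Worked example of the push-constant mapping above (illustrative values): if
 * push_constant_loc[uniform_nr] == 11 and payload.num_regs == 2, the UNIFORM
 * source is rewritten to the fixed register g3.3 (11 / 8 == 1 register past
 * the payload, channel 11 % 8 == 3), plus whatever subreg_offset the original
 * access carried.
 */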
1738
1739 void
1740 fs_visitor::calculate_urb_setup()
1741 {
1742 assert(stage == MESA_SHADER_FRAGMENT);
1743 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1744 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1745
1746 memset(prog_data->urb_setup, -1,
1747 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1748
1749 int urb_next = 0;
1750 /* Figure out where each of the incoming setup attributes lands. */
1751 if (brw->gen >= 6) {
1752 if (_mesa_bitcount_64(prog->InputsRead &
1753 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1754 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1755 * first 16 varying inputs, so we can put them wherever we want.
1756 * Just put them in order.
1757 *
1758 * This is useful because it means that (a) inputs not used by the
1759 * fragment shader won't take up valuable register space, and (b) we
1760 * won't have to recompile the fragment shader if it gets paired with
1761 * a different vertex (or geometry) shader.
1762 */
1763 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1764 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1765 BITFIELD64_BIT(i)) {
1766 prog_data->urb_setup[i] = urb_next++;
1767 }
1768 }
1769 } else {
1770 /* We have enough input varyings that the SF/SBE pipeline stage can't
1771 * arbitrarily rearrange them to suit our whim; we have to put them
1772 * in an order that matches the output of the previous pipeline stage
1773 * (geometry or vertex shader).
1774 */
1775 struct brw_vue_map prev_stage_vue_map;
1776 brw_compute_vue_map(brw, &prev_stage_vue_map,
1777 key->input_slots_valid);
1778 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1779 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1780 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1781 slot++) {
1782 int varying = prev_stage_vue_map.slot_to_varying[slot];
1783 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1784 * unused.
1785 */
1786 if (varying != BRW_VARYING_SLOT_COUNT &&
1787 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1788 BITFIELD64_BIT(varying))) {
1789 prog_data->urb_setup[varying] = slot - first_slot;
1790 }
1791 }
1792 urb_next = prev_stage_vue_map.num_slots - first_slot;
1793 }
1794 } else {
1795 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1796 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1797 /* Point size is packed into the header, not as a general attribute */
1798 if (i == VARYING_SLOT_PSIZ)
1799 continue;
1800
1801 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1802 /* The back color slot is skipped when the front color is
1803 * also written to. In addition, some slots can be
1804 * written in the vertex shader and not read in the
1805 * fragment shader. So the register number must always be
1806 * incremented, mapped or not.
1807 */
1808 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1809 prog_data->urb_setup[i] = urb_next;
1810 urb_next++;
1811 }
1812 }
1813
1814 /*
1815     * It's an FS-only attribute, and we did interpolation for this attribute
1816 * in SF thread. So, count it here, too.
1817 *
1818 * See compile_sf_prog() for more info.
1819 */
1820 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1821 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1822 }
1823
1824 prog_data->num_varying_inputs = urb_next;
1825 }
1826
1827 void
1828 fs_visitor::assign_urb_setup()
1829 {
1830 assert(stage == MESA_SHADER_FRAGMENT);
1831 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1832
1833 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1834
1835 /* Offset all the urb_setup[] index by the actual position of the
1836 * setup regs, now that the location of the constants has been chosen.
1837 */
1838 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1839 if (inst->opcode == FS_OPCODE_LINTERP) {
1840 assert(inst->src[2].file == HW_REG);
1841 inst->src[2].fixed_hw_reg.nr += urb_start;
1842 }
1843
1844 if (inst->opcode == FS_OPCODE_CINTERP) {
1845 assert(inst->src[0].file == HW_REG);
1846 inst->src[0].fixed_hw_reg.nr += urb_start;
1847 }
1848 }
1849
1850 /* Each attribute is 4 setup channels, each of which is half a reg. */
1851 this->first_non_payload_grf =
1852 urb_start + prog_data->num_varying_inputs * 2;
1853 }
1854
1855 void
1856 fs_visitor::assign_vs_urb_setup()
1857 {
1858 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1859 int grf, count, slot, channel, attr;
1860
1861 assert(stage == MESA_SHADER_VERTEX);
1862 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1863 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1864 count++;
1865
1866 /* Each attribute is 4 regs. */
1867 this->first_non_payload_grf =
1868 payload.num_regs + prog_data->curb_read_length + count * 4;
1869
1870 unsigned vue_entries =
1871 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1872
1873 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1874 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1875
1876 assert(vs_prog_data->base.urb_read_length <= 15);
1877
1878 /* Rewrite all ATTR file references to the hw grf that they land in. */
1879 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1880 for (int i = 0; i < inst->sources; i++) {
1881 if (inst->src[i].file == ATTR) {
1882
1883 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1884 slot = count - 1;
1885 } else {
1886              /* Attributes come in as a contiguous block, ordered by their
1887 * gl_vert_attrib value. That means we can compute the slot
1888 * number for an attribute by masking out the enabled
1889 * attributes before it and counting the bits.
1890 */
1891 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1892 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1893 BITFIELD64_MASK(attr));
1894 }
1895
1896 channel = inst->src[i].reg_offset & 3;
1897
1898 grf = payload.num_regs +
1899 prog_data->curb_read_length +
1900 slot * 4 + channel;
1901
1902 inst->src[i].file = HW_REG;
1903 inst->src[i].fixed_hw_reg =
1904 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1905 }
1906 }
1907 }
1908 }
1909
1910 /**
1911 * Split large virtual GRFs into separate components if we can.
1912 *
1913  * This largely duplicates what brw_fs_vector_splitting does, but
1914  * that pass is really conservative, because it's afraid of doing
1915 * splitting that doesn't result in real progress after the rest of
1916 * the optimization phases, which would cause infinite looping in
1917 * optimization. We can do it once here, safely. This also has the
1918 * opportunity to split interpolated values, or maybe even uniforms,
1919 * which we don't have at the IR level.
1920 *
1921 * We want to split, because virtual GRFs are what we register
1922 * allocate and spill (due to contiguousness requirements for some
1923 * instructions), and they're what we naturally generate in the
1924 * codegen process, but most virtual GRFs don't actually need to be
1925 * contiguous sets of GRFs. If we split, we'll end up with reduced
1926 * live intervals and better dead code elimination and coalescing.
1927 */
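/* Illustrative sketch only (hypothetical register, not part of the pass):
 * assume a vgrf0 of size 4.  After the marking loops below, its split
 * points might look like:
 *
 *    slot:          0      1      2      3
 *    split_points:  -      true   false  true
 *
 * (slot 2 was cleared because some instruction accessed slots 1..2 as one
 * contiguous region).  The rewrite then carves vgrf0 into three registers,
 * {0}, {1,2} and {3}, each of which can be allocated and spilled on its own.
 */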
1928 void
1929 fs_visitor::split_virtual_grfs()
1930 {
1931 int num_vars = this->alloc.count;
1932
1933 /* Count the total number of registers */
1934 int reg_count = 0;
1935 int vgrf_to_reg[num_vars];
1936 for (int i = 0; i < num_vars; i++) {
1937 vgrf_to_reg[i] = reg_count;
1938 reg_count += alloc.sizes[i];
1939 }
1940
1941 /* An array of "split points". For each register slot, this indicates
1942 * if this slot can be separated from the previous slot. Every time an
1943 * instruction uses multiple elements of a register (as a source or
1944 * destination), we mark the used slots as inseparable. Then we go
1945 * through and split the registers into the smallest pieces we can.
1946 */
1947 bool split_points[reg_count];
1948 memset(split_points, 0, sizeof(split_points));
1949
1950 /* Mark all used registers as fully splittable */
1951 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1952 if (inst->dst.file == GRF) {
1953 int reg = vgrf_to_reg[inst->dst.reg];
1954 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1955 split_points[reg + j] = true;
1956 }
1957
1958 for (int i = 0; i < inst->sources; i++) {
1959 if (inst->src[i].file == GRF) {
1960 int reg = vgrf_to_reg[inst->src[i].reg];
1961 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1962 split_points[reg + j] = true;
1963 }
1964 }
1965 }
1966
1967 if (brw->has_pln &&
1968 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1969 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1970 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1971 * Gen6, that was the only supported interpolation mode, and since Gen6,
1972 * delta_x and delta_y are in fixed hardware registers.
1973 */
1974 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1975 split_points[vgrf_to_reg[vgrf] + 1] = false;
1976 }
1977
1978 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1979 if (inst->dst.file == GRF) {
1980 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1981 for (int j = 1; j < inst->regs_written; j++)
1982 split_points[reg + j] = false;
1983 }
1984 for (int i = 0; i < inst->sources; i++) {
1985 if (inst->src[i].file == GRF) {
1986 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1987 for (int j = 1; j < inst->regs_read(i); j++)
1988 split_points[reg + j] = false;
1989 }
1990 }
1991 }
1992
1993 int new_virtual_grf[reg_count];
1994 int new_reg_offset[reg_count];
1995
1996 int reg = 0;
1997 for (int i = 0; i < num_vars; i++) {
1998       /* As a quick sanity check, the first slot should never be a split point. */
1999 assert(split_points[reg] == false);
2000
2001 /* j = 0 case */
2002 new_reg_offset[reg] = 0;
2003 reg++;
2004 int offset = 1;
2005
2006 /* j > 0 case */
2007 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2008          /* If this is a split point, reset the offset to 0 and allocate a
2009           * new virtual GRF for the previous `offset` registers.
2010           */
2011 if (split_points[reg]) {
2012 assert(offset <= MAX_VGRF_SIZE);
2013 int grf = alloc.allocate(offset);
2014 for (int k = reg - offset; k < reg; k++)
2015 new_virtual_grf[k] = grf;
2016 offset = 0;
2017 }
2018 new_reg_offset[reg] = offset;
2019 offset++;
2020 reg++;
2021 }
2022
2023 /* The last one gets the original register number */
2024 assert(offset <= MAX_VGRF_SIZE);
2025 alloc.sizes[i] = offset;
2026 for (int k = reg - offset; k < reg; k++)
2027 new_virtual_grf[k] = i;
2028 }
2029 assert(reg == reg_count);
2030
2031 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2032 if (inst->dst.file == GRF) {
2033 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2034 inst->dst.reg = new_virtual_grf[reg];
2035 inst->dst.reg_offset = new_reg_offset[reg];
2036 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2037 }
2038 for (int i = 0; i < inst->sources; i++) {
2039 if (inst->src[i].file == GRF) {
2040 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2041 inst->src[i].reg = new_virtual_grf[reg];
2042 inst->src[i].reg_offset = new_reg_offset[reg];
2043 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2044 }
2045 }
2046 }
2047 invalidate_live_intervals();
2048 }
2049
2050 /**
2051 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2052 *
2053 * During code generation, we create tons of temporary variables, many of
2054 * which get immediately killed and are never used again. Yet, in later
2055 * optimization and analysis passes, such as compute_live_intervals, we need
2056 * to loop over all the virtual GRFs. Compacting them can save a lot of
2057 * overhead.
2058 */
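/* Illustrative sketch only (hypothetical counts): with alloc.count == 5 and
 * vgrf1/vgrf3 never referenced, the pass below ends up with
 *
 *    remap_table = { 0, -1, 1, -1, 2 }
 *
 * i.e. vgrf0 stays 0, vgrf2 becomes 1, vgrf4 becomes 2, and alloc.count
 * drops to 3.
 */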
2059 bool
2060 fs_visitor::compact_virtual_grfs()
2061 {
2062 bool progress = false;
2063 int remap_table[this->alloc.count];
2064 memset(remap_table, -1, sizeof(remap_table));
2065
2066 /* Mark which virtual GRFs are used. */
2067 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2068 if (inst->dst.file == GRF)
2069 remap_table[inst->dst.reg] = 0;
2070
2071 for (int i = 0; i < inst->sources; i++) {
2072 if (inst->src[i].file == GRF)
2073 remap_table[inst->src[i].reg] = 0;
2074 }
2075 }
2076
2077 /* Compact the GRF arrays. */
2078 int new_index = 0;
2079 for (unsigned i = 0; i < this->alloc.count; i++) {
2080 if (remap_table[i] == -1) {
2081 /* We just found an unused register. This means that we are
2082 * actually going to compact something.
2083 */
2084 progress = true;
2085 } else {
2086 remap_table[i] = new_index;
2087 alloc.sizes[new_index] = alloc.sizes[i];
2088 invalidate_live_intervals();
2089 ++new_index;
2090 }
2091 }
2092
2093 this->alloc.count = new_index;
2094
2095 /* Patch all the instructions to use the newly renumbered registers */
2096 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2097 if (inst->dst.file == GRF)
2098 inst->dst.reg = remap_table[inst->dst.reg];
2099
2100 for (int i = 0; i < inst->sources; i++) {
2101 if (inst->src[i].file == GRF)
2102 inst->src[i].reg = remap_table[inst->src[i].reg];
2103 }
2104 }
2105
2106 /* Patch all the references to delta_x/delta_y, since they're used in
2107 * register allocation. If they're unused, switch them to BAD_FILE so
2108 * we don't think some random VGRF is delta_x/delta_y.
2109 */
2110 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2111 if (delta_x[i].file == GRF) {
2112 if (remap_table[delta_x[i].reg] != -1) {
2113 delta_x[i].reg = remap_table[delta_x[i].reg];
2114 } else {
2115 delta_x[i].file = BAD_FILE;
2116 }
2117 }
2118 }
2119 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2120 if (delta_y[i].file == GRF) {
2121 if (remap_table[delta_y[i].reg] != -1) {
2122 delta_y[i].reg = remap_table[delta_y[i].reg];
2123 } else {
2124 delta_y[i].file = BAD_FILE;
2125 }
2126 }
2127 }
2128
2129 return progress;
2130 }
2131
2132 /*
2133 * Implements array access of uniforms by inserting a
2134 * PULL_CONSTANT_LOAD instruction.
2135 *
2136 * Unlike temporary GRF array access (where we don't support it due to
2137 * the difficulty of doing relative addressing on instruction
2138 * destinations), we could potentially do array access of uniforms
2139 * that were loaded in GRF space as push constants. In real-world
2140 * usage we've seen, though, the arrays being used are always larger
2141 * than we could load as push constants, so just always move all
2142 * uniform array access out to a pull constant buffer.
2143 */
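/* Illustrative sketch only (hypothetical GLSL, not taken from a real app):
 *
 *    uniform vec4 colors[8];
 *    ... = colors[i];         // variable index, so src.reladdr != NULL
 *
 * For such an access the pass below copies all 32 components of "colors"
 * into pull_param[] and records their slots in pull_constant_loc[], while
 * constant-indexed uniform accesses are left alone.
 */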
2144 void
2145 fs_visitor::move_uniform_array_access_to_pull_constants()
2146 {
2147 if (dispatch_width != 8)
2148 return;
2149
2150 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2151 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2152
2153 /* Walk through and find array access of uniforms. Put a copy of that
2154 * uniform in the pull constant buffer.
2155 *
2156 * Note that we don't move constant-indexed accesses to arrays. No
2157 * testing has been done of the performance impact of this choice.
2158 */
2159 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2160 for (int i = 0 ; i < inst->sources; i++) {
2161 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2162 continue;
2163
2164 int uniform = inst->src[i].reg;
2165
2166 /* If this array isn't already present in the pull constant buffer,
2167 * add it.
2168 */
2169 if (pull_constant_loc[uniform] == -1) {
2170 const gl_constant_value **values = &stage_prog_data->param[uniform];
2171
2172 assert(param_size[uniform]);
2173
2174 for (int j = 0; j < param_size[uniform]; j++) {
2175 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2176
2177 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2178 values[j];
2179 }
2180 }
2181 }
2182 }
2183 }
2184
2185 /**
2186 * Assign UNIFORM file registers to either push constants or pull constants.
2187 *
2188 * We allow a fragment shader to have more than the specified minimum
2189 * maximum number of fragment shader uniform components (64). If
2190  * there are too many of these, they'd fill up all of the register space.
2191 * So, this will push some of them out to the pull constant buffer and
2192 * update the program to load them.
2193 */
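/* Illustrative sketch only (hypothetical counts): with 150 live uniform
 * components that haven't already been pulled, and max_push_components of
 * 128, the loop below assigns push_constant_loc[i] = 0..127 to the first
 * 128 of them and demotes the remaining 22 to the pull constant buffer
 * (push_constant_loc[i] = -1, pull_constant_loc[i] = next pull slot).
 */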
2194 void
2195 fs_visitor::assign_constant_locations()
2196 {
2197 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2198 if (dispatch_width != 8)
2199 return;
2200
2201 /* Find which UNIFORM registers are still in use. */
2202 bool is_live[uniforms];
2203 for (unsigned int i = 0; i < uniforms; i++) {
2204 is_live[i] = false;
2205 }
2206
2207 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2208 for (int i = 0; i < inst->sources; i++) {
2209 if (inst->src[i].file != UNIFORM)
2210 continue;
2211
2212 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2213 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2214 is_live[constant_nr] = true;
2215 }
2216 }
2217
2218 /* Only allow 16 registers (128 uniform components) as push constants.
2219 *
2220 * Just demote the end of the list. We could probably do better
2221 * here, demoting things that are rarely used in the program first.
2222 *
2223 * If changing this value, note the limitation about total_regs in
2224 * brw_curbe.c.
2225 */
2226 unsigned int max_push_components = 16 * 8;
2227 unsigned int num_push_constants = 0;
2228
2229 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2230
2231 for (unsigned int i = 0; i < uniforms; i++) {
2232 if (!is_live[i] || pull_constant_loc[i] != -1) {
2233 /* This UNIFORM register is either dead, or has already been demoted
2234 * to a pull const. Mark it as no longer living in the param[] array.
2235 */
2236 push_constant_loc[i] = -1;
2237 continue;
2238 }
2239
2240 if (num_push_constants < max_push_components) {
2241       /* Retain as a push constant.  Record the location in the param[]
2242 * array.
2243 */
2244 push_constant_loc[i] = num_push_constants++;
2245 } else {
2246 /* Demote to a pull constant. */
2247 push_constant_loc[i] = -1;
2248
2249 int pull_index = stage_prog_data->nr_pull_params++;
2250 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2251 pull_constant_loc[i] = pull_index;
2252 }
2253 }
2254
2255 stage_prog_data->nr_params = num_push_constants;
2256
2257 /* Up until now, the param[] array has been indexed by reg + reg_offset
2258 * of UNIFORM registers. Condense it to only contain the uniforms we
2259 * chose to upload as push constants.
2260 */
2261 for (unsigned int i = 0; i < uniforms; i++) {
2262 int remapped = push_constant_loc[i];
2263
2264 if (remapped == -1)
2265 continue;
2266
2267 assert(remapped <= (int)i);
2268 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2269 }
2270 }
2271
2272 /**
2273 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2274 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2275 */
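/* Illustrative sketch only (hypothetical index): for pull_index == 6, the
 * non-reladdr path below loads from byte offset (6 * 4) & ~15 == 16 (the
 * vec4 containing the value) and then set_smear(6 & 3), i.e. component 2,
 * selects the dword the rewritten source actually reads.
 */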
2276 void
2277 fs_visitor::demote_pull_constants()
2278 {
2279 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2280 for (int i = 0; i < inst->sources; i++) {
2281 if (inst->src[i].file != UNIFORM)
2282 continue;
2283
2284 int pull_index;
2285 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2286 if (location >= uniforms) /* Out of bounds access */
2287 pull_index = -1;
2288 else
2289 pull_index = pull_constant_loc[location];
2290
2291 if (pull_index == -1)
2292 continue;
2293
2294          /* Set up the annotation tracking for newly generated instructions. */
2295 base_ir = inst->ir;
2296 current_annotation = inst->annotation;
2297
2298 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2299 fs_reg dst = vgrf(glsl_type::float_type);
2300
2301 /* Generate a pull load into dst. */
2302 if (inst->src[i].reladdr) {
2303 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2304 surf_index,
2305 *inst->src[i].reladdr,
2306 pull_index);
2307 inst->insert_before(block, &list);
2308 inst->src[i].reladdr = NULL;
2309 } else {
2310 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2311 fs_inst *pull =
2312 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2313 dst, surf_index, offset);
2314 inst->insert_before(block, pull);
2315 inst->src[i].set_smear(pull_index & 3);
2316 }
2317
2318 /* Rewrite the instruction to use the temporary VGRF. */
2319 inst->src[i].file = GRF;
2320 inst->src[i].reg = dst.reg;
2321 inst->src[i].reg_offset = 0;
2322 inst->src[i].width = dispatch_width;
2323 }
2324 }
2325 invalidate_live_intervals();
2326 }
2327
2328 bool
2329 fs_visitor::opt_algebraic()
2330 {
2331 bool progress = false;
2332
2333 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2334 switch (inst->opcode) {
2335 case BRW_OPCODE_MOV:
2336 if (inst->src[0].file != IMM)
2337 break;
2338
2339 if (inst->saturate) {
2340 if (inst->dst.type != inst->src[0].type)
2341 assert(!"unimplemented: saturate mixed types");
2342
2343 if (brw_saturate_immediate(inst->dst.type,
2344 &inst->src[0].fixed_hw_reg)) {
2345 inst->saturate = false;
2346 progress = true;
2347 }
2348 }
2349 break;
2350
2351 case BRW_OPCODE_MUL:
2352 if (inst->src[1].file != IMM)
2353 continue;
2354
2355 /* a * 1.0 = a */
2356 if (inst->src[1].is_one()) {
2357 inst->opcode = BRW_OPCODE_MOV;
2358 inst->src[1] = reg_undef;
2359 progress = true;
2360 break;
2361 }
2362
2363 /* a * -1.0 = -a */
2364 if (inst->src[1].is_negative_one()) {
2365 inst->opcode = BRW_OPCODE_MOV;
2366 inst->src[0].negate = !inst->src[0].negate;
2367 inst->src[1] = reg_undef;
2368 progress = true;
2369 break;
2370 }
2371
2372 /* a * 0.0 = 0.0 */
2373 if (inst->src[1].is_zero()) {
2374 inst->opcode = BRW_OPCODE_MOV;
2375 inst->src[0] = inst->src[1];
2376 inst->src[1] = reg_undef;
2377 progress = true;
2378 break;
2379 }
2380
2381 if (inst->src[0].file == IMM) {
2382 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2383 inst->opcode = BRW_OPCODE_MOV;
2384 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2385 inst->src[1] = reg_undef;
2386 progress = true;
2387 break;
2388 }
2389 break;
2390 case BRW_OPCODE_ADD:
2391 if (inst->src[1].file != IMM)
2392 continue;
2393
2394 /* a + 0.0 = a */
2395 if (inst->src[1].is_zero()) {
2396 inst->opcode = BRW_OPCODE_MOV;
2397 inst->src[1] = reg_undef;
2398 progress = true;
2399 break;
2400 }
2401
2402 if (inst->src[0].file == IMM) {
2403 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2404 inst->opcode = BRW_OPCODE_MOV;
2405 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2406 inst->src[1] = reg_undef;
2407 progress = true;
2408 break;
2409 }
2410 break;
2411 case BRW_OPCODE_OR:
2412 if (inst->src[0].equals(inst->src[1])) {
2413 inst->opcode = BRW_OPCODE_MOV;
2414 inst->src[1] = reg_undef;
2415 progress = true;
2416 break;
2417 }
2418 break;
2419 case BRW_OPCODE_LRP:
2420 if (inst->src[1].equals(inst->src[2])) {
2421 inst->opcode = BRW_OPCODE_MOV;
2422 inst->src[0] = inst->src[1];
2423 inst->src[1] = reg_undef;
2424 inst->src[2] = reg_undef;
2425 progress = true;
2426 break;
2427 }
2428 break;
2429 case BRW_OPCODE_CMP:
2430 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2431 inst->src[0].abs &&
2432 inst->src[0].negate &&
2433 inst->src[1].is_zero()) {
2434 inst->src[0].abs = false;
2435 inst->src[0].negate = false;
2436 inst->conditional_mod = BRW_CONDITIONAL_Z;
2437 progress = true;
2438 break;
2439 }
2440 break;
2441 case BRW_OPCODE_SEL:
2442 if (inst->src[0].equals(inst->src[1])) {
2443 inst->opcode = BRW_OPCODE_MOV;
2444 inst->src[1] = reg_undef;
2445 inst->predicate = BRW_PREDICATE_NONE;
2446 inst->predicate_inverse = false;
2447 progress = true;
2448 } else if (inst->saturate && inst->src[1].file == IMM) {
2449 switch (inst->conditional_mod) {
2450 case BRW_CONDITIONAL_LE:
2451 case BRW_CONDITIONAL_L:
2452 switch (inst->src[1].type) {
2453 case BRW_REGISTER_TYPE_F:
2454 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2455 inst->opcode = BRW_OPCODE_MOV;
2456 inst->src[1] = reg_undef;
2457 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2458 progress = true;
2459 }
2460 break;
2461 default:
2462 break;
2463 }
2464 break;
2465 case BRW_CONDITIONAL_GE:
2466 case BRW_CONDITIONAL_G:
2467 switch (inst->src[1].type) {
2468 case BRW_REGISTER_TYPE_F:
2469 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2470 inst->opcode = BRW_OPCODE_MOV;
2471 inst->src[1] = reg_undef;
2472 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2473 progress = true;
2474 }
2475 break;
2476 default:
2477 break;
2478 }
2479 default:
2480 break;
2481 }
2482 }
2483 break;
2484 case BRW_OPCODE_MAD:
2485 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2486 inst->opcode = BRW_OPCODE_MOV;
2487 inst->src[1] = reg_undef;
2488 inst->src[2] = reg_undef;
2489 progress = true;
2490 } else if (inst->src[0].is_zero()) {
2491 inst->opcode = BRW_OPCODE_MUL;
2492 inst->src[0] = inst->src[2];
2493          inst->src[2] = reg_undef;
              progress = true;
2494 } else if (inst->src[1].is_one()) {
2495 inst->opcode = BRW_OPCODE_ADD;
2496 inst->src[1] = inst->src[2];
2497 inst->src[2] = reg_undef;
2498 progress = true;
2499 } else if (inst->src[2].is_one()) {
2500 inst->opcode = BRW_OPCODE_ADD;
2501 inst->src[2] = reg_undef;
2502 progress = true;
2503 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2504 inst->opcode = BRW_OPCODE_ADD;
2505 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2506 inst->src[2] = reg_undef;
2507 progress = true;
2508 }
2509 break;
2510 case SHADER_OPCODE_RCP: {
2511 fs_inst *prev = (fs_inst *)inst->prev;
2512 if (prev->opcode == SHADER_OPCODE_SQRT) {
2513 if (inst->src[0].equals(prev->dst)) {
2514 inst->opcode = SHADER_OPCODE_RSQ;
2515 inst->src[0] = prev->src[0];
2516 progress = true;
2517 }
2518 }
2519 break;
2520 }
2521 default:
2522 break;
2523 }
2524 }
2525
2526 return progress;
2527 }
2528
2529 bool
2530 fs_visitor::opt_register_renaming()
2531 {
2532 bool progress = false;
2533 int depth = 0;
2534
2535 int remap[alloc.count];
2536 memset(remap, -1, sizeof(int) * alloc.count);
2537
2538 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2539 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2540 depth++;
2541 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2542 inst->opcode == BRW_OPCODE_WHILE) {
2543 depth--;
2544 }
2545
2546 /* Rewrite instruction sources. */
2547 for (int i = 0; i < inst->sources; i++) {
2548 if (inst->src[i].file == GRF &&
2549 remap[inst->src[i].reg] != -1 &&
2550 remap[inst->src[i].reg] != inst->src[i].reg) {
2551 inst->src[i].reg = remap[inst->src[i].reg];
2552 progress = true;
2553 }
2554 }
2555
2556 const int dst = inst->dst.reg;
2557
2558 if (depth == 0 &&
2559 inst->dst.file == GRF &&
2560 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2561 !inst->is_partial_write()) {
2562 if (remap[dst] == -1) {
2563 remap[dst] = dst;
2564 } else {
2565 remap[dst] = alloc.allocate(inst->dst.width / 8);
2566 inst->dst.reg = remap[dst];
2567 progress = true;
2568 }
2569 } else if (inst->dst.file == GRF &&
2570 remap[dst] != -1 &&
2571 remap[dst] != dst) {
2572 inst->dst.reg = remap[dst];
2573 progress = true;
2574 }
2575 }
2576
2577 if (progress) {
2578 invalidate_live_intervals();
2579
2580 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2581 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2582 delta_x[i].reg = remap[delta_x[i].reg];
2583 }
2584 }
2585 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2586 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2587 delta_y[i].reg = remap[delta_y[i].reg];
2588 }
2589 }
2590 }
2591
2592 return progress;
2593 }
2594
2595 /**
2596 * Remove redundant or useless discard jumps.
2597 *
2598 * For example, we can eliminate jumps in the following sequence:
2599 *
2600 * discard-jump (redundant with the next jump)
2601 * discard-jump (useless; jumps to the next instruction)
2602 * placeholder-halt
2603 */
2604 bool
2605 fs_visitor::opt_redundant_discard_jumps()
2606 {
2607 bool progress = false;
2608
2609 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2610
2611 fs_inst *placeholder_halt = NULL;
2612 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2613 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2614 placeholder_halt = inst;
2615 break;
2616 }
2617 }
2618
2619 if (!placeholder_halt)
2620 return false;
2621
2622    /* Delete any discard jumps immediately before the placeholder halt. */
2623 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2624 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2625 prev = (fs_inst *) placeholder_halt->prev) {
2626 prev->remove(last_bblock);
2627 progress = true;
2628 }
2629
2630 if (progress)
2631 invalidate_live_intervals();
2632
2633 return progress;
2634 }
2635
2636 bool
2637 fs_visitor::compute_to_mrf()
2638 {
2639 bool progress = false;
2640 int next_ip = 0;
2641
2642 /* No MRFs on Gen >= 7. */
2643 if (brw->gen >= 7)
2644 return false;
2645
2646 calculate_live_intervals();
2647
2648 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2649 int ip = next_ip;
2650 next_ip++;
2651
2652 if (inst->opcode != BRW_OPCODE_MOV ||
2653 inst->is_partial_write() ||
2654 inst->dst.file != MRF || inst->src[0].file != GRF ||
2655 inst->dst.type != inst->src[0].type ||
2656 inst->src[0].abs || inst->src[0].negate ||
2657 !inst->src[0].is_contiguous() ||
2658 inst->src[0].subreg_offset)
2659 continue;
2660
2661 /* Work out which hardware MRF registers are written by this
2662 * instruction.
2663 */
2664 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2665 int mrf_high;
2666 if (inst->dst.reg & BRW_MRF_COMPR4) {
2667 mrf_high = mrf_low + 4;
2668 } else if (inst->exec_size == 16) {
2669 mrf_high = mrf_low + 1;
2670 } else {
2671 mrf_high = mrf_low;
2672 }
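      /* Illustrative example of the values computed above: a COMPR4 write to
       * m2 touches m2 and m6 (mrf_low = 2, mrf_high = 6), while a SIMD16
       * write to m2 touches m2..m3 (mrf_high = 3).
       */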
2673
2674 /* Can't compute-to-MRF this GRF if someone else was going to
2675 * read it later.
2676 */
2677 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2678 continue;
2679
2680       /* Found a move of a GRF to an MRF.  Let's see if we can rewrite the
2681        * instruction that produced this GRF to write directly into the MRF.
2682        */
2683 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2684 if (scan_inst->dst.file == GRF &&
2685 scan_inst->dst.reg == inst->src[0].reg) {
2686          /* Found the last write of the register we want to turn
2687           * into a compute-to-MRF.
2688           */
2689
2690 /* If this one instruction didn't populate all the
2691 * channels, bail. We might be able to rewrite everything
2692 * that writes that reg, but it would require smarter
2693 * tracking to delay the rewriting until complete success.
2694 */
2695 if (scan_inst->is_partial_write())
2696 break;
2697
2698          /* Instructions writing more than one register would require us
2699           * to understand coalescing out more than one MOV at a time.
2700           */
2701 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2702 break;
2703
2704 /* SEND instructions can't have MRF as a destination. */
2705 if (scan_inst->mlen)
2706 break;
2707
2708 if (brw->gen == 6) {
2709 /* gen6 math instructions must have the destination be
2710 * GRF, so no compute-to-MRF for them.
2711 */
2712 if (scan_inst->is_math()) {
2713 break;
2714 }
2715 }
2716
2717 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2718 /* Found the creator of our MRF's source value. */
2719 scan_inst->dst.file = MRF;
2720 scan_inst->dst.reg = inst->dst.reg;
2721 scan_inst->saturate |= inst->saturate;
2722 inst->remove(block);
2723 progress = true;
2724 }
2725 break;
2726 }
2727
2728          /* We don't handle control flow here.  Most computation of
2729           * values that end up in MRFs happens shortly before the MRF
2730           * write anyway.
2731           */
2732 if (block->start() == scan_inst)
2733 break;
2734
2735 /* You can't read from an MRF, so if someone else reads our
2736 * MRF's source GRF that we wanted to rewrite, that stops us.
2737 */
2738 bool interfered = false;
2739 for (int i = 0; i < scan_inst->sources; i++) {
2740 if (scan_inst->src[i].file == GRF &&
2741 scan_inst->src[i].reg == inst->src[0].reg &&
2742 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2743 interfered = true;
2744 }
2745 }
2746 if (interfered)
2747 break;
2748
2749 if (scan_inst->dst.file == MRF) {
2750 /* If somebody else writes our MRF here, we can't
2751 * compute-to-MRF before that.
2752 */
2753 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2754 int scan_mrf_high;
2755
2756 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2757 scan_mrf_high = scan_mrf_low + 4;
2758 } else if (scan_inst->exec_size == 16) {
2759 scan_mrf_high = scan_mrf_low + 1;
2760 } else {
2761 scan_mrf_high = scan_mrf_low;
2762 }
2763
2764 if (mrf_low == scan_mrf_low ||
2765 mrf_low == scan_mrf_high ||
2766 mrf_high == scan_mrf_low ||
2767 mrf_high == scan_mrf_high) {
2768 break;
2769 }
2770 }
2771
2772 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2773 /* Found a SEND instruction, which means that there are
2774 * live values in MRFs from base_mrf to base_mrf +
2775 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2776 * above it.
2777 */
2778 if (mrf_low >= scan_inst->base_mrf &&
2779 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2780 break;
2781 }
2782 if (mrf_high >= scan_inst->base_mrf &&
2783 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2784 break;
2785 }
2786 }
2787 }
2788 }
2789
2790 if (progress)
2791 invalidate_live_intervals();
2792
2793 return progress;
2794 }
2795
2796 /**
2797 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2798 * instructions to FS_OPCODE_REP_FB_WRITE.
2799 */
2800 void
2801 fs_visitor::emit_repclear_shader()
2802 {
2803 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2804 int base_mrf = 1;
2805 int color_mrf = base_mrf + 2;
2806
2807 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2808 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2809 mov->force_writemask_all = true;
2810
2811 fs_inst *write;
2812 if (key->nr_color_regions == 1) {
2813 write = emit(FS_OPCODE_REP_FB_WRITE);
2814 write->saturate = key->clamp_fragment_color;
2815 write->base_mrf = color_mrf;
2816 write->target = 0;
2817 write->header_present = false;
2818 write->mlen = 1;
2819 } else {
2820 assume(key->nr_color_regions > 0);
2821 for (int i = 0; i < key->nr_color_regions; ++i) {
2822 write = emit(FS_OPCODE_REP_FB_WRITE);
2823 write->saturate = key->clamp_fragment_color;
2824 write->base_mrf = base_mrf;
2825 write->target = i;
2826 write->header_present = true;
2827 write->mlen = 3;
2828 }
2829 }
2830 write->eot = true;
2831
2832 calculate_cfg();
2833
2834 assign_constant_locations();
2835 assign_curb_setup();
2836
2837 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2838 assert(mov->src[0].file == HW_REG);
2839 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2840 }
2841
2842 /**
2843 * Walks through basic blocks, looking for repeated MRF writes and
2844 * removing the later ones.
2845 */
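/* Illustrative sketch only (hypothetical IR, not from a real shader):
 *
 *    mov  m2, vgrf5
 *    add  vgrf7, vgrf6, vgrf6
 *    mov  m2, vgrf5      <-- identical to the still-recorded write of m2
 *
 * The second MOV is removed, because last_mrf_move[2] still points at an
 * equal instruction and neither m2 nor vgrf5 was written in between.
 */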
2846 bool
2847 fs_visitor::remove_duplicate_mrf_writes()
2848 {
2849 fs_inst *last_mrf_move[16];
2850 bool progress = false;
2851
2852 /* Need to update the MRF tracking for compressed instructions. */
2853 if (dispatch_width == 16)
2854 return false;
2855
2856 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2857
2858 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2859 if (inst->is_control_flow()) {
2860 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2861 }
2862
2863 if (inst->opcode == BRW_OPCODE_MOV &&
2864 inst->dst.file == MRF) {
2865 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2866 if (prev_inst && inst->equals(prev_inst)) {
2867 inst->remove(block);
2868 progress = true;
2869 continue;
2870 }
2871 }
2872
2873 /* Clear out the last-write records for MRFs that were overwritten. */
2874 if (inst->dst.file == MRF) {
2875 last_mrf_move[inst->dst.reg] = NULL;
2876 }
2877
2878 if (inst->mlen > 0 && inst->base_mrf != -1) {
2879 /* Found a SEND instruction, which will include two or fewer
2880 * implied MRF writes. We could do better here.
2881 */
2882 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2883 last_mrf_move[inst->base_mrf + i] = NULL;
2884 }
2885 }
2886
2887 /* Clear out any MRF move records whose sources got overwritten. */
2888 if (inst->dst.file == GRF) {
2889 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2890 if (last_mrf_move[i] &&
2891 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2892 last_mrf_move[i] = NULL;
2893 }
2894 }
2895 }
2896
2897 if (inst->opcode == BRW_OPCODE_MOV &&
2898 inst->dst.file == MRF &&
2899 inst->src[0].file == GRF &&
2900 !inst->is_partial_write()) {
2901 last_mrf_move[inst->dst.reg] = inst;
2902 }
2903 }
2904
2905 if (progress)
2906 invalidate_live_intervals();
2907
2908 return progress;
2909 }
2910
2911 static void
2912 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2913 {
2914 /* Clear the flag for registers that actually got read (as expected). */
2915 for (int i = 0; i < inst->sources; i++) {
2916 int grf;
2917 if (inst->src[i].file == GRF) {
2918 grf = inst->src[i].reg;
2919 } else if (inst->src[i].file == HW_REG &&
2920 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2921 grf = inst->src[i].fixed_hw_reg.nr;
2922 } else {
2923 continue;
2924 }
2925
2926 if (grf >= first_grf &&
2927 grf < first_grf + grf_len) {
2928 deps[grf - first_grf] = false;
2929 if (inst->exec_size == 16)
2930 deps[grf - first_grf + 1] = false;
2931 }
2932 }
2933 }
2934
2935 /**
2936 * Implements this workaround for the original 965:
2937 *
2938 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2939 * check for post destination dependencies on this instruction, software
2940 * must ensure that there is no destination hazard for the case of ‘write
2941 * followed by a posted write’ shown in the following example.
2942 *
2943 * 1. mov r3 0
2944 * 2. send r3.xy <rest of send instruction>
2945 * 3. mov r2 r3
2946 *
2947 * Due to no post-destination dependency check on the ‘send’, the above
2948 * code sequence could have two instructions (1 and 2) in flight at the
2949  *    same time that both consider ‘r3’ as the target of their final writes."
2950 */
2951 void
2952 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2953 fs_inst *inst)
2954 {
2955 int write_len = inst->regs_written;
2956 int first_write_grf = inst->dst.reg;
2957 bool needs_dep[BRW_MAX_MRF];
2958 assert(write_len < (int)sizeof(needs_dep) - 1);
2959
2960 memset(needs_dep, false, sizeof(needs_dep));
2961 memset(needs_dep, true, write_len);
2962
2963 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2964
2965 /* Walk backwards looking for writes to registers we're writing which
2966 * aren't read since being written. If we hit the start of the program,
2967 * we assume that there are no outstanding dependencies on entry to the
2968 * program.
2969 */
2970 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2971 /* If we hit control flow, assume that there *are* outstanding
2972 * dependencies, and force their cleanup before our instruction.
2973 */
2974 if (block->start() == scan_inst) {
2975 for (int i = 0; i < write_len; i++) {
2976 if (needs_dep[i]) {
2977 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2978 }
2979 }
2980 return;
2981 }
2982
2983 /* We insert our reads as late as possible on the assumption that any
2984 * instruction but a MOV that might have left us an outstanding
2985 * dependency has more latency than a MOV.
2986 */
2987 if (scan_inst->dst.file == GRF) {
2988 for (int i = 0; i < scan_inst->regs_written; i++) {
2989 int reg = scan_inst->dst.reg + i;
2990
2991 if (reg >= first_write_grf &&
2992 reg < first_write_grf + write_len &&
2993 needs_dep[reg - first_write_grf]) {
2994 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2995 needs_dep[reg - first_write_grf] = false;
2996 if (scan_inst->exec_size == 16)
2997 needs_dep[reg - first_write_grf + 1] = false;
2998 }
2999 }
3000 }
3001
3002 /* Clear the flag for registers that actually got read (as expected). */
3003 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3004
3005 /* Continue the loop only if we haven't resolved all the dependencies */
3006 int i;
3007 for (i = 0; i < write_len; i++) {
3008 if (needs_dep[i])
3009 break;
3010 }
3011 if (i == write_len)
3012 return;
3013 }
3014 }
3015
3016 /**
3017 * Implements this workaround for the original 965:
3018 *
3019 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3020 * used as a destination register until after it has been sourced by an
3021  *    instruction with a different destination register."
3022 */
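/* Illustrative sketch only (hypothetical registers): e.g.
 *
 *    1. send r3.xy <rest of send instruction>
 *    2. mov  r3 r4
 *
 * needs a dependency-resolving MOV that sources r3 inserted before
 * instruction 2, which is what the forward walk below arranges.
 */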
3023 void
3024 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3025 {
3026 int write_len = inst->regs_written;
3027 int first_write_grf = inst->dst.reg;
3028 bool needs_dep[BRW_MAX_MRF];
3029 assert(write_len < (int)sizeof(needs_dep) - 1);
3030
3031 memset(needs_dep, false, sizeof(needs_dep));
3032 memset(needs_dep, true, write_len);
3033 /* Walk forwards looking for writes to registers we're writing which aren't
3034 * read before being written.
3035 */
3036 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3037 /* If we hit control flow, force resolve all remaining dependencies. */
3038 if (block->end() == scan_inst) {
3039 for (int i = 0; i < write_len; i++) {
3040 if (needs_dep[i])
3041 scan_inst->insert_before(block,
3042 DEP_RESOLVE_MOV(first_write_grf + i));
3043 }
3044 return;
3045 }
3046
3047 /* Clear the flag for registers that actually got read (as expected). */
3048 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3049
3050 /* We insert our reads as late as possible since they're reading the
3051 * result of a SEND, which has massive latency.
3052 */
3053 if (scan_inst->dst.file == GRF &&
3054 scan_inst->dst.reg >= first_write_grf &&
3055 scan_inst->dst.reg < first_write_grf + write_len &&
3056 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3057 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3058 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3059 }
3060
3061 /* Continue the loop only if we haven't resolved all the dependencies */
3062 int i;
3063 for (i = 0; i < write_len; i++) {
3064 if (needs_dep[i])
3065 break;
3066 }
3067 if (i == write_len)
3068 return;
3069 }
3070 }
3071
3072 void
3073 fs_visitor::insert_gen4_send_dependency_workarounds()
3074 {
3075 if (brw->gen != 4 || brw->is_g4x)
3076 return;
3077
3078 bool progress = false;
3079
3080 /* Note that we're done with register allocation, so GRF fs_regs always
3081 * have a .reg_offset of 0.
3082 */
3083
3084 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3085 if (inst->mlen != 0 && inst->dst.file == GRF) {
3086 insert_gen4_pre_send_dependency_workarounds(block, inst);
3087 insert_gen4_post_send_dependency_workarounds(block, inst);
3088 progress = true;
3089 }
3090 }
3091
3092 if (progress)
3093 invalidate_live_intervals();
3094 }
3095
3096 /**
3097 * Turns the generic expression-style uniform pull constant load instruction
3098 * into a hardware-specific series of instructions for loading a pull
3099 * constant.
3100 *
3101 * The expression style allows the CSE pass before this to optimize out
3102 * repeated loads from the same offset, and gives the pre-register-allocation
3103 * scheduling full flexibility, while the conversion to native instructions
3104 * allows the post-register-allocation scheduler the best information
3105 * possible.
3106 *
3107 * Note that execution masking for setting up pull constant loads is special:
3108 * the channels that need to be written are unrelated to the current execution
3109 * mask, since a later instruction will use one of the result channels as a
3110 * source operand for all 8 or 16 of its channels.
3111 */
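/* Illustrative sketch only (hypothetical offsets): on Gen7+, a load whose
 * src[1] is the immediate byte offset 32 becomes dword offset 8; a
 * SET_SIMD4X2_OFFSET writes that into a fresh payload GRF and the opcode is
 * switched to the _GEN7 variant that sends this payload.  Before Gen7, the
 * same load instead claims MRF 14 with an mlen of 1.
 */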
3112 void
3113 fs_visitor::lower_uniform_pull_constant_loads()
3114 {
3115 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3116 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3117 continue;
3118
3119 if (brw->gen >= 7) {
3120 /* The offset arg before was a vec4-aligned byte offset. We need to
3121 * turn it into a dword offset.
3122 */
3123 fs_reg const_offset_reg = inst->src[1];
3124 assert(const_offset_reg.file == IMM &&
3125 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3126 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3127 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3128
3129 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3130 * Reserve space for the register.
3131 */
3132 if (brw->gen >= 9) {
3133 payload.reg_offset++;
3134 alloc.sizes[payload.reg] = 2;
3135 }
3136
3137 /* This is actually going to be a MOV, but since only the first dword
3138 * is accessed, we have a special opcode to do just that one. Note
3139 * that this needs to be an operation that will be considered a def
3140 * by live variable analysis, or register allocation will explode.
3141 */
3142 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3143 8, payload, const_offset_reg);
3144 setup->force_writemask_all = true;
3145
3146 setup->ir = inst->ir;
3147 setup->annotation = inst->annotation;
3148 inst->insert_before(block, setup);
3149
3150 /* Similarly, this will only populate the first 4 channels of the
3151 * result register (since we only use smear values from 0-3), but we
3152 * don't tell the optimizer.
3153 */
3154 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3155 inst->src[1] = payload;
3156
3157 invalidate_live_intervals();
3158 } else {
3159 /* Before register allocation, we didn't tell the scheduler about the
3160 * MRF we use. We know it's safe to use this MRF because nothing
3161 * else does except for register spill/unspill, which generates and
3162 * uses its MRF within a single IR instruction.
3163 */
3164 inst->base_mrf = 14;
3165 inst->mlen = 1;
3166 }
3167 }
3168 }
3169
3170 bool
3171 fs_visitor::lower_load_payload()
3172 {
3173 bool progress = false;
3174
3175 int vgrf_to_reg[alloc.count];
3176 int reg_count = 0;
3177 for (unsigned i = 0; i < alloc.count; ++i) {
3178 vgrf_to_reg[i] = reg_count;
3179 reg_count += alloc.sizes[i];
3180 }
3181
3182 struct {
3183 bool written:1; /* Whether this register has ever been written */
3184 bool force_writemask_all:1;
3185 bool force_sechalf:1;
3186 } metadata[reg_count];
3187 memset(metadata, 0, sizeof(metadata));
3188
3189 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3190 if (inst->dst.file == GRF) {
3191 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3192 bool force_sechalf = inst->force_sechalf &&
3193 !inst->force_writemask_all;
3194 bool toggle_sechalf = inst->dst.width == 16 &&
3195 type_sz(inst->dst.type) == 4 &&
3196 !inst->force_writemask_all;
3197 for (int i = 0; i < inst->regs_written; ++i) {
3198 metadata[dst_reg + i].written = true;
3199 metadata[dst_reg + i].force_sechalf = force_sechalf;
3200 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3201 force_sechalf = (toggle_sechalf != force_sechalf);
3202 }
3203 }
3204
3205 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3206 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3207 fs_reg dst = inst->dst;
3208
3209 for (int i = 0; i < inst->sources; i++) {
3210 dst.width = inst->src[i].effective_width;
3211 dst.type = inst->src[i].type;
3212
3213 if (inst->src[i].file == BAD_FILE) {
3214 /* Do nothing but otherwise increment as normal */
3215 } else if (dst.file == MRF &&
3216 dst.width == 8 &&
3217 brw->has_compr4 &&
3218 i + 4 < inst->sources &&
3219 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3220 fs_reg compr4_dst = dst;
3221 compr4_dst.reg += BRW_MRF_COMPR4;
3222 compr4_dst.width = 16;
3223 fs_reg compr4_src = inst->src[i];
3224 compr4_src.width = 16;
3225 fs_inst *mov = MOV(compr4_dst, compr4_src);
3226 mov->force_writemask_all = true;
3227 inst->insert_before(block, mov);
3228 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3229 inst->src[i + 4].file = BAD_FILE;
3230 } else {
3231 fs_inst *mov = MOV(dst, inst->src[i]);
3232 if (inst->src[i].file == GRF) {
3233 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3234 inst->src[i].reg_offset;
3235 mov->force_sechalf = metadata[src_reg].force_sechalf;
3236 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3237 } else {
3238 /* We don't have any useful metadata for immediates or
3239 * uniforms. Assume that any of the channels of the
3240 * destination may be used.
3241 */
3242 assert(inst->src[i].file == IMM ||
3243 inst->src[i].file == UNIFORM);
3244 mov->force_writemask_all = true;
3245 }
3246
3247 if (dst.file == GRF) {
3248 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3249 const bool force_writemask = mov->force_writemask_all;
3250 metadata[dst_reg].force_writemask_all = force_writemask;
3251 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3252 if (dst.width * type_sz(dst.type) > 32) {
3253 assert(!mov->force_sechalf);
3254 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3255 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3256 }
3257 }
3258
3259 inst->insert_before(block, mov);
3260 }
3261
3262 dst = offset(dst, 1);
3263 }
3264
3265 inst->remove(block);
3266 progress = true;
3267 }
3268 }
3269
3270 if (progress)
3271 invalidate_live_intervals();
3272
3273 return progress;
3274 }
3275
3276 void
3277 fs_visitor::dump_instructions()
3278 {
3279 dump_instructions(NULL);
3280 }
3281
3282 void
3283 fs_visitor::dump_instructions(const char *name)
3284 {
3285 FILE *file = stderr;
3286 if (name && geteuid() != 0) {
3287 file = fopen(name, "w");
3288 if (!file)
3289 file = stderr;
3290 }
3291
3292 if (cfg) {
3293 calculate_register_pressure();
3294 int ip = 0, max_pressure = 0;
3295 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3296 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3297 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3298 dump_instruction(inst, file);
3299 ip++;
3300 }
3301 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3302 } else {
3303 int ip = 0;
3304 foreach_in_list(backend_instruction, inst, &instructions) {
3305 fprintf(file, "%4d: ", ip++);
3306 dump_instruction(inst, file);
3307 }
3308 }
3309
3310 if (file != stderr) {
3311 fclose(file);
3312 }
3313 }
3314
3315 void
3316 fs_visitor::dump_instruction(backend_instruction *be_inst)
3317 {
3318 dump_instruction(be_inst, stderr);
3319 }
3320
3321 void
3322 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3323 {
3324 fs_inst *inst = (fs_inst *)be_inst;
3325
3326 if (inst->predicate) {
3327 fprintf(file, "(%cf0.%d) ",
3328 inst->predicate_inverse ? '-' : '+',
3329 inst->flag_subreg);
3330 }
3331
3332 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3333 if (inst->saturate)
3334 fprintf(file, ".sat");
3335 if (inst->conditional_mod) {
3336 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3337 if (!inst->predicate &&
3338 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3339 inst->opcode != BRW_OPCODE_IF &&
3340 inst->opcode != BRW_OPCODE_WHILE))) {
3341 fprintf(file, ".f0.%d", inst->flag_subreg);
3342 }
3343 }
3344 fprintf(file, "(%d) ", inst->exec_size);
3345
3346
3347 switch (inst->dst.file) {
3348 case GRF:
3349 fprintf(file, "vgrf%d", inst->dst.reg);
3350 if (inst->dst.width != dispatch_width)
3351 fprintf(file, "@%d", inst->dst.width);
3352 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3353 inst->dst.subreg_offset)
3354 fprintf(file, "+%d.%d",
3355 inst->dst.reg_offset, inst->dst.subreg_offset);
3356 break;
3357 case MRF:
3358 fprintf(file, "m%d", inst->dst.reg);
3359 break;
3360 case BAD_FILE:
3361 fprintf(file, "(null)");
3362 break;
3363 case UNIFORM:
3364 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3365 break;
3366 case ATTR:
3367 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3368 break;
3369 case HW_REG:
3370 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3371 switch (inst->dst.fixed_hw_reg.nr) {
3372 case BRW_ARF_NULL:
3373 fprintf(file, "null");
3374 break;
3375 case BRW_ARF_ADDRESS:
3376 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3377 break;
3378 case BRW_ARF_ACCUMULATOR:
3379 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3380 break;
3381 case BRW_ARF_FLAG:
3382 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3383 inst->dst.fixed_hw_reg.subnr);
3384 break;
3385 default:
3386 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3387 inst->dst.fixed_hw_reg.subnr);
3388 break;
3389 }
3390 } else {
3391 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3392 }
3393 if (inst->dst.fixed_hw_reg.subnr)
3394 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3395 break;
3396 default:
3397 fprintf(file, "???");
3398 break;
3399 }
3400 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3401
3402 for (int i = 0; i < inst->sources; i++) {
3403 if (inst->src[i].negate)
3404 fprintf(file, "-");
3405 if (inst->src[i].abs)
3406 fprintf(file, "|");
3407 switch (inst->src[i].file) {
3408 case GRF:
3409 fprintf(file, "vgrf%d", inst->src[i].reg);
3410 if (inst->src[i].width != dispatch_width)
3411 fprintf(file, "@%d", inst->src[i].width);
3412 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3413 inst->src[i].subreg_offset)
3414 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3415 inst->src[i].subreg_offset);
3416 break;
3417 case MRF:
3418 fprintf(file, "***m%d***", inst->src[i].reg);
3419 break;
3420 case ATTR:
3421 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3422 break;
3423 case UNIFORM:
3424 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3425 if (inst->src[i].reladdr) {
3426 fprintf(file, "+reladdr");
3427 } else if (inst->src[i].subreg_offset) {
3428 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3429 inst->src[i].subreg_offset);
3430 }
3431 break;
3432 case BAD_FILE:
3433 fprintf(file, "(null)");
3434 break;
3435 case IMM:
3436 switch (inst->src[i].type) {
3437 case BRW_REGISTER_TYPE_F:
3438 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3439 break;
3440 case BRW_REGISTER_TYPE_W:
3441 case BRW_REGISTER_TYPE_D:
3442 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3443 break;
3444 case BRW_REGISTER_TYPE_UW:
3445 case BRW_REGISTER_TYPE_UD:
3446 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3447 break;
3448 case BRW_REGISTER_TYPE_VF:
3449 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3450 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3451 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3452 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3453 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3454 break;
3455 default:
3456 fprintf(file, "???");
3457 break;
3458 }
3459 break;
3460 case HW_REG:
3461 if (inst->src[i].fixed_hw_reg.negate)
3462 fprintf(file, "-");
3463 if (inst->src[i].fixed_hw_reg.abs)
3464 fprintf(file, "|");
3465 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3466 switch (inst->src[i].fixed_hw_reg.nr) {
3467 case BRW_ARF_NULL:
3468 fprintf(file, "null");
3469 break;
3470 case BRW_ARF_ADDRESS:
3471 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3472 break;
3473 case BRW_ARF_ACCUMULATOR:
3474 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3475 break;
3476 case BRW_ARF_FLAG:
3477 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3478 inst->src[i].fixed_hw_reg.subnr);
3479 break;
3480 default:
3481 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3482 inst->src[i].fixed_hw_reg.subnr);
3483 break;
3484 }
3485 } else {
3486 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3487 }
3488 if (inst->src[i].fixed_hw_reg.subnr)
3489 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3490 if (inst->src[i].fixed_hw_reg.abs)
3491 fprintf(file, "|");
3492 break;
3493 default:
3494 fprintf(file, "???");
3495 break;
3496 }
3497 if (inst->src[i].abs)
3498 fprintf(file, "|");
3499
3500 if (inst->src[i].file != IMM) {
3501 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3502 }
3503
3504 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3505 fprintf(file, ", ");
3506 }
3507
3508 fprintf(file, " ");
3509
3510 if (dispatch_width == 16 && inst->exec_size == 8) {
3511 if (inst->force_sechalf)
3512 fprintf(file, "2ndhalf ");
3513 else
3514 fprintf(file, "1sthalf ");
3515 }
3516
3517 fprintf(file, "\n");
3518 }
3519
3520 /**
3521 * Possibly returns an instruction that set up @param reg.
3522 *
3523 * Sometimes we want to take the result of some expression/variable
3524 * dereference tree and rewrite the instruction generating the result
3525 * of the tree. When processing the tree, we know that the
3526 * instructions generated are all writing temporaries that are dead
3527 * outside of this tree. So, if we have some instructions that write
3528 * a temporary, we're free to point that temp write somewhere else.
3529 *
3530 * Note that this doesn't guarantee that the instruction generated
3531 * only reg -- it might be the size=4 destination of a texture instruction.
3532 */
3533 fs_inst *
3534 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3535 fs_inst *end,
3536 const fs_reg &reg)
3537 {
3538 if (end == start ||
3539 end->is_partial_write() ||
3540 reg.reladdr ||
3541 !reg.equals(end->dst)) {
3542 return NULL;
3543 } else {
3544 return end;
3545 }
3546 }
3547
3548 void
3549 fs_visitor::setup_payload_gen6()
3550 {
3551 bool uses_depth =
3552 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3553 unsigned barycentric_interp_modes =
3554 (stage == MESA_SHADER_FRAGMENT) ?
3555 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3556
3557 assert(brw->gen >= 6);
3558
3559 /* R0-1: masks, pixel X/Y coordinates. */
3560 payload.num_regs = 2;
3561    /* R2: only for 32-pixel dispatch. */
3562
3563 /* R3-26: barycentric interpolation coordinates. These appear in the
3564 * same order that they appear in the brw_wm_barycentric_interp_mode
3565 * enum. Each set of coordinates occupies 2 registers if dispatch width
3566 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3567 * appear if they were enabled using the "Barycentric Interpolation
3568 * Mode" bits in WM_STATE.
3569 */
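   /* Illustrative sketch only (hypothetical WM state): with perspective
    * pixel and perspective centroid barycentrics both enabled in SIMD16,
    * the loop below reserves 4 + 4 = 8 payload registers here.
    */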
3570 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3571 if (barycentric_interp_modes & (1 << i)) {
3572 payload.barycentric_coord_reg[i] = payload.num_regs;
3573 payload.num_regs += 2;
3574 if (dispatch_width == 16) {
3575 payload.num_regs += 2;
3576 }
3577 }
3578 }
3579
3580 /* R27: interpolated depth if uses source depth */
3581 if (uses_depth) {
3582 payload.source_depth_reg = payload.num_regs;
3583 payload.num_regs++;
3584 if (dispatch_width == 16) {
3585 /* R28: interpolated depth if not SIMD8. */
3586 payload.num_regs++;
3587 }
3588 }
3589 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3590 if (uses_depth) {
3591 payload.source_w_reg = payload.num_regs;
3592 payload.num_regs++;
3593 if (dispatch_width == 16) {
3594 /* R30: interpolated W if not SIMD8. */
3595 payload.num_regs++;
3596 }
3597 }
3598
3599 if (stage == MESA_SHADER_FRAGMENT) {
3600 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3601 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3602 prog_data->uses_pos_offset = key->compute_pos_offset;
3603 /* R31: MSAA position offsets. */
3604 if (prog_data->uses_pos_offset) {
3605 payload.sample_pos_reg = payload.num_regs;
3606 payload.num_regs++;
3607 }
3608 }
3609
3610 /* R32: MSAA input coverage mask */
3611 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3612 assert(brw->gen >= 7);
3613 payload.sample_mask_in_reg = payload.num_regs;
3614 payload.num_regs++;
3615 if (dispatch_width == 16) {
3616 /* R33: input coverage mask if not SIMD8. */
3617 payload.num_regs++;
3618 }
3619 }
3620
3621 /* R34-: bary for 32-pixel. */
3622 /* R58-59: interp W for 32-pixel. */
3623
3624 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3625 source_depth_to_render_target = true;
3626 }
3627 }
3628
3629 void
3630 fs_visitor::setup_vs_payload()
3631 {
3632 /* R0: thread header, R1: urb handles */
3633 payload.num_regs = 2;
3634 }
3635
3636 void
3637 fs_visitor::assign_binding_table_offsets()
3638 {
3639 assert(stage == MESA_SHADER_FRAGMENT);
3640 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3641 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3642 uint32_t next_binding_table_offset = 0;
3643
3644 /* If there are no color regions, we still perform an FB write to a null
3645 * renderbuffer, which we place at surface index 0.
3646 */
3647 prog_data->binding_table.render_target_start = next_binding_table_offset;
3648 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3649
3650 assign_common_binding_table_offsets(next_binding_table_offset);
3651 }
3652
3653 void
3654 fs_visitor::calculate_register_pressure()
3655 {
3656 invalidate_live_intervals();
3657 calculate_live_intervals();
3658
3659 unsigned num_instructions = 0;
3660 foreach_block(block, cfg)
3661 num_instructions += block->instructions.length();
3662
3663 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3664
3665 for (unsigned reg = 0; reg < alloc.count; reg++) {
3666 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3667 regs_live_at_ip[ip] += alloc.sizes[reg];
3668 }
3669 }
3670
3671 void
3672 fs_visitor::optimize()
3673 {
3674 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3675
3676 split_virtual_grfs();
3677
3678 move_uniform_array_access_to_pull_constants();
3679 assign_constant_locations();
3680 demote_pull_constants();
3681
3682 #define OPT(pass, args...) ({ \
3683 pass_num++; \
3684 bool this_progress = pass(args); \
3685 \
3686 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3687 char filename[64]; \
3688 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3689 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3690 \
3691 backend_visitor::dump_instructions(filename); \
3692 } \
3693 \
3694 progress = progress || this_progress; \
3695 this_progress; \
3696 })
3697
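/* Roughly speaking (a sketch, with the DEBUG_OPTIMIZER dump branch elided),
 * OPT(opt_cse) expands to the statement expression
 *
 *    ({
 *       pass_num++;
 *       bool this_progress = opt_cse();
 *       progress = progress || this_progress;
 *       this_progress;
 *    })
 *
 * so every pass both feeds `progress` for the do/while loop below and still
 * yields its own result, which is why constructs like
 * `if (OPT(lower_load_payload)) { ... }` work.
 */
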
3698 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3699 char filename[64];
3700 snprintf(filename, 64, "%s%d-%04d-00-start",
3701 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3702
3703 backend_visitor::dump_instructions(filename);
3704 }
3705
3706 bool progress;
3707 int iteration = 0;
3708 int pass_num = 0;
3709 do {
3710 progress = false;
3711 pass_num = 0;
3712 iteration++;
3713
3714 OPT(remove_duplicate_mrf_writes);
3715
3716 OPT(opt_algebraic);
3717 OPT(opt_cse);
3718 OPT(opt_copy_propagate);
3719 OPT(opt_peephole_predicated_break);
3720 OPT(opt_cmod_propagation);
3721 OPT(dead_code_eliminate);
3722 OPT(opt_peephole_sel);
3723 OPT(dead_control_flow_eliminate, this);
3724 OPT(opt_register_renaming);
3725 OPT(opt_redundant_discard_jumps);
3726 OPT(opt_saturate_propagation);
3727 OPT(register_coalesce);
3728 OPT(compute_to_mrf);
3729
3730 OPT(compact_virtual_grfs);
3731 } while (progress);
3732
3733 pass_num = 0;
3734
3735 if (OPT(lower_load_payload)) {
3736 split_virtual_grfs();
3737 OPT(register_coalesce);
3738 OPT(compute_to_mrf);
3739 OPT(dead_code_eliminate);
3740 }
3741
3742 OPT(opt_combine_constants);
3743
3744 lower_uniform_pull_constant_loads();
3745 }
3746
3747 /**
3748 * Three-source instructions must have a GRF/MRF destination register.
3749 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3750 */
3751 void
3752 fs_visitor::fixup_3src_null_dest()
3753 {
3754 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3755 if (inst->is_3src() && inst->dst.is_null()) {
3756 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3757 inst->dst.type);
3758 }
3759 }
3760 }
3761
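/* As an example of what the pass above rewrites (a sketch, in rough
 * pseudo-assembly): a three-source instruction whose result became dead but
 * which still has to execute, e.g. for its conditional mod,
 *
 *    mad.g.f0  null   a  b  c
 *
 * is given a freshly allocated VGRF destination,
 *
 *    mad.g.f0  vgrfN  a  b  c
 *
 * The value written to vgrfN is never read, but the destination now
 * satisfies the GRF/MRF requirement for three-source instructions.
 */
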
3762 void
3763 fs_visitor::allocate_registers()
3764 {
3765 bool allocated_without_spills;
3766
3767 static const enum instruction_scheduler_mode pre_modes[] = {
3768 SCHEDULE_PRE,
3769 SCHEDULE_PRE_NON_LIFO,
3770 SCHEDULE_PRE_LIFO,
3771 };
3772
3773 /* Try each scheduling heuristic to see if it can successfully register
3774 * allocate without spilling. They should be ordered by decreasing
3775 * performance but increasing likelihood of allocating.
3776 */
3777 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3778 schedule_instructions(pre_modes[i]);
3779
3780 if (0) {
3781 assign_regs_trivial();
3782 allocated_without_spills = true;
3783 } else {
3784 allocated_without_spills = assign_regs(false);
3785 }
3786 if (allocated_without_spills)
3787 break;
3788 }
3789
3790 if (!allocated_without_spills) {
3791 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3792 "Vertex" : "Fragment";
3793
3794 /* We assume that any spilling is worse than just dropping back to
3795 * SIMD8.  In practice there is probably some intermediate point where
3796 * SIMD16 with a couple of spills would still be the better choice.
3797 */
3798 if (dispatch_width == 16) {
3799 fail("Failure to register allocate. Reduce number of "
3800 "live scalar values to avoid this.");
3801 } else {
3802 perf_debug("%s shader triggered register spilling. "
3803 "Try reducing the number of live scalar values to "
3804 "improve performance.\n", stage_name);
3805 }
3806
3807 /* Since we're out of heuristics, just go spill registers until we
3808 * get an allocation.
3809 */
3810 while (!assign_regs(true)) {
3811 if (failed)
3812 break;
3813 }
3814 }
3815
3816 /* This must come after all optimization and register allocation, since
3817 * it inserts dead code that happens to have side effects, and it does
3818 * so based on the actual physical registers in use.
3819 */
3820 insert_gen4_send_dependency_workarounds();
3821
3822 if (failed)
3823 return;
3824
3825 if (!allocated_without_spills)
3826 schedule_instructions(SCHEDULE_POST);
3827
3828 if (last_scratch > 0)
3829 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3830 }
3831
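/* The overall fallback ladder implemented above, in order (sketch):
 *
 *    1. SCHEDULE_PRE          + assign_regs(false)   fastest schedule
 *    2. SCHEDULE_PRE_NON_LIFO + assign_regs(false)
 *    3. SCHEDULE_PRE_LIFO     + assign_regs(false)   lowest register pressure
 *    4. assign_regs(true) in a loop, spilling until allocation succeeds
 *       (SIMD16 compiles fail() here instead, and the caller falls back
 *       to the SIMD8 program)
 *    5. SCHEDULE_POST, but only if step 4 was needed
 */
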
3832 bool
3833 fs_visitor::run_vs()
3834 {
3835 assert(stage == MESA_SHADER_VERTEX);
3836
3837 assign_common_binding_table_offsets(0);
3838 setup_vs_payload();
3839
3840 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3841 emit_shader_time_begin();
3842
3843 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3844 base_ir = ir;
3845 this->result = reg_undef;
3846 ir->accept(this);
3847 }
3848 base_ir = NULL;
3849 if (failed)
3850 return false;
3851
3852 emit_urb_writes();
3853
3854 calculate_cfg();
3855
3856 optimize();
3857
3858 assign_curb_setup();
3859 assign_vs_urb_setup();
3860
3861 fixup_3src_null_dest();
3862 allocate_registers();
3863
3864 return !failed;
3865 }
3866
3867 bool
3868 fs_visitor::run_fs()
3869 {
3870 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3871 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3872
3873 assert(stage == MESA_SHADER_FRAGMENT);
3874
3875 sanity_param_count = prog->Parameters->NumParameters;
3876
3877 assign_binding_table_offsets();
3878
3879 if (brw->gen >= 6)
3880 setup_payload_gen6();
3881 else
3882 setup_payload_gen4();
3883
3884 if (0) {
3885 emit_dummy_fs();
3886 } else if (brw->use_rep_send && dispatch_width == 16) {
3887 emit_repclear_shader();
3888 } else {
3889 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3890 emit_shader_time_begin();
3891
3892 calculate_urb_setup();
3893 if (prog->InputsRead > 0) {
3894 if (brw->gen < 6)
3895 emit_interpolation_setup_gen4();
3896 else
3897 emit_interpolation_setup_gen6();
3898 }
3899
3900 /* We handle discards by keeping track of the still-live pixels in f0.1.
3901 * Initialize it with the dispatched pixels.
3902 */
3903 if (wm_prog_data->uses_kill) {
3904 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3905 discard_init->flag_subreg = 1;
3906 }
3907
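/* In other words (sketch): after this MOV, f0.1 holds the dispatch mask, and
 * each discard executed later clears the bits for its killed channels, so
 * f0.1 always describes the pixels that are still live when the framebuffer
 * write finally happens.
 */
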
3908 /* Generate FS IR for main().  (The visitor only descends into
3909 * functions called "main".)
3910 */
3911 if (shader) {
3912 if (getenv("INTEL_USE_NIR") != NULL) {
3913 emit_nir_code();
3914 } else {
3915 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3916 base_ir = ir;
3917 this->result = reg_undef;
3918 ir->accept(this);
3919 }
3920 }
3921 } else {
3922 emit_fragment_program_code();
3923 }
3924 base_ir = NULL;
3925 if (failed)
3926 return false;
3927
3928 emit(FS_OPCODE_PLACEHOLDER_HALT);
3929
3930 if (wm_key->alpha_test_func)
3931 emit_alpha_test();
3932
3933 emit_fb_writes();
3934
3935 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3936 emit_shader_time_end();
3937
3938 calculate_cfg();
3939
3940 optimize();
3941
3942 assign_curb_setup();
3943 assign_urb_setup();
3944
3945 fixup_3src_null_dest();
3946 allocate_registers();
3947
3948 if (failed)
3949 return false;
3950 }
3951
3952 if (dispatch_width == 8)
3953 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3954 else
3955 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3956
3957 /* If any state parameters were appended, then ParameterValues could have
3958 * been reallocated, in which case the driver uniform storage set up by
3959 * _mesa_associate_uniform_storage() would point to freed memory. Make
3960 * sure that didn't happen.
3961 */
3962 assert(sanity_param_count == prog->Parameters->NumParameters);
3963
3964 return !failed;
3965 }
3966
3967 const unsigned *
3968 brw_wm_fs_emit(struct brw_context *brw,
3969 void *mem_ctx,
3970 const struct brw_wm_prog_key *key,
3971 struct brw_wm_prog_data *prog_data,
3972 struct gl_fragment_program *fp,
3973 struct gl_shader_program *prog,
3974 unsigned *final_assembly_size)
3975 {
3976 bool start_busy = false;
3977 double start_time = 0;
3978
3979 if (unlikely(brw->perf_debug)) {
3980 start_busy = (brw->batch.last_bo &&
3981 drm_intel_bo_busy(brw->batch.last_bo));
3982 start_time = get_time();
3983 }
3984
3985 struct brw_shader *shader = NULL;
3986 if (prog)
3987 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3988
3989 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3990 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3991
3992 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3993 */
3994 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3995 if (!v.run_fs()) {
3996 if (prog) {
3997 prog->LinkStatus = false;
3998 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3999 }
4000
4001 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4002 v.fail_msg);
4003
4004 return NULL;
4005 }
4006
4007 cfg_t *simd16_cfg = NULL;
4008 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4009 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
4010 brw->use_rep_send)) {
4011 if (!v.simd16_unsupported) {
4012 /* Try a SIMD16 compile */
4013 v2.import_uniforms(&v);
4014 if (!v2.run_fs()) {
4015 perf_debug("SIMD16 shader failed to compile, falling back to "
4016 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4017 } else {
4018 simd16_cfg = v2.cfg;
4019 }
4020 } else {
4021 perf_debug("SIMD16 shader unsupported, falling back to "
4022 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4023 }
4024 }
4025
4026 cfg_t *simd8_cfg;
4027 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4028 if (no_simd8 && simd16_cfg) {
4029 simd8_cfg = NULL;
4030 prog_data->no_8 = true;
4031 } else {
4032 simd8_cfg = v.cfg;
4033 prog_data->no_8 = false;
4034 }
4035
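/* The possible outcomes of the two compiles, summarized (sketch):
 *
 *    simd8_cfg   simd16_cfg   result
 *    set         NULL         SIMD8 program only
 *    set         set          both emitted; SIMD16 starts at prog_offset_16
 *    NULL        set          SIMD16 only (DEBUG_NO8/no_simd8); no_8 is set
 */
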
4036 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4037 &fp->Base, v.runtime_check_aads_emit, "FS");
4038
4039 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4040 char *name;
4041 if (prog)
4042 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4043 prog->Label ? prog->Label : "unnamed",
4044 prog->Name);
4045 else
4046 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4047
4048 g.enable_debug(name);
4049 }
4050
4051 if (simd8_cfg)
4052 g.generate_code(simd8_cfg, 8);
4053 if (simd16_cfg)
4054 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4055
4056 if (unlikely(brw->perf_debug) && shader) {
4057 if (shader->compiled_once)
4058 brw_wm_debug_recompile(brw, prog, key);
4059 shader->compiled_once = true;
4060
4061 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4062 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4063 (get_time() - start_time) * 1000);
4064 }
4065 }
4066
4067 return g.get_assembly(final_assembly_size);
4068 }
4069
4070 extern "C" bool
4071 brw_fs_precompile(struct gl_context *ctx,
4072 struct gl_shader_program *shader_prog,
4073 struct gl_program *prog)
4074 {
4075 struct brw_context *brw = brw_context(ctx);
4076 struct brw_wm_prog_key key;
4077
4078 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4079 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4080 bool program_uses_dfdy = fp->UsesDFdy;
4081
4082 memset(&key, 0, sizeof(key));
4083
4084 if (brw->gen < 6) {
4085 if (fp->UsesKill)
4086 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4087
4088 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4089 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4090
4091 /* Just assume depth testing. */
4092 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4093 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4094 }
4095
4096 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4097 BRW_FS_VARYING_INPUT_MASK) > 16)
4098 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4099
4100 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4101 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4102 for (unsigned i = 0; i < sampler_count; i++) {
4103 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4104 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4105 key.tex.swizzles[i] =
4106 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4107 } else {
4108 /* Color sampler: assume no swizzling. */
4109 key.tex.swizzles[i] = SWIZZLE_XYZW;
4110 }
4111 }
4112
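/* For instance (sketch): on gen6, a shadow sampler bound to unit 0 gets
 * key.tex.swizzles[0] = MAKE_SWIZZLE4(X, X, X, ONE), i.e. the classic
 * DEPTH_TEXTURE_MODE behaviour, while every color sampler is assumed to be
 * unswizzled (SWIZZLE_XYZW).
 */
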
4113 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4114 key.drawable_height = ctx->DrawBuffer->Height;
4115 }
4116
4117 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4118 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4119 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4120
4121 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4122 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4123 key.nr_color_regions > 1;
4124 }
4125
4126 key.program_string_id = bfp->id;
4127
4128 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4129 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4130
4131 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
4132
4133 brw->wm.base.prog_offset = old_prog_offset;
4134 brw->wm.prog_data = old_prog_data;
4135
4136 return success;
4137 }