i965: Make NIR non-optional for scalar shaders
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 this->conditional_mod = BRW_CONDITIONAL_NONE;
94
95 /* This will be the case for almost all instructions. */
96 switch (dst.file) {
97 case GRF:
98 case HW_REG:
99 case MRF:
100 case ATTR:
101 this->regs_written =
102 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
103 break;
104 case BAD_FILE:
105 this->regs_written = 0;
106 break;
107 case IMM:
108 case UNIFORM:
109 unreachable("Invalid destination register file");
110 default:
111 unreachable("Invalid register file");
112 }
113
114 this->writes_accumulator = false;
115 }
116
117 fs_inst::fs_inst()
118 {
119 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
120 }
121
122 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
123 {
124 init(opcode, exec_size, reg_undef, NULL, 0);
125 }
126
127 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
128 {
129 init(opcode, 0, dst, NULL, 0);
130 }
131
132 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
133 const fs_reg &src0)
134 {
135 const fs_reg src[1] = { src0 };
136 init(opcode, exec_size, dst, src, 1);
137 }
138
139 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
140 {
141 const fs_reg src[1] = { src0 };
142 init(opcode, 0, dst, src, 1);
143 }
144
145 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
146 const fs_reg &src0, const fs_reg &src1)
147 {
148 const fs_reg src[2] = { src0, src1 };
149 init(opcode, exec_size, dst, src, 2);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
153 const fs_reg &src1)
154 {
155 const fs_reg src[2] = { src0, src1 };
156 init(opcode, 0, dst, src, 2);
157 }
158
159 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
160 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
161 {
162 const fs_reg src[3] = { src0, src1, src2 };
163 init(opcode, exec_size, dst, src, 3);
164 }
165
166 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
167 const fs_reg &src1, const fs_reg &src2)
168 {
169 const fs_reg src[3] = { src0, src1, src2 };
170 init(opcode, 0, dst, src, 3);
171 }
172
173 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
174 const fs_reg src[], unsigned sources)
175 {
176 init(opcode, 0, dst, src, sources);
177 }
178
179 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
180 const fs_reg src[], unsigned sources)
181 {
182 init(opcode, exec_width, dst, src, sources);
183 }
184
185 fs_inst::fs_inst(const fs_inst &that)
186 {
187 memcpy(this, &that, sizeof(that));
188
189 this->src = new fs_reg[MAX2(that.sources, 3)];
190
191 for (unsigned i = 0; i < that.sources; i++)
192 this->src[i] = that.src[i];
193 }
194
195 fs_inst::~fs_inst()
196 {
197 delete[] this->src;
198 }
199
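/**
 * Resize the source array to num_sources, preserving any sources that fit.
 * At least three slots are always allocated, so the common 0-3 source
 * instructions never need a reallocation.
 */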
200 void
201 fs_inst::resize_sources(uint8_t num_sources)
202 {
203 if (this->sources != num_sources) {
204 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
205
206 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
207 src[i] = this->src[i];
208
209 delete[] this->src;
210 this->src = src;
211 this->sources = num_sources;
212 }
213 }
214
215 #define ALU1(op) \
216 fs_inst * \
217 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
218 { \
219 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
220 }
221
222 #define ALU2(op) \
223 fs_inst * \
224 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
225 const fs_reg &src1) \
226 { \
227 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
228 }
229
230 #define ALU2_ACC(op) \
231 fs_inst * \
232 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
233 const fs_reg &src1) \
234 { \
235 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
236 inst->writes_accumulator = true; \
237 return inst; \
238 }
239
240 #define ALU3(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
243 const fs_reg &src1, const fs_reg &src2) \
244 { \
245 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
246 }
247
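/* Each ALUn(op) invocation below defines a small emit helper; for example,
 * ALU2(ADD) expands to roughly:
 *
 *    fs_inst *
 *    fs_visitor::ADD(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */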
248 ALU1(NOT)
249 ALU1(MOV)
250 ALU1(FRC)
251 ALU1(RNDD)
252 ALU1(RNDE)
253 ALU1(RNDZ)
254 ALU2(ADD)
255 ALU2(MUL)
256 ALU2_ACC(MACH)
257 ALU2(AND)
258 ALU2(OR)
259 ALU2(XOR)
260 ALU2(SHL)
261 ALU2(SHR)
262 ALU2(ASR)
263 ALU3(LRP)
264 ALU1(BFREV)
265 ALU3(BFE)
266 ALU2(BFI1)
267 ALU3(BFI2)
268 ALU1(FBH)
269 ALU1(FBL)
270 ALU1(CBIT)
271 ALU3(MAD)
272 ALU2_ACC(ADDC)
273 ALU2_ACC(SUBB)
274 ALU2(SEL)
275 ALU2(MAC)
276
277 /** Gen4 predicated IF. */
278 fs_inst *
279 fs_visitor::IF(enum brw_predicate predicate)
280 {
281 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
282 inst->predicate = predicate;
283 return inst;
284 }
285
286 /** Gen6 IF with embedded comparison. */
287 fs_inst *
288 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
289 enum brw_conditional_mod condition)
290 {
291 assert(devinfo->gen == 6);
292 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
293 reg_null_d, src0, src1);
294 inst->conditional_mod = condition;
295 return inst;
296 }
297
298 /**
299 * CMP: Sets the low bit of the destination channels with the result
300 * of the comparison, while the upper bits are undefined, and updates
301 * the flag register with the packed 16 bits of the result.
302 */
303 fs_inst *
304 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
305 enum brw_conditional_mod condition)
306 {
307 fs_inst *inst;
308
309 /* Take the instruction:
310 *
311 * CMP null<d> src0<f> src1<f>
312 *
313 * Original gen4 does type conversion to the destination type before
314 * comparison, producing garbage results for floating point comparisons.
315 *
316 * The destination type doesn't matter on newer generations, so we set the
317 * type to match src0 so we can compact the instruction.
318 */
319 dst.type = src0.type;
320 if (dst.file == HW_REG)
321 dst.fixed_hw_reg.type = dst.type;
322
323 resolve_ud_negate(&src0);
324 resolve_ud_negate(&src1);
325
326 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
327 inst->conditional_mod = condition;
328
329 return inst;
330 }
331
332 fs_inst *
333 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
334 int header_size)
335 {
336 assert(dst.width % 8 == 0);
337 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
338 dst, src, sources);
339 inst->header_size = header_size;
340
341 for (int i = 0; i < header_size; i++)
342 assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
343 inst->regs_written = header_size;
344
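/* Each header source takes exactly one register, and each remaining payload
 * source takes dst.width / 8 registers; the asserts above and below
 * sanity-check this for GRF sources.
 */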
345 for (int i = header_size; i < sources; ++i)
346 assert(src[i].file != GRF || src[i].width == dst.width);
347 inst->regs_written += (sources - header_size) * (dst.width / 8);
348
349 return inst;
350 }
351
352 exec_list
353 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
354 const fs_reg &surf_index,
355 const fs_reg &varying_offset,
356 uint32_t const_offset)
357 {
358 exec_list instructions;
359 fs_inst *inst;
360
361 /* We have our constant surface use a pitch of 4 bytes, so our index can
362 * be any component of a vector, and then we load 4 contiguous
363 * components starting from that.
364 *
365 * We break down the const_offset to a portion added to the variable
366 * offset and a portion done using reg_offset, which means that if you
367 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
368 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
369 * CSE can later notice that those loads are all the same and eliminate
370 * the redundant ones.
371 */
372 fs_reg vec4_offset = vgrf(glsl_type::int_type);
373 instructions.push_tail(ADD(vec4_offset,
374 varying_offset, fs_reg(const_offset & ~3)));
375
376 int scale = 1;
377 if (devinfo->gen == 4 && dst.width == 8) {
378 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
379 * u, v, r) as parameters, or we can just use the SIMD16 message
380 * consisting of (header, u). We choose the second, at the cost of a
381 * longer return length.
382 */
383 scale = 2;
384 }
385
386 enum opcode op;
387 if (devinfo->gen >= 7)
388 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
389 else
390 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
391
392 assert(dst.width % 8 == 0);
393 int regs_written = 4 * (dst.width / 8) * scale;
394 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
395 dst.type, dst.width);
396 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
397 inst->regs_written = regs_written;
398 instructions.push_tail(inst);
399
400 if (devinfo->gen < 7) {
401 inst->base_mrf = 13;
402 inst->header_size = 1;
403 if (devinfo->gen == 4)
404 inst->mlen = 3;
405 else
406 inst->mlen = 1 + dispatch_width / 8;
407 }
408
409 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
410 instructions.push_tail(MOV(dst, result));
411
412 return instructions;
413 }
414
415 /**
416 * A helper for MOV generation for fixing up broken hardware SEND dependency
417 * handling.
418 */
419 fs_inst *
420 fs_visitor::DEP_RESOLVE_MOV(int grf)
421 {
422 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
423
424 inst->ir = NULL;
425 inst->annotation = "send dependency resolve";
426
427 /* The caller always wants uncompressed to emit the minimal extra
428 * dependencies, and to avoid having to deal with aligning its regs to 2.
429 */
430 inst->exec_size = 8;
431
432 return inst;
433 }
434
435 bool
436 fs_inst::equals(fs_inst *inst) const
437 {
438 return (opcode == inst->opcode &&
439 dst.equals(inst->dst) &&
440 src[0].equals(inst->src[0]) &&
441 src[1].equals(inst->src[1]) &&
442 src[2].equals(inst->src[2]) &&
443 saturate == inst->saturate &&
444 predicate == inst->predicate &&
445 conditional_mod == inst->conditional_mod &&
446 mlen == inst->mlen &&
447 base_mrf == inst->base_mrf &&
448 target == inst->target &&
449 eot == inst->eot &&
450 header_size == inst->header_size &&
451 shadow_compare == inst->shadow_compare &&
452 exec_size == inst->exec_size &&
453 offset == inst->offset);
454 }
455
456 bool
457 fs_inst::overwrites_reg(const fs_reg &reg) const
458 {
459 return reg.in_range(dst, regs_written);
460 }
461
462 bool
463 fs_inst::is_send_from_grf() const
464 {
465 switch (opcode) {
466 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
467 case SHADER_OPCODE_SHADER_TIME_ADD:
468 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
469 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
470 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
471 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
472 case SHADER_OPCODE_UNTYPED_ATOMIC:
473 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
474 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
475 case SHADER_OPCODE_TYPED_ATOMIC:
476 case SHADER_OPCODE_TYPED_SURFACE_READ:
477 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
478 case SHADER_OPCODE_URB_WRITE_SIMD8:
479 return true;
480 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
481 return src[1].file == GRF;
482 case FS_OPCODE_FB_WRITE:
483 return src[0].file == GRF;
484 default:
485 if (is_tex())
486 return src[0].file == GRF;
487
488 return false;
489 }
490 }
491
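/**
 * Returns true if this LOAD_PAYLOAD is nothing more than a copy of one
 * contiguous VGRF: source 0 names the start of an allocation whose size
 * matches regs_written, and each subsequent source is simply the next
 * register of that same allocation.
 */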
492 bool
493 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
494 {
495 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
496 return false;
497
498 fs_reg reg = this->src[0];
499 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
500 return false;
501
502 if (grf_alloc.sizes[reg.reg] != this->regs_written)
503 return false;
504
505 for (int i = 0; i < this->sources; i++) {
506 reg.type = this->src[i].type;
507 reg.width = this->src[i].width;
508 if (!this->src[i].equals(reg))
509 return false;
510 reg = ::offset(reg, 1);
511 }
512
513 return true;
514 }
515
516 bool
517 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
518 {
519 if (devinfo->gen == 6 && is_math())
520 return false;
521
522 if (is_send_from_grf())
523 return false;
524
525 if (!backend_instruction::can_do_source_mods())
526 return false;
527
528 return true;
529 }
530
531 bool
532 fs_inst::has_side_effects() const
533 {
534 return this->eot || backend_instruction::has_side_effects();
535 }
536
537 void
538 fs_reg::init()
539 {
540 memset(this, 0, sizeof(*this));
541 stride = 1;
542 }
543
544 /** Generic unset register constructor. */
545 fs_reg::fs_reg()
546 {
547 init();
548 this->file = BAD_FILE;
549 }
550
551 /** Immediate value constructor. */
552 fs_reg::fs_reg(float f)
553 {
554 init();
555 this->file = IMM;
556 this->type = BRW_REGISTER_TYPE_F;
557 this->fixed_hw_reg.dw1.f = f;
558 this->width = 1;
559 }
560
561 /** Immediate value constructor. */
562 fs_reg::fs_reg(int32_t i)
563 {
564 init();
565 this->file = IMM;
566 this->type = BRW_REGISTER_TYPE_D;
567 this->fixed_hw_reg.dw1.d = i;
568 this->width = 1;
569 }
570
571 /** Immediate value constructor. */
572 fs_reg::fs_reg(uint32_t u)
573 {
574 init();
575 this->file = IMM;
576 this->type = BRW_REGISTER_TYPE_UD;
577 this->fixed_hw_reg.dw1.ud = u;
578 this->width = 1;
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf[4])
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
588 }
589
590 /** Vector float immediate value constructor. */
591 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
592 {
593 init();
594 this->file = IMM;
595 this->type = BRW_REGISTER_TYPE_VF;
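/* Pack the four VF-encoded bytes into a single dword, vf0 in the low byte. */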
596 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
597 (vf1 << 8) |
598 (vf2 << 16) |
599 (vf3 << 24);
600 }
601
602 /** Fixed brw_reg. */
603 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
604 {
605 init();
606 this->file = HW_REG;
607 this->fixed_hw_reg = fixed_hw_reg;
608 this->type = fixed_hw_reg.type;
609 this->width = 1 << fixed_hw_reg.width;
610 }
611
612 bool
613 fs_reg::equals(const fs_reg &r) const
614 {
615 return (file == r.file &&
616 reg == r.reg &&
617 reg_offset == r.reg_offset &&
618 subreg_offset == r.subreg_offset &&
619 type == r.type &&
620 negate == r.negate &&
621 abs == r.abs &&
622 !reladdr && !r.reladdr &&
623 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
624 width == r.width &&
625 stride == r.stride);
626 }
627
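/**
 * Restrict the register to the single component \p subreg: point
 * subreg_offset at that component and set stride to 0 so every channel reads
 * the same value.
 */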
628 fs_reg &
629 fs_reg::set_smear(unsigned subreg)
630 {
631 assert(file != HW_REG && file != IMM);
632 subreg_offset = subreg * type_sz(type);
633 stride = 0;
634 return *this;
635 }
636
637 bool
638 fs_reg::is_contiguous() const
639 {
640 return stride == 1;
641 }
642
643 int
644 fs_visitor::type_size(const struct glsl_type *type)
645 {
646 unsigned int size, i;
647
648 switch (type->base_type) {
649 case GLSL_TYPE_UINT:
650 case GLSL_TYPE_INT:
651 case GLSL_TYPE_FLOAT:
652 case GLSL_TYPE_BOOL:
653 return type->components();
654 case GLSL_TYPE_ARRAY:
655 return type_size(type->fields.array) * type->length;
656 case GLSL_TYPE_STRUCT:
657 size = 0;
658 for (i = 0; i < type->length; i++) {
659 size += type_size(type->fields.structure[i].type);
660 }
661 return size;
662 case GLSL_TYPE_SAMPLER:
663 /* Samplers take up no register space, since they're baked in at
664 * link time.
665 */
666 return 0;
667 case GLSL_TYPE_ATOMIC_UINT:
668 return 0;
669 case GLSL_TYPE_IMAGE:
670 case GLSL_TYPE_VOID:
671 case GLSL_TYPE_ERROR:
672 case GLSL_TYPE_INTERFACE:
673 case GLSL_TYPE_DOUBLE:
674 unreachable("not reached");
675 }
676
677 return 0;
678 }
679
680 /**
681 * Create a MOV to read the timestamp register.
682 *
683 * The caller is responsible for emitting the MOV. The return value is
684 * the destination of the MOV, with extra parameters set.
685 */
686 fs_reg
687 fs_visitor::get_timestamp(fs_inst **out_mov)
688 {
689 assert(devinfo->gen >= 7);
690
691 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
692 BRW_ARF_TIMESTAMP,
693 0),
694 BRW_REGISTER_TYPE_UD));
695
696 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
697
698 fs_inst *mov = MOV(dst, ts);
699 /* We want to read the 3 fields we care about regardless of whether the
700 * channel is enabled in the dispatch.
701 */
702 mov->force_writemask_all = true;
703
704 /* The caller wants the low 32 bits of the timestamp. Since it's running
705 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
706 * which is plenty of time for our purposes. It is identical across the
707 * EUs, but since it's tracking GPU core speed it will increment at a
708 * varying rate as render P-states change.
709 *
710 * The caller could also check if render P-states have changed (or anything
711 * else that might disrupt timing) by setting smear to 2 and checking if
712 * that field is != 0.
713 */
714 dst.set_smear(0);
715
716 *out_mov = mov;
717 return dst;
718 }
719
720 void
721 fs_visitor::emit_shader_time_begin()
722 {
723 current_annotation = "shader time start";
724 fs_inst *mov;
725 shader_start_time = get_timestamp(&mov);
726 emit(mov);
727 }
728
729 void
730 fs_visitor::emit_shader_time_end()
731 {
732 current_annotation = "shader time end";
733
734 enum shader_time_shader_type type, written_type, reset_type;
735 switch (stage) {
736 case MESA_SHADER_VERTEX:
737 type = ST_VS;
738 written_type = ST_VS_WRITTEN;
739 reset_type = ST_VS_RESET;
740 break;
741 case MESA_SHADER_GEOMETRY:
742 type = ST_GS;
743 written_type = ST_GS_WRITTEN;
744 reset_type = ST_GS_RESET;
745 break;
746 case MESA_SHADER_FRAGMENT:
747 if (dispatch_width == 8) {
748 type = ST_FS8;
749 written_type = ST_FS8_WRITTEN;
750 reset_type = ST_FS8_RESET;
751 } else {
752 assert(dispatch_width == 16);
753 type = ST_FS16;
754 written_type = ST_FS16_WRITTEN;
755 reset_type = ST_FS16_RESET;
756 }
757 break;
758 case MESA_SHADER_COMPUTE:
759 type = ST_CS;
760 written_type = ST_CS_WRITTEN;
761 reset_type = ST_CS_RESET;
762 break;
763 default:
764 unreachable("fs_visitor::emit_shader_time_end missing code");
765 }
766
767 /* Insert our code just before the final SEND with EOT. */
768 exec_node *end = this->instructions.get_tail();
769 assert(end && ((fs_inst *) end)->eot);
770
771 fs_inst *tm_read;
772 fs_reg shader_end_time = get_timestamp(&tm_read);
773 end->insert_before(tm_read);
774
775 /* Check that there weren't any timestamp reset events (assuming these
776 * were the only two timestamp reads that happened).
777 */
778 fs_reg reset = shader_end_time;
779 reset.set_smear(2);
780 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
781 test->conditional_mod = BRW_CONDITIONAL_Z;
782 test->force_writemask_all = true;
783 end->insert_before(test);
784 end->insert_before(IF(BRW_PREDICATE_NORMAL));
785
786 fs_reg start = shader_start_time;
787 start.negate = true;
788 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
789 diff.set_smear(0);
790 fs_inst *add = ADD(diff, start, shader_end_time);
791 add->force_writemask_all = true;
792 end->insert_before(add);
793
794 /* If there were no instructions between the two timestamp gets, the diff
795 * is 2 cycles. Remove that overhead, so I can forget about that when
796 * trying to determine the time taken for single instructions.
797 */
798 add = ADD(diff, diff, fs_reg(-2u));
799 add->force_writemask_all = true;
800 end->insert_before(add);
801
802 end->insert_before(SHADER_TIME_ADD(type, diff));
803 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
804 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
805 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
806 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
807 }
808
809 fs_inst *
810 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
811 {
812 int shader_time_index =
813 brw_get_shader_time_index(brw, shader_prog, prog, type);
814 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
815
816 fs_reg payload;
817 if (dispatch_width == 8)
818 payload = vgrf(glsl_type::uvec2_type);
819 else
820 payload = vgrf(glsl_type::uint_type);
821
822 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
823 fs_reg(), payload, offset, value);
824 }
825
826 void
827 fs_visitor::vfail(const char *format, va_list va)
828 {
829 char *msg;
830
831 if (failed)
832 return;
833
834 failed = true;
835
836 msg = ralloc_vasprintf(mem_ctx, format, va);
837 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
838
839 this->fail_msg = msg;
840
841 if (debug_enabled) {
842 fprintf(stderr, "%s", msg);
843 }
844 }
845
846 void
847 fs_visitor::fail(const char *format, ...)
848 {
849 va_list va;
850
851 va_start(va, format);
852 vfail(format, va);
853 va_end(va);
854 }
855
856 /**
857 * Mark this program as impossible to compile in SIMD16 mode.
858 *
859 * During the SIMD8 compile (which happens first), we can detect and flag
860 * things that are unsupported in SIMD16 mode, so the compiler can skip
861 * the SIMD16 compile altogether.
862 *
863 * During a SIMD16 compile (if one happens anyway), this just calls fail().
864 */
865 void
866 fs_visitor::no16(const char *format, ...)
867 {
868 va_list va;
869
870 va_start(va, format);
871
872 if (dispatch_width == 16) {
873 vfail(format, va);
874 } else {
875 simd16_unsupported = true;
876
877 if (brw->perf_debug) {
878 if (no16_msg)
879 ralloc_vasprintf_append(&no16_msg, format, va);
880 else
881 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
882 }
883 }
884
885 va_end(va);
886 }
887
888 fs_inst *
889 fs_visitor::emit(enum opcode opcode)
890 {
891 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
892 }
893
894 fs_inst *
895 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
896 {
897 return emit(new(mem_ctx) fs_inst(opcode, dst));
898 }
899
900 fs_inst *
901 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
902 {
903 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
904 }
905
906 fs_inst *
907 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
908 const fs_reg &src1)
909 {
910 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
911 }
912
913 fs_inst *
914 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
915 const fs_reg &src1, const fs_reg &src2)
916 {
917 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
918 }
919
920 fs_inst *
921 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
922 fs_reg src[], int sources)
923 {
924 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
925 }
926
927 /**
928 * Returns true if the instruction has a flag that means it won't
929 * update an entire destination register.
930 *
931 * For example, dead code elimination and live variable analysis want to know
932 * when a write to a variable screens off any preceding values that were in
933 * it.
934 */
935 bool
936 fs_inst::is_partial_write() const
937 {
938 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
939 (this->dst.width * type_sz(this->dst.type)) < 32 ||
940 !this->dst.is_contiguous());
941 }
942
943 int
944 fs_inst::regs_read(int arg) const
945 {
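/* For the send-like opcodes below, source 0 is the message payload, so the
 * number of registers read comes from mlen rather than from the source's
 * width and stride.
 */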
946 if (is_tex() && arg == 0 && src[0].file == GRF) {
947 return mlen;
948 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
949 return mlen;
950 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
951 return mlen;
952 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
953 return mlen;
954 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
955 return mlen;
956 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
957 return mlen;
958 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
959 return mlen;
960 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
961 return mlen;
962 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
963 return mlen;
964 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
965 return mlen;
966 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
967 return exec_size / 4;
968 }
969
970 switch (src[arg].file) {
971 case BAD_FILE:
972 case UNIFORM:
973 case IMM:
974 return 1;
975 case GRF:
976 case HW_REG:
977 if (src[arg].stride == 0) {
978 return 1;
979 } else {
980 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
981 return (size + 31) / 32;
982 }
983 case MRF:
984 unreachable("MRF registers are not allowed as sources");
985 default:
986 unreachable("Invalid register file");
987 }
988 }
989
990 bool
991 fs_inst::reads_flag() const
992 {
993 return predicate;
994 }
995
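/* SEL, IF and WHILE use conditional_mod without updating the flag register,
 * so they are excluded here; MOV_DISPATCH_TO_FLAGS writes the flag without a
 * conditional_mod.
 */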
996 bool
997 fs_inst::writes_flag() const
998 {
999 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
1000 opcode != BRW_OPCODE_IF &&
1001 opcode != BRW_OPCODE_WHILE)) ||
1002 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
1003 }
1004
1005 /**
1006 * Returns how many MRFs an FS opcode will write over.
1007 *
1008 * Note that this is not the 0 or 1 implied writes in an actual gen
1009 * instruction -- the FS opcodes often generate MOVs in addition.
1010 */
1011 int
1012 fs_visitor::implied_mrf_writes(fs_inst *inst)
1013 {
1014 if (inst->mlen == 0)
1015 return 0;
1016
1017 if (inst->base_mrf == -1)
1018 return 0;
1019
1020 switch (inst->opcode) {
1021 case SHADER_OPCODE_RCP:
1022 case SHADER_OPCODE_RSQ:
1023 case SHADER_OPCODE_SQRT:
1024 case SHADER_OPCODE_EXP2:
1025 case SHADER_OPCODE_LOG2:
1026 case SHADER_OPCODE_SIN:
1027 case SHADER_OPCODE_COS:
1028 return 1 * dispatch_width / 8;
1029 case SHADER_OPCODE_POW:
1030 case SHADER_OPCODE_INT_QUOTIENT:
1031 case SHADER_OPCODE_INT_REMAINDER:
1032 return 2 * dispatch_width / 8;
1033 case SHADER_OPCODE_TEX:
1034 case FS_OPCODE_TXB:
1035 case SHADER_OPCODE_TXD:
1036 case SHADER_OPCODE_TXF:
1037 case SHADER_OPCODE_TXF_CMS:
1038 case SHADER_OPCODE_TXF_MCS:
1039 case SHADER_OPCODE_TG4:
1040 case SHADER_OPCODE_TG4_OFFSET:
1041 case SHADER_OPCODE_TXL:
1042 case SHADER_OPCODE_TXS:
1043 case SHADER_OPCODE_LOD:
1044 return 1;
1045 case FS_OPCODE_FB_WRITE:
1046 return 2;
1047 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1048 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1049 return 1;
1050 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1051 return inst->mlen;
1052 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1053 return inst->mlen;
1054 case SHADER_OPCODE_UNTYPED_ATOMIC:
1055 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1056 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1057 case SHADER_OPCODE_TYPED_ATOMIC:
1058 case SHADER_OPCODE_TYPED_SURFACE_READ:
1059 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1060 case SHADER_OPCODE_URB_WRITE_SIMD8:
1061 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1062 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1063 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1064 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1065 return 0;
1066 default:
1067 unreachable("not reached");
1068 }
1069 }
1070
1071 fs_reg
1072 fs_visitor::vgrf(const glsl_type *const type)
1073 {
1074 int reg_width = dispatch_width / 8;
1075 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1076 brw_type_for_base_type(type), dispatch_width);
1077 }
1078
1079 fs_reg
1080 fs_visitor::vgrf(int num_components)
1081 {
1082 int reg_width = dispatch_width / 8;
1083 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1084 BRW_REGISTER_TYPE_F, dispatch_width);
1085 }
1086
1087 /** Fixed HW reg constructor. */
1088 fs_reg::fs_reg(enum register_file file, int reg)
1089 {
1090 init();
1091 this->file = file;
1092 this->reg = reg;
1093 this->type = BRW_REGISTER_TYPE_F;
1094
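/* Uniforms are accessed one component at a time, so they get width 1; any
 * other file defaults to a full SIMD8-wide register.
 */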
1095 switch (file) {
1096 case UNIFORM:
1097 this->width = 1;
1098 break;
1099 default:
1100 this->width = 8;
1101 }
1102 }
1103
1104 /** Fixed HW reg constructor. */
1105 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1106 {
1107 init();
1108 this->file = file;
1109 this->reg = reg;
1110 this->type = type;
1111
1112 switch (file) {
1113 case UNIFORM:
1114 this->width = 1;
1115 break;
1116 default:
1117 this->width = 8;
1118 }
1119 }
1120
1121 /** Fixed HW reg constructor. */
1122 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1123 uint8_t width)
1124 {
1125 init();
1126 this->file = file;
1127 this->reg = reg;
1128 this->type = type;
1129 this->width = width;
1130 }
1131
1132 fs_reg *
1133 fs_visitor::variable_storage(ir_variable *var)
1134 {
1135 return (fs_reg *)hash_table_find(this->variable_ht, var);
1136 }
1137
1138 void
1139 import_uniforms_callback(const void *key,
1140 void *data,
1141 void *closure)
1142 {
1143 struct hash_table *dst_ht = (struct hash_table *)closure;
1144 const fs_reg *reg = (const fs_reg *)data;
1145
1146 if (reg->file != UNIFORM)
1147 return;
1148
1149 hash_table_insert(dst_ht, data, key);
1150 }
1151
1152 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1153 * This brings in those uniform definitions.
1154 */
1155 void
1156 fs_visitor::import_uniforms(fs_visitor *v)
1157 {
1158 hash_table_call_foreach(v->variable_ht,
1159 import_uniforms_callback,
1160 variable_ht);
1161 this->push_constant_loc = v->push_constant_loc;
1162 this->pull_constant_loc = v->pull_constant_loc;
1163 this->uniforms = v->uniforms;
1164 this->param_size = v->param_size;
1165 }
1166
1167 /* Our support for uniforms is piggy-backed on the struct
1168 * gl_fragment_program, because that's where the values actually
1169 * get stored, rather than in some global gl_shader_program uniform
1170 * store.
1171 */
1172 void
1173 fs_visitor::setup_uniform_values(ir_variable *ir)
1174 {
1175 int namelen = strlen(ir->name);
1176
1177 /* The data for our (non-builtin) uniforms is stored in a series of
1178 * gl_uniform_driver_storage structs for each subcomponent that
1179 * glGetUniformLocation() could name. We know it's been set up in the same
1180 * order we'd walk the type, so walk the list of storage and find anything
1181 * with our name, or the prefix of a component that starts with our name.
1182 */
1183 unsigned params_before = uniforms;
1184 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1185 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1186
1187 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1188 (storage->name[namelen] != 0 &&
1189 storage->name[namelen] != '.' &&
1190 storage->name[namelen] != '[')) {
1191 continue;
1192 }
1193
1194 unsigned slots = storage->type->component_slots();
1195 if (storage->array_elements)
1196 slots *= storage->array_elements;
1197
1198 for (unsigned i = 0; i < slots; i++) {
1199 stage_prog_data->param[uniforms++] = &storage->storage[i];
1200 }
1201 }
1202
1203 /* Make sure we actually initialized the right amount of stuff here. */
1204 assert(params_before + ir->type->component_slots() == uniforms);
1205 (void)params_before;
1206 }
1207
1208
1209 /* Our support for builtin uniforms is even scarier than non-builtin.
1210 * It sits on top of the PROG_STATE_VAR parameters that are
1211 * automatically updated from GL context state.
1212 */
1213 void
1214 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1215 {
1216 const ir_state_slot *const slots = ir->get_state_slots();
1217 assert(slots != NULL);
1218
1219 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1220 /* This state reference has already been setup by ir_to_mesa, but we'll
1221 * get the same index back here.
1222 */
1223 int index = _mesa_add_state_reference(this->prog->Parameters,
1224 (gl_state_index *)slots[i].tokens);
1225
1226 /* Add each of the unique swizzles of the element as a parameter.
1227 * This'll end up matching the expected layout of the
1228 * array/matrix/structure we're trying to fill in.
1229 */
1230 int last_swiz = -1;
1231 for (unsigned int j = 0; j < 4; j++) {
1232 int swiz = GET_SWZ(slots[i].swizzle, j);
1233 if (swiz == last_swiz)
1234 break;
1235 last_swiz = swiz;
1236
1237 stage_prog_data->param[uniforms++] =
1238 &prog->Parameters->ParameterValues[index][swiz];
1239 }
1240 }
1241 }
1242
1243 fs_reg *
1244 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1245 bool origin_upper_left)
1246 {
1247 assert(stage == MESA_SHADER_FRAGMENT);
1248 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1249 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1250 fs_reg wpos = *reg;
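/* Flip gl_FragCoord.y when the shader-requested origin convention doesn't
 * match the orientation of the render target (window system vs. FBO).
 */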
1251 bool flip = !origin_upper_left ^ key->render_to_fbo;
1252
1253 /* gl_FragCoord.x */
1254 if (pixel_center_integer) {
1255 emit(MOV(wpos, this->pixel_x));
1256 } else {
1257 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1258 }
1259 wpos = offset(wpos, 1);
1260
1261 /* gl_FragCoord.y */
1262 if (!flip && pixel_center_integer) {
1263 emit(MOV(wpos, this->pixel_y));
1264 } else {
1265 fs_reg pixel_y = this->pixel_y;
1266 float offset = (pixel_center_integer ? 0.0 : 0.5);
1267
1268 if (flip) {
1269 pixel_y.negate = true;
1270 offset += key->drawable_height - 1.0;
1271 }
1272
1273 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1274 }
1275 wpos = offset(wpos, 1);
1276
1277 /* gl_FragCoord.z */
1278 if (devinfo->gen >= 6) {
1279 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1280 } else {
1281 emit(FS_OPCODE_LINTERP, wpos,
1282 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1283 interp_reg(VARYING_SLOT_POS, 2));
1284 }
1285 wpos = offset(wpos, 1);
1286
1287 /* gl_FragCoord.w: Already set up in emit_interpolation */
1288 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1289
1290 return reg;
1291 }
1292
1293 fs_inst *
1294 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1295 glsl_interp_qualifier interpolation_mode,
1296 bool is_centroid, bool is_sample)
1297 {
1298 brw_wm_barycentric_interp_mode barycoord_mode;
1299 if (devinfo->gen >= 6) {
1300 if (is_centroid) {
1301 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1302 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1303 else
1304 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1305 } else if (is_sample) {
1306 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1307 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1308 else
1309 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1310 } else {
1311 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1312 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1313 else
1314 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1315 }
1316 } else {
1317 /* On Ironlake and below, there is only one interpolation mode.
1318 * Centroid interpolation doesn't mean anything on this hardware --
1319 * there is no multisampling.
1320 */
1321 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1322 }
1323 return emit(FS_OPCODE_LINTERP, attr,
1324 this->delta_xy[barycoord_mode], interp);
1325 }
1326
1327 void
1328 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1329 const glsl_type *type,
1330 glsl_interp_qualifier interpolation_mode,
1331 int location, bool mod_centroid,
1332 bool mod_sample)
1333 {
1334 attr.type = brw_type_for_base_type(type->get_scalar_type());
1335
1336 assert(stage == MESA_SHADER_FRAGMENT);
1337 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1338 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1339
1340 unsigned int array_elements;
1341
1342 if (type->is_array()) {
1343 array_elements = type->length;
1344 if (array_elements == 0) {
1345 fail("dereferenced array '%s' has length 0\n", name);
1346 }
1347 type = type->fields.array;
1348 } else {
1349 array_elements = 1;
1350 }
1351
1352 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1353 bool is_gl_Color =
1354 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1355 if (key->flat_shade && is_gl_Color) {
1356 interpolation_mode = INTERP_QUALIFIER_FLAT;
1357 } else {
1358 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1359 }
1360 }
1361
1362 for (unsigned int i = 0; i < array_elements; i++) {
1363 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1364 if (prog_data->urb_setup[location] == -1) {
1365 /* If there's no incoming setup data for this slot, don't
1366 * emit interpolation for it.
1367 */
1368 attr = offset(attr, type->vector_elements);
1369 location++;
1370 continue;
1371 }
1372
1373 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1374 /* Constant interpolation (flat shading) case. The SF has
1375 * handed us defined values in only the constant offset
1376 * field of the setup reg.
1377 */
1378 for (unsigned int k = 0; k < type->vector_elements; k++) {
1379 struct brw_reg interp = interp_reg(location, k);
1380 interp = suboffset(interp, 3);
1381 interp.type = attr.type;
1382 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1383 attr = offset(attr, 1);
1384 }
1385 } else {
1386 /* Smooth/noperspective interpolation case. */
1387 for (unsigned int k = 0; k < type->vector_elements; k++) {
1388 struct brw_reg interp = interp_reg(location, k);
1389 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1390 /* Get the pixel/sample mask into f0 so that we know
1391 * which pixels are lit. Then, for each channel that is
1392 * unlit, replace the centroid data with non-centroid
1393 * data.
1394 */
1395 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1396
1397 fs_inst *inst;
1398 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1399 false, false);
1400 inst->predicate = BRW_PREDICATE_NORMAL;
1401 inst->predicate_inverse = true;
1402 if (devinfo->has_pln)
1403 inst->no_dd_clear = true;
1404
1405 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1406 mod_centroid && !key->persample_shading,
1407 mod_sample || key->persample_shading);
1408 inst->predicate = BRW_PREDICATE_NORMAL;
1409 inst->predicate_inverse = false;
1410 if (devinfo->has_pln)
1411 inst->no_dd_check = true;
1412
1413 } else {
1414 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1415 mod_centroid && !key->persample_shading,
1416 mod_sample || key->persample_shading);
1417 }
1418 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1419 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1420 }
1421 attr = offset(attr, 1);
1422 }
1423
1424 }
1425 location++;
1426 }
1427 }
1428 }
1429
1430 fs_reg *
1431 fs_visitor::emit_frontfacing_interpolation()
1432 {
1433 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1434
1435 if (devinfo->gen >= 6) {
1436 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1437 * a boolean result from this (~0/true or 0/false).
1438 *
1439 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1440 * this task in only one instruction:
1441 * - a negation source modifier will flip the bit; and
1442 * - a W -> D type conversion will sign extend the bit into the high
1443 * word of the destination.
1444 *
1445 * An ASR 15 fills the low word of the destination.
1446 */
1447 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1448 g0.negate = true;
1449
1450 emit(ASR(*reg, g0, fs_reg(15)));
1451 } else {
1452 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1453 * a boolean result from this (1/true or 0/false).
1454 *
1455 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1456 * the negation source modifier to flip it. Unfortunately the SHR
1457 * instruction only operates on UD (or D with an abs source modifier)
1458 * sources without negation.
1459 *
1460 * Instead, use ASR (which will give ~0/true or 0/false).
1461 */
1462 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1463 g1_6.negate = true;
1464
1465 emit(ASR(*reg, g1_6, fs_reg(31)));
1466 }
1467
1468 return reg;
1469 }
1470
1471 void
1472 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1473 {
1474 assert(stage == MESA_SHADER_FRAGMENT);
1475 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1476 assert(dst.type == BRW_REGISTER_TYPE_F);
1477
1478 if (key->compute_pos_offset) {
1479 /* Convert int_sample_pos to floating point */
1480 emit(MOV(dst, int_sample_pos));
1481 /* Scale to the range [0, 1] */
1482 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1483 }
1484 else {
1485 /* From the ARB_sample_shading specification:
1486 * "When rendering to a non-multisample buffer, or if multisample
1487 * rasterization is disabled, gl_SamplePosition will always be
1488 * (0.5, 0.5)."
1489 */
1490 emit(MOV(dst, fs_reg(0.5f)));
1491 }
1492 }
1493
1494 fs_reg *
1495 fs_visitor::emit_samplepos_setup()
1496 {
1497 assert(devinfo->gen >= 6);
1498
1499 this->current_annotation = "compute sample position";
1500 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1501 fs_reg pos = *reg;
1502 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1503 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1504
1505 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1506 * mode will be enabled.
1507 *
1508 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1509 * R31.1:0 Position Offset X/Y for Slot[3:0]
1510 * R31.3:2 Position Offset X/Y for Slot[7:4]
1511 * .....
1512 *
1513 * The X, Y sample positions come in as bytes in thread payload. So, read
1514 * the positions using vstride=16, width=8, hstride=2.
1515 */
1516 struct brw_reg sample_pos_reg =
1517 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1518 BRW_REGISTER_TYPE_B), 16, 8, 2);
1519
1520 if (dispatch_width == 8) {
1521 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1522 } else {
1523 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1524 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1525 ->force_sechalf = true;
1526 }
1527 /* Compute gl_SamplePosition.x */
1528 compute_sample_position(pos, int_sample_x);
1529 pos = offset(pos, 1);
1530 if (dispatch_width == 8) {
1531 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1532 } else {
1533 emit(MOV(half(int_sample_y, 0),
1534 fs_reg(suboffset(sample_pos_reg, 1))));
1535 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1536 ->force_sechalf = true;
1537 }
1538 /* Compute gl_SamplePosition.y */
1539 compute_sample_position(pos, int_sample_y);
1540 return reg;
1541 }
1542
1543 fs_reg *
1544 fs_visitor::emit_sampleid_setup()
1545 {
1546 assert(stage == MESA_SHADER_FRAGMENT);
1547 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1548 assert(devinfo->gen >= 6);
1549
1550 this->current_annotation = "compute sample id";
1551 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1552
1553 if (key->compute_sample_id) {
1554 fs_reg t1 = vgrf(glsl_type::int_type);
1555 fs_reg t2 = vgrf(glsl_type::int_type);
1556 t2.type = BRW_REGISTER_TYPE_UW;
1557
1558 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1559 * 8x multisampling, subspan 0 will represent sample N (where N
1560 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1561 * 7. We can find the value of N by looking at R0.0 bits 7:6
1562 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1563 * (since samples are always delivered in pairs). That is, we
1564 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1565 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1566 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1567 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1568 * populating a temporary variable with the sequence (0, 1, 2, 3),
1569 * and then reading from it using vstride=1, width=4, hstride=0.
1570 * These computations hold good for 4x multisampling as well.
1571 *
1572 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1573 * the first four slots are sample 0 of subspan 0; the next four
1574 * are sample 1 of subspan 0; the third group is sample 0 of
1575 * subspan 1, and finally sample 1 of subspan 1.
1576 */
1577 fs_inst *inst;
1578 inst = emit(BRW_OPCODE_AND, t1,
1579 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1580 fs_reg(0xc0));
1581 inst->force_writemask_all = true;
1582 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1583 inst->force_writemask_all = true;
1584 /* This works for both SIMD8 and SIMD16 */
1585 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1586 inst->force_writemask_all = true;
1587 /* This special instruction takes care of setting vstride=1,
1588 * width=4, hstride=0 of t2 during an ADD instruction.
1589 */
1590 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1591 } else {
1592 /* As per GL_ARB_sample_shading specification:
1593 * "When rendering to a non-multisample buffer, or if multisample
1594 * rasterization is disabled, gl_SampleID will always be zero."
1595 */
1596 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1597 }
1598
1599 return reg;
1600 }
1601
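/* If the source carries abs/negate modifiers, resolve them into a temporary
 * with a MOV so the caller gets a plain, unmodified register.
 */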
1602 void
1603 fs_visitor::resolve_source_modifiers(fs_reg *src)
1604 {
1605 if (!src->abs && !src->negate)
1606 return;
1607
1608 fs_reg temp = retype(vgrf(1), src->type);
1609 emit(MOV(temp, *src));
1610 *src = temp;
1611 }
1612
1613 fs_reg
1614 fs_visitor::fix_math_operand(fs_reg src)
1615 {
1616 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1617 * might be able to do better by doing execsize = 1 math and then
1618 * expanding that result out, but we would need to be careful with
1619 * masking.
1620 *
1621 * The hardware ignores source modifiers (negate and abs) on math
1622 * instructions, so we also move to a temp to set those up.
1623 */
1624 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1625 !src.abs && !src.negate)
1626 return src;
1627
1628 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1629 * operands to math
1630 */
1631 if (devinfo->gen >= 7 && src.file != IMM)
1632 return src;
1633
1634 fs_reg expanded = vgrf(glsl_type::float_type);
1635 expanded.type = src.type;
1636 emit(BRW_OPCODE_MOV, expanded, src);
1637 return expanded;
1638 }
1639
1640 fs_inst *
1641 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1642 {
1643 switch (opcode) {
1644 case SHADER_OPCODE_RCP:
1645 case SHADER_OPCODE_RSQ:
1646 case SHADER_OPCODE_SQRT:
1647 case SHADER_OPCODE_EXP2:
1648 case SHADER_OPCODE_LOG2:
1649 case SHADER_OPCODE_SIN:
1650 case SHADER_OPCODE_COS:
1651 break;
1652 default:
1653 unreachable("not reached: bad math opcode");
1654 }
1655
1656 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1657 * might be able to do better by doing execsize = 1 math and then
1658 * expanding that result out, but we would need to be careful with
1659 * masking.
1660 *
1661 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1662 * instructions, so we also move to a temp to set those up.
1663 */
1664 if (devinfo->gen == 6 || devinfo->gen == 7)
1665 src = fix_math_operand(src);
1666
1667 fs_inst *inst = emit(opcode, dst, src);
1668
1669 if (devinfo->gen < 6) {
1670 inst->base_mrf = 2;
1671 inst->mlen = dispatch_width / 8;
1672 }
1673
1674 return inst;
1675 }
1676
1677 fs_inst *
1678 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1679 {
1680 int base_mrf = 2;
1681 fs_inst *inst;
1682
1683 if (devinfo->gen >= 8) {
1684 inst = emit(opcode, dst, src0, src1);
1685 } else if (devinfo->gen >= 6) {
1686 src0 = fix_math_operand(src0);
1687 src1 = fix_math_operand(src1);
1688
1689 inst = emit(opcode, dst, src0, src1);
1690 } else {
1691 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1692 * "Message Payload":
1693 *
1694 * "Operand0[7]. For the INT DIV functions, this operand is the
1695 * denominator."
1696 * ...
1697 * "Operand1[7]. For the INT DIV functions, this operand is the
1698 * numerator."
1699 */
1700 bool is_int_div = opcode != SHADER_OPCODE_POW;
1701 fs_reg &op0 = is_int_div ? src1 : src0;
1702 fs_reg &op1 = is_int_div ? src0 : src1;
1703
1704 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1705 inst = emit(opcode, dst, op0, reg_null_f);
1706
1707 inst->base_mrf = base_mrf;
1708 inst->mlen = 2 * dispatch_width / 8;
1709 }
1710 return inst;
1711 }
1712
1713 void
1714 fs_visitor::emit_discard_jump()
1715 {
1716 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1717
1718 /* For performance, after a discard, jump to the end of the
1719 * shader if all relevant channels have been discarded.
1720 */
1721 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1722 discard_jump->flag_subreg = 1;
1723
1724 discard_jump->predicate = (dispatch_width == 8)
1725 ? BRW_PREDICATE_ALIGN1_ANY8H
1726 : BRW_PREDICATE_ALIGN1_ANY16H;
1727 discard_jump->predicate_inverse = true;
1728 }
1729
1730 void
1731 fs_visitor::assign_curb_setup()
1732 {
1733 if (dispatch_width == 8) {
1734 prog_data->dispatch_grf_start_reg = payload.num_regs;
1735 } else {
1736 if (stage == MESA_SHADER_FRAGMENT) {
1737 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1738 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1739 } else if (stage == MESA_SHADER_COMPUTE) {
1740 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1741 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1742 } else {
1743 unreachable("Unsupported shader type!");
1744 }
1745 }
1746
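/* Push constants are uploaded in whole registers of eight 32-bit values, so
 * round nr_params up to a multiple of 8 and count registers.
 */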
1747 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1748
1749 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1750 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1751 for (unsigned int i = 0; i < inst->sources; i++) {
1752 if (inst->src[i].file == UNIFORM) {
1753 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1754 int constant_nr;
1755 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1756 constant_nr = push_constant_loc[uniform_nr];
1757 } else {
1758 /* Section 5.11 of the OpenGL 4.1 spec says:
1759 * "Out-of-bounds reads return undefined values, which include
1760 * values from other variables of the active program or zero."
1761 * Just return the first push constant.
1762 */
1763 constant_nr = 0;
1764 }
1765
1766 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1767 constant_nr / 8,
1768 constant_nr % 8);
1769
1770 inst->src[i].file = HW_REG;
1771 inst->src[i].fixed_hw_reg = byte_offset(
1772 retype(brw_reg, inst->src[i].type),
1773 inst->src[i].subreg_offset);
1774 }
1775 }
1776 }
1777 }
1778
1779 void
1780 fs_visitor::calculate_urb_setup()
1781 {
1782 assert(stage == MESA_SHADER_FRAGMENT);
1783 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1784 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1785
1786 memset(prog_data->urb_setup, -1,
1787 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1788
1789 int urb_next = 0;
1790 /* Figure out where each of the incoming setup attributes lands. */
1791 if (devinfo->gen >= 6) {
1792 if (_mesa_bitcount_64(prog->InputsRead &
1793 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1794 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1795 * first 16 varying inputs, so we can put them wherever we want.
1796 * Just put them in order.
1797 *
1798 * This is useful because it means that (a) inputs not used by the
1799 * fragment shader won't take up valuable register space, and (b) we
1800 * won't have to recompile the fragment shader if it gets paired with
1801 * a different vertex (or geometry) shader.
1802 */
1803 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1804 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1805 BITFIELD64_BIT(i)) {
1806 prog_data->urb_setup[i] = urb_next++;
1807 }
1808 }
1809 } else {
1810 /* We have enough input varyings that the SF/SBE pipeline stage can't
1811 * arbitrarily rearrange them to suit our whim; we have to put them
1812 * in an order that matches the output of the previous pipeline stage
1813 * (geometry or vertex shader).
1814 */
1815 struct brw_vue_map prev_stage_vue_map;
1816 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1817 key->input_slots_valid);
1818 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1819 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1820 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1821 slot++) {
1822 int varying = prev_stage_vue_map.slot_to_varying[slot];
1823 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1824 * unused.
1825 */
1826 if (varying != BRW_VARYING_SLOT_COUNT &&
1827 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1828 BITFIELD64_BIT(varying))) {
1829 prog_data->urb_setup[varying] = slot - first_slot;
1830 }
1831 }
1832 urb_next = prev_stage_vue_map.num_slots - first_slot;
1833 }
1834 } else {
1835 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1836 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1837 /* Point size is packed into the header, not as a general attribute */
1838 if (i == VARYING_SLOT_PSIZ)
1839 continue;
1840
1841 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1842 /* The back color slot is skipped when the front color is
1843 * also written to. In addition, some slots can be
1844 * written in the vertex shader and not read in the
1845 * fragment shader. So the register number must always be
1846 * incremented, mapped or not.
1847 */
1848 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1849 prog_data->urb_setup[i] = urb_next;
1850 urb_next++;
1851 }
1852 }
1853
1854 /*
1855 * It's an FS-only attribute, and we did interpolation for this attribute
1856 * in the SF thread. So, count it here, too.
1857 *
1858 * See compile_sf_prog() for more info.
1859 */
1860 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1861 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1862 }
1863
1864 prog_data->num_varying_inputs = urb_next;
1865 }
1866
1867 void
1868 fs_visitor::assign_urb_setup()
1869 {
1870 assert(stage == MESA_SHADER_FRAGMENT);
1871 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1872
1873 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1874
1875 /* Offset all the urb_setup[] index by the actual position of the
1876 * setup regs, now that the location of the constants has been chosen.
1877 */
1878 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1879 if (inst->opcode == FS_OPCODE_LINTERP) {
1880 assert(inst->src[1].file == HW_REG);
1881 inst->src[1].fixed_hw_reg.nr += urb_start;
1882 }
1883
1884 if (inst->opcode == FS_OPCODE_CINTERP) {
1885 assert(inst->src[0].file == HW_REG);
1886 inst->src[0].fixed_hw_reg.nr += urb_start;
1887 }
1888 }
1889
1890 /* Each attribute is 4 setup channels, each of which is half a reg. */
1891 this->first_non_payload_grf =
1892 urb_start + prog_data->num_varying_inputs * 2;
1893 }
1894
1895 void
1896 fs_visitor::assign_vs_urb_setup()
1897 {
1898 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1899 int grf, count, slot, channel, attr;
1900
1901 assert(stage == MESA_SHADER_VERTEX);
1902 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1903 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1904 count++;
1905
1906 /* Each attribute is 4 regs. */
1907 this->first_non_payload_grf =
1908 payload.num_regs + prog_data->curb_read_length + count * 4;
1909
1910 unsigned vue_entries =
1911 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1912
1913 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1914 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1915
1916 assert(vs_prog_data->base.urb_read_length <= 15);
1917
1918 /* Rewrite all ATTR file references to the hw grf that they land in. */
1919 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1920 for (int i = 0; i < inst->sources; i++) {
1921 if (inst->src[i].file == ATTR) {
1922
1923 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1924 slot = count - 1;
1925 } else {
1926 /* Attributes come in as a contiguous block, ordered by their
1927 * gl_vert_attrib value. That means we can compute the slot
1928 * number for an attribute by masking out the enabled
1929 * attributes before it and counting the bits.
1930 */
1931 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1932 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1933 BITFIELD64_MASK(attr));
1934 }
1935
1936 channel = inst->src[i].reg_offset & 3;
1937
1938 grf = payload.num_regs +
1939 prog_data->curb_read_length +
1940 slot * 4 + channel;
1941
1942 inst->src[i].file = HW_REG;
1943 inst->src[i].fixed_hw_reg =
1944 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1945 }
1946 }
1947 }
1948 }
1949
1950 /**
1951 * Split large virtual GRFs into separate components if we can.
1952 *
1953 * This is mostly duplicated with what brw_fs_vector_splitting does,
1954 * but that's really conservative because it's afraid of doing
1955 * splitting that doesn't result in real progress after the rest of
1956 * the optimization phases, which would cause infinite looping in
1957 * optimization. We can do it once here, safely. This also has the
1958 * opportunity to split interpolated values, or maybe even uniforms,
1959 * which we don't have at the IR level.
1960 *
1961 * We want to split, because virtual GRFs are what we register
1962 * allocate and spill (due to contiguousness requirements for some
1963 * instructions), and they're what we naturally generate in the
1964 * codegen process, but most virtual GRFs don't actually need to be
1965 * contiguous sets of GRFs. If we split, we'll end up with reduced
1966 * live intervals and better dead code elimination and coalescing.
1967 */
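/* Purely illustrative sketch (register numbers and widths invented): a
 * two-register VGRF whose halves are only ever accessed one register at a
 * time,
 *
 *    mov(8) vgrf4+0.0:F, vgrf1:F
 *    mov(8) vgrf4+1.0:F, vgrf2:F
 *    add(8) vgrf5:F, vgrf4+0.0:F, vgrf4+1.0:F
 *
 * can be split into two independent single-register VGRFs, shortening the
 * live intervals and dropping the contiguity requirement at allocation time.
 */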
1968 void
1969 fs_visitor::split_virtual_grfs()
1970 {
1971 int num_vars = this->alloc.count;
1972
1973 /* Count the total number of registers */
1974 int reg_count = 0;
1975 int vgrf_to_reg[num_vars];
1976 for (int i = 0; i < num_vars; i++) {
1977 vgrf_to_reg[i] = reg_count;
1978 reg_count += alloc.sizes[i];
1979 }
1980
1981 /* An array of "split points". For each register slot, this indicates
1982 * if this slot can be separated from the previous slot. Every time an
1983 * instruction uses multiple elements of a register (as a source or
1984 * destination), we mark the used slots as inseparable. Then we go
1985 * through and split the registers into the smallest pieces we can.
1986 */
1987 bool split_points[reg_count];
1988 memset(split_points, 0, sizeof(split_points));
1989
1990 /* Mark all used registers as fully splittable */
1991 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1992 if (inst->dst.file == GRF) {
1993 int reg = vgrf_to_reg[inst->dst.reg];
1994 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1995 split_points[reg + j] = true;
1996 }
1997
1998 for (int i = 0; i < inst->sources; i++) {
1999 if (inst->src[i].file == GRF) {
2000 int reg = vgrf_to_reg[inst->src[i].reg];
2001 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
2002 split_points[reg + j] = true;
2003 }
2004 }
2005 }
2006
2007 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2008 if (inst->dst.file == GRF) {
2009 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2010 for (int j = 1; j < inst->regs_written; j++)
2011 split_points[reg + j] = false;
2012 }
2013 for (int i = 0; i < inst->sources; i++) {
2014 if (inst->src[i].file == GRF) {
2015 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2016 for (int j = 1; j < inst->regs_read(i); j++)
2017 split_points[reg + j] = false;
2018 }
2019 }
2020 }
2021
2022 int new_virtual_grf[reg_count];
2023 int new_reg_offset[reg_count];
2024
2025 int reg = 0;
2026 for (int i = 0; i < num_vars; i++) {
2027 /* The first one should always be 0 as a quick sanity check. */
2028 assert(split_points[reg] == false);
2029
2030 /* j = 0 case */
2031 new_reg_offset[reg] = 0;
2032 reg++;
2033 int offset = 1;
2034
2035 /* j > 0 case */
2036 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2037 /* If this is a split point, reset the offset to 0 and allocate a
2038 * new virtual GRF for the previous offset many registers
2039 */
2040 if (split_points[reg]) {
2041 assert(offset <= MAX_VGRF_SIZE);
2042 int grf = alloc.allocate(offset);
2043 for (int k = reg - offset; k < reg; k++)
2044 new_virtual_grf[k] = grf;
2045 offset = 0;
2046 }
2047 new_reg_offset[reg] = offset;
2048 offset++;
2049 reg++;
2050 }
2051
2052 /* The last one gets the original register number */
2053 assert(offset <= MAX_VGRF_SIZE);
2054 alloc.sizes[i] = offset;
2055 for (int k = reg - offset; k < reg; k++)
2056 new_virtual_grf[k] = i;
2057 }
2058 assert(reg == reg_count);
2059
2060 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2061 if (inst->dst.file == GRF) {
2062 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2063 inst->dst.reg = new_virtual_grf[reg];
2064 inst->dst.reg_offset = new_reg_offset[reg];
2065 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2066 }
2067 for (int i = 0; i < inst->sources; i++) {
2068 if (inst->src[i].file == GRF) {
2069 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2070 inst->src[i].reg = new_virtual_grf[reg];
2071 inst->src[i].reg_offset = new_reg_offset[reg];
2072 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2073 }
2074 }
2075 }
2076 invalidate_live_intervals();
2077 }
2078
2079 /**
2080 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2081 *
2082 * During code generation, we create tons of temporary variables, many of
2083 * which get immediately killed and are never used again. Yet, in later
2084 * optimization and analysis passes, such as compute_live_intervals, we need
2085 * to loop over all the virtual GRFs. Compacting them can save a lot of
2086 * overhead.
2087 */
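/* For illustration only (an invented example): if, out of six VGRFs, only
 * numbers 0, 2 and 5 are still referenced by any instruction, the pass below
 * builds
 *
 *    remap_table = { 0, -1, 1, -1, -1, 2 }
 *
 * and rewrites every GRF reference through it, so later passes iterate over
 * three registers instead of six.
 */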
2088 bool
2089 fs_visitor::compact_virtual_grfs()
2090 {
2091 bool progress = false;
2092 int remap_table[this->alloc.count];
2093 memset(remap_table, -1, sizeof(remap_table));
2094
2095 /* Mark which virtual GRFs are used. */
2096 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2097 if (inst->dst.file == GRF)
2098 remap_table[inst->dst.reg] = 0;
2099
2100 for (int i = 0; i < inst->sources; i++) {
2101 if (inst->src[i].file == GRF)
2102 remap_table[inst->src[i].reg] = 0;
2103 }
2104 }
2105
2106 /* Compact the GRF arrays. */
2107 int new_index = 0;
2108 for (unsigned i = 0; i < this->alloc.count; i++) {
2109 if (remap_table[i] == -1) {
2110 /* We just found an unused register. This means that we are
2111 * actually going to compact something.
2112 */
2113 progress = true;
2114 } else {
2115 remap_table[i] = new_index;
2116 alloc.sizes[new_index] = alloc.sizes[i];
2117 invalidate_live_intervals();
2118 ++new_index;
2119 }
2120 }
2121
2122 this->alloc.count = new_index;
2123
2124 /* Patch all the instructions to use the newly renumbered registers */
2125 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2126 if (inst->dst.file == GRF)
2127 inst->dst.reg = remap_table[inst->dst.reg];
2128
2129 for (int i = 0; i < inst->sources; i++) {
2130 if (inst->src[i].file == GRF)
2131 inst->src[i].reg = remap_table[inst->src[i].reg];
2132 }
2133 }
2134
2135 /* Patch all the references to delta_xy, since they're used in register
2136 * allocation. If they're unused, switch them to BAD_FILE so we don't
2137 * think some random VGRF is delta_xy.
2138 */
2139 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2140 if (delta_xy[i].file == GRF) {
2141 if (remap_table[delta_xy[i].reg] != -1) {
2142 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2143 } else {
2144 delta_xy[i].file = BAD_FILE;
2145 }
2146 }
2147 }
2148
2149 return progress;
2150 }
2151
2152 /*
2153 * Implements array access of uniforms by inserting a
2154 * PULL_CONSTANT_LOAD instruction.
2155 *
2156 * Unlike temporary GRF array access (which we don't support, due to
2157 * the difficulty of doing relative addressing on instruction
2158 * destinations), we could potentially do array access of uniforms
2159 * that were loaded in GRF space as push constants. In real-world
2160 * usage we've seen, though, the arrays being used are always larger
2161 * than we could load as push constants, so just always move all
2162 * uniform array access out to a pull constant buffer.
2163 */
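/* A rough sketch of the effect (uniform numbers invented): for a source like
 * u12 addressed through a reladdr register, every element of that uniform
 * array is appended to stage_prog_data->pull_param[] and its location is
 * recorded in pull_constant_loc[]; demote_pull_constants() later rewrites the
 * access itself into a pull constant load.
 */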
2164 void
2165 fs_visitor::move_uniform_array_access_to_pull_constants()
2166 {
2167 if (dispatch_width != 8)
2168 return;
2169
2170 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2171 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2172
2173 /* Walk through and find array access of uniforms. Put a copy of that
2174 * uniform in the pull constant buffer.
2175 *
2176 * Note that we don't move constant-indexed accesses to arrays. No
2177 * testing has been done of the performance impact of this choice.
2178 */
2179 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2180 for (int i = 0 ; i < inst->sources; i++) {
2181 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2182 continue;
2183
2184 int uniform = inst->src[i].reg;
2185
2186 /* If this array isn't already present in the pull constant buffer,
2187 * add it.
2188 */
2189 if (pull_constant_loc[uniform] == -1) {
2190 const gl_constant_value **values = &stage_prog_data->param[uniform];
2191
2192 assert(param_size[uniform]);
2193
2194 for (int j = 0; j < param_size[uniform]; j++) {
2195 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2196
2197 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2198 values[j];
2199 }
2200 }
2201 }
2202 }
2203 }
2204
2205 /**
2206 * Assign UNIFORM file registers to either push constants or pull constants.
2207 *
2208 * We allow a fragment shader to have more than the specified minimum
2209 * maximum number of fragment shader uniform components (64). If
2210 * there are too many of these, they'd fill up all of the register space.
2211 * So, this pass pushes some of them out to the pull constant buffer and
2212 * updates the program to load them from there.
2213 */
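/* Illustrative example (counts invented): with 160 live uniform components
 * and the 128-component push limit used below, the first 128 live uniforms
 * get push_constant_loc[] slots 0..127 and the remaining 32 are appended to
 * pull_param[], to be loaded by demote_pull_constants().
 */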
2214 void
2215 fs_visitor::assign_constant_locations()
2216 {
2217 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2218 if (dispatch_width != 8)
2219 return;
2220
2221 /* Find which UNIFORM registers are still in use. */
2222 bool is_live[uniforms];
2223 for (unsigned int i = 0; i < uniforms; i++) {
2224 is_live[i] = false;
2225 }
2226
2227 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2228 for (int i = 0; i < inst->sources; i++) {
2229 if (inst->src[i].file != UNIFORM)
2230 continue;
2231
2232 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2233 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2234 is_live[constant_nr] = true;
2235 }
2236 }
2237
2238 /* Only allow 16 registers (128 uniform components) as push constants.
2239 *
2240 * Just demote the end of the list. We could probably do better
2241 * here, demoting things that are rarely used in the program first.
2242 *
2243 * If changing this value, note the limitation about total_regs in
2244 * brw_curbe.c.
2245 */
2246 unsigned int max_push_components = 16 * 8;
2247 unsigned int num_push_constants = 0;
2248
2249 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2250
2251 for (unsigned int i = 0; i < uniforms; i++) {
2252 if (!is_live[i] || pull_constant_loc[i] != -1) {
2253 /* This UNIFORM register is either dead, or has already been demoted
2254 * to a pull const. Mark it as no longer living in the param[] array.
2255 */
2256 push_constant_loc[i] = -1;
2257 continue;
2258 }
2259
2260 if (num_push_constants < max_push_components) {
2261 /* Retain as a push constant. Record the location in the params[]
2262 * array.
2263 */
2264 push_constant_loc[i] = num_push_constants++;
2265 } else {
2266 /* Demote to a pull constant. */
2267 push_constant_loc[i] = -1;
2268
2269 int pull_index = stage_prog_data->nr_pull_params++;
2270 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2271 pull_constant_loc[i] = pull_index;
2272 }
2273 }
2274
2275 stage_prog_data->nr_params = num_push_constants;
2276
2277 /* Up until now, the param[] array has been indexed by reg + reg_offset
2278 * of UNIFORM registers. Condense it to only contain the uniforms we
2279 * chose to upload as push constants.
2280 */
2281 for (unsigned int i = 0; i < uniforms; i++) {
2282 int remapped = push_constant_loc[i];
2283
2284 if (remapped == -1)
2285 continue;
2286
2287 assert(remapped <= (int)i);
2288 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2289 }
2290 }
2291
2292 /**
2293 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2294 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2295 */
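/* Sketch of the rewrite performed below, with invented register numbers and
 * a constant (non-reladdr) offset:
 *
 *    add(8) vgrf3:F, u5:F, vgrf2:F
 *
 * becomes
 *
 *    uniform_pull_const_load(8) vgrf9:F, surf_index, offset
 *    add(8) vgrf3:F, vgrf9.<smear>:F, vgrf2:F
 *
 * where the smear picks the requested dword out of the loaded vec4.
 */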
2296 void
2297 fs_visitor::demote_pull_constants()
2298 {
2299 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2300 for (int i = 0; i < inst->sources; i++) {
2301 if (inst->src[i].file != UNIFORM)
2302 continue;
2303
2304 int pull_index;
2305 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2306 if (location >= uniforms) /* Out of bounds access */
2307 pull_index = -1;
2308 else
2309 pull_index = pull_constant_loc[location];
2310
2311 if (pull_index == -1)
2312 continue;
2313
2314 /* Set up the annotation tracking for newly generated instructions. */
2315 base_ir = inst->ir;
2316 current_annotation = inst->annotation;
2317
2318 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2319 fs_reg dst = vgrf(glsl_type::float_type);
2320
2321 /* Generate a pull load into dst. */
2322 if (inst->src[i].reladdr) {
2323 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2324 surf_index,
2325 *inst->src[i].reladdr,
2326 pull_index);
2327 inst->insert_before(block, &list);
2328 inst->src[i].reladdr = NULL;
2329 } else {
2330 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2331 fs_inst *pull =
2332 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2333 dst, surf_index, offset);
2334 inst->insert_before(block, pull);
2335 inst->src[i].set_smear(pull_index & 3);
2336 }
2337
2338 /* Rewrite the instruction to use the temporary VGRF. */
2339 inst->src[i].file = GRF;
2340 inst->src[i].reg = dst.reg;
2341 inst->src[i].reg_offset = 0;
2342 inst->src[i].width = dispatch_width;
2343 }
2344 }
2345 invalidate_live_intervals();
2346 }
2347
2348 bool
2349 fs_visitor::opt_algebraic()
2350 {
2351 bool progress = false;
2352
2353 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2354 switch (inst->opcode) {
2355 case BRW_OPCODE_MOV:
2356 if (inst->src[0].file != IMM)
2357 break;
2358
2359 if (inst->saturate) {
2360 if (inst->dst.type != inst->src[0].type)
2361 assert(!"unimplemented: saturate mixed types");
2362
2363 if (brw_saturate_immediate(inst->dst.type,
2364 &inst->src[0].fixed_hw_reg)) {
2365 inst->saturate = false;
2366 progress = true;
2367 }
2368 }
2369 break;
2370
2371 case BRW_OPCODE_MUL:
2372 if (inst->src[1].file != IMM)
2373 continue;
2374
2375 /* a * 1.0 = a */
2376 if (inst->src[1].is_one()) {
2377 inst->opcode = BRW_OPCODE_MOV;
2378 inst->src[1] = reg_undef;
2379 progress = true;
2380 break;
2381 }
2382
2383 /* a * -1.0 = -a */
2384 if (inst->src[1].is_negative_one()) {
2385 inst->opcode = BRW_OPCODE_MOV;
2386 inst->src[0].negate = !inst->src[0].negate;
2387 inst->src[1] = reg_undef;
2388 progress = true;
2389 break;
2390 }
2391
2392 /* a * 0.0 = 0.0 */
2393 if (inst->src[1].is_zero()) {
2394 inst->opcode = BRW_OPCODE_MOV;
2395 inst->src[0] = inst->src[1];
2396 inst->src[1] = reg_undef;
2397 progress = true;
2398 break;
2399 }
2400
2401 if (inst->src[0].file == IMM) {
2402 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2403 inst->opcode = BRW_OPCODE_MOV;
2404 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2405 inst->src[1] = reg_undef;
2406 progress = true;
2407 break;
2408 }
2409 break;
2410 case BRW_OPCODE_ADD:
2411 if (inst->src[1].file != IMM)
2412 continue;
2413
2414 /* a + 0.0 = a */
2415 if (inst->src[1].is_zero()) {
2416 inst->opcode = BRW_OPCODE_MOV;
2417 inst->src[1] = reg_undef;
2418 progress = true;
2419 break;
2420 }
2421
2422 if (inst->src[0].file == IMM) {
2423 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2424 inst->opcode = BRW_OPCODE_MOV;
2425 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2426 inst->src[1] = reg_undef;
2427 progress = true;
2428 break;
2429 }
2430 break;
2431 case BRW_OPCODE_OR:
2432 if (inst->src[0].equals(inst->src[1])) {
2433 inst->opcode = BRW_OPCODE_MOV;
2434 inst->src[1] = reg_undef;
2435 progress = true;
2436 break;
2437 }
2438 break;
2439 case BRW_OPCODE_LRP:
2440 if (inst->src[1].equals(inst->src[2])) {
2441 inst->opcode = BRW_OPCODE_MOV;
2442 inst->src[0] = inst->src[1];
2443 inst->src[1] = reg_undef;
2444 inst->src[2] = reg_undef;
2445 progress = true;
2446 break;
2447 }
2448 break;
2449 case BRW_OPCODE_CMP:
2450 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2451 inst->src[0].abs &&
2452 inst->src[0].negate &&
2453 inst->src[1].is_zero()) {
2454 inst->src[0].abs = false;
2455 inst->src[0].negate = false;
2456 inst->conditional_mod = BRW_CONDITIONAL_Z;
2457 progress = true;
2458 break;
2459 }
2460 break;
2461 case BRW_OPCODE_SEL:
2462 if (inst->src[0].equals(inst->src[1])) {
2463 inst->opcode = BRW_OPCODE_MOV;
2464 inst->src[1] = reg_undef;
2465 inst->predicate = BRW_PREDICATE_NONE;
2466 inst->predicate_inverse = false;
2467 progress = true;
2468 } else if (inst->saturate && inst->src[1].file == IMM) {
2469 switch (inst->conditional_mod) {
2470 case BRW_CONDITIONAL_LE:
2471 case BRW_CONDITIONAL_L:
2472 switch (inst->src[1].type) {
2473 case BRW_REGISTER_TYPE_F:
2474 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2475 inst->opcode = BRW_OPCODE_MOV;
2476 inst->src[1] = reg_undef;
2477 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2478 progress = true;
2479 }
2480 break;
2481 default:
2482 break;
2483 }
2484 break;
2485 case BRW_CONDITIONAL_GE:
2486 case BRW_CONDITIONAL_G:
2487 switch (inst->src[1].type) {
2488 case BRW_REGISTER_TYPE_F:
2489 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2490 inst->opcode = BRW_OPCODE_MOV;
2491 inst->src[1] = reg_undef;
2492 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2493 progress = true;
2494 }
2495 break;
2496 default:
2497 break;
2498 }
2499 default:
2500 break;
2501 }
2502 }
2503 break;
2504 case BRW_OPCODE_MAD:
2505 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2506 inst->opcode = BRW_OPCODE_MOV;
2507 inst->src[1] = reg_undef;
2508 inst->src[2] = reg_undef;
2509 progress = true;
2510 } else if (inst->src[0].is_zero()) {
2511 inst->opcode = BRW_OPCODE_MUL;
2512 inst->src[0] = inst->src[2];
2513 inst->src[2] = reg_undef;
2514 progress = true;
2515 } else if (inst->src[1].is_one()) {
2516 inst->opcode = BRW_OPCODE_ADD;
2517 inst->src[1] = inst->src[2];
2518 inst->src[2] = reg_undef;
2519 progress = true;
2520 } else if (inst->src[2].is_one()) {
2521 inst->opcode = BRW_OPCODE_ADD;
2522 inst->src[2] = reg_undef;
2523 progress = true;
2524 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2525 inst->opcode = BRW_OPCODE_ADD;
2526 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2527 inst->src[2] = reg_undef;
2528 progress = true;
2529 }
2530 break;
2531 case SHADER_OPCODE_RCP: {
2532 fs_inst *prev = (fs_inst *)inst->prev;
2533 if (!prev->is_head_sentinel() && prev->opcode == SHADER_OPCODE_SQRT) {
2534 if (inst->src[0].equals(prev->dst)) {
2535 inst->opcode = SHADER_OPCODE_RSQ;
2536 inst->src[0] = prev->src[0];
2537 progress = true;
2538 }
2539 }
2540 break;
2541 }
2542 case SHADER_OPCODE_BROADCAST:
2543 if (is_uniform(inst->src[0])) {
2544 inst->opcode = BRW_OPCODE_MOV;
2545 inst->sources = 1;
2546 inst->force_writemask_all = true;
2547 progress = true;
2548 } else if (inst->src[1].file == IMM) {
2549 inst->opcode = BRW_OPCODE_MOV;
2550 inst->src[0] = component(inst->src[0],
2551 inst->src[1].fixed_hw_reg.dw1.ud);
2552 inst->sources = 1;
2553 inst->force_writemask_all = true;
2554 progress = true;
2555 }
2556 break;
2557
2558 default:
2559 break;
2560 }
2561
2562 /* Swap if src[0] is immediate. */
2563 if (progress && inst->is_commutative()) {
2564 if (inst->src[0].file == IMM) {
2565 fs_reg tmp = inst->src[1];
2566 inst->src[1] = inst->src[0];
2567 inst->src[0] = tmp;
2568 }
2569 }
2570 }
2571 return progress;
2572 }
2573
2574 /**
2575 * Optimize sample messages that have constant zero values for the trailing
2576 * texture coordinates. We can just reduce the message length for these
2577 * instructions instead of reserving a register for it. Trailing parameters
2578 * that aren't sent default to zero anyway. This will cause the dead code
2579 * eliminator to remove the MOV instruction that would otherwise be emitted to
2580 * set up the zero value.
2581 */
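/* Hypothetical example: in a SIMD8 sample_l message whose trailing LOD
 * parameter was built from an explicit 0.0 MOV, the loop below shrinks mlen
 * by one GRF per trailing zero parameter; the hardware supplies zero for
 * parameters that are not sent, and dead code elimination then removes the
 * MOV that set up the zero.
 */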
2582 bool
2583 fs_visitor::opt_zero_samples()
2584 {
2585 /* Gen4 infers the texturing opcode based on the message length so we can't
2586 * change it.
2587 */
2588 if (devinfo->gen < 5)
2589 return false;
2590
2591 bool progress = false;
2592
2593 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2594 if (!inst->is_tex())
2595 continue;
2596
2597 fs_inst *load_payload = (fs_inst *) inst->prev;
2598
2599 if (load_payload->is_head_sentinel() ||
2600 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2601 continue;
2602
2603 /* We don't want to remove the message header or the first parameter.
2604 * Removing the first parameter is not allowed, see the Haswell PRM
2605 * volume 7, page 149:
2606 *
2607 * "Parameter 0 is required except for the sampleinfo message, which
2608 * has no parameter 0"
2609 */
2610 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2611 load_payload->src[(inst->mlen - inst->header_size) /
2612 (dispatch_width / 8) +
2613 inst->header_size - 1].is_zero()) {
2614 inst->mlen -= dispatch_width / 8;
2615 progress = true;
2616 }
2617 }
2618
2619 if (progress)
2620 invalidate_live_intervals();
2621
2622 return progress;
2623 }
2624
2625 /**
2626 * Optimize sample messages which are followed by the final RT write.
2627 *
2628 * CHV, and GEN9+ can mark a texturing SEND instruction with EOT to have its
2629 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2630 * final texturing results copied to the framebuffer write payload and modify
2631 * them to write to the framebuffer directly.
2632 */
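/* Roughly (instruction shapes simplified for illustration), the pass below
 * turns a final
 *
 *    tex(16) vgrf8, ...
 *    fb_write(16) ..., vgrf8 (EOT)
 *
 * into a single texturing send marked EOT whose results go straight to the
 * render target, removing the FB_WRITE and, if needed, rebuilding the
 * LOAD_PAYLOAD with room for a message header.
 */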
2633 bool
2634 fs_visitor::opt_sampler_eot()
2635 {
2636 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2637
2638 if (stage != MESA_SHADER_FRAGMENT)
2639 return false;
2640
2641 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2642 return false;
2643
2644 /* FINISHME: It should be possible to implement this optimization when there
2645 * are multiple drawbuffers.
2646 */
2647 if (key->nr_color_regions != 1)
2648 return false;
2649
2650 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2651 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2652 assert(fb_write->eot);
2653 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2654
2655 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2656
2657 /* There wasn't one; nothing to do. */
2658 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2659 return false;
2660
2661 /* This optimization doesn't seem to work for textureGather for some
2662 * reason. I can't find any documentation or known workarounds to indicate
2663 * that this is expected, but considering that it is probably pretty
2664 * unlikely that a shader would directly write out the results from
2665 * textureGather we might as well just disable it.
2666 */
2667 if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2668 tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2669 return false;
2670
2671 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2672 * It's very likely to be the previous instruction.
2673 */
2674 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2675 if (load_payload->is_head_sentinel() ||
2676 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2677 return false;
2678
2679 assert(!tex_inst->eot); /* We can't get here twice */
2680 assert((tex_inst->offset & (0xff << 24)) == 0);
2681
2682 tex_inst->offset |= fb_write->target << 24;
2683 tex_inst->eot = true;
2684 tex_inst->dst = reg_null_ud;
2685 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2686
2687 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2688 * to create a new LOAD_PAYLOAD command with the same sources and a space
2689 * saved for the header. Using a new destination register not only makes sure
2690 * we have enough space, but it will make sure the dead code eliminator kills
2691 * the instruction that this will replace.
2692 */
2693 if (tex_inst->header_size != 0)
2694 return true;
2695
2696 fs_reg send_header = vgrf(load_payload->sources + 1);
2697 fs_reg *new_sources =
2698 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2699
2700 new_sources[0] = fs_reg();
2701 for (int i = 0; i < load_payload->sources; i++)
2702 new_sources[i+1] = load_payload->src[i];
2703
2704 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2705 * requires a lot of information about the sources in order to figure out
2706 * how many registers need to be used. At this stage of optimization (after
2707 * copy propagation), we may no longer have the GRF arrangement that
2708 * LOAD_PAYLOAD requires, so we need to emit the instruction
2709 * manually.
2710 */
2711 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2712 load_payload->exec_size,
2713 send_header,
2714 new_sources,
2715 load_payload->sources + 1);
2716
2717 new_load_payload->regs_written = load_payload->regs_written + 1;
2718 new_load_payload->header_size = 1;
2719 tex_inst->mlen++;
2720 tex_inst->header_size = 1;
2721 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2722 tex_inst->src[0] = send_header;
2723
2724 return true;
2725 }
2726
2727 bool
2728 fs_visitor::opt_register_renaming()
2729 {
2730 bool progress = false;
2731 int depth = 0;
2732
2733 int remap[alloc.count];
2734 memset(remap, -1, sizeof(int) * alloc.count);
2735
2736 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2737 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2738 depth++;
2739 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2740 inst->opcode == BRW_OPCODE_WHILE) {
2741 depth--;
2742 }
2743
2744 /* Rewrite instruction sources. */
2745 for (int i = 0; i < inst->sources; i++) {
2746 if (inst->src[i].file == GRF &&
2747 remap[inst->src[i].reg] != -1 &&
2748 remap[inst->src[i].reg] != inst->src[i].reg) {
2749 inst->src[i].reg = remap[inst->src[i].reg];
2750 progress = true;
2751 }
2752 }
2753
2754 const int dst = inst->dst.reg;
2755
2756 if (depth == 0 &&
2757 inst->dst.file == GRF &&
2758 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2759 !inst->is_partial_write()) {
2760 if (remap[dst] == -1) {
2761 remap[dst] = dst;
2762 } else {
2763 remap[dst] = alloc.allocate(inst->dst.width / 8);
2764 inst->dst.reg = remap[dst];
2765 progress = true;
2766 }
2767 } else if (inst->dst.file == GRF &&
2768 remap[dst] != -1 &&
2769 remap[dst] != dst) {
2770 inst->dst.reg = remap[dst];
2771 progress = true;
2772 }
2773 }
2774
2775 if (progress) {
2776 invalidate_live_intervals();
2777
2778 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2779 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2780 delta_xy[i].reg = remap[delta_xy[i].reg];
2781 }
2782 }
2783 }
2784
2785 return progress;
2786 }
2787
2788 /**
2789 * Remove redundant or useless discard jumps.
2790 *
2791 * For example, we can eliminate jumps in the following sequence:
2792 *
2793 * discard-jump (redundant with the next jump)
2794 * discard-jump (useless; jumps to the next instruction)
2795 * placeholder-halt
2796 */
2797 bool
2798 fs_visitor::opt_redundant_discard_jumps()
2799 {
2800 bool progress = false;
2801
2802 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2803
2804 fs_inst *placeholder_halt = NULL;
2805 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2806 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2807 placeholder_halt = inst;
2808 break;
2809 }
2810 }
2811
2812 if (!placeholder_halt)
2813 return false;
2814
2815 /* Delete any HALTs immediately before the placeholder halt. */
2816 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2817 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2818 prev = (fs_inst *) placeholder_halt->prev) {
2819 prev->remove(last_bblock);
2820 progress = true;
2821 }
2822
2823 if (progress)
2824 invalidate_live_intervals();
2825
2826 return progress;
2827 }
2828
2829 bool
2830 fs_visitor::compute_to_mrf()
2831 {
2832 bool progress = false;
2833 int next_ip = 0;
2834
2835 /* No MRFs on Gen >= 7. */
2836 if (devinfo->gen >= 7)
2837 return false;
2838
2839 calculate_live_intervals();
2840
2841 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2842 int ip = next_ip;
2843 next_ip++;
2844
2845 if (inst->opcode != BRW_OPCODE_MOV ||
2846 inst->is_partial_write() ||
2847 inst->dst.file != MRF || inst->src[0].file != GRF ||
2848 inst->dst.type != inst->src[0].type ||
2849 inst->src[0].abs || inst->src[0].negate ||
2850 !inst->src[0].is_contiguous() ||
2851 inst->src[0].subreg_offset)
2852 continue;
2853
2854 /* Work out which hardware MRF registers are written by this
2855 * instruction.
2856 */
2857 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2858 int mrf_high;
2859 if (inst->dst.reg & BRW_MRF_COMPR4) {
2860 mrf_high = mrf_low + 4;
2861 } else if (inst->exec_size == 16) {
2862 mrf_high = mrf_low + 1;
2863 } else {
2864 mrf_high = mrf_low;
2865 }
2866
2867 /* Can't compute-to-MRF this GRF if someone else was going to
2868 * read it later.
2869 */
2870 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2871 continue;
2872
2873 /* Found a move of a GRF to a MRF. Let's see if we can go
2874 * rewrite the thing that made this GRF to write into the MRF.
2875 */
2876 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2877 if (scan_inst->dst.file == GRF &&
2878 scan_inst->dst.reg == inst->src[0].reg) {
2879 /* Found the last thing to write our reg we want to turn
2880 * into a compute-to-MRF.
2881 */
2882
2883 /* If this one instruction didn't populate all the
2884 * channels, bail. We might be able to rewrite everything
2885 * that writes that reg, but it would require smarter
2886 * tracking to delay the rewriting until complete success.
2887 */
2888 if (scan_inst->is_partial_write())
2889 break;
2890
2891 /* Things returning more than one register would need us to
2892 * understand coalescing out more than one MOV at a time.
2893 */
2894 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2895 break;
2896
2897 /* SEND instructions can't have MRF as a destination. */
2898 if (scan_inst->mlen)
2899 break;
2900
2901 if (devinfo->gen == 6) {
2902 /* gen6 math instructions must have the destination be
2903 * GRF, so no compute-to-MRF for them.
2904 */
2905 if (scan_inst->is_math()) {
2906 break;
2907 }
2908 }
2909
2910 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2911 /* Found the creator of our MRF's source value. */
2912 scan_inst->dst.file = MRF;
2913 scan_inst->dst.reg = inst->dst.reg;
2914 scan_inst->saturate |= inst->saturate;
2915 inst->remove(block);
2916 progress = true;
2917 }
2918 break;
2919 }
2920
2921 /* We don't handle control flow here. Most computation of
2922 * values that end up in MRFs are shortly before the MRF
2923 * write anyway.
2924 */
2925 if (block->start() == scan_inst)
2926 break;
2927
2928 /* You can't read from an MRF, so if someone else reads our
2929 * MRF's source GRF that we wanted to rewrite, that stops us.
2930 */
2931 bool interfered = false;
2932 for (int i = 0; i < scan_inst->sources; i++) {
2933 if (scan_inst->src[i].file == GRF &&
2934 scan_inst->src[i].reg == inst->src[0].reg &&
2935 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2936 interfered = true;
2937 }
2938 }
2939 if (interfered)
2940 break;
2941
2942 if (scan_inst->dst.file == MRF) {
2943 /* If somebody else writes our MRF here, we can't
2944 * compute-to-MRF before that.
2945 */
2946 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2947 int scan_mrf_high;
2948
2949 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2950 scan_mrf_high = scan_mrf_low + 4;
2951 } else if (scan_inst->exec_size == 16) {
2952 scan_mrf_high = scan_mrf_low + 1;
2953 } else {
2954 scan_mrf_high = scan_mrf_low;
2955 }
2956
2957 if (mrf_low == scan_mrf_low ||
2958 mrf_low == scan_mrf_high ||
2959 mrf_high == scan_mrf_low ||
2960 mrf_high == scan_mrf_high) {
2961 break;
2962 }
2963 }
2964
2965 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2966 /* Found a SEND instruction, which means that there are
2967 * live values in MRFs from base_mrf to base_mrf +
2968 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2969 * above it.
2970 */
2971 if (mrf_low >= scan_inst->base_mrf &&
2972 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2973 break;
2974 }
2975 if (mrf_high >= scan_inst->base_mrf &&
2976 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2977 break;
2978 }
2979 }
2980 }
2981 }
2982
2983 if (progress)
2984 invalidate_live_intervals();
2985
2986 return progress;
2987 }
2988
2989 /**
2990 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2991 * flow. We could probably do better here with some form of divergence
2992 * analysis.
2993 */
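/* Outside of control flow every channel is live, so channel 0 is always a
 * correct answer; the pass below simply rewrites
 *
 *    find_live_channel(8) vgrf3:UD
 *
 * into a writemask-all MOV of the immediate 0 (register number invented for
 * illustration).
 */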
2994 bool
2995 fs_visitor::eliminate_find_live_channel()
2996 {
2997 bool progress = false;
2998 unsigned depth = 0;
2999
3000 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3001 switch (inst->opcode) {
3002 case BRW_OPCODE_IF:
3003 case BRW_OPCODE_DO:
3004 depth++;
3005 break;
3006
3007 case BRW_OPCODE_ENDIF:
3008 case BRW_OPCODE_WHILE:
3009 depth--;
3010 break;
3011
3012 case FS_OPCODE_DISCARD_JUMP:
3013 /* This can potentially make control flow non-uniform until the end
3014 * of the program.
3015 */
3016 return progress;
3017
3018 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
3019 if (depth == 0) {
3020 inst->opcode = BRW_OPCODE_MOV;
3021 inst->src[0] = fs_reg(0);
3022 inst->sources = 1;
3023 inst->force_writemask_all = true;
3024 progress = true;
3025 }
3026 break;
3027
3028 default:
3029 break;
3030 }
3031 }
3032
3033 return progress;
3034 }
3035
3036 /**
3037 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
3038 * instructions to FS_OPCODE_REP_FB_WRITE.
3039 */
3040 void
3041 fs_visitor::emit_repclear_shader()
3042 {
3043 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3044 int base_mrf = 1;
3045 int color_mrf = base_mrf + 2;
3046
3047 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
3048 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
3049 mov->force_writemask_all = true;
3050
3051 fs_inst *write;
3052 if (key->nr_color_regions == 1) {
3053 write = emit(FS_OPCODE_REP_FB_WRITE);
3054 write->saturate = key->clamp_fragment_color;
3055 write->base_mrf = color_mrf;
3056 write->target = 0;
3057 write->header_size = 0;
3058 write->mlen = 1;
3059 } else {
3060 assume(key->nr_color_regions > 0);
3061 for (int i = 0; i < key->nr_color_regions; ++i) {
3062 write = emit(FS_OPCODE_REP_FB_WRITE);
3063 write->saturate = key->clamp_fragment_color;
3064 write->base_mrf = base_mrf;
3065 write->target = i;
3066 write->header_size = 2;
3067 write->mlen = 3;
3068 }
3069 }
3070 write->eot = true;
3071
3072 calculate_cfg();
3073
3074 assign_constant_locations();
3075 assign_curb_setup();
3076
3077 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
3078 assert(mov->src[0].file == HW_REG);
3079 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
3080 }
3081
3082 /**
3083 * Walks through basic blocks, looking for repeated MRF writes and
3084 * removing the later ones.
3085 */
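/* Illustrative case (register numbers invented): two identical
 * "mov(8) m4, vgrf7" instructions in the same basic block with no
 * intervening write to m4 or vgrf7 make the second MOV redundant, and the
 * loop below removes it.
 */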
3086 bool
3087 fs_visitor::remove_duplicate_mrf_writes()
3088 {
3089 fs_inst *last_mrf_move[16];
3090 bool progress = false;
3091
3092 /* Need to update the MRF tracking for compressed instructions. */
3093 if (dispatch_width == 16)
3094 return false;
3095
3096 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3097
3098 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3099 if (inst->is_control_flow()) {
3100 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3101 }
3102
3103 if (inst->opcode == BRW_OPCODE_MOV &&
3104 inst->dst.file == MRF) {
3105 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3106 if (prev_inst && inst->equals(prev_inst)) {
3107 inst->remove(block);
3108 progress = true;
3109 continue;
3110 }
3111 }
3112
3113 /* Clear out the last-write records for MRFs that were overwritten. */
3114 if (inst->dst.file == MRF) {
3115 last_mrf_move[inst->dst.reg] = NULL;
3116 }
3117
3118 if (inst->mlen > 0 && inst->base_mrf != -1) {
3119 /* Found a SEND instruction, which will include two or fewer
3120 * implied MRF writes. We could do better here.
3121 */
3122 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3123 last_mrf_move[inst->base_mrf + i] = NULL;
3124 }
3125 }
3126
3127 /* Clear out any MRF move records whose sources got overwritten. */
3128 if (inst->dst.file == GRF) {
3129 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3130 if (last_mrf_move[i] &&
3131 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3132 last_mrf_move[i] = NULL;
3133 }
3134 }
3135 }
3136
3137 if (inst->opcode == BRW_OPCODE_MOV &&
3138 inst->dst.file == MRF &&
3139 inst->src[0].file == GRF &&
3140 !inst->is_partial_write()) {
3141 last_mrf_move[inst->dst.reg] = inst;
3142 }
3143 }
3144
3145 if (progress)
3146 invalidate_live_intervals();
3147
3148 return progress;
3149 }
3150
3151 static void
3152 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3153 {
3154 /* Clear the flag for registers that actually got read (as expected). */
3155 for (int i = 0; i < inst->sources; i++) {
3156 int grf;
3157 if (inst->src[i].file == GRF) {
3158 grf = inst->src[i].reg;
3159 } else if (inst->src[i].file == HW_REG &&
3160 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3161 grf = inst->src[i].fixed_hw_reg.nr;
3162 } else {
3163 continue;
3164 }
3165
3166 if (grf >= first_grf &&
3167 grf < first_grf + grf_len) {
3168 deps[grf - first_grf] = false;
3169 if (inst->exec_size == 16)
3170 deps[grf - first_grf + 1] = false;
3171 }
3172 }
3173 }
3174
3175 /**
3176 * Implements this workaround for the original 965:
3177 *
3178 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3179 * check for post destination dependencies on this instruction, software
3180 * must ensure that there is no destination hazard for the case of ‘write
3181 * followed by a posted write’ shown in the following example.
3182 *
3183 * 1. mov r3 0
3184 * 2. send r3.xy <rest of send instruction>
3185 * 3. mov r2 r3
3186 *
3187 * Due to no post-destination dependency check on the ‘send’, the above
3188 * code sequence could have two instructions (1 and 2) in flight at the
3189 * same time that both consider ‘r3’ as the target of their final writes.
3190 */
3191 void
3192 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3193 fs_inst *inst)
3194 {
3195 int write_len = inst->regs_written;
3196 int first_write_grf = inst->dst.reg;
3197 bool needs_dep[BRW_MAX_MRF];
3198 assert(write_len < (int)sizeof(needs_dep) - 1);
3199
3200 memset(needs_dep, false, sizeof(needs_dep));
3201 memset(needs_dep, true, write_len);
3202
3203 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3204
3205 /* Walk backwards looking for writes to registers we're writing which
3206 * aren't read since being written. If we hit the start of the program,
3207 * we assume that there are no outstanding dependencies on entry to the
3208 * program.
3209 */
3210 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3211 /* If we hit control flow, assume that there *are* outstanding
3212 * dependencies, and force their cleanup before our instruction.
3213 */
3214 if (block->start() == scan_inst) {
3215 for (int i = 0; i < write_len; i++) {
3216 if (needs_dep[i]) {
3217 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3218 }
3219 }
3220 return;
3221 }
3222
3223 /* We insert our reads as late as possible on the assumption that any
3224 * instruction but a MOV that might have left us an outstanding
3225 * dependency has more latency than a MOV.
3226 */
3227 if (scan_inst->dst.file == GRF) {
3228 for (int i = 0; i < scan_inst->regs_written; i++) {
3229 int reg = scan_inst->dst.reg + i;
3230
3231 if (reg >= first_write_grf &&
3232 reg < first_write_grf + write_len &&
3233 needs_dep[reg - first_write_grf]) {
3234 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3235 needs_dep[reg - first_write_grf] = false;
3236 if (scan_inst->exec_size == 16)
3237 needs_dep[reg - first_write_grf + 1] = false;
3238 }
3239 }
3240 }
3241
3242 /* Clear the flag for registers that actually got read (as expected). */
3243 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3244
3245 /* Continue the loop only if we haven't resolved all the dependencies */
3246 int i;
3247 for (i = 0; i < write_len; i++) {
3248 if (needs_dep[i])
3249 break;
3250 }
3251 if (i == write_len)
3252 return;
3253 }
3254 }
3255
3256 /**
3257 * Implements this workaround for the original 965:
3258 *
3259 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3260 * used as a destination register until after it has been sourced by an
3261 * instruction with a different destination register.
3262 */
3263 void
3264 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3265 {
3266 int write_len = inst->regs_written;
3267 int first_write_grf = inst->dst.reg;
3268 bool needs_dep[BRW_MAX_MRF];
3269 assert(write_len < (int)sizeof(needs_dep) - 1);
3270
3271 memset(needs_dep, false, sizeof(needs_dep));
3272 memset(needs_dep, true, write_len);
3273 /* Walk forwards looking for writes to registers we're writing which aren't
3274 * read before being written.
3275 */
3276 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3277 /* If we hit control flow, force resolve all remaining dependencies. */
3278 if (block->end() == scan_inst) {
3279 for (int i = 0; i < write_len; i++) {
3280 if (needs_dep[i])
3281 scan_inst->insert_before(block,
3282 DEP_RESOLVE_MOV(first_write_grf + i));
3283 }
3284 return;
3285 }
3286
3287 /* Clear the flag for registers that actually got read (as expected). */
3288 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3289
3290 /* We insert our reads as late as possible since they're reading the
3291 * result of a SEND, which has massive latency.
3292 */
3293 if (scan_inst->dst.file == GRF &&
3294 scan_inst->dst.reg >= first_write_grf &&
3295 scan_inst->dst.reg < first_write_grf + write_len &&
3296 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3297 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3298 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3299 }
3300
3301 /* Continue the loop only if we haven't resolved all the dependencies */
3302 int i;
3303 for (i = 0; i < write_len; i++) {
3304 if (needs_dep[i])
3305 break;
3306 }
3307 if (i == write_len)
3308 return;
3309 }
3310 }
3311
3312 void
3313 fs_visitor::insert_gen4_send_dependency_workarounds()
3314 {
3315 if (devinfo->gen != 4 || devinfo->is_g4x)
3316 return;
3317
3318 bool progress = false;
3319
3320 /* Note that we're done with register allocation, so GRF fs_regs always
3321 * have a .reg_offset of 0.
3322 */
3323
3324 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3325 if (inst->mlen != 0 && inst->dst.file == GRF) {
3326 insert_gen4_pre_send_dependency_workarounds(block, inst);
3327 insert_gen4_post_send_dependency_workarounds(block, inst);
3328 progress = true;
3329 }
3330 }
3331
3332 if (progress)
3333 invalidate_live_intervals();
3334 }
3335
3336 /**
3337 * Turns the generic expression-style uniform pull constant load instruction
3338 * into a hardware-specific series of instructions for loading a pull
3339 * constant.
3340 *
3341 * The expression style allows the CSE pass before this to optimize out
3342 * repeated loads from the same offset, and gives the pre-register-allocation
3343 * scheduling full flexibility, while the conversion to native instructions
3344 * allows the post-register-allocation scheduler the best information
3345 * possible.
3346 *
3347 * Note that execution masking for setting up pull constant loads is special:
3348 * the channels that need to be written are unrelated to the current execution
3349 * mask, since a later instruction will use one of the result channels as a
3350 * source operand for all 8 or 16 of its channels.
3351 */
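/* Sketch of the Gen7+ lowering below (register numbers invented): the
 * generic
 *
 *    uniform_pull_const_load(8) vgrf5:F, surf_index, byte_offset
 *
 * becomes
 *
 *    set_simd4x2_offset(8) vgrf9:UD, dword_offset
 *    uniform_pull_const_load_gen7(8) vgrf5:F, surf_index, vgrf9:UD
 *
 * while on pre-Gen7 hardware we just pick an MRF for the message payload and
 * set mlen instead.
 */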
3352 void
3353 fs_visitor::lower_uniform_pull_constant_loads()
3354 {
3355 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3356 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3357 continue;
3358
3359 if (devinfo->gen >= 7) {
3360 /* The offset arg before was a vec4-aligned byte offset. We need to
3361 * turn it into a dword offset.
3362 */
3363 fs_reg const_offset_reg = inst->src[1];
3364 assert(const_offset_reg.file == IMM &&
3365 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3366 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3367 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3368
3369 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3370 * Reserve space for the register.
3371 */
3372 if (devinfo->gen >= 9) {
3373 payload.reg_offset++;
3374 alloc.sizes[payload.reg] = 2;
3375 }
3376
3377 /* This is actually going to be a MOV, but since only the first dword
3378 * is accessed, we have a special opcode to do just that one. Note
3379 * that this needs to be an operation that will be considered a def
3380 * by live variable analysis, or register allocation will explode.
3381 */
3382 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3383 8, payload, const_offset_reg);
3384 setup->force_writemask_all = true;
3385
3386 setup->ir = inst->ir;
3387 setup->annotation = inst->annotation;
3388 inst->insert_before(block, setup);
3389
3390 /* Similarly, this will only populate the first 4 channels of the
3391 * result register (since we only use smear values from 0-3), but we
3392 * don't tell the optimizer.
3393 */
3394 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3395 inst->src[1] = payload;
3396
3397 invalidate_live_intervals();
3398 } else {
3399 /* Before register allocation, we didn't tell the scheduler about the
3400 * MRF we use. We know it's safe to use this MRF because nothing
3401 * else does except for register spill/unspill, which generates and
3402 * uses its MRF within a single IR instruction.
3403 */
3404 inst->base_mrf = 14;
3405 inst->mlen = 1;
3406 }
3407 }
3408 }
3409
3410 bool
3411 fs_visitor::lower_load_payload()
3412 {
3413 bool progress = false;
3414
3415 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3416 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3417 continue;
3418
3419 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3420 assert(inst->saturate == false);
3421
3422 fs_reg dst = inst->dst;
3423
3424 /* Get rid of COMPR4. We'll add it back in if we need it */
3425 if (dst.file == MRF)
3426 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
3427
3428 dst.width = 8;
3429 for (uint8_t i = 0; i < inst->header_size; i++) {
3430 if (inst->src[i].file != BAD_FILE) {
3431 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3432 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3433 mov_src.width = 8;
3434 fs_inst *mov = MOV(mov_dst, mov_src);
3435 mov->force_writemask_all = true;
3436 inst->insert_before(block, mov);
3437 }
3438 dst = offset(dst, 1);
3439 }
3440
3441 dst.width = inst->exec_size;
3442 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
3443 inst->exec_size > 8) {
3444 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3445 * a straightforward copy. Instead, the result of the
3446 * LOAD_PAYLOAD is treated as interleaved and the first four
3447 * non-header sources are unpacked as:
3448 *
3449 * m + 0: r0
3450 * m + 1: g0
3451 * m + 2: b0
3452 * m + 3: a0
3453 * m + 4: r1
3454 * m + 5: g1
3455 * m + 6: b1
3456 * m + 7: a1
3457 *
3458 * This is used for gen <= 5 fb writes.
3459 */
3460 assert(inst->exec_size == 16);
3461 assert(inst->header_size + 4 <= inst->sources);
3462 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3463 if (inst->src[i].file != BAD_FILE) {
3464 if (devinfo->has_compr4) {
3465 fs_reg compr4_dst = retype(dst, inst->src[i].type);
3466 compr4_dst.reg |= BRW_MRF_COMPR4;
3467
3468 fs_inst *mov = MOV(compr4_dst, inst->src[i]);
3469 mov->force_writemask_all = inst->force_writemask_all;
3470 inst->insert_before(block, mov);
3471 } else {
3472 /* Platform doesn't have COMPR4. We have to fake it */
3473 fs_reg mov_dst = retype(dst, inst->src[i].type);
3474 mov_dst.width = 8;
3475
3476 fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0));
3477 mov->force_writemask_all = inst->force_writemask_all;
3478 inst->insert_before(block, mov);
3479
3480 mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1));
3481 mov->force_writemask_all = inst->force_writemask_all;
3482 mov->force_sechalf = true;
3483 inst->insert_before(block, mov);
3484 }
3485 }
3486
3487 dst.reg++;
3488 }
3489
3490 /* The loop above only ever incremented us through the first set
3491 * of 4 registers. However, thanks to the magic of COMPR4, we
3492 * actually wrote to the first 8 registers, so we need to take
3493 * that into account now.
3494 */
3495 dst.reg += 4;
3496
3497 /* The COMPR4 code took care of the first 4 sources. We'll let
3498 * the regular path handle any remaining sources. Yes, we are
3499 * modifying the instruction but we're about to delete it so
3500 * this really doesn't hurt anything.
3501 */
3502 inst->header_size += 4;
3503 }
3504
3505 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3506 if (inst->src[i].file != BAD_FILE) {
3507 fs_inst *mov = MOV(retype(dst, inst->src[i].type),
3508 inst->src[i]);
3509 mov->force_writemask_all = inst->force_writemask_all;
3510 mov->force_sechalf = inst->force_sechalf;
3511 inst->insert_before(block, mov);
3512 }
3513 dst = offset(dst, 1);
3514 }
3515
3516 inst->remove(block);
3517 progress = true;
3518 }
3519
3520 if (progress)
3521 invalidate_live_intervals();
3522
3523 return progress;
3524 }
3525
3526 bool
3527 fs_visitor::lower_integer_multiplication()
3528 {
3529 bool progress = false;
3530
3531 /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3532 * directly, but Cherryview cannot.
3533 */
3534 if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3535 return false;
3536
3537 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3538 if (inst->opcode != BRW_OPCODE_MUL ||
3539 inst->dst.is_accumulator() ||
3540 (inst->dst.type != BRW_REGISTER_TYPE_D &&
3541 inst->dst.type != BRW_REGISTER_TYPE_UD))
3542 continue;
3543
3544 #define insert(instr) inst->insert_before(block, instr)
3545
3546 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3547 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3548 * src1 are used.
3549 *
3550 * If multiplying by an immediate value that fits in 16-bits, do a
3551 * single MUL instruction with that value in the proper location.
3552 */
3553 if (inst->src[1].file == IMM &&
3554 inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3555 if (devinfo->gen < 7) {
3556 fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3557 inst->dst.type, dispatch_width);
3558 insert(MOV(imm, inst->src[1]));
3559 insert(MUL(inst->dst, imm, inst->src[0]));
3560 } else {
3561 insert(MUL(inst->dst, inst->src[0], inst->src[1]));
3562 }
3563 } else {
3564 /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3565 * do 32-bit integer multiplication in one instruction, but instead
3566 * must do a sequence (which actually calculates a 64-bit result):
3567 *
3568 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3569 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3570 * mov(8) g2<1>D acc0<8,8,1>D
3571 *
3572 * But on Gen > 6, the ability to use second accumulator register
3573 * (acc1) for non-float data types was removed, preventing a simple
3574 * implementation in SIMD16. A 16-channel result can be calculated by
3575 * executing the three instructions twice in SIMD8, once with quarter
3576 * control of 1Q for the first eight channels and again with 2Q for
3577 * the second eight channels.
3578 *
3579 * Which accumulator register is implicitly accessed (by AccWrEnable
3580 * for instance) is determined by the quarter control. Unfortunately
3581 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3582 * implicit accumulator access by an instruction with 2Q will access
3583 * acc1 regardless of whether the data type is usable in acc1.
3584 *
3585 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3586 * integer data types.
3587 *
3588 * Since we only want the low 32-bits of the result, we can do two
3589 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3590 * adjust the high result and add them (like the mach is doing):
3591 *
3592 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3593 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3594 * shl(8) g9<1>D g8<8,8,1>D 16D
3595 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3596 *
3597 * We avoid the shl instruction by realizing that we only want to add
3598 * the low 16-bits of the "high" result to the high 16-bits of the
3599 * "low" result and using proper regioning on the add:
3600 *
3601 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3602 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3603 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3604 *
3605 * Since it does not use the (single) accumulator register, we can
3606 * schedule multi-component multiplications much better.
3607 */
3608
3609 if (inst->conditional_mod && inst->dst.is_null()) {
3610 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3611 inst->dst.type, dispatch_width);
3612 }
3613 fs_reg low = inst->dst;
3614 fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3615 inst->dst.type, dispatch_width);
3616
3617 if (devinfo->gen >= 7) {
3618 fs_reg src1_0_w = inst->src[1];
3619 fs_reg src1_1_w = inst->src[1];
3620
3621 if (inst->src[1].file == IMM) {
3622 src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3623 src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3624 } else {
3625 src1_0_w.type = BRW_REGISTER_TYPE_UW;
3626 src1_0_w.stride = 2;
3627
3628 src1_1_w.type = BRW_REGISTER_TYPE_UW;
3629 src1_1_w.stride = 2;
3630 src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3631 }
3632 insert(MUL(low, inst->src[0], src1_0_w));
3633 insert(MUL(high, inst->src[0], src1_1_w));
3634 } else {
3635 fs_reg src0_0_w = inst->src[0];
3636 fs_reg src0_1_w = inst->src[0];
3637
3638 src0_0_w.type = BRW_REGISTER_TYPE_UW;
3639 src0_0_w.stride = 2;
3640
3641 src0_1_w.type = BRW_REGISTER_TYPE_UW;
3642 src0_1_w.stride = 2;
3643 src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3644
3645 insert(MUL(low, src0_0_w, inst->src[1]));
3646 insert(MUL(high, src0_1_w, inst->src[1]));
3647 }
3648
3649 fs_reg dst = inst->dst;
3650 dst.type = BRW_REGISTER_TYPE_UW;
3651 dst.subreg_offset = 2;
3652 dst.stride = 2;
3653
3654 high.type = BRW_REGISTER_TYPE_UW;
3655 high.stride = 2;
3656
3657 low.type = BRW_REGISTER_TYPE_UW;
3658 low.subreg_offset = 2;
3659 low.stride = 2;
3660
3661 insert(ADD(dst, low, high));
3662
3663 if (inst->conditional_mod) {
3664 fs_reg null(retype(brw_null_reg(), inst->dst.type));
3665 fs_inst *mov = MOV(null, inst->dst);
3666 mov->conditional_mod = inst->conditional_mod;
3667 insert(mov);
3668 }
3669 }
3670 #undef insert
3671
3672 inst->remove(block);
3673 progress = true;
3674 }
3675
3676 if (progress)
3677 invalidate_live_intervals();
3678
3679 return progress;
3680 }
3681
3682 void
3683 fs_visitor::dump_instructions()
3684 {
3685 dump_instructions(NULL);
3686 }
3687
3688 void
3689 fs_visitor::dump_instructions(const char *name)
3690 {
3691 FILE *file = stderr;
3692 if (name && geteuid() != 0) {
3693 file = fopen(name, "w");
3694 if (!file)
3695 file = stderr;
3696 }
3697
3698 if (cfg) {
3699 calculate_register_pressure();
3700 int ip = 0, max_pressure = 0;
3701 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3702 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3703 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3704 dump_instruction(inst, file);
3705 ip++;
3706 }
3707 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3708 } else {
3709 int ip = 0;
3710 foreach_in_list(backend_instruction, inst, &instructions) {
3711 fprintf(file, "%4d: ", ip++);
3712 dump_instruction(inst, file);
3713 }
3714 }
3715
3716 if (file != stderr) {
3717 fclose(file);
3718 }
3719 }
3720
3721 void
3722 fs_visitor::dump_instruction(backend_instruction *be_inst)
3723 {
3724 dump_instruction(be_inst, stderr);
3725 }
3726
3727 void
3728 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3729 {
3730 fs_inst *inst = (fs_inst *)be_inst;
3731
3732 if (inst->predicate) {
3733 fprintf(file, "(%cf0.%d) ",
3734 inst->predicate_inverse ? '-' : '+',
3735 inst->flag_subreg);
3736 }
3737
3738 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3739 if (inst->saturate)
3740 fprintf(file, ".sat");
3741 if (inst->conditional_mod) {
3742 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3743 if (!inst->predicate &&
3744 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3745 inst->opcode != BRW_OPCODE_IF &&
3746 inst->opcode != BRW_OPCODE_WHILE))) {
3747 fprintf(file, ".f0.%d", inst->flag_subreg);
3748 }
3749 }
3750 fprintf(file, "(%d) ", inst->exec_size);
3751
3752
3753 switch (inst->dst.file) {
3754 case GRF:
3755 fprintf(file, "vgrf%d", inst->dst.reg);
3756 if (inst->dst.width != dispatch_width)
3757 fprintf(file, "@%d", inst->dst.width);
3758 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3759 inst->dst.subreg_offset)
3760 fprintf(file, "+%d.%d",
3761 inst->dst.reg_offset, inst->dst.subreg_offset);
3762 break;
3763 case MRF:
3764 fprintf(file, "m%d", inst->dst.reg);
3765 break;
3766 case BAD_FILE:
3767 fprintf(file, "(null)");
3768 break;
3769 case UNIFORM:
3770 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3771 break;
3772 case ATTR:
3773 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3774 break;
3775 case HW_REG:
3776 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3777 switch (inst->dst.fixed_hw_reg.nr) {
3778 case BRW_ARF_NULL:
3779 fprintf(file, "null");
3780 break;
3781 case BRW_ARF_ADDRESS:
3782 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3783 break;
3784 case BRW_ARF_ACCUMULATOR:
3785 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3786 break;
3787 case BRW_ARF_FLAG:
3788 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3789 inst->dst.fixed_hw_reg.subnr);
3790 break;
3791 default:
3792 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3793 inst->dst.fixed_hw_reg.subnr);
3794 break;
3795 }
3796 } else {
3797 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3798 }
3799 if (inst->dst.fixed_hw_reg.subnr)
3800 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3801 break;
3802 default:
3803 fprintf(file, "???");
3804 break;
3805 }
3806 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3807
3808 for (int i = 0; i < inst->sources; i++) {
3809 if (inst->src[i].negate)
3810 fprintf(file, "-");
3811 if (inst->src[i].abs)
3812 fprintf(file, "|");
3813 switch (inst->src[i].file) {
3814 case GRF:
3815 fprintf(file, "vgrf%d", inst->src[i].reg);
3816 if (inst->src[i].width != dispatch_width)
3817 fprintf(file, "@%d", inst->src[i].width);
3818 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3819 inst->src[i].subreg_offset)
3820 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3821 inst->src[i].subreg_offset);
3822 break;
3823 case MRF:
3824 fprintf(file, "***m%d***", inst->src[i].reg);
3825 break;
3826 case ATTR:
3827 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3828 break;
3829 case UNIFORM:
3830 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3831 if (inst->src[i].reladdr) {
3832 fprintf(file, "+reladdr");
3833 } else if (inst->src[i].subreg_offset) {
3834 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3835 inst->src[i].subreg_offset);
3836 }
3837 break;
3838 case BAD_FILE:
3839 fprintf(file, "(null)");
3840 break;
3841 case IMM:
3842 switch (inst->src[i].type) {
3843 case BRW_REGISTER_TYPE_F:
3844 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3845 break;
3846 case BRW_REGISTER_TYPE_W:
3847 case BRW_REGISTER_TYPE_D:
3848 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3849 break;
3850 case BRW_REGISTER_TYPE_UW:
3851 case BRW_REGISTER_TYPE_UD:
3852 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3853 break;
3854 case BRW_REGISTER_TYPE_VF:
3855 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3856 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3857 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3858 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3859 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3860 break;
3861 default:
3862 fprintf(file, "???");
3863 break;
3864 }
3865 break;
3866 case HW_REG:
3867 if (inst->src[i].fixed_hw_reg.negate)
3868 fprintf(file, "-");
3869 if (inst->src[i].fixed_hw_reg.abs)
3870 fprintf(file, "|");
3871 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3872 switch (inst->src[i].fixed_hw_reg.nr) {
3873 case BRW_ARF_NULL:
3874 fprintf(file, "null");
3875 break;
3876 case BRW_ARF_ADDRESS:
3877 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3878 break;
3879 case BRW_ARF_ACCUMULATOR:
3880 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3881 break;
3882 case BRW_ARF_FLAG:
3883 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3884 inst->src[i].fixed_hw_reg.subnr);
3885 break;
3886 default:
3887 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3888 inst->src[i].fixed_hw_reg.subnr);
3889 break;
3890 }
3891 } else {
3892 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3893 }
3894 if (inst->src[i].fixed_hw_reg.subnr)
3895 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3896 if (inst->src[i].fixed_hw_reg.abs)
3897 fprintf(file, "|");
3898 break;
3899 default:
3900 fprintf(file, "???");
3901 break;
3902 }
3903 if (inst->src[i].abs)
3904 fprintf(file, "|");
3905
3906 if (inst->src[i].file != IMM) {
3907 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3908 }
3909
3910 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3911 fprintf(file, ", ");
3912 }
3913
3914 fprintf(file, " ");
3915
3916 if (dispatch_width == 16 && inst->exec_size == 8) {
3917 if (inst->force_sechalf)
3918 fprintf(file, "2ndhalf ");
3919 else
3920 fprintf(file, "1sthalf ");
3921 }
3922
3923 fprintf(file, "\n");
3924 }
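/* For reference, a typical line produced by the dump_instruction() overloads
 * above looks roughly like the following (illustrative only; the exact
 * registers, widths and modifiers depend on the shader being dumped):
 *
 *    (+f0.0) add(8) vgrf7:F, vgrf5:F, -vgrf6:F
 *
 * i.e. an optional predicate, the opcode with saturate/conditional-mod
 * suffixes, the execution size, and then the destination and each source
 * followed by a register type letter.
 */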
3925
3926 /**
3927 * Possibly returns an instruction that set up @param reg.
3928 *
3929 * Sometimes we want to take the result of some expression/variable
3930 * dereference tree and rewrite the instruction generating the result
3931 * of the tree. When processing the tree, we know that the
3932 * instructions generated are all writing temporaries that are dead
3933 * outside of this tree. So, if we have some instructions that write
3934 * a temporary, we're free to point that temp write somewhere else.
3935 *
3936 * Note that this doesn't guarantee that the instruction wrote only
3937 * reg -- it might be the size=4 destination of a texture instruction.
3938 */
3939 fs_inst *
3940 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3941 fs_inst *end,
3942 const fs_reg &reg)
3943 {
3944 if (end == start ||
3945 end->is_partial_write() ||
3946 reg.reladdr ||
3947 !reg.equals(end->dst)) {
3948 return NULL;
3949 } else {
3950 return end;
3951 }
3952 }
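/* A hypothetical use of get_instruction_generating_reg() (sketch only; the
 * variable names are invented for illustration): after emitting the
 * instructions that compute a temporary, a caller can try to retarget the
 * final write instead of emitting an extra MOV:
 *
 *    fs_inst *modify =
 *       get_instruction_generating_reg(first_emitted, last_emitted, temp);
 *    if (modify)
 *       modify->dst = real_dst;        // rewrite the producer in place
 *    else
 *       emit(MOV(real_dst, temp));     // fall back to an explicit copy
 */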
3953
3954 void
3955 fs_visitor::setup_payload_gen6()
3956 {
3957 bool uses_depth =
3958 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3959 unsigned barycentric_interp_modes =
3960 (stage == MESA_SHADER_FRAGMENT) ?
3961 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3962
3963 assert(devinfo->gen >= 6);
3964
3965 /* R0-1: masks, pixel X/Y coordinates. */
3966 payload.num_regs = 2;
3967 /* R2: only for 32-pixel dispatch. */
3968
3969 /* R3-26: barycentric interpolation coordinates. These appear in the
3970 * same order that they appear in the brw_wm_barycentric_interp_mode
3971 * enum. Each set of coordinates occupies 2 registers if dispatch width
3972 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3973 * appear if they were enabled using the "Barycentric Interpolation
3974 * Mode" bits in WM_STATE.
3975 */
3976 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3977 if (barycentric_interp_modes & (1 << i)) {
3978 payload.barycentric_coord_reg[i] = payload.num_regs;
3979 payload.num_regs += 2;
3980 if (dispatch_width == 16) {
3981 payload.num_regs += 2;
3982 }
3983 }
3984 }
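/* Worked example (illustrative only): if exactly two barycentric modes are
 * enabled, say perspective pixel and perspective centroid, the loop above
 * reserves 2 GRFs per mode in SIMD8 and 4 per mode in SIMD16, so
 * payload.num_regs grows by 4 or 8 registers respectively before the
 * depth/W payload below is added.
 */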
3985
3986 /* R27: interpolated depth if uses source depth */
3987 if (uses_depth) {
3988 payload.source_depth_reg = payload.num_regs;
3989 payload.num_regs++;
3990 if (dispatch_width == 16) {
3991 /* R28: interpolated depth if not SIMD8. */
3992 payload.num_regs++;
3993 }
3994 }
3995 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3996 if (uses_depth) {
3997 payload.source_w_reg = payload.num_regs;
3998 payload.num_regs++;
3999 if (dispatch_width == 16) {
4000 /* R30: interpolated W if not SIMD8. */
4001 payload.num_regs++;
4002 }
4003 }
4004
4005 if (stage == MESA_SHADER_FRAGMENT) {
4006 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
4007 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
4008 prog_data->uses_pos_offset = key->compute_pos_offset;
4009 /* R31: MSAA position offsets. */
4010 if (prog_data->uses_pos_offset) {
4011 payload.sample_pos_reg = payload.num_regs;
4012 payload.num_regs++;
4013 }
4014 }
4015
4016 /* R32: MSAA input coverage mask */
4017 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
4018 assert(devinfo->gen >= 7);
4019 payload.sample_mask_in_reg = payload.num_regs;
4020 payload.num_regs++;
4021 if (dispatch_width == 16) {
4022 /* R33: input coverage mask if not SIMD8. */
4023 payload.num_regs++;
4024 }
4025 }
4026
4027 /* R34-: bary for 32-pixel. */
4028 /* R58-59: interp W for 32-pixel. */
4029
4030 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
4031 source_depth_to_render_target = true;
4032 }
4033 }
4034
4035 void
4036 fs_visitor::setup_vs_payload()
4037 {
4038 /* R0: thread header, R1: urb handles */
4039 payload.num_regs = 2;
4040 }
4041
4042 void
4043 fs_visitor::setup_cs_payload()
4044 {
4045 assert(brw->gen >= 7);
4046
4047 payload.num_regs = 1;
4048 }
4049
4050 void
4051 fs_visitor::assign_binding_table_offsets()
4052 {
4053 assert(stage == MESA_SHADER_FRAGMENT);
4054 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
4055 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
4056 uint32_t next_binding_table_offset = 0;
4057
4058 /* If there are no color regions, we still perform an FB write to a null
4059 * renderbuffer, which we place at surface index 0.
4060 */
4061 prog_data->binding_table.render_target_start = next_binding_table_offset;
4062 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
4063
4064 assign_common_binding_table_offsets(next_binding_table_offset);
4065 }
4066
4067 void
4068 fs_visitor::calculate_register_pressure()
4069 {
4070 invalidate_live_intervals();
4071 calculate_live_intervals();
4072
4073 unsigned num_instructions = 0;
4074 foreach_block(block, cfg)
4075 num_instructions += block->instructions.length();
4076
4077 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
4078
4079 for (unsigned reg = 0; reg < alloc.count; reg++) {
4080 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
4081 regs_live_at_ip[ip] += alloc.sizes[reg];
4082 }
4083 }
4084
4085 void
4086 fs_visitor::optimize()
4087 {
4088 split_virtual_grfs();
4089
4090 move_uniform_array_access_to_pull_constants();
4091 assign_constant_locations();
4092 demote_pull_constants();
4093
4094 #define OPT(pass, args...) ({ \
4095 pass_num++; \
4096 bool this_progress = pass(args); \
4097 \
4098 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
4099 char filename[64]; \
4100 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
4101 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
4102 \
4103 backend_shader::dump_instructions(filename); \
4104 } \
4105 \
4106 progress = progress || this_progress; \
4107 this_progress; \
4108 })
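/* As a concrete illustration of the file naming in the OPT() macro above
 * (assuming the fragment stage, whose stage_abbrev is "FS"): a pass such as
 * opt_copy_propagate making progress as the fifth pass of the second
 * iteration of the main loop below, for GL shader program 3 compiled at
 * SIMD8, would be dumped to a file named roughly:
 *
 *    FS8-0003-02-05-opt_copy_propagate
 */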
4109
4110 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
4111 char filename[64];
4112 snprintf(filename, 64, "%s%d-%04d-00-start",
4113 stage_abbrev, dispatch_width,
4114 shader_prog ? shader_prog->Name : 0);
4115
4116 backend_shader::dump_instructions(filename);
4117 }
4118
4119 bool progress;
4120 int iteration = 0;
4121 int pass_num = 0;
4122 do {
4123 progress = false;
4124 pass_num = 0;
4125 iteration++;
4126
4127 OPT(remove_duplicate_mrf_writes);
4128
4129 OPT(opt_algebraic);
4130 OPT(opt_cse);
4131 OPT(opt_copy_propagate);
4132 OPT(opt_peephole_predicated_break);
4133 OPT(opt_cmod_propagation);
4134 OPT(dead_code_eliminate);
4135 OPT(opt_peephole_sel);
4136 OPT(dead_control_flow_eliminate, this);
4137 OPT(opt_register_renaming);
4138 OPT(opt_redundant_discard_jumps);
4139 OPT(opt_saturate_propagation);
4140 OPT(opt_zero_samples);
4141 OPT(register_coalesce);
4142 OPT(compute_to_mrf);
4143 OPT(eliminate_find_live_channel);
4144
4145 OPT(compact_virtual_grfs);
4146 } while (progress);
4147
4148 pass_num = 0;
4149
4150 OPT(opt_sampler_eot);
4151
4152 if (OPT(lower_load_payload)) {
4153 split_virtual_grfs();
4154 OPT(register_coalesce);
4155 OPT(compute_to_mrf);
4156 OPT(dead_code_eliminate);
4157 }
4158
4159 OPT(opt_combine_constants);
4160 OPT(lower_integer_multiplication);
4161
4162 lower_uniform_pull_constant_loads();
4163 }
4164
4165 /**
4166 * A three-source instruction must have a GRF/MRF destination register.
4167 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
4168 */
4169 void
4170 fs_visitor::fixup_3src_null_dest()
4171 {
4172 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4173 if (inst->is_3src() && inst->dst.is_null()) {
4174 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4175 inst->dst.type);
4176 }
4177 }
4178 }
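/* Illustrative only: a three-source instruction emitted purely for its
 * conditional modifier, e.g. "mad.l.f0 null:F, a, b, c", would be rewritten
 * by the pass above to write a freshly allocated VGRF instead of the null
 * register, e.g. "mad.l.f0 vgrfN:F, a, b, c".
 */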
4179
4180 void
4181 fs_visitor::allocate_registers()
4182 {
4183 bool allocated_without_spills;
4184
4185 static const enum instruction_scheduler_mode pre_modes[] = {
4186 SCHEDULE_PRE,
4187 SCHEDULE_PRE_NON_LIFO,
4188 SCHEDULE_PRE_LIFO,
4189 };
4190
4191 /* Try each scheduling heuristic to see if it can successfully register
4192 * allocate without spilling. They should be ordered by decreasing
4193 * performance but increasing likelihood of allocating.
4194 */
4195 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4196 schedule_instructions(pre_modes[i]);
4197
4198 if (0) {
4199 assign_regs_trivial();
4200 allocated_without_spills = true;
4201 } else {
4202 allocated_without_spills = assign_regs(false);
4203 }
4204 if (allocated_without_spills)
4205 break;
4206 }
4207
4208 if (!allocated_without_spills) {
4209 /* We assume that any spilling is worse than just dropping back to
4210 * SIMD8. There's probably actually some intermediate point where
4211 * SIMD16 with a couple of spills is still better.
4212 */
4213 if (dispatch_width == 16) {
4214 fail("Failure to register allocate. Reduce number of "
4215 "live scalar values to avoid this.");
4216 } else {
4217 perf_debug("%s shader triggered register spilling. "
4218 "Try reducing the number of live scalar values to "
4219 "improve performance.\n", stage_name);
4220 }
4221
4222 /* Since we're out of heuristics, just go spill registers until we
4223 * get an allocation.
4224 */
4225 while (!assign_regs(true)) {
4226 if (failed)
4227 break;
4228 }
4229 }
4230
4231 /* This must come after all optimization and register allocation, since
4232 * it inserts dead code that happens to have side effects, and it does
4233 * so based on the actual physical registers in use.
4234 */
4235 insert_gen4_send_dependency_workarounds();
4236
4237 if (failed)
4238 return;
4239
4240 if (!allocated_without_spills)
4241 schedule_instructions(SCHEDULE_POST);
4242
4243 if (last_scratch > 0)
4244 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4245 }
4246
4247 bool
4248 fs_visitor::run_vs()
4249 {
4250 assert(stage == MESA_SHADER_VERTEX);
4251
4252 assign_common_binding_table_offsets(0);
4253 setup_vs_payload();
4254
4255 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4256 emit_shader_time_begin();
4257
4258 emit_nir_code();
4259
4260 if (failed)
4261 return false;
4262
4263 emit_urb_writes();
4264
4265 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4266 emit_shader_time_end();
4267
4268 calculate_cfg();
4269
4270 optimize();
4271
4272 assign_curb_setup();
4273 assign_vs_urb_setup();
4274
4275 fixup_3src_null_dest();
4276 allocate_registers();
4277
4278 return !failed;
4279 }
4280
4281 bool
4282 fs_visitor::run_fs()
4283 {
4284 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4285 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4286
4287 assert(stage == MESA_SHADER_FRAGMENT);
4288
4289 sanity_param_count = prog->Parameters->NumParameters;
4290
4291 assign_binding_table_offsets();
4292
4293 if (devinfo->gen >= 6)
4294 setup_payload_gen6();
4295 else
4296 setup_payload_gen4();
4297
4298 if (0) {
4299 emit_dummy_fs();
4300 } else if (brw->use_rep_send && dispatch_width == 16) {
4301 emit_repclear_shader();
4302 } else {
4303 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4304 emit_shader_time_begin();
4305
4306 calculate_urb_setup();
4307 if (prog->InputsRead > 0) {
4308 if (devinfo->gen < 6)
4309 emit_interpolation_setup_gen4();
4310 else
4311 emit_interpolation_setup_gen6();
4312 }
4313
4314 /* We handle discards by keeping track of the still-live pixels in f0.1.
4315 * Initialize it with the dispatched pixels.
4316 */
4317 if (wm_prog_data->uses_kill) {
4318 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4319 discard_init->flag_subreg = 1;
4320 }
4321
4322 /* Generate FS IR for main() by translating the NIR shader. */
4325 emit_nir_code();
4326
4327 if (failed)
4328 return false;
4329
4330 if (wm_prog_data->uses_kill)
4331 emit(FS_OPCODE_PLACEHOLDER_HALT);
4332
4333 if (wm_key->alpha_test_func)
4334 emit_alpha_test();
4335
4336 emit_fb_writes();
4337
4338 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4339 emit_shader_time_end();
4340
4341 calculate_cfg();
4342
4343 optimize();
4344
4345 assign_curb_setup();
4346 assign_urb_setup();
4347
4348 fixup_3src_null_dest();
4349 allocate_registers();
4350
4351 if (failed)
4352 return false;
4353 }
4354
4355 if (dispatch_width == 8)
4356 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4357 else
4358 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4359
4360 /* If any state parameters were appended, then ParameterValues could have
4361 * been realloced, in which case the driver uniform storage set up by
4362 * _mesa_associate_uniform_storage() would point to freed memory. Make
4363 * sure that didn't happen.
4364 */
4365 assert(sanity_param_count == prog->Parameters->NumParameters);
4366
4367 return !failed;
4368 }
4369
4370 bool
4371 fs_visitor::run_cs()
4372 {
4373 assert(stage == MESA_SHADER_COMPUTE);
4374 assert(shader);
4375
4376 sanity_param_count = prog->Parameters->NumParameters;
4377
4378 assign_common_binding_table_offsets(0);
4379
4380 setup_cs_payload();
4381
4382 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4383 emit_shader_time_begin();
4384
4385 emit_nir_code();
4386
4387 if (failed)
4388 return false;
4389
4390 emit_cs_terminate();
4391
4392 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4393 emit_shader_time_end();
4394
4395 calculate_cfg();
4396
4397 optimize();
4398
4399 assign_curb_setup();
4400
4401 fixup_3src_null_dest();
4402 allocate_registers();
4403
4404 if (failed)
4405 return false;
4406
4407 /* If any state parameters were appended, then ParameterValues could have
4408 * been realloced, in which case the driver uniform storage set up by
4409 * _mesa_associate_uniform_storage() would point to freed memory. Make
4410 * sure that didn't happen.
4411 */
4412 assert(sanity_param_count == prog->Parameters->NumParameters);
4413
4414 return !failed;
4415 }
4416
4417 const unsigned *
4418 brw_wm_fs_emit(struct brw_context *brw,
4419 void *mem_ctx,
4420 const struct brw_wm_prog_key *key,
4421 struct brw_wm_prog_data *prog_data,
4422 struct gl_fragment_program *fp,
4423 struct gl_shader_program *prog,
4424 unsigned *final_assembly_size)
4425 {
4426 bool start_busy = false;
4427 double start_time = 0;
4428
4429 if (unlikely(brw->perf_debug)) {
4430 start_busy = (brw->batch.last_bo &&
4431 drm_intel_bo_busy(brw->batch.last_bo));
4432 start_time = get_time();
4433 }
4434
4435 struct brw_shader *shader = NULL;
4436 if (prog)
4437 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4438
4439 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4440 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4441
4442 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4443 */
4444 fs_visitor v(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4445 prog, &fp->Base, 8);
4446 if (!v.run_fs()) {
4447 if (prog) {
4448 prog->LinkStatus = false;
4449 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4450 }
4451
4452 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4453 v.fail_msg);
4454
4455 return NULL;
4456 }
4457
4458 cfg_t *simd16_cfg = NULL;
4459 fs_visitor v2(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4460 prog, &fp->Base, 16);
4461 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4462 if (!v.simd16_unsupported) {
4463 /* Try a SIMD16 compile */
4464 v2.import_uniforms(&v);
4465 if (!v2.run_fs()) {
4466 perf_debug("SIMD16 shader failed to compile, falling back to "
4467 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4468 } else {
4469 simd16_cfg = v2.cfg;
4470 }
4471 } else {
4472 perf_debug("SIMD16 shader unsupported, falling back to "
4473 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4474 }
4475 }
4476
4477 cfg_t *simd8_cfg;
4478 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4479 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4480 simd8_cfg = NULL;
4481 prog_data->no_8 = true;
4482 } else {
4483 simd8_cfg = v.cfg;
4484 prog_data->no_8 = false;
4485 }
4486
4487 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4488 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4489
4490 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4491 char *name;
4492 if (prog)
4493 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4494 prog->Label ? prog->Label : "unnamed",
4495 prog->Name);
4496 else
4497 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4498
4499 g.enable_debug(name);
4500 }
4501
4502 if (simd8_cfg)
4503 g.generate_code(simd8_cfg, 8);
4504 if (simd16_cfg)
4505 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4506
4507 if (unlikely(brw->perf_debug) && shader) {
4508 if (shader->compiled_once)
4509 brw_wm_debug_recompile(brw, prog, key);
4510 shader->compiled_once = true;
4511
4512 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4513 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4514 (get_time() - start_time) * 1000);
4515 }
4516 }
4517
4518 return g.get_assembly(final_assembly_size);
4519 }
4520
4521 extern "C" bool
4522 brw_fs_precompile(struct gl_context *ctx,
4523 struct gl_shader_program *shader_prog,
4524 struct gl_program *prog)
4525 {
4526 struct brw_context *brw = brw_context(ctx);
4527 struct brw_wm_prog_key key;
4528
4529 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4530 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4531 bool program_uses_dfdy = fp->UsesDFdy;
4532
4533 memset(&key, 0, sizeof(key));
4534
4535 if (brw->gen < 6) {
4536 if (fp->UsesKill)
4537 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4538
4539 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4540 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4541
4542 /* Just assume depth testing. */
4543 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4544 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4545 }
4546
4547 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4548 BRW_FS_VARYING_INPUT_MASK) > 16)
4549 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4550
4551 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4552
4553 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4554 key.drawable_height = ctx->DrawBuffer->Height;
4555 }
4556
4557 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4558 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4559 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4560
4561 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4562 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4563 key.nr_color_regions > 1;
4564 }
4565
4566 key.program_string_id = bfp->id;
4567
4568 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4569 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4570
4571 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4572
4573 brw->wm.base.prog_offset = old_prog_offset;
4574 brw->wm.prog_data = old_prog_data;
4575
4576 return success;
4577 }
4578
4579 void
4580 brw_setup_tex_for_precompile(struct brw_context *brw,
4581 struct brw_sampler_prog_key_data *tex,
4582 struct gl_program *prog)
4583 {
4584 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4585 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4586 for (unsigned i = 0; i < sampler_count; i++) {
4587 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4588 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4589 tex->swizzles[i] =
4590 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4591 } else {
4592 /* Color sampler: assume no swizzling. */
4593 tex->swizzles[i] = SWIZZLE_XYZW;
4594 }
4595 }
4596 }