i965/fs: Fix implied_mrf_writes for scratch writes
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 this->conditional_mod = BRW_CONDITIONAL_NONE;
94
95 /* This will be the case for almost all instructions. */
96 switch (dst.file) {
97 case GRF:
98 case HW_REG:
99 case MRF:
100 case ATTR:
101 this->regs_written =
102 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
103 break;
104 case BAD_FILE:
105 this->regs_written = 0;
106 break;
107 case IMM:
108 case UNIFORM:
109 unreachable("Invalid destination register file");
110 default:
111 unreachable("Invalid register file");
112 }
113
114 this->writes_accumulator = false;
115 }
116
117 fs_inst::fs_inst()
118 {
119 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
120 }
121
122 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
123 {
124 init(opcode, exec_size, reg_undef, NULL, 0);
125 }
126
127 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
128 {
129 init(opcode, 0, dst, NULL, 0);
130 }
131
132 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
133 const fs_reg &src0)
134 {
135 const fs_reg src[1] = { src0 };
136 init(opcode, exec_size, dst, src, 1);
137 }
138
139 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
140 {
141 const fs_reg src[1] = { src0 };
142 init(opcode, 0, dst, src, 1);
143 }
144
145 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
146 const fs_reg &src0, const fs_reg &src1)
147 {
148 const fs_reg src[2] = { src0, src1 };
149 init(opcode, exec_size, dst, src, 2);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
153 const fs_reg &src1)
154 {
155 const fs_reg src[2] = { src0, src1 };
156 init(opcode, 0, dst, src, 2);
157 }
158
159 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
160 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
161 {
162 const fs_reg src[3] = { src0, src1, src2 };
163 init(opcode, exec_size, dst, src, 3);
164 }
165
166 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
167 const fs_reg &src1, const fs_reg &src2)
168 {
169 const fs_reg src[3] = { src0, src1, src2 };
170 init(opcode, 0, dst, src, 3);
171 }
172
173 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
174 const fs_reg src[], unsigned sources)
175 {
176 init(opcode, 0, dst, src, sources);
177 }
178
179 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
180 const fs_reg src[], unsigned sources)
181 {
182 init(opcode, exec_width, dst, src, sources);
183 }
184
185 fs_inst::fs_inst(const fs_inst &that)
186 {
187 memcpy(this, &that, sizeof(that));
188
189 this->src = new fs_reg[MAX2(that.sources, 3)];
190
191 for (unsigned i = 0; i < that.sources; i++)
192 this->src[i] = that.src[i];
193 }
194
195 fs_inst::~fs_inst()
196 {
197 delete[] this->src;
198 }
199
200 void
201 fs_inst::resize_sources(uint8_t num_sources)
202 {
203 if (this->sources != num_sources) {
204 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
205
206 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
207 src[i] = this->src[i];
208
209 delete[] this->src;
210 this->src = src;
211 this->sources = num_sources;
212 }
213 }
214
215 #define ALU1(op) \
216 fs_inst * \
217 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
218 { \
219 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
220 }
221
222 #define ALU2(op) \
223 fs_inst * \
224 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
225 const fs_reg &src1) \
226 { \
227 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
228 }
229
230 #define ALU2_ACC(op) \
231 fs_inst * \
232 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
233 const fs_reg &src1) \
234 { \
235 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
236 inst->writes_accumulator = true; \
237 return inst; \
238 }
239
240 #define ALU3(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
243 const fs_reg &src1, const fs_reg &src2) \
244 { \
245 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
246 }
247
248 ALU1(NOT)
249 ALU1(MOV)
250 ALU1(FRC)
251 ALU1(RNDD)
252 ALU1(RNDE)
253 ALU1(RNDZ)
254 ALU2(ADD)
255 ALU2(MUL)
256 ALU2_ACC(MACH)
257 ALU2(AND)
258 ALU2(OR)
259 ALU2(XOR)
260 ALU2(SHL)
261 ALU2(SHR)
262 ALU2(ASR)
263 ALU3(LRP)
264 ALU1(BFREV)
265 ALU3(BFE)
266 ALU2(BFI1)
267 ALU3(BFI2)
268 ALU1(FBH)
269 ALU1(FBL)
270 ALU1(CBIT)
271 ALU3(MAD)
272 ALU2_ACC(ADDC)
273 ALU2_ACC(SUBB)
274 ALU2(SEL)
275 ALU2(MAC)
276
277 /** Gen4 predicated IF. */
278 fs_inst *
279 fs_visitor::IF(enum brw_predicate predicate)
280 {
281 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
282 inst->predicate = predicate;
283 return inst;
284 }
285
286 /** Gen6 IF with embedded comparison. */
287 fs_inst *
288 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
289 enum brw_conditional_mod condition)
290 {
291 assert(devinfo->gen == 6);
292 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
293 reg_null_d, src0, src1);
294 inst->conditional_mod = condition;
295 return inst;
296 }
297
298 /**
299 * CMP: Sets the low bit of the destination channels with the result
300 * of the comparison, while the upper bits are undefined, and updates
301 * the flag register with the packed 16 bits of the result.
302 */
303 fs_inst *
304 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
305 enum brw_conditional_mod condition)
306 {
307 fs_inst *inst;
308
309 /* Take the instruction:
310 *
311 * CMP null<d> src0<f> src1<f>
312 *
313 * Original gen4 does type conversion to the destination type before
314 * comparison, producing garbage results for floating point comparisons.
315 *
316 * The destination type doesn't matter on newer generations, so we set the
317 * type to match src0 so we can compact the instruction.
318 */
319 dst.type = src0.type;
320 if (dst.file == HW_REG)
321 dst.fixed_hw_reg.type = dst.type;
322
323 resolve_ud_negate(&src0);
324 resolve_ud_negate(&src1);
325
326 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
327 inst->conditional_mod = condition;
328
329 return inst;
330 }
331
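/**
 * Build a LOAD_PAYLOAD instruction and compute its register footprint
 * (following the body below): each header source counts as one register and
 * every remaining source as dst.width / 8 registers, so e.g. a SIMD16
 * payload with a one-register header and two 16-wide sources ends up with
 * regs_written = 1 + 2 * (16 / 8) = 5.
 */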
332 fs_inst *
333 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
334 int header_size)
335 {
336 assert(dst.width % 8 == 0);
337 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
338 dst, src, sources);
339 inst->header_size = header_size;
340
341 for (int i = 0; i < header_size; i++)
342 assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
343 inst->regs_written = header_size;
344
345 for (int i = header_size; i < sources; ++i)
346 assert(src[i].file != GRF || src[i].width == dst.width);
347 inst->regs_written += (sources - header_size) * (dst.width / 8);
348
349 return inst;
350 }
351
352 exec_list
353 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
354 const fs_reg &surf_index,
355 const fs_reg &varying_offset,
356 uint32_t const_offset)
357 {
358 exec_list instructions;
359 fs_inst *inst;
360
361 /* We have our constant surface use a pitch of 4 bytes, so our index can
362 * be any component of a vector, and then we load 4 contiguous
363 * components starting from that.
364 *
365 * We break down the const_offset to a portion added to the variable
366 * offset and a portion done using reg_offset, which means that if you
367 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
368 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
369 * CSE can later notice that those loads are all the same and eliminate
370 * the redundant ones.
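*
* For example, with const_offset == 6 the vec4-aligned part (6 & ~3 == 4)
* is added to the varying offset below, and the remaining component
* (6 & 3 == 2) selects which register of the loaded vec4 the final MOV
* reads from.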
371 */
372 fs_reg vec4_offset = vgrf(glsl_type::int_type);
373 instructions.push_tail(ADD(vec4_offset,
374 varying_offset, fs_reg(const_offset & ~3)));
375
376 int scale = 1;
377 if (devinfo->gen == 4 && dst.width == 8) {
378 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
379 * u, v, r) as parameters, or we can just use the SIMD16 message
380 * consisting of (header, u). We choose the second, at the cost of a
381 * longer return length.
382 */
383 scale = 2;
384 }
385
386 enum opcode op;
387 if (devinfo->gen >= 7)
388 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
389 else
390 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
391
392 assert(dst.width % 8 == 0);
393 int regs_written = 4 * (dst.width / 8) * scale;
394 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
395 dst.type, dst.width);
396 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
397 inst->regs_written = regs_written;
398 instructions.push_tail(inst);
399
400 if (devinfo->gen < 7) {
401 inst->base_mrf = 13;
402 inst->header_size = 1;
403 if (devinfo->gen == 4)
404 inst->mlen = 3;
405 else
406 inst->mlen = 1 + dispatch_width / 8;
407 }
408
409 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
410 instructions.push_tail(MOV(dst, result));
411
412 return instructions;
413 }
414
415 /**
416 * A helper for MOV generation for fixing up broken hardware SEND dependency
417 * handling.
418 */
419 fs_inst *
420 fs_visitor::DEP_RESOLVE_MOV(int grf)
421 {
422 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
423
424 inst->ir = NULL;
425 inst->annotation = "send dependency resolve";
426
427 /* The caller always wants this uncompressed, to emit the minimal extra
428 * dependencies and to avoid having to deal with aligning its regs to 2.
429 */
430 inst->exec_size = 8;
431
432 return inst;
433 }
434
435 bool
436 fs_inst::equals(fs_inst *inst) const
437 {
438 return (opcode == inst->opcode &&
439 dst.equals(inst->dst) &&
440 src[0].equals(inst->src[0]) &&
441 src[1].equals(inst->src[1]) &&
442 src[2].equals(inst->src[2]) &&
443 saturate == inst->saturate &&
444 predicate == inst->predicate &&
445 conditional_mod == inst->conditional_mod &&
446 mlen == inst->mlen &&
447 base_mrf == inst->base_mrf &&
448 target == inst->target &&
449 eot == inst->eot &&
450 header_size == inst->header_size &&
451 shadow_compare == inst->shadow_compare &&
452 exec_size == inst->exec_size &&
453 offset == inst->offset);
454 }
455
456 bool
457 fs_inst::overwrites_reg(const fs_reg &reg) const
458 {
459 return reg.in_range(dst, regs_written);
460 }
461
462 bool
463 fs_inst::is_send_from_grf() const
464 {
465 switch (opcode) {
466 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
467 case SHADER_OPCODE_SHADER_TIME_ADD:
468 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
469 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
470 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
471 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
472 case SHADER_OPCODE_UNTYPED_ATOMIC:
473 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
474 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
475 case SHADER_OPCODE_TYPED_ATOMIC:
476 case SHADER_OPCODE_TYPED_SURFACE_READ:
477 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
478 case SHADER_OPCODE_URB_WRITE_SIMD8:
479 return true;
480 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
481 return src[1].file == GRF;
482 case FS_OPCODE_FB_WRITE:
483 return src[0].file == GRF;
484 default:
485 if (is_tex())
486 return src[0].file == GRF;
487
488 return false;
489 }
490 }
491
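/**
 * Check whether this LOAD_PAYLOAD amounts to a plain copy of one contiguous
 * VGRF: source i must be exactly register offset i of src[0], with the same
 * type and width, and the copied block must span the entire allocation.
 */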
492 bool
493 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
494 {
495 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
496 return false;
497
498 fs_reg reg = this->src[0];
499 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
500 return false;
501
502 if (grf_alloc.sizes[reg.reg] != this->regs_written)
503 return false;
504
505 for (int i = 0; i < this->sources; i++) {
506 reg.type = this->src[i].type;
507 reg.width = this->src[i].width;
508 if (!this->src[i].equals(reg))
509 return false;
510 reg = ::offset(reg, 1);
511 }
512
513 return true;
514 }
515
516 bool
517 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
518 {
519 if (devinfo->gen == 6 && is_math())
520 return false;
521
522 if (is_send_from_grf())
523 return false;
524
525 if (!backend_instruction::can_do_source_mods())
526 return false;
527
528 return true;
529 }
530
531 bool
532 fs_inst::has_side_effects() const
533 {
534 return this->eot || backend_instruction::has_side_effects();
535 }
536
537 void
538 fs_reg::init()
539 {
540 memset(this, 0, sizeof(*this));
541 stride = 1;
542 }
543
544 /** Generic unset register constructor. */
545 fs_reg::fs_reg()
546 {
547 init();
548 this->file = BAD_FILE;
549 }
550
551 /** Immediate value constructor. */
552 fs_reg::fs_reg(float f)
553 {
554 init();
555 this->file = IMM;
556 this->type = BRW_REGISTER_TYPE_F;
557 this->fixed_hw_reg.dw1.f = f;
558 this->width = 1;
559 }
560
561 /** Immediate value constructor. */
562 fs_reg::fs_reg(int32_t i)
563 {
564 init();
565 this->file = IMM;
566 this->type = BRW_REGISTER_TYPE_D;
567 this->fixed_hw_reg.dw1.d = i;
568 this->width = 1;
569 }
570
571 /** Immediate value constructor. */
572 fs_reg::fs_reg(uint32_t u)
573 {
574 init();
575 this->file = IMM;
576 this->type = BRW_REGISTER_TYPE_UD;
577 this->fixed_hw_reg.dw1.ud = u;
578 this->width = 1;
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf[4])
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
588 }
589
590 /** Vector float immediate value constructor. */
591 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
592 {
593 init();
594 this->file = IMM;
595 this->type = BRW_REGISTER_TYPE_VF;
596 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
597 (vf1 << 8) |
598 (vf2 << 16) |
599 (vf3 << 24);
600 }
601
602 /** Fixed brw_reg. */
603 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
604 {
605 init();
606 this->file = HW_REG;
607 this->fixed_hw_reg = fixed_hw_reg;
608 this->type = fixed_hw_reg.type;
609 this->width = 1 << fixed_hw_reg.width;
610 }
611
612 bool
613 fs_reg::equals(const fs_reg &r) const
614 {
615 return (file == r.file &&
616 reg == r.reg &&
617 reg_offset == r.reg_offset &&
618 subreg_offset == r.subreg_offset &&
619 type == r.type &&
620 negate == r.negate &&
621 abs == r.abs &&
622 !reladdr && !r.reladdr &&
623 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
624 width == r.width &&
625 stride == r.stride);
626 }
627
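/**
 * Restrict the register to a single replicated channel: point subreg_offset
 * at component \p subreg and zero the stride so every execution channel
 * reads that one value (used, e.g., by get_timestamp() below to pick out
 * dword 0 or 2 of the timestamp register).
 */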
628 fs_reg &
629 fs_reg::set_smear(unsigned subreg)
630 {
631 assert(file != HW_REG && file != IMM);
632 subreg_offset = subreg * type_sz(type);
633 stride = 0;
634 return *this;
635 }
636
637 bool
638 fs_reg::is_contiguous() const
639 {
640 return stride == 1;
641 }
642
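/**
 * Returns the size of a GLSL type in scalar components (not hardware
 * registers).  For example, a float or bool is 1, a vec4 is 4, an array
 * multiplies by its length, and a struct is the sum of its members, so
 * "struct { vec3 a; float b; }" counts as 4.  Samplers and atomic counters
 * take no space.
 */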
643 int
644 fs_visitor::type_size(const struct glsl_type *type)
645 {
646 unsigned int size, i;
647
648 switch (type->base_type) {
649 case GLSL_TYPE_UINT:
650 case GLSL_TYPE_INT:
651 case GLSL_TYPE_FLOAT:
652 case GLSL_TYPE_BOOL:
653 return type->components();
654 case GLSL_TYPE_ARRAY:
655 return type_size(type->fields.array) * type->length;
656 case GLSL_TYPE_STRUCT:
657 size = 0;
658 for (i = 0; i < type->length; i++) {
659 size += type_size(type->fields.structure[i].type);
660 }
661 return size;
662 case GLSL_TYPE_SAMPLER:
663 /* Samplers take up no register space, since they're baked in at
664 * link time.
665 */
666 return 0;
667 case GLSL_TYPE_ATOMIC_UINT:
668 return 0;
669 case GLSL_TYPE_IMAGE:
670 case GLSL_TYPE_VOID:
671 case GLSL_TYPE_ERROR:
672 case GLSL_TYPE_INTERFACE:
673 case GLSL_TYPE_DOUBLE:
674 unreachable("not reached");
675 }
676
677 return 0;
678 }
679
680 /**
681 * Create a MOV to read the timestamp register.
682 *
683 * The caller is responsible for emitting the MOV. The return value is
684 * the destination of the MOV, with extra parameters set.
685 */
686 fs_reg
687 fs_visitor::get_timestamp(fs_inst **out_mov)
688 {
689 assert(devinfo->gen >= 7);
690
691 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
692 BRW_ARF_TIMESTAMP,
693 0),
694 BRW_REGISTER_TYPE_UD));
695
696 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
697
698 fs_inst *mov = MOV(dst, ts);
699 /* We want to read the 3 fields we care about even if it's not enabled in
700 * the dispatch.
701 */
702 mov->force_writemask_all = true;
703
704 /* The caller wants the low 32 bits of the timestamp. Since it's running
705 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
706 * which is plenty of time for our purposes. It is identical across the
707 * EUs, but since it's tracking GPU core speed it will increment at a
708 * varying rate as render P-states change.
709 *
710 * The caller could also check if render P-states have changed (or anything
711 * else that might disrupt timing) by setting smear to 2 and checking if
712 * that field is != 0.
713 */
714 dst.set_smear(0);
715
716 *out_mov = mov;
717 return dst;
718 }
719
720 void
721 fs_visitor::emit_shader_time_begin()
722 {
723 current_annotation = "shader time start";
724 fs_inst *mov;
725 shader_start_time = get_timestamp(&mov);
726 emit(mov);
727 }
728
729 void
730 fs_visitor::emit_shader_time_end()
731 {
732 current_annotation = "shader time end";
733
734 enum shader_time_shader_type type, written_type, reset_type;
735 switch (stage) {
736 case MESA_SHADER_VERTEX:
737 type = ST_VS;
738 written_type = ST_VS_WRITTEN;
739 reset_type = ST_VS_RESET;
740 break;
741 case MESA_SHADER_GEOMETRY:
742 type = ST_GS;
743 written_type = ST_GS_WRITTEN;
744 reset_type = ST_GS_RESET;
745 break;
746 case MESA_SHADER_FRAGMENT:
747 if (dispatch_width == 8) {
748 type = ST_FS8;
749 written_type = ST_FS8_WRITTEN;
750 reset_type = ST_FS8_RESET;
751 } else {
752 assert(dispatch_width == 16);
753 type = ST_FS16;
754 written_type = ST_FS16_WRITTEN;
755 reset_type = ST_FS16_RESET;
756 }
757 break;
758 case MESA_SHADER_COMPUTE:
759 type = ST_CS;
760 written_type = ST_CS_WRITTEN;
761 reset_type = ST_CS_RESET;
762 break;
763 default:
764 unreachable("fs_visitor::emit_shader_time_end missing code");
765 }
766
767 /* Insert our code just before the final SEND with EOT. */
768 exec_node *end = this->instructions.get_tail();
769 assert(end && ((fs_inst *) end)->eot);
770
771 fs_inst *tm_read;
772 fs_reg shader_end_time = get_timestamp(&tm_read);
773 end->insert_before(tm_read);
774
775 /* Check that there weren't any timestamp reset events (assuming these
776 * were the only two timestamp reads that happened).
777 */
778 fs_reg reset = shader_end_time;
779 reset.set_smear(2);
780 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
781 test->conditional_mod = BRW_CONDITIONAL_Z;
782 test->force_writemask_all = true;
783 end->insert_before(test);
784 end->insert_before(IF(BRW_PREDICATE_NORMAL));
785
786 fs_reg start = shader_start_time;
787 start.negate = true;
788 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
789 diff.set_smear(0);
790 fs_inst *add = ADD(diff, start, shader_end_time);
791 add->force_writemask_all = true;
792 end->insert_before(add);
793
794 /* If there were no instructions between the two timestamp gets, the diff
795 * is 2 cycles. Remove that overhead, so I can forget about that when
796 * trying to determine the time taken for single instructions.
797 */
798 add = ADD(diff, diff, fs_reg(-2u));
799 add->force_writemask_all = true;
800 end->insert_before(add);
801
802 end->insert_before(SHADER_TIME_ADD(type, diff));
803 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
804 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
805 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
806 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
807 }
808
809 fs_inst *
810 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
811 {
812 int shader_time_index =
813 brw_get_shader_time_index(brw, shader_prog, prog, type);
814 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
815
816 fs_reg payload;
817 if (dispatch_width == 8)
818 payload = vgrf(glsl_type::uvec2_type);
819 else
820 payload = vgrf(glsl_type::uint_type);
821
822 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
823 fs_reg(), payload, offset, value);
824 }
825
826 void
827 fs_visitor::vfail(const char *format, va_list va)
828 {
829 char *msg;
830
831 if (failed)
832 return;
833
834 failed = true;
835
836 msg = ralloc_vasprintf(mem_ctx, format, va);
837 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
838
839 this->fail_msg = msg;
840
841 if (debug_enabled) {
842 fprintf(stderr, "%s", msg);
843 }
844 }
845
846 void
847 fs_visitor::fail(const char *format, ...)
848 {
849 va_list va;
850
851 va_start(va, format);
852 vfail(format, va);
853 va_end(va);
854 }
855
856 /**
857 * Mark this program as impossible to compile in SIMD16 mode.
858 *
859 * During the SIMD8 compile (which happens first), we can detect and flag
860 * things that are unsupported in SIMD16 mode, so the compiler can skip
861 * the SIMD16 compile altogether.
862 *
863 * During a SIMD16 compile (if one happens anyway), this just calls fail().
864 */
865 void
866 fs_visitor::no16(const char *format, ...)
867 {
868 va_list va;
869
870 va_start(va, format);
871
872 if (dispatch_width == 16) {
873 vfail(format, va);
874 } else {
875 simd16_unsupported = true;
876
877 if (brw->perf_debug) {
878 if (no16_msg)
879 ralloc_vasprintf_append(&no16_msg, format, va);
880 else
881 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
882 }
883 }
884
885 va_end(va);
886 }
887
888 fs_inst *
889 fs_visitor::emit(enum opcode opcode)
890 {
891 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
892 }
893
894 fs_inst *
895 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
896 {
897 return emit(new(mem_ctx) fs_inst(opcode, dst));
898 }
899
900 fs_inst *
901 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
902 {
903 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
904 }
905
906 fs_inst *
907 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
908 const fs_reg &src1)
909 {
910 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
911 }
912
913 fs_inst *
914 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
915 const fs_reg &src1, const fs_reg &src2)
916 {
917 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
918 }
919
920 fs_inst *
921 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
922 fs_reg src[], int sources)
923 {
924 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
925 }
926
927 /**
928 * Returns true if the instruction has a flag that means it won't
929 * update an entire destination register.
930 *
931 * For example, dead code elimination and live variable analysis want to know
932 * when a write to a variable screens off any preceding values that were in
933 * it.
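*
* Concretely (per the checks below), a write is partial when it is
* predicated (other than by a SEL), when it covers less than a full 32-byte
* register (e.g. an 8-wide word-sized destination), or when the destination
* is not contiguous.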
934 */
935 bool
936 fs_inst::is_partial_write() const
937 {
938 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
939 (this->dst.width * type_sz(this->dst.type)) < 32 ||
940 !this->dst.is_contiguous());
941 }
942
943 int
944 fs_inst::regs_read(int arg) const
945 {
946 if (is_tex() && arg == 0 && src[0].file == GRF) {
947 return mlen;
948 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
949 return mlen;
950 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
951 return mlen;
952 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
953 return mlen;
954 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
955 return mlen;
956 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
957 return mlen;
958 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
959 return mlen;
960 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
961 return mlen;
962 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
963 return mlen;
964 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
965 return mlen;
966 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
967 return exec_size / 4;
968 }
969
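/* Otherwise the footprint follows from the register region itself: width *
 * stride * type size bytes, rounded up to whole registers (a stride of 0
 * counts as a single register).  E.g. a 16-wide float source with stride 1
 * spans 16 * 1 * 4 = 64 bytes, i.e. two registers.
 */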
970 switch (src[arg].file) {
971 case BAD_FILE:
972 case UNIFORM:
973 case IMM:
974 return 1;
975 case GRF:
976 case HW_REG:
977 if (src[arg].stride == 0) {
978 return 1;
979 } else {
980 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
981 return (size + 31) / 32;
982 }
983 case MRF:
984 unreachable("MRF registers are not allowed as sources");
985 default:
986 unreachable("Invalid register file");
987 }
988 }
989
990 bool
991 fs_inst::reads_flag() const
992 {
993 return predicate;
994 }
995
996 bool
997 fs_inst::writes_flag() const
998 {
999 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
1000 opcode != BRW_OPCODE_IF &&
1001 opcode != BRW_OPCODE_WHILE)) ||
1002 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
1003 }
1004
1005 /**
1006 * Returns how many MRFs an FS opcode will write over.
1007 *
1008 * Note that this is not the 0 or 1 implied writes in an actual gen
1009 * instruction -- the FS opcodes often generate MOVs in addition.
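*
* The counts below mirror the message setup in the visitor; e.g. a SIMD16
* POW takes two operands and so implies 2 * 16 / 8 = 4 MRF writes, while
* sampler messages count as a single MRF write here.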
1010 */
1011 int
1012 fs_visitor::implied_mrf_writes(fs_inst *inst)
1013 {
1014 if (inst->mlen == 0)
1015 return 0;
1016
1017 if (inst->base_mrf == -1)
1018 return 0;
1019
1020 switch (inst->opcode) {
1021 case SHADER_OPCODE_RCP:
1022 case SHADER_OPCODE_RSQ:
1023 case SHADER_OPCODE_SQRT:
1024 case SHADER_OPCODE_EXP2:
1025 case SHADER_OPCODE_LOG2:
1026 case SHADER_OPCODE_SIN:
1027 case SHADER_OPCODE_COS:
1028 return 1 * dispatch_width / 8;
1029 case SHADER_OPCODE_POW:
1030 case SHADER_OPCODE_INT_QUOTIENT:
1031 case SHADER_OPCODE_INT_REMAINDER:
1032 return 2 * dispatch_width / 8;
1033 case SHADER_OPCODE_TEX:
1034 case FS_OPCODE_TXB:
1035 case SHADER_OPCODE_TXD:
1036 case SHADER_OPCODE_TXF:
1037 case SHADER_OPCODE_TXF_CMS:
1038 case SHADER_OPCODE_TXF_MCS:
1039 case SHADER_OPCODE_TG4:
1040 case SHADER_OPCODE_TG4_OFFSET:
1041 case SHADER_OPCODE_TXL:
1042 case SHADER_OPCODE_TXS:
1043 case SHADER_OPCODE_LOD:
1044 return 1;
1045 case FS_OPCODE_FB_WRITE:
1046 return 2;
1047 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1048 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1049 return 1;
1050 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1051 return inst->mlen;
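/* Scratch writes send their entire payload through MRFs (a message header
 * plus the data to be written), so count every message register rather than
 * just one; for a SIMD16 gen4 scratch write that is presumably mlen == 3.
 */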
1052 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1053 return inst->mlen;
1054 case SHADER_OPCODE_UNTYPED_ATOMIC:
1055 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1056 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1057 case SHADER_OPCODE_TYPED_ATOMIC:
1058 case SHADER_OPCODE_TYPED_SURFACE_READ:
1059 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1060 case SHADER_OPCODE_URB_WRITE_SIMD8:
1061 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1062 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1063 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1064 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1065 return 0;
1066 default:
1067 unreachable("not reached");
1068 }
1069 }
1070
1071 fs_reg
1072 fs_visitor::vgrf(const glsl_type *const type)
1073 {
1074 int reg_width = dispatch_width / 8;
1075 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1076 brw_type_for_base_type(type), dispatch_width);
1077 }
1078
1079 fs_reg
1080 fs_visitor::vgrf(int num_components)
1081 {
1082 int reg_width = dispatch_width / 8;
1083 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1084 BRW_REGISTER_TYPE_F, dispatch_width);
1085 }
1086
1087 /** Fixed HW reg constructor. */
1088 fs_reg::fs_reg(enum register_file file, int reg)
1089 {
1090 init();
1091 this->file = file;
1092 this->reg = reg;
1093 this->type = BRW_REGISTER_TYPE_F;
1094
1095 switch (file) {
1096 case UNIFORM:
1097 this->width = 1;
1098 break;
1099 default:
1100 this->width = 8;
1101 }
1102 }
1103
1104 /** Fixed HW reg constructor. */
1105 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1106 {
1107 init();
1108 this->file = file;
1109 this->reg = reg;
1110 this->type = type;
1111
1112 switch (file) {
1113 case UNIFORM:
1114 this->width = 1;
1115 break;
1116 default:
1117 this->width = 8;
1118 }
1119 }
1120
1121 /** Fixed HW reg constructor. */
1122 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1123 uint8_t width)
1124 {
1125 init();
1126 this->file = file;
1127 this->reg = reg;
1128 this->type = type;
1129 this->width = width;
1130 }
1131
1132 fs_reg *
1133 fs_visitor::variable_storage(ir_variable *var)
1134 {
1135 return (fs_reg *)hash_table_find(this->variable_ht, var);
1136 }
1137
1138 void
1139 import_uniforms_callback(const void *key,
1140 void *data,
1141 void *closure)
1142 {
1143 struct hash_table *dst_ht = (struct hash_table *)closure;
1144 const fs_reg *reg = (const fs_reg *)data;
1145
1146 if (reg->file != UNIFORM)
1147 return;
1148
1149 hash_table_insert(dst_ht, data, key);
1150 }
1151
1152 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1153 * This brings in those uniform definitions.
1154 */
1155 void
1156 fs_visitor::import_uniforms(fs_visitor *v)
1157 {
1158 hash_table_call_foreach(v->variable_ht,
1159 import_uniforms_callback,
1160 variable_ht);
1161 this->push_constant_loc = v->push_constant_loc;
1162 this->pull_constant_loc = v->pull_constant_loc;
1163 this->uniforms = v->uniforms;
1164 this->param_size = v->param_size;
1165 }
1166
1167 /* Our support for uniforms is piggy-backed on the struct
1168 * gl_fragment_program, because that's where the values actually
1169 * get stored, rather than in some global gl_shader_program uniform
1170 * store.
1171 */
1172 void
1173 fs_visitor::setup_uniform_values(ir_variable *ir)
1174 {
1175 int namelen = strlen(ir->name);
1176
1177 /* The data for our (non-builtin) uniforms is stored in a series of
1178 * gl_uniform_driver_storage structs for each subcomponent that
1179 * glGetUniformLocation() could name. We know it's been set up in the same
1180 * order we'd walk the type, so walk the list of storage and find anything
1181 * with our name, or the prefix of a component that starts with our name.
1182 */
1183 unsigned params_before = uniforms;
1184 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1185 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1186
1187 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1188 (storage->name[namelen] != 0 &&
1189 storage->name[namelen] != '.' &&
1190 storage->name[namelen] != '[')) {
1191 continue;
1192 }
1193
1194 unsigned slots = storage->type->component_slots();
1195 if (storage->array_elements)
1196 slots *= storage->array_elements;
1197
1198 for (unsigned i = 0; i < slots; i++) {
1199 stage_prog_data->param[uniforms++] = &storage->storage[i];
1200 }
1201 }
1202
1203 /* Make sure we actually initialized the right amount of stuff here. */
1204 assert(params_before + ir->type->component_slots() == uniforms);
1205 (void)params_before;
1206 }
1207
1208
1209 /* Our support for builtin uniforms is even scarier than non-builtin.
1210 * It sits on top of the PROG_STATE_VAR parameters that are
1211 * automatically updated from GL context state.
1212 */
1213 void
1214 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1215 {
1216 const ir_state_slot *const slots = ir->get_state_slots();
1217 assert(slots != NULL);
1218
1219 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1220 /* This state reference has already been setup by ir_to_mesa, but we'll
1221 * get the same index back here.
1222 */
1223 int index = _mesa_add_state_reference(this->prog->Parameters,
1224 (gl_state_index *)slots[i].tokens);
1225
1226 /* Add each of the unique swizzles of the element as a parameter.
1227 * This'll end up matching the expected layout of the
1228 * array/matrix/structure we're trying to fill in.
1229 */
1230 int last_swiz = -1;
1231 for (unsigned int j = 0; j < 4; j++) {
1232 int swiz = GET_SWZ(slots[i].swizzle, j);
1233 if (swiz == last_swiz)
1234 break;
1235 last_swiz = swiz;
1236
1237 stage_prog_data->param[uniforms++] =
1238 &prog->Parameters->ParameterValues[index][swiz];
1239 }
1240 }
1241 }
1242
1243 fs_reg *
1244 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1245 bool origin_upper_left)
1246 {
1247 assert(stage == MESA_SHADER_FRAGMENT);
1248 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1249 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1250 fs_reg wpos = *reg;
1251 bool flip = !origin_upper_left ^ key->render_to_fbo;
1252
1253 /* gl_FragCoord.x */
1254 if (pixel_center_integer) {
1255 emit(MOV(wpos, this->pixel_x));
1256 } else {
1257 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1258 }
1259 wpos = offset(wpos, 1);
1260
1261 /* gl_FragCoord.y */
1262 if (!flip && pixel_center_integer) {
1263 emit(MOV(wpos, this->pixel_y));
1264 } else {
1265 fs_reg pixel_y = this->pixel_y;
1266 float offset = (pixel_center_integer ? 0.0 : 0.5);
1267
1268 if (flip) {
1269 pixel_y.negate = true;
1270 offset += key->drawable_height - 1.0;
1271 }
1272
1273 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1274 }
1275 wpos = offset(wpos, 1);
1276
1277 /* gl_FragCoord.z */
1278 if (devinfo->gen >= 6) {
1279 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1280 } else {
1281 emit(FS_OPCODE_LINTERP, wpos,
1282 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1283 interp_reg(VARYING_SLOT_POS, 2));
1284 }
1285 wpos = offset(wpos, 1);
1286
1287 /* gl_FragCoord.w: Already set up in emit_interpolation */
1288 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1289
1290 return reg;
1291 }
1292
1293 fs_inst *
1294 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1295 glsl_interp_qualifier interpolation_mode,
1296 bool is_centroid, bool is_sample)
1297 {
1298 brw_wm_barycentric_interp_mode barycoord_mode;
1299 if (devinfo->gen >= 6) {
1300 if (is_centroid) {
1301 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1302 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1303 else
1304 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1305 } else if (is_sample) {
1306 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1307 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1308 else
1309 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1310 } else {
1311 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1312 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1313 else
1314 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1315 }
1316 } else {
1317 /* On Ironlake and below, there is only one interpolation mode.
1318 * Centroid interpolation doesn't mean anything on this hardware --
1319 * there is no multisampling.
1320 */
1321 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1322 }
1323 return emit(FS_OPCODE_LINTERP, attr,
1324 this->delta_xy[barycoord_mode], interp);
1325 }
1326
1327 void
1328 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1329 const glsl_type *type,
1330 glsl_interp_qualifier interpolation_mode,
1331 int location, bool mod_centroid,
1332 bool mod_sample)
1333 {
1334 attr.type = brw_type_for_base_type(type->get_scalar_type());
1335
1336 assert(stage == MESA_SHADER_FRAGMENT);
1337 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1338 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1339
1340 unsigned int array_elements;
1341
1342 if (type->is_array()) {
1343 array_elements = type->length;
1344 if (array_elements == 0) {
1345 fail("dereferenced array '%s' has length 0\n", name);
1346 }
1347 type = type->fields.array;
1348 } else {
1349 array_elements = 1;
1350 }
1351
1352 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1353 bool is_gl_Color =
1354 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1355 if (key->flat_shade && is_gl_Color) {
1356 interpolation_mode = INTERP_QUALIFIER_FLAT;
1357 } else {
1358 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1359 }
1360 }
1361
1362 for (unsigned int i = 0; i < array_elements; i++) {
1363 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1364 if (prog_data->urb_setup[location] == -1) {
1365 /* If there's no incoming setup data for this slot, don't
1366 * emit interpolation for it.
1367 */
1368 attr = offset(attr, type->vector_elements);
1369 location++;
1370 continue;
1371 }
1372
1373 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1374 /* Constant interpolation (flat shading) case. The SF has
1375 * handed us defined values in only the constant offset
1376 * field of the setup reg.
1377 */
1378 for (unsigned int k = 0; k < type->vector_elements; k++) {
1379 struct brw_reg interp = interp_reg(location, k);
1380 interp = suboffset(interp, 3);
1381 interp.type = attr.type;
1382 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1383 attr = offset(attr, 1);
1384 }
1385 } else {
1386 /* Smooth/noperspective interpolation case. */
1387 for (unsigned int k = 0; k < type->vector_elements; k++) {
1388 struct brw_reg interp = interp_reg(location, k);
1389 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1390 /* Get the pixel/sample mask into f0 so that we know
1391 * which pixels are lit. Then, for each channel that is
1392 * unlit, replace the centroid data with non-centroid
1393 * data.
1394 */
1395 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1396
1397 fs_inst *inst;
1398 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1399 false, false);
1400 inst->predicate = BRW_PREDICATE_NORMAL;
1401 inst->predicate_inverse = true;
1402 if (devinfo->has_pln)
1403 inst->no_dd_clear = true;
1404
1405 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1406 mod_centroid && !key->persample_shading,
1407 mod_sample || key->persample_shading);
1408 inst->predicate = BRW_PREDICATE_NORMAL;
1409 inst->predicate_inverse = false;
1410 if (devinfo->has_pln)
1411 inst->no_dd_check = true;
1412
1413 } else {
1414 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1415 mod_centroid && !key->persample_shading,
1416 mod_sample || key->persample_shading);
1417 }
1418 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1419 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1420 }
1421 attr = offset(attr, 1);
1422 }
1423
1424 }
1425 location++;
1426 }
1427 }
1428 }
1429
1430 fs_reg *
1431 fs_visitor::emit_frontfacing_interpolation()
1432 {
1433 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1434
1435 if (devinfo->gen >= 6) {
1436 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1437 * a boolean result from this (~0/true or 0/false).
1438 *
1439 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1440 * this task in only one instruction:
1441 * - a negation source modifier will flip the bit; and
1442 * - a W -> D type conversion will sign extend the bit into the high
1443 * word of the destination.
1444 *
1445 * An ASR 15 fills the low word of the destination.
1446 */
1447 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1448 g0.negate = true;
1449
1450 emit(ASR(*reg, g0, fs_reg(15)));
1451 } else {
1452 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1453 * a boolean result from this (1/true or 0/false).
1454 *
1455 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1456 * the negation source modifier to flip it. Unfortunately the SHR
1457 * instruction only operates on UD (or D with an abs source modifier)
1458 * sources without negation.
1459 *
1460 * Instead, use ASR (which will give ~0/true or 0/false).
1461 */
1462 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1463 g1_6.negate = true;
1464
1465 emit(ASR(*reg, g1_6, fs_reg(31)));
1466 }
1467
1468 return reg;
1469 }
1470
1471 void
1472 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1473 {
1474 assert(stage == MESA_SHADER_FRAGMENT);
1475 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1476 assert(dst.type == BRW_REGISTER_TYPE_F);
1477
1478 if (key->compute_pos_offset) {
1479 /* Convert int_sample_pos to floating point */
1480 emit(MOV(dst, int_sample_pos));
1481 /* Scale to the range [0, 1] */
1482 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1483 }
1484 else {
1485 /* From the ARB_sample_shading specification:
1486 * "When rendering to a non-multisample buffer, or if multisample
1487 * rasterization is disabled, gl_SamplePosition will always be
1488 * (0.5, 0.5)."
1489 */
1490 emit(MOV(dst, fs_reg(0.5f)));
1491 }
1492 }
1493
1494 fs_reg *
1495 fs_visitor::emit_samplepos_setup()
1496 {
1497 assert(devinfo->gen >= 6);
1498
1499 this->current_annotation = "compute sample position";
1500 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1501 fs_reg pos = *reg;
1502 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1503 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1504
1505 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1506 * mode will be enabled.
1507 *
1508 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1509 * R31.1:0 Position Offset X/Y for Slot[3:0]
1510 * R31.3:2 Position Offset X/Y for Slot[7:4]
1511 * .....
1512 *
1513 * The X, Y sample positions come in as bytes in thread payload. So, read
1514 * the positions using vstride=16, width=8, hstride=2.
1515 */
1516 struct brw_reg sample_pos_reg =
1517 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1518 BRW_REGISTER_TYPE_B), 16, 8, 2);
1519
1520 if (dispatch_width == 8) {
1521 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1522 } else {
1523 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1524 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1525 ->force_sechalf = true;
1526 }
1527 /* Compute gl_SamplePosition.x */
1528 compute_sample_position(pos, int_sample_x);
1529 pos = offset(pos, 1);
1530 if (dispatch_width == 8) {
1531 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1532 } else {
1533 emit(MOV(half(int_sample_y, 0),
1534 fs_reg(suboffset(sample_pos_reg, 1))));
1535 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1536 ->force_sechalf = true;
1537 }
1538 /* Compute gl_SamplePosition.y */
1539 compute_sample_position(pos, int_sample_y);
1540 return reg;
1541 }
1542
1543 fs_reg *
1544 fs_visitor::emit_sampleid_setup()
1545 {
1546 assert(stage == MESA_SHADER_FRAGMENT);
1547 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1548 assert(devinfo->gen >= 6);
1549
1550 this->current_annotation = "compute sample id";
1551 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1552
1553 if (key->compute_sample_id) {
1554 fs_reg t1 = vgrf(glsl_type::int_type);
1555 fs_reg t2 = vgrf(glsl_type::int_type);
1556 t2.type = BRW_REGISTER_TYPE_UW;
1557
1558 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1559 * 8x multisampling, subspan 0 will represent sample N (where N
1560 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1561 * 7. We can find the value of N by looking at R0.0 bits 7:6
1562 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1563 * (since samples are always delivered in pairs). That is, we
1564 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1565 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1566 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1567 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1568 * populating a temporary variable with the sequence (0, 1, 2, 3),
1569 * and then reading from it using vstride=1, width=4, hstride=0.
1570 * These computations hold good for 4x multisampling as well.
1571 *
1572 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1573 * the first four slots are sample 0 of subspan 0; the next four
1574 * are sample 1 of subspan 0; the third group is sample 0 of
1575 * subspan 1, and finally sample 1 of subspan 1.
1576 */
1577 fs_inst *inst;
1578 inst = emit(BRW_OPCODE_AND, t1,
1579 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1580 fs_reg(0xc0));
1581 inst->force_writemask_all = true;
1582 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1583 inst->force_writemask_all = true;
1584 /* This works for both SIMD8 and SIMD16 */
1585 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1586 inst->force_writemask_all = true;
1587 /* This special instruction takes care of setting vstride=1,
1588 * width=4, hstride=0 of t2 during an ADD instruction.
1589 */
1590 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1591 } else {
1592 /* As per the GL_ARB_sample_shading specification:
1593 * "When rendering to a non-multisample buffer, or if multisample
1594 * rasterization is disabled, gl_SampleID will always be zero."
1595 */
1596 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1597 }
1598
1599 return reg;
1600 }
1601
1602 void
1603 fs_visitor::resolve_source_modifiers(fs_reg *src)
1604 {
1605 if (!src->abs && !src->negate)
1606 return;
1607
1608 fs_reg temp = retype(vgrf(1), src->type);
1609 emit(MOV(temp, *src));
1610 *src = temp;
1611 }
1612
1613 fs_reg
1614 fs_visitor::fix_math_operand(fs_reg src)
1615 {
1616 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1617 * might be able to do better by doing execsize = 1 math and then
1618 * expanding that result out, but we would need to be careful with
1619 * masking.
1620 *
1621 * The hardware ignores source modifiers (negate and abs) on math
1622 * instructions, so we also move to a temp to set those up.
1623 */
1624 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1625 !src.abs && !src.negate)
1626 return src;
1627
1628 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1629 * operands to math
1630 */
1631 if (devinfo->gen >= 7 && src.file != IMM)
1632 return src;
1633
1634 fs_reg expanded = vgrf(glsl_type::float_type);
1635 expanded.type = src.type;
1636 emit(BRW_OPCODE_MOV, expanded, src);
1637 return expanded;
1638 }
1639
1640 fs_inst *
1641 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1642 {
1643 switch (opcode) {
1644 case SHADER_OPCODE_RCP:
1645 case SHADER_OPCODE_RSQ:
1646 case SHADER_OPCODE_SQRT:
1647 case SHADER_OPCODE_EXP2:
1648 case SHADER_OPCODE_LOG2:
1649 case SHADER_OPCODE_SIN:
1650 case SHADER_OPCODE_COS:
1651 break;
1652 default:
1653 unreachable("not reached: bad math opcode");
1654 }
1655
1656 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1657 * might be able to do better by doing execsize = 1 math and then
1658 * expanding that result out, but we would need to be careful with
1659 * masking.
1660 *
1661 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1662 * instructions, so we also move to a temp to set those up.
1663 */
1664 if (devinfo->gen == 6 || devinfo->gen == 7)
1665 src = fix_math_operand(src);
1666
1667 fs_inst *inst = emit(opcode, dst, src);
1668
1669 if (devinfo->gen < 6) {
1670 inst->base_mrf = 2;
1671 inst->mlen = dispatch_width / 8;
1672 }
1673
1674 return inst;
1675 }
1676
1677 fs_inst *
1678 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1679 {
1680 int base_mrf = 2;
1681 fs_inst *inst;
1682
1683 if (devinfo->gen >= 8) {
1684 inst = emit(opcode, dst, src0, src1);
1685 } else if (devinfo->gen >= 6) {
1686 src0 = fix_math_operand(src0);
1687 src1 = fix_math_operand(src1);
1688
1689 inst = emit(opcode, dst, src0, src1);
1690 } else {
1691 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1692 * "Message Payload":
1693 *
1694 * "Operand0[7]. For the INT DIV functions, this operand is the
1695 * denominator."
1696 * ...
1697 * "Operand1[7]. For the INT DIV functions, this operand is the
1698 * numerator."
1699 */
1700 bool is_int_div = opcode != SHADER_OPCODE_POW;
1701 fs_reg &op0 = is_int_div ? src1 : src0;
1702 fs_reg &op1 = is_int_div ? src0 : src1;
1703
1704 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1705 inst = emit(opcode, dst, op0, reg_null_f);
1706
1707 inst->base_mrf = base_mrf;
1708 inst->mlen = 2 * dispatch_width / 8;
1709 }
1710 return inst;
1711 }
1712
1713 void
1714 fs_visitor::emit_discard_jump()
1715 {
1716 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1717
1718 /* For performance, after a discard, jump to the end of the
1719 * shader if all relevant channels have been discarded.
1720 */
1721 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1722 discard_jump->flag_subreg = 1;
1723
1724 discard_jump->predicate = (dispatch_width == 8)
1725 ? BRW_PREDICATE_ALIGN1_ANY8H
1726 : BRW_PREDICATE_ALIGN1_ANY16H;
1727 discard_jump->predicate_inverse = true;
1728 }
1729
1730 void
1731 fs_visitor::assign_curb_setup()
1732 {
1733 if (dispatch_width == 8) {
1734 prog_data->dispatch_grf_start_reg = payload.num_regs;
1735 } else {
1736 if (stage == MESA_SHADER_FRAGMENT) {
1737 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1738 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1739 } else if (stage == MESA_SHADER_COMPUTE) {
1740 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1741 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1742 } else {
1743 unreachable("Unsupported shader type!");
1744 }
1745 }
1746
1747 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1748
1749 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1750 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1751 for (unsigned int i = 0; i < inst->sources; i++) {
1752 if (inst->src[i].file == UNIFORM) {
1753 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1754 int constant_nr;
1755 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1756 constant_nr = push_constant_loc[uniform_nr];
1757 } else {
1758 /* Section 5.11 of the OpenGL 4.1 spec says:
1759 * "Out-of-bounds reads return undefined values, which include
1760 * values from other variables of the active program or zero."
1761 * Just return the first push constant.
1762 */
1763 constant_nr = 0;
1764 }
1765
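/* Eight dword push constants fit in a GRF, so constant_nr / 8 picks the
 * CURBE register after the payload and constant_nr % 8 the channel within
 * it; e.g. constant_nr == 11 lands in channel 3 of the second CURBE
 * register.
 */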
1766 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1767 constant_nr / 8,
1768 constant_nr % 8);
1769
1770 inst->src[i].file = HW_REG;
1771 inst->src[i].fixed_hw_reg = byte_offset(
1772 retype(brw_reg, inst->src[i].type),
1773 inst->src[i].subreg_offset);
1774 }
1775 }
1776 }
1777 }
1778
1779 void
1780 fs_visitor::calculate_urb_setup()
1781 {
1782 assert(stage == MESA_SHADER_FRAGMENT);
1783 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1784 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1785
1786 memset(prog_data->urb_setup, -1,
1787 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1788
1789 int urb_next = 0;
1790 /* Figure out where each of the incoming setup attributes lands. */
1791 if (devinfo->gen >= 6) {
1792 if (_mesa_bitcount_64(prog->InputsRead &
1793 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1794 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1795 * first 16 varying inputs, so we can put them wherever we want.
1796 * Just put them in order.
1797 *
1798 * This is useful because it means that (a) inputs not used by the
1799 * fragment shader won't take up valuable register space, and (b) we
1800 * won't have to recompile the fragment shader if it gets paired with
1801 * a different vertex (or geometry) shader.
1802 */
1803 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1804 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1805 BITFIELD64_BIT(i)) {
1806 prog_data->urb_setup[i] = urb_next++;
1807 }
1808 }
1809 } else {
1810 /* We have enough input varyings that the SF/SBE pipeline stage can't
1811 * arbitrarily rearrange them to suit our whim; we have to put them
1812 * in an order that matches the output of the previous pipeline stage
1813 * (geometry or vertex shader).
1814 */
1815 struct brw_vue_map prev_stage_vue_map;
1816 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1817 key->input_slots_valid);
1818 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1819 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1820 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1821 slot++) {
1822 int varying = prev_stage_vue_map.slot_to_varying[slot];
1823 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1824 * unused.
1825 */
1826 if (varying != BRW_VARYING_SLOT_COUNT &&
1827 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1828 BITFIELD64_BIT(varying))) {
1829 prog_data->urb_setup[varying] = slot - first_slot;
1830 }
1831 }
1832 urb_next = prev_stage_vue_map.num_slots - first_slot;
1833 }
1834 } else {
1835 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1836 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1837 /* Point size is packed into the header, not as a general attribute */
1838 if (i == VARYING_SLOT_PSIZ)
1839 continue;
1840
1841 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1842 /* The back color slot is skipped when the front color is
1843 * also written to. In addition, some slots can be
1844 * written in the vertex shader and not read in the
1845 * fragment shader. So the register number must always be
1846 * incremented, mapped or not.
1847 */
1848 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1849 prog_data->urb_setup[i] = urb_next;
1850 urb_next++;
1851 }
1852 }
1853
1854 /*
1855 * It's an FS-only attribute, and we did interpolation for this attribute
1856 * in the SF thread. So, count it here, too.
1857 *
1858 * See compile_sf_prog() for more info.
1859 */
1860 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1861 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1862 }
1863
1864 prog_data->num_varying_inputs = urb_next;
1865 }
1866
1867 void
1868 fs_visitor::assign_urb_setup()
1869 {
1870 assert(stage == MESA_SHADER_FRAGMENT);
1871 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1872
1873 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1874
1875 /* Offset all the urb_setup[] index by the actual position of the
1876 * setup regs, now that the location of the constants has been chosen.
1877 */
1878 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1879 if (inst->opcode == FS_OPCODE_LINTERP) {
1880 assert(inst->src[1].file == HW_REG);
1881 inst->src[1].fixed_hw_reg.nr += urb_start;
1882 }
1883
1884 if (inst->opcode == FS_OPCODE_CINTERP) {
1885 assert(inst->src[0].file == HW_REG);
1886 inst->src[0].fixed_hw_reg.nr += urb_start;
1887 }
1888 }
1889
1890 /* Each attribute is 4 setup channels, each of which is half a reg. */
1891 this->first_non_payload_grf =
1892 urb_start + prog_data->num_varying_inputs * 2;
1893 }
1894
1895 void
1896 fs_visitor::assign_vs_urb_setup()
1897 {
1898 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1899 int grf, count, slot, channel, attr;
1900
1901 assert(stage == MESA_SHADER_VERTEX);
1902 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1903 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1904 count++;
1905
1906 /* Each attribute is 4 regs. */
1907 this->first_non_payload_grf =
1908 payload.num_regs + prog_data->curb_read_length + count * 4;
1909
1910 unsigned vue_entries =
1911 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1912
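/* urb_entry_size is counted in units of four slots (64 bytes) and the URB
 * read length in pairs of slots, which is why both are rounded up here.
 */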
1913 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1914 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1915
1916 assert(vs_prog_data->base.urb_read_length <= 15);
1917
1918 /* Rewrite all ATTR file references to the hw grf that they land in. */
1919 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1920 for (int i = 0; i < inst->sources; i++) {
1921 if (inst->src[i].file == ATTR) {
1922
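/* VERT_ATTRIB_MAX is the synthetic slot used for gl_VertexID/gl_InstanceID,
 * which is laid out after all of the real vertex attributes.
 */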
1923 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1924 slot = count - 1;
1925 } else {
1926 /* Attributes arrive in a contiguous block, ordered by their
1927 * gl_vert_attrib value. That means we can compute the slot
1928 * number for an attribute by masking the enabled attributes
1929 * down to those before it and counting the bits.
1930 */
1931 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1932 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1933 BITFIELD64_MASK(attr));
1934 }
1935
1936 channel = inst->src[i].reg_offset & 3;
1937
1938 grf = payload.num_regs +
1939 prog_data->curb_read_length +
1940 slot * 4 + channel;
1941
1942 inst->src[i].file = HW_REG;
1943 inst->src[i].fixed_hw_reg =
1944 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1945 }
1946 }
1947 }
1948 }
1949
1950 /**
1951 * Split large virtual GRFs into separate components if we can.
1952 *
1953 * This is mostly duplicated with what brw_fs_vector_splitting does,
1954 * but that's really conservative because it's afraid of doing
1955 * splitting that doesn't result in real progress after the rest of
1956 * the optimization phases, which would cause infinite looping in
1957 * optimization. We can do it once here, safely. This also has the
1958 * opportunity to split interpolated values, or maybe even uniforms,
1959 * which we don't have at the IR level.
1960 *
1961 * We want to split, because virtual GRFs are what we register
1962 * allocate and spill (due to contiguousness requirements for some
1963 * instructions), and they're what we naturally generate in the
1964 * codegen process, but most virtual GRFs don't actually need to be
1965 * contiguous sets of GRFs. If we split, we'll end up with reduced
1966 * live intervals and better dead code elimination and coalescing.
1967 */
1968 void
1969 fs_visitor::split_virtual_grfs()
1970 {
1971 int num_vars = this->alloc.count;
1972
1973 /* Count the total number of registers */
1974 int reg_count = 0;
1975 int vgrf_to_reg[num_vars];
1976 for (int i = 0; i < num_vars; i++) {
1977 vgrf_to_reg[i] = reg_count;
1978 reg_count += alloc.sizes[i];
1979 }
1980
1981 /* An array of "split points". For each register slot, this indicates
1982 * if this slot can be separated from the previous slot. Every time an
1983 * instruction uses multiple elements of a register (as a source or
1984 * destination), we mark the used slots as inseparable. Then we go
1985 * through and split the registers into the smallest pieces we can.
1986 */
1987 bool split_points[reg_count];
1988 memset(split_points, 0, sizeof(split_points));
1989
1990 /* Mark all used registers as fully splittable */
1991 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1992 if (inst->dst.file == GRF) {
1993 int reg = vgrf_to_reg[inst->dst.reg];
1994 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1995 split_points[reg + j] = true;
1996 }
1997
1998 for (int i = 0; i < inst->sources; i++) {
1999 if (inst->src[i].file == GRF) {
2000 int reg = vgrf_to_reg[inst->src[i].reg];
2001 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
2002 split_points[reg + j] = true;
2003 }
2004 }
2005 }
2006
2007 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2008 if (inst->dst.file == GRF) {
2009 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2010 for (int j = 1; j < inst->regs_written; j++)
2011 split_points[reg + j] = false;
2012 }
2013 for (int i = 0; i < inst->sources; i++) {
2014 if (inst->src[i].file == GRF) {
2015 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2016 for (int j = 1; j < inst->regs_read(i); j++)
2017 split_points[reg + j] = false;
2018 }
2019 }
2020 }
2021
2022 int new_virtual_grf[reg_count];
2023 int new_reg_offset[reg_count];
2024
2025 int reg = 0;
2026 for (int i = 0; i < num_vars; i++) {
2027 /* As a quick sanity check, the first slot of a VGRF should never be a split point. */
2028 assert(split_points[reg] == false);
2029
2030 /* j = 0 case */
2031 new_reg_offset[reg] = 0;
2032 reg++;
2033 int offset = 1;
2034
2035 /* j > 0 case */
2036 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2037 /* If this is a split point, allocate a new virtual GRF for the
2038 * preceding `offset` registers and reset the offset to 0.
2039 */
2040 if (split_points[reg]) {
2041 assert(offset <= MAX_VGRF_SIZE);
2042 int grf = alloc.allocate(offset);
2043 for (int k = reg - offset; k < reg; k++)
2044 new_virtual_grf[k] = grf;
2045 offset = 0;
2046 }
2047 new_reg_offset[reg] = offset;
2048 offset++;
2049 reg++;
2050 }
2051
2052 /* The last one gets the original register number */
2053 assert(offset <= MAX_VGRF_SIZE);
2054 alloc.sizes[i] = offset;
2055 for (int k = reg - offset; k < reg; k++)
2056 new_virtual_grf[k] = i;
2057 }
2058 assert(reg == reg_count);
2059
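/* Finally, rewrite every GRF destination and source to point at its new
 * (smaller) virtual GRF and the offset within it.
 */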
2060 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2061 if (inst->dst.file == GRF) {
2062 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2063 inst->dst.reg = new_virtual_grf[reg];
2064 inst->dst.reg_offset = new_reg_offset[reg];
2065 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2066 }
2067 for (int i = 0; i < inst->sources; i++) {
2068 if (inst->src[i].file == GRF) {
2069 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2070 inst->src[i].reg = new_virtual_grf[reg];
2071 inst->src[i].reg_offset = new_reg_offset[reg];
2072 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2073 }
2074 }
2075 }
2076 invalidate_live_intervals();
2077 }
2078
2079 /**
2080 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2081 *
2082 * During code generation, we create tons of temporary variables, many of
2083 * which get immediately killed and are never used again. Yet, in later
2084 * optimization and analysis passes, such as compute_live_intervals, we need
2085 * to loop over all the virtual GRFs. Compacting them can save a lot of
2086 * overhead.
2087 */
2088 bool
2089 fs_visitor::compact_virtual_grfs()
2090 {
2091 bool progress = false;
2092 int remap_table[this->alloc.count];
2093 memset(remap_table, -1, sizeof(remap_table));
2094
2095 /* Mark which virtual GRFs are used. */
2096 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2097 if (inst->dst.file == GRF)
2098 remap_table[inst->dst.reg] = 0;
2099
2100 for (int i = 0; i < inst->sources; i++) {
2101 if (inst->src[i].file == GRF)
2102 remap_table[inst->src[i].reg] = 0;
2103 }
2104 }
2105
2106 /* Compact the GRF arrays. */
2107 int new_index = 0;
2108 for (unsigned i = 0; i < this->alloc.count; i++) {
2109 if (remap_table[i] == -1) {
2110 /* We just found an unused register. This means that we are
2111 * actually going to compact something.
2112 */
2113 progress = true;
2114 } else {
2115 remap_table[i] = new_index;
2116 alloc.sizes[new_index] = alloc.sizes[i];
2117 invalidate_live_intervals();
2118 ++new_index;
2119 }
2120 }
2121
2122 this->alloc.count = new_index;
2123
2124 /* Patch all the instructions to use the newly renumbered registers */
2125 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2126 if (inst->dst.file == GRF)
2127 inst->dst.reg = remap_table[inst->dst.reg];
2128
2129 for (int i = 0; i < inst->sources; i++) {
2130 if (inst->src[i].file == GRF)
2131 inst->src[i].reg = remap_table[inst->src[i].reg];
2132 }
2133 }
2134
2135 /* Patch all the references to delta_xy, since they're used in register
2136 * allocation. If they're unused, switch them to BAD_FILE so we don't
2137 * think some random VGRF is delta_xy.
2138 */
2139 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2140 if (delta_xy[i].file == GRF) {
2141 if (remap_table[delta_xy[i].reg] != -1) {
2142 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2143 } else {
2144 delta_xy[i].file = BAD_FILE;
2145 }
2146 }
2147 }
2148
2149 return progress;
2150 }
2151
2152 /*
2153 * Implements array access of uniforms by inserting a
2154 * PULL_CONSTANT_LOAD instruction.
2155 *
2156 * Unlike temporary GRF array access (where we don't support it due to
2157 * the difficulty of doing relative addressing on instruction
2158 * destinations), we could potentially do array access of uniforms
2159 * that were loaded in GRF space as push constants. In real-world
2160 * usage we've seen, though, the arrays being used are always larger
2161 * than we could load as push constants, so just always move all
2162 * uniform array access out to a pull constant buffer.
2163 */
2164 void
2165 fs_visitor::move_uniform_array_access_to_pull_constants()
2166 {
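/* As with assign_constant_locations(), only the SIMD8 compile gets to decide
 * where uniforms live; the SIMD16 compile reuses the locations it chose.
 */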
2167 if (dispatch_width != 8)
2168 return;
2169
2170 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2171 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2172
2173 /* Walk through and find array access of uniforms. Put a copy of that
2174 * uniform in the pull constant buffer.
2175 *
2176 * Note that we don't move constant-indexed accesses to arrays. No
2177 * testing has been done of the performance impact of this choice.
2178 */
2179 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2180 for (int i = 0 ; i < inst->sources; i++) {
2181 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2182 continue;
2183
2184 int uniform = inst->src[i].reg;
2185
2186 /* If this array isn't already present in the pull constant buffer,
2187 * add it.
2188 */
2189 if (pull_constant_loc[uniform] == -1) {
2190 const gl_constant_value **values = &stage_prog_data->param[uniform];
2191
2192 assert(param_size[uniform]);
2193
2194 for (int j = 0; j < param_size[uniform]; j++) {
2195 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2196
2197 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2198 values[j];
2199 }
2200 }
2201 }
2202 }
2203 }
2204
2205 /**
2206 * Assign UNIFORM file registers to either push constants or pull constants.
2207 *
2208 * We allow a fragment shader to have more than the specified minimum
2209 * maximum number of fragment shader uniform components (64). If
2210 * there are too many of these, they'd fill up all of the register space.
2211 * So, this will push some of them out to the pull constant buffer and
2212 * update the program to load them.
2213 */
2214 void
2215 fs_visitor::assign_constant_locations()
2216 {
2217 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2218 if (dispatch_width != 8)
2219 return;
2220
2221 /* Find which UNIFORM registers are still in use. */
2222 bool is_live[uniforms];
2223 for (unsigned int i = 0; i < uniforms; i++) {
2224 is_live[i] = false;
2225 }
2226
2227 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2228 for (int i = 0; i < inst->sources; i++) {
2229 if (inst->src[i].file != UNIFORM)
2230 continue;
2231
2232 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2233 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2234 is_live[constant_nr] = true;
2235 }
2236 }
2237
2238 /* Only allow 16 registers (128 uniform components) as push constants.
2239 *
2240 * Just demote the end of the list. We could probably do better
2241 * here, demoting things that are rarely used in the program first.
2242 *
2243 * If changing this value, note the limitation about total_regs in
2244 * brw_curbe.c.
2245 */
2246 unsigned int max_push_components = 16 * 8;
2247 unsigned int num_push_constants = 0;
2248
2249 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2250
2251 for (unsigned int i = 0; i < uniforms; i++) {
2252 if (!is_live[i] || pull_constant_loc[i] != -1) {
2253 /* This UNIFORM register is either dead, or has already been demoted
2254 * to a pull const. Mark it as no longer living in the param[] array.
2255 */
2256 push_constant_loc[i] = -1;
2257 continue;
2258 }
2259
2260 if (num_push_constants < max_push_components) {
2261 /* Retain as a push constant. Record the location in the params[]
2262 * array.
2263 */
2264 push_constant_loc[i] = num_push_constants++;
2265 } else {
2266 /* Demote to a pull constant. */
2267 push_constant_loc[i] = -1;
2268
2269 int pull_index = stage_prog_data->nr_pull_params++;
2270 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2271 pull_constant_loc[i] = pull_index;
2272 }
2273 }
2274
2275 stage_prog_data->nr_params = num_push_constants;
2276
2277 /* Up until now, the param[] array has been indexed by reg + reg_offset
2278 * of UNIFORM registers. Condense it to only contain the uniforms we
2279 * chose to upload as push constants.
2280 */
2281 for (unsigned int i = 0; i < uniforms; i++) {
2282 int remapped = push_constant_loc[i];
2283
2284 if (remapped == -1)
2285 continue;
2286
2287 assert(remapped <= (int)i);
2288 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2289 }
2290 }
2291
2292 /**
2293 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2294 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2295 */
2296 void
2297 fs_visitor::demote_pull_constants()
2298 {
2299 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2300 for (int i = 0; i < inst->sources; i++) {
2301 if (inst->src[i].file != UNIFORM)
2302 continue;
2303
2304 int pull_index;
2305 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2306 if (location >= uniforms) /* Out of bounds access */
2307 pull_index = -1;
2308 else
2309 pull_index = pull_constant_loc[location];
2310
2311 if (pull_index == -1)
2312 continue;
2313
2314 /* Set up the annotation tracking for new generated instructions. */
2315 base_ir = inst->ir;
2316 current_annotation = inst->annotation;
2317
2318 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2319 fs_reg dst = vgrf(glsl_type::float_type);
2320
2321 /* Generate a pull load into dst. */
2322 if (inst->src[i].reladdr) {
2323 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2324 surf_index,
2325 *inst->src[i].reladdr,
2326 pull_index);
2327 inst->insert_before(block, &list);
2328 inst->src[i].reladdr = NULL;
2329 } else {
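/* Uniform pull loads fetch an aligned vec4, so point the message at the
 * 16-byte block containing this constant and smear out the component we
 * actually want.
 */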
2330 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2331 fs_inst *pull =
2332 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2333 dst, surf_index, offset);
2334 inst->insert_before(block, pull);
2335 inst->src[i].set_smear(pull_index & 3);
2336 }
2337
2338 /* Rewrite the instruction to use the temporary VGRF. */
2339 inst->src[i].file = GRF;
2340 inst->src[i].reg = dst.reg;
2341 inst->src[i].reg_offset = 0;
2342 inst->src[i].width = dispatch_width;
2343 }
2344 }
2345 invalidate_live_intervals();
2346 }
2347
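/**
 * Perform simple algebraic simplifications: fold immediate operands, drop
 * multiplies and adds by identity or zero, and reduce degenerate SEL, LRP,
 * MAD and BROADCAST instructions to cheaper opcodes.
 */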
2348 bool
2349 fs_visitor::opt_algebraic()
2350 {
2351 bool progress = false;
2352
2353 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2354 switch (inst->opcode) {
2355 case BRW_OPCODE_MOV:
2356 if (inst->src[0].file != IMM)
2357 break;
2358
2359 if (inst->saturate) {
2360 if (inst->dst.type != inst->src[0].type)
2361 assert(!"unimplemented: saturate mixed types");
2362
2363 if (brw_saturate_immediate(inst->dst.type,
2364 &inst->src[0].fixed_hw_reg)) {
2365 inst->saturate = false;
2366 progress = true;
2367 }
2368 }
2369 break;
2370
2371 case BRW_OPCODE_MUL:
2372 if (inst->src[1].file != IMM)
2373 continue;
2374
2375 /* a * 1.0 = a */
2376 if (inst->src[1].is_one()) {
2377 inst->opcode = BRW_OPCODE_MOV;
2378 inst->src[1] = reg_undef;
2379 progress = true;
2380 break;
2381 }
2382
2383 /* a * -1.0 = -a */
2384 if (inst->src[1].is_negative_one()) {
2385 inst->opcode = BRW_OPCODE_MOV;
2386 inst->src[0].negate = !inst->src[0].negate;
2387 inst->src[1] = reg_undef;
2388 progress = true;
2389 break;
2390 }
2391
2392 /* a * 0.0 = 0.0 */
2393 if (inst->src[1].is_zero()) {
2394 inst->opcode = BRW_OPCODE_MOV;
2395 inst->src[0] = inst->src[1];
2396 inst->src[1] = reg_undef;
2397 progress = true;
2398 break;
2399 }
2400
2401 if (inst->src[0].file == IMM) {
2402 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2403 inst->opcode = BRW_OPCODE_MOV;
2404 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2405 inst->src[1] = reg_undef;
2406 progress = true;
2407 break;
2408 }
2409 break;
2410 case BRW_OPCODE_ADD:
2411 if (inst->src[1].file != IMM)
2412 continue;
2413
2414 /* a + 0.0 = a */
2415 if (inst->src[1].is_zero()) {
2416 inst->opcode = BRW_OPCODE_MOV;
2417 inst->src[1] = reg_undef;
2418 progress = true;
2419 break;
2420 }
2421
2422 if (inst->src[0].file == IMM) {
2423 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2424 inst->opcode = BRW_OPCODE_MOV;
2425 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2426 inst->src[1] = reg_undef;
2427 progress = true;
2428 break;
2429 }
2430 break;
2431 case BRW_OPCODE_OR:
2432 if (inst->src[0].equals(inst->src[1])) {
2433 inst->opcode = BRW_OPCODE_MOV;
2434 inst->src[1] = reg_undef;
2435 progress = true;
2436 break;
2437 }
2438 break;
2439 case BRW_OPCODE_LRP:
2440 if (inst->src[1].equals(inst->src[2])) {
2441 inst->opcode = BRW_OPCODE_MOV;
2442 inst->src[0] = inst->src[1];
2443 inst->src[1] = reg_undef;
2444 inst->src[2] = reg_undef;
2445 progress = true;
2446 break;
2447 }
2448 break;
2449 case BRW_OPCODE_CMP:
2450 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2451 inst->src[0].abs &&
2452 inst->src[0].negate &&
2453 inst->src[1].is_zero()) {
2454 inst->src[0].abs = false;
2455 inst->src[0].negate = false;
2456 inst->conditional_mod = BRW_CONDITIONAL_Z;
2457 progress = true;
2458 break;
2459 }
2460 break;
2461 case BRW_OPCODE_SEL:
2462 if (inst->src[0].equals(inst->src[1])) {
2463 inst->opcode = BRW_OPCODE_MOV;
2464 inst->src[1] = reg_undef;
2465 inst->predicate = BRW_PREDICATE_NONE;
2466 inst->predicate_inverse = false;
2467 progress = true;
2468 } else if (inst->saturate && inst->src[1].file == IMM) {
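/* With saturate, clamping to [0, 1] already subsumes a MIN against an
 * immediate >= 1.0 or a MAX against one <= 0.0, so the SEL reduces to a
 * saturating MOV.
 */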
2469 switch (inst->conditional_mod) {
2470 case BRW_CONDITIONAL_LE:
2471 case BRW_CONDITIONAL_L:
2472 switch (inst->src[1].type) {
2473 case BRW_REGISTER_TYPE_F:
2474 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2475 inst->opcode = BRW_OPCODE_MOV;
2476 inst->src[1] = reg_undef;
2477 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2478 progress = true;
2479 }
2480 break;
2481 default:
2482 break;
2483 }
2484 break;
2485 case BRW_CONDITIONAL_GE:
2486 case BRW_CONDITIONAL_G:
2487 switch (inst->src[1].type) {
2488 case BRW_REGISTER_TYPE_F:
2489 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2490 inst->opcode = BRW_OPCODE_MOV;
2491 inst->src[1] = reg_undef;
2492 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2493 progress = true;
2494 }
2495 break;
2496 default:
2497 break;
2498 }
2499 default:
2500 break;
2501 }
2502 }
2503 break;
2504 case BRW_OPCODE_MAD:
2505 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2506 inst->opcode = BRW_OPCODE_MOV;
2507 inst->src[1] = reg_undef;
2508 inst->src[2] = reg_undef;
2509 progress = true;
2510 } else if (inst->src[0].is_zero()) {
2511 inst->opcode = BRW_OPCODE_MUL;
2512 inst->src[0] = inst->src[2];
2513 inst->src[2] = reg_undef;
2514 progress = true;
2515 } else if (inst->src[1].is_one()) {
2516 inst->opcode = BRW_OPCODE_ADD;
2517 inst->src[1] = inst->src[2];
2518 inst->src[2] = reg_undef;
2519 progress = true;
2520 } else if (inst->src[2].is_one()) {
2521 inst->opcode = BRW_OPCODE_ADD;
2522 inst->src[2] = reg_undef;
2523 progress = true;
2524 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2525 inst->opcode = BRW_OPCODE_ADD;
2526 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2527 inst->src[2] = reg_undef;
2528 progress = true;
2529 }
2530 break;
2531 case SHADER_OPCODE_RCP: {
2532 fs_inst *prev = (fs_inst *)inst->prev;
2533 if (prev->opcode == SHADER_OPCODE_SQRT) {
2534 if (inst->src[0].equals(prev->dst)) {
2535 inst->opcode = SHADER_OPCODE_RSQ;
2536 inst->src[0] = prev->src[0];
2537 progress = true;
2538 }
2539 }
2540 break;
2541 }
2542 case SHADER_OPCODE_BROADCAST:
2543 if (is_uniform(inst->src[0])) {
2544 inst->opcode = BRW_OPCODE_MOV;
2545 inst->sources = 1;
2546 inst->force_writemask_all = true;
2547 progress = true;
2548 } else if (inst->src[1].file == IMM) {
2549 inst->opcode = BRW_OPCODE_MOV;
2550 inst->src[0] = component(inst->src[0],
2551 inst->src[1].fixed_hw_reg.dw1.ud);
2552 inst->sources = 1;
2553 inst->force_writemask_all = true;
2554 progress = true;
2555 }
2556 break;
2557
2558 default:
2559 break;
2560 }
2561
2562 /* Swap if src[0] is immediate. */
2563 if (progress && inst->is_commutative()) {
2564 if (inst->src[0].file == IMM) {
2565 fs_reg tmp = inst->src[1];
2566 inst->src[1] = inst->src[0];
2567 inst->src[0] = tmp;
2568 }
2569 }
2570 }
2571 return progress;
2572 }
2573
2574 /**
2575 * Optimize sample messages that have constant zero values for the trailing
2576 * texture coordinates. We can just reduce the message length for these
2577 * instructions instead of reserving a register for it. Trailing parameters
2578 * that aren't sent default to zero anyway. This will cause the dead code
2579 * eliminator to remove the MOV instruction that would otherwise be emitted to
2580 * set up the zero value.
2581 */
2582 bool
2583 fs_visitor::opt_zero_samples()
2584 {
2585 /* Gen4 infers the texturing opcode based on the message length so we can't
2586 * change it.
2587 */
2588 if (devinfo->gen < 5)
2589 return false;
2590
2591 bool progress = false;
2592
2593 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2594 if (!inst->is_tex())
2595 continue;
2596
2597 fs_inst *load_payload = (fs_inst *) inst->prev;
2598
2599 if (load_payload->is_head_sentinel() ||
2600 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2601 continue;
2602
2603 /* We don't want to remove the message header or the first parameter.
2604 * Removing the first parameter is not allowed; see the Haswell PRM
2605 * volume 7, page 149:
2606 *
2607 * "Parameter 0 is required except for the sampleinfo message, which
2608 * has no parameter 0"
2609 */
2610 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2611 load_payload->src[(inst->mlen - inst->header_size) /
2612 (dispatch_width / 8) +
2613 inst->header_size - 1].is_zero()) {
2614 inst->mlen -= dispatch_width / 8;
2615 progress = true;
2616 }
2617 }
2618
2619 if (progress)
2620 invalidate_live_intervals();
2621
2622 return progress;
2623 }
2624
2625 /**
2626 * Optimize sample messages which are followed by the final RT write.
2627 *
2628 * CHV, and GEN9+ can mark a texturing SEND instruction with EOT to have its
2629 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2630 * final texturing results copied to the framebuffer write payload and modify
2631 * them to write to the framebuffer directly.
2632 */
2633 bool
2634 fs_visitor::opt_sampler_eot()
2635 {
2636 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2637
2638 if (stage != MESA_SHADER_FRAGMENT)
2639 return false;
2640
2641 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2642 return false;
2643
2644 /* FINISHME: It should be possible to implement this optimization when there
2645 * are multiple drawbuffers.
2646 */
2647 if (key->nr_color_regions != 1)
2648 return false;
2649
2650 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2651 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2652 assert(fb_write->eot);
2653 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2654
2655 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2656
2657 /* There wasn't one; nothing to do. */
2658 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2659 return false;
2660
2661 /* This optimisation doesn't seem to work for textureGather for some
2662 * reason. I can't find any documentation or known workarounds to indicate
2663 * that this is expected, but considering that it is probably pretty
2664 * unlikely that a shader would directly write out the results from
2665 * textureGather we might as well just disable it.
2666 */
2667 if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2668 tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2669 return false;
2670
2671 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2672 * It's very likely to be the previous instruction.
2673 */
2674 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2675 if (load_payload->is_head_sentinel() ||
2676 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2677 return false;
2678
2679 assert(!tex_inst->eot); /* We can't get here twice */
2680 assert((tex_inst->offset & (0xff << 24)) == 0);
2681
2682 tex_inst->offset |= fb_write->target << 24;
2683 tex_inst->eot = true;
2684 tex_inst->dst = reg_null_ud;
2685 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2686
2687 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2688 * to create a new LOAD_PAYLOAD command with the same sources and a space
2689 * saved for the header. Using a new destination register not only makes sure
2690 * we have enough space, but it will make sure the dead code eliminator kills
2691 * the instruction that this will replace.
2692 */
2693 if (tex_inst->header_size != 0)
2694 return true;
2695
2696 fs_reg send_header = vgrf(load_payload->sources + 1);
2697 fs_reg *new_sources =
2698 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2699
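/* Leave the first source as BAD_FILE: it simply reserves the payload slot
 * where the message header will be written.
 */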
2700 new_sources[0] = fs_reg();
2701 for (int i = 0; i < load_payload->sources; i++)
2702 new_sources[i+1] = load_payload->src[i];
2703
2704 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2705 * requires a lot of information about the sources to appropriately figure
2706 * out the number of registers needed to be used. Given this stage in our
2707 * optimization, we may not have the appropriate GRFs required by
2708 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2709 * manually emit the instruction.
2710 */
2711 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2712 load_payload->exec_size,
2713 send_header,
2714 new_sources,
2715 load_payload->sources + 1);
2716
2717 new_load_payload->regs_written = load_payload->regs_written + 1;
2718 new_load_payload->header_size = 1;
2719 tex_inst->mlen++;
2720 tex_inst->header_size = 1;
2721 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2722 tex_inst->src[0] = send_header;
2723
2724 return true;
2725 }
2726
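/**
 * Give a fresh virtual GRF to values that completely overwrite an existing
 * one outside of control flow.
 *
 * Distinct values that happen to reuse the same VGRF get split apart, which
 * shortens live ranges and gives later passes (copy propagation, register
 * coalescing) more freedom.
 */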
2727 bool
2728 fs_visitor::opt_register_renaming()
2729 {
2730 bool progress = false;
2731 int depth = 0;
2732
2733 int remap[alloc.count];
2734 memset(remap, -1, sizeof(int) * alloc.count);
2735
2736 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2737 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2738 depth++;
2739 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2740 inst->opcode == BRW_OPCODE_WHILE) {
2741 depth--;
2742 }
2743
2744 /* Rewrite instruction sources. */
2745 for (int i = 0; i < inst->sources; i++) {
2746 if (inst->src[i].file == GRF &&
2747 remap[inst->src[i].reg] != -1 &&
2748 remap[inst->src[i].reg] != inst->src[i].reg) {
2749 inst->src[i].reg = remap[inst->src[i].reg];
2750 progress = true;
2751 }
2752 }
2753
2754 const int dst = inst->dst.reg;
2755
2756 if (depth == 0 &&
2757 inst->dst.file == GRF &&
2758 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2759 !inst->is_partial_write()) {
2760 if (remap[dst] == -1) {
2761 remap[dst] = dst;
2762 } else {
2763 remap[dst] = alloc.allocate(inst->dst.width / 8);
2764 inst->dst.reg = remap[dst];
2765 progress = true;
2766 }
2767 } else if (inst->dst.file == GRF &&
2768 remap[dst] != -1 &&
2769 remap[dst] != dst) {
2770 inst->dst.reg = remap[dst];
2771 progress = true;
2772 }
2773 }
2774
2775 if (progress) {
2776 invalidate_live_intervals();
2777
2778 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2779 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2780 delta_xy[i].reg = remap[delta_xy[i].reg];
2781 }
2782 }
2783 }
2784
2785 return progress;
2786 }
2787
2788 /**
2789 * Remove redundant or useless discard jumps.
2790 *
2791 * For example, we can eliminate jumps in the following sequence:
2792 *
2793 * discard-jump (redundant with the next jump)
2794 * discard-jump (useless; jumps to the next instruction)
2795 * placeholder-halt
2796 */
2797 bool
2798 fs_visitor::opt_redundant_discard_jumps()
2799 {
2800 bool progress = false;
2801
2802 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2803
2804 fs_inst *placeholder_halt = NULL;
2805 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2806 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2807 placeholder_halt = inst;
2808 break;
2809 }
2810 }
2811
2812 if (!placeholder_halt)
2813 return false;
2814
2815 /* Delete any HALTs immediately before the placeholder halt. */
2816 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2817 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2818 prev = (fs_inst *) placeholder_halt->prev) {
2819 prev->remove(last_bblock);
2820 progress = true;
2821 }
2822
2823 if (progress)
2824 invalidate_live_intervals();
2825
2826 return progress;
2827 }
2828
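/**
 * Try to have the instruction that computes a value write it directly into
 * the MRF that a following MOV would have copied it to, eliminating the GRF
 * temporary. Only relevant on Gen4-6, which still have MRFs.
 */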
2829 bool
2830 fs_visitor::compute_to_mrf()
2831 {
2832 bool progress = false;
2833 int next_ip = 0;
2834
2835 /* No MRFs on Gen >= 7. */
2836 if (devinfo->gen >= 7)
2837 return false;
2838
2839 calculate_live_intervals();
2840
2841 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2842 int ip = next_ip;
2843 next_ip++;
2844
2845 if (inst->opcode != BRW_OPCODE_MOV ||
2846 inst->is_partial_write() ||
2847 inst->dst.file != MRF || inst->src[0].file != GRF ||
2848 inst->dst.type != inst->src[0].type ||
2849 inst->src[0].abs || inst->src[0].negate ||
2850 !inst->src[0].is_contiguous() ||
2851 inst->src[0].subreg_offset)
2852 continue;
2853
2854 /* Work out which hardware MRF registers are written by this
2855 * instruction.
2856 */
2857 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2858 int mrf_high;
2859 if (inst->dst.reg & BRW_MRF_COMPR4) {
2860 mrf_high = mrf_low + 4;
2861 } else if (inst->exec_size == 16) {
2862 mrf_high = mrf_low + 1;
2863 } else {
2864 mrf_high = mrf_low;
2865 }
2866
2867 /* Can't compute-to-MRF this GRF if someone else was going to
2868 * read it later.
2869 */
2870 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2871 continue;
2872
2873 /* Found a move of a GRF to a MRF. Let's see if we can go
2874 * rewrite the thing that made this GRF to write into the MRF.
2875 */
2876 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2877 if (scan_inst->dst.file == GRF &&
2878 scan_inst->dst.reg == inst->src[0].reg) {
2879 /* Found the last thing to write our reg we want to turn
2880 * into a compute-to-MRF.
2881 */
2882
2883 /* If this one instruction didn't populate all the
2884 * channels, bail. We might be able to rewrite everything
2885 * that writes that reg, but it would require smarter
2886 * tracking to delay the rewriting until complete success.
2887 */
2888 if (scan_inst->is_partial_write())
2889 break;
2890
2891 /* Things returning more than one register would need us to
2892 * understand coalescing out more than one MOV at a time.
2893 */
2894 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2895 break;
2896
2897 /* SEND instructions can't have MRF as a destination. */
2898 if (scan_inst->mlen)
2899 break;
2900
2901 if (devinfo->gen == 6) {
2902 /* gen6 math instructions must have the destination be
2903 * GRF, so no compute-to-MRF for them.
2904 */
2905 if (scan_inst->is_math()) {
2906 break;
2907 }
2908 }
2909
2910 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2911 /* Found the creator of our MRF's source value. */
2912 scan_inst->dst.file = MRF;
2913 scan_inst->dst.reg = inst->dst.reg;
2914 scan_inst->saturate |= inst->saturate;
2915 inst->remove(block);
2916 progress = true;
2917 }
2918 break;
2919 }
2920
2921 /* We don't handle control flow here. Most computation of
2922 * values that end up in MRFs happens shortly before the MRF
2923 * write anyway.
2924 */
2925 if (block->start() == scan_inst)
2926 break;
2927
2928 /* You can't read from an MRF, so if someone else reads our
2929 * MRF's source GRF that we wanted to rewrite, that stops us.
2930 */
2931 bool interfered = false;
2932 for (int i = 0; i < scan_inst->sources; i++) {
2933 if (scan_inst->src[i].file == GRF &&
2934 scan_inst->src[i].reg == inst->src[0].reg &&
2935 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2936 interfered = true;
2937 }
2938 }
2939 if (interfered)
2940 break;
2941
2942 if (scan_inst->dst.file == MRF) {
2943 /* If somebody else writes our MRF here, we can't
2944 * compute-to-MRF before that.
2945 */
2946 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2947 int scan_mrf_high;
2948
2949 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2950 scan_mrf_high = scan_mrf_low + 4;
2951 } else if (scan_inst->exec_size == 16) {
2952 scan_mrf_high = scan_mrf_low + 1;
2953 } else {
2954 scan_mrf_high = scan_mrf_low;
2955 }
2956
2957 if (mrf_low == scan_mrf_low ||
2958 mrf_low == scan_mrf_high ||
2959 mrf_high == scan_mrf_low ||
2960 mrf_high == scan_mrf_high) {
2961 break;
2962 }
2963 }
2964
2965 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2966 /* Found a SEND instruction, which means that there are
2967 * live values in MRFs from base_mrf to base_mrf +
2968 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2969 * above it.
2970 */
2971 if (mrf_low >= scan_inst->base_mrf &&
2972 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2973 break;
2974 }
2975 if (mrf_high >= scan_inst->base_mrf &&
2976 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2977 break;
2978 }
2979 }
2980 }
2981 }
2982
2983 if (progress)
2984 invalidate_live_intervals();
2985
2986 return progress;
2987 }
2988
2989 /**
2990 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2991 * flow. We could probably do better here with some form of divergence
2992 * analysis.
2993 */
2994 bool
2995 fs_visitor::eliminate_find_live_channel()
2996 {
2997 bool progress = false;
2998 unsigned depth = 0;
2999
3000 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3001 switch (inst->opcode) {
3002 case BRW_OPCODE_IF:
3003 case BRW_OPCODE_DO:
3004 depth++;
3005 break;
3006
3007 case BRW_OPCODE_ENDIF:
3008 case BRW_OPCODE_WHILE:
3009 depth--;
3010 break;
3011
3012 case FS_OPCODE_DISCARD_JUMP:
3013 /* This can potentially make control flow non-uniform until the end
3014 * of the program.
3015 */
3016 return progress;
3017
3018 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
3019 if (depth == 0) {
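/* Outside of control flow, channel 0 is always live, so the answer is
 * simply 0.
 */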
3020 inst->opcode = BRW_OPCODE_MOV;
3021 inst->src[0] = fs_reg(0);
3022 inst->sources = 1;
3023 inst->force_writemask_all = true;
3024 progress = true;
3025 }
3026 break;
3027
3028 default:
3029 break;
3030 }
3031 }
3032
3033 return progress;
3034 }
3035
3036 /**
3037 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
3038 * instructions to FS_OPCODE_REP_FB_WRITE.
3039 */
3040 void
3041 fs_visitor::emit_repclear_shader()
3042 {
3043 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3044 int base_mrf = 1;
3045 int color_mrf = base_mrf + 2;
3046
3047 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
3048 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
3049 mov->force_writemask_all = true;
3050
3051 fs_inst *write;
3052 if (key->nr_color_regions == 1) {
3053 write = emit(FS_OPCODE_REP_FB_WRITE);
3054 write->saturate = key->clamp_fragment_color;
3055 write->base_mrf = color_mrf;
3056 write->target = 0;
3057 write->header_size = 0;
3058 write->mlen = 1;
3059 } else {
3060 assume(key->nr_color_regions > 0);
3061 for (int i = 0; i < key->nr_color_regions; ++i) {
3062 write = emit(FS_OPCODE_REP_FB_WRITE);
3063 write->saturate = key->clamp_fragment_color;
3064 write->base_mrf = base_mrf;
3065 write->target = i;
3066 write->header_size = 2;
3067 write->mlen = 3;
3068 }
3069 }
3070 write->eot = true;
3071
3072 calculate_cfg();
3073
3074 assign_constant_locations();
3075 assign_curb_setup();
3076
3077 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
3078 assert(mov->src[0].file == HW_REG);
3079 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
3080 }
3081
3082 /**
3083 * Walks through basic blocks, looking for repeated MRF writes and
3084 * removing the later ones.
3085 */
3086 bool
3087 fs_visitor::remove_duplicate_mrf_writes()
3088 {
3089 fs_inst *last_mrf_move[16];
3090 bool progress = false;
3091
3092 /* We would need to update the MRF tracking to handle compressed (SIMD16) instructions, so skip this pass for SIMD16. */
3093 if (dispatch_width == 16)
3094 return false;
3095
3096 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3097
3098 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3099 if (inst->is_control_flow()) {
3100 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3101 }
3102
3103 if (inst->opcode == BRW_OPCODE_MOV &&
3104 inst->dst.file == MRF) {
3105 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3106 if (prev_inst && inst->equals(prev_inst)) {
3107 inst->remove(block);
3108 progress = true;
3109 continue;
3110 }
3111 }
3112
3113 /* Clear out the last-write records for MRFs that were overwritten. */
3114 if (inst->dst.file == MRF) {
3115 last_mrf_move[inst->dst.reg] = NULL;
3116 }
3117
3118 if (inst->mlen > 0 && inst->base_mrf != -1) {
3119 /* Found a SEND instruction, which will include two or fewer
3120 * implied MRF writes. We could do better here.
3121 */
3122 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3123 last_mrf_move[inst->base_mrf + i] = NULL;
3124 }
3125 }
3126
3127 /* Clear out any MRF move records whose sources got overwritten. */
3128 if (inst->dst.file == GRF) {
3129 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3130 if (last_mrf_move[i] &&
3131 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3132 last_mrf_move[i] = NULL;
3133 }
3134 }
3135 }
3136
3137 if (inst->opcode == BRW_OPCODE_MOV &&
3138 inst->dst.file == MRF &&
3139 inst->src[0].file == GRF &&
3140 !inst->is_partial_write()) {
3141 last_mrf_move[inst->dst.reg] = inst;
3142 }
3143 }
3144
3145 if (progress)
3146 invalidate_live_intervals();
3147
3148 return progress;
3149 }
3150
3151 static void
3152 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3153 {
3154 /* Clear the flag for registers that actually got read (as expected). */
3155 for (int i = 0; i < inst->sources; i++) {
3156 int grf;
3157 if (inst->src[i].file == GRF) {
3158 grf = inst->src[i].reg;
3159 } else if (inst->src[i].file == HW_REG &&
3160 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3161 grf = inst->src[i].fixed_hw_reg.nr;
3162 } else {
3163 continue;
3164 }
3165
3166 if (grf >= first_grf &&
3167 grf < first_grf + grf_len) {
3168 deps[grf - first_grf] = false;
3169 if (inst->exec_size == 16)
3170 deps[grf - first_grf + 1] = false;
3171 }
3172 }
3173 }
3174
3175 /**
3176 * Implements this workaround for the original 965:
3177 *
3178 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3179 * check for post destination dependencies on this instruction, software
3180 * must ensure that there is no destination hazard for the case of ‘write
3181 * followed by a posted write’ shown in the following example.
3182 *
3183 * 1. mov r3 0
3184 * 2. send r3.xy <rest of send instruction>
3185 * 3. mov r2 r3
3186 *
3187 * Due to no post-destination dependency check on the ‘send’, the above
3188 * code sequence could have two instructions (1 and 2) in flight at the
3189 * same time that both consider ‘r3’ as the target of their final writes.
3190 */
3191 void
3192 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3193 fs_inst *inst)
3194 {
3195 int write_len = inst->regs_written;
3196 int first_write_grf = inst->dst.reg;
3197 bool needs_dep[BRW_MAX_MRF];
3198 assert(write_len < (int)sizeof(needs_dep) - 1);
3199
3200 memset(needs_dep, false, sizeof(needs_dep));
3201 memset(needs_dep, true, write_len);
3202
3203 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3204
3205 /* Walk backwards looking for writes to the registers we're writing that
3206 * haven't been read since being written. If we hit the start of the program,
3207 * we assume that there are no outstanding dependencies on entry to the
3208 * program.
3209 */
3210 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3211 /* If we hit control flow, assume that there *are* outstanding
3212 * dependencies, and force their cleanup before our instruction.
3213 */
3214 if (block->start() == scan_inst) {
3215 for (int i = 0; i < write_len; i++) {
3216 if (needs_dep[i]) {
3217 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3218 }
3219 }
3220 return;
3221 }
3222
3223 /* We insert our reads as late as possible on the assumption that any
3224 * instruction but a MOV that might have left us an outstanding
3225 * dependency has more latency than a MOV.
3226 */
3227 if (scan_inst->dst.file == GRF) {
3228 for (int i = 0; i < scan_inst->regs_written; i++) {
3229 int reg = scan_inst->dst.reg + i;
3230
3231 if (reg >= first_write_grf &&
3232 reg < first_write_grf + write_len &&
3233 needs_dep[reg - first_write_grf]) {
3234 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3235 needs_dep[reg - first_write_grf] = false;
3236 if (scan_inst->exec_size == 16)
3237 needs_dep[reg - first_write_grf + 1] = false;
3238 }
3239 }
3240 }
3241
3242 /* Clear the flag for registers that actually got read (as expected). */
3243 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3244
3245 /* Continue the loop only if we haven't resolved all the dependencies */
3246 int i;
3247 for (i = 0; i < write_len; i++) {
3248 if (needs_dep[i])
3249 break;
3250 }
3251 if (i == write_len)
3252 return;
3253 }
3254 }
3255
3256 /**
3257 * Implements this workaround for the original 965:
3258 *
3259 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3260 * used as a destination register until after it has been sourced by an
3261 * instruction with a different destination register.
3262 */
3263 void
3264 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3265 {
3266 int write_len = inst->regs_written;
3267 int first_write_grf = inst->dst.reg;
3268 bool needs_dep[BRW_MAX_MRF];
3269 assert(write_len < (int)sizeof(needs_dep) - 1);
3270
3271 memset(needs_dep, false, sizeof(needs_dep));
3272 memset(needs_dep, true, write_len);
3273 /* Walk forwards looking for writes to registers we're writing which aren't
3274 * read before being written.
3275 */
3276 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3277 /* If we hit control flow, force resolve all remaining dependencies. */
3278 if (block->end() == scan_inst) {
3279 for (int i = 0; i < write_len; i++) {
3280 if (needs_dep[i])
3281 scan_inst->insert_before(block,
3282 DEP_RESOLVE_MOV(first_write_grf + i));
3283 }
3284 return;
3285 }
3286
3287 /* Clear the flag for registers that actually got read (as expected). */
3288 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3289
3290 /* We insert our reads as late as possible since they're reading the
3291 * result of a SEND, which has massive latency.
3292 */
3293 if (scan_inst->dst.file == GRF &&
3294 scan_inst->dst.reg >= first_write_grf &&
3295 scan_inst->dst.reg < first_write_grf + write_len &&
3296 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3297 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3298 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3299 }
3300
3301 /* Continue the loop only if we haven't resolved all the dependencies */
3302 int i;
3303 for (i = 0; i < write_len; i++) {
3304 if (needs_dep[i])
3305 break;
3306 }
3307 if (i == write_len)
3308 return;
3309 }
3310 }
3311
3312 void
3313 fs_visitor::insert_gen4_send_dependency_workarounds()
3314 {
3315 if (devinfo->gen != 4 || devinfo->is_g4x)
3316 return;
3317
3318 bool progress = false;
3319
3320 /* Note that we're done with register allocation, so GRF fs_regs always
3321 * have a .reg_offset of 0.
3322 */
3323
3324 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3325 if (inst->mlen != 0 && inst->dst.file == GRF) {
3326 insert_gen4_pre_send_dependency_workarounds(block, inst);
3327 insert_gen4_post_send_dependency_workarounds(block, inst);
3328 progress = true;
3329 }
3330 }
3331
3332 if (progress)
3333 invalidate_live_intervals();
3334 }
3335
3336 /**
3337 * Turns the generic expression-style uniform pull constant load instruction
3338 * into a hardware-specific series of instructions for loading a pull
3339 * constant.
3340 *
3341 * The expression style allows the CSE pass before this to optimize out
3342 * repeated loads from the same offset, and gives the pre-register-allocation
3343 * scheduling full flexibility, while the conversion to native instructions
3344 * allows the post-register-allocation scheduler the best information
3345 * possible.
3346 *
3347 * Note that execution masking for setting up pull constant loads is special:
3348 * the channels that need to be written are unrelated to the current execution
3349 * mask, since a later instruction will use one of the result channels as a
3350 * source operand for all 8 or 16 of its channels.
3351 */
3352 void
3353 fs_visitor::lower_uniform_pull_constant_loads()
3354 {
3355 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3356 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3357 continue;
3358
3359 if (devinfo->gen >= 7) {
3360 /* The offset arg before was a vec4-aligned byte offset. We need to
3361 * turn it into a dword offset.
3362 */
3363 fs_reg const_offset_reg = inst->src[1];
3364 assert(const_offset_reg.file == IMM &&
3365 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3366 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3367 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3368
3369 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3370 * Reserve space for the register.
3371 */
3372 if (devinfo->gen >= 9) {
3373 payload.reg_offset++;
3374 alloc.sizes[payload.reg] = 2;
3375 }
3376
3377 /* This is actually going to be a MOV, but since only the first dword
3378 * is accessed, we have a special opcode to do just that one. Note
3379 * that this needs to be an operation that will be considered a def
3380 * by live variable analysis, or register allocation will explode.
3381 */
3382 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3383 8, payload, const_offset_reg);
3384 setup->force_writemask_all = true;
3385
3386 setup->ir = inst->ir;
3387 setup->annotation = inst->annotation;
3388 inst->insert_before(block, setup);
3389
3390 /* Similarly, this will only populate the first 4 channels of the
3391 * result register (since we only use smear values from 0-3), but we
3392 * don't tell the optimizer.
3393 */
3394 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3395 inst->src[1] = payload;
3396
3397 invalidate_live_intervals();
3398 } else {
3399 /* Before register allocation, we didn't tell the scheduler about the
3400 * MRF we use. We know it's safe to use this MRF because nothing
3401 * else does except for register spill/unspill, which generates and
3402 * uses its MRF within a single IR instruction.
3403 */
3404 inst->base_mrf = 14;
3405 inst->mlen = 1;
3406 }
3407 }
3408 }
3409
3410 bool
3411 fs_visitor::lower_load_payload()
3412 {
3413 bool progress = false;
3414
3415 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3416 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3417 continue;
3418
3419 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3420 assert(inst->saturate == false);
3421
3422 fs_reg dst = inst->dst;
3423
3424 /* Get rid of COMPR4. We'll add it back in if we need it */
3425 if (dst.file == MRF)
3426 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
3427
3428 dst.width = 8;
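/* Header sources are copied one SIMD8 register at a time with the
 * writemask forced on, regardless of the payload's execution size.
 */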
3429 for (uint8_t i = 0; i < inst->header_size; i++) {
3430 if (inst->src[i].file != BAD_FILE) {
3431 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3432 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3433 mov_src.width = 8;
3434 fs_inst *mov = MOV(mov_dst, mov_src);
3435 mov->force_writemask_all = true;
3436 inst->insert_before(block, mov);
3437 }
3438 dst = offset(dst, 1);
3439 }
3440
3441 dst.width = inst->exec_size;
3442 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
3443 inst->exec_size > 8) {
3444 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3445 * a straightforward copy. Instead, the result of the
3446 * LOAD_PAYLOAD is treated as interleaved and the first four
3447 * non-header sources are unpacked as:
3448 *
3449 * m + 0: r0
3450 * m + 1: g0
3451 * m + 2: b0
3452 * m + 3: a0
3453 * m + 4: r1
3454 * m + 5: g1
3455 * m + 6: b1
3456 * m + 7: a1
3457 *
3458 * This is used for gen <= 5 fb writes.
3459 */
3460 assert(inst->exec_size == 16);
3461 assert(inst->header_size + 4 <= inst->sources);
3462 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3463 if (inst->src[i].file != BAD_FILE) {
3464 if (devinfo->has_compr4) {
3465 fs_reg compr4_dst = retype(dst, inst->src[i].type);
3466 compr4_dst.reg |= BRW_MRF_COMPR4;
3467
3468 fs_inst *mov = MOV(compr4_dst, inst->src[i]);
3469 mov->force_writemask_all = inst->force_writemask_all;
3470 inst->insert_before(block, mov);
3471 } else {
3472 /* Platform doesn't have COMPR4. We have to fake it */
3473 fs_reg mov_dst = retype(dst, inst->src[i].type);
3474 mov_dst.width = 8;
3475
3476 fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0));
3477 mov->force_writemask_all = inst->force_writemask_all;
3478 inst->insert_before(block, mov);
3479
3480 mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1));
3481 mov->force_writemask_all = inst->force_writemask_all;
3482 mov->force_sechalf = true;
3483 inst->insert_before(block, mov);
3484 }
3485 }
3486
3487 dst.reg++;
3488 }
3489
3490 /* The loop above only ever incremented us through the first set
3491 * of 4 registers. However, thanks to the magic of COMPR4, we
3492 * actually wrote to the first 8 registers, so we need to take
3493 * that into account now.
3494 */
3495 dst.reg += 4;
3496
3497 /* The COMPR4 code took care of the first 4 sources. We'll let
3498 * the regular path handle any remaining sources. Yes, we are
3499 * modifying the instruction but we're about to delete it so
3500 * this really doesn't hurt anything.
3501 */
3502 inst->header_size += 4;
3503 }
3504
3505 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3506 if (inst->src[i].file != BAD_FILE) {
3507 fs_inst *mov = MOV(retype(dst, inst->src[i].type),
3508 inst->src[i]);
3509 mov->force_writemask_all = inst->force_writemask_all;
3510 mov->force_sechalf = inst->force_sechalf;
3511 inst->insert_before(block, mov);
3512 }
3513 dst = offset(dst, 1);
3514 }
3515
3516 inst->remove(block);
3517 progress = true;
3518 }
3519
3520 if (progress)
3521 invalidate_live_intervals();
3522
3523 return progress;
3524 }
3525
3526 bool
3527 fs_visitor::lower_integer_multiplication()
3528 {
3529 bool progress = false;
3530
3531 /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3532 * directly, but Cherryview cannot.
3533 */
3534 if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3535 return false;
3536
3537 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3538 if (inst->opcode != BRW_OPCODE_MUL ||
3539 inst->dst.is_accumulator() ||
3540 (inst->dst.type != BRW_REGISTER_TYPE_D &&
3541 inst->dst.type != BRW_REGISTER_TYPE_UD))
3542 continue;
3543
3544 #define insert(instr) inst->insert_before(block, instr)
3545
3546 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3547 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3548 * src1 are used.
3549 *
3550 * If multiplying by an immediate value that fits in 16-bits, do a
3551 * single MUL instruction with that value in the proper location.
3552 */
3553 if (inst->src[1].file == IMM &&
3554 inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3555 if (devinfo->gen < 7) {
3556 fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3557 inst->dst.type, dispatch_width);
3558 insert(MOV(imm, inst->src[1]));
3559 insert(MUL(inst->dst, imm, inst->src[0]));
3560 } else {
3561 insert(MUL(inst->dst, inst->src[0], inst->src[1]));
3562 }
3563 } else {
3564 /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3565 * do 32-bit integer multiplication in one instruction, but instead
3566 * must do a sequence (which actually calculates a 64-bit result):
3567 *
3568 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3569 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3570 * mov(8) g2<1>D acc0<8,8,1>D
3571 *
3572 * But on Gen > 6, the ability to use second accumulator register
3573 * (acc1) for non-float data types was removed, preventing a simple
3574 * implementation in SIMD16. A 16-channel result can be calculated by
3575 * executing the three instructions twice in SIMD8, once with quarter
3576 * control of 1Q for the first eight channels and again with 2Q for
3577 * the second eight channels.
3578 *
3579 * Which accumulator register is implicitly accessed (by AccWrEnable
3580 * for instance) is determined by the quarter control. Unfortunately
3581 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3582 * implicit accumulator access by an instruction with 2Q will access
3583 * acc1 regardless of whether the data type is usable in acc1.
3584 *
3585 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3586 * integer data types.
3587 *
3588 * Since we only want the low 32-bits of the result, we can do two
3589 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3590 * adjust the high result and add them (like the mach is doing):
3591 *
3592 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3593 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3594 * shl(8) g9<1>D g8<8,8,1>D 16D
3595 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3596 *
3597 * We avoid the shl instruction by realizing that we only want to add
3598 * the low 16-bits of the "high" result to the high 16-bits of the
3599 * "low" result and using proper regioning on the add:
3600 *
3601 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3602 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3603 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3604 *
3605 * Since it does not use the (single) accumulator register, we can
3606 * schedule multi-component multiplications much better.
3607 */
3608
3609 fs_reg low = inst->dst;
3610 fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3611 inst->dst.type, dispatch_width);
3612
3613 if (devinfo->gen >= 7) {
3614 fs_reg src1_0_w = inst->src[1];
3615 fs_reg src1_1_w = inst->src[1];
3616
3617 if (inst->src[1].file == IMM) {
3618 src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3619 src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3620 } else {
3621 src1_0_w.type = BRW_REGISTER_TYPE_UW;
3622 src1_0_w.stride = 2;
3623
3624 src1_1_w.type = BRW_REGISTER_TYPE_UW;
3625 src1_1_w.stride = 2;
3626 src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3627 }
3628 insert(MUL(low, inst->src[0], src1_0_w));
3629 insert(MUL(high, inst->src[0], src1_1_w));
3630 } else {
3631 fs_reg src0_0_w = inst->src[0];
3632 fs_reg src0_1_w = inst->src[0];
3633
3634 src0_0_w.type = BRW_REGISTER_TYPE_UW;
3635 src0_0_w.stride = 2;
3636
3637 src0_1_w.type = BRW_REGISTER_TYPE_UW;
3638 src0_1_w.stride = 2;
3639 src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3640
3641 insert(MUL(low, src0_0_w, inst->src[1]));
3642 insert(MUL(high, src0_1_w, inst->src[1]));
3643 }
3644
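/* View the high 16 bits of the low result and the low 16 bits of the high
 * result as UW regions so that the ADD below merges them in place, exactly
 * as described in the comment above.
 */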
3645 fs_reg dst = inst->dst;
3646 dst.type = BRW_REGISTER_TYPE_UW;
3647 dst.subreg_offset = 2;
3648 dst.stride = 2;
3649
3650 high.type = BRW_REGISTER_TYPE_UW;
3651 high.stride = 2;
3652
3653 low.type = BRW_REGISTER_TYPE_UW;
3654 low.subreg_offset = 2;
3655 low.stride = 2;
3656
3657 insert(ADD(dst, low, high));
3658 }
3659 #undef insert
3660
3661 inst->remove(block);
3662 progress = true;
3663 }
3664
3665 if (progress)
3666 invalidate_live_intervals();
3667
3668 return progress;
3669 }
3670
3671 void
3672 fs_visitor::dump_instructions()
3673 {
3674 dump_instructions(NULL);
3675 }
3676
3677 void
3678 fs_visitor::dump_instructions(const char *name)
3679 {
3680 FILE *file = stderr;
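   /* Only honor the requested file name for non-root users; presumably this
    * avoids a setuid process writing files at arbitrary paths.
    */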
3681 if (name && geteuid() != 0) {
3682 file = fopen(name, "w");
3683 if (!file)
3684 file = stderr;
3685 }
3686
3687 if (cfg) {
3688 calculate_register_pressure();
3689 int ip = 0, max_pressure = 0;
3690 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3691 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3692 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3693 dump_instruction(inst, file);
3694 ip++;
3695 }
3696 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3697 } else {
3698 int ip = 0;
3699 foreach_in_list(backend_instruction, inst, &instructions) {
3700 fprintf(file, "%4d: ", ip++);
3701 dump_instruction(inst, file);
3702 }
3703 }
3704
3705 if (file != stderr) {
3706 fclose(file);
3707 }
3708 }
3709
3710 void
3711 fs_visitor::dump_instruction(backend_instruction *be_inst)
3712 {
3713 dump_instruction(be_inst, stderr);
3714 }
3715
3716 void
3717 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3718 {
3719 fs_inst *inst = (fs_inst *)be_inst;
3720
3721 if (inst->predicate) {
3722 fprintf(file, "(%cf0.%d) ",
3723 inst->predicate_inverse ? '-' : '+',
3724 inst->flag_subreg);
3725 }
3726
3727 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3728 if (inst->saturate)
3729 fprintf(file, ".sat");
3730 if (inst->conditional_mod) {
3731 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3732 if (!inst->predicate &&
3733 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3734 inst->opcode != BRW_OPCODE_IF &&
3735 inst->opcode != BRW_OPCODE_WHILE))) {
3736 fprintf(file, ".f0.%d", inst->flag_subreg);
3737 }
3738 }
3739 fprintf(file, "(%d) ", inst->exec_size);
3740
3741
3742 switch (inst->dst.file) {
3743 case GRF:
3744 fprintf(file, "vgrf%d", inst->dst.reg);
3745 if (inst->dst.width != dispatch_width)
3746 fprintf(file, "@%d", inst->dst.width);
3747 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3748 inst->dst.subreg_offset)
3749 fprintf(file, "+%d.%d",
3750 inst->dst.reg_offset, inst->dst.subreg_offset);
3751 break;
3752 case MRF:
3753 fprintf(file, "m%d", inst->dst.reg);
3754 break;
3755 case BAD_FILE:
3756 fprintf(file, "(null)");
3757 break;
3758 case UNIFORM:
3759 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3760 break;
3761 case ATTR:
3762 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3763 break;
3764 case HW_REG:
3765 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3766 switch (inst->dst.fixed_hw_reg.nr) {
3767 case BRW_ARF_NULL:
3768 fprintf(file, "null");
3769 break;
3770 case BRW_ARF_ADDRESS:
3771 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3772 break;
3773 case BRW_ARF_ACCUMULATOR:
3774 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3775 break;
3776 case BRW_ARF_FLAG:
3777 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3778 inst->dst.fixed_hw_reg.subnr);
3779 break;
3780 default:
3781 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3782 inst->dst.fixed_hw_reg.subnr);
3783 break;
3784 }
3785 } else {
3786 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3787 }
3788 if (inst->dst.fixed_hw_reg.subnr)
3789 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3790 break;
3791 default:
3792 fprintf(file, "???");
3793 break;
3794 }
3795 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3796
3797 for (int i = 0; i < inst->sources; i++) {
3798 if (inst->src[i].negate)
3799 fprintf(file, "-");
3800 if (inst->src[i].abs)
3801 fprintf(file, "|");
3802 switch (inst->src[i].file) {
3803 case GRF:
3804 fprintf(file, "vgrf%d", inst->src[i].reg);
3805 if (inst->src[i].width != dispatch_width)
3806 fprintf(file, "@%d", inst->src[i].width);
3807 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3808 inst->src[i].subreg_offset)
3809 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3810 inst->src[i].subreg_offset);
3811 break;
3812 case MRF:
3813 fprintf(file, "***m%d***", inst->src[i].reg);
3814 break;
3815 case ATTR:
3816 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3817 break;
3818 case UNIFORM:
3819 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3820 if (inst->src[i].reladdr) {
3821 fprintf(file, "+reladdr");
3822 } else if (inst->src[i].subreg_offset) {
3823 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3824 inst->src[i].subreg_offset);
3825 }
3826 break;
3827 case BAD_FILE:
3828 fprintf(file, "(null)");
3829 break;
3830 case IMM:
3831 switch (inst->src[i].type) {
3832 case BRW_REGISTER_TYPE_F:
3833 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3834 break;
3835 case BRW_REGISTER_TYPE_W:
3836 case BRW_REGISTER_TYPE_D:
3837 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3838 break;
3839 case BRW_REGISTER_TYPE_UW:
3840 case BRW_REGISTER_TYPE_UD:
3841 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3842 break;
3843 case BRW_REGISTER_TYPE_VF:
3844 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3845 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3846 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3847 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3848 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3849 break;
3850 default:
3851 fprintf(file, "???");
3852 break;
3853 }
3854 break;
3855 case HW_REG:
3856 if (inst->src[i].fixed_hw_reg.negate)
3857 fprintf(file, "-");
3858 if (inst->src[i].fixed_hw_reg.abs)
3859 fprintf(file, "|");
3860 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3861 switch (inst->src[i].fixed_hw_reg.nr) {
3862 case BRW_ARF_NULL:
3863 fprintf(file, "null");
3864 break;
3865 case BRW_ARF_ADDRESS:
3866 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3867 break;
3868 case BRW_ARF_ACCUMULATOR:
3869 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3870 break;
3871 case BRW_ARF_FLAG:
3872 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3873 inst->src[i].fixed_hw_reg.subnr);
3874 break;
3875 default:
3876 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3877 inst->src[i].fixed_hw_reg.subnr);
3878 break;
3879 }
3880 } else {
3881 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3882 }
3883 if (inst->src[i].fixed_hw_reg.subnr)
3884 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3885 if (inst->src[i].fixed_hw_reg.abs)
3886 fprintf(file, "|");
3887 break;
3888 default:
3889 fprintf(file, "???");
3890 break;
3891 }
3892 if (inst->src[i].abs)
3893 fprintf(file, "|");
3894
3895 if (inst->src[i].file != IMM) {
3896 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3897 }
3898
3899 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3900 fprintf(file, ", ");
3901 }
3902
3903 fprintf(file, " ");
3904
3905 if (dispatch_width == 16 && inst->exec_size == 8) {
3906 if (inst->force_sechalf)
3907 fprintf(file, "2ndhalf ");
3908 else
3909 fprintf(file, "1sthalf ");
3910 }
3911
3912 fprintf(file, "\n");
3913 }
3914
3915 /**
3916 * Possibly returns an instruction that set up @param reg.
3917 *
3918 * Sometimes we want to take the result of some expression/variable
3919 * dereference tree and rewrite the instruction generating the result
3920 * of the tree. When processing the tree, we know that the
3921 * instructions generated are all writing temporaries that are dead
3922 * outside of this tree. So, if we have some instructions that write
3923 * a temporary, we're free to point that temp write somewhere else.
3924 *
3925  * Note that this doesn't guarantee that the returned instruction wrote
3926  * only reg -- it might be the size=4 destination of a texture instruction.
3927 */
3928 fs_inst *
3929 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3930 fs_inst *end,
3931 const fs_reg &reg)
3932 {
3933 if (end == start ||
3934 end->is_partial_write() ||
3935 reg.reladdr ||
3936 !reg.equals(end->dst)) {
3937 return NULL;
3938 } else {
3939 return end;
3940 }
3941 }
3942
3943 void
3944 fs_visitor::setup_payload_gen6()
3945 {
3946 bool uses_depth =
3947 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3948 unsigned barycentric_interp_modes =
3949 (stage == MESA_SHADER_FRAGMENT) ?
3950 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3951
3952 assert(devinfo->gen >= 6);
3953
3954 /* R0-1: masks, pixel X/Y coordinates. */
3955 payload.num_regs = 2;
3956    /* R2: only for 32-pixel dispatch. */
3957
3958 /* R3-26: barycentric interpolation coordinates. These appear in the
3959 * same order that they appear in the brw_wm_barycentric_interp_mode
3960 * enum. Each set of coordinates occupies 2 registers if dispatch width
3961 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3962 * appear if they were enabled using the "Barycentric Interpolation
3963 * Mode" bits in WM_STATE.
3964 */
3965 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3966 if (barycentric_interp_modes & (1 << i)) {
3967 payload.barycentric_coord_reg[i] = payload.num_regs;
3968 payload.num_regs += 2;
3969 if (dispatch_width == 16) {
3970 payload.num_regs += 2;
3971 }
3972 }
3973 }
3974
3975 /* R27: interpolated depth if uses source depth */
3976 if (uses_depth) {
3977 payload.source_depth_reg = payload.num_regs;
3978 payload.num_regs++;
3979 if (dispatch_width == 16) {
3980 /* R28: interpolated depth if not SIMD8. */
3981 payload.num_regs++;
3982 }
3983 }
3984 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3985 if (uses_depth) {
3986 payload.source_w_reg = payload.num_regs;
3987 payload.num_regs++;
3988 if (dispatch_width == 16) {
3989 /* R30: interpolated W if not SIMD8. */
3990 payload.num_regs++;
3991 }
3992 }
3993
3994 if (stage == MESA_SHADER_FRAGMENT) {
3995 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3996 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3997 prog_data->uses_pos_offset = key->compute_pos_offset;
3998 /* R31: MSAA position offsets. */
3999 if (prog_data->uses_pos_offset) {
4000 payload.sample_pos_reg = payload.num_regs;
4001 payload.num_regs++;
4002 }
4003 }
4004
4005 /* R32: MSAA input coverage mask */
4006 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
4007 assert(devinfo->gen >= 7);
4008 payload.sample_mask_in_reg = payload.num_regs;
4009 payload.num_regs++;
4010 if (dispatch_width == 16) {
4011 /* R33: input coverage mask if not SIMD8. */
4012 payload.num_regs++;
4013 }
4014 }
4015
4016 /* R34-: bary for 32-pixel. */
4017 /* R58-59: interp W for 32-pixel. */
4018
4019 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
4020 source_depth_to_render_target = true;
4021 }
4022 }
4023
4024 void
4025 fs_visitor::setup_vs_payload()
4026 {
4027 /* R0: thread header, R1: urb handles */
4028 payload.num_regs = 2;
4029 }
4030
4031 void
4032 fs_visitor::setup_cs_payload()
4033 {
4034 assert(brw->gen >= 7);
4035
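   /* R0: thread payload header. */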
4036 payload.num_regs = 1;
4037 }
4038
4039 void
4040 fs_visitor::assign_binding_table_offsets()
4041 {
4042 assert(stage == MESA_SHADER_FRAGMENT);
4043 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
4044 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
4045 uint32_t next_binding_table_offset = 0;
4046
4047 /* If there are no color regions, we still perform an FB write to a null
4048 * renderbuffer, which we place at surface index 0.
4049 */
4050 prog_data->binding_table.render_target_start = next_binding_table_offset;
4051 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
4052
4053 assign_common_binding_table_offsets(next_binding_table_offset);
4054 }
4055
4056 void
4057 fs_visitor::calculate_register_pressure()
4058 {
4059 invalidate_live_intervals();
4060 calculate_live_intervals();
4061
4062 unsigned num_instructions = 0;
4063 foreach_block(block, cfg)
4064 num_instructions += block->instructions.length();
4065
4066 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
4067
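   /* Each virtual GRF adds its size (in physical registers) to the pressure
    * at every instruction where it is live.
    */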
4068 for (unsigned reg = 0; reg < alloc.count; reg++) {
4069 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
4070 regs_live_at_ip[ip] += alloc.sizes[reg];
4071 }
4072 }
4073
4074 void
4075 fs_visitor::optimize()
4076 {
4077 split_virtual_grfs();
4078
4079 move_uniform_array_access_to_pull_constants();
4080 assign_constant_locations();
4081 demote_pull_constants();
4082
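/* Run a pass, dump the IR afterwards if INTEL_DEBUG=optimizer is set and the
 * pass reported progress, and accumulate progress across the whole loop.  The
 * macro itself evaluates to whether this particular pass made progress.
 */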
4083 #define OPT(pass, args...) ({ \
4084 pass_num++; \
4085 bool this_progress = pass(args); \
4086 \
4087 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
4088 char filename[64]; \
4089 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
4090 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
4091 \
4092 backend_visitor::dump_instructions(filename); \
4093 } \
4094 \
4095 progress = progress || this_progress; \
4096 this_progress; \
4097 })
4098
4099 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
4100 char filename[64];
4101 snprintf(filename, 64, "%s%d-%04d-00-start",
4102 stage_abbrev, dispatch_width,
4103 shader_prog ? shader_prog->Name : 0);
4104
4105 backend_visitor::dump_instructions(filename);
4106 }
4107
4108 bool progress;
4109 int iteration = 0;
4110 int pass_num = 0;
4111 do {
4112 progress = false;
4113 pass_num = 0;
4114 iteration++;
4115
4116 OPT(remove_duplicate_mrf_writes);
4117
4118 OPT(opt_algebraic);
4119 OPT(opt_cse);
4120 OPT(opt_copy_propagate);
4121 OPT(opt_peephole_predicated_break);
4122 OPT(opt_cmod_propagation);
4123 OPT(dead_code_eliminate);
4124 OPT(opt_peephole_sel);
4125 OPT(dead_control_flow_eliminate, this);
4126 OPT(opt_register_renaming);
4127 OPT(opt_redundant_discard_jumps);
4128 OPT(opt_saturate_propagation);
4129 OPT(opt_zero_samples);
4130 OPT(register_coalesce);
4131 OPT(compute_to_mrf);
4132 OPT(eliminate_find_live_channel);
4133
4134 OPT(compact_virtual_grfs);
4135 } while (progress);
4136
4137 pass_num = 0;
4138
4139 OPT(opt_sampler_eot);
4140
4141 if (OPT(lower_load_payload)) {
4142 split_virtual_grfs();
4143 OPT(register_coalesce);
4144 OPT(compute_to_mrf);
4145 OPT(dead_code_eliminate);
4146 }
4147
4148 OPT(opt_combine_constants);
4149 OPT(lower_integer_multiplication);
4150
4151 lower_uniform_pull_constant_loads();
4152 }
4153
4154 /**
4155  * Three-source instructions must have a GRF/MRF destination register.
4156 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
4157 */
4158 void
4159 fs_visitor::fixup_3src_null_dest()
4160 {
4161 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4162 if (inst->is_3src() && inst->dst.is_null()) {
4163 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4164 inst->dst.type);
4165 }
4166 }
4167 }
4168
4169 void
4170 fs_visitor::allocate_registers()
4171 {
4172 bool allocated_without_spills;
4173
4174 static const enum instruction_scheduler_mode pre_modes[] = {
4175 SCHEDULE_PRE,
4176 SCHEDULE_PRE_NON_LIFO,
4177 SCHEDULE_PRE_LIFO,
4178 };
4179
4180 /* Try each scheduling heuristic to see if it can successfully register
4181 * allocate without spilling. They should be ordered by decreasing
4182 * performance but increasing likelihood of allocating.
4183 */
4184 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4185 schedule_instructions(pre_modes[i]);
4186
4187 if (0) {
4188 assign_regs_trivial();
4189 allocated_without_spills = true;
4190 } else {
4191 allocated_without_spills = assign_regs(false);
4192 }
4193 if (allocated_without_spills)
4194 break;
4195 }
4196
4197 if (!allocated_without_spills) {
4198 /* We assume that any spilling is worse than just dropping back to
4199 * SIMD8. There's probably actually some intermediate point where
4200 * SIMD16 with a couple of spills is still better.
4201 */
4202 if (dispatch_width == 16) {
4203 fail("Failure to register allocate. Reduce number of "
4204 "live scalar values to avoid this.");
4205 } else {
4206 perf_debug("%s shader triggered register spilling. "
4207 "Try reducing the number of live scalar values to "
4208 "improve performance.\n", stage_name);
4209 }
4210
4211 /* Since we're out of heuristics, just go spill registers until we
4212 * get an allocation.
4213 */
4214 while (!assign_regs(true)) {
4215 if (failed)
4216 break;
4217 }
4218 }
4219
4220 /* This must come after all optimization and register allocation, since
4221 * it inserts dead code that happens to have side effects, and it does
4222 * so based on the actual physical registers in use.
4223 */
4224 insert_gen4_send_dependency_workarounds();
4225
4226 if (failed)
4227 return;
4228
4229 if (!allocated_without_spills)
4230 schedule_instructions(SCHEDULE_POST);
4231
4232 if (last_scratch > 0)
4233 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4234 }
4235
4236 bool
4237 fs_visitor::run_vs()
4238 {
4239 assert(stage == MESA_SHADER_VERTEX);
4240
4241 assign_common_binding_table_offsets(0);
4242 setup_vs_payload();
4243
4244 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4245 emit_shader_time_begin();
4246
4247 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
4248 emit_nir_code();
4249 } else {
4250 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4251 base_ir = ir;
4252 this->result = reg_undef;
4253 ir->accept(this);
4254 }
4255 base_ir = NULL;
4256 }
4257
4258 if (failed)
4259 return false;
4260
4261 emit_urb_writes();
4262
4263 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4264 emit_shader_time_end();
4265
4266 calculate_cfg();
4267
4268 optimize();
4269
4270 assign_curb_setup();
4271 assign_vs_urb_setup();
4272
4273 fixup_3src_null_dest();
4274 allocate_registers();
4275
4276 return !failed;
4277 }
4278
4279 bool
4280 fs_visitor::run_fs()
4281 {
4282 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4283 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4284
4285 assert(stage == MESA_SHADER_FRAGMENT);
4286
4287 sanity_param_count = prog->Parameters->NumParameters;
4288
4289 assign_binding_table_offsets();
4290
4291 if (devinfo->gen >= 6)
4292 setup_payload_gen6();
4293 else
4294 setup_payload_gen4();
4295
4296 if (0) {
4297 emit_dummy_fs();
4298 } else if (brw->use_rep_send && dispatch_width == 16) {
4299 emit_repclear_shader();
4300 } else {
4301 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4302 emit_shader_time_begin();
4303
4304 calculate_urb_setup();
4305 if (prog->InputsRead > 0) {
4306 if (devinfo->gen < 6)
4307 emit_interpolation_setup_gen4();
4308 else
4309 emit_interpolation_setup_gen6();
4310 }
4311
4312 /* We handle discards by keeping track of the still-live pixels in f0.1.
4313 * Initialize it with the dispatched pixels.
4314 */
4315 if (wm_prog_data->uses_kill) {
4316 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4317 discard_init->flag_subreg = 1;
4318 }
4319
4320 /* Generate FS IR for main(). (the visitor only descends into
4321 * functions called "main").
4322 */
4323 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
4324 emit_nir_code();
4325 } else if (shader) {
4326 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4327 base_ir = ir;
4328 this->result = reg_undef;
4329 ir->accept(this);
4330 }
4331 } else {
4332 emit_fragment_program_code();
4333 }
4334 base_ir = NULL;
4335 if (failed)
4336 return false;
4337
4338 if (wm_prog_data->uses_kill)
4339 emit(FS_OPCODE_PLACEHOLDER_HALT);
4340
4341 if (wm_key->alpha_test_func)
4342 emit_alpha_test();
4343
4344 emit_fb_writes();
4345
4346 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4347 emit_shader_time_end();
4348
4349 calculate_cfg();
4350
4351 optimize();
4352
4353 assign_curb_setup();
4354 assign_urb_setup();
4355
4356 fixup_3src_null_dest();
4357 allocate_registers();
4358
4359 if (failed)
4360 return false;
4361 }
4362
4363 if (dispatch_width == 8)
4364 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4365 else
4366 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4367
4368 /* If any state parameters were appended, then ParameterValues could have
4369 * been realloced, in which case the driver uniform storage set up by
4370 * _mesa_associate_uniform_storage() would point to freed memory. Make
4371 * sure that didn't happen.
4372 */
4373 assert(sanity_param_count == prog->Parameters->NumParameters);
4374
4375 return !failed;
4376 }
4377
4378 bool
4379 fs_visitor::run_cs()
4380 {
4381 assert(stage == MESA_SHADER_COMPUTE);
4382 assert(shader);
4383
4384 sanity_param_count = prog->Parameters->NumParameters;
4385
4386 assign_common_binding_table_offsets(0);
4387
4388 setup_cs_payload();
4389
4390 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4391 emit_shader_time_begin();
4392
4393 emit_nir_code();
4394
4395 if (failed)
4396 return false;
4397
4398 emit_cs_terminate();
4399
4400 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4401 emit_shader_time_end();
4402
4403 calculate_cfg();
4404
4405 optimize();
4406
4407 assign_curb_setup();
4408
4409 fixup_3src_null_dest();
4410 allocate_registers();
4411
4412 if (failed)
4413 return false;
4414
4415 /* If any state parameters were appended, then ParameterValues could have
4416 * been realloced, in which case the driver uniform storage set up by
4417 * _mesa_associate_uniform_storage() would point to freed memory. Make
4418 * sure that didn't happen.
4419 */
4420 assert(sanity_param_count == prog->Parameters->NumParameters);
4421
4422 return !failed;
4423 }
4424
4425 const unsigned *
4426 brw_wm_fs_emit(struct brw_context *brw,
4427 void *mem_ctx,
4428 const struct brw_wm_prog_key *key,
4429 struct brw_wm_prog_data *prog_data,
4430 struct gl_fragment_program *fp,
4431 struct gl_shader_program *prog,
4432 unsigned *final_assembly_size)
4433 {
4434 bool start_busy = false;
4435 double start_time = 0;
4436
4437 if (unlikely(brw->perf_debug)) {
4438 start_busy = (brw->batch.last_bo &&
4439 drm_intel_bo_busy(brw->batch.last_bo));
4440 start_time = get_time();
4441 }
4442
4443 struct brw_shader *shader = NULL;
4444 if (prog)
4445 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4446
4447 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4448 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4449
4450 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4451 */
4452 fs_visitor v(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4453 prog, &fp->Base, 8);
4454 if (!v.run_fs()) {
4455 if (prog) {
4456 prog->LinkStatus = false;
4457 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4458 }
4459
4460 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4461 v.fail_msg);
4462
4463 return NULL;
4464 }
4465
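   /* Independently of the SIMD8 compile above, also try a SIMD16 compile
    * (unless debugging disabled it or the SIMD8 visitor flagged the shader as
    * unsupported in SIMD16); state upload can then decide which program to
    * dispatch.
    */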
4466 cfg_t *simd16_cfg = NULL;
4467 fs_visitor v2(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4468 prog, &fp->Base, 16);
4469 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4470 if (!v.simd16_unsupported) {
4471 /* Try a SIMD16 compile */
4472 v2.import_uniforms(&v);
4473 if (!v2.run_fs()) {
4474 perf_debug("SIMD16 shader failed to compile, falling back to "
4475 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4476 } else {
4477 simd16_cfg = v2.cfg;
4478 }
4479 } else {
4480 perf_debug("SIMD16 shader unsupported, falling back to "
4481 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4482 }
4483 }
4484
4485 cfg_t *simd8_cfg;
4486 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4487 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4488 simd8_cfg = NULL;
4489 prog_data->no_8 = true;
4490 } else {
4491 simd8_cfg = v.cfg;
4492 prog_data->no_8 = false;
4493 }
4494
4495 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4496 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4497
4498 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4499 char *name;
4500 if (prog)
4501 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4502 prog->Label ? prog->Label : "unnamed",
4503 prog->Name);
4504 else
4505 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4506
4507 g.enable_debug(name);
4508 }
4509
4510 if (simd8_cfg)
4511 g.generate_code(simd8_cfg, 8);
4512 if (simd16_cfg)
4513 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4514
4515 if (unlikely(brw->perf_debug) && shader) {
4516 if (shader->compiled_once)
4517 brw_wm_debug_recompile(brw, prog, key);
4518 shader->compiled_once = true;
4519
4520 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4521 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4522 (get_time() - start_time) * 1000);
4523 }
4524 }
4525
4526 return g.get_assembly(final_assembly_size);
4527 }
4528
4529 extern "C" bool
4530 brw_fs_precompile(struct gl_context *ctx,
4531 struct gl_shader_program *shader_prog,
4532 struct gl_program *prog)
4533 {
4534 struct brw_context *brw = brw_context(ctx);
4535 struct brw_wm_prog_key key;
4536
4537 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4538 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4539 bool program_uses_dfdy = fp->UsesDFdy;
4540
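   /* Construct a best-guess program key from default GL state so that the
    * first draw usually reuses this program instead of triggering a
    * recompile.
    */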
4541 memset(&key, 0, sizeof(key));
4542
4543 if (brw->gen < 6) {
4544 if (fp->UsesKill)
4545 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4546
4547 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4548 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4549
4550 /* Just assume depth testing. */
4551 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4552 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4553 }
4554
4555 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4556 BRW_FS_VARYING_INPUT_MASK) > 16)
4557 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4558
4559 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4560
4561 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4562 key.drawable_height = ctx->DrawBuffer->Height;
4563 }
4564
4565 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4566 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4567 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4568
4569 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4570 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4571 key.nr_color_regions > 1;
4572 }
4573
4574 key.program_string_id = bfp->id;
4575
4576 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4577 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4578
4579 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4580
4581 brw->wm.base.prog_offset = old_prog_offset;
4582 brw->wm.prog_data = old_prog_data;
4583
4584 return success;
4585 }
4586
4587 void
4588 brw_setup_tex_for_precompile(struct brw_context *brw,
4589 struct brw_sampler_prog_key_data *tex,
4590 struct gl_program *prog)
4591 {
4592 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4593 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4594 for (unsigned i = 0; i < sampler_count; i++) {
4595 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4596 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4597 tex->swizzles[i] =
4598 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4599 } else {
4600 /* Color sampler: assume no swizzling. */
4601 tex->swizzles[i] = SWIZZLE_XYZW;
4602 }
4603 }
4604 }