1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
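/* Added worked example (not from the original source): an 8-wide float
 * destination with stride 1 covers 8 * 1 * 4 = 32 bytes, i.e. exactly one
 * 32-byte register, while a 16-wide float destination covers two.
 */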
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
138
139 this->writes_accumulator = false;
140 }
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(brw->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
356
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
359 {
360 uint8_t exec_size = dst.width;
361 for (int i = 0; i < sources; ++i) {
362 assert(src[i].width % dst.width == 0);
363 if (src[i].width > exec_size)
364 exec_size = src[i].width;
365 }
366
367 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
368 dst, src, sources);
369 inst->regs_written = 0;
370 for (int i = 0; i < sources; ++i) {
371 /* The LOAD_PAYLOAD instruction only really makes sense if we are
372 * dealing with whole registers. If this ever changes, we can deal
373 * with it later.
374 */
375 int size = inst->src[i].effective_width * type_sz(src[i].type);
376 assert(size % 32 == 0);
377 inst->regs_written += (size + 31) / 32;
378 }
379
380 return inst;
381 }
382
383 exec_list
384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
385 const fs_reg &surf_index,
386 const fs_reg &varying_offset,
387 uint32_t const_offset)
388 {
389 exec_list instructions;
390 fs_inst *inst;
391
392 /* We have our constant surface use a pitch of 4 bytes, so our index can
393 * be any component of a vector, and then we load 4 contiguous
394 * components starting from that.
395 *
396 * We break down the const_offset to a portion added to the variable
397 * offset and a portion done using reg_offset, which means that if you
398 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
399 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
400 * CSE can later notice that those loads are all the same and eliminate
401 * the redundant ones.
402 */
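/* Added illustration of the split described above: with const_offset == 6,
 * the ADD below adds 4 (== 6 & ~3) to the varying offset, and the final MOV
 * picks out register offset (6 & 3) * scale of the vec4 result.
 */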
403 fs_reg vec4_offset = vgrf(glsl_type::int_type);
404 instructions.push_tail(ADD(vec4_offset,
405 varying_offset, fs_reg(const_offset & ~3)));
406
407 int scale = 1;
408 if (brw->gen == 4 && dst.width == 8) {
409 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
410 * u, v, r) as parameters, or we can just use the SIMD16 message
411 * consisting of (header, u). We choose the second, at the cost of a
412 * longer return length.
413 */
414 scale = 2;
415 }
416
417 enum opcode op;
418 if (brw->gen >= 7)
419 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
420 else
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
422
423 assert(dst.width % 8 == 0);
424 int regs_written = 4 * (dst.width / 8) * scale;
425 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
426 dst.type, dst.width);
427 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
428 inst->regs_written = regs_written;
429 instructions.push_tail(inst);
430
431 if (brw->gen < 7) {
432 inst->base_mrf = 13;
433 inst->header_present = true;
434 if (brw->gen == 4)
435 inst->mlen = 3;
436 else
437 inst->mlen = 1 + dispatch_width / 8;
438 }
439
440 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
441 instructions.push_tail(MOV(dst, result));
442
443 return instructions;
444 }
445
446 /**
447 * A helper for MOV generation for fixing up broken hardware SEND dependency
448 * handling.
449 */
450 fs_inst *
451 fs_visitor::DEP_RESOLVE_MOV(int grf)
452 {
453 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
454
455 inst->ir = NULL;
456 inst->annotation = "send dependency resolve";
457
458 /* The caller always wants uncompressed to emit the minimal extra
459 * dependencies, and to avoid having to deal with aligning its regs to 2.
460 */
461 inst->exec_size = 8;
462
463 return inst;
464 }
465
466 bool
467 fs_inst::equals(fs_inst *inst) const
468 {
469 return (opcode == inst->opcode &&
470 dst.equals(inst->dst) &&
471 src[0].equals(inst->src[0]) &&
472 src[1].equals(inst->src[1]) &&
473 src[2].equals(inst->src[2]) &&
474 saturate == inst->saturate &&
475 predicate == inst->predicate &&
476 conditional_mod == inst->conditional_mod &&
477 mlen == inst->mlen &&
478 base_mrf == inst->base_mrf &&
479 target == inst->target &&
480 eot == inst->eot &&
481 header_present == inst->header_present &&
482 shadow_compare == inst->shadow_compare &&
483 exec_size == inst->exec_size &&
484 offset == inst->offset);
485 }
486
487 bool
488 fs_inst::overwrites_reg(const fs_reg &reg) const
489 {
490 return (reg.file == dst.file &&
491 reg.reg == dst.reg &&
492 reg.reg_offset >= dst.reg_offset &&
493 reg.reg_offset < dst.reg_offset + regs_written);
494 }
495
496 bool
497 fs_inst::is_send_from_grf() const
498 {
499 switch (opcode) {
500 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
501 case SHADER_OPCODE_SHADER_TIME_ADD:
502 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
503 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
504 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
505 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
506 case SHADER_OPCODE_UNTYPED_ATOMIC:
507 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
508 case SHADER_OPCODE_URB_WRITE_SIMD8:
509 return true;
510 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
511 return src[1].file == GRF;
512 case FS_OPCODE_FB_WRITE:
513 return src[0].file == GRF;
514 default:
515 if (is_tex())
516 return src[0].file == GRF;
517
518 return false;
519 }
520 }
521
522 bool
523 fs_inst::can_do_source_mods(struct brw_context *brw)
524 {
525 if (brw->gen == 6 && is_math())
526 return false;
527
528 if (is_send_from_grf())
529 return false;
530
531 if (!backend_instruction::can_do_source_mods())
532 return false;
533
534 return true;
535 }
536
537 void
538 fs_reg::init()
539 {
540 memset(this, 0, sizeof(*this));
541 stride = 1;
542 }
543
544 /** Generic unset register constructor. */
545 fs_reg::fs_reg()
546 {
547 init();
548 this->file = BAD_FILE;
549 }
550
551 /** Immediate value constructor. */
552 fs_reg::fs_reg(float f)
553 {
554 init();
555 this->file = IMM;
556 this->type = BRW_REGISTER_TYPE_F;
557 this->fixed_hw_reg.dw1.f = f;
558 this->width = 1;
559 }
560
561 /** Immediate value constructor. */
562 fs_reg::fs_reg(int32_t i)
563 {
564 init();
565 this->file = IMM;
566 this->type = BRW_REGISTER_TYPE_D;
567 this->fixed_hw_reg.dw1.d = i;
568 this->width = 1;
569 }
570
571 /** Immediate value constructor. */
572 fs_reg::fs_reg(uint32_t u)
573 {
574 init();
575 this->file = IMM;
576 this->type = BRW_REGISTER_TYPE_UD;
577 this->fixed_hw_reg.dw1.ud = u;
578 this->width = 1;
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf[4])
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
588 }
589
590 /** Vector float immediate value constructor. */
591 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
592 {
593 init();
594 this->file = IMM;
595 this->type = BRW_REGISTER_TYPE_VF;
596 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
597 (vf1 << 8) |
598 (vf2 << 16) |
599 (vf3 << 24);
600 }
601
602 /** Fixed brw_reg. */
603 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
604 {
605 init();
606 this->file = HW_REG;
607 this->fixed_hw_reg = fixed_hw_reg;
608 this->type = fixed_hw_reg.type;
609 this->width = 1 << fixed_hw_reg.width;
610 }
611
612 bool
613 fs_reg::equals(const fs_reg &r) const
614 {
615 return (file == r.file &&
616 reg == r.reg &&
617 reg_offset == r.reg_offset &&
618 subreg_offset == r.subreg_offset &&
619 type == r.type &&
620 negate == r.negate &&
621 abs == r.abs &&
622 !reladdr && !r.reladdr &&
623 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
624 width == r.width &&
625 stride == r.stride);
626 }
627
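/* Added note: smearing points every execution channel at one sub-register.
 * With stride forced to 0, all channels read the same type_sz-sized element;
 * get_timestamp() and emit_shader_time_end() below use this to pick out a
 * single dword of a register.
 */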
628 fs_reg &
629 fs_reg::set_smear(unsigned subreg)
630 {
631 assert(file != HW_REG && file != IMM);
632 subreg_offset = subreg * type_sz(type);
633 stride = 0;
634 return *this;
635 }
636
637 bool
638 fs_reg::is_contiguous() const
639 {
640 return stride == 1;
641 }
642
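/* Added note: returns the size of a GLSL type counted in scalar slots;
 * fs_visitor::vgrf() later multiplies this count by dispatch_width / 8 when
 * allocating virtual GRFs.
 */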
643 int
644 fs_visitor::type_size(const struct glsl_type *type)
645 {
646 unsigned int size, i;
647
648 switch (type->base_type) {
649 case GLSL_TYPE_UINT:
650 case GLSL_TYPE_INT:
651 case GLSL_TYPE_FLOAT:
652 case GLSL_TYPE_BOOL:
653 return type->components();
654 case GLSL_TYPE_ARRAY:
655 return type_size(type->fields.array) * type->length;
656 case GLSL_TYPE_STRUCT:
657 size = 0;
658 for (i = 0; i < type->length; i++) {
659 size += type_size(type->fields.structure[i].type);
660 }
661 return size;
662 case GLSL_TYPE_SAMPLER:
663 /* Samplers take up no register space, since they're baked in at
664 * link time.
665 */
666 return 0;
667 case GLSL_TYPE_ATOMIC_UINT:
668 return 0;
669 case GLSL_TYPE_IMAGE:
670 case GLSL_TYPE_VOID:
671 case GLSL_TYPE_ERROR:
672 case GLSL_TYPE_INTERFACE:
673 case GLSL_TYPE_DOUBLE:
674 unreachable("not reached");
675 }
676
677 return 0;
678 }
679
680 /**
681 * Create a MOV to read the timestamp register.
682 *
683 * The caller is responsible for emitting the MOV. The return value is
684 * the destination of the MOV, with extra parameters set.
685 */
686 fs_reg
687 fs_visitor::get_timestamp(fs_inst **out_mov)
688 {
689 assert(brw->gen >= 7);
690
691 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
692 BRW_ARF_TIMESTAMP,
693 0),
694 BRW_REGISTER_TYPE_UD));
695
696 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
697
698 fs_inst *mov = MOV(dst, ts);
699 /* We want to read the 3 fields we care about even if it's not enabled in
700 * the dispatch.
701 */
702 mov->force_writemask_all = true;
703
704 /* The caller wants the low 32 bits of the timestamp. Since it's running
705 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
706 * which is plenty of time for our purposes. It is identical across the
707 * EUs, but since it's tracking GPU core speed it will increment at a
708 * varying rate as render P-states change.
709 *
710 * The caller could also check if render P-states have changed (or anything
711 * else that might disrupt timing) by setting smear to 2 and checking if
712 * that field is != 0.
713 */
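/* Added arithmetic behind the rollover estimate above: the low 32 bits wrap
 * after 2^32 cycles, and 2^32 / ~1.2e9 Hz is roughly 3.6 seconds.
 */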
714 dst.set_smear(0);
715
716 *out_mov = mov;
717 return dst;
718 }
719
720 void
721 fs_visitor::emit_shader_time_begin()
722 {
723 current_annotation = "shader time start";
724 fs_inst *mov;
725 shader_start_time = get_timestamp(&mov);
726 emit(mov);
727 }
728
729 void
730 fs_visitor::emit_shader_time_end()
731 {
732 current_annotation = "shader time end";
733
734 enum shader_time_shader_type type, written_type, reset_type;
735 switch (stage) {
736 case MESA_SHADER_VERTEX:
737 type = ST_VS;
738 written_type = ST_VS_WRITTEN;
739 reset_type = ST_VS_RESET;
740 break;
741 case MESA_SHADER_GEOMETRY:
742 type = ST_GS;
743 written_type = ST_GS_WRITTEN;
744 reset_type = ST_GS_RESET;
745 break;
746 case MESA_SHADER_FRAGMENT:
747 if (dispatch_width == 8) {
748 type = ST_FS8;
749 written_type = ST_FS8_WRITTEN;
750 reset_type = ST_FS8_RESET;
751 } else {
752 assert(dispatch_width == 16);
753 type = ST_FS16;
754 written_type = ST_FS16_WRITTEN;
755 reset_type = ST_FS16_RESET;
756 }
757 break;
758 default:
759 unreachable("fs_visitor::emit_shader_time_end missing code");
760 }
761
762 fs_inst *tm_read;
763 fs_reg shader_end_time = get_timestamp(&tm_read);
764 emit(tm_read);
765
766 /* Check that there weren't any timestamp reset events (assuming these
767 * were the only two timestamp reads that happened).
768 */
769 fs_reg reset = shader_end_time;
770 reset.set_smear(2);
771 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
772 test->conditional_mod = BRW_CONDITIONAL_Z;
773 test->force_writemask_all = true;
774 emit(IF(BRW_PREDICATE_NORMAL));
775
776 fs_reg start = shader_start_time;
777 start.negate = true;
778 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
779 diff.set_smear(0);
780 fs_inst *add = ADD(diff, start, shader_end_time);
781 add->force_writemask_all = true;
782 emit(add);
783
784 /* If there were no instructions between the two timestamp gets, the diff
785 * is 2 cycles. Remove that overhead, so I can forget about that when
786 * trying to determine the time taken for single instructions.
787 */
788 add = ADD(diff, diff, fs_reg(-2u));
789 add->force_writemask_all = true;
790 emit(add);
791
792 emit(SHADER_TIME_ADD(type, diff));
793 emit(SHADER_TIME_ADD(written_type, fs_reg(1u)));
794 emit(BRW_OPCODE_ELSE);
795 emit(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
796 emit(BRW_OPCODE_ENDIF);
797 }
798
799 fs_inst *
800 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
801 {
802 int shader_time_index =
803 brw_get_shader_time_index(brw, shader_prog, prog, type);
804 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
805
806 fs_reg payload;
807 if (dispatch_width == 8)
808 payload = vgrf(glsl_type::uvec2_type);
809 else
810 payload = vgrf(glsl_type::uint_type);
811
812 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
813 fs_reg(), payload, offset, value);
814 }
815
816 void
817 fs_visitor::vfail(const char *format, va_list va)
818 {
819 char *msg;
820
821 if (failed)
822 return;
823
824 failed = true;
825
826 msg = ralloc_vasprintf(mem_ctx, format, va);
827 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
828
829 this->fail_msg = msg;
830
831 if (debug_enabled) {
832 fprintf(stderr, "%s", msg);
833 }
834 }
835
836 void
837 fs_visitor::fail(const char *format, ...)
838 {
839 va_list va;
840
841 va_start(va, format);
842 vfail(format, va);
843 va_end(va);
844 }
845
846 /**
847 * Mark this program as impossible to compile in SIMD16 mode.
848 *
849 * During the SIMD8 compile (which happens first), we can detect and flag
850 * things that are unsupported in SIMD16 mode, so the compiler can skip
851 * the SIMD16 compile altogether.
852 *
853 * During a SIMD16 compile (if one happens anyway), this just calls fail().
854 */
855 void
856 fs_visitor::no16(const char *format, ...)
857 {
858 va_list va;
859
860 va_start(va, format);
861
862 if (dispatch_width == 16) {
863 vfail(format, va);
864 } else {
865 simd16_unsupported = true;
866
867 if (brw->perf_debug) {
868 if (no16_msg)
869 ralloc_vasprintf_append(&no16_msg, format, va);
870 else
871 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
872 }
873 }
874
875 va_end(va);
876 }
877
878 fs_inst *
879 fs_visitor::emit(enum opcode opcode)
880 {
881 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
882 }
883
884 fs_inst *
885 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
886 {
887 return emit(new(mem_ctx) fs_inst(opcode, dst));
888 }
889
890 fs_inst *
891 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
892 {
893 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
894 }
895
896 fs_inst *
897 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
898 const fs_reg &src1)
899 {
900 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
901 }
902
903 fs_inst *
904 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
905 const fs_reg &src1, const fs_reg &src2)
906 {
907 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
908 }
909
910 fs_inst *
911 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
912 fs_reg src[], int sources)
913 {
914 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
915 }
916
917 /**
918 * Returns true if the instruction has a flag that means it won't
919 * update an entire destination register.
920 *
921 * For example, dead code elimination and live variable analysis want to know
922 * when a write to a variable screens off any preceding values that were in
923 * it.
924 */
925 bool
926 fs_inst::is_partial_write() const
927 {
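/* Added example for the width check below: an 8-wide write of a 16-bit type
 * covers only 8 * 2 = 16 bytes, half a register, so it counts as a partial
 * write.
 */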
928 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
929 (this->dst.width * type_sz(this->dst.type)) < 32 ||
930 !this->dst.is_contiguous());
931 }
932
933 int
934 fs_inst::regs_read(int arg) const
935 {
936 if (is_tex() && arg == 0 && src[0].file == GRF) {
937 return mlen;
938 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
939 return mlen;
940 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
941 return mlen;
942 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
943 return mlen;
944 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
945 return mlen;
946 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
947 return mlen;
948 }
949
950 switch (src[arg].file) {
951 case BAD_FILE:
952 case UNIFORM:
953 case IMM:
954 return 1;
955 case GRF:
956 case HW_REG:
957 if (src[arg].stride == 0) {
958 return 1;
959 } else {
960 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
961 return (size + 31) / 32;
962 }
963 case MRF:
964 unreachable("MRF registers are not allowed as sources");
965 default:
966 unreachable("Invalid register file");
967 }
968 }
969
970 bool
971 fs_inst::reads_flag() const
972 {
973 return predicate;
974 }
975
976 bool
977 fs_inst::writes_flag() const
978 {
979 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
980 opcode != BRW_OPCODE_IF &&
981 opcode != BRW_OPCODE_WHILE)) ||
982 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
983 }
984
985 /**
986 * Returns how many MRFs an FS opcode will write over.
987 *
988 * Note that this is not the 0 or 1 implied writes in an actual gen
989 * instruction -- the FS opcodes often generate MOVs in addition.
990 */
991 int
992 fs_visitor::implied_mrf_writes(fs_inst *inst)
993 {
994 if (inst->mlen == 0)
995 return 0;
996
997 if (inst->base_mrf == -1)
998 return 0;
999
1000 switch (inst->opcode) {
1001 case SHADER_OPCODE_RCP:
1002 case SHADER_OPCODE_RSQ:
1003 case SHADER_OPCODE_SQRT:
1004 case SHADER_OPCODE_EXP2:
1005 case SHADER_OPCODE_LOG2:
1006 case SHADER_OPCODE_SIN:
1007 case SHADER_OPCODE_COS:
1008 return 1 * dispatch_width / 8;
1009 case SHADER_OPCODE_POW:
1010 case SHADER_OPCODE_INT_QUOTIENT:
1011 case SHADER_OPCODE_INT_REMAINDER:
1012 return 2 * dispatch_width / 8;
1013 case SHADER_OPCODE_TEX:
1014 case FS_OPCODE_TXB:
1015 case SHADER_OPCODE_TXD:
1016 case SHADER_OPCODE_TXF:
1017 case SHADER_OPCODE_TXF_CMS:
1018 case SHADER_OPCODE_TXF_MCS:
1019 case SHADER_OPCODE_TG4:
1020 case SHADER_OPCODE_TG4_OFFSET:
1021 case SHADER_OPCODE_TXL:
1022 case SHADER_OPCODE_TXS:
1023 case SHADER_OPCODE_LOD:
1024 return 1;
1025 case FS_OPCODE_FB_WRITE:
1026 return 2;
1027 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1028 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1029 return 1;
1030 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1031 return inst->mlen;
1032 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1033 return 2;
1034 case SHADER_OPCODE_UNTYPED_ATOMIC:
1035 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1036 case SHADER_OPCODE_URB_WRITE_SIMD8:
1037 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1038 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1039 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1040 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1041 return 0;
1042 default:
1043 unreachable("not reached");
1044 }
1045 }
1046
1047 fs_reg
1048 fs_visitor::vgrf(const glsl_type *const type)
1049 {
1050 int reg_width = dispatch_width / 8;
1051 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1052 brw_type_for_base_type(type), dispatch_width);
1053 }
1054
1055 fs_reg
1056 fs_visitor::vgrf(int num_components)
1057 {
1058 int reg_width = dispatch_width / 8;
1059 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1060 BRW_REGISTER_TYPE_F, dispatch_width);
1061 }
1062
1063 /** Fixed HW reg constructor. */
1064 fs_reg::fs_reg(enum register_file file, int reg)
1065 {
1066 init();
1067 this->file = file;
1068 this->reg = reg;
1069 this->type = BRW_REGISTER_TYPE_F;
1070
1071 switch (file) {
1072 case UNIFORM:
1073 this->width = 1;
1074 break;
1075 default:
1076 this->width = 8;
1077 }
1078 }
1079
1080 /** Fixed HW reg constructor. */
1081 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1082 {
1083 init();
1084 this->file = file;
1085 this->reg = reg;
1086 this->type = type;
1087
1088 switch (file) {
1089 case UNIFORM:
1090 this->width = 1;
1091 break;
1092 default:
1093 this->width = 8;
1094 }
1095 }
1096
1097 /** Fixed HW reg constructor. */
1098 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1099 uint8_t width)
1100 {
1101 init();
1102 this->file = file;
1103 this->reg = reg;
1104 this->type = type;
1105 this->width = width;
1106 }
1107
1108 fs_reg *
1109 fs_visitor::variable_storage(ir_variable *var)
1110 {
1111 return (fs_reg *)hash_table_find(this->variable_ht, var);
1112 }
1113
1114 void
1115 import_uniforms_callback(const void *key,
1116 void *data,
1117 void *closure)
1118 {
1119 struct hash_table *dst_ht = (struct hash_table *)closure;
1120 const fs_reg *reg = (const fs_reg *)data;
1121
1122 if (reg->file != UNIFORM)
1123 return;
1124
1125 hash_table_insert(dst_ht, data, key);
1126 }
1127
1128 /* For SIMD16, we need to follow the uniform setup done for the SIMD8
1129 * dispatch. This brings in those uniform definitions.
1130 */
1131 void
1132 fs_visitor::import_uniforms(fs_visitor *v)
1133 {
1134 hash_table_call_foreach(v->variable_ht,
1135 import_uniforms_callback,
1136 variable_ht);
1137 this->push_constant_loc = v->push_constant_loc;
1138 this->pull_constant_loc = v->pull_constant_loc;
1139 this->uniforms = v->uniforms;
1140 this->param_size = v->param_size;
1141 }
1142
1143 /* Our support for uniforms is piggy-backed on the struct
1144 * gl_fragment_program, because that's where the values actually
1145 * get stored, rather than in some global gl_shader_program uniform
1146 * store.
1147 */
1148 void
1149 fs_visitor::setup_uniform_values(ir_variable *ir)
1150 {
1151 int namelen = strlen(ir->name);
1152
1153 /* The data for our (non-builtin) uniforms is stored in a series of
1154 * gl_uniform_driver_storage structs for each subcomponent that
1155 * glGetUniformLocation() could name. We know it's been set up in the same
1156 * order we'd walk the type, so walk the list of storage and find anything
1157 * with our name, or the prefix of a component that starts with our name.
1158 */
1159 unsigned params_before = uniforms;
1160 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1161 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1162
1163 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1164 (storage->name[namelen] != 0 &&
1165 storage->name[namelen] != '.' &&
1166 storage->name[namelen] != '[')) {
1167 continue;
1168 }
1169
1170 unsigned slots = storage->type->component_slots();
1171 if (storage->array_elements)
1172 slots *= storage->array_elements;
1173
1174 for (unsigned i = 0; i < slots; i++) {
1175 stage_prog_data->param[uniforms++] = &storage->storage[i];
1176 }
1177 }
1178
1179 /* Make sure we actually initialized the right amount of stuff here. */
1180 assert(params_before + ir->type->component_slots() == uniforms);
1181 (void)params_before;
1182 }
1183
1184
1185 /* Our support for builtin uniforms is even scarier than non-builtin.
1186 * It sits on top of the PROG_STATE_VAR parameters that are
1187 * automatically updated from GL context state.
1188 */
1189 void
1190 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1191 {
1192 const ir_state_slot *const slots = ir->get_state_slots();
1193 assert(slots != NULL);
1194
1195 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1196 /* This state reference has already been set up by ir_to_mesa, but we'll
1197 * get the same index back here.
1198 */
1199 int index = _mesa_add_state_reference(this->prog->Parameters,
1200 (gl_state_index *)slots[i].tokens);
1201
1202 /* Add each of the unique swizzles of the element as a parameter.
1203 * This'll end up matching the expected layout of the
1204 * array/matrix/structure we're trying to fill in.
1205 */
1206 int last_swiz = -1;
1207 for (unsigned int j = 0; j < 4; j++) {
1208 int swiz = GET_SWZ(slots[i].swizzle, j);
1209 if (swiz == last_swiz)
1210 break;
1211 last_swiz = swiz;
1212
1213 stage_prog_data->param[uniforms++] =
1214 &prog->Parameters->ParameterValues[index][swiz];
1215 }
1216 }
1217 }
1218
1219 fs_reg *
1220 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1221 bool origin_upper_left)
1222 {
1223 assert(stage == MESA_SHADER_FRAGMENT);
1224 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1225 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1226 fs_reg wpos = *reg;
1227 bool flip = !origin_upper_left ^ key->render_to_fbo;
1228
1229 /* gl_FragCoord.x */
1230 if (pixel_center_integer) {
1231 emit(MOV(wpos, this->pixel_x));
1232 } else {
1233 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1234 }
1235 wpos = offset(wpos, 1);
1236
1237 /* gl_FragCoord.y */
1238 if (!flip && pixel_center_integer) {
1239 emit(MOV(wpos, this->pixel_y));
1240 } else {
1241 fs_reg pixel_y = this->pixel_y;
1242 float offset = (pixel_center_integer ? 0.0 : 0.5);
1243
1244 if (flip) {
1245 pixel_y.negate = true;
1246 offset += key->drawable_height - 1.0;
1247 }
1248
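/* Added note: in the flipped case pixel_y carries a negate modifier, so the
 * ADD below evaluates to (drawable_height - 1 + the 0.0/0.5 center offset)
 * - pixel_y, i.e. a mirrored Y coordinate.
 */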
1249 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1250 }
1251 wpos = offset(wpos, 1);
1252
1253 /* gl_FragCoord.z */
1254 if (brw->gen >= 6) {
1255 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1256 } else {
1257 emit(FS_OPCODE_LINTERP, wpos,
1258 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1259 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1260 interp_reg(VARYING_SLOT_POS, 2));
1261 }
1262 wpos = offset(wpos, 1);
1263
1264 /* gl_FragCoord.w: Already set up in emit_interpolation */
1265 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1266
1267 return reg;
1268 }
1269
1270 fs_inst *
1271 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1272 glsl_interp_qualifier interpolation_mode,
1273 bool is_centroid, bool is_sample)
1274 {
1275 brw_wm_barycentric_interp_mode barycoord_mode;
1276 if (brw->gen >= 6) {
1277 if (is_centroid) {
1278 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1279 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1280 else
1281 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1282 } else if (is_sample) {
1283 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1284 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1285 else
1286 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1287 } else {
1288 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1289 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1290 else
1291 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1292 }
1293 } else {
1294 /* On Ironlake and below, there is only one interpolation mode.
1295 * Centroid interpolation doesn't mean anything on this hardware --
1296 * there is no multisampling.
1297 */
1298 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1299 }
1300 return emit(FS_OPCODE_LINTERP, attr,
1301 this->delta_x[barycoord_mode],
1302 this->delta_y[barycoord_mode], interp);
1303 }
1304
1305 void
1306 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1307 const glsl_type *type,
1308 glsl_interp_qualifier interpolation_mode,
1309 int location, bool mod_centroid,
1310 bool mod_sample)
1311 {
1312 attr.type = brw_type_for_base_type(type->get_scalar_type());
1313
1314 assert(stage == MESA_SHADER_FRAGMENT);
1315 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1316 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1317
1318 unsigned int array_elements;
1319
1320 if (type->is_array()) {
1321 array_elements = type->length;
1322 if (array_elements == 0) {
1323 fail("dereferenced array '%s' has length 0\n", name);
1324 }
1325 type = type->fields.array;
1326 } else {
1327 array_elements = 1;
1328 }
1329
1330 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1331 bool is_gl_Color =
1332 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1333 if (key->flat_shade && is_gl_Color) {
1334 interpolation_mode = INTERP_QUALIFIER_FLAT;
1335 } else {
1336 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1337 }
1338 }
1339
1340 for (unsigned int i = 0; i < array_elements; i++) {
1341 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1342 if (prog_data->urb_setup[location] == -1) {
1343 /* If there's no incoming setup data for this slot, don't
1344 * emit interpolation for it.
1345 */
1346 attr = offset(attr, type->vector_elements);
1347 location++;
1348 continue;
1349 }
1350
1351 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1352 /* Constant interpolation (flat shading) case. The SF has
1353 * handed us defined values in only the constant offset
1354 * field of the setup reg.
1355 */
1356 for (unsigned int k = 0; k < type->vector_elements; k++) {
1357 struct brw_reg interp = interp_reg(location, k);
1358 interp = suboffset(interp, 3);
1359 interp.type = attr.type;
1360 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1361 attr = offset(attr, 1);
1362 }
1363 } else {
1364 /* Smooth/noperspective interpolation case. */
1365 for (unsigned int k = 0; k < type->vector_elements; k++) {
1366 struct brw_reg interp = interp_reg(location, k);
1367 if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1368 /* Get the pixel/sample mask into f0 so that we know
1369 * which pixels are lit. Then, for each channel that is
1370 * unlit, replace the centroid data with non-centroid
1371 * data.
1372 */
1373 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1374
1375 fs_inst *inst;
1376 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1377 false, false);
1378 inst->predicate = BRW_PREDICATE_NORMAL;
1379 inst->predicate_inverse = true;
1380 if (brw->has_pln)
1381 inst->no_dd_clear = true;
1382
1383 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1384 mod_centroid && !key->persample_shading,
1385 mod_sample || key->persample_shading);
1386 inst->predicate = BRW_PREDICATE_NORMAL;
1387 inst->predicate_inverse = false;
1388 if (brw->has_pln)
1389 inst->no_dd_check = true;
1390
1391 } else {
1392 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1393 mod_centroid && !key->persample_shading,
1394 mod_sample || key->persample_shading);
1395 }
1396 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1397 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1398 }
1399 attr = offset(attr, 1);
1400 }
1401
1402 }
1403 location++;
1404 }
1405 }
1406 }
1407
1408 fs_reg *
1409 fs_visitor::emit_frontfacing_interpolation()
1410 {
1411 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1412
1413 if (brw->gen >= 6) {
1414 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1415 * a boolean result from this (~0/true or 0/false).
1416 *
1417 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1418 * this task in only one instruction:
1419 * - a negation source modifier will flip the bit; and
1420 * - a W -> D type conversion will sign extend the bit into the high
1421 * word of the destination.
1422 *
1423 * An ASR 15 fills the low word of the destination.
1424 */
1425 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1426 g0.negate = true;
1427
1428 emit(ASR(*reg, g0, fs_reg(15)));
1429 } else {
1430 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1431 * a boolean result from this (1/true or 0/false).
1432 *
1433 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1434 * the negation source modifier to flip it. Unfortunately the SHR
1435 * instruction only operates on UD (or D with an abs source modifier)
1436 * sources without negation.
1437 *
1438 * Instead, use ASR (which will give ~0/true or 0/false).
1439 */
1440 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1441 g1_6.negate = true;
1442
1443 emit(ASR(*reg, g1_6, fs_reg(31)));
1444 }
1445
1446 return reg;
1447 }
1448
1449 void
1450 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1451 {
1452 assert(stage == MESA_SHADER_FRAGMENT);
1453 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1454 assert(dst.type == BRW_REGISTER_TYPE_F);
1455
1456 if (key->compute_pos_offset) {
1457 /* Convert int_sample_pos to floating point */
1458 emit(MOV(dst, int_sample_pos));
1459 /* Scale to the range [0, 1] */
1460 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1461 }
1462 else {
1463 /* From ARB_sample_shading specification:
1464 * "When rendering to a non-multisample buffer, or if multisample
1465 * rasterization is disabled, gl_SamplePosition will always be
1466 * (0.5, 0.5).
1467 */
1468 emit(MOV(dst, fs_reg(0.5f)));
1469 }
1470 }
1471
1472 fs_reg *
1473 fs_visitor::emit_samplepos_setup()
1474 {
1475 assert(brw->gen >= 6);
1476
1477 this->current_annotation = "compute sample position";
1478 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1479 fs_reg pos = *reg;
1480 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1481 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1482
1483 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1484 * mode will be enabled.
1485 *
1486 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1487 * R31.1:0 Position Offset X/Y for Slot[3:0]
1488 * R31.3:2 Position Offset X/Y for Slot[7:4]
1489 * .....
1490 *
1491 * The X, Y sample positions come in as bytes in the thread payload. So, read
1492 * the positions using vstride=16, width=8, hstride=2.
1493 */
1494 struct brw_reg sample_pos_reg =
1495 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1496 BRW_REGISTER_TYPE_B), 16, 8, 2);
1497
1498 if (dispatch_width == 8) {
1499 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1500 } else {
1501 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1502 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1503 ->force_sechalf = true;
1504 }
1505 /* Compute gl_SamplePosition.x */
1506 compute_sample_position(pos, int_sample_x);
1507 pos = offset(pos, 1);
1508 if (dispatch_width == 8) {
1509 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1510 } else {
1511 emit(MOV(half(int_sample_y, 0),
1512 fs_reg(suboffset(sample_pos_reg, 1))));
1513 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1514 ->force_sechalf = true;
1515 }
1516 /* Compute gl_SamplePosition.y */
1517 compute_sample_position(pos, int_sample_y);
1518 return reg;
1519 }
1520
1521 fs_reg *
1522 fs_visitor::emit_sampleid_setup()
1523 {
1524 assert(stage == MESA_SHADER_FRAGMENT);
1525 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1526 assert(brw->gen >= 6);
1527
1528 this->current_annotation = "compute sample id";
1529 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1530
1531 if (key->compute_sample_id) {
1532 fs_reg t1 = vgrf(glsl_type::int_type);
1533 fs_reg t2 = vgrf(glsl_type::int_type);
1534 t2.type = BRW_REGISTER_TYPE_UW;
1535
1536 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1537 * 8x multisampling, subspan 0 will represent sample N (where N
1538 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1539 * 7. We can find the value of N by looking at R0.0 bits 7:6
1540 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1541 * (since samples are always delivered in pairs). That is, we
1542 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1543 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1544 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1545 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1546 * populating a temporary variable with the sequence (0, 1, 2, 3),
1547 * and then reading from it using vstride=1, width=4, hstride=0.
1548 * These computations hold good for 4x multisampling as well.
1549 *
1550 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1551 * the first four slots are sample 0 of subspan 0; the next four
1552 * are sample 1 of subspan 0; the third group is sample 0 of
1553 * subspan 1, and finally sample 1 of subspan 1.
1554 */
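/* Added arithmetic check of the SSPI math above: if R0.0 bits 7:6 hold
 * SSPI == 2, then R0.0 & 0xc0 == 0x80 and 0x80 >> 5 == 4, i.e. subspan 0
 * represents sample 4.
 */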
1555 fs_inst *inst;
1556 inst = emit(BRW_OPCODE_AND, t1,
1557 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1558 fs_reg(0xc0));
1559 inst->force_writemask_all = true;
1560 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1561 inst->force_writemask_all = true;
1562 /* This works for both SIMD8 and SIMD16 */
1563 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1564 inst->force_writemask_all = true;
1565 /* This special instruction takes care of setting vstride=1,
1566 * width=4, hstride=0 of t2 during an ADD instruction.
1567 */
1568 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1569 } else {
1570 /* As per GL_ARB_sample_shading specification:
1571 * "When rendering to a non-multisample buffer, or if multisample
1572 * rasterization is disabled, gl_SampleID will always be zero."
1573 */
1574 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1575 }
1576
1577 return reg;
1578 }
1579
1580 void
1581 fs_visitor::resolve_source_modifiers(fs_reg *src)
1582 {
1583 if (!src->abs && !src->negate)
1584 return;
1585
1586 fs_reg temp = retype(vgrf(1), src->type);
1587 emit(MOV(temp, *src));
1588 *src = temp;
1589 }
1590
1591 fs_reg
1592 fs_visitor::fix_math_operand(fs_reg src)
1593 {
1594 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1595 * might be able to do better by doing execsize = 1 math and then
1596 * expanding that result out, but we would need to be careful with
1597 * masking.
1598 *
1599 * The hardware ignores source modifiers (negate and abs) on math
1600 * instructions, so we also move to a temp to set those up.
1601 */
1602 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1603 !src.abs && !src.negate)
1604 return src;
1605
1606 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1607 * operands to math
1608 */
1609 if (brw->gen >= 7 && src.file != IMM)
1610 return src;
1611
1612 fs_reg expanded = vgrf(glsl_type::float_type);
1613 expanded.type = src.type;
1614 emit(BRW_OPCODE_MOV, expanded, src);
1615 return expanded;
1616 }
1617
1618 fs_inst *
1619 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1620 {
1621 switch (opcode) {
1622 case SHADER_OPCODE_RCP:
1623 case SHADER_OPCODE_RSQ:
1624 case SHADER_OPCODE_SQRT:
1625 case SHADER_OPCODE_EXP2:
1626 case SHADER_OPCODE_LOG2:
1627 case SHADER_OPCODE_SIN:
1628 case SHADER_OPCODE_COS:
1629 break;
1630 default:
1631 unreachable("not reached: bad math opcode");
1632 }
1633
1634 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1635 * might be able to do better by doing execsize = 1 math and then
1636 * expanding that result out, but we would need to be careful with
1637 * masking.
1638 *
1639 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1640 * instructions, so we also move to a temp to set those up.
1641 */
1642 if (brw->gen == 6 || brw->gen == 7)
1643 src = fix_math_operand(src);
1644
1645 fs_inst *inst = emit(opcode, dst, src);
1646
1647 if (brw->gen < 6) {
1648 inst->base_mrf = 2;
1649 inst->mlen = dispatch_width / 8;
1650 }
1651
1652 return inst;
1653 }
1654
1655 fs_inst *
1656 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1657 {
1658 int base_mrf = 2;
1659 fs_inst *inst;
1660
1661 if (brw->gen >= 8) {
1662 inst = emit(opcode, dst, src0, src1);
1663 } else if (brw->gen >= 6) {
1664 src0 = fix_math_operand(src0);
1665 src1 = fix_math_operand(src1);
1666
1667 inst = emit(opcode, dst, src0, src1);
1668 } else {
1669 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1670 * "Message Payload":
1671 *
1672 * "Operand0[7]. For the INT DIV functions, this operand is the
1673 * denominator."
1674 * ...
1675 * "Operand1[7]. For the INT DIV functions, this operand is the
1676 * numerator."
1677 */
1678 bool is_int_div = opcode != SHADER_OPCODE_POW;
1679 fs_reg &op0 = is_int_div ? src1 : src0;
1680 fs_reg &op1 = is_int_div ? src0 : src1;
1681
1682 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1683 inst = emit(opcode, dst, op0, reg_null_f);
1684
1685 inst->base_mrf = base_mrf;
1686 inst->mlen = 2 * dispatch_width / 8;
1687 }
1688 return inst;
1689 }
1690
1691 void
1692 fs_visitor::assign_curb_setup()
1693 {
1694 if (dispatch_width == 8) {
1695 prog_data->dispatch_grf_start_reg = payload.num_regs;
1696 } else {
1697 assert(stage == MESA_SHADER_FRAGMENT);
1698 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1699 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1700 }
1701
1702 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1703
1704 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1705 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1706 for (unsigned int i = 0; i < inst->sources; i++) {
1707 if (inst->src[i].file == UNIFORM) {
1708 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1709 int constant_nr;
1710 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1711 constant_nr = push_constant_loc[uniform_nr];
1712 } else {
1713 /* Section 5.11 of the OpenGL 4.1 spec says:
1714 * "Out-of-bounds reads return undefined values, which include
1715 * values from other variables of the active program or zero."
1716 * Just return the first push constant.
1717 */
1718 constant_nr = 0;
1719 }
1720
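/* Added note: eight dword-sized push constants fit in one 32-byte GRF, so
 * constant_nr maps to GRF (payload.num_regs + constant_nr / 8), subregister
 * constant_nr % 8.
 */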
1721 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1722 constant_nr / 8,
1723 constant_nr % 8);
1724
1725 inst->src[i].file = HW_REG;
1726 inst->src[i].fixed_hw_reg = byte_offset(
1727 retype(brw_reg, inst->src[i].type),
1728 inst->src[i].subreg_offset);
1729 }
1730 }
1731 }
1732 }
1733
1734 void
1735 fs_visitor::calculate_urb_setup()
1736 {
1737 assert(stage == MESA_SHADER_FRAGMENT);
1738 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1739 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1740
1741 memset(prog_data->urb_setup, -1,
1742 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1743
1744 int urb_next = 0;
1745 /* Figure out where each of the incoming setup attributes lands. */
1746 if (brw->gen >= 6) {
1747 if (_mesa_bitcount_64(prog->InputsRead &
1748 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1749 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1750 * first 16 varying inputs, so we can put them wherever we want.
1751 * Just put them in order.
1752 *
1753 * This is useful because it means that (a) inputs not used by the
1754 * fragment shader won't take up valuable register space, and (b) we
1755 * won't have to recompile the fragment shader if it gets paired with
1756 * a different vertex (or geometry) shader.
1757 */
1758 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1759 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1760 BITFIELD64_BIT(i)) {
1761 prog_data->urb_setup[i] = urb_next++;
1762 }
1763 }
1764 } else {
1765 /* We have enough input varyings that the SF/SBE pipeline stage can't
1766 * arbitrarily rearrange them to suit our whim; we have to put them
1767 * in an order that matches the output of the previous pipeline stage
1768 * (geometry or vertex shader).
1769 */
1770 struct brw_vue_map prev_stage_vue_map;
1771 brw_compute_vue_map(brw, &prev_stage_vue_map,
1772 key->input_slots_valid);
1773 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1774 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1775 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1776 slot++) {
1777 int varying = prev_stage_vue_map.slot_to_varying[slot];
1778 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1779 * unused.
1780 */
1781 if (varying != BRW_VARYING_SLOT_COUNT &&
1782 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1783 BITFIELD64_BIT(varying))) {
1784 prog_data->urb_setup[varying] = slot - first_slot;
1785 }
1786 }
1787 urb_next = prev_stage_vue_map.num_slots - first_slot;
1788 }
1789 } else {
1790 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1791 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1792 /* Point size is packed into the header, not as a general attribute */
1793 if (i == VARYING_SLOT_PSIZ)
1794 continue;
1795
1796 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1797 /* The back color slot is skipped when the front color is
1798 * also written to. In addition, some slots can be
1799 * written in the vertex shader and not read in the
1800 * fragment shader. So the register number must always be
1801 * incremented, mapped or not.
1802 */
1803 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1804 prog_data->urb_setup[i] = urb_next;
1805 urb_next++;
1806 }
1807 }
1808
1809 /*
1810 * It's an FS-only attribute, and we did interpolation for this attribute
1811 * in the SF thread. So, count it here, too.
1812 *
1813 * See compile_sf_prog() for more info.
1814 */
1815 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1816 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1817 }
1818
1819 prog_data->num_varying_inputs = urb_next;
1820 }
1821
1822 void
1823 fs_visitor::assign_urb_setup()
1824 {
1825 assert(stage == MESA_SHADER_FRAGMENT);
1826 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1827
1828 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1829
1830 /* Offset all the urb_setup[] indices by the actual position of the
1831 * setup regs, now that the location of the constants has been chosen.
1832 */
1833 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1834 if (inst->opcode == FS_OPCODE_LINTERP) {
1835 assert(inst->src[2].file == HW_REG);
1836 inst->src[2].fixed_hw_reg.nr += urb_start;
1837 }
1838
1839 if (inst->opcode == FS_OPCODE_CINTERP) {
1840 assert(inst->src[0].file == HW_REG);
1841 inst->src[0].fixed_hw_reg.nr += urb_start;
1842 }
1843 }
1844
1845 /* Each attribute is 4 setup channels, each of which is half a reg. */
1846 this->first_non_payload_grf =
1847 urb_start + prog_data->num_varying_inputs * 2;
1848 }
1849
1850 void
1851 fs_visitor::assign_vs_urb_setup()
1852 {
1853 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1854 int grf, count, slot, channel, attr;
1855
1856 assert(stage == MESA_SHADER_VERTEX);
1857 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1858 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1859 count++;
1860
1861 /* Each attribute is 4 regs. */
1862 this->first_non_payload_grf =
1863 payload.num_regs + prog_data->curb_read_length + count * 4;
1864
1865 unsigned vue_entries =
1866 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1867
1868 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1869 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1870
1871 assert(vs_prog_data->base.urb_read_length <= 15);
1872
1873 /* Rewrite all ATTR file references to the hw grf that they land in. */
1874 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1875 for (int i = 0; i < inst->sources; i++) {
1876 if (inst->src[i].file == ATTR) {
1877
1878 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1879 slot = count - 1;
1880 } else {
1881 /* Attributes come in as a contiguous block, ordered by their
1882 * gl_vert_attrib value. That means we can compute the slot
1883 * number for an attribute by masking out the enabled
1884 * attributes before it and counting the bits.
1885 */
1886 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1887 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1888 BITFIELD64_MASK(attr));
1889 }
1890
1891 channel = inst->src[i].reg_offset & 3;
1892
1893 grf = payload.num_regs +
1894 prog_data->curb_read_length +
1895 slot * 4 + channel;
1896
1897 inst->src[i].file = HW_REG;
1898 inst->src[i].fixed_hw_reg =
1899 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1900 }
1901 }
1902 }
1903 }
1904
1905 /**
1906 * Split large virtual GRFs into separate components if we can.
1907 *
1908  * This largely duplicates what brw_fs_vector_splitting does,
1909 * but that's really conservative because it's afraid of doing
1910 * splitting that doesn't result in real progress after the rest of
1911 * the optimization phases, which would cause infinite looping in
1912 * optimization. We can do it once here, safely. This also has the
1913 * opportunity to split interpolated values, or maybe even uniforms,
1914 * which we don't have at the IR level.
1915 *
1916 * We want to split, because virtual GRFs are what we register
1917 * allocate and spill (due to contiguousness requirements for some
1918 * instructions), and they're what we naturally generate in the
1919 * codegen process, but most virtual GRFs don't actually need to be
1920 * contiguous sets of GRFs. If we split, we'll end up with reduced
1921 * live intervals and better dead code elimination and coalescing.
1922 */
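/* Illustrative example (hypothetical): a size-4 VGRF whose registers are only
 * ever written and read one at a time keeps all of its split points set, so it
 * becomes four independent size-1 VGRFs.  If some instruction instead writes
 * registers 0-1 of it as a pair, that boundary is marked inseparable and the
 * result is one size-2 VGRF plus two size-1 VGRFs.
 */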
1923 void
1924 fs_visitor::split_virtual_grfs()
1925 {
1926 int num_vars = this->alloc.count;
1927
1928 /* Count the total number of registers */
1929 int reg_count = 0;
1930 int vgrf_to_reg[num_vars];
1931 for (int i = 0; i < num_vars; i++) {
1932 vgrf_to_reg[i] = reg_count;
1933 reg_count += alloc.sizes[i];
1934 }
1935
1936 /* An array of "split points". For each register slot, this indicates
1937 * if this slot can be separated from the previous slot. Every time an
1938 * instruction uses multiple elements of a register (as a source or
1939 * destination), we mark the used slots as inseparable. Then we go
1940 * through and split the registers into the smallest pieces we can.
1941 */
1942 bool split_points[reg_count];
1943 memset(split_points, 0, sizeof(split_points));
1944
1945 /* Mark all used registers as fully splittable */
1946 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1947 if (inst->dst.file == GRF) {
1948 int reg = vgrf_to_reg[inst->dst.reg];
1949 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1950 split_points[reg + j] = true;
1951 }
1952
1953 for (int i = 0; i < inst->sources; i++) {
1954 if (inst->src[i].file == GRF) {
1955 int reg = vgrf_to_reg[inst->src[i].reg];
1956 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1957 split_points[reg + j] = true;
1958 }
1959 }
1960 }
1961
1962 if (brw->has_pln &&
1963 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1964 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1965 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1966 * Gen6, that was the only supported interpolation mode, and since Gen6,
1967 * delta_x and delta_y are in fixed hardware registers.
1968 */
1969 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1970 split_points[vgrf_to_reg[vgrf] + 1] = false;
1971 }
1972
1973 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1974 if (inst->dst.file == GRF) {
1975 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1976 for (int j = 1; j < inst->regs_written; j++)
1977 split_points[reg + j] = false;
1978 }
1979 for (int i = 0; i < inst->sources; i++) {
1980 if (inst->src[i].file == GRF) {
1981 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1982 for (int j = 1; j < inst->regs_read(i); j++)
1983 split_points[reg + j] = false;
1984 }
1985 }
1986 }
1987
1988 int new_virtual_grf[reg_count];
1989 int new_reg_offset[reg_count];
1990
1991 int reg = 0;
1992 for (int i = 0; i < num_vars; i++) {
1993       /* The first slot of a VGRF should never be a split point; sanity check. */
1994 assert(split_points[reg] == false);
1995
1996 /* j = 0 case */
1997 new_reg_offset[reg] = 0;
1998 reg++;
1999 int offset = 1;
2000
2001 /* j > 0 case */
2002 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2003          /* If this is a split point, allocate a new virtual GRF covering the
2004           * previous "offset" registers, then reset the offset to 0.
2005 */
2006 if (split_points[reg]) {
2007 assert(offset <= MAX_VGRF_SIZE);
2008 int grf = alloc.allocate(offset);
2009 for (int k = reg - offset; k < reg; k++)
2010 new_virtual_grf[k] = grf;
2011 offset = 0;
2012 }
2013 new_reg_offset[reg] = offset;
2014 offset++;
2015 reg++;
2016 }
2017
2018 /* The last one gets the original register number */
2019 assert(offset <= MAX_VGRF_SIZE);
2020 alloc.sizes[i] = offset;
2021 for (int k = reg - offset; k < reg; k++)
2022 new_virtual_grf[k] = i;
2023 }
2024 assert(reg == reg_count);
2025
2026 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2027 if (inst->dst.file == GRF) {
2028 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2029 inst->dst.reg = new_virtual_grf[reg];
2030 inst->dst.reg_offset = new_reg_offset[reg];
2031 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2032 }
2033 for (int i = 0; i < inst->sources; i++) {
2034 if (inst->src[i].file == GRF) {
2035 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2036 inst->src[i].reg = new_virtual_grf[reg];
2037 inst->src[i].reg_offset = new_reg_offset[reg];
2038 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2039 }
2040 }
2041 }
2042 invalidate_live_intervals();
2043 }
2044
2045 /**
2046 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2047 *
2048 * During code generation, we create tons of temporary variables, many of
2049 * which get immediately killed and are never used again. Yet, in later
2050 * optimization and analysis passes, such as compute_live_intervals, we need
2051 * to loop over all the virtual GRFs. Compacting them can save a lot of
2052 * overhead.
2053 */
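/* Illustrative example (hypothetical): with four VGRFs where only vgrf0 and
 * vgrf2 are still referenced, remap_table ends up as {0, -1, 1, -1}; vgrf2 is
 * renumbered to vgrf1, alloc.count drops from 4 to 2, and every instruction
 * (plus delta_x/delta_y) is patched below to use the new numbers.
 */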
2054 bool
2055 fs_visitor::compact_virtual_grfs()
2056 {
2057 bool progress = false;
2058 int remap_table[this->alloc.count];
2059 memset(remap_table, -1, sizeof(remap_table));
2060
2061 /* Mark which virtual GRFs are used. */
2062 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2063 if (inst->dst.file == GRF)
2064 remap_table[inst->dst.reg] = 0;
2065
2066 for (int i = 0; i < inst->sources; i++) {
2067 if (inst->src[i].file == GRF)
2068 remap_table[inst->src[i].reg] = 0;
2069 }
2070 }
2071
2072 /* Compact the GRF arrays. */
2073 int new_index = 0;
2074 for (unsigned i = 0; i < this->alloc.count; i++) {
2075 if (remap_table[i] == -1) {
2076 /* We just found an unused register. This means that we are
2077 * actually going to compact something.
2078 */
2079 progress = true;
2080 } else {
2081 remap_table[i] = new_index;
2082 alloc.sizes[new_index] = alloc.sizes[i];
2083 invalidate_live_intervals();
2084 ++new_index;
2085 }
2086 }
2087
2088 this->alloc.count = new_index;
2089
2090 /* Patch all the instructions to use the newly renumbered registers */
2091 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2092 if (inst->dst.file == GRF)
2093 inst->dst.reg = remap_table[inst->dst.reg];
2094
2095 for (int i = 0; i < inst->sources; i++) {
2096 if (inst->src[i].file == GRF)
2097 inst->src[i].reg = remap_table[inst->src[i].reg];
2098 }
2099 }
2100
2101 /* Patch all the references to delta_x/delta_y, since they're used in
2102 * register allocation. If they're unused, switch them to BAD_FILE so
2103 * we don't think some random VGRF is delta_x/delta_y.
2104 */
2105 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2106 if (delta_x[i].file == GRF) {
2107 if (remap_table[delta_x[i].reg] != -1) {
2108 delta_x[i].reg = remap_table[delta_x[i].reg];
2109 } else {
2110 delta_x[i].file = BAD_FILE;
2111 }
2112 }
2113 }
2114 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2115 if (delta_y[i].file == GRF) {
2116 if (remap_table[delta_y[i].reg] != -1) {
2117 delta_y[i].reg = remap_table[delta_y[i].reg];
2118 } else {
2119 delta_y[i].file = BAD_FILE;
2120 }
2121 }
2122 }
2123
2124 return progress;
2125 }
2126
2127 /*
2128 * Implements array access of uniforms by inserting a
2129 * PULL_CONSTANT_LOAD instruction.
2130 *
2131  * Unlike temporary GRF array access (which we don't support, due to
2132 * the difficulty of doing relative addressing on instruction
2133 * destinations), we could potentially do array access of uniforms
2134 * that were loaded in GRF space as push constants. In real-world
2135 * usage we've seen, though, the arrays being used are always larger
2136 * than we could load as push constants, so just always move all
2137 * uniform array access out to a pull constant buffer.
2138 */
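/* Illustrative sketch (hypothetical GLSL): for "uniform vec4 kernel[32];"
 * accessed as kernel[i], every UNIFORM source carrying a reladdr causes all
 * param_size[] entries of that array to be copied into pull_param[], and
 * pull_constant_loc[] records where each element now lives so that
 * demote_pull_constants() can later rewrite the access into a
 * VARYING_PULL_CONSTANT_LOAD.
 */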
2139 void
2140 fs_visitor::move_uniform_array_access_to_pull_constants()
2141 {
2142 if (dispatch_width != 8)
2143 return;
2144
2145 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2146 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2147
2148 /* Walk through and find array access of uniforms. Put a copy of that
2149 * uniform in the pull constant buffer.
2150 *
2151 * Note that we don't move constant-indexed accesses to arrays. No
2152 * testing has been done of the performance impact of this choice.
2153 */
2154 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2155 for (int i = 0 ; i < inst->sources; i++) {
2156 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2157 continue;
2158
2159 int uniform = inst->src[i].reg;
2160
2161 /* If this array isn't already present in the pull constant buffer,
2162 * add it.
2163 */
2164 if (pull_constant_loc[uniform] == -1) {
2165 const gl_constant_value **values = &stage_prog_data->param[uniform];
2166
2167 assert(param_size[uniform]);
2168
2169 for (int j = 0; j < param_size[uniform]; j++) {
2170 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2171
2172 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2173 values[j];
2174 }
2175 }
2176 }
2177 }
2178 }
2179
2180 /**
2181 * Assign UNIFORM file registers to either push constants or pull constants.
2182 *
2183  * We allow a fragment shader to use more than the GL-specified minimum
2184  * value for the maximum number of fragment shader uniform components (64).
2185  * If there are too many of these, they'd fill up all of the register
2186  * space. So this pass pushes some of them out to the pull constant buffer
2187  * and updates the program to load them from there.
2188 */
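/* Illustrative example (hypothetical): a shader with 200 live uniform
 * components keeps the first 128 of them (16 GRFs) as push constants in
 * param[], while the remaining 72 get pull_constant_loc[] entries and are
 * appended to pull_param[] for loading through the pull constant buffer.
 */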
2189 void
2190 fs_visitor::assign_constant_locations()
2191 {
2192 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2193 if (dispatch_width != 8)
2194 return;
2195
2196 /* Find which UNIFORM registers are still in use. */
2197 bool is_live[uniforms];
2198 for (unsigned int i = 0; i < uniforms; i++) {
2199 is_live[i] = false;
2200 }
2201
2202 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2203 for (int i = 0; i < inst->sources; i++) {
2204 if (inst->src[i].file != UNIFORM)
2205 continue;
2206
2207 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2208 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2209 is_live[constant_nr] = true;
2210 }
2211 }
2212
2213 /* Only allow 16 registers (128 uniform components) as push constants.
2214 *
2215 * Just demote the end of the list. We could probably do better
2216 * here, demoting things that are rarely used in the program first.
2217 *
2218 * If changing this value, note the limitation about total_regs in
2219 * brw_curbe.c.
2220 */
2221 unsigned int max_push_components = 16 * 8;
2222 unsigned int num_push_constants = 0;
2223
2224 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2225
2226 for (unsigned int i = 0; i < uniforms; i++) {
2227 if (!is_live[i] || pull_constant_loc[i] != -1) {
2228 /* This UNIFORM register is either dead, or has already been demoted
2229 * to a pull const. Mark it as no longer living in the param[] array.
2230 */
2231 push_constant_loc[i] = -1;
2232 continue;
2233 }
2234
2235 if (num_push_constants < max_push_components) {
2236 /* Retain as a push constant. Record the location in the params[]
2237 * array.
2238 */
2239 push_constant_loc[i] = num_push_constants++;
2240 } else {
2241 /* Demote to a pull constant. */
2242 push_constant_loc[i] = -1;
2243
2244 int pull_index = stage_prog_data->nr_pull_params++;
2245 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2246 pull_constant_loc[i] = pull_index;
2247 }
2248 }
2249
2250 stage_prog_data->nr_params = num_push_constants;
2251
2252 /* Up until now, the param[] array has been indexed by reg + reg_offset
2253 * of UNIFORM registers. Condense it to only contain the uniforms we
2254 * chose to upload as push constants.
2255 */
2256 for (unsigned int i = 0; i < uniforms; i++) {
2257 int remapped = push_constant_loc[i];
2258
2259 if (remapped == -1)
2260 continue;
2261
2262 assert(remapped <= (int)i);
2263 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2264 }
2265 }
2266
2267 /**
2268 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2269 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2270 */
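/* Illustrative sketch (hypothetical): a constant-indexed access to a demoted
 * uniform becomes a UNIFORM_PULL_CONSTANT_LOAD of the aligned vec4 containing
 * it plus set_smear() to pick the right component, while an indirect
 * (reladdr) access becomes a VARYING_PULL_CONSTANT_LOAD sequence; in both
 * cases the UNIFORM source is then rewritten to read the new temporary VGRF.
 */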
2271 void
2272 fs_visitor::demote_pull_constants()
2273 {
2274 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2275 for (int i = 0; i < inst->sources; i++) {
2276 if (inst->src[i].file != UNIFORM)
2277 continue;
2278
2279 int pull_index = pull_constant_loc[inst->src[i].reg +
2280 inst->src[i].reg_offset];
2281 if (pull_index == -1)
2282 continue;
2283
2284          /* Set up the annotation tracking for newly generated instructions. */
2285 base_ir = inst->ir;
2286 current_annotation = inst->annotation;
2287
2288 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2289 fs_reg dst = vgrf(glsl_type::float_type);
2290
2291 /* Generate a pull load into dst. */
2292 if (inst->src[i].reladdr) {
2293 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2294 surf_index,
2295 *inst->src[i].reladdr,
2296 pull_index);
2297 inst->insert_before(block, &list);
2298 inst->src[i].reladdr = NULL;
2299 } else {
2300 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2301 fs_inst *pull =
2302 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2303 dst, surf_index, offset);
2304 inst->insert_before(block, pull);
2305 inst->src[i].set_smear(pull_index & 3);
2306 }
2307
2308 /* Rewrite the instruction to use the temporary VGRF. */
2309 inst->src[i].file = GRF;
2310 inst->src[i].reg = dst.reg;
2311 inst->src[i].reg_offset = 0;
2312 inst->src[i].width = dispatch_width;
2313 }
2314 }
2315 invalidate_live_intervals();
2316 }
2317
2318 bool
2319 fs_visitor::opt_algebraic()
2320 {
2321 bool progress = false;
2322
2323 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2324 switch (inst->opcode) {
2325 case BRW_OPCODE_MOV:
2326 if (inst->src[0].file != IMM)
2327 break;
2328
2329 if (inst->saturate) {
2330 if (inst->dst.type != inst->src[0].type)
2331 assert(!"unimplemented: saturate mixed types");
2332
2333 if (brw_saturate_immediate(inst->dst.type,
2334 &inst->src[0].fixed_hw_reg)) {
2335 inst->saturate = false;
2336 progress = true;
2337 }
2338 }
2339 break;
2340
2341 case BRW_OPCODE_MUL:
2342 if (inst->src[1].file != IMM)
2343 continue;
2344
2345 /* a * 1.0 = a */
2346 if (inst->src[1].is_one()) {
2347 inst->opcode = BRW_OPCODE_MOV;
2348 inst->src[1] = reg_undef;
2349 progress = true;
2350 break;
2351 }
2352
2353 /* a * -1.0 = -a */
2354 if (inst->src[1].is_negative_one()) {
2355 inst->opcode = BRW_OPCODE_MOV;
2356 inst->src[0].negate = !inst->src[0].negate;
2357 inst->src[1] = reg_undef;
2358 progress = true;
2359 break;
2360 }
2361
2362 /* a * 0.0 = 0.0 */
2363 if (inst->src[1].is_zero()) {
2364 inst->opcode = BRW_OPCODE_MOV;
2365 inst->src[0] = inst->src[1];
2366 inst->src[1] = reg_undef;
2367 progress = true;
2368 break;
2369 }
2370
2371 if (inst->src[0].file == IMM) {
2372 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2373 inst->opcode = BRW_OPCODE_MOV;
2374 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2375 inst->src[1] = reg_undef;
2376 progress = true;
2377 break;
2378 }
2379 break;
2380 case BRW_OPCODE_ADD:
2381 if (inst->src[1].file != IMM)
2382 continue;
2383
2384 /* a + 0.0 = a */
2385 if (inst->src[1].is_zero()) {
2386 inst->opcode = BRW_OPCODE_MOV;
2387 inst->src[1] = reg_undef;
2388 progress = true;
2389 break;
2390 }
2391
2392 if (inst->src[0].file == IMM) {
2393 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2394 inst->opcode = BRW_OPCODE_MOV;
2395 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2396 inst->src[1] = reg_undef;
2397 progress = true;
2398 break;
2399 }
2400 break;
2401 case BRW_OPCODE_OR:
2402 if (inst->src[0].equals(inst->src[1])) {
2403 inst->opcode = BRW_OPCODE_MOV;
2404 inst->src[1] = reg_undef;
2405 progress = true;
2406 break;
2407 }
2408 break;
2409 case BRW_OPCODE_LRP:
2410 if (inst->src[1].equals(inst->src[2])) {
2411 inst->opcode = BRW_OPCODE_MOV;
2412 inst->src[0] = inst->src[1];
2413 inst->src[1] = reg_undef;
2414 inst->src[2] = reg_undef;
2415 progress = true;
2416 break;
2417 }
2418 break;
2419 case BRW_OPCODE_CMP:
2420 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2421 inst->src[0].abs &&
2422 inst->src[0].negate &&
2423 inst->src[1].is_zero()) {
2424 inst->src[0].abs = false;
2425 inst->src[0].negate = false;
2426 inst->conditional_mod = BRW_CONDITIONAL_Z;
2427 progress = true;
2428 break;
2429 }
2430 break;
2431 case BRW_OPCODE_SEL:
2432 if (inst->src[0].equals(inst->src[1])) {
2433 inst->opcode = BRW_OPCODE_MOV;
2434 inst->src[1] = reg_undef;
2435 inst->predicate = BRW_PREDICATE_NONE;
2436 inst->predicate_inverse = false;
2437 progress = true;
2438 } else if (inst->saturate && inst->src[1].file == IMM) {
2439 switch (inst->conditional_mod) {
2440 case BRW_CONDITIONAL_LE:
2441 case BRW_CONDITIONAL_L:
2442 switch (inst->src[1].type) {
2443 case BRW_REGISTER_TYPE_F:
2444 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2445 inst->opcode = BRW_OPCODE_MOV;
2446 inst->src[1] = reg_undef;
2447 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2448 progress = true;
2449 }
2450 break;
2451 default:
2452 break;
2453 }
2454 break;
2455 case BRW_CONDITIONAL_GE:
2456 case BRW_CONDITIONAL_G:
2457 switch (inst->src[1].type) {
2458 case BRW_REGISTER_TYPE_F:
2459 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2460 inst->opcode = BRW_OPCODE_MOV;
2461 inst->src[1] = reg_undef;
2462 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2463 progress = true;
2464 }
2465 break;
2466 default:
2467 break;
2468             }
                 break;
2469          default:
2470 break;
2471 }
2472 }
2473 break;
2474 case BRW_OPCODE_MAD:
2475 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2476 inst->opcode = BRW_OPCODE_MOV;
2477 inst->src[1] = reg_undef;
2478 inst->src[2] = reg_undef;
2479 progress = true;
2480       } else if (inst->src[0].is_zero()) {
2481          inst->opcode = BRW_OPCODE_MUL;
2482          inst->src[0] = inst->src[2];
2483          inst->src[2] = reg_undef;
               progress = true;
2484 } else if (inst->src[1].is_one()) {
2485 inst->opcode = BRW_OPCODE_ADD;
2486 inst->src[1] = inst->src[2];
2487 inst->src[2] = reg_undef;
2488 progress = true;
2489 } else if (inst->src[2].is_one()) {
2490 inst->opcode = BRW_OPCODE_ADD;
2491 inst->src[2] = reg_undef;
2492 progress = true;
2493 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2494 inst->opcode = BRW_OPCODE_ADD;
2495 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2496 inst->src[2] = reg_undef;
2497 progress = true;
2498 }
2499 break;
2500 case SHADER_OPCODE_RCP: {
2501 fs_inst *prev = (fs_inst *)inst->prev;
2502 if (prev->opcode == SHADER_OPCODE_SQRT) {
2503 if (inst->src[0].equals(prev->dst)) {
2504 inst->opcode = SHADER_OPCODE_RSQ;
2505 inst->src[0] = prev->src[0];
2506 progress = true;
2507 }
2508 }
2509 break;
2510 }
2511 default:
2512 break;
2513 }
2514 }
2515
2516 return progress;
2517 }
2518
2519 bool
2520 fs_visitor::opt_register_renaming()
2521 {
2522 bool progress = false;
2523 int depth = 0;
2524
2525 int remap[alloc.count];
2526 memset(remap, -1, sizeof(int) * alloc.count);
2527
2528 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2529 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2530 depth++;
2531 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2532 inst->opcode == BRW_OPCODE_WHILE) {
2533 depth--;
2534 }
2535
2536 /* Rewrite instruction sources. */
2537 for (int i = 0; i < inst->sources; i++) {
2538 if (inst->src[i].file == GRF &&
2539 remap[inst->src[i].reg] != -1 &&
2540 remap[inst->src[i].reg] != inst->src[i].reg) {
2541 inst->src[i].reg = remap[inst->src[i].reg];
2542 progress = true;
2543 }
2544 }
2545
2546 const int dst = inst->dst.reg;
2547
2548 if (depth == 0 &&
2549 inst->dst.file == GRF &&
2550 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2551 !inst->is_partial_write()) {
2552 if (remap[dst] == -1) {
2553 remap[dst] = dst;
2554 } else {
2555 remap[dst] = alloc.allocate(inst->dst.width / 8);
2556 inst->dst.reg = remap[dst];
2557 progress = true;
2558 }
2559 } else if (inst->dst.file == GRF &&
2560 remap[dst] != -1 &&
2561 remap[dst] != dst) {
2562 inst->dst.reg = remap[dst];
2563 progress = true;
2564 }
2565 }
2566
2567 if (progress) {
2568 invalidate_live_intervals();
2569
2570 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2571 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2572 delta_x[i].reg = remap[delta_x[i].reg];
2573 }
2574 }
2575 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2576 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2577 delta_y[i].reg = remap[delta_y[i].reg];
2578 }
2579 }
2580 }
2581
2582 return progress;
2583 }
2584
2585 /**
2586 * Remove redundant or useless discard jumps.
2587 *
2588 * For example, we can eliminate jumps in the following sequence:
2589 *
2590 * discard-jump (redundant with the next jump)
2591 * discard-jump (useless; jumps to the next instruction)
2592 * placeholder-halt
2593 */
2594 bool
2595 fs_visitor::opt_redundant_discard_jumps()
2596 {
2597 bool progress = false;
2598
2599 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2600
2601 fs_inst *placeholder_halt = NULL;
2602 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2603 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2604 placeholder_halt = inst;
2605 break;
2606 }
2607 }
2608
2609 if (!placeholder_halt)
2610 return false;
2611
2612 /* Delete any HALTs immediately before the placeholder halt. */
2613 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2614 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2615 prev = (fs_inst *) placeholder_halt->prev) {
2616 prev->remove(last_bblock);
2617 progress = true;
2618 }
2619
2620 if (progress)
2621 invalidate_live_intervals();
2622
2623 return progress;
2624 }
2625
2626 bool
2627 fs_visitor::compute_to_mrf()
2628 {
2629 bool progress = false;
2630 int next_ip = 0;
2631
2632 /* No MRFs on Gen >= 7. */
2633 if (brw->gen >= 7)
2634 return false;
2635
2636 calculate_live_intervals();
2637
2638 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2639 int ip = next_ip;
2640 next_ip++;
2641
2642 if (inst->opcode != BRW_OPCODE_MOV ||
2643 inst->is_partial_write() ||
2644 inst->dst.file != MRF || inst->src[0].file != GRF ||
2645 inst->dst.type != inst->src[0].type ||
2646 inst->src[0].abs || inst->src[0].negate ||
2647 !inst->src[0].is_contiguous() ||
2648 inst->src[0].subreg_offset)
2649 continue;
2650
2651 /* Work out which hardware MRF registers are written by this
2652 * instruction.
2653 */
2654 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2655 int mrf_high;
2656 if (inst->dst.reg & BRW_MRF_COMPR4) {
2657 mrf_high = mrf_low + 4;
2658 } else if (inst->exec_size == 16) {
2659 mrf_high = mrf_low + 1;
2660 } else {
2661 mrf_high = mrf_low;
2662 }
2663
2664 /* Can't compute-to-MRF this GRF if someone else was going to
2665 * read it later.
2666 */
2667 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2668 continue;
2669
2670 /* Found a move of a GRF to a MRF. Let's see if we can go
2671 * rewrite the thing that made this GRF to write into the MRF.
2672 */
2673 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2674 if (scan_inst->dst.file == GRF &&
2675 scan_inst->dst.reg == inst->src[0].reg) {
2676 /* Found the last thing to write our reg we want to turn
2677 * into a compute-to-MRF.
2678 */
2679
2680 /* If this one instruction didn't populate all the
2681 * channels, bail. We might be able to rewrite everything
2682 * that writes that reg, but it would require smarter
2683 * tracking to delay the rewriting until complete success.
2684 */
2685 if (scan_inst->is_partial_write())
2686 break;
2687
2688             /* Instructions that write more than one register would require
2689              * us to coalesce out more than one MOV at a time.
2690 */
2691 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2692 break;
2693
2694 /* SEND instructions can't have MRF as a destination. */
2695 if (scan_inst->mlen)
2696 break;
2697
2698 if (brw->gen == 6) {
2699 /* gen6 math instructions must have the destination be
2700 * GRF, so no compute-to-MRF for them.
2701 */
2702 if (scan_inst->is_math()) {
2703 break;
2704 }
2705 }
2706
2707 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2708 /* Found the creator of our MRF's source value. */
2709 scan_inst->dst.file = MRF;
2710 scan_inst->dst.reg = inst->dst.reg;
2711 scan_inst->saturate |= inst->saturate;
2712 inst->remove(block);
2713 progress = true;
2714 }
2715 break;
2716 }
2717
2718 /* We don't handle control flow here. Most computation of
2719 * values that end up in MRFs are shortly before the MRF
2720 * write anyway.
2721 */
2722 if (block->start() == scan_inst)
2723 break;
2724
2725 /* You can't read from an MRF, so if someone else reads our
2726 * MRF's source GRF that we wanted to rewrite, that stops us.
2727 */
2728 bool interfered = false;
2729 for (int i = 0; i < scan_inst->sources; i++) {
2730 if (scan_inst->src[i].file == GRF &&
2731 scan_inst->src[i].reg == inst->src[0].reg &&
2732 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2733 interfered = true;
2734 }
2735 }
2736 if (interfered)
2737 break;
2738
2739 if (scan_inst->dst.file == MRF) {
2740 /* If somebody else writes our MRF here, we can't
2741 * compute-to-MRF before that.
2742 */
2743 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2744 int scan_mrf_high;
2745
2746 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2747 scan_mrf_high = scan_mrf_low + 4;
2748 } else if (scan_inst->exec_size == 16) {
2749 scan_mrf_high = scan_mrf_low + 1;
2750 } else {
2751 scan_mrf_high = scan_mrf_low;
2752 }
2753
2754 if (mrf_low == scan_mrf_low ||
2755 mrf_low == scan_mrf_high ||
2756 mrf_high == scan_mrf_low ||
2757 mrf_high == scan_mrf_high) {
2758 break;
2759 }
2760 }
2761
2762 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2763 /* Found a SEND instruction, which means that there are
2764 * live values in MRFs from base_mrf to base_mrf +
2765 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2766 * above it.
2767 */
2768 if (mrf_low >= scan_inst->base_mrf &&
2769 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2770 break;
2771 }
2772 if (mrf_high >= scan_inst->base_mrf &&
2773 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2774 break;
2775 }
2776 }
2777 }
2778 }
2779
2780 if (progress)
2781 invalidate_live_intervals();
2782
2783 return progress;
2784 }
2785
2786 /**
2787  * Emit a replicated-data clear shader: a single MOV of the clear color
2788  * into the message payload, followed by one FS_OPCODE_REP_FB_WRITE per
2789  * color region.
2789 */
2790 void
2791 fs_visitor::emit_repclear_shader()
2792 {
2793 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2794 int base_mrf = 1;
2795 int color_mrf = base_mrf + 2;
2796
2797 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2798 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2799 mov->force_writemask_all = true;
2800
2801 fs_inst *write;
2802 if (key->nr_color_regions == 1) {
2803 write = emit(FS_OPCODE_REP_FB_WRITE);
2804 write->saturate = key->clamp_fragment_color;
2805 write->base_mrf = color_mrf;
2806 write->target = 0;
2807 write->header_present = false;
2808 write->mlen = 1;
2809 } else {
2810 assume(key->nr_color_regions > 0);
2811 for (int i = 0; i < key->nr_color_regions; ++i) {
2812 write = emit(FS_OPCODE_REP_FB_WRITE);
2813 write->saturate = key->clamp_fragment_color;
2814 write->base_mrf = base_mrf;
2815 write->target = i;
2816 write->header_present = true;
2817 write->mlen = 3;
2818 }
2819 }
2820 write->eot = true;
2821
2822 calculate_cfg();
2823
2824 assign_constant_locations();
2825 assign_curb_setup();
2826
2827 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2828 assert(mov->src[0].file == HW_REG);
2829 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2830 }
2831
2832 /**
2833 * Walks through basic blocks, looking for repeated MRF writes and
2834 * removing the later ones.
2835 */
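/* Illustrative example (hypothetical IR):
 *
 *    mov m2, vgrf5
 *    ...                <- no control flow, no writes to vgrf5 or m2, no
 *                          SEND touching m2
 *    mov m2, vgrf5      <- removed; m2 already holds this value
 */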
2836 bool
2837 fs_visitor::remove_duplicate_mrf_writes()
2838 {
2839 fs_inst *last_mrf_move[16];
2840 bool progress = false;
2841
2842 /* Need to update the MRF tracking for compressed instructions. */
2843 if (dispatch_width == 16)
2844 return false;
2845
2846 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2847
2848 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2849 if (inst->is_control_flow()) {
2850 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2851 }
2852
2853 if (inst->opcode == BRW_OPCODE_MOV &&
2854 inst->dst.file == MRF) {
2855 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2856 if (prev_inst && inst->equals(prev_inst)) {
2857 inst->remove(block);
2858 progress = true;
2859 continue;
2860 }
2861 }
2862
2863 /* Clear out the last-write records for MRFs that were overwritten. */
2864 if (inst->dst.file == MRF) {
2865 last_mrf_move[inst->dst.reg] = NULL;
2866 }
2867
2868 if (inst->mlen > 0 && inst->base_mrf != -1) {
2869 /* Found a SEND instruction, which will include two or fewer
2870 * implied MRF writes. We could do better here.
2871 */
2872 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2873 last_mrf_move[inst->base_mrf + i] = NULL;
2874 }
2875 }
2876
2877 /* Clear out any MRF move records whose sources got overwritten. */
2878 if (inst->dst.file == GRF) {
2879 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2880 if (last_mrf_move[i] &&
2881 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2882 last_mrf_move[i] = NULL;
2883 }
2884 }
2885 }
2886
2887 if (inst->opcode == BRW_OPCODE_MOV &&
2888 inst->dst.file == MRF &&
2889 inst->src[0].file == GRF &&
2890 !inst->is_partial_write()) {
2891 last_mrf_move[inst->dst.reg] = inst;
2892 }
2893 }
2894
2895 if (progress)
2896 invalidate_live_intervals();
2897
2898 return progress;
2899 }
2900
2901 static void
2902 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2903 {
2904 /* Clear the flag for registers that actually got read (as expected). */
2905 for (int i = 0; i < inst->sources; i++) {
2906 int grf;
2907 if (inst->src[i].file == GRF) {
2908 grf = inst->src[i].reg;
2909 } else if (inst->src[i].file == HW_REG &&
2910 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2911 grf = inst->src[i].fixed_hw_reg.nr;
2912 } else {
2913 continue;
2914 }
2915
2916 if (grf >= first_grf &&
2917 grf < first_grf + grf_len) {
2918 deps[grf - first_grf] = false;
2919 if (inst->exec_size == 16)
2920 deps[grf - first_grf + 1] = false;
2921 }
2922 }
2923 }
2924
2925 /**
2926 * Implements this workaround for the original 965:
2927 *
2928 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2929 * check for post destination dependencies on this instruction, software
2930 * must ensure that there is no destination hazard for the case of ‘write
2931 * followed by a posted write’ shown in the following example.
2932 *
2933 * 1. mov r3 0
2934 * 2. send r3.xy <rest of send instruction>
2935 * 3. mov r2 r3
2936 *
2937 * Due to no post-destination dependency check on the ‘send’, the above
2938 * code sequence could have two instructions (1 and 2) in flight at the
2939 * same time that both consider ‘r3’ as the target of their final writes.
2940 */
2941 void
2942 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2943 fs_inst *inst)
2944 {
2945 int write_len = inst->regs_written;
2946 int first_write_grf = inst->dst.reg;
2947 bool needs_dep[BRW_MAX_MRF];
2948 assert(write_len < (int)sizeof(needs_dep) - 1);
2949
2950 memset(needs_dep, false, sizeof(needs_dep));
2951 memset(needs_dep, true, write_len);
2952
2953 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2954
2955 /* Walk backwards looking for writes to registers we're writing which
2956 * aren't read since being written. If we hit the start of the program,
2957 * we assume that there are no outstanding dependencies on entry to the
2958 * program.
2959 */
2960 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2961 /* If we hit control flow, assume that there *are* outstanding
2962 * dependencies, and force their cleanup before our instruction.
2963 */
2964 if (block->start() == scan_inst) {
2965 for (int i = 0; i < write_len; i++) {
2966 if (needs_dep[i]) {
2967 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2968 }
2969 }
2970 return;
2971 }
2972
2973 /* We insert our reads as late as possible on the assumption that any
2974 * instruction but a MOV that might have left us an outstanding
2975 * dependency has more latency than a MOV.
2976 */
2977 if (scan_inst->dst.file == GRF) {
2978 for (int i = 0; i < scan_inst->regs_written; i++) {
2979 int reg = scan_inst->dst.reg + i;
2980
2981 if (reg >= first_write_grf &&
2982 reg < first_write_grf + write_len &&
2983 needs_dep[reg - first_write_grf]) {
2984 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2985 needs_dep[reg - first_write_grf] = false;
2986 if (scan_inst->exec_size == 16)
2987 needs_dep[reg - first_write_grf + 1] = false;
2988 }
2989 }
2990 }
2991
2992 /* Clear the flag for registers that actually got read (as expected). */
2993 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2994
2995 /* Continue the loop only if we haven't resolved all the dependencies */
2996 int i;
2997 for (i = 0; i < write_len; i++) {
2998 if (needs_dep[i])
2999 break;
3000 }
3001 if (i == write_len)
3002 return;
3003 }
3004 }
3005
3006 /**
3007 * Implements this workaround for the original 965:
3008 *
3009 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3010 * used as a destination register until after it has been sourced by an
3011 * instruction with a different destination register.
3012 */
3013 void
3014 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3015 {
3016 int write_len = inst->regs_written;
3017 int first_write_grf = inst->dst.reg;
3018 bool needs_dep[BRW_MAX_MRF];
3019 assert(write_len < (int)sizeof(needs_dep) - 1);
3020
3021 memset(needs_dep, false, sizeof(needs_dep));
3022 memset(needs_dep, true, write_len);
3023 /* Walk forwards looking for writes to registers we're writing which aren't
3024 * read before being written.
3025 */
3026 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3027 /* If we hit control flow, force resolve all remaining dependencies. */
3028 if (block->end() == scan_inst) {
3029 for (int i = 0; i < write_len; i++) {
3030 if (needs_dep[i])
3031 scan_inst->insert_before(block,
3032 DEP_RESOLVE_MOV(first_write_grf + i));
3033 }
3034 return;
3035 }
3036
3037 /* Clear the flag for registers that actually got read (as expected). */
3038 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3039
3040 /* We insert our reads as late as possible since they're reading the
3041 * result of a SEND, which has massive latency.
3042 */
3043 if (scan_inst->dst.file == GRF &&
3044 scan_inst->dst.reg >= first_write_grf &&
3045 scan_inst->dst.reg < first_write_grf + write_len &&
3046 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3047 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3048 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3049 }
3050
3051 /* Continue the loop only if we haven't resolved all the dependencies */
3052 int i;
3053 for (i = 0; i < write_len; i++) {
3054 if (needs_dep[i])
3055 break;
3056 }
3057 if (i == write_len)
3058 return;
3059 }
3060 }
3061
3062 void
3063 fs_visitor::insert_gen4_send_dependency_workarounds()
3064 {
3065 if (brw->gen != 4 || brw->is_g4x)
3066 return;
3067
3068 bool progress = false;
3069
3070 /* Note that we're done with register allocation, so GRF fs_regs always
3071 * have a .reg_offset of 0.
3072 */
3073
3074 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3075 if (inst->mlen != 0 && inst->dst.file == GRF) {
3076 insert_gen4_pre_send_dependency_workarounds(block, inst);
3077 insert_gen4_post_send_dependency_workarounds(block, inst);
3078 progress = true;
3079 }
3080 }
3081
3082 if (progress)
3083 invalidate_live_intervals();
3084 }
3085
3086 /**
3087 * Turns the generic expression-style uniform pull constant load instruction
3088 * into a hardware-specific series of instructions for loading a pull
3089 * constant.
3090 *
3091 * The expression style allows the CSE pass before this to optimize out
3092 * repeated loads from the same offset, and gives the pre-register-allocation
3093 * scheduling full flexibility, while the conversion to native instructions
3094 * allows the post-register-allocation scheduler the best information
3095 * possible.
3096 *
3097 * Note that execution masking for setting up pull constant loads is special:
3098 * the channels that need to be written are unrelated to the current execution
3099 * mask, since a later instruction will use one of the result channels as a
3100 * source operand for all 8 or 16 of its channels.
3101 */
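/* Illustrative sketch (hypothetical values) of the lowering below: on Gen7+,
 * a UNIFORM_PULL_CONSTANT_LOAD with a byte offset of 48 becomes an
 * FS_OPCODE_SET_SIMD4X2_OFFSET writing the dword offset 12 into a freshly
 * allocated payload GRF, followed by UNIFORM_PULL_CONSTANT_LOAD_GEN7 sourcing
 * that payload.  On Gen4-6 the instruction simply gets base_mrf = 14 and
 * mlen = 1.
 */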
3102 void
3103 fs_visitor::lower_uniform_pull_constant_loads()
3104 {
3105 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3106 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3107 continue;
3108
3109 if (brw->gen >= 7) {
3110 /* The offset arg before was a vec4-aligned byte offset. We need to
3111 * turn it into a dword offset.
3112 */
3113 fs_reg const_offset_reg = inst->src[1];
3114 assert(const_offset_reg.file == IMM &&
3115 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3116 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3117 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3118
3119 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3120 * Reserve space for the register.
3121 */
3122 if (brw->gen >= 9) {
3123 payload.reg_offset++;
3124 alloc.sizes[payload.reg] = 2;
3125 }
3126
3127 /* This is actually going to be a MOV, but since only the first dword
3128 * is accessed, we have a special opcode to do just that one. Note
3129 * that this needs to be an operation that will be considered a def
3130 * by live variable analysis, or register allocation will explode.
3131 */
3132 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3133 8, payload, const_offset_reg);
3134 setup->force_writemask_all = true;
3135
3136 setup->ir = inst->ir;
3137 setup->annotation = inst->annotation;
3138 inst->insert_before(block, setup);
3139
3140 /* Similarly, this will only populate the first 4 channels of the
3141 * result register (since we only use smear values from 0-3), but we
3142 * don't tell the optimizer.
3143 */
3144 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3145 inst->src[1] = payload;
3146
3147 invalidate_live_intervals();
3148 } else {
3149 /* Before register allocation, we didn't tell the scheduler about the
3150 * MRF we use. We know it's safe to use this MRF because nothing
3151 * else does except for register spill/unspill, which generates and
3152 * uses its MRF within a single IR instruction.
3153 */
3154 inst->base_mrf = 14;
3155 inst->mlen = 1;
3156 }
3157 }
3158 }
3159
3160 bool
3161 fs_visitor::lower_load_payload()
3162 {
3163 bool progress = false;
3164
3165 int vgrf_to_reg[alloc.count];
3166 int reg_count = 0;
3167 for (unsigned i = 0; i < alloc.count; ++i) {
3168 vgrf_to_reg[i] = reg_count;
3169 reg_count += alloc.sizes[i];
3170 }
3171
3172 struct {
3173 bool written:1; /* Whether this register has ever been written */
3174 bool force_writemask_all:1;
3175 bool force_sechalf:1;
3176 } metadata[reg_count];
3177 memset(metadata, 0, sizeof(metadata));
3178
3179 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3180 if (inst->dst.file == GRF) {
3181 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3182 bool force_sechalf = inst->force_sechalf &&
3183 !inst->force_writemask_all;
3184 bool toggle_sechalf = inst->dst.width == 16 &&
3185 type_sz(inst->dst.type) == 4 &&
3186 !inst->force_writemask_all;
3187 for (int i = 0; i < inst->regs_written; ++i) {
3188 metadata[dst_reg + i].written = true;
3189 metadata[dst_reg + i].force_sechalf = force_sechalf;
3190 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3191 force_sechalf = (toggle_sechalf != force_sechalf);
3192 }
3193 }
3194
3195 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3196 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3197 fs_reg dst = inst->dst;
3198
3199 for (int i = 0; i < inst->sources; i++) {
3200 dst.width = inst->src[i].effective_width;
3201 dst.type = inst->src[i].type;
3202
3203 if (inst->src[i].file == BAD_FILE) {
3204                /* Emit nothing, but still advance the destination offset below. */
3205 } else if (dst.file == MRF &&
3206 dst.width == 8 &&
3207 brw->has_compr4 &&
3208 i + 4 < inst->sources &&
3209 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3210 fs_reg compr4_dst = dst;
3211 compr4_dst.reg += BRW_MRF_COMPR4;
3212 compr4_dst.width = 16;
3213 fs_reg compr4_src = inst->src[i];
3214 compr4_src.width = 16;
3215 fs_inst *mov = MOV(compr4_dst, compr4_src);
3216 mov->force_writemask_all = true;
3217 inst->insert_before(block, mov);
3218 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3219 inst->src[i + 4].file = BAD_FILE;
3220 } else {
3221 fs_inst *mov = MOV(dst, inst->src[i]);
3222 if (inst->src[i].file == GRF) {
3223 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3224 inst->src[i].reg_offset;
3225 mov->force_sechalf = metadata[src_reg].force_sechalf;
3226 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3227 } else {
3228 /* We don't have any useful metadata for immediates or
3229 * uniforms. Assume that any of the channels of the
3230 * destination may be used.
3231 */
3232 assert(inst->src[i].file == IMM ||
3233 inst->src[i].file == UNIFORM);
3234 mov->force_writemask_all = true;
3235 }
3236
3237 if (dst.file == GRF) {
3238 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3239 const bool force_writemask = mov->force_writemask_all;
3240 metadata[dst_reg].force_writemask_all = force_writemask;
3241 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3242 if (dst.width * type_sz(dst.type) > 32) {
3243 assert(!mov->force_sechalf);
3244 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3245 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3246 }
3247 }
3248
3249 inst->insert_before(block, mov);
3250 }
3251
3252 dst = offset(dst, 1);
3253 }
3254
3255 inst->remove(block);
3256 progress = true;
3257 }
3258 }
3259
3260 if (progress)
3261 invalidate_live_intervals();
3262
3263 return progress;
3264 }
3265
3266 void
3267 fs_visitor::dump_instructions()
3268 {
3269 dump_instructions(NULL);
3270 }
3271
3272 void
3273 fs_visitor::dump_instructions(const char *name)
3274 {
3275 FILE *file = stderr;
3276 if (name && geteuid() != 0) {
3277 file = fopen(name, "w");
3278 if (!file)
3279 file = stderr;
3280 }
3281
3282 if (cfg) {
3283 calculate_register_pressure();
3284 int ip = 0, max_pressure = 0;
3285 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3286 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3287 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3288 dump_instruction(inst, file);
3289 ip++;
3290 }
3291 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3292 } else {
3293 int ip = 0;
3294 foreach_in_list(backend_instruction, inst, &instructions) {
3295 fprintf(file, "%4d: ", ip++);
3296 dump_instruction(inst, file);
3297 }
3298 }
3299
3300 if (file != stderr) {
3301 fclose(file);
3302 }
3303 }
3304
3305 void
3306 fs_visitor::dump_instruction(backend_instruction *be_inst)
3307 {
3308 dump_instruction(be_inst, stderr);
3309 }
3310
3311 void
3312 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3313 {
3314 fs_inst *inst = (fs_inst *)be_inst;
3315
3316 if (inst->predicate) {
3317 fprintf(file, "(%cf0.%d) ",
3318 inst->predicate_inverse ? '-' : '+',
3319 inst->flag_subreg);
3320 }
3321
3322 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3323 if (inst->saturate)
3324 fprintf(file, ".sat");
3325 if (inst->conditional_mod) {
3326 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3327 if (!inst->predicate &&
3328 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3329 inst->opcode != BRW_OPCODE_IF &&
3330 inst->opcode != BRW_OPCODE_WHILE))) {
3331 fprintf(file, ".f0.%d", inst->flag_subreg);
3332 }
3333 }
3334 fprintf(file, "(%d) ", inst->exec_size);
3335
3336
3337 switch (inst->dst.file) {
3338 case GRF:
3339 fprintf(file, "vgrf%d", inst->dst.reg);
3340 if (inst->dst.width != dispatch_width)
3341 fprintf(file, "@%d", inst->dst.width);
3342 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3343 inst->dst.subreg_offset)
3344 fprintf(file, "+%d.%d",
3345 inst->dst.reg_offset, inst->dst.subreg_offset);
3346 break;
3347 case MRF:
3348 fprintf(file, "m%d", inst->dst.reg);
3349 break;
3350 case BAD_FILE:
3351 fprintf(file, "(null)");
3352 break;
3353 case UNIFORM:
3354 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3355 break;
3356 case ATTR:
3357 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3358 break;
3359 case HW_REG:
3360 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3361 switch (inst->dst.fixed_hw_reg.nr) {
3362 case BRW_ARF_NULL:
3363 fprintf(file, "null");
3364 break;
3365 case BRW_ARF_ADDRESS:
3366 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3367 break;
3368 case BRW_ARF_ACCUMULATOR:
3369 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3370 break;
3371 case BRW_ARF_FLAG:
3372 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3373 inst->dst.fixed_hw_reg.subnr);
3374 break;
3375 default:
3376 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3377 inst->dst.fixed_hw_reg.subnr);
3378 break;
3379 }
3380 } else {
3381 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3382 }
3383 if (inst->dst.fixed_hw_reg.subnr)
3384 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3385 break;
3386 default:
3387 fprintf(file, "???");
3388 break;
3389 }
3390 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3391
3392 for (int i = 0; i < inst->sources; i++) {
3393 if (inst->src[i].negate)
3394 fprintf(file, "-");
3395 if (inst->src[i].abs)
3396 fprintf(file, "|");
3397 switch (inst->src[i].file) {
3398 case GRF:
3399 fprintf(file, "vgrf%d", inst->src[i].reg);
3400 if (inst->src[i].width != dispatch_width)
3401 fprintf(file, "@%d", inst->src[i].width);
3402 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3403 inst->src[i].subreg_offset)
3404 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3405 inst->src[i].subreg_offset);
3406 break;
3407 case MRF:
3408 fprintf(file, "***m%d***", inst->src[i].reg);
3409 break;
3410 case ATTR:
3411 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3412 break;
3413 case UNIFORM:
3414 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3415 if (inst->src[i].reladdr) {
3416 fprintf(file, "+reladdr");
3417 } else if (inst->src[i].subreg_offset) {
3418 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3419 inst->src[i].subreg_offset);
3420 }
3421 break;
3422 case BAD_FILE:
3423 fprintf(file, "(null)");
3424 break;
3425 case IMM:
3426 switch (inst->src[i].type) {
3427 case BRW_REGISTER_TYPE_F:
3428 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3429 break;
3430 case BRW_REGISTER_TYPE_W:
3431 case BRW_REGISTER_TYPE_D:
3432 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3433 break;
3434 case BRW_REGISTER_TYPE_UW:
3435 case BRW_REGISTER_TYPE_UD:
3436 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3437 break;
3438 case BRW_REGISTER_TYPE_VF:
3439 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3440 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3441 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3442 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3443 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3444 break;
3445 default:
3446 fprintf(file, "???");
3447 break;
3448 }
3449 break;
3450 case HW_REG:
3451 if (inst->src[i].fixed_hw_reg.negate)
3452 fprintf(file, "-");
3453 if (inst->src[i].fixed_hw_reg.abs)
3454 fprintf(file, "|");
3455 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3456 switch (inst->src[i].fixed_hw_reg.nr) {
3457 case BRW_ARF_NULL:
3458 fprintf(file, "null");
3459 break;
3460 case BRW_ARF_ADDRESS:
3461 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3462 break;
3463 case BRW_ARF_ACCUMULATOR:
3464 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3465 break;
3466 case BRW_ARF_FLAG:
3467 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3468 inst->src[i].fixed_hw_reg.subnr);
3469 break;
3470 default:
3471 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3472 inst->src[i].fixed_hw_reg.subnr);
3473 break;
3474 }
3475 } else {
3476 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3477 }
3478 if (inst->src[i].fixed_hw_reg.subnr)
3479 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3480 if (inst->src[i].fixed_hw_reg.abs)
3481 fprintf(file, "|");
3482 break;
3483 default:
3484 fprintf(file, "???");
3485 break;
3486 }
3487 if (inst->src[i].abs)
3488 fprintf(file, "|");
3489
3490 if (inst->src[i].file != IMM) {
3491 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3492 }
3493
3494 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3495 fprintf(file, ", ");
3496 }
3497
3498 fprintf(file, " ");
3499
3500 if (dispatch_width == 16 && inst->exec_size == 8) {
3501 if (inst->force_sechalf)
3502 fprintf(file, "2ndhalf ");
3503 else
3504 fprintf(file, "1sthalf ");
3505 }
3506
3507 fprintf(file, "\n");
3508 }
3509
3510 /**
3511 * Possibly returns an instruction that set up @param reg.
3512 *
3513 * Sometimes we want to take the result of some expression/variable
3514 * dereference tree and rewrite the instruction generating the result
3515 * of the tree. When processing the tree, we know that the
3516 * instructions generated are all writing temporaries that are dead
3517 * outside of this tree. So, if we have some instructions that write
3518 * a temporary, we're free to point that temp write somewhere else.
3519 *
3520  * Note that this doesn't guarantee that the returned instruction wrote
3521  * only reg -- it might be the size=4 destination of a texture instruction.
3522 */
3523 fs_inst *
3524 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3525 fs_inst *end,
3526 const fs_reg &reg)
3527 {
3528 if (end == start ||
3529 end->is_partial_write() ||
3530 reg.reladdr ||
3531 !reg.equals(end->dst)) {
3532 return NULL;
3533 } else {
3534 return end;
3535 }
3536 }
3537
3538 void
3539 fs_visitor::setup_payload_gen6()
3540 {
3541 bool uses_depth =
3542 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3543 unsigned barycentric_interp_modes =
3544 (stage == MESA_SHADER_FRAGMENT) ?
3545 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3546
3547 assert(brw->gen >= 6);
3548
3549 /* R0-1: masks, pixel X/Y coordinates. */
3550 payload.num_regs = 2;
3551    /* R2: only for 32-pixel dispatch. */
3552
3553 /* R3-26: barycentric interpolation coordinates. These appear in the
3554 * same order that they appear in the brw_wm_barycentric_interp_mode
3555 * enum. Each set of coordinates occupies 2 registers if dispatch width
3556 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3557 * appear if they were enabled using the "Barycentric Interpolation
3558 * Mode" bits in WM_STATE.
3559 */
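   /* Illustrative example (hypothetical): a SIMD16 shader with two
    * barycentric modes enabled reserves 2 + 2 registers per mode here, so
    * payload.num_regs grows from 2 to 10 before the depth/W/sample-position
    * registers below are counted.
    */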
3560 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3561 if (barycentric_interp_modes & (1 << i)) {
3562 payload.barycentric_coord_reg[i] = payload.num_regs;
3563 payload.num_regs += 2;
3564 if (dispatch_width == 16) {
3565 payload.num_regs += 2;
3566 }
3567 }
3568 }
3569
3570 /* R27: interpolated depth if uses source depth */
3571 if (uses_depth) {
3572 payload.source_depth_reg = payload.num_regs;
3573 payload.num_regs++;
3574 if (dispatch_width == 16) {
3575 /* R28: interpolated depth if not SIMD8. */
3576 payload.num_regs++;
3577 }
3578 }
3579 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3580 if (uses_depth) {
3581 payload.source_w_reg = payload.num_regs;
3582 payload.num_regs++;
3583 if (dispatch_width == 16) {
3584 /* R30: interpolated W if not SIMD8. */
3585 payload.num_regs++;
3586 }
3587 }
3588
3589 if (stage == MESA_SHADER_FRAGMENT) {
3590 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3591 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3592 prog_data->uses_pos_offset = key->compute_pos_offset;
3593 /* R31: MSAA position offsets. */
3594 if (prog_data->uses_pos_offset) {
3595 payload.sample_pos_reg = payload.num_regs;
3596 payload.num_regs++;
3597 }
3598 }
3599
3600 /* R32: MSAA input coverage mask */
3601 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3602 assert(brw->gen >= 7);
3603 payload.sample_mask_in_reg = payload.num_regs;
3604 payload.num_regs++;
3605 if (dispatch_width == 16) {
3606 /* R33: input coverage mask if not SIMD8. */
3607 payload.num_regs++;
3608 }
3609 }
3610
3611 /* R34-: bary for 32-pixel. */
3612 /* R58-59: interp W for 32-pixel. */
3613
3614 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3615 source_depth_to_render_target = true;
3616 }
3617 }
3618
3619 void
3620 fs_visitor::setup_vs_payload()
3621 {
3622 /* R0: thread header, R1: urb handles */
3623 payload.num_regs = 2;
3624 }
3625
3626 void
3627 fs_visitor::assign_binding_table_offsets()
3628 {
3629 assert(stage == MESA_SHADER_FRAGMENT);
3630 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3631 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3632 uint32_t next_binding_table_offset = 0;
3633
3634 /* If there are no color regions, we still perform an FB write to a null
3635 * renderbuffer, which we place at surface index 0.
3636 */
3637 prog_data->binding_table.render_target_start = next_binding_table_offset;
3638 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3639
3640 assign_common_binding_table_offsets(next_binding_table_offset);
3641 }
3642
3643 void
3644 fs_visitor::calculate_register_pressure()
3645 {
3646 invalidate_live_intervals();
3647 calculate_live_intervals();
3648
3649 unsigned num_instructions = 0;
3650 foreach_block(block, cfg)
3651 num_instructions += block->instructions.length();
3652
3653 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3654
3655 for (unsigned reg = 0; reg < alloc.count; reg++) {
3656 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3657 regs_live_at_ip[ip] += alloc.sizes[reg];
3658 }
3659 }
3660
3661 void
3662 fs_visitor::optimize()
3663 {
3664 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3665
3666 split_virtual_grfs();
3667
3668 move_uniform_array_access_to_pull_constants();
3669 assign_constant_locations();
3670 demote_pull_constants();
3671
3672 #define OPT(pass, args...) ({ \
3673 pass_num++; \
3674 bool this_progress = pass(args); \
3675 \
3676 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3677 char filename[64]; \
3678 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3679 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3680 \
3681 backend_visitor::dump_instructions(filename); \
3682 } \
3683 \
3684 progress = progress || this_progress; \
3685 this_progress; \
3686 })
3687
3688 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3689 char filename[64];
3690 snprintf(filename, 64, "%s%d-%04d-00-start",
3691 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3692
3693 backend_visitor::dump_instructions(filename);
3694 }
3695
3696 bool progress;
3697 int iteration = 0;
3698 int pass_num = 0;
3699 do {
3700 progress = false;
3701 pass_num = 0;
3702 iteration++;
3703
3704 OPT(remove_duplicate_mrf_writes);
3705
3706 OPT(opt_algebraic);
3707 OPT(opt_cse);
3708 OPT(opt_copy_propagate);
3709 OPT(opt_peephole_predicated_break);
3710 OPT(opt_cmod_propagation);
3711 OPT(dead_code_eliminate);
3712 OPT(opt_peephole_sel);
3713 OPT(dead_control_flow_eliminate, this);
3714 OPT(opt_register_renaming);
3715 OPT(opt_redundant_discard_jumps);
3716 OPT(opt_saturate_propagation);
3717 OPT(register_coalesce);
3718 OPT(compute_to_mrf);
3719
3720 OPT(compact_virtual_grfs);
3721 } while (progress);
3722
3723 pass_num = 0;
3724
3725 if (OPT(lower_load_payload)) {
3726 split_virtual_grfs();
3727 OPT(register_coalesce);
3728 OPT(compute_to_mrf);
3729 OPT(dead_code_eliminate);
3730 }
3731
3732 OPT(opt_combine_constants);
3733
3734 lower_uniform_pull_constant_loads();
3735 }
3736
3737 /**
3738 * Three-source instructions must have a GRF/MRF destination register.
3739 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3740 */
3741 void
3742 fs_visitor::fixup_3src_null_dest()
3743 {
3744 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3745 if (inst->is_3src() && inst->dst.is_null()) {
3746 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3747 inst->dst.type);
3748 }
3749 }
3750 }
3751
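/**
 * Schedule and register-allocate the program.
 *
 * Each pre-RA scheduling heuristic is tried in turn until one allocates
 * without spilling; if none does, a SIMD16 compile is failed outright, while
 * a SIMD8 compile retries allocation with spilling enabled and then runs a
 * post-RA scheduling pass on the result.
 */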
3752 void
3753 fs_visitor::allocate_registers()
3754 {
3755 bool allocated_without_spills;
3756
3757 static const enum instruction_scheduler_mode pre_modes[] = {
3758 SCHEDULE_PRE,
3759 SCHEDULE_PRE_NON_LIFO,
3760 SCHEDULE_PRE_LIFO,
3761 };
3762
3763 /* Try each scheduling heuristic to see if it can successfully register
3764 * allocate without spilling.  They should be ordered from best expected
3765 * performance to highest likelihood of allocating successfully.
3766 */
3767 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3768 schedule_instructions(pre_modes[i]);
3769
3770 if (0) {
3771 assign_regs_trivial();
3772 allocated_without_spills = true;
3773 } else {
3774 allocated_without_spills = assign_regs(false);
3775 }
3776 if (allocated_without_spills)
3777 break;
3778 }
3779
3780 if (!allocated_without_spills) {
3781 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3782 "Vertex" : "Fragment";
3783
3784 /* We assume that any spilling is worse than just dropping back to
3785 * SIMD8.  In practice there is probably some intermediate point where
3786 * SIMD16 with a couple of spills is still a win.
3787 */
3788 if (dispatch_width == 16) {
3789 fail("Failure to register allocate.  Reduce the number of "
3790 "live scalar values to avoid this.");
3791 } else {
3792 perf_debug("%s shader triggered register spilling. "
3793 "Try reducing the number of live scalar values to "
3794 "improve performance.\n", stage_name);
3795 }
3796
3797 /* Since we're out of heuristics, just go spill registers until we
3798 * get an allocation.
3799 */
3800 while (!assign_regs(true)) {
3801 if (failed)
3802 break;
3803 }
3804 }
3805
3806 /* This must come after all optimization and register allocation, since
3807 * it inserts dead code that happens to have side effects, and it does
3808 * so based on the actual physical registers in use.
3809 */
3810 insert_gen4_send_dependency_workarounds();
3811
3812 if (failed)
3813 return;
3814
3815 if (!allocated_without_spills)
3816 schedule_instructions(SCHEDULE_POST);
3817
3818 if (last_scratch > 0)
3819 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3820 }
3821
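/**
 * Generate, optimize and register-allocate a vertex shader.
 *
 * Returns false if compilation failed at any point; fail_msg describes why.
 */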
3822 bool
3823 fs_visitor::run_vs()
3824 {
3825 assert(stage == MESA_SHADER_VERTEX);
3826
3827 assign_common_binding_table_offsets(0);
3828 setup_vs_payload();
3829
3830 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3831 emit_shader_time_begin();
3832
3833 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3834 base_ir = ir;
3835 this->result = reg_undef;
3836 ir->accept(this);
3837 }
3838 base_ir = NULL;
3839 if (failed)
3840 return false;
3841
3842 emit_urb_writes();
3843
3844 calculate_cfg();
3845
3846 optimize();
3847
3848 assign_curb_setup();
3849 assign_vs_urb_setup();
3850
3851 fixup_3src_null_dest();
3852 allocate_registers();
3853
3854 return !failed;
3855 }
3856
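/**
 * Generate, optimize and register-allocate a fragment shader at this
 * visitor's dispatch_width (8 or 16).
 *
 * The replicated-data clear shader takes a separate fast path; otherwise the
 * GLSL IR (or NIR, or Mesa IR fragment-program code) is visited, FB writes
 * are emitted, and the result is optimized and register-allocated.
 * Per-width register usage is recorded in reg_blocks / reg_blocks_16.
 */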
3857 bool
3858 fs_visitor::run_fs()
3859 {
3860 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3861 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3862
3863 assert(stage == MESA_SHADER_FRAGMENT);
3864
3865 sanity_param_count = prog->Parameters->NumParameters;
3866
3867 assign_binding_table_offsets();
3868
3869 if (brw->gen >= 6)
3870 setup_payload_gen6();
3871 else
3872 setup_payload_gen4();
3873
3874 if (0) {
3875 emit_dummy_fs();
3876 } else if (brw->use_rep_send && dispatch_width == 16) {
3877 emit_repclear_shader();
3878 } else {
3879 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3880 emit_shader_time_begin();
3881
3882 calculate_urb_setup();
3883 if (prog->InputsRead > 0) {
3884 if (brw->gen < 6)
3885 emit_interpolation_setup_gen4();
3886 else
3887 emit_interpolation_setup_gen6();
3888 }
3889
3890 /* We handle discards by keeping track of the still-live pixels in f0.1.
3891 * Initialize it with the dispatched pixels.
3892 */
3893 if (wm_prog_data->uses_kill) {
3894 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3895 discard_init->flag_subreg = 1;
3896 }
3897
3898 /* Generate FS IR for main().  (The visitor only descends into
3899 * functions named "main".)
3900 */
3901 if (shader) {
3902 if (getenv("INTEL_USE_NIR") != NULL) {
3903 emit_nir_code();
3904 } else {
3905 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3906 base_ir = ir;
3907 this->result = reg_undef;
3908 ir->accept(this);
3909 }
3910 }
3911 } else {
3912 emit_fragment_program_code();
3913 }
3914 base_ir = NULL;
3915 if (failed)
3916 return false;
3917
3918 emit(FS_OPCODE_PLACEHOLDER_HALT);
3919
3920 if (wm_key->alpha_test_func)
3921 emit_alpha_test();
3922
3923 emit_fb_writes();
3924
3925 calculate_cfg();
3926
3927 optimize();
3928
3929 assign_curb_setup();
3930 assign_urb_setup();
3931
3932 fixup_3src_null_dest();
3933 allocate_registers();
3934
3935 if (failed)
3936 return false;
3937 }
3938
3939 if (dispatch_width == 8)
3940 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3941 else
3942 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3943
3944 /* If any state parameters were appended, then ParameterValues could have
3945 * been realloced, in which case the driver uniform storage set up by
3946 * _mesa_associate_uniform_storage() would point to freed memory. Make
3947 * sure that didn't happen.
3948 */
3949 assert(sanity_param_count == prog->Parameters->NumParameters);
3950
3951 return !failed;
3952 }
3953
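/**
 * Compile a fragment shader to native code.
 *
 * A SIMD8 compile is always attempted first (failure here fails the whole
 * compile); on gen5+ a SIMD16 compile is also tried when the shader and the
 * DEBUG_NO16 / rep-send settings allow it, and when it succeeds its code is
 * emitted after the SIMD8 program with the offset stored in
 * prog_data->prog_offset_16.
 */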
3954 const unsigned *
3955 brw_wm_fs_emit(struct brw_context *brw,
3956 void *mem_ctx,
3957 const struct brw_wm_prog_key *key,
3958 struct brw_wm_prog_data *prog_data,
3959 struct gl_fragment_program *fp,
3960 struct gl_shader_program *prog,
3961 unsigned *final_assembly_size)
3962 {
3963 bool start_busy = false;
3964 double start_time = 0;
3965
3966 if (unlikely(brw->perf_debug)) {
3967 start_busy = (brw->batch.last_bo &&
3968 drm_intel_bo_busy(brw->batch.last_bo));
3969 start_time = get_time();
3970 }
3971
3972 struct brw_shader *shader = NULL;
3973 if (prog)
3974 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3975
3976 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3977 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3978
3979 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3980 */
3981 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3982 if (!v.run_fs()) {
3983 if (prog) {
3984 prog->LinkStatus = false;
3985 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3986 }
3987
3988 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3989 v.fail_msg);
3990
3991 return NULL;
3992 }
3993
3994 cfg_t *simd16_cfg = NULL;
3995 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3996 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3997 brw->use_rep_send)) {
3998 if (!v.simd16_unsupported) {
3999 /* Try a SIMD16 compile */
4000 v2.import_uniforms(&v);
4001 if (!v2.run_fs()) {
4002 perf_debug("SIMD16 shader failed to compile, falling back to "
4003 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4004 } else {
4005 simd16_cfg = v2.cfg;
4006 }
4007 } else {
4008 perf_debug("SIMD16 shader unsupported, falling back to "
4009 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4010 }
4011 }
4012
4013 cfg_t *simd8_cfg;
4014 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4015 if (no_simd8 && simd16_cfg) {
4016 simd8_cfg = NULL;
4017 prog_data->no_8 = true;
4018 } else {
4019 simd8_cfg = v.cfg;
4020 prog_data->no_8 = false;
4021 }
4022
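/* Code generation emits the surviving SIMD8 program first (at offset 0) and
 * the SIMD16 program after it, recording the latter's start in
 * prog_offset_16.  When SIMD8 is disabled (DEBUG_NO8 or brw->no_simd8) and a
 * SIMD16 program exists, the SIMD8 CFG is dropped and prog_data->no_8 lets
 * later state setup know that only the SIMD16 program is present.
 */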
4023 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4024 &fp->Base, v.runtime_check_aads_emit, "FS");
4025
4026 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4027 char *name;
4028 if (prog)
4029 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4030 prog->Label ? prog->Label : "unnamed",
4031 prog->Name);
4032 else
4033 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4034
4035 g.enable_debug(name);
4036 }
4037
4038 if (simd8_cfg)
4039 g.generate_code(simd8_cfg, 8);
4040 if (simd16_cfg)
4041 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4042
4043 if (unlikely(brw->perf_debug) && shader) {
4044 if (shader->compiled_once)
4045 brw_wm_debug_recompile(brw, prog, key);
4046 shader->compiled_once = true;
4047
4048 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4049 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4050 (get_time() - start_time) * 1000);
4051 }
4052 }
4053
4054 return g.get_assembly(final_assembly_size);
4055 }
4056
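/**
 * Precompile the fragment shader at link time.
 *
 * Builds a best-guess brw_wm_prog_key from the program alone (assuming depth
 * test/write on gen4-5, default shadow-sampler swizzles, etc.) and runs
 * do_wm_prog() with it so the likely variant is already in the program
 * cache, then restores the previous WM program state.
 */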
4057 extern "C" bool
4058 brw_fs_precompile(struct gl_context *ctx,
4059 struct gl_shader_program *shader_prog,
4060 struct gl_program *prog)
4061 {
4062 struct brw_context *brw = brw_context(ctx);
4063 struct brw_wm_prog_key key;
4064
4065 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4066 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4067 bool program_uses_dfdy = fp->UsesDFdy;
4068
4069 memset(&key, 0, sizeof(key));
4070
4071 if (brw->gen < 6) {
4072 if (fp->UsesKill)
4073 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4074
4075 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4076 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4077
4078 /* Just assume depth testing. */
4079 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4080 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4081 }
4082
4083 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4084 BRW_FS_VARYING_INPUT_MASK) > 16)
4085 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4086
4087 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4088 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4089 for (unsigned i = 0; i < sampler_count; i++) {
4090 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4091 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4092 key.tex.swizzles[i] =
4093 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4094 } else {
4095 /* Color sampler: assume no swizzling. */
4096 key.tex.swizzles[i] = SWIZZLE_XYZW;
4097 }
4098 }
4099
4100 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4101 key.drawable_height = ctx->DrawBuffer->Height;
4102 }
4103
4104 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4105 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4106 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4107
4108 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4109 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4110 key.nr_color_regions > 1;
4111 }
4112
4113 key.program_string_id = bfp->id;
4114
4115 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4116 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4117
4118 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
4119
4120 brw->wm.base.prog_offset = old_prog_offset;
4121 brw->wm.prog_data = old_prog_data;
4122
4123 return success;
4124 }