i965: Use SIMD16 instead of SIMD8 on Gen4 when possible.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
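/* (Illustration of the guessing below: an instruction whose destination is
 * an 8-wide GRF simply gets exec_size = 8 from the destination; with a
 * hardware-register destination, the width of the first non-scalar GRF/ATTR
 * source is used instead, and only as a last resort the destination width.)
 */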
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
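/* (For example, a SIMD16 float destination with stride 1 covers
 * 16 * 4 = 64 bytes, so regs_written = 2; a width-1 or stride-0
 * destination still counts as one full register thanks to the MAX2.)
 */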
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
138
139 this->writes_accumulator = false;
140 }
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(brw->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
356
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
359 {
360 uint8_t exec_size = dst.width;
361 for (int i = 0; i < sources; ++i) {
362 assert(src[i].width % dst.width == 0);
363 if (src[i].width > exec_size)
364 exec_size = src[i].width;
365 }
366
367 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
368 dst, src, sources);
369 inst->regs_written = 0;
370 for (int i = 0; i < sources; ++i) {
371 /* The LOAD_PAYLOAD instruction only really makes sense if we are
372 * dealing with whole registers. If this ever changes, we can deal
373 * with it later.
374 */
375 int size = inst->src[i].effective_width * type_sz(src[i].type);
376 assert(size % 32 == 0);
377 inst->regs_written += (size + 31) / 32;
378 }
379
380 return inst;
381 }
382
383 exec_list
384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
385 const fs_reg &surf_index,
386 const fs_reg &varying_offset,
387 uint32_t const_offset)
388 {
389 exec_list instructions;
390 fs_inst *inst;
391
392 /* We have our constant surface use a pitch of 4 bytes, so our index can
393 * be any component of a vector, and then we load 4 contiguous
394 * components starting from that.
395 *
396 * We break down the const_offset to a portion added to the variable
397 * offset and a portion done using reg_offset, which means that if you
398 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
399 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
400 * CSE can later notice that those loads are all the same and eliminate
401 * the redundant ones.
402 */
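/* (Worked example: with const_offset == 22, the ADD below folds
 * (22 & ~3) == 20 into the variable offset, and the final MOV picks the
 * requested component back out of the returned vec4 at
 * reg_offset (22 & 3) == 2, times 'scale' on Gen4.)
 */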
403 fs_reg vec4_offset = vgrf(glsl_type::int_type);
404 instructions.push_tail(ADD(vec4_offset,
405 varying_offset, fs_reg(const_offset & ~3)));
406
407 int scale = 1;
408 if (brw->gen == 4 && dst.width == 8) {
409 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
410 * u, v, r) as parameters, or we can just use the SIMD16 message
411 * consisting of (header, u). We choose the second, at the cost of a
412 * longer return length.
413 */
414 scale = 2;
415 }
416
417 enum opcode op;
418 if (brw->gen >= 7)
419 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
420 else
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
422
423 assert(dst.width % 8 == 0);
424 int regs_written = 4 * (dst.width / 8) * scale;
425 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
426 dst.type, dst.width);
427 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
428 inst->regs_written = regs_written;
429 instructions.push_tail(inst);
430
431 if (brw->gen < 7) {
432 inst->base_mrf = 13;
433 inst->header_present = true;
434 if (brw->gen == 4)
435 inst->mlen = 3;
436 else
437 inst->mlen = 1 + dispatch_width / 8;
438 }
439
440 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
441 instructions.push_tail(MOV(dst, result));
442
443 return instructions;
444 }
445
446 /**
447 * A helper for MOV generation for fixing up broken hardware SEND dependency
448 * handling.
449 */
450 fs_inst *
451 fs_visitor::DEP_RESOLVE_MOV(int grf)
452 {
453 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
454
455 inst->ir = NULL;
456 inst->annotation = "send dependency resolve";
457
458 /* The caller always wants an uncompressed instruction, to emit the minimal
459 * extra dependencies and to avoid having to deal with aligning its regs to 2.
460 */
461 inst->exec_size = 8;
462
463 return inst;
464 }
465
466 bool
467 fs_inst::equals(fs_inst *inst) const
468 {
469 return (opcode == inst->opcode &&
470 dst.equals(inst->dst) &&
471 src[0].equals(inst->src[0]) &&
472 src[1].equals(inst->src[1]) &&
473 src[2].equals(inst->src[2]) &&
474 saturate == inst->saturate &&
475 predicate == inst->predicate &&
476 conditional_mod == inst->conditional_mod &&
477 mlen == inst->mlen &&
478 base_mrf == inst->base_mrf &&
479 target == inst->target &&
480 eot == inst->eot &&
481 header_present == inst->header_present &&
482 shadow_compare == inst->shadow_compare &&
483 exec_size == inst->exec_size &&
484 offset == inst->offset);
485 }
486
487 bool
488 fs_inst::overwrites_reg(const fs_reg &reg) const
489 {
490 return reg.in_range(dst, regs_written);
491 }
492
493 bool
494 fs_inst::is_send_from_grf() const
495 {
496 switch (opcode) {
497 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
498 case SHADER_OPCODE_SHADER_TIME_ADD:
499 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
500 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
501 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
502 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
503 case SHADER_OPCODE_UNTYPED_ATOMIC:
504 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
505 case SHADER_OPCODE_URB_WRITE_SIMD8:
506 return true;
507 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
508 return src[1].file == GRF;
509 case FS_OPCODE_FB_WRITE:
510 return src[0].file == GRF;
511 default:
512 if (is_tex())
513 return src[0].file == GRF;
514
515 return false;
516 }
517 }
518
519 bool
520 fs_inst::can_do_source_mods(struct brw_context *brw)
521 {
522 if (brw->gen == 6 && is_math())
523 return false;
524
525 if (is_send_from_grf())
526 return false;
527
528 if (!backend_instruction::can_do_source_mods())
529 return false;
530
531 return true;
532 }
533
534 void
535 fs_reg::init()
536 {
537 memset(this, 0, sizeof(*this));
538 stride = 1;
539 }
540
541 /** Generic unset register constructor. */
542 fs_reg::fs_reg()
543 {
544 init();
545 this->file = BAD_FILE;
546 }
547
548 /** Immediate value constructor. */
549 fs_reg::fs_reg(float f)
550 {
551 init();
552 this->file = IMM;
553 this->type = BRW_REGISTER_TYPE_F;
554 this->fixed_hw_reg.dw1.f = f;
555 this->width = 1;
556 }
557
558 /** Immediate value constructor. */
559 fs_reg::fs_reg(int32_t i)
560 {
561 init();
562 this->file = IMM;
563 this->type = BRW_REGISTER_TYPE_D;
564 this->fixed_hw_reg.dw1.d = i;
565 this->width = 1;
566 }
567
568 /** Immediate value constructor. */
569 fs_reg::fs_reg(uint32_t u)
570 {
571 init();
572 this->file = IMM;
573 this->type = BRW_REGISTER_TYPE_UD;
574 this->fixed_hw_reg.dw1.ud = u;
575 this->width = 1;
576 }
577
578 /** Vector float immediate value constructor. */
579 fs_reg::fs_reg(uint8_t vf[4])
580 {
581 init();
582 this->file = IMM;
583 this->type = BRW_REGISTER_TYPE_VF;
584 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
585 }
586
587 /** Vector float immediate value constructor. */
588 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
589 {
590 init();
591 this->file = IMM;
592 this->type = BRW_REGISTER_TYPE_VF;
593 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
594 (vf1 << 8) |
595 (vf2 << 16) |
596 (vf3 << 24);
597 }
598
599 /** Fixed brw_reg. */
600 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
601 {
602 init();
603 this->file = HW_REG;
604 this->fixed_hw_reg = fixed_hw_reg;
605 this->type = fixed_hw_reg.type;
606 this->width = 1 << fixed_hw_reg.width;
607 }
608
609 bool
610 fs_reg::equals(const fs_reg &r) const
611 {
612 return (file == r.file &&
613 reg == r.reg &&
614 reg_offset == r.reg_offset &&
615 subreg_offset == r.subreg_offset &&
616 type == r.type &&
617 negate == r.negate &&
618 abs == r.abs &&
619 !reladdr && !r.reladdr &&
620 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
621 width == r.width &&
622 stride == r.stride);
623 }
624
625 fs_reg &
626 fs_reg::set_smear(unsigned subreg)
627 {
628 assert(file != HW_REG && file != IMM);
629 subreg_offset = subreg * type_sz(type);
630 stride = 0;
631 return *this;
632 }
633
634 bool
635 fs_reg::is_contiguous() const
636 {
637 return stride == 1;
638 }
639
640 int
641 fs_visitor::type_size(const struct glsl_type *type)
642 {
643 unsigned int size, i;
644
645 switch (type->base_type) {
646 case GLSL_TYPE_UINT:
647 case GLSL_TYPE_INT:
648 case GLSL_TYPE_FLOAT:
649 case GLSL_TYPE_BOOL:
650 return type->components();
651 case GLSL_TYPE_ARRAY:
652 return type_size(type->fields.array) * type->length;
653 case GLSL_TYPE_STRUCT:
654 size = 0;
655 for (i = 0; i < type->length; i++) {
656 size += type_size(type->fields.structure[i].type);
657 }
658 return size;
659 case GLSL_TYPE_SAMPLER:
660 /* Samplers take up no register space, since they're baked in at
661 * link time.
662 */
663 return 0;
664 case GLSL_TYPE_ATOMIC_UINT:
665 return 0;
666 case GLSL_TYPE_IMAGE:
667 case GLSL_TYPE_VOID:
668 case GLSL_TYPE_ERROR:
669 case GLSL_TYPE_INTERFACE:
670 case GLSL_TYPE_DOUBLE:
671 unreachable("not reached");
672 }
673
674 return 0;
675 }
676
677 /**
678 * Create a MOV to read the timestamp register.
679 *
680 * The caller is responsible for emitting the MOV. The return value is
681 * the destination of the MOV, with extra parameters set.
682 */
683 fs_reg
684 fs_visitor::get_timestamp(fs_inst **out_mov)
685 {
686 assert(brw->gen >= 7);
687
688 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
689 BRW_ARF_TIMESTAMP,
690 0),
691 BRW_REGISTER_TYPE_UD));
692
693 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
694
695 fs_inst *mov = MOV(dst, ts);
696 /* We want to read the 3 fields we care about even if their channels aren't
697 * enabled in the dispatch.
698 */
699 mov->force_writemask_all = true;
700
701 /* The caller wants the low 32 bits of the timestamp. Since it's running
702 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
703 * which is plenty of time for our purposes. It is identical across the
704 * EUs, but since it's tracking GPU core speed it will increment at a
705 * varying rate as render P-states change.
706 *
707 * The caller could also check if render P-states have changed (or anything
708 * else that might disrupt timing) by setting smear to 2 and checking if
709 * that field is != 0.
710 */
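/* (Sanity check on that figure: a 32-bit counter at ~1.2 GHz wraps after
 * 2^32 / 1.2e9 cycles, i.e. roughly 3.6 seconds.)
 */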
711 dst.set_smear(0);
712
713 *out_mov = mov;
714 return dst;
715 }
716
717 void
718 fs_visitor::emit_shader_time_begin()
719 {
720 current_annotation = "shader time start";
721 fs_inst *mov;
722 shader_start_time = get_timestamp(&mov);
723 emit(mov);
724 }
725
726 void
727 fs_visitor::emit_shader_time_end()
728 {
729 current_annotation = "shader time end";
730
731 enum shader_time_shader_type type, written_type, reset_type;
732 switch (stage) {
733 case MESA_SHADER_VERTEX:
734 type = ST_VS;
735 written_type = ST_VS_WRITTEN;
736 reset_type = ST_VS_RESET;
737 break;
738 case MESA_SHADER_GEOMETRY:
739 type = ST_GS;
740 written_type = ST_GS_WRITTEN;
741 reset_type = ST_GS_RESET;
742 break;
743 case MESA_SHADER_FRAGMENT:
744 if (dispatch_width == 8) {
745 type = ST_FS8;
746 written_type = ST_FS8_WRITTEN;
747 reset_type = ST_FS8_RESET;
748 } else {
749 assert(dispatch_width == 16);
750 type = ST_FS16;
751 written_type = ST_FS16_WRITTEN;
752 reset_type = ST_FS16_RESET;
753 }
754 break;
755 default:
756 unreachable("fs_visitor::emit_shader_time_end missing code");
757 }
758
759 /* Insert our code just before the final SEND with EOT. */
760 exec_node *end = this->instructions.get_tail();
761 assert(end && ((fs_inst *) end)->eot);
762
763 fs_inst *tm_read;
764 fs_reg shader_end_time = get_timestamp(&tm_read);
765 end->insert_before(tm_read);
766
767 /* Check that there weren't any timestamp reset events (assuming these
768 * were the only two timestamp reads that happened).
769 */
770 fs_reg reset = shader_end_time;
771 reset.set_smear(2);
772 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
773 test->conditional_mod = BRW_CONDITIONAL_Z;
774 test->force_writemask_all = true;
775 end->insert_before(test);
776 end->insert_before(IF(BRW_PREDICATE_NORMAL));
777
778 fs_reg start = shader_start_time;
779 start.negate = true;
780 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
781 diff.set_smear(0);
782 fs_inst *add = ADD(diff, start, shader_end_time);
783 add->force_writemask_all = true;
784 end->insert_before(add);
785
786 /* If there were no instructions between the two timestamp gets, the diff
787 * is 2 cycles. Remove that overhead, so I can forget about that when
788 * trying to determine the time taken for single instructions.
789 */
790 add = ADD(diff, diff, fs_reg(-2u));
791 add->force_writemask_all = true;
792 end->insert_before(add);
793
794 end->insert_before(SHADER_TIME_ADD(type, diff));
795 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
796 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
797 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
798 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
799 }
800
801 fs_inst *
802 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
803 {
804 int shader_time_index =
805 brw_get_shader_time_index(brw, shader_prog, prog, type);
806 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
807
808 fs_reg payload;
809 if (dispatch_width == 8)
810 payload = vgrf(glsl_type::uvec2_type);
811 else
812 payload = vgrf(glsl_type::uint_type);
813
814 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
815 fs_reg(), payload, offset, value);
816 }
817
818 void
819 fs_visitor::vfail(const char *format, va_list va)
820 {
821 char *msg;
822
823 if (failed)
824 return;
825
826 failed = true;
827
828 msg = ralloc_vasprintf(mem_ctx, format, va);
829 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
830
831 this->fail_msg = msg;
832
833 if (debug_enabled) {
834 fprintf(stderr, "%s", msg);
835 }
836 }
837
838 void
839 fs_visitor::fail(const char *format, ...)
840 {
841 va_list va;
842
843 va_start(va, format);
844 vfail(format, va);
845 va_end(va);
846 }
847
848 /**
849 * Mark this program as impossible to compile in SIMD16 mode.
850 *
851 * During the SIMD8 compile (which happens first), we can detect and flag
852 * things that are unsupported in SIMD16 mode, so the compiler can skip
853 * the SIMD16 compile altogether.
854 *
855 * During a SIMD16 compile (if one happens anyway), this just calls fail().
856 */
857 void
858 fs_visitor::no16(const char *format, ...)
859 {
860 va_list va;
861
862 va_start(va, format);
863
864 if (dispatch_width == 16) {
865 vfail(format, va);
866 } else {
867 simd16_unsupported = true;
868
869 if (brw->perf_debug) {
870 if (no16_msg)
871 ralloc_vasprintf_append(&no16_msg, format, va);
872 else
873 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
874 }
875 }
876
877 va_end(va);
878 }
879
880 fs_inst *
881 fs_visitor::emit(enum opcode opcode)
882 {
883 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
884 }
885
886 fs_inst *
887 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
888 {
889 return emit(new(mem_ctx) fs_inst(opcode, dst));
890 }
891
892 fs_inst *
893 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
894 {
895 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
896 }
897
898 fs_inst *
899 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
900 const fs_reg &src1)
901 {
902 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
903 }
904
905 fs_inst *
906 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
907 const fs_reg &src1, const fs_reg &src2)
908 {
909 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
910 }
911
912 fs_inst *
913 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
914 fs_reg src[], int sources)
915 {
916 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
917 }
918
919 /**
920 * Returns true if the instruction has a flag that means it won't
921 * update an entire destination register.
922 *
923 * For example, dead code elimination and live variable analysis want to know
924 * when a write to a variable screens off any preceding values that were in
925 * it.
926 */
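/* (Concretely, per the checks below: a predicated write other than SEL, a
 * destination with a non-unit stride, or an 8-wide write of a 16-bit type
 * (only 16 of the register's 32 bytes) all count as partial writes.)
 */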
927 bool
928 fs_inst::is_partial_write() const
929 {
930 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
931 (this->dst.width * type_sz(this->dst.type)) < 32 ||
932 !this->dst.is_contiguous());
933 }
934
935 int
936 fs_inst::regs_read(int arg) const
937 {
938 if (is_tex() && arg == 0 && src[0].file == GRF) {
939 return mlen;
940 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
941 return mlen;
942 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
943 return mlen;
944 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
945 return mlen;
946 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
947 return mlen;
948 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
949 return mlen;
950 }
951
952 switch (src[arg].file) {
953 case BAD_FILE:
954 case UNIFORM:
955 case IMM:
956 return 1;
957 case GRF:
958 case HW_REG:
959 if (src[arg].stride == 0) {
960 return 1;
961 } else {
962 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
963 return (size + 31) / 32;
964 }
965 case MRF:
966 unreachable("MRF registers are not allowed as sources");
967 default:
968 unreachable("Invalid register file");
969 }
970 }
971
972 bool
973 fs_inst::reads_flag() const
974 {
975 return predicate;
976 }
977
978 bool
979 fs_inst::writes_flag() const
980 {
981 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
982 opcode != BRW_OPCODE_IF &&
983 opcode != BRW_OPCODE_WHILE)) ||
984 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
985 }
986
987 /**
988 * Returns how many MRFs an FS opcode will write over.
989 *
990 * Note that this is not the 0 or 1 implied writes in an actual gen
991 * instruction -- the FS opcodes often generate MOVs in addition.
992 */
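/* (For example, per the switch below: a SIMD16 POW consumes
 * 2 * 16 / 8 = 4 MRFs, while the sampler opcodes all count as a single
 * implied MRF write regardless of dispatch width.)
 */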
993 int
994 fs_visitor::implied_mrf_writes(fs_inst *inst)
995 {
996 if (inst->mlen == 0)
997 return 0;
998
999 if (inst->base_mrf == -1)
1000 return 0;
1001
1002 switch (inst->opcode) {
1003 case SHADER_OPCODE_RCP:
1004 case SHADER_OPCODE_RSQ:
1005 case SHADER_OPCODE_SQRT:
1006 case SHADER_OPCODE_EXP2:
1007 case SHADER_OPCODE_LOG2:
1008 case SHADER_OPCODE_SIN:
1009 case SHADER_OPCODE_COS:
1010 return 1 * dispatch_width / 8;
1011 case SHADER_OPCODE_POW:
1012 case SHADER_OPCODE_INT_QUOTIENT:
1013 case SHADER_OPCODE_INT_REMAINDER:
1014 return 2 * dispatch_width / 8;
1015 case SHADER_OPCODE_TEX:
1016 case FS_OPCODE_TXB:
1017 case SHADER_OPCODE_TXD:
1018 case SHADER_OPCODE_TXF:
1019 case SHADER_OPCODE_TXF_CMS:
1020 case SHADER_OPCODE_TXF_MCS:
1021 case SHADER_OPCODE_TG4:
1022 case SHADER_OPCODE_TG4_OFFSET:
1023 case SHADER_OPCODE_TXL:
1024 case SHADER_OPCODE_TXS:
1025 case SHADER_OPCODE_LOD:
1026 return 1;
1027 case FS_OPCODE_FB_WRITE:
1028 return 2;
1029 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1030 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1031 return 1;
1032 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1033 return inst->mlen;
1034 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1035 return 2;
1036 case SHADER_OPCODE_UNTYPED_ATOMIC:
1037 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1038 case SHADER_OPCODE_URB_WRITE_SIMD8:
1039 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1040 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1041 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1042 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1043 return 0;
1044 default:
1045 unreachable("not reached");
1046 }
1047 }
1048
1049 fs_reg
1050 fs_visitor::vgrf(const glsl_type *const type)
1051 {
1052 int reg_width = dispatch_width / 8;
1053 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1054 brw_type_for_base_type(type), dispatch_width);
1055 }
1056
1057 fs_reg
1058 fs_visitor::vgrf(int num_components)
1059 {
1060 int reg_width = dispatch_width / 8;
1061 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1062 BRW_REGISTER_TYPE_F, dispatch_width);
1063 }
1064
1065 /** Fixed HW reg constructor. */
1066 fs_reg::fs_reg(enum register_file file, int reg)
1067 {
1068 init();
1069 this->file = file;
1070 this->reg = reg;
1071 this->type = BRW_REGISTER_TYPE_F;
1072
1073 switch (file) {
1074 case UNIFORM:
1075 this->width = 1;
1076 break;
1077 default:
1078 this->width = 8;
1079 }
1080 }
1081
1082 /** Fixed HW reg constructor. */
1083 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1084 {
1085 init();
1086 this->file = file;
1087 this->reg = reg;
1088 this->type = type;
1089
1090 switch (file) {
1091 case UNIFORM:
1092 this->width = 1;
1093 break;
1094 default:
1095 this->width = 8;
1096 }
1097 }
1098
1099 /** Fixed HW reg constructor. */
1100 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1101 uint8_t width)
1102 {
1103 init();
1104 this->file = file;
1105 this->reg = reg;
1106 this->type = type;
1107 this->width = width;
1108 }
1109
1110 fs_reg *
1111 fs_visitor::variable_storage(ir_variable *var)
1112 {
1113 return (fs_reg *)hash_table_find(this->variable_ht, var);
1114 }
1115
1116 void
1117 import_uniforms_callback(const void *key,
1118 void *data,
1119 void *closure)
1120 {
1121 struct hash_table *dst_ht = (struct hash_table *)closure;
1122 const fs_reg *reg = (const fs_reg *)data;
1123
1124 if (reg->file != UNIFORM)
1125 return;
1126
1127 hash_table_insert(dst_ht, data, key);
1128 }
1129
1130 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1131 * This brings in those uniform definitions.
1132 */
1133 void
1134 fs_visitor::import_uniforms(fs_visitor *v)
1135 {
1136 hash_table_call_foreach(v->variable_ht,
1137 import_uniforms_callback,
1138 variable_ht);
1139 this->push_constant_loc = v->push_constant_loc;
1140 this->pull_constant_loc = v->pull_constant_loc;
1141 this->uniforms = v->uniforms;
1142 this->param_size = v->param_size;
1143 }
1144
1145 /* Our support for uniforms is piggy-backed on the struct
1146 * gl_fragment_program, because that's where the values actually
1147 * get stored, rather than in some global gl_shader_program uniform
1148 * store.
1149 */
1150 void
1151 fs_visitor::setup_uniform_values(ir_variable *ir)
1152 {
1153 int namelen = strlen(ir->name);
1154
1155 /* The data for our (non-builtin) uniforms is stored in a series of
1156 * gl_uniform_driver_storage structs for each subcomponent that
1157 * glGetUniformLocation() could name. We know it's been set up in the same
1158 * order we'd walk the type, so walk the list of storage and find anything
1159 * whose name is ours, or whose name starts with ours followed by '.' or '['.
1160 */
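/* (For instance, with a hypothetical "uniform struct { vec4 v; float f; } s;",
 * the storage entries "s.v" and "s.f" both match: our name "s" followed by
 * '.', contributing 4 and 1 slots respectively.)
 */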
1161 unsigned params_before = uniforms;
1162 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1163 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1164
1165 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1166 (storage->name[namelen] != 0 &&
1167 storage->name[namelen] != '.' &&
1168 storage->name[namelen] != '[')) {
1169 continue;
1170 }
1171
1172 unsigned slots = storage->type->component_slots();
1173 if (storage->array_elements)
1174 slots *= storage->array_elements;
1175
1176 for (unsigned i = 0; i < slots; i++) {
1177 stage_prog_data->param[uniforms++] = &storage->storage[i];
1178 }
1179 }
1180
1181 /* Make sure we actually initialized the right amount of stuff here. */
1182 assert(params_before + ir->type->component_slots() == uniforms);
1183 (void)params_before;
1184 }
1185
1186
1187 /* Our support for builtin uniforms is even scarier than non-builtin.
1188 * It sits on top of the PROG_STATE_VAR parameters that are
1189 * automatically updated from GL context state.
1190 */
1191 void
1192 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1193 {
1194 const ir_state_slot *const slots = ir->get_state_slots();
1195 assert(slots != NULL);
1196
1197 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1198 /* This state reference has already been set up by ir_to_mesa, but we'll
1199 * get the same index back here.
1200 */
1201 int index = _mesa_add_state_reference(this->prog->Parameters,
1202 (gl_state_index *)slots[i].tokens);
1203
1204 /* Add each of the unique swizzles of the element as a parameter.
1205 * This'll end up matching the expected layout of the
1206 * array/matrix/structure we're trying to fill in.
1207 */
1208 int last_swiz = -1;
1209 for (unsigned int j = 0; j < 4; j++) {
1210 int swiz = GET_SWZ(slots[i].swizzle, j);
1211 if (swiz == last_swiz)
1212 break;
1213 last_swiz = swiz;
1214
1215 stage_prog_data->param[uniforms++] =
1216 &prog->Parameters->ParameterValues[index][swiz];
1217 }
1218 }
1219 }
1220
1221 fs_reg *
1222 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1223 bool origin_upper_left)
1224 {
1225 assert(stage == MESA_SHADER_FRAGMENT);
1226 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1227 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1228 fs_reg wpos = *reg;
1229 bool flip = !origin_upper_left ^ key->render_to_fbo;
1230
1231 /* gl_FragCoord.x */
1232 if (pixel_center_integer) {
1233 emit(MOV(wpos, this->pixel_x));
1234 } else {
1235 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1236 }
1237 wpos = offset(wpos, 1);
1238
1239 /* gl_FragCoord.y */
1240 if (!flip && pixel_center_integer) {
1241 emit(MOV(wpos, this->pixel_y));
1242 } else {
1243 fs_reg pixel_y = this->pixel_y;
1244 float offset = (pixel_center_integer ? 0.0 : 0.5);
1245
1246 if (flip) {
1247 pixel_y.negate = true;
1248 offset += key->drawable_height - 1.0;
1249 }
1250
1251 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1252 }
1253 wpos = offset(wpos, 1);
1254
1255 /* gl_FragCoord.z */
1256 if (brw->gen >= 6) {
1257 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1258 } else {
1259 emit(FS_OPCODE_LINTERP, wpos,
1260 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1261 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1262 interp_reg(VARYING_SLOT_POS, 2));
1263 }
1264 wpos = offset(wpos, 1);
1265
1266 /* gl_FragCoord.w: Already set up in emit_interpolation */
1267 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1268
1269 return reg;
1270 }
1271
1272 fs_inst *
1273 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1274 glsl_interp_qualifier interpolation_mode,
1275 bool is_centroid, bool is_sample)
1276 {
1277 brw_wm_barycentric_interp_mode barycoord_mode;
1278 if (brw->gen >= 6) {
1279 if (is_centroid) {
1280 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1281 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1282 else
1283 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1284 } else if (is_sample) {
1285 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1286 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1287 else
1288 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1289 } else {
1290 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1291 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1292 else
1293 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1294 }
1295 } else {
1296 /* On Ironlake and below, there is only one interpolation mode.
1297 * Centroid interpolation doesn't mean anything on this hardware --
1298 * there is no multisampling.
1299 */
1300 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1301 }
1302 return emit(FS_OPCODE_LINTERP, attr,
1303 this->delta_x[barycoord_mode],
1304 this->delta_y[barycoord_mode], interp);
1305 }
1306
1307 void
1308 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1309 const glsl_type *type,
1310 glsl_interp_qualifier interpolation_mode,
1311 int location, bool mod_centroid,
1312 bool mod_sample)
1313 {
1314 attr.type = brw_type_for_base_type(type->get_scalar_type());
1315
1316 assert(stage == MESA_SHADER_FRAGMENT);
1317 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1318 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1319
1320 unsigned int array_elements;
1321
1322 if (type->is_array()) {
1323 array_elements = type->length;
1324 if (array_elements == 0) {
1325 fail("dereferenced array '%s' has length 0\n", name);
1326 }
1327 type = type->fields.array;
1328 } else {
1329 array_elements = 1;
1330 }
1331
1332 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1333 bool is_gl_Color =
1334 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1335 if (key->flat_shade && is_gl_Color) {
1336 interpolation_mode = INTERP_QUALIFIER_FLAT;
1337 } else {
1338 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1339 }
1340 }
1341
1342 for (unsigned int i = 0; i < array_elements; i++) {
1343 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1344 if (prog_data->urb_setup[location] == -1) {
1345 /* If there's no incoming setup data for this slot, don't
1346 * emit interpolation for it.
1347 */
1348 attr = offset(attr, type->vector_elements);
1349 location++;
1350 continue;
1351 }
1352
1353 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1354 /* Constant interpolation (flat shading) case. The SF has
1355 * handed us defined values in only the constant offset
1356 * field of the setup reg.
1357 */
1358 for (unsigned int k = 0; k < type->vector_elements; k++) {
1359 struct brw_reg interp = interp_reg(location, k);
1360 interp = suboffset(interp, 3);
1361 interp.type = attr.type;
1362 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1363 attr = offset(attr, 1);
1364 }
1365 } else {
1366 /* Smooth/noperspective interpolation case. */
1367 for (unsigned int k = 0; k < type->vector_elements; k++) {
1368 struct brw_reg interp = interp_reg(location, k);
1369 if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1370 /* Get the pixel/sample mask into f0 so that we know
1371 * which pixels are lit. Then, for each channel that is
1372 * unlit, replace the centroid data with non-centroid
1373 * data.
1374 */
1375 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1376
1377 fs_inst *inst;
1378 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1379 false, false);
1380 inst->predicate = BRW_PREDICATE_NORMAL;
1381 inst->predicate_inverse = true;
1382 if (brw->has_pln)
1383 inst->no_dd_clear = true;
1384
1385 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1386 mod_centroid && !key->persample_shading,
1387 mod_sample || key->persample_shading);
1388 inst->predicate = BRW_PREDICATE_NORMAL;
1389 inst->predicate_inverse = false;
1390 if (brw->has_pln)
1391 inst->no_dd_check = true;
1392
1393 } else {
1394 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1395 mod_centroid && !key->persample_shading,
1396 mod_sample || key->persample_shading);
1397 }
1398 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1399 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1400 }
1401 attr = offset(attr, 1);
1402 }
1403
1404 }
1405 location++;
1406 }
1407 }
1408 }
1409
1410 fs_reg *
1411 fs_visitor::emit_frontfacing_interpolation()
1412 {
1413 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1414
1415 if (brw->gen >= 6) {
1416 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1417 * a boolean result from this (~0/true or 0/false).
1418 *
1419 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1420 * this task in only one instruction:
1421 * - a negation source modifier will flip the bit; and
1422 * - a W -> D type conversion will sign extend the bit into the high
1423 * word of the destination.
1424 *
1425 * An ASR 15 fills the low word of the destination.
1426 */
1427 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1428 g0.negate = true;
1429
1430 emit(ASR(*reg, g0, fs_reg(15)));
1431 } else {
1432 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1433 * a boolean result from this (1/true or 0/false).
1434 *
1435 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1436 * the negation source modifier to flip it. Unfortunately the SHR
1437 * instruction only operates on UD (or D with an abs source modifier)
1438 * sources without negation.
1439 *
1440 * Instead, use ASR (which will give ~0/true or 0/false).
1441 */
1442 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1443 g1_6.negate = true;
1444
1445 emit(ASR(*reg, g1_6, fs_reg(31)));
1446 }
1447
1448 return reg;
1449 }
1450
1451 void
1452 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1453 {
1454 assert(stage == MESA_SHADER_FRAGMENT);
1455 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1456 assert(dst.type == BRW_REGISTER_TYPE_F);
1457
1458 if (key->compute_pos_offset) {
1459 /* Convert int_sample_pos to floating point */
1460 emit(MOV(dst, int_sample_pos));
1461 /* Scale to the range [0, 1] */
1462 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1463 }
1464 else {
1465 /* From the ARB_sample_shading specification:
1466 * "When rendering to a non-multisample buffer, or if multisample
1467 * rasterization is disabled, gl_SamplePosition will always be
1468 * (0.5, 0.5)."
1469 */
1470 emit(MOV(dst, fs_reg(0.5f)));
1471 }
1472 }
1473
1474 fs_reg *
1475 fs_visitor::emit_samplepos_setup()
1476 {
1477 assert(brw->gen >= 6);
1478
1479 this->current_annotation = "compute sample position";
1480 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1481 fs_reg pos = *reg;
1482 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1483 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1484
1485 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1486 * mode will be enabled.
1487 *
1488 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1489 * R31.1:0 Position Offset X/Y for Slot[3:0]
1490 * R31.3:2 Position Offset X/Y for Slot[7:4]
1491 * .....
1492 *
1493 * The X, Y sample positions come in as bytes in thread payload. So, read
1494 * the positions using vstride=16, width=8, hstride=2.
1495 */
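/* (With that <16;8,2>:B region, channel i reads payload byte 2*i, i.e. the
 * X offset for slot i; the Y offsets are read the same way starting from a
 * sub-offset of 1, as done below.)
 */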
1496 struct brw_reg sample_pos_reg =
1497 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1498 BRW_REGISTER_TYPE_B), 16, 8, 2);
1499
1500 if (dispatch_width == 8) {
1501 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1502 } else {
1503 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1504 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1505 ->force_sechalf = true;
1506 }
1507 /* Compute gl_SamplePosition.x */
1508 compute_sample_position(pos, int_sample_x);
1509 pos = offset(pos, 1);
1510 if (dispatch_width == 8) {
1511 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1512 } else {
1513 emit(MOV(half(int_sample_y, 0),
1514 fs_reg(suboffset(sample_pos_reg, 1))));
1515 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1516 ->force_sechalf = true;
1517 }
1518 /* Compute gl_SamplePosition.y */
1519 compute_sample_position(pos, int_sample_y);
1520 return reg;
1521 }
1522
1523 fs_reg *
1524 fs_visitor::emit_sampleid_setup()
1525 {
1526 assert(stage == MESA_SHADER_FRAGMENT);
1527 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1528 assert(brw->gen >= 6);
1529
1530 this->current_annotation = "compute sample id";
1531 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1532
1533 if (key->compute_sample_id) {
1534 fs_reg t1 = vgrf(glsl_type::int_type);
1535 fs_reg t2 = vgrf(glsl_type::int_type);
1536 t2.type = BRW_REGISTER_TYPE_UW;
1537
1538 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1539 * 8x multisampling, subspan 0 will represent sample N (where N
1540 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1541 * 7. We can find the value of N by looking at R0.0 bits 7:6
1542 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1543 * (since samples are always delivered in pairs). That is, we
1544 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1545 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1546 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1547 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1548 * populating a temporary variable with the sequence (0, 1, 2, 3),
1549 * and then reading from it using vstride=1, width=4, hstride=0.
1550 * These computations hold for 4x multisampling as well.
1551 *
1552 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1553 * the first four slots are sample 0 of subspan 0; the next four
1554 * are sample 1 of subspan 0; the third group is sample 0 of
1555 * subspan 1, and finally sample 1 of subspan 1.
1556 */
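/* (Worked example, assuming 8x MSAA, SIMD16 and R0.0 bits 7:6 == 01b:
 * t1 = (0x40 & 0xc0) >> 5 == 2, and adding the
 * (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3) sequence yields sample
 * IDs 2,2,2,2, 3,3,3,3, 4,4,4,4, 5,5,5,5 across the 16 channels.)
 */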
1557 fs_inst *inst;
1558 inst = emit(BRW_OPCODE_AND, t1,
1559 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1560 fs_reg(0xc0));
1561 inst->force_writemask_all = true;
1562 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1563 inst->force_writemask_all = true;
1564 /* This works for both SIMD8 and SIMD16 */
1565 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1566 inst->force_writemask_all = true;
1567 /* This special instruction takes care of setting vstride=1,
1568 * width=4, hstride=0 of t2 during an ADD instruction.
1569 */
1570 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1571 } else {
1572 /* As per GL_ARB_sample_shading specification:
1573 * "When rendering to a non-multisample buffer, or if multisample
1574 * rasterization is disabled, gl_SampleID will always be zero."
1575 */
1576 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1577 }
1578
1579 return reg;
1580 }
1581
1582 void
1583 fs_visitor::resolve_source_modifiers(fs_reg *src)
1584 {
1585 if (!src->abs && !src->negate)
1586 return;
1587
1588 fs_reg temp = retype(vgrf(1), src->type);
1589 emit(MOV(temp, *src));
1590 *src = temp;
1591 }
1592
1593 fs_reg
1594 fs_visitor::fix_math_operand(fs_reg src)
1595 {
1596 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1597 * might be able to do better by doing execsize = 1 math and then
1598 * expanding that result out, but we would need to be careful with
1599 * masking.
1600 *
1601 * The hardware ignores source modifiers (negate and abs) on math
1602 * instructions, so we also move to a temp to set those up.
1603 */
1604 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1605 !src.abs && !src.negate)
1606 return src;
1607
1608 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1609 * operands to math
1610 */
1611 if (brw->gen >= 7 && src.file != IMM)
1612 return src;
1613
1614 fs_reg expanded = vgrf(glsl_type::float_type);
1615 expanded.type = src.type;
1616 emit(BRW_OPCODE_MOV, expanded, src);
1617 return expanded;
1618 }
1619
1620 fs_inst *
1621 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1622 {
1623 switch (opcode) {
1624 case SHADER_OPCODE_RCP:
1625 case SHADER_OPCODE_RSQ:
1626 case SHADER_OPCODE_SQRT:
1627 case SHADER_OPCODE_EXP2:
1628 case SHADER_OPCODE_LOG2:
1629 case SHADER_OPCODE_SIN:
1630 case SHADER_OPCODE_COS:
1631 break;
1632 default:
1633 unreachable("not reached: bad math opcode");
1634 }
1635
1636 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1637 * might be able to do better by doing execsize = 1 math and then
1638 * expanding that result out, but we would need to be careful with
1639 * masking.
1640 *
1641 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1642 * instructions, so we also move to a temp to set those up.
1643 */
1644 if (brw->gen == 6 || brw->gen == 7)
1645 src = fix_math_operand(src);
1646
1647 fs_inst *inst = emit(opcode, dst, src);
1648
1649 if (brw->gen < 6) {
1650 inst->base_mrf = 2;
1651 inst->mlen = dispatch_width / 8;
1652 }
1653
1654 return inst;
1655 }
1656
1657 fs_inst *
1658 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1659 {
1660 int base_mrf = 2;
1661 fs_inst *inst;
1662
1663 if (brw->gen >= 8) {
1664 inst = emit(opcode, dst, src0, src1);
1665 } else if (brw->gen >= 6) {
1666 src0 = fix_math_operand(src0);
1667 src1 = fix_math_operand(src1);
1668
1669 inst = emit(opcode, dst, src0, src1);
1670 } else {
1671 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1672 * "Message Payload":
1673 *
1674 * "Operand0[7]. For the INT DIV functions, this operand is the
1675 * denominator."
1676 * ...
1677 * "Operand1[7]. For the INT DIV functions, this operand is the
1678 * numerator."
1679 */
1680 bool is_int_div = opcode != SHADER_OPCODE_POW;
1681 fs_reg &op0 = is_int_div ? src1 : src0;
1682 fs_reg &op1 = is_int_div ? src0 : src1;
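/* (For example, for INT_QUOTIENT computing a / b: op0 is b, the denominator,
 * passed as operand 0 of the math instruction, while op1 is a, the
 * numerator, which the MOV below places in MRF base_mrf + 1 as operand 1.)
 */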
1683
1684 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1685 inst = emit(opcode, dst, op0, reg_null_f);
1686
1687 inst->base_mrf = base_mrf;
1688 inst->mlen = 2 * dispatch_width / 8;
1689 }
1690 return inst;
1691 }
1692
1693 void
1694 fs_visitor::emit_discard_jump()
1695 {
1696 /* For performance, after a discard, jump to the end of the
1697 * shader if all relevant channels have been discarded.
1698 */
1699 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1700 discard_jump->flag_subreg = 1;
1701
1702 discard_jump->predicate = (dispatch_width == 8)
1703 ? BRW_PREDICATE_ALIGN1_ANY8H
1704 : BRW_PREDICATE_ALIGN1_ANY16H;
1705 discard_jump->predicate_inverse = true;
1706 }
1707
1708 void
1709 fs_visitor::assign_curb_setup()
1710 {
1711 if (dispatch_width == 8) {
1712 prog_data->dispatch_grf_start_reg = payload.num_regs;
1713 } else {
1714 assert(stage == MESA_SHADER_FRAGMENT);
1715 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1716 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1717 }
1718
1719 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1720
1721 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1722 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1723 for (unsigned int i = 0; i < inst->sources; i++) {
1724 if (inst->src[i].file == UNIFORM) {
1725 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1726 int constant_nr;
1727 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1728 constant_nr = push_constant_loc[uniform_nr];
1729 } else {
1730 /* Section 5.11 of the OpenGL 4.1 spec says:
1731 * "Out-of-bounds reads return undefined values, which include
1732 * values from other variables of the active program or zero."
1733 * Just return the first push constant.
1734 */
1735 constant_nr = 0;
1736 }
1737
1738 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1739 constant_nr / 8,
1740 constant_nr % 8);
1741
1742 inst->src[i].file = HW_REG;
1743 inst->src[i].fixed_hw_reg = byte_offset(
1744 retype(brw_reg, inst->src[i].type),
1745 inst->src[i].subreg_offset);
1746 }
1747 }
1748 }
1749 }
1750
1751 void
1752 fs_visitor::calculate_urb_setup()
1753 {
1754 assert(stage == MESA_SHADER_FRAGMENT);
1755 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1756 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1757
1758 memset(prog_data->urb_setup, -1,
1759 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1760
1761 int urb_next = 0;
1762 /* Figure out where each of the incoming setup attributes lands. */
1763 if (brw->gen >= 6) {
1764 if (_mesa_bitcount_64(prog->InputsRead &
1765 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1766 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1767 * first 16 varying inputs, so we can put them wherever we want.
1768 * Just put them in order.
1769 *
1770 * This is useful because it means that (a) inputs not used by the
1771 * fragment shader won't take up valuable register space, and (b) we
1772 * won't have to recompile the fragment shader if it gets paired with
1773 * a different vertex (or geometry) shader.
1774 */
1775 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1776 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1777 BITFIELD64_BIT(i)) {
1778 prog_data->urb_setup[i] = urb_next++;
1779 }
1780 }
1781 } else {
1782 /* We have enough input varyings that the SF/SBE pipeline stage can't
1783 * arbitrarily rearrange them to suit our whim; we have to put them
1784 * in an order that matches the output of the previous pipeline stage
1785 * (geometry or vertex shader).
1786 */
1787 struct brw_vue_map prev_stage_vue_map;
1788 brw_compute_vue_map(brw, &prev_stage_vue_map,
1789 key->input_slots_valid);
1790 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1791 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1792 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1793 slot++) {
1794 int varying = prev_stage_vue_map.slot_to_varying[slot];
1795 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1796 * unused.
1797 */
1798 if (varying != BRW_VARYING_SLOT_COUNT &&
1799 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1800 BITFIELD64_BIT(varying))) {
1801 prog_data->urb_setup[varying] = slot - first_slot;
1802 }
1803 }
1804 urb_next = prev_stage_vue_map.num_slots - first_slot;
1805 }
1806 } else {
1807 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1808 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1809 /* Point size is packed into the header, not as a general attribute */
1810 if (i == VARYING_SLOT_PSIZ)
1811 continue;
1812
1813 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1814 /* The back color slot is skipped when the front color is
1815 * also written to. In addition, some slots can be
1816 * written in the vertex shader and not read in the
1817 * fragment shader. So the register number must always be
1818 * incremented, mapped or not.
1819 */
1820 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1821 prog_data->urb_setup[i] = urb_next;
1822 urb_next++;
1823 }
1824 }
1825
1826 /*
1827 * It's an FS-only attribute, and we did the interpolation for this
1828 * attribute in the SF thread. So, count it here, too.
1829 *
1830 * See compile_sf_prog() for more info.
1831 */
1832 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1833 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1834 }
1835
1836 prog_data->num_varying_inputs = urb_next;
1837 }
1838
1839 void
1840 fs_visitor::assign_urb_setup()
1841 {
1842 assert(stage == MESA_SHADER_FRAGMENT);
1843 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1844
1845 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1846
1847 /* Offset all the urb_setup[] index by the actual position of the
1848 * setup regs, now that the location of the constants has been chosen.
1849 */
1850 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1851 if (inst->opcode == FS_OPCODE_LINTERP) {
1852 assert(inst->src[2].file == HW_REG);
1853 inst->src[2].fixed_hw_reg.nr += urb_start;
1854 }
1855
1856 if (inst->opcode == FS_OPCODE_CINTERP) {
1857 assert(inst->src[0].file == HW_REG);
1858 inst->src[0].fixed_hw_reg.nr += urb_start;
1859 }
1860 }
1861
1862 /* Each attribute is 4 setup channels, each of which is half a reg. */
1863 this->first_non_payload_grf =
1864 urb_start + prog_data->num_varying_inputs * 2;
1865 }
1866
1867 void
1868 fs_visitor::assign_vs_urb_setup()
1869 {
1870 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1871 int grf, count, slot, channel, attr;
1872
1873 assert(stage == MESA_SHADER_VERTEX);
1874 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1875 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1876 count++;
1877
1878 /* Each attribute is 4 regs. */
1879 this->first_non_payload_grf =
1880 payload.num_regs + prog_data->curb_read_length + count * 4;
1881
1882 unsigned vue_entries =
1883 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1884
1885 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1886 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1887
1888 assert(vs_prog_data->base.urb_read_length <= 15);
1889
1890 /* Rewrite all ATTR file references to the hw grf that they land in. */
1891 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1892 for (int i = 0; i < inst->sources; i++) {
1893 if (inst->src[i].file == ATTR) {
1894
1895 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1896 slot = count - 1;
1897 } else {
1898 /* Attributes come in a contiguous block, ordered by their
1899 * gl_vert_attrib value. That means we can compute the slot
1900 * number for an attribute by masking out the enabled
1901 * attributes before it and counting the bits.
1902 */
1903 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1904 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1905 BITFIELD64_MASK(attr));
1906 }
1907
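/* reg_offset addresses individual channels here: dividing by 4 above gives
 * the slot, and the low two bits give the channel within that slot.
 */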
1908 channel = inst->src[i].reg_offset & 3;
1909
1910 grf = payload.num_regs +
1911 prog_data->curb_read_length +
1912 slot * 4 + channel;
1913
1914 inst->src[i].file = HW_REG;
1915 inst->src[i].fixed_hw_reg =
1916 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1917 }
1918 }
1919 }
1920 }
1921
1922 /**
1923 * Split large virtual GRFs into separate components if we can.
1924 *
1925 * This mostly duplicates what brw_fs_vector_splitting does,
1926 * but that's really conservative because it's afraid of doing
1927 * splitting that doesn't result in real progress after the rest of
1928 * the optimization phases, which would cause infinite looping in
1929 * optimization. We can do it once here, safely. This also has the
1930 * opportunity to split interpolated values, or maybe even uniforms,
1931 * which we don't have at the IR level.
1932 *
1933 * We want to split, because virtual GRFs are what we register
1934 * allocate and spill (due to contiguousness requirements for some
1935 * instructions), and they're what we naturally generate in the
1936 * codegen process, but most virtual GRFs don't actually need to be
1937 * contiguous sets of GRFs. If we split, we'll end up with reduced
1938 * live intervals and better dead code elimination and coalescing.
1939 */
1940 void
1941 fs_visitor::split_virtual_grfs()
1942 {
1943 int num_vars = this->alloc.count;
1944
1945 /* Count the total number of registers */
1946 int reg_count = 0;
1947 int vgrf_to_reg[num_vars];
1948 for (int i = 0; i < num_vars; i++) {
1949 vgrf_to_reg[i] = reg_count;
1950 reg_count += alloc.sizes[i];
1951 }
1952
1953 /* An array of "split points". For each register slot, this indicates
1954 * if this slot can be separated from the previous slot. Every time an
1955 * instruction uses multiple elements of a register (as a source or
1956 * destination), we mark the used slots as inseparable. Then we go
1957 * through and split the registers into the smallest pieces we can.
1958 */
1959 bool split_points[reg_count];
1960 memset(split_points, 0, sizeof(split_points));
1961
1962 /* Mark all used registers as fully splittable */
1963 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1964 if (inst->dst.file == GRF) {
1965 int reg = vgrf_to_reg[inst->dst.reg];
1966 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1967 split_points[reg + j] = true;
1968 }
1969
1970 for (int i = 0; i < inst->sources; i++) {
1971 if (inst->src[i].file == GRF) {
1972 int reg = vgrf_to_reg[inst->src[i].reg];
1973 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1974 split_points[reg + j] = true;
1975 }
1976 }
1977 }
1978
1979 if (brw->has_pln &&
1980 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1981 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1982 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1983 * Gen6, that was the only supported interpolation mode, and since Gen6,
1984 * delta_x and delta_y are in fixed hardware registers.
1985 */
1986 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1987 split_points[vgrf_to_reg[vgrf] + 1] = false;
1988 }
1989
1990 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1991 if (inst->dst.file == GRF) {
1992 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1993 for (int j = 1; j < inst->regs_written; j++)
1994 split_points[reg + j] = false;
1995 }
1996 for (int i = 0; i < inst->sources; i++) {
1997 if (inst->src[i].file == GRF) {
1998 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1999 for (int j = 1; j < inst->regs_read(i); j++)
2000 split_points[reg + j] = false;
2001 }
2002 }
2003 }
2004
2005 int new_virtual_grf[reg_count];
2006 int new_reg_offset[reg_count];
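/* For each pre-split register slot, these record the new VGRF it lands in
 * and its offset within that new VGRF.
 */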
2007
2008 int reg = 0;
2009 for (int i = 0; i < num_vars; i++) {
2010 /* As a quick sanity check, the first slot should never be a split point. */
2011 assert(split_points[reg] == false);
2012
2013 /* j = 0 case */
2014 new_reg_offset[reg] = 0;
2015 reg++;
2016 int offset = 1;
2017
2018 /* j > 0 case */
2019 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2020 /* If this is a split point, allocate a new virtual GRF for the
2021 * preceding `offset` registers, then reset the offset to 0.
2022 */
2023 if (split_points[reg]) {
2024 assert(offset <= MAX_VGRF_SIZE);
2025 int grf = alloc.allocate(offset);
2026 for (int k = reg - offset; k < reg; k++)
2027 new_virtual_grf[k] = grf;
2028 offset = 0;
2029 }
2030 new_reg_offset[reg] = offset;
2031 offset++;
2032 reg++;
2033 }
2034
2035 /* The last one gets the original register number */
2036 assert(offset <= MAX_VGRF_SIZE);
2037 alloc.sizes[i] = offset;
2038 for (int k = reg - offset; k < reg; k++)
2039 new_virtual_grf[k] = i;
2040 }
2041 assert(reg == reg_count);
2042
2043 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2044 if (inst->dst.file == GRF) {
2045 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2046 inst->dst.reg = new_virtual_grf[reg];
2047 inst->dst.reg_offset = new_reg_offset[reg];
2048 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2049 }
2050 for (int i = 0; i < inst->sources; i++) {
2051 if (inst->src[i].file == GRF) {
2052 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2053 inst->src[i].reg = new_virtual_grf[reg];
2054 inst->src[i].reg_offset = new_reg_offset[reg];
2055 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2056 }
2057 }
2058 }
2059 invalidate_live_intervals();
2060 }
2061
2062 /**
2063 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2064 *
2065 * During code generation, we create tons of temporary variables, many of
2066 * which get immediately killed and are never used again. Yet, in later
2067 * optimization and analysis passes, such as compute_live_intervals, we need
2068 * to loop over all the virtual GRFs. Compacting them can save a lot of
2069 * overhead.
2070 */
2071 bool
2072 fs_visitor::compact_virtual_grfs()
2073 {
2074 bool progress = false;
2075 int remap_table[this->alloc.count];
2076 memset(remap_table, -1, sizeof(remap_table));
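/* -1 marks a VGRF as unused; the loops below first flag used VGRFs and then
 * overwrite the flags with the compacted register numbers.
 */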
2077
2078 /* Mark which virtual GRFs are used. */
2079 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2080 if (inst->dst.file == GRF)
2081 remap_table[inst->dst.reg] = 0;
2082
2083 for (int i = 0; i < inst->sources; i++) {
2084 if (inst->src[i].file == GRF)
2085 remap_table[inst->src[i].reg] = 0;
2086 }
2087 }
2088
2089 /* Compact the GRF arrays. */
2090 int new_index = 0;
2091 for (unsigned i = 0; i < this->alloc.count; i++) {
2092 if (remap_table[i] == -1) {
2093 /* We just found an unused register. This means that we are
2094 * actually going to compact something.
2095 */
2096 progress = true;
2097 } else {
2098 remap_table[i] = new_index;
2099 alloc.sizes[new_index] = alloc.sizes[i];
2100 invalidate_live_intervals();
2101 ++new_index;
2102 }
2103 }
2104
2105 this->alloc.count = new_index;
2106
2107 /* Patch all the instructions to use the newly renumbered registers */
2108 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2109 if (inst->dst.file == GRF)
2110 inst->dst.reg = remap_table[inst->dst.reg];
2111
2112 for (int i = 0; i < inst->sources; i++) {
2113 if (inst->src[i].file == GRF)
2114 inst->src[i].reg = remap_table[inst->src[i].reg];
2115 }
2116 }
2117
2118 /* Patch all the references to delta_x/delta_y, since they're used in
2119 * register allocation. If they're unused, switch them to BAD_FILE so
2120 * we don't think some random VGRF is delta_x/delta_y.
2121 */
2122 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2123 if (delta_x[i].file == GRF) {
2124 if (remap_table[delta_x[i].reg] != -1) {
2125 delta_x[i].reg = remap_table[delta_x[i].reg];
2126 } else {
2127 delta_x[i].file = BAD_FILE;
2128 }
2129 }
2130 }
2131 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2132 if (delta_y[i].file == GRF) {
2133 if (remap_table[delta_y[i].reg] != -1) {
2134 delta_y[i].reg = remap_table[delta_y[i].reg];
2135 } else {
2136 delta_y[i].file = BAD_FILE;
2137 }
2138 }
2139 }
2140
2141 return progress;
2142 }
2143
2144 /*
2145 * Implements array access of uniforms by inserting a
2146 * PULL_CONSTANT_LOAD instruction.
2147 *
2148 * Unlike temporary GRF array access (which we don't support, due to
2149 * the difficulty of doing relative addressing on instruction
2150 * destinations), we could potentially do array access of uniforms
2151 * that were loaded in GRF space as push constants. In real-world
2152 * usage we've seen, though, the arrays being used are always larger
2153 * than we could load as push constants, so just always move all
2154 * uniform array access out to a pull constant buffer.
2155 */
2156 void
2157 fs_visitor::move_uniform_array_access_to_pull_constants()
2158 {
2159 if (dispatch_width != 8)
2160 return;
2161
2162 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2163 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
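/* -1 means this uniform has not been assigned a pull constant slot. */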
2164
2165 /* Walk through and find array access of uniforms. Put a copy of that
2166 * uniform in the pull constant buffer.
2167 *
2168 * Note that we don't move constant-indexed accesses to arrays. No
2169 * testing has been done of the performance impact of this choice.
2170 */
2171 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2172 for (int i = 0 ; i < inst->sources; i++) {
2173 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2174 continue;
2175
2176 int uniform = inst->src[i].reg;
2177
2178 /* If this array isn't already present in the pull constant buffer,
2179 * add it.
2180 */
2181 if (pull_constant_loc[uniform] == -1) {
2182 const gl_constant_value **values = &stage_prog_data->param[uniform];
2183
2184 assert(param_size[uniform]);
2185
2186 for (int j = 0; j < param_size[uniform]; j++) {
2187 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2188
2189 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2190 values[j];
2191 }
2192 }
2193 }
2194 }
2195 }
2196
2197 /**
2198 * Assign UNIFORM file registers to either push constants or pull constants.
2199 *
2200 * We allow a fragment shader to have more than the GL-required minimum
2201 * maximum number of fragment shader uniform components (64). If there
2202 * are too many of these, they'd fill up all of the register space.
2203 * So, this will push some of them out to the pull constant buffer and
2204 * update the program to load them.
2205 */
2206 void
2207 fs_visitor::assign_constant_locations()
2208 {
2209 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2210 if (dispatch_width != 8)
2211 return;
2212
2213 /* Find which UNIFORM registers are still in use. */
2214 bool is_live[uniforms];
2215 for (unsigned int i = 0; i < uniforms; i++) {
2216 is_live[i] = false;
2217 }
2218
2219 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2220 for (int i = 0; i < inst->sources; i++) {
2221 if (inst->src[i].file != UNIFORM)
2222 continue;
2223
2224 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2225 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2226 is_live[constant_nr] = true;
2227 }
2228 }
2229
2230 /* Only allow 16 registers (128 uniform components) as push constants.
2231 *
2232 * Just demote the end of the list. We could probably do better
2233 * here, demoting things that are rarely used in the program first.
2234 *
2235 * If changing this value, note the limitation about total_regs in
2236 * brw_curbe.c.
2237 */
2238 unsigned int max_push_components = 16 * 8;
2239 unsigned int num_push_constants = 0;
2240
2241 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2242
2243 for (unsigned int i = 0; i < uniforms; i++) {
2244 if (!is_live[i] || pull_constant_loc[i] != -1) {
2245 /* This UNIFORM register is either dead, or has already been demoted
2246 * to a pull const. Mark it as no longer living in the param[] array.
2247 */
2248 push_constant_loc[i] = -1;
2249 continue;
2250 }
2251
2252 if (num_push_constants < max_push_components) {
2253 /* Retain as a push constant. Record the location in the params[]
2254 * array.
2255 */
2256 push_constant_loc[i] = num_push_constants++;
2257 } else {
2258 /* Demote to a pull constant. */
2259 push_constant_loc[i] = -1;
2260
2261 int pull_index = stage_prog_data->nr_pull_params++;
2262 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2263 pull_constant_loc[i] = pull_index;
2264 }
2265 }
2266
2267 stage_prog_data->nr_params = num_push_constants;
2268
2269 /* Up until now, the param[] array has been indexed by reg + reg_offset
2270 * of UNIFORM registers. Condense it to only contain the uniforms we
2271 * chose to upload as push constants.
2272 */
2273 for (unsigned int i = 0; i < uniforms; i++) {
2274 int remapped = push_constant_loc[i];
2275
2276 if (remapped == -1)
2277 continue;
2278
2279 assert(remapped <= (int)i);
2280 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2281 }
2282 }
2283
2284 /**
2285 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2286 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2287 */
2288 void
2289 fs_visitor::demote_pull_constants()
2290 {
2291 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2292 for (int i = 0; i < inst->sources; i++) {
2293 if (inst->src[i].file != UNIFORM)
2294 continue;
2295
2296 int pull_index;
2297 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2298 if (location >= uniforms) /* Out of bounds access */
2299 pull_index = -1;
2300 else
2301 pull_index = pull_constant_loc[location];
2302
2303 if (pull_index == -1)
2304 continue;
2305
2306 /* Set up the annotation tracking for newly generated instructions. */
2307 base_ir = inst->ir;
2308 current_annotation = inst->annotation;
2309
2310 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2311 fs_reg dst = vgrf(glsl_type::float_type);
2312
2313 /* Generate a pull load into dst. */
2314 if (inst->src[i].reladdr) {
2315 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2316 surf_index,
2317 *inst->src[i].reladdr,
2318 pull_index);
2319 inst->insert_before(block, &list);
2320 inst->src[i].reladdr = NULL;
2321 } else {
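/* Round the byte offset down to a 16-byte (vec4) boundary; the component
 * within that vec4 is picked with set_smear() below.
 */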
2322 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2323 fs_inst *pull =
2324 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2325 dst, surf_index, offset);
2326 inst->insert_before(block, pull);
2327 inst->src[i].set_smear(pull_index & 3);
2328 }
2329
2330 /* Rewrite the instruction to use the temporary VGRF. */
2331 inst->src[i].file = GRF;
2332 inst->src[i].reg = dst.reg;
2333 inst->src[i].reg_offset = 0;
2334 inst->src[i].width = dispatch_width;
2335 }
2336 }
2337 invalidate_live_intervals();
2338 }
2339
2340 bool
2341 fs_visitor::opt_algebraic()
2342 {
2343 bool progress = false;
2344
2345 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2346 switch (inst->opcode) {
2347 case BRW_OPCODE_MOV:
2348 if (inst->src[0].file != IMM)
2349 break;
2350
2351 if (inst->saturate) {
2352 if (inst->dst.type != inst->src[0].type)
2353 assert(!"unimplemented: saturate mixed types");
2354
2355 if (brw_saturate_immediate(inst->dst.type,
2356 &inst->src[0].fixed_hw_reg)) {
2357 inst->saturate = false;
2358 progress = true;
2359 }
2360 }
2361 break;
2362
2363 case BRW_OPCODE_MUL:
2364 if (inst->src[1].file != IMM)
2365 continue;
2366
2367 /* a * 1.0 = a */
2368 if (inst->src[1].is_one()) {
2369 inst->opcode = BRW_OPCODE_MOV;
2370 inst->src[1] = reg_undef;
2371 progress = true;
2372 break;
2373 }
2374
2375 /* a * -1.0 = -a */
2376 if (inst->src[1].is_negative_one()) {
2377 inst->opcode = BRW_OPCODE_MOV;
2378 inst->src[0].negate = !inst->src[0].negate;
2379 inst->src[1] = reg_undef;
2380 progress = true;
2381 break;
2382 }
2383
2384 /* a * 0.0 = 0.0 */
2385 if (inst->src[1].is_zero()) {
2386 inst->opcode = BRW_OPCODE_MOV;
2387 inst->src[0] = inst->src[1];
2388 inst->src[1] = reg_undef;
2389 progress = true;
2390 break;
2391 }
2392
2393 if (inst->src[0].file == IMM) {
2394 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2395 inst->opcode = BRW_OPCODE_MOV;
2396 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2397 inst->src[1] = reg_undef;
2398 progress = true;
2399 break;
2400 }
2401 break;
2402 case BRW_OPCODE_ADD:
2403 if (inst->src[1].file != IMM)
2404 continue;
2405
2406 /* a + 0.0 = a */
2407 if (inst->src[1].is_zero()) {
2408 inst->opcode = BRW_OPCODE_MOV;
2409 inst->src[1] = reg_undef;
2410 progress = true;
2411 break;
2412 }
2413
2414 if (inst->src[0].file == IMM) {
2415 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2416 inst->opcode = BRW_OPCODE_MOV;
2417 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2418 inst->src[1] = reg_undef;
2419 progress = true;
2420 break;
2421 }
2422 break;
2423 case BRW_OPCODE_OR:
2424 if (inst->src[0].equals(inst->src[1])) {
2425 inst->opcode = BRW_OPCODE_MOV;
2426 inst->src[1] = reg_undef;
2427 progress = true;
2428 break;
2429 }
2430 break;
2431 case BRW_OPCODE_LRP:
2432 if (inst->src[1].equals(inst->src[2])) {
2433 inst->opcode = BRW_OPCODE_MOV;
2434 inst->src[0] = inst->src[1];
2435 inst->src[1] = reg_undef;
2436 inst->src[2] = reg_undef;
2437 progress = true;
2438 break;
2439 }
2440 break;
2441 case BRW_OPCODE_CMP:
2442 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2443 inst->src[0].abs &&
2444 inst->src[0].negate &&
2445 inst->src[1].is_zero()) {
2446 inst->src[0].abs = false;
2447 inst->src[0].negate = false;
2448 inst->conditional_mod = BRW_CONDITIONAL_Z;
2449 progress = true;
2450 break;
2451 }
2452 break;
2453 case BRW_OPCODE_SEL:
2454 if (inst->src[0].equals(inst->src[1])) {
2455 inst->opcode = BRW_OPCODE_MOV;
2456 inst->src[1] = reg_undef;
2457 inst->predicate = BRW_PREDICATE_NONE;
2458 inst->predicate_inverse = false;
2459 progress = true;
2460 } else if (inst->saturate && inst->src[1].file == IMM) {
2461 switch (inst->conditional_mod) {
2462 case BRW_CONDITIONAL_LE:
2463 case BRW_CONDITIONAL_L:
2464 switch (inst->src[1].type) {
2465 case BRW_REGISTER_TYPE_F:
2466 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2467 inst->opcode = BRW_OPCODE_MOV;
2468 inst->src[1] = reg_undef;
2469 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2470 progress = true;
2471 }
2472 break;
2473 default:
2474 break;
2475 }
2476 break;
2477 case BRW_CONDITIONAL_GE:
2478 case BRW_CONDITIONAL_G:
2479 switch (inst->src[1].type) {
2480 case BRW_REGISTER_TYPE_F:
2481 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2482 inst->opcode = BRW_OPCODE_MOV;
2483 inst->src[1] = reg_undef;
2484 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2485 progress = true;
2486 }
2487 break;
2488 default:
2489 break;
2490 }
2491 default:
2492 break;
2493 }
2494 }
2495 break;
2496 case BRW_OPCODE_MAD:
2497 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2498 inst->opcode = BRW_OPCODE_MOV;
2499 inst->src[1] = reg_undef;
2500 inst->src[2] = reg_undef;
2501 progress = true;
2502 } else if (inst->src[0].is_zero()) {
2503 inst->opcode = BRW_OPCODE_MUL;
2504 inst->src[0] = inst->src[2];
2505 inst->src[2] = reg_undef;
2506 progress = true;
2507 } else if (inst->src[1].is_one()) {
2508 inst->opcode = BRW_OPCODE_ADD;
2509 inst->src[1] = inst->src[2];
2510 inst->src[2] = reg_undef;
2511 progress = true;
2512 } else if (inst->src[2].is_one()) {
2513 inst->opcode = BRW_OPCODE_ADD;
2514 inst->src[2] = reg_undef;
2515 progress = true;
2516 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
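/* Both multiplicands are immediates: fold their product into src[1] and
 * turn the MAD into an ADD of src[0] and the constant.
 */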
2517 inst->opcode = BRW_OPCODE_ADD;
2518 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2519 inst->src[2] = reg_undef;
2520 progress = true;
2521 }
2522 break;
2523 case SHADER_OPCODE_RCP: {
2524 fs_inst *prev = (fs_inst *)inst->prev;
2525 if (prev->opcode == SHADER_OPCODE_SQRT) {
2526 if (inst->src[0].equals(prev->dst)) {
2527 inst->opcode = SHADER_OPCODE_RSQ;
2528 inst->src[0] = prev->src[0];
2529 progress = true;
2530 }
2531 }
2532 break;
2533 }
2534 default:
2535 break;
2536 }
2537
2538 /* Swap if src[0] is immediate. */
2539 if (progress && inst->is_commutative()) {
2540 if (inst->src[0].file == IMM) {
2541 fs_reg tmp = inst->src[1];
2542 inst->src[1] = inst->src[0];
2543 inst->src[0] = tmp;
2544 }
2545 }
2546 }
2547 return progress;
2548 }
2549
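/**
 * Give a fresh virtual GRF to each instruction that completely overwrites a
 * VGRF outside of control flow, and rewrite later reads to use the new name,
 * so unrelated reuses of the same VGRF don't get merged into one long live
 * interval.
 */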
2550 bool
2551 fs_visitor::opt_register_renaming()
2552 {
2553 bool progress = false;
2554 int depth = 0;
2555
2556 int remap[alloc.count];
2557 memset(remap, -1, sizeof(int) * alloc.count);
2558
2559 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2560 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2561 depth++;
2562 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2563 inst->opcode == BRW_OPCODE_WHILE) {
2564 depth--;
2565 }
2566
2567 /* Rewrite instruction sources. */
2568 for (int i = 0; i < inst->sources; i++) {
2569 if (inst->src[i].file == GRF &&
2570 remap[inst->src[i].reg] != -1 &&
2571 remap[inst->src[i].reg] != inst->src[i].reg) {
2572 inst->src[i].reg = remap[inst->src[i].reg];
2573 progress = true;
2574 }
2575 }
2576
2577 const int dst = inst->dst.reg;
2578
2579 if (depth == 0 &&
2580 inst->dst.file == GRF &&
2581 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2582 !inst->is_partial_write()) {
2583 if (remap[dst] == -1) {
2584 remap[dst] = dst;
2585 } else {
2586 remap[dst] = alloc.allocate(inst->dst.width / 8);
2587 inst->dst.reg = remap[dst];
2588 progress = true;
2589 }
2590 } else if (inst->dst.file == GRF &&
2591 remap[dst] != -1 &&
2592 remap[dst] != dst) {
2593 inst->dst.reg = remap[dst];
2594 progress = true;
2595 }
2596 }
2597
2598 if (progress) {
2599 invalidate_live_intervals();
2600
2601 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2602 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2603 delta_x[i].reg = remap[delta_x[i].reg];
2604 }
2605 }
2606 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2607 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2608 delta_y[i].reg = remap[delta_y[i].reg];
2609 }
2610 }
2611 }
2612
2613 return progress;
2614 }
2615
2616 /**
2617 * Remove redundant or useless discard jumps.
2618 *
2619 * For example, we can eliminate jumps in the following sequence:
2620 *
2621 * discard-jump (redundant with the next jump)
2622 * discard-jump (useless; jumps to the next instruction)
2623 * placeholder-halt
2624 */
2625 bool
2626 fs_visitor::opt_redundant_discard_jumps()
2627 {
2628 bool progress = false;
2629
2630 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2631
2632 fs_inst *placeholder_halt = NULL;
2633 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2634 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2635 placeholder_halt = inst;
2636 break;
2637 }
2638 }
2639
2640 if (!placeholder_halt)
2641 return false;
2642
2643 /* Delete any HALTs immediately before the placeholder halt. */
2644 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2645 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2646 prev = (fs_inst *) placeholder_halt->prev) {
2647 prev->remove(last_bblock);
2648 progress = true;
2649 }
2650
2651 if (progress)
2652 invalidate_live_intervals();
2653
2654 return progress;
2655 }
2656
2657 bool
2658 fs_visitor::compute_to_mrf()
2659 {
2660 bool progress = false;
2661 int next_ip = 0;
2662
2663 /* No MRFs on Gen >= 7. */
2664 if (brw->gen >= 7)
2665 return false;
2666
2667 calculate_live_intervals();
2668
2669 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2670 int ip = next_ip;
2671 next_ip++;
2672
2673 if (inst->opcode != BRW_OPCODE_MOV ||
2674 inst->is_partial_write() ||
2675 inst->dst.file != MRF || inst->src[0].file != GRF ||
2676 inst->dst.type != inst->src[0].type ||
2677 inst->src[0].abs || inst->src[0].negate ||
2678 !inst->src[0].is_contiguous() ||
2679 inst->src[0].subreg_offset)
2680 continue;
2681
2682 /* Work out which hardware MRF registers are written by this
2683 * instruction.
2684 */
2685 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2686 int mrf_high;
2687 if (inst->dst.reg & BRW_MRF_COMPR4) {
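/* A COMPR4 write lands in mN and mN+4, so the second MRF touched is four
 * registers above the first.
 */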
2688 mrf_high = mrf_low + 4;
2689 } else if (inst->exec_size == 16) {
2690 mrf_high = mrf_low + 1;
2691 } else {
2692 mrf_high = mrf_low;
2693 }
2694
2695 /* Can't compute-to-MRF this GRF if someone else was going to
2696 * read it later.
2697 */
2698 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2699 continue;
2700
2701 /* Found a move of a GRF to a MRF. Let's see if we can go
2702 * rewrite the thing that made this GRF to write into the MRF.
2703 */
2704 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2705 if (scan_inst->dst.file == GRF &&
2706 scan_inst->dst.reg == inst->src[0].reg) {
2707 /* Found the last thing to write our reg we want to turn
2708 * into a compute-to-MRF.
2709 */
2710
2711 /* If this one instruction didn't populate all the
2712 * channels, bail. We might be able to rewrite everything
2713 * that writes that reg, but it would require smarter
2714 * tracking to delay the rewriting until complete success.
2715 */
2716 if (scan_inst->is_partial_write())
2717 break;
2718
2719 /* Instructions writing more than one register would require us to
2720 * coalesce out more than one MOV at a time, which we can't do here.
2721 */
2722 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2723 break;
2724
2725 /* SEND instructions can't have MRF as a destination. */
2726 if (scan_inst->mlen)
2727 break;
2728
2729 if (brw->gen == 6) {
2730 /* gen6 math instructions must have the destination be
2731 * GRF, so no compute-to-MRF for them.
2732 */
2733 if (scan_inst->is_math()) {
2734 break;
2735 }
2736 }
2737
2738 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2739 /* Found the creator of our MRF's source value. */
2740 scan_inst->dst.file = MRF;
2741 scan_inst->dst.reg = inst->dst.reg;
2742 scan_inst->saturate |= inst->saturate;
2743 inst->remove(block);
2744 progress = true;
2745 }
2746 break;
2747 }
2748
2749 /* We don't handle control flow here. Most computation of
2750 * values that end up in MRFs happens shortly before the MRF
2751 * write anyway.
2752 */
2753 if (block->start() == scan_inst)
2754 break;
2755
2756 /* You can't read from an MRF, so if someone else reads our
2757 * MRF's source GRF that we wanted to rewrite, that stops us.
2758 */
2759 bool interfered = false;
2760 for (int i = 0; i < scan_inst->sources; i++) {
2761 if (scan_inst->src[i].file == GRF &&
2762 scan_inst->src[i].reg == inst->src[0].reg &&
2763 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2764 interfered = true;
2765 }
2766 }
2767 if (interfered)
2768 break;
2769
2770 if (scan_inst->dst.file == MRF) {
2771 /* If somebody else writes our MRF here, we can't
2772 * compute-to-MRF before that.
2773 */
2774 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2775 int scan_mrf_high;
2776
2777 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2778 scan_mrf_high = scan_mrf_low + 4;
2779 } else if (scan_inst->exec_size == 16) {
2780 scan_mrf_high = scan_mrf_low + 1;
2781 } else {
2782 scan_mrf_high = scan_mrf_low;
2783 }
2784
2785 if (mrf_low == scan_mrf_low ||
2786 mrf_low == scan_mrf_high ||
2787 mrf_high == scan_mrf_low ||
2788 mrf_high == scan_mrf_high) {
2789 break;
2790 }
2791 }
2792
2793 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2794 /* Found a SEND instruction, which means that there are
2795 * live values in MRFs from base_mrf to base_mrf +
2796 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2797 * above it.
2798 */
2799 if (mrf_low >= scan_inst->base_mrf &&
2800 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2801 break;
2802 }
2803 if (mrf_high >= scan_inst->base_mrf &&
2804 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2805 break;
2806 }
2807 }
2808 }
2809 }
2810
2811 if (progress)
2812 invalidate_live_intervals();
2813
2814 return progress;
2815 }
2816
2817 /**
2818 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2819 * instructions to FS_OPCODE_REP_FB_WRITE.
2820 */
2821 void
2822 fs_visitor::emit_repclear_shader()
2823 {
2824 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2825 int base_mrf = 1;
2826 int color_mrf = base_mrf + 2;
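/* The FB write message header occupies two MRFs, so the replicated color
 * data starts two registers above base_mrf.
 */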
2827
2828 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2829 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2830 mov->force_writemask_all = true;
2831
2832 fs_inst *write;
2833 if (key->nr_color_regions == 1) {
2834 write = emit(FS_OPCODE_REP_FB_WRITE);
2835 write->saturate = key->clamp_fragment_color;
2836 write->base_mrf = color_mrf;
2837 write->target = 0;
2838 write->header_present = false;
2839 write->mlen = 1;
2840 } else {
2841 assume(key->nr_color_regions > 0);
2842 for (int i = 0; i < key->nr_color_regions; ++i) {
2843 write = emit(FS_OPCODE_REP_FB_WRITE);
2844 write->saturate = key->clamp_fragment_color;
2845 write->base_mrf = base_mrf;
2846 write->target = i;
2847 write->header_present = true;
2848 write->mlen = 3;
2849 }
2850 }
2851 write->eot = true;
2852
2853 calculate_cfg();
2854
2855 assign_constant_locations();
2856 assign_curb_setup();
2857
2858 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2859 assert(mov->src[0].file == HW_REG);
2860 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2861 }
2862
2863 /**
2864 * Walks through basic blocks, looking for repeated MRF writes and
2865 * removing the later ones.
2866 */
2867 bool
2868 fs_visitor::remove_duplicate_mrf_writes()
2869 {
2870 fs_inst *last_mrf_move[16];
2871 bool progress = false;
2872
2873 /* The MRF tracking below doesn't handle compressed (SIMD16) instructions. */
2874 if (dispatch_width == 16)
2875 return false;
2876
2877 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2878
2879 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2880 if (inst->is_control_flow()) {
2881 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2882 }
2883
2884 if (inst->opcode == BRW_OPCODE_MOV &&
2885 inst->dst.file == MRF) {
2886 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2887 if (prev_inst && inst->equals(prev_inst)) {
2888 inst->remove(block);
2889 progress = true;
2890 continue;
2891 }
2892 }
2893
2894 /* Clear out the last-write records for MRFs that were overwritten. */
2895 if (inst->dst.file == MRF) {
2896 last_mrf_move[inst->dst.reg] = NULL;
2897 }
2898
2899 if (inst->mlen > 0 && inst->base_mrf != -1) {
2900 /* Found a SEND instruction, which will include two or fewer
2901 * implied MRF writes. We could do better here.
2902 */
2903 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2904 last_mrf_move[inst->base_mrf + i] = NULL;
2905 }
2906 }
2907
2908 /* Clear out any MRF move records whose sources got overwritten. */
2909 if (inst->dst.file == GRF) {
2910 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2911 if (last_mrf_move[i] &&
2912 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2913 last_mrf_move[i] = NULL;
2914 }
2915 }
2916 }
2917
2918 if (inst->opcode == BRW_OPCODE_MOV &&
2919 inst->dst.file == MRF &&
2920 inst->src[0].file == GRF &&
2921 !inst->is_partial_write()) {
2922 last_mrf_move[inst->dst.reg] = inst;
2923 }
2924 }
2925
2926 if (progress)
2927 invalidate_live_intervals();
2928
2929 return progress;
2930 }
2931
2932 static void
2933 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2934 {
2935 /* Clear the flag for registers that actually got read (as expected). */
2936 for (int i = 0; i < inst->sources; i++) {
2937 int grf;
2938 if (inst->src[i].file == GRF) {
2939 grf = inst->src[i].reg;
2940 } else if (inst->src[i].file == HW_REG &&
2941 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2942 grf = inst->src[i].fixed_hw_reg.nr;
2943 } else {
2944 continue;
2945 }
2946
2947 if (grf >= first_grf &&
2948 grf < first_grf + grf_len) {
2949 deps[grf - first_grf] = false;
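/* A SIMD16 source spans two GRFs, so clear the next register as well. */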
2950 if (inst->exec_size == 16)
2951 deps[grf - first_grf + 1] = false;
2952 }
2953 }
2954 }
2955
2956 /**
2957 * Implements this workaround for the original 965:
2958 *
2959 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2960 * check for post destination dependencies on this instruction, software
2961 * must ensure that there is no destination hazard for the case of ‘write
2962 * followed by a posted write’ shown in the following example.
2963 *
2964 * 1. mov r3 0
2965 * 2. send r3.xy <rest of send instruction>
2966 * 3. mov r2 r3
2967 *
2968 * Due to no post-destination dependency check on the ‘send’, the above
2969 * code sequence could have two instructions (1 and 2) in flight at the
2970 * same time that both consider ‘r3’ as the target of their final writes.
2971 */
2972 void
2973 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2974 fs_inst *inst)
2975 {
2976 int write_len = inst->regs_written;
2977 int first_write_grf = inst->dst.reg;
2978 bool needs_dep[BRW_MAX_MRF];
2979 assert(write_len < (int)sizeof(needs_dep) - 1);
2980
2981 memset(needs_dep, false, sizeof(needs_dep));
2982 memset(needs_dep, true, write_len);
2983
2984 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2985
2986 /* Walk backwards looking for writes to registers we're writing which
2987 * aren't read since being written. If we hit the start of the program,
2988 * we assume that there are no outstanding dependencies on entry to the
2989 * program.
2990 */
2991 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2992 /* If we hit control flow, assume that there *are* outstanding
2993 * dependencies, and force their cleanup before our instruction.
2994 */
2995 if (block->start() == scan_inst) {
2996 for (int i = 0; i < write_len; i++) {
2997 if (needs_dep[i]) {
2998 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2999 }
3000 }
3001 return;
3002 }
3003
3004 /* We insert our reads as late as possible on the assumption that any
3005 * instruction but a MOV that might have left us an outstanding
3006 * dependency has more latency than a MOV.
3007 */
3008 if (scan_inst->dst.file == GRF) {
3009 for (int i = 0; i < scan_inst->regs_written; i++) {
3010 int reg = scan_inst->dst.reg + i;
3011
3012 if (reg >= first_write_grf &&
3013 reg < first_write_grf + write_len &&
3014 needs_dep[reg - first_write_grf]) {
3015 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3016 needs_dep[reg - first_write_grf] = false;
3017 if (scan_inst->exec_size == 16)
3018 needs_dep[reg - first_write_grf + 1] = false;
3019 }
3020 }
3021 }
3022
3023 /* Clear the flag for registers that actually got read (as expected). */
3024 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3025
3026 /* Continue the loop only if we haven't resolved all the dependencies */
3027 int i;
3028 for (i = 0; i < write_len; i++) {
3029 if (needs_dep[i])
3030 break;
3031 }
3032 if (i == write_len)
3033 return;
3034 }
3035 }
3036
3037 /**
3038 * Implements this workaround for the original 965:
3039 *
3040 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3041 * used as a destination register until after it has been sourced by an
3042 * instruction with a different destination register.
3043 */
3044 void
3045 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3046 {
3047 int write_len = inst->regs_written;
3048 int first_write_grf = inst->dst.reg;
3049 bool needs_dep[BRW_MAX_MRF];
3050 assert(write_len < (int)sizeof(needs_dep) - 1);
3051
3052 memset(needs_dep, false, sizeof(needs_dep));
3053 memset(needs_dep, true, write_len);
3054 /* Walk forwards looking for writes to registers we're writing which aren't
3055 * read before being written.
3056 */
3057 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3058 /* If we hit control flow, force resolve all remaining dependencies. */
3059 if (block->end() == scan_inst) {
3060 for (int i = 0; i < write_len; i++) {
3061 if (needs_dep[i])
3062 scan_inst->insert_before(block,
3063 DEP_RESOLVE_MOV(first_write_grf + i));
3064 }
3065 return;
3066 }
3067
3068 /* Clear the flag for registers that actually got read (as expected). */
3069 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3070
3071 /* We insert our reads as late as possible since they're reading the
3072 * result of a SEND, which has massive latency.
3073 */
3074 if (scan_inst->dst.file == GRF &&
3075 scan_inst->dst.reg >= first_write_grf &&
3076 scan_inst->dst.reg < first_write_grf + write_len &&
3077 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3078 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3079 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3080 }
3081
3082 /* Continue the loop only if we haven't resolved all the dependencies */
3083 int i;
3084 for (i = 0; i < write_len; i++) {
3085 if (needs_dep[i])
3086 break;
3087 }
3088 if (i == write_len)
3089 return;
3090 }
3091 }
3092
3093 void
3094 fs_visitor::insert_gen4_send_dependency_workarounds()
3095 {
3096 if (brw->gen != 4 || brw->is_g4x)
3097 return;
3098
3099 bool progress = false;
3100
3101 /* Note that we're done with register allocation, so GRF fs_regs always
3102 * have a .reg_offset of 0.
3103 */
3104
3105 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3106 if (inst->mlen != 0 && inst->dst.file == GRF) {
3107 insert_gen4_pre_send_dependency_workarounds(block, inst);
3108 insert_gen4_post_send_dependency_workarounds(block, inst);
3109 progress = true;
3110 }
3111 }
3112
3113 if (progress)
3114 invalidate_live_intervals();
3115 }
3116
3117 /**
3118 * Turns the generic expression-style uniform pull constant load instruction
3119 * into a hardware-specific series of instructions for loading a pull
3120 * constant.
3121 *
3122 * The expression style allows the CSE pass before this to optimize out
3123 * repeated loads from the same offset, and gives the pre-register-allocation
3124 * scheduling full flexibility, while the conversion to native instructions
3125 * allows the post-register-allocation scheduler the best information
3126 * possible.
3127 *
3128 * Note that execution masking for setting up pull constant loads is special:
3129 * the channels that need to be written are unrelated to the current execution
3130 * mask, since a later instruction will use one of the result channels as a
3131 * source operand for all 8 or 16 of its channels.
3132 */
3133 void
3134 fs_visitor::lower_uniform_pull_constant_loads()
3135 {
3136 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3137 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3138 continue;
3139
3140 if (brw->gen >= 7) {
3141 /* The offset arg before was a vec4-aligned byte offset. We need to
3142 * turn it into a dword offset.
3143 */
3144 fs_reg const_offset_reg = inst->src[1];
3145 assert(const_offset_reg.file == IMM &&
3146 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3147 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3148 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3149
3150 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3151 * Reserve space for the register.
3152 */
3153 if (brw->gen >= 9) {
3154 payload.reg_offset++;
3155 alloc.sizes[payload.reg] = 2;
3156 }
3157
3158 /* This is actually going to be a MOV, but since only the first dword
3159 * is accessed, we have a special opcode to do just that one. Note
3160 * that this needs to be an operation that will be considered a def
3161 * by live variable analysis, or register allocation will explode.
3162 */
3163 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3164 8, payload, const_offset_reg);
3165 setup->force_writemask_all = true;
3166
3167 setup->ir = inst->ir;
3168 setup->annotation = inst->annotation;
3169 inst->insert_before(block, setup);
3170
3171 /* Similarly, this will only populate the first 4 channels of the
3172 * result register (since we only use smear values from 0-3), but we
3173 * don't tell the optimizer.
3174 */
3175 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3176 inst->src[1] = payload;
3177
3178 invalidate_live_intervals();
3179 } else {
3180 /* Before register allocation, we didn't tell the scheduler about the
3181 * MRF we use. We know it's safe to use this MRF because nothing
3182 * else does except for register spill/unspill, which generates and
3183 * uses its MRF within a single IR instruction.
3184 */
3185 inst->base_mrf = 14;
3186 inst->mlen = 1;
3187 }
3188 }
3189 }
3190
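/**
 * Lower SHADER_OPCODE_LOAD_PAYLOAD into a sequence of MOVs, one per source,
 * using a COMPR4 MRF write when two adjacent sources form a SIMD16 pair.
 * The writemask/sechalf metadata of the instructions that produced each GRF
 * source is propagated so the copies use matching execution controls.
 */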
3191 bool
3192 fs_visitor::lower_load_payload()
3193 {
3194 bool progress = false;
3195
3196 int vgrf_to_reg[alloc.count];
3197 int reg_count = 0;
3198 for (unsigned i = 0; i < alloc.count; ++i) {
3199 vgrf_to_reg[i] = reg_count;
3200 reg_count += alloc.sizes[i];
3201 }
3202
3203 struct {
3204 bool written:1; /* Whether this register has ever been written */
3205 bool force_writemask_all:1;
3206 bool force_sechalf:1;
3207 } metadata[reg_count];
3208 memset(metadata, 0, sizeof(metadata));
3209
3210 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3211 if (inst->dst.file == GRF) {
3212 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3213 bool force_sechalf = inst->force_sechalf &&
3214 !inst->force_writemask_all;
3215 bool toggle_sechalf = inst->dst.width == 16 &&
3216 type_sz(inst->dst.type) == 4 &&
3217 !inst->force_writemask_all;
3218 for (int i = 0; i < inst->regs_written; ++i) {
3219 metadata[dst_reg + i].written = true;
3220 metadata[dst_reg + i].force_sechalf = force_sechalf;
3221 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3222 force_sechalf = (toggle_sechalf != force_sechalf);
3223 }
3224 }
3225
3226 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3227 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3228 fs_reg dst = inst->dst;
3229
3230 for (int i = 0; i < inst->sources; i++) {
3231 dst.width = inst->src[i].effective_width;
3232 dst.type = inst->src[i].type;
3233
3234 if (inst->src[i].file == BAD_FILE) {
3235 /* Do nothing but otherwise increment as normal */
3236 } else if (dst.file == MRF &&
3237 dst.width == 8 &&
3238 brw->has_compr4 &&
3239 i + 4 < inst->sources &&
3240 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3241 fs_reg compr4_dst = dst;
3242 compr4_dst.reg += BRW_MRF_COMPR4;
3243 compr4_dst.width = 16;
3244 fs_reg compr4_src = inst->src[i];
3245 compr4_src.width = 16;
3246 fs_inst *mov = MOV(compr4_dst, compr4_src);
3247 mov->force_writemask_all = true;
3248 inst->insert_before(block, mov);
3249 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3250 inst->src[i + 4].file = BAD_FILE;
3251 } else {
3252 fs_inst *mov = MOV(dst, inst->src[i]);
3253 if (inst->src[i].file == GRF) {
3254 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3255 inst->src[i].reg_offset;
3256 mov->force_sechalf = metadata[src_reg].force_sechalf;
3257 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3258 } else {
3259 /* We don't have any useful metadata for immediates or
3260 * uniforms. Assume that any of the channels of the
3261 * destination may be used.
3262 */
3263 assert(inst->src[i].file == IMM ||
3264 inst->src[i].file == UNIFORM);
3265 mov->force_writemask_all = true;
3266 }
3267
3268 if (dst.file == GRF) {
3269 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3270 const bool force_writemask = mov->force_writemask_all;
3271 metadata[dst_reg].force_writemask_all = force_writemask;
3272 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3273 if (dst.width * type_sz(dst.type) > 32) {
3274 assert(!mov->force_sechalf);
3275 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3276 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3277 }
3278 }
3279
3280 inst->insert_before(block, mov);
3281 }
3282
3283 dst = offset(dst, 1);
3284 }
3285
3286 inst->remove(block);
3287 progress = true;
3288 }
3289 }
3290
3291 if (progress)
3292 invalidate_live_intervals();
3293
3294 return progress;
3295 }
3296
3297 void
3298 fs_visitor::dump_instructions()
3299 {
3300 dump_instructions(NULL);
3301 }
3302
3303 void
3304 fs_visitor::dump_instructions(const char *name)
3305 {
3306 FILE *file = stderr;
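/* Only open the named file when not running as root, presumably to avoid
 * writing arbitrary files with elevated privileges; otherwise fall back to
 * stderr.
 */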
3307 if (name && geteuid() != 0) {
3308 file = fopen(name, "w");
3309 if (!file)
3310 file = stderr;
3311 }
3312
3313 if (cfg) {
3314 calculate_register_pressure();
3315 int ip = 0, max_pressure = 0;
3316 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3317 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3318 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3319 dump_instruction(inst, file);
3320 ip++;
3321 }
3322 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3323 } else {
3324 int ip = 0;
3325 foreach_in_list(backend_instruction, inst, &instructions) {
3326 fprintf(file, "%4d: ", ip++);
3327 dump_instruction(inst, file);
3328 }
3329 }
3330
3331 if (file != stderr) {
3332 fclose(file);
3333 }
3334 }
3335
3336 void
3337 fs_visitor::dump_instruction(backend_instruction *be_inst)
3338 {
3339 dump_instruction(be_inst, stderr);
3340 }
3341
3342 void
3343 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3344 {
3345 fs_inst *inst = (fs_inst *)be_inst;
3346
3347 if (inst->predicate) {
3348 fprintf(file, "(%cf0.%d) ",
3349 inst->predicate_inverse ? '-' : '+',
3350 inst->flag_subreg);
3351 }
3352
3353 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3354 if (inst->saturate)
3355 fprintf(file, ".sat");
3356 if (inst->conditional_mod) {
3357 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3358 if (!inst->predicate &&
3359 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3360 inst->opcode != BRW_OPCODE_IF &&
3361 inst->opcode != BRW_OPCODE_WHILE))) {
3362 fprintf(file, ".f0.%d", inst->flag_subreg);
3363 }
3364 }
3365 fprintf(file, "(%d) ", inst->exec_size);
3366
3367
3368 switch (inst->dst.file) {
3369 case GRF:
3370 fprintf(file, "vgrf%d", inst->dst.reg);
3371 if (inst->dst.width != dispatch_width)
3372 fprintf(file, "@%d", inst->dst.width);
3373 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3374 inst->dst.subreg_offset)
3375 fprintf(file, "+%d.%d",
3376 inst->dst.reg_offset, inst->dst.subreg_offset);
3377 break;
3378 case MRF:
3379 fprintf(file, "m%d", inst->dst.reg);
3380 break;
3381 case BAD_FILE:
3382 fprintf(file, "(null)");
3383 break;
3384 case UNIFORM:
3385 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3386 break;
3387 case ATTR:
3388 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3389 break;
3390 case HW_REG:
3391 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3392 switch (inst->dst.fixed_hw_reg.nr) {
3393 case BRW_ARF_NULL:
3394 fprintf(file, "null");
3395 break;
3396 case BRW_ARF_ADDRESS:
3397 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3398 break;
3399 case BRW_ARF_ACCUMULATOR:
3400 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3401 break;
3402 case BRW_ARF_FLAG:
3403 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3404 inst->dst.fixed_hw_reg.subnr);
3405 break;
3406 default:
3407 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3408 inst->dst.fixed_hw_reg.subnr);
3409 break;
3410 }
3411 } else {
3412 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3413 }
3414 if (inst->dst.fixed_hw_reg.subnr)
3415 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3416 break;
3417 default:
3418 fprintf(file, "???");
3419 break;
3420 }
3421 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3422
3423 for (int i = 0; i < inst->sources; i++) {
3424 if (inst->src[i].negate)
3425 fprintf(file, "-");
3426 if (inst->src[i].abs)
3427 fprintf(file, "|");
3428 switch (inst->src[i].file) {
3429 case GRF:
3430 fprintf(file, "vgrf%d", inst->src[i].reg);
3431 if (inst->src[i].width != dispatch_width)
3432 fprintf(file, "@%d", inst->src[i].width);
3433 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3434 inst->src[i].subreg_offset)
3435 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3436 inst->src[i].subreg_offset);
3437 break;
3438 case MRF:
3439 fprintf(file, "***m%d***", inst->src[i].reg);
3440 break;
3441 case ATTR:
3442 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3443 break;
3444 case UNIFORM:
3445 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3446 if (inst->src[i].reladdr) {
3447 fprintf(file, "+reladdr");
3448 } else if (inst->src[i].subreg_offset) {
3449 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3450 inst->src[i].subreg_offset);
3451 }
3452 break;
3453 case BAD_FILE:
3454 fprintf(file, "(null)");
3455 break;
3456 case IMM:
3457 switch (inst->src[i].type) {
3458 case BRW_REGISTER_TYPE_F:
3459 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3460 break;
3461 case BRW_REGISTER_TYPE_W:
3462 case BRW_REGISTER_TYPE_D:
3463 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3464 break;
3465 case BRW_REGISTER_TYPE_UW:
3466 case BRW_REGISTER_TYPE_UD:
3467 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3468 break;
3469 case BRW_REGISTER_TYPE_VF:
3470 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3471 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3472 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3473 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3474 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3475 break;
3476 default:
3477 fprintf(file, "???");
3478 break;
3479 }
3480 break;
3481 case HW_REG:
3482 if (inst->src[i].fixed_hw_reg.negate)
3483 fprintf(file, "-");
3484 if (inst->src[i].fixed_hw_reg.abs)
3485 fprintf(file, "|");
3486 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3487 switch (inst->src[i].fixed_hw_reg.nr) {
3488 case BRW_ARF_NULL:
3489 fprintf(file, "null");
3490 break;
3491 case BRW_ARF_ADDRESS:
3492 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3493 break;
3494 case BRW_ARF_ACCUMULATOR:
3495 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3496 break;
3497 case BRW_ARF_FLAG:
3498 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3499 inst->src[i].fixed_hw_reg.subnr);
3500 break;
3501 default:
3502 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3503 inst->src[i].fixed_hw_reg.subnr);
3504 break;
3505 }
3506 } else {
3507 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3508 }
3509 if (inst->src[i].fixed_hw_reg.subnr)
3510 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3511 if (inst->src[i].fixed_hw_reg.abs)
3512 fprintf(file, "|");
3513 break;
3514 default:
3515 fprintf(file, "???");
3516 break;
3517 }
3518 if (inst->src[i].abs)
3519 fprintf(file, "|");
3520
3521 if (inst->src[i].file != IMM) {
3522 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3523 }
3524
3525 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3526 fprintf(file, ", ");
3527 }
3528
3529 fprintf(file, " ");
3530
3531 if (dispatch_width == 16 && inst->exec_size == 8) {
3532 if (inst->force_sechalf)
3533 fprintf(file, "2ndhalf ");
3534 else
3535 fprintf(file, "1sthalf ");
3536 }
3537
3538 fprintf(file, "\n");
3539 }
3540
3541 /**
3542 * Possibly returns an instruction that set up @param reg.
3543 *
3544 * Sometimes we want to take the result of some expression/variable
3545 * dereference tree and rewrite the instruction generating the result
3546 * of the tree. When processing the tree, we know that the
3547 * instructions generated are all writing temporaries that are dead
3548 * outside of this tree. So, if we have some instructions that write
3549 * a temporary, we're free to point that temp write somewhere else.
3550 *
3551 * Note that the returned instruction isn't guaranteed to have written only
3552 * reg -- it might be the size=4 destination of a texture instruction.
3553 */
3554 fs_inst *
3555 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3556 fs_inst *end,
3557 const fs_reg &reg)
3558 {
3559 if (end == start ||
3560 end->is_partial_write() ||
3561 reg.reladdr ||
3562 !reg.equals(end->dst)) {
3563 return NULL;
3564 } else {
3565 return end;
3566 }
3567 }
3568
3569 void
3570 fs_visitor::setup_payload_gen6()
3571 {
3572 bool uses_depth =
3573 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3574 unsigned barycentric_interp_modes =
3575 (stage == MESA_SHADER_FRAGMENT) ?
3576 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3577
3578 assert(brw->gen >= 6);
3579
3580 /* R0-1: masks, pixel X/Y coordinates. */
3581 payload.num_regs = 2;
3582 /* R2: only for 32-pixel dispatch. */
3583
3584 /* R3-26: barycentric interpolation coordinates. These appear in the
3585 * same order that they appear in the brw_wm_barycentric_interp_mode
3586 * enum. Each set of coordinates occupies 2 registers if dispatch width
3587 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3588 * appear if they were enabled using the "Barycentric Interpolation
3589 * Mode" bits in WM_STATE.
3590 */
3591 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3592 if (barycentric_interp_modes & (1 << i)) {
3593 payload.barycentric_coord_reg[i] = payload.num_regs;
3594 payload.num_regs += 2;
3595 if (dispatch_width == 16) {
3596 payload.num_regs += 2;
3597 }
3598 }
3599 }
3600
3601 /* R27: interpolated depth if uses source depth */
3602 if (uses_depth) {
3603 payload.source_depth_reg = payload.num_regs;
3604 payload.num_regs++;
3605 if (dispatch_width == 16) {
3606 /* R28: interpolated depth if not SIMD8. */
3607 payload.num_regs++;
3608 }
3609 }
3610 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3611 if (uses_depth) {
3612 payload.source_w_reg = payload.num_regs;
3613 payload.num_regs++;
3614 if (dispatch_width == 16) {
3615 /* R30: interpolated W if not SIMD8. */
3616 payload.num_regs++;
3617 }
3618 }
3619
3620 if (stage == MESA_SHADER_FRAGMENT) {
3621 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3622 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3623 prog_data->uses_pos_offset = key->compute_pos_offset;
3624 /* R31: MSAA position offsets. */
3625 if (prog_data->uses_pos_offset) {
3626 payload.sample_pos_reg = payload.num_regs;
3627 payload.num_regs++;
3628 }
3629 }
3630
3631 /* R32: MSAA input coverage mask */
3632 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3633 assert(brw->gen >= 7);
3634 payload.sample_mask_in_reg = payload.num_regs;
3635 payload.num_regs++;
3636 if (dispatch_width == 16) {
3637 /* R33: input coverage mask if not SIMD8. */
3638 payload.num_regs++;
3639 }
3640 }
3641
3642 /* R34-: bary for 32-pixel. */
3643 /* R58-59: interp W for 32-pixel. */
3644
3645 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3646 source_depth_to_render_target = true;
3647 }
3648 }
3649
3650 void
3651 fs_visitor::setup_vs_payload()
3652 {
3653 /* R0: thread header, R1: urb handles */
3654 payload.num_regs = 2;
3655 }
3656
3657 void
3658 fs_visitor::assign_binding_table_offsets()
3659 {
3660 assert(stage == MESA_SHADER_FRAGMENT);
3661 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3662 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3663 uint32_t next_binding_table_offset = 0;
3664
3665 /* If there are no color regions, we still perform an FB write to a null
3666 * renderbuffer, which we place at surface index 0.
3667 */
3668 prog_data->binding_table.render_target_start = next_binding_table_offset;
3669 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3670
3671 assign_common_binding_table_offsets(next_binding_table_offset);
3672 }
3673
3674 void
3675 fs_visitor::calculate_register_pressure()
3676 {
3677 invalidate_live_intervals();
3678 calculate_live_intervals();
3679
3680 unsigned num_instructions = 0;
3681 foreach_block(block, cfg)
3682 num_instructions += block->instructions.length();
3683
3684 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3685
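/* Each VGRF contributes its allocated size to the pressure at every IP in
 * its live interval.
 */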
3686 for (unsigned reg = 0; reg < alloc.count; reg++) {
3687 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3688 regs_live_at_ip[ip] += alloc.sizes[reg];
3689 }
3690 }
3691
3692 void
3693 fs_visitor::optimize()
3694 {
3695 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3696
3697 split_virtual_grfs();
3698
3699 move_uniform_array_access_to_pull_constants();
3700 assign_constant_locations();
3701 demote_pull_constants();
3702
3703 #define OPT(pass, args...) ({ \
3704 pass_num++; \
3705 bool this_progress = pass(args); \
3706 \
3707 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3708 char filename[64]; \
3709 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3710 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3711 \
3712 backend_visitor::dump_instructions(filename); \
3713 } \
3714 \
3715 progress = progress || this_progress; \
3716 this_progress; \
3717 })
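/* The OPT() macro evaluates to whether the pass reported progress, and under
 * INTEL_DEBUG=optimizer it dumps the instruction list after every pass that
 * changed something.  With the format string above, a dump name looks
 * roughly like "fs16-0004-01-03-opt_copy_propagate" (stage plus dispatch
 * width, program name, iteration, pass number, pass name); the numbers here
 * are illustrative, not from a real run.
 */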
3718
3719 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3720 char filename[64];
3721 snprintf(filename, 64, "%s%d-%04d-00-start",
3722 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3723
3724 backend_visitor::dump_instructions(filename);
3725 }
3726
3727 bool progress;
3728 int iteration = 0;
3729 int pass_num = 0;
3730 do {
3731 progress = false;
3732 pass_num = 0;
3733 iteration++;
3734
3735 OPT(remove_duplicate_mrf_writes);
3736
3737 OPT(opt_algebraic);
3738 OPT(opt_cse);
3739 OPT(opt_copy_propagate);
3740 OPT(opt_peephole_predicated_break);
3741 OPT(opt_cmod_propagation);
3742 OPT(dead_code_eliminate);
3743 OPT(opt_peephole_sel);
3744 OPT(dead_control_flow_eliminate, this);
3745 OPT(opt_register_renaming);
3746 OPT(opt_redundant_discard_jumps);
3747 OPT(opt_saturate_propagation);
3748 OPT(register_coalesce);
3749 OPT(compute_to_mrf);
3750
3751 OPT(compact_virtual_grfs);
3752 } while (progress);
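/* The loop above runs to a fixed point: any pass that reports progress can
 * expose new opportunities for the others, so the whole list is repeated
 * until an iteration finishes with no changes.
 */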
3753
3754 pass_num = 0;
3755
3756 if (OPT(lower_load_payload)) {
3757 split_virtual_grfs();
3758 OPT(register_coalesce);
3759 OPT(compute_to_mrf);
3760 OPT(dead_code_eliminate);
3761 }
3762
3763 OPT(opt_combine_constants);
3764
3765 lower_uniform_pull_constant_loads();
3766 }
3767
3768 /**
3769 * A three-source instruction must have a GRF/MRF destination register;
3770 * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
3771 */
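/* A sizing note (assuming the usual 32-bit destination types): the
 * alloc.allocate(dispatch_width / 8) below hands out 1 GRF for SIMD8 and
 * 2 GRFs for SIMD16, i.e. one full-width temporary value.
 */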
3772 void
3773 fs_visitor::fixup_3src_null_dest()
3774 {
3775 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3776 if (inst->is_3src() && inst->dst.is_null()) {
3777 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3778 inst->dst.type);
3779 }
3780 }
3781 }
3782
3783 void
3784 fs_visitor::allocate_registers()
3785 {
3786 bool allocated_without_spills;
3787
3788 static const enum instruction_scheduler_mode pre_modes[] = {
3789 SCHEDULE_PRE,
3790 SCHEDULE_PRE_NON_LIFO,
3791 SCHEDULE_PRE_LIFO,
3792 };
3793
3794 /* Try each scheduling heuristic to see if it can successfully register
3795 * allocate without spilling. They should be ordered by decreasing
3796 * performance but increasing likelihood of allocating.
3797 */
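/* Loosely characterized: SCHEDULE_PRE aims mainly at hiding latency, while
 * the NON_LIFO and LIFO variants progressively trade that away to shorten
 * live ranges and lower register pressure.  See the scheduler itself for
 * the exact policies.
 */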
3798 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3799 schedule_instructions(pre_modes[i]);
3800
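/* Debug escape hatch: changing this 0 to 1 uses the trivial sequential
 * allocator instead of the real graph-coloring allocator in assign_regs().
 */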
3801 if (0) {
3802 assign_regs_trivial();
3803 allocated_without_spills = true;
3804 } else {
3805 allocated_without_spills = assign_regs(false);
3806 }
3807 if (allocated_without_spills)
3808 break;
3809 }
3810
3811 if (!allocated_without_spills) {
3812 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3813 "Vertex" : "Fragment";
3814
3815 /* We assume that any spilling is worse than just dropping back to
3816 * SIMD8.  There is probably some intermediate point where SIMD16 with
3817 * a couple of spills would still be a win, but we don't look for it.
3818 */
3819 if (dispatch_width == 16) {
3820 fail("Failure to register allocate. Reduce number of "
3821 "live scalar values to avoid this.");
3822 } else {
3823 perf_debug("%s shader triggered register spilling. "
3824 "Try reducing the number of live scalar values to "
3825 "improve performance.\n", stage_name);
3826 }
3827
3828 /* Since we're out of heuristics, just go spill registers until we
3829 * get an allocation.
3830 */
3831 while (!assign_regs(true)) {
3832 if (failed)
3833 break;
3834 }
3835 }
3836
3837 /* This must come after all optimization and register allocation, since
3838 * it inserts dead code that happens to have side effects, and it does
3839 * so based on the actual physical registers in use.
3840 */
3841 insert_gen4_send_dependency_workarounds();
3842
3843 if (failed)
3844 return;
3845
3846 if (!allocated_without_spills)
3847 schedule_instructions(SCHEDULE_POST);
3848
3849 if (last_scratch > 0)
3850 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3851 }
3852
3853 bool
3854 fs_visitor::run_vs()
3855 {
3856 assert(stage == MESA_SHADER_VERTEX);
3857
3858 assign_common_binding_table_offsets(0);
3859 setup_vs_payload();
3860
3861 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3862 emit_shader_time_begin();
3863
3864 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
3865 emit_nir_code();
3866 } else {
3867 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3868 base_ir = ir;
3869 this->result = reg_undef;
3870 ir->accept(this);
3871 }
3872 base_ir = NULL;
3873 }
3874
3875 if (failed)
3876 return false;
3877
3878 emit_urb_writes();
3879
3880 calculate_cfg();
3881
3882 optimize();
3883
3884 assign_curb_setup();
3885 assign_vs_urb_setup();
3886
3887 fixup_3src_null_dest();
3888 allocate_registers();
3889
3890 return !failed;
3891 }
3892
3893 bool
3894 fs_visitor::run_fs()
3895 {
3896 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3897 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3898
3899 assert(stage == MESA_SHADER_FRAGMENT);
3900
3901 sanity_param_count = prog->Parameters->NumParameters;
3902
3903 assign_binding_table_offsets();
3904
3905 if (brw->gen >= 6)
3906 setup_payload_gen6();
3907 else
3908 setup_payload_gen4();
3909
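/* Debug escape hatch: changing this 0 to 1 replaces the real shader with the
 * dummy fragment shader (a fixed color), which is handy for isolating
 * compiler bugs.
 */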
3910 if (0) {
3911 emit_dummy_fs();
3912 } else if (brw->use_rep_send && dispatch_width == 16) {
3913 emit_repclear_shader();
3914 } else {
3915 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3916 emit_shader_time_begin();
3917
3918 calculate_urb_setup();
3919 if (prog->InputsRead > 0) {
3920 if (brw->gen < 6)
3921 emit_interpolation_setup_gen4();
3922 else
3923 emit_interpolation_setup_gen6();
3924 }
3925
3926 /* We handle discards by keeping track of the still-live pixels in f0.1.
3927 * Initialize it with the dispatched pixels.
3928 */
3929 if (wm_prog_data->uses_kill) {
3930 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3931 discard_init->flag_subreg = 1;
3932 }
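/* A rough sketch of how the rest of the discard machinery uses this (see the
 * discard handling in the visitor and generator for the details): each
 * discard clears the killed channels in f0.1, and once all channels in a
 * group are dead a discard jump can skip ahead to the
 * FS_OPCODE_PLACEHOLDER_HALT emitted below, just before the framebuffer
 * writes.
 */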
3933
3934 /* Generate FS IR for main().  (The visitor only descends into
3935 * functions called "main".)
3936 */
3937 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
3938 emit_nir_code();
3939 } else if (shader) {
3940 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3941 base_ir = ir;
3942 this->result = reg_undef;
3943 ir->accept(this);
3944 }
3945 } else {
3946 emit_fragment_program_code();
3947 }
3948 base_ir = NULL;
3949 if (failed)
3950 return false;
3951
3952 emit(FS_OPCODE_PLACEHOLDER_HALT);
3953
3954 if (wm_key->alpha_test_func)
3955 emit_alpha_test();
3956
3957 emit_fb_writes();
3958
3959 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3960 emit_shader_time_end();
3961
3962 calculate_cfg();
3963
3964 optimize();
3965
3966 assign_curb_setup();
3967 assign_urb_setup();
3968
3969 fixup_3src_null_dest();
3970 allocate_registers();
3971
3972 if (failed)
3973 return false;
3974 }
3975
3976 if (dispatch_width == 8)
3977 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3978 else
3979 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
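/* reg_blocks / reg_blocks_16 record the program's GRF usage in the
 * block-count form the fixed-function dispatch state expects; the exact
 * encoding lives in brw_register_blocks(), not here.
 */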
3980
3981 /* If any state parameters were appended, then ParameterValues could have
3982 * been realloced, in which case the driver uniform storage set up by
3983 * _mesa_associate_uniform_storage() would point to freed memory. Make
3984 * sure that didn't happen.
3985 */
3986 assert(sanity_param_count == prog->Parameters->NumParameters);
3987
3988 return !failed;
3989 }
3990
3991 const unsigned *
3992 brw_wm_fs_emit(struct brw_context *brw,
3993 void *mem_ctx,
3994 const struct brw_wm_prog_key *key,
3995 struct brw_wm_prog_data *prog_data,
3996 struct gl_fragment_program *fp,
3997 struct gl_shader_program *prog,
3998 unsigned *final_assembly_size)
3999 {
4000 bool start_busy = false;
4001 double start_time = 0;
4002
4003 if (unlikely(brw->perf_debug)) {
4004 start_busy = (brw->batch.last_bo &&
4005 drm_intel_bo_busy(brw->batch.last_bo));
4006 start_time = get_time();
4007 }
4008
4009 struct brw_shader *shader = NULL;
4010 if (prog)
4011 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4012
4013 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4014 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4015
4016 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4017 */
4018 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4019 if (!v.run_fs()) {
4020 if (prog) {
4021 prog->LinkStatus = false;
4022 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4023 }
4024
4025 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4026 v.fail_msg);
4027
4028 return NULL;
4029 }
4030
4031 cfg_t *simd16_cfg = NULL;
4032 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4033 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4034 if (!v.simd16_unsupported) {
4035 /* Try a SIMD16 compile */
4036 v2.import_uniforms(&v);
4037 if (!v2.run_fs()) {
4038 perf_debug("SIMD16 shader failed to compile, falling back to "
4039 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4040 } else {
4041 simd16_cfg = v2.cfg;
4042 }
4043 } else {
4044 perf_debug("SIMD16 shader unsupported, falling back to "
4045 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4046 }
4047 }
4048
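/* Decide which programs to hand to the generator.  When SIMD8 is disabled
 * (INTEL_DEBUG=no8 or the per-context no_simd8 workaround), or on Gen4
 * hardware before Ironlake (gen < 5), prefer the SIMD16 program exclusively
 * if one was built; prog_data->no_8 then tells state upload not to expect a
 * SIMD8 kernel.
 */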
4049 cfg_t *simd8_cfg;
4050 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4051 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4052 simd8_cfg = NULL;
4053 prog_data->no_8 = true;
4054 } else {
4055 simd8_cfg = v.cfg;
4056 prog_data->no_8 = false;
4057 }
4058
4059 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4060 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4061
4062 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4063 char *name;
4064 if (prog)
4065 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4066 prog->Label ? prog->Label : "unnamed",
4067 prog->Name);
4068 else
4069 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4070
4071 g.enable_debug(name);
4072 }
4073
4074 if (simd8_cfg)
4075 g.generate_code(simd8_cfg, 8);
4076 if (simd16_cfg)
4077 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4078
4079 if (unlikely(brw->perf_debug) && shader) {
4080 if (shader->compiled_once)
4081 brw_wm_debug_recompile(brw, prog, key);
4082 shader->compiled_once = true;
4083
4084 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4085 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4086 (get_time() - start_time) * 1000);
4087 }
4088 }
4089
4090 return g.get_assembly(final_assembly_size);
4091 }
4092
4093 extern "C" bool
4094 brw_fs_precompile(struct gl_context *ctx,
4095 struct gl_shader_program *shader_prog,
4096 struct gl_program *prog)
4097 {
4098 struct brw_context *brw = brw_context(ctx);
4099 struct brw_wm_prog_key key;
4100
4101 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4102 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4103 bool program_uses_dfdy = fp->UsesDFdy;
4104
4105 memset(&key, 0, sizeof(key));
4106
4107 if (brw->gen < 6) {
4108 if (fp->UsesKill)
4109 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4110
4111 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4112 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4113
4114 /* Just assume depth testing. */
4115 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4116 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4117 }
4118
4119 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4120 BRW_FS_VARYING_INPUT_MASK) > 16)
4121 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4122
4123 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4124 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4125 for (unsigned i = 0; i < sampler_count; i++) {
4126 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4127 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4128 key.tex.swizzles[i] =
4129 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4130 } else {
4131 /* Color sampler: assume no swizzling. */
4132 key.tex.swizzles[i] = SWIZZLE_XYZW;
4133 }
4134 }
4135
4136 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4137 key.drawable_height = ctx->DrawBuffer->Height;
4138 }
4139
4140 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4141 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4142 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4143
4144 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4145 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4146 key.nr_color_regions > 1;
4147 }
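/* render_to_fbo here is a guess, like the rest of this precompile key: it
 * feeds the Y-orientation handling for gl_FragCoord and dFdy, which differs
 * between window-system framebuffers and user FBOs.
 */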
4148
4149 key.program_string_id = bfp->id;
4150
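/* brw_compile_wm_prog() stores the freshly compiled program's offset and
 * prog_data in brw->wm; since this is only a link-time precompile, save and
 * restore the values the currently bound program depends on.
 */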
4151 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4152 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4153
4154 bool success = brw_compile_wm_prog(brw, shader_prog, bfp, &key);
4155
4156 brw->wm.base.prog_offset = old_prog_offset;
4157 brw->wm.prog_data = old_prog_data;
4158
4159 return success;
4160 }