i965/fs: Properly calculate the number of instructions in calculate_register_pressure
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_cfg.h"
50 #include "brw_dead_control_flow.h"
51 #include "main/uniforms.h"
52 #include "brw_fs_live_variables.h"
53 #include "glsl/glsl_types.h"
54
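/**
 * Common constructor guts: record the opcode, destination, sources and
 * SIMD width, infer exec_size from the register widths when it was passed
 * as 0, compute each source's effective width, and derive regs_written
 * from the destination region.
 */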
55 void
56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
57 fs_reg *src, int sources)
58 {
59 memset(this, 0, sizeof(*this));
60
61 this->opcode = opcode;
62 this->dst = dst;
63 this->src = src;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (int i = 0; i < sources; ++i) {
79 if (src[i].file != GRF)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (int i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 assert(this->src[i].width > 0);
101 if (this->src[i].width == 1) {
102 this->src[i].effective_width = this->exec_size;
103 } else {
104 this->src[i].effective_width = this->src[i].width;
105 }
106 break;
107 case IMM:
108 case UNIFORM:
109 this->src[i].effective_width = this->exec_size;
110 break;
111 default:
112 unreachable("Invalid source register file");
113 }
114 }
115 this->dst.effective_width = this->exec_size;
116
117 this->conditional_mod = BRW_CONDITIONAL_NONE;
118
119 /* This will be the case for almost all instructions. */
120 switch (dst.file) {
121 case GRF:
122 case HW_REG:
123 case MRF:
124 this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
125 break;
126 case BAD_FILE:
127 this->regs_written = 0;
128 break;
129 case IMM:
130 case UNIFORM:
131 unreachable("Invalid destination register file");
132 default:
133 unreachable("Invalid register file");
134 }
135
136 this->writes_accumulator = false;
137 }
138
139 fs_inst::fs_inst()
140 {
141 fs_reg *src = ralloc_array(this, fs_reg, 3);
142 init(BRW_OPCODE_NOP, 8, dst, src, 0);
143 }
144
145 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
146 {
147 fs_reg *src = ralloc_array(this, fs_reg, 3);
148 init(opcode, exec_size, reg_undef, src, 0);
149 }
150
151 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
152 {
153 fs_reg *src = ralloc_array(this, fs_reg, 3);
154 init(opcode, 0, dst, src, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 fs_reg *src = ralloc_array(this, fs_reg, 3);
161 src[0] = src0;
162 init(opcode, exec_size, dst, src, 1);
163 }
164
165 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
166 {
167 fs_reg *src = ralloc_array(this, fs_reg, 3);
168 src[0] = src0;
169 init(opcode, 0, dst, src, 1);
170 }
171
172 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
173 const fs_reg &src0, const fs_reg &src1)
174 {
175 fs_reg *src = ralloc_array(this, fs_reg, 3);
176 src[0] = src0;
177 src[1] = src1;
178 init(opcode, exec_size, dst, src, 2);
179 }
180
181 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
182 const fs_reg &src1)
183 {
184 fs_reg *src = ralloc_array(this, fs_reg, 3);
185 src[0] = src0;
186 src[1] = src1;
187 init(opcode, 0, dst, src, 2);
188 }
189
190 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
191 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
192 {
193 fs_reg *src = ralloc_array(this, fs_reg, 3);
194 src[0] = src0;
195 src[1] = src1;
196 src[2] = src2;
197 init(opcode, exec_size, dst, src, 3);
198 }
199
200 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
201 const fs_reg &src1, const fs_reg &src2)
202 {
203 fs_reg *src = ralloc_array(this, fs_reg, 3);
204 src[0] = src0;
205 src[1] = src1;
206 src[2] = src2;
207 init(opcode, 0, dst, src, 3);
208 }
209
210 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
211 {
212 init(opcode, 0, dst, src, sources);
213 }
214
215 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
216 fs_reg src[], int sources)
217 {
218 init(opcode, exec_width, dst, src, sources);
219 }
220
221 fs_inst::fs_inst(const fs_inst &that)
222 {
223 memcpy(this, &that, sizeof(that));
224
225 this->src = ralloc_array(this, fs_reg, that.sources);
226
227 for (int i = 0; i < that.sources; i++)
228 this->src[i] = that.src[i];
229 }
230
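/**
 * Reallocate the source array so the instruction holds exactly num_sources
 * sources.
 */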
231 void
232 fs_inst::resize_sources(uint8_t num_sources)
233 {
234 if (this->sources != num_sources) {
235 this->src = reralloc(this, this->src, fs_reg, num_sources);
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(brw->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 * gen5 does the comparison on the execution type (resolved source types),
341 * so dst type doesn't matter. gen6 does comparison and then uses the
342 * result as if it was the dst type with no conversion, which happens to
343 * mostly work out for float-interpreted-as-int since our comparisons are
344 * for >0, =0, <0.
345 */
346 if (brw->gen == 4) {
347 dst.type = src0.type;
348 if (dst.file == HW_REG)
349 dst.fixed_hw_reg.type = dst.type;
350 }
351
352 resolve_ud_negate(&src0);
353 resolve_ud_negate(&src1);
354
355 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
356 inst->conditional_mod = condition;
357
358 return inst;
359 }
360
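/**
 * Build a SHADER_OPCODE_LOAD_PAYLOAD whose execution size is the widest of
 * the destination and source widths, with regs_written covering the whole
 * registers copied from each source.
 */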
361 fs_inst *
362 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
363 {
364 uint8_t exec_size = dst.width;
365 for (int i = 0; i < sources; ++i) {
366 assert(src[i].width % dst.width == 0);
367 if (src[i].width > exec_size)
368 exec_size = src[i].width;
369 }
370
371 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
372 dst, src, sources);
373 inst->regs_written = 0;
374 for (int i = 0; i < sources; ++i) {
375 /* The LOAD_PAYLOAD instruction only really makes sense if we are
376 * dealing with whole registers. If this ever changes, we can deal
377 * with it later.
378 */
379 int size = src[i].effective_width * type_sz(src[i].type);
380 assert(size % 32 == 0);
381 inst->regs_written += (size + 31) / 32;
382 }
383
384 return inst;
385 }
386
387 exec_list
388 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
389 const fs_reg &surf_index,
390 const fs_reg &varying_offset,
391 uint32_t const_offset)
392 {
393 exec_list instructions;
394 fs_inst *inst;
395
396 /* We have our constant surface use a pitch of 4 bytes, so our index can
397 * be any component of a vector, and then we load 4 contiguous
398 * components starting from that.
399 *
400 * We break down the const_offset to a portion added to the variable
401 * offset and a portion done using reg_offset, which means that if you
402 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
403 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
404 * CSE can later notice that those loads are all the same and eliminate
405 * the redundant ones.
406 */
407 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
408 instructions.push_tail(ADD(vec4_offset,
409 varying_offset, fs_reg(const_offset & ~3)));
410
411 int scale = 1;
412 if (brw->gen == 4 && dst.width == 8) {
413 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
414 * u, v, r) as parameters, or we can just use the SIMD16 message
415 * consisting of (header, u). We choose the second, at the cost of a
416 * longer return length.
417 */
418 scale = 2;
419 }
420
421 enum opcode op;
422 if (brw->gen >= 7)
423 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
424 else
425 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
426
427 assert(dst.width % 8 == 0);
428 int regs_written = 4 * (dst.width / 8) * scale;
429 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
430 dst.type, dst.width);
431 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
432 inst->regs_written = regs_written;
433 instructions.push_tail(inst);
434
435 if (brw->gen < 7) {
436 inst->base_mrf = 13;
437 inst->header_present = true;
438 if (brw->gen == 4)
439 inst->mlen = 3;
440 else
441 inst->mlen = 1 + dispatch_width / 8;
442 }
443
444 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
445 instructions.push_tail(MOV(dst, result));
446
447 return instructions;
448 }
449
450 /**
451 * A helper for MOV generation for fixing up broken hardware SEND dependency
452 * handling.
453 */
454 fs_inst *
455 fs_visitor::DEP_RESOLVE_MOV(int grf)
456 {
457 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
458
459 inst->ir = NULL;
460 inst->annotation = "send dependency resolve";
461
462 /* The caller always wants uncompressed to emit the minimal extra
463 * dependencies, and to avoid having to deal with aligning its regs to 2.
464 */
465 inst->exec_size = 8;
466
467 return inst;
468 }
469
470 bool
471 fs_inst::equals(fs_inst *inst) const
472 {
473 return (opcode == inst->opcode &&
474 dst.equals(inst->dst) &&
475 src[0].equals(inst->src[0]) &&
476 src[1].equals(inst->src[1]) &&
477 src[2].equals(inst->src[2]) &&
478 saturate == inst->saturate &&
479 predicate == inst->predicate &&
480 conditional_mod == inst->conditional_mod &&
481 mlen == inst->mlen &&
482 base_mrf == inst->base_mrf &&
483 target == inst->target &&
484 eot == inst->eot &&
485 header_present == inst->header_present &&
486 shadow_compare == inst->shadow_compare &&
487 exec_size == inst->exec_size &&
488 offset == inst->offset);
489 }
490
491 bool
492 fs_inst::overwrites_reg(const fs_reg &reg) const
493 {
494 return (reg.file == dst.file &&
495 reg.reg == dst.reg &&
496 reg.reg_offset >= dst.reg_offset &&
497 reg.reg_offset < dst.reg_offset + regs_written);
498 }
499
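/**
 * Returns whether this instruction sends its message payload from the GRF
 * rather than through MRFs.
 */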
500 bool
501 fs_inst::is_send_from_grf() const
502 {
503 switch (opcode) {
504 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
505 case SHADER_OPCODE_SHADER_TIME_ADD:
506 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
507 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
508 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
509 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
510 case SHADER_OPCODE_UNTYPED_ATOMIC:
511 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
512 return true;
513 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
514 return src[1].file == GRF;
515 case FS_OPCODE_FB_WRITE:
516 return src[0].file == GRF;
517 default:
518 if (is_tex())
519 return src[0].file == GRF;
520
521 return false;
522 }
523 }
524
525 bool
526 fs_inst::can_do_source_mods(struct brw_context *brw)
527 {
528 if (brw->gen == 6 && is_math())
529 return false;
530
531 if (is_send_from_grf())
532 return false;
533
534 if (!backend_instruction::can_do_source_mods())
535 return false;
536
537 return true;
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Fixed brw_reg. */
585 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
586 {
587 init();
588 this->file = HW_REG;
589 this->fixed_hw_reg = fixed_hw_reg;
590 this->type = fixed_hw_reg.type;
591 this->width = 1 << fixed_hw_reg.width;
592 }
593
594 bool
595 fs_reg::equals(const fs_reg &r) const
596 {
597 return (file == r.file &&
598 reg == r.reg &&
599 reg_offset == r.reg_offset &&
600 subreg_offset == r.subreg_offset &&
601 type == r.type &&
602 negate == r.negate &&
603 abs == r.abs &&
604 !reladdr && !r.reladdr &&
605 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
606 width == r.width &&
607 stride == r.stride);
608 }
609
610 fs_reg &
611 fs_reg::apply_stride(unsigned stride)
612 {
613 assert((this->stride * stride) <= 4 &&
614 (is_power_of_two(stride) || stride == 0) &&
615 file != HW_REG && file != IMM);
616 this->stride *= stride;
617 return *this;
618 }
619
620 fs_reg &
621 fs_reg::set_smear(unsigned subreg)
622 {
623 assert(file != HW_REG && file != IMM);
624 subreg_offset = subreg * type_sz(type);
625 stride = 0;
626 return *this;
627 }
628
629 bool
630 fs_reg::is_contiguous() const
631 {
632 return stride == 1;
633 }
634
635 bool
636 fs_reg::is_valid_3src() const
637 {
638 return file == GRF || file == UNIFORM;
639 }
640
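/**
 * Returns how many components of storage a GLSL type requires, recursing
 * through arrays and structures; samplers and atomic counters take up no
 * register space.
 */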
641 int
642 fs_visitor::type_size(const struct glsl_type *type)
643 {
644 unsigned int size, i;
645
646 switch (type->base_type) {
647 case GLSL_TYPE_UINT:
648 case GLSL_TYPE_INT:
649 case GLSL_TYPE_FLOAT:
650 case GLSL_TYPE_BOOL:
651 return type->components();
652 case GLSL_TYPE_ARRAY:
653 return type_size(type->fields.array) * type->length;
654 case GLSL_TYPE_STRUCT:
655 size = 0;
656 for (i = 0; i < type->length; i++) {
657 size += type_size(type->fields.structure[i].type);
658 }
659 return size;
660 case GLSL_TYPE_SAMPLER:
661 /* Samplers take up no register space, since they're baked in at
662 * link time.
663 */
664 return 0;
665 case GLSL_TYPE_ATOMIC_UINT:
666 return 0;
667 case GLSL_TYPE_IMAGE:
668 case GLSL_TYPE_VOID:
669 case GLSL_TYPE_ERROR:
670 case GLSL_TYPE_INTERFACE:
671 unreachable("not reached");
672 }
673
674 return 0;
675 }
676
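/**
 * Read the low dword of the ARF timestamp register into a new virtual GRF
 * (the MOV is forced writemask-all so it executes even for channels that
 * are disabled in the dispatch).
 */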
677 fs_reg
678 fs_visitor::get_timestamp()
679 {
680 assert(brw->gen >= 7);
681
682 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
683 BRW_ARF_TIMESTAMP,
684 0),
685 BRW_REGISTER_TYPE_UD));
686
687 fs_reg dst = fs_reg(this, glsl_type::uint_type);
688
689 fs_inst *mov = emit(MOV(dst, ts));
690 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
691 * even if it's not enabled in the dispatch.
692 */
693 mov->force_writemask_all = true;
694 mov->exec_size = 8;
695
696 /* The caller wants the low 32 bits of the timestamp. Since it's running
697 * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
698 * which is plenty of time for our purposes. It is identical across the
699 * EUs, but since it's tracking GPU core speed it will increment at a
700 * varying rate as render P-states change.
701 *
702 * The caller could also check if render P-states have changed (or anything
703 * else that might disrupt timing) by setting smear to 2 and checking if
704 * that field is != 0.
705 */
706 dst.set_smear(0);
707
708 return dst;
709 }
710
711 void
712 fs_visitor::emit_shader_time_begin()
713 {
714 current_annotation = "shader time start";
715 shader_start_time = get_timestamp();
716 }
717
718 void
719 fs_visitor::emit_shader_time_end()
720 {
721 current_annotation = "shader time end";
722
723 enum shader_time_shader_type type, written_type, reset_type;
724 if (dispatch_width == 8) {
725 type = ST_FS8;
726 written_type = ST_FS8_WRITTEN;
727 reset_type = ST_FS8_RESET;
728 } else {
729 assert(dispatch_width == 16);
730 type = ST_FS16;
731 written_type = ST_FS16_WRITTEN;
732 reset_type = ST_FS16_RESET;
733 }
734
735 fs_reg shader_end_time = get_timestamp();
736
737 /* Check that there weren't any timestamp reset events (assuming these
738 * were the only two timestamp reads that happened).
739 */
740 fs_reg reset = shader_end_time;
741 reset.set_smear(2);
742 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
743 test->conditional_mod = BRW_CONDITIONAL_Z;
744 emit(IF(BRW_PREDICATE_NORMAL));
745
746 push_force_uncompressed();
747 fs_reg start = shader_start_time;
748 start.negate = true;
749 fs_reg diff = fs_reg(this, glsl_type::uint_type);
750 emit(ADD(diff, start, shader_end_time));
751
752 /* If there were no instructions between the two timestamp gets, the diff
753 * is 2 cycles. Remove that overhead, so I can forget about that when
754 * trying to determine the time taken for single instructions.
755 */
756 emit(ADD(diff, diff, fs_reg(-2u)));
757
758 emit_shader_time_write(type, diff);
759 emit_shader_time_write(written_type, fs_reg(1u));
760 emit(BRW_OPCODE_ELSE);
761 emit_shader_time_write(reset_type, fs_reg(1u));
762 emit(BRW_OPCODE_ENDIF);
763
764 pop_force_uncompressed();
765 }
766
767 void
768 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
769 fs_reg value)
770 {
771 int shader_time_index =
772 brw_get_shader_time_index(brw, shader_prog, prog, type);
773 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
774
775 fs_reg payload;
776 if (dispatch_width == 8)
777 payload = fs_reg(this, glsl_type::uvec2_type);
778 else
779 payload = fs_reg(this, glsl_type::uint_type);
780
781 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
782 fs_reg(), payload, offset, value));
783 }
784
785 void
786 fs_visitor::vfail(const char *format, va_list va)
787 {
788 char *msg;
789
790 if (failed)
791 return;
792
793 failed = true;
794
795 msg = ralloc_vasprintf(mem_ctx, format, va);
796 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
797
798 this->fail_msg = msg;
799
800 if (INTEL_DEBUG & DEBUG_WM) {
801 fprintf(stderr, "%s", msg);
802 }
803 }
804
805 void
806 fs_visitor::fail(const char *format, ...)
807 {
808 va_list va;
809
810 va_start(va, format);
811 vfail(format, va);
812 va_end(va);
813 }
814
815 /**
816 * Mark this program as impossible to compile in SIMD16 mode.
817 *
818 * During the SIMD8 compile (which happens first), we can detect and flag
819 * things that are unsupported in SIMD16 mode, so the compiler can skip
820 * the SIMD16 compile altogether.
821 *
822 * During a SIMD16 compile (if one happens anyway), this just calls fail().
823 */
824 void
825 fs_visitor::no16(const char *format, ...)
826 {
827 va_list va;
828
829 va_start(va, format);
830
831 if (dispatch_width == 16) {
832 vfail(format, va);
833 } else {
834 simd16_unsupported = true;
835
836 if (brw->perf_debug) {
837 if (no16_msg)
838 ralloc_vasprintf_append(&no16_msg, format, va);
839 else
840 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
841 }
842 }
843
844 va_end(va);
845 }
846
847 fs_inst *
848 fs_visitor::emit(enum opcode opcode)
849 {
850 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
851 }
852
853 fs_inst *
854 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
855 {
856 return emit(new(mem_ctx) fs_inst(opcode, dst));
857 }
858
859 fs_inst *
860 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
861 {
862 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
863 }
864
865 fs_inst *
866 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
867 const fs_reg &src1)
868 {
869 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
870 }
871
872 fs_inst *
873 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
874 const fs_reg &src1, const fs_reg &src2)
875 {
876 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
877 }
878
879 fs_inst *
880 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
881 fs_reg src[], int sources)
882 {
883 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
884 }
885
886 void
887 fs_visitor::push_force_uncompressed()
888 {
889 force_uncompressed_stack++;
890 }
891
892 void
893 fs_visitor::pop_force_uncompressed()
894 {
895 force_uncompressed_stack--;
896 assert(force_uncompressed_stack >= 0);
897 }
898
899 /**
900 * Returns true if the instruction has a flag that means it won't
901 * update an entire destination register.
902 *
903 * For example, dead code elimination and live variable analysis want to know
904 * when a write to a variable screens off any preceding values that were in
905 * it.
906 */
907 bool
908 fs_inst::is_partial_write() const
909 {
910 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
911 (this->dst.width * type_sz(this->dst.type)) < 32 ||
912 !this->dst.is_contiguous());
913 }
914
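/**
 * Returns the number of hardware registers read by source operand 'arg';
 * for send-like opcodes whose payload lives in the GRF this is the full
 * message length (mlen).
 */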
915 int
916 fs_inst::regs_read(fs_visitor *v, int arg) const
917 {
918 if (is_tex() && arg == 0 && src[0].file == GRF) {
919 return mlen;
920 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
921 return mlen;
922 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
923 return mlen;
924 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
925 return mlen;
926 }
927
928 switch (src[arg].file) {
929 case BAD_FILE:
930 case UNIFORM:
931 case IMM:
932 return 1;
933 case GRF:
934 case HW_REG:
935 if (src[arg].stride == 0) {
936 return 1;
937 } else {
938 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
939 return (size + 31) / 32;
940 }
941 case MRF:
942 unreachable("MRF registers are not allowed as sources");
943 default:
944 unreachable("Invalid register file");
945 }
946 }
947
948 bool
949 fs_inst::reads_flag() const
950 {
951 return predicate;
952 }
953
954 bool
955 fs_inst::writes_flag() const
956 {
957 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
958 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
959 }
960
961 /**
962 * Returns how many MRFs an FS opcode will write over.
963 *
964 * Note that this is not the 0 or 1 implied writes in an actual gen
965 * instruction -- the FS opcodes often generate MOVs in addition.
966 */
967 int
968 fs_visitor::implied_mrf_writes(fs_inst *inst)
969 {
970 if (inst->mlen == 0)
971 return 0;
972
973 if (inst->base_mrf == -1)
974 return 0;
975
976 switch (inst->opcode) {
977 case SHADER_OPCODE_RCP:
978 case SHADER_OPCODE_RSQ:
979 case SHADER_OPCODE_SQRT:
980 case SHADER_OPCODE_EXP2:
981 case SHADER_OPCODE_LOG2:
982 case SHADER_OPCODE_SIN:
983 case SHADER_OPCODE_COS:
984 return 1 * dispatch_width / 8;
985 case SHADER_OPCODE_POW:
986 case SHADER_OPCODE_INT_QUOTIENT:
987 case SHADER_OPCODE_INT_REMAINDER:
988 return 2 * dispatch_width / 8;
989 case SHADER_OPCODE_TEX:
990 case FS_OPCODE_TXB:
991 case SHADER_OPCODE_TXD:
992 case SHADER_OPCODE_TXF:
993 case SHADER_OPCODE_TXF_CMS:
994 case SHADER_OPCODE_TXF_MCS:
995 case SHADER_OPCODE_TG4:
996 case SHADER_OPCODE_TG4_OFFSET:
997 case SHADER_OPCODE_TXL:
998 case SHADER_OPCODE_TXS:
999 case SHADER_OPCODE_LOD:
1000 return 1;
1001 case FS_OPCODE_FB_WRITE:
1002 return 2;
1003 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1004 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1005 return 1;
1006 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1007 return inst->mlen;
1008 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1009 return 2;
1010 case SHADER_OPCODE_UNTYPED_ATOMIC:
1011 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1012 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1013 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1014 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1015 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1016 return 0;
1017 default:
1018 unreachable("not reached");
1019 }
1020 }
1021
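/**
 * Allocate a new virtual GRF of the given size (in registers), growing the
 * virtual_grf_sizes array as needed, and return its index.
 */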
1022 int
1023 fs_visitor::virtual_grf_alloc(int size)
1024 {
1025 if (virtual_grf_array_size <= virtual_grf_count) {
1026 if (virtual_grf_array_size == 0)
1027 virtual_grf_array_size = 16;
1028 else
1029 virtual_grf_array_size *= 2;
1030 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
1031 virtual_grf_array_size);
1032 }
1033 virtual_grf_sizes[virtual_grf_count] = size;
1034 return virtual_grf_count++;
1035 }
1036
1037 /** Fixed HW reg constructor. */
1038 fs_reg::fs_reg(enum register_file file, int reg)
1039 {
1040 init();
1041 this->file = file;
1042 this->reg = reg;
1043 this->type = BRW_REGISTER_TYPE_F;
1044
1045 switch (file) {
1046 case UNIFORM:
1047 this->width = 1;
1048 break;
1049 default:
1050 this->width = 8;
1051 }
1052 }
1053
1054 /** Fixed HW reg constructor. */
1055 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1056 {
1057 init();
1058 this->file = file;
1059 this->reg = reg;
1060 this->type = type;
1061
1062 switch (file) {
1063 case UNIFORM:
1064 this->width = 1;
1065 break;
1066 default:
1067 this->width = 8;
1068 }
1069 }
1070
1071 /** Fixed HW reg constructor. */
1072 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1073 uint8_t width)
1074 {
1075 init();
1076 this->file = file;
1077 this->reg = reg;
1078 this->type = type;
1079 this->width = width;
1080 }
1081
1082 /** Automatic reg constructor. */
1083 fs_reg::fs_reg(fs_visitor *v, const struct glsl_type *type)
1084 {
1085 init();
1086 int reg_width = v->dispatch_width / 8;
1087
1088 this->file = GRF;
1089 this->reg = v->virtual_grf_alloc(v->type_size(type) * reg_width);
1090 this->reg_offset = 0;
1091 this->type = brw_type_for_base_type(type);
1092 this->width = v->dispatch_width;
1093 assert(this->width == 8 || this->width == 16);
1094 }
1095
1096 fs_reg *
1097 fs_visitor::variable_storage(ir_variable *var)
1098 {
1099 return (fs_reg *)hash_table_find(this->variable_ht, var);
1100 }
1101
1102 void
1103 import_uniforms_callback(const void *key,
1104 void *data,
1105 void *closure)
1106 {
1107 struct hash_table *dst_ht = (struct hash_table *)closure;
1108 const fs_reg *reg = (const fs_reg *)data;
1109
1110 if (reg->file != UNIFORM)
1111 return;
1112
1113 hash_table_insert(dst_ht, data, key);
1114 }
1115
1116 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1117 * This brings in those uniform definitions.
1118 */
1119 void
1120 fs_visitor::import_uniforms(fs_visitor *v)
1121 {
1122 hash_table_call_foreach(v->variable_ht,
1123 import_uniforms_callback,
1124 variable_ht);
1125 this->push_constant_loc = v->push_constant_loc;
1126 this->pull_constant_loc = v->pull_constant_loc;
1127 this->uniforms = v->uniforms;
1128 this->param_size = v->param_size;
1129 }
1130
1131 /* Our support for uniforms is piggy-backed on the struct
1132 * gl_fragment_program, because that's where the values actually
1133 * get stored, rather than in some global gl_shader_program uniform
1134 * store.
1135 */
1136 void
1137 fs_visitor::setup_uniform_values(ir_variable *ir)
1138 {
1139 int namelen = strlen(ir->name);
1140
1141 /* The data for our (non-builtin) uniforms is stored in a series of
1142 * gl_uniform_driver_storage structs for each subcomponent that
1143 * glGetUniformLocation() could name. We know it's been set up in the same
1144 * order we'd walk the type, so walk the list of storage and find anything
1145 * with our name, or the prefix of a component that starts with our name.
1146 */
1147 unsigned params_before = uniforms;
1148 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1149 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1150
1151 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1152 (storage->name[namelen] != 0 &&
1153 storage->name[namelen] != '.' &&
1154 storage->name[namelen] != '[')) {
1155 continue;
1156 }
1157
1158 unsigned slots = storage->type->component_slots();
1159 if (storage->array_elements)
1160 slots *= storage->array_elements;
1161
1162 for (unsigned i = 0; i < slots; i++) {
1163 stage_prog_data->param[uniforms++] = &storage->storage[i];
1164 }
1165 }
1166
1167 /* Make sure we actually initialized the right amount of stuff here. */
1168 assert(params_before + ir->type->component_slots() == uniforms);
1169 (void)params_before;
1170 }
1171
1172
1173 /* Our support for builtin uniforms is even scarier than non-builtin.
1174 * It sits on top of the PROG_STATE_VAR parameters that are
1175 * automatically updated from GL context state.
1176 */
1177 void
1178 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1179 {
1180 const ir_state_slot *const slots = ir->state_slots;
1181 assert(ir->state_slots != NULL);
1182
1183 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
1184 /* This state reference has already been setup by ir_to_mesa, but we'll
1185 * get the same index back here.
1186 */
1187 int index = _mesa_add_state_reference(this->prog->Parameters,
1188 (gl_state_index *)slots[i].tokens);
1189
1190 /* Add each of the unique swizzles of the element as a parameter.
1191 * This'll end up matching the expected layout of the
1192 * array/matrix/structure we're trying to fill in.
1193 */
1194 int last_swiz = -1;
1195 for (unsigned int j = 0; j < 4; j++) {
1196 int swiz = GET_SWZ(slots[i].swizzle, j);
1197 if (swiz == last_swiz)
1198 break;
1199 last_swiz = swiz;
1200
1201 stage_prog_data->param[uniforms++] =
1202 &prog->Parameters->ParameterValues[index][swiz];
1203 }
1204 }
1205 }
1206
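/**
 * Emit the code that builds gl_FragCoord: pixel x/y with the half-pixel
 * offset and y-flip applied as needed, z taken from the payload depth
 * (Gen6+) or interpolated, and w copied from the value set up during
 * interpolation setup.
 */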
1207 fs_reg *
1208 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1209 {
1210 assert(stage == MESA_SHADER_FRAGMENT);
1211 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1212 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1213 fs_reg wpos = *reg;
1214 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1215
1216 /* gl_FragCoord.x */
1217 if (ir->data.pixel_center_integer) {
1218 emit(MOV(wpos, this->pixel_x));
1219 } else {
1220 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1221 }
1222 wpos = offset(wpos, 1);
1223
1224 /* gl_FragCoord.y */
1225 if (!flip && ir->data.pixel_center_integer) {
1226 emit(MOV(wpos, this->pixel_y));
1227 } else {
1228 fs_reg pixel_y = this->pixel_y;
1229 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1230
1231 if (flip) {
1232 pixel_y.negate = true;
1233 offset += key->drawable_height - 1.0;
1234 }
1235
1236 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1237 }
1238 wpos = offset(wpos, 1);
1239
1240 /* gl_FragCoord.z */
1241 if (brw->gen >= 6) {
1242 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1243 } else {
1244 emit(FS_OPCODE_LINTERP, wpos,
1245 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1246 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1247 interp_reg(VARYING_SLOT_POS, 2));
1248 }
1249 wpos = offset(wpos, 1);
1250
1251 /* gl_FragCoord.w: Already set up in emit_interpolation */
1252 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1253
1254 return reg;
1255 }
1256
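/**
 * Emit a LINTERP using the barycentric coordinates that match the
 * interpolation qualifier and centroid/sample mode; pre-Gen6 hardware only
 * has perspective pixel barycentrics.
 */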
1257 fs_inst *
1258 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1259 glsl_interp_qualifier interpolation_mode,
1260 bool is_centroid, bool is_sample)
1261 {
1262 brw_wm_barycentric_interp_mode barycoord_mode;
1263 if (brw->gen >= 6) {
1264 if (is_centroid) {
1265 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1266 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1267 else
1268 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1269 } else if (is_sample) {
1270 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1271 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1272 else
1273 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1274 } else {
1275 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1276 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1277 else
1278 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1279 }
1280 } else {
1281 /* On Ironlake and below, there is only one interpolation mode.
1282 * Centroid interpolation doesn't mean anything on this hardware --
1283 * there is no multisampling.
1284 */
1285 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1286 }
1287 return emit(FS_OPCODE_LINTERP, attr,
1288 this->delta_x[barycoord_mode],
1289 this->delta_y[barycoord_mode], interp);
1290 }
1291
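/**
 * Emit interpolation for a varying input, walking arrays and matrix
 * columns: flat-shaded components come straight from the setup data via
 * CINTERP, everything else goes through emit_linterp (with the unlit
 * centroid workaround where required).
 */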
1292 fs_reg *
1293 fs_visitor::emit_general_interpolation(ir_variable *ir)
1294 {
1295 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1296 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1297 fs_reg attr = *reg;
1298
1299 assert(stage == MESA_SHADER_FRAGMENT);
1300 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1301 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1302
1303 unsigned int array_elements;
1304 const glsl_type *type;
1305
1306 if (ir->type->is_array()) {
1307 array_elements = ir->type->length;
1308 if (array_elements == 0) {
1309 fail("dereferenced array '%s' has length 0\n", ir->name);
1310 }
1311 type = ir->type->fields.array;
1312 } else {
1313 array_elements = 1;
1314 type = ir->type;
1315 }
1316
1317 glsl_interp_qualifier interpolation_mode =
1318 ir->determine_interpolation_mode(key->flat_shade);
1319
1320 int location = ir->data.location;
1321 for (unsigned int i = 0; i < array_elements; i++) {
1322 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1323 if (prog_data->urb_setup[location] == -1) {
1324 /* If there's no incoming setup data for this slot, don't
1325 * emit interpolation for it.
1326 */
1327 attr = offset(attr, type->vector_elements);
1328 location++;
1329 continue;
1330 }
1331
1332 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1333 /* Constant interpolation (flat shading) case. The SF has
1334 * handed us defined values in only the constant offset
1335 * field of the setup reg.
1336 */
1337 for (unsigned int k = 0; k < type->vector_elements; k++) {
1338 struct brw_reg interp = interp_reg(location, k);
1339 interp = suboffset(interp, 3);
1340 interp.type = reg->type;
1341 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1342 attr = offset(attr, 1);
1343 }
1344 } else {
1345 /* Smooth/noperspective interpolation case. */
1346 for (unsigned int k = 0; k < type->vector_elements; k++) {
1347 struct brw_reg interp = interp_reg(location, k);
1348 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1349 /* Get the pixel/sample mask into f0 so that we know
1350 * which pixels are lit. Then, for each channel that is
1351 * unlit, replace the centroid data with non-centroid
1352 * data.
1353 */
1354 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1355
1356 fs_inst *inst;
1357 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1358 false, false);
1359 inst->predicate = BRW_PREDICATE_NORMAL;
1360 inst->predicate_inverse = true;
1361 if (brw->has_pln)
1362 inst->no_dd_clear = true;
1363
1364 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1365 ir->data.centroid && !key->persample_shading,
1366 ir->data.sample || key->persample_shading);
1367 inst->predicate = BRW_PREDICATE_NORMAL;
1368 inst->predicate_inverse = false;
1369 if (brw->has_pln)
1370 inst->no_dd_check = true;
1371
1372 } else {
1373 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1374 ir->data.centroid && !key->persample_shading,
1375 ir->data.sample || key->persample_shading);
1376 }
1377 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1378 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1379 }
1380 attr = offset(attr, 1);
1381 }
1382
1383 }
1384 location++;
1385 }
1386 }
1387
1388 return reg;
1389 }
1390
1391 fs_reg *
1392 fs_visitor::emit_frontfacing_interpolation()
1393 {
1394 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::bool_type);
1395
1396 if (brw->gen >= 6) {
1397 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1398 * a boolean result from this (~0/true or 0/false).
1399 *
1400 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1401 * this task in only one instruction:
1402 * - a negation source modifier will flip the bit; and
1403 * - a W -> D type conversion will sign extend the bit into the high
1404 * word of the destination.
1405 *
1406 * An ASR 15 fills the low word of the destination.
1407 */
1408 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1409 g0.negate = true;
1410
1411 emit(ASR(*reg, g0, fs_reg(15)));
1412 } else {
1413 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1414 * a boolean result from this (1/true or 0/false).
1415 *
1416 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1417 * the negation source modifier to flip it. Unfortunately the SHR
1418 * instruction only operates on UD (or D with an abs source modifier)
1419 * sources without negation.
1420 *
1421 * Instead, use ASR (which will give ~0/true or 0/false) followed by an
1422 * AND 1.
1423 */
1424 fs_reg asr = fs_reg(this, glsl_type::bool_type);
1425 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1426 g1_6.negate = true;
1427
1428 emit(ASR(asr, g1_6, fs_reg(31)));
1429 emit(AND(*reg, asr, fs_reg(1)));
1430 }
1431
1432 return reg;
1433 }
1434
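/**
 * Convert an integer sample position from the thread payload into a float
 * in [0, 1] (scaling by 1/16), or load the constant 0.5 when per-sample
 * position computation is disabled.
 */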
1435 void
1436 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1437 {
1438 assert(stage == MESA_SHADER_FRAGMENT);
1439 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1440 assert(dst.type == BRW_REGISTER_TYPE_F);
1441
1442 if (key->compute_pos_offset) {
1443 /* Convert int_sample_pos to floating point */
1444 emit(MOV(dst, int_sample_pos));
1445 /* Scale to the range [0, 1] */
1446 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1447 }
1448 else {
1449 /* From ARB_sample_shading specification:
1450 * "When rendering to a non-multisample buffer, or if multisample
1451 * rasterization is disabled, gl_SamplePosition will always be
1452 * (0.5, 0.5)."
1453 */
1454 emit(MOV(dst, fs_reg(0.5f)));
1455 }
1456 }
1457
1458 fs_reg *
1459 fs_visitor::emit_samplepos_setup()
1460 {
1461 assert(brw->gen >= 6);
1462
1463 this->current_annotation = "compute sample position";
1464 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::vec2_type);
1465 fs_reg pos = *reg;
1466 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1467 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1468
1469 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1470 * mode will be enabled.
1471 *
1472 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1473 * R31.1:0 Position Offset X/Y for Slot[3:0]
1474 * R31.3:2 Position Offset X/Y for Slot[7:4]
1475 * .....
1476 *
1477 * The X, Y sample positions come in as bytes in thread payload. So, read
1478 * the positions using vstride=16, width=8, hstride=2.
1479 */
1480 struct brw_reg sample_pos_reg =
1481 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1482 BRW_REGISTER_TYPE_B), 16, 8, 2);
1483
1484 if (dispatch_width == 8) {
1485 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1486 } else {
1487 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1488 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1489 ->force_sechalf = true;
1490 }
1491 /* Compute gl_SamplePosition.x */
1492 compute_sample_position(pos, int_sample_x);
1493 pos = offset(pos, 1);
1494 if (dispatch_width == 8) {
1495 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1496 } else {
1497 emit(MOV(half(int_sample_y, 0),
1498 fs_reg(suboffset(sample_pos_reg, 1))));
1499 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1500 ->force_sechalf = true;
1501 }
1502 /* Compute gl_SamplePosition.y */
1503 compute_sample_position(pos, int_sample_y);
1504 return reg;
1505 }
1506
1507 fs_reg *
1508 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1509 {
1510 assert(stage == MESA_SHADER_FRAGMENT);
1511 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1512 assert(brw->gen >= 6);
1513
1514 this->current_annotation = "compute sample id";
1515 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1516
1517 if (key->compute_sample_id) {
1518 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1519 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1520 t2.type = BRW_REGISTER_TYPE_UW;
1521
1522 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1523 * 8x multisampling, subspan 0 will represent sample N (where N
1524 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1525 * 7. We can find the value of N by looking at R0.0 bits 7:6
1526 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1527 * (since samples are always delivered in pairs). That is, we
1528 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1529 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1530 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1531 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1532 * populating a temporary variable with the sequence (0, 1, 2, 3),
1533 * and then reading from it using vstride=1, width=4, hstride=0.
1534 * These computations hold good for 4x multisampling as well.
1535 *
1536 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1537 * the first four slots are sample 0 of subspan 0; the next four
1538 * are sample 1 of subspan 0; the third group is sample 0 of
1539 * subspan 1, and finally sample 1 of subspan 1.
1540 */
1541 fs_inst *inst;
1542 inst = emit(BRW_OPCODE_AND, t1,
1543 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1544 fs_reg(0xc0));
1545 inst->force_writemask_all = true;
1546 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1547 inst->force_writemask_all = true;
1548 /* This works for both SIMD8 and SIMD16 */
1549 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1550 inst->force_writemask_all = true;
1551 /* This special instruction takes care of setting vstride=1,
1552 * width=4, hstride=0 of t2 during an ADD instruction.
1553 */
1554 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1555 } else {
1556 /* As per GL_ARB_sample_shading specification:
1557 * "When rendering to a non-multisample buffer, or if multisample
1558 * rasterization is disabled, gl_SampleID will always be zero."
1559 */
1560 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1561 }
1562
1563 return reg;
1564 }
1565
1566 fs_reg
1567 fs_visitor::fix_math_operand(fs_reg src)
1568 {
1569 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1570 * might be able to do better by doing execsize = 1 math and then
1571 * expanding that result out, but we would need to be careful with
1572 * masking.
1573 *
1574 * The hardware ignores source modifiers (negate and abs) on math
1575 * instructions, so we also move to a temp to set those up.
1576 */
1577 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1578 !src.abs && !src.negate)
1579 return src;
1580
1581 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1582 * operands to math.
1583 */
1584 if (brw->gen >= 7 && src.file != IMM)
1585 return src;
1586
1587 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1588 expanded.type = src.type;
1589 emit(BRW_OPCODE_MOV, expanded, src);
1590 return expanded;
1591 }
1592
1593 fs_inst *
1594 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1595 {
1596 switch (opcode) {
1597 case SHADER_OPCODE_RCP:
1598 case SHADER_OPCODE_RSQ:
1599 case SHADER_OPCODE_SQRT:
1600 case SHADER_OPCODE_EXP2:
1601 case SHADER_OPCODE_LOG2:
1602 case SHADER_OPCODE_SIN:
1603 case SHADER_OPCODE_COS:
1604 break;
1605 default:
1606 unreachable("not reached: bad math opcode");
1607 }
1608
1609 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1610 * might be able to do better by doing execsize = 1 math and then
1611 * expanding that result out, but we would need to be careful with
1612 * masking.
1613 *
1614 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1615 * instructions, so we also move to a temp to set those up.
1616 */
1617 if (brw->gen == 6 || brw->gen == 7)
1618 src = fix_math_operand(src);
1619
1620 fs_inst *inst = emit(opcode, dst, src);
1621
1622 if (brw->gen < 6) {
1623 inst->base_mrf = 2;
1624 inst->mlen = dispatch_width / 8;
1625 }
1626
1627 return inst;
1628 }
1629
1630 fs_inst *
1631 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1632 {
1633 int base_mrf = 2;
1634 fs_inst *inst;
1635
1636 if (brw->gen >= 8) {
1637 inst = emit(opcode, dst, src0, src1);
1638 } else if (brw->gen >= 6) {
1639 src0 = fix_math_operand(src0);
1640 src1 = fix_math_operand(src1);
1641
1642 inst = emit(opcode, dst, src0, src1);
1643 } else {
1644 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1645 * "Message Payload":
1646 *
1647 * "Operand0[7]. For the INT DIV functions, this operand is the
1648 * denominator."
1649 * ...
1650 * "Operand1[7]. For the INT DIV functions, this operand is the
1651 * numerator."
1652 */
1653 bool is_int_div = opcode != SHADER_OPCODE_POW;
1654 fs_reg &op0 = is_int_div ? src1 : src0;
1655 fs_reg &op1 = is_int_div ? src0 : src1;
1656
1657 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1658 inst = emit(opcode, dst, op0, reg_null_f);
1659
1660 inst->base_mrf = base_mrf;
1661 inst->mlen = 2 * dispatch_width / 8;
1662 }
1663 return inst;
1664 }
1665
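/**
 * Record where the push constants start for this dispatch width and rewrite
 * UNIFORM-file sources into the fixed CURB registers that follow the thread
 * payload.
 */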
1666 void
1667 fs_visitor::assign_curb_setup()
1668 {
1669 if (dispatch_width == 8) {
1670 prog_data->dispatch_grf_start_reg = payload.num_regs;
1671 } else {
1672 assert(stage == MESA_SHADER_FRAGMENT);
1673 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1674 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1675 }
1676
1677 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1678
1679 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1680 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1681 for (unsigned int i = 0; i < inst->sources; i++) {
1682 if (inst->src[i].file == UNIFORM) {
1683 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1684 int constant_nr;
1685 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1686 constant_nr = push_constant_loc[uniform_nr];
1687 } else {
1688 /* Section 5.11 of the OpenGL 4.1 spec says:
1689 * "Out-of-bounds reads return undefined values, which include
1690 * values from other variables of the active program or zero."
1691 * Just return the first push constant.
1692 */
1693 constant_nr = 0;
1694 }
1695
1696 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1697 constant_nr / 8,
1698 constant_nr % 8);
1699
1700 inst->src[i].file = HW_REG;
1701 inst->src[i].fixed_hw_reg = byte_offset(
1702 retype(brw_reg, inst->src[i].type),
1703 inst->src[i].subreg_offset);
1704 }
1705 }
1706 }
1707 }
1708
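/**
 * Decide which URB setup slot each varying input lands in: on Gen6+ with at
 * most 16 inputs they are packed in order, otherwise the layout must follow
 * the previous stage's VUE map (or the SF output on older parts).
 */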
1709 void
1710 fs_visitor::calculate_urb_setup()
1711 {
1712 assert(stage == MESA_SHADER_FRAGMENT);
1713 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1714 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1715
1716 memset(prog_data->urb_setup, -1,
1717 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1718
1719 int urb_next = 0;
1720 /* Figure out where each of the incoming setup attributes lands. */
1721 if (brw->gen >= 6) {
1722 if (_mesa_bitcount_64(prog->InputsRead &
1723 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1724 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1725 * first 16 varying inputs, so we can put them wherever we want.
1726 * Just put them in order.
1727 *
1728 * This is useful because it means that (a) inputs not used by the
1729 * fragment shader won't take up valuable register space, and (b) we
1730 * won't have to recompile the fragment shader if it gets paired with
1731 * a different vertex (or geometry) shader.
1732 */
1733 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1734 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1735 BITFIELD64_BIT(i)) {
1736 prog_data->urb_setup[i] = urb_next++;
1737 }
1738 }
1739 } else {
1740 /* We have enough input varyings that the SF/SBE pipeline stage can't
1741 * arbitrarily rearrange them to suit our whim; we have to put them
1742 * in an order that matches the output of the previous pipeline stage
1743 * (geometry or vertex shader).
1744 */
1745 struct brw_vue_map prev_stage_vue_map;
1746 brw_compute_vue_map(brw, &prev_stage_vue_map,
1747 key->input_slots_valid);
1748 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1749 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1750 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1751 slot++) {
1752 int varying = prev_stage_vue_map.slot_to_varying[slot];
1753 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1754 * unused.
1755 */
1756 if (varying != BRW_VARYING_SLOT_COUNT &&
1757 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1758 BITFIELD64_BIT(varying))) {
1759 prog_data->urb_setup[varying] = slot - first_slot;
1760 }
1761 }
1762 urb_next = prev_stage_vue_map.num_slots - first_slot;
1763 }
1764 } else {
1765 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1766 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1767 /* Point size is packed into the header, not as a general attribute */
1768 if (i == VARYING_SLOT_PSIZ)
1769 continue;
1770
1771 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1772 /* The back color slot is skipped when the front color is
1773 * also written to. In addition, some slots can be
1774 * written in the vertex shader and not read in the
1775 * fragment shader. So the register number must always be
1776 * incremented, mapped or not.
1777 */
1778 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1779 prog_data->urb_setup[i] = urb_next;
1780 urb_next++;
1781 }
1782 }
1783
1784 /*
1785 * It's an FS-only attribute, and we did interpolation for this attribute
1786 * in the SF thread. So, count it here, too.
1787 *
1788 * See compile_sf_prog() for more info.
1789 */
1790 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1791 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1792 }
1793
1794 prog_data->num_varying_inputs = urb_next;
1795 }
1796
1797 void
1798 fs_visitor::assign_urb_setup()
1799 {
1800 assert(stage == MESA_SHADER_FRAGMENT);
1801 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1802
1803 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1804
1805 /* Offset all the urb_setup[] indices by the actual position of the
1806 * setup regs, now that the location of the constants has been chosen.
1807 */
1808 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1809 if (inst->opcode == FS_OPCODE_LINTERP) {
1810 assert(inst->src[2].file == HW_REG);
1811 inst->src[2].fixed_hw_reg.nr += urb_start;
1812 }
1813
1814 if (inst->opcode == FS_OPCODE_CINTERP) {
1815 assert(inst->src[0].file == HW_REG);
1816 inst->src[0].fixed_hw_reg.nr += urb_start;
1817 }
1818 }
1819
1820 /* Each attribute is 4 setup channels, each of which is half a reg. */
1821 this->first_non_payload_grf =
1822 urb_start + prog_data->num_varying_inputs * 2;
1823 }
1824
1825 /**
1826 * Split large virtual GRFs into separate components if we can.
1827 *
1828 * This mostly duplicates what brw_fs_vector_splitting does,
1829 * but that's really conservative because it's afraid of doing
1830 * splitting that doesn't result in real progress after the rest of
1831 * the optimization phases, which would cause infinite looping in
1832 * optimization. We can do it once here, safely. This also has the
1833 * opportunity to split interpolated values, or maybe even uniforms,
1834 * which we don't have at the IR level.
1835 *
1836 * We want to split, because virtual GRFs are what we register
1837 * allocate and spill (due to contiguousness requirements for some
1838 * instructions), and they're what we naturally generate in the
1839 * codegen process, but most virtual GRFs don't actually need to be
1840 * contiguous sets of GRFs. If we split, we'll end up with reduced
1841 * live intervals and better dead code elimination and coalescing.
1842 */
1843 void
1844 fs_visitor::split_virtual_grfs()
1845 {
1846 int num_vars = this->virtual_grf_count;
1847
1848 /* Count the total number of registers */
1849 int reg_count = 0;
1850 int vgrf_to_reg[num_vars];
1851 for (int i = 0; i < num_vars; i++) {
1852 vgrf_to_reg[i] = reg_count;
1853 reg_count += virtual_grf_sizes[i];
1854 }
1855
1856 /* An array of "split points". For each register slot, this indicates
1857 * if this slot can be separated from the previous slot. Every time an
1858 * instruction uses multiple elements of a register (as a source or
1859 * destination), we mark the used slots as inseparable. Then we go
1860 * through and split the registers into the smallest pieces we can.
1861 */
1862 bool split_points[reg_count];
1863 memset(split_points, 0, sizeof(split_points));
1864
1865 /* Mark all used registers as fully splittable */
1866 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1867 if (inst->dst.file == GRF) {
1868 int reg = vgrf_to_reg[inst->dst.reg];
1869 for (int j = 1; j < this->virtual_grf_sizes[inst->dst.reg]; j++)
1870 split_points[reg + j] = true;
1871 }
1872
1873 for (int i = 0; i < inst->sources; i++) {
1874 if (inst->src[i].file == GRF) {
1875 int reg = vgrf_to_reg[inst->src[i].reg];
1876 for (int j = 1; j < this->virtual_grf_sizes[inst->src[i].reg]; j++)
1877 split_points[reg + j] = true;
1878 }
1879 }
1880 }
1881
1882 if (brw->has_pln &&
1883 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1884 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1885 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1886 * Gen6, that was the only supported interpolation mode, and since Gen6,
1887 * delta_x and delta_y are in fixed hardware registers.
1888 */
1889 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1890 split_points[vgrf_to_reg[vgrf] + 1] = false;
1891 }
1892
1893 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1894 if (inst->dst.file == GRF) {
1895 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1896 for (int j = 1; j < inst->regs_written; j++)
1897 split_points[reg + j] = false;
1898 }
1899 for (int i = 0; i < inst->sources; i++) {
1900 if (inst->src[i].file == GRF) {
1901 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1902 for (int j = 1; j < inst->regs_read(this, i); j++)
1903 split_points[reg + j] = false;
1904 }
1905 }
1906 }
1907
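/* For each old register slot (indexed the same way as split_points[]),
 * new_virtual_grf[] records which vgrf the slot ends up in and
 * new_reg_offset[] its offset within that vgrf.
 */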
1908 int new_virtual_grf[reg_count];
1909 int new_reg_offset[reg_count];
1910
1911 int reg = 0;
1912 for (int i = 0; i < num_vars; i++) {
1913       /* The first slot can never be a split point; assert that as a quick sanity check. */
1914 assert(split_points[reg] == false);
1915
1916 /* j = 0 case */
1917 new_reg_offset[reg] = 0;
1918 reg++;
1919 int offset = 1;
1920
1921 /* j > 0 case */
1922 for (int j = 1; j < virtual_grf_sizes[i]; j++) {
1923          /* If this is a split point, reset the offset to 0 and allocate a
1924           * new virtual GRF spanning the previous "offset" registers.
1925           */
1926 if (split_points[reg]) {
1927 int grf = virtual_grf_alloc(offset);
1928 for (int k = reg - offset; k < reg; k++)
1929 new_virtual_grf[k] = grf;
1930 offset = 0;
1931 }
1932 new_reg_offset[reg] = offset;
1933 offset++;
1934 reg++;
1935 }
1936
1937 /* The last one gets the original register number */
1938 virtual_grf_sizes[i] = offset;
1939 for (int k = reg - offset; k < reg; k++)
1940 new_virtual_grf[k] = i;
1941 }
1942 assert(reg == reg_count);
1943
1944 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1945 if (inst->dst.file == GRF) {
1946 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1947 inst->dst.reg = new_virtual_grf[reg];
1948 inst->dst.reg_offset = new_reg_offset[reg];
1949 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
1950 }
1951 for (int i = 0; i < inst->sources; i++) {
1952 if (inst->src[i].file == GRF) {
1953 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1954 inst->src[i].reg = new_virtual_grf[reg];
1955 inst->src[i].reg_offset = new_reg_offset[reg];
1956 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
1957 }
1958 }
1959 }
1960 invalidate_live_intervals();
1961 }
1962
1963 /**
1964 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1965 *
1966 * During code generation, we create tons of temporary variables, many of
1967 * which get immediately killed and are never used again. Yet, in later
1968 * optimization and analysis passes, such as compute_live_intervals, we need
1969 * to loop over all the virtual GRFs. Compacting them can save a lot of
1970 * overhead.
1971 */
1972 bool
1973 fs_visitor::compact_virtual_grfs()
1974 {
1975 bool progress = false;
1976 int remap_table[this->virtual_grf_count];
1977 memset(remap_table, -1, sizeof(remap_table));
1978
1979 /* Mark which virtual GRFs are used. */
1980 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1981 if (inst->dst.file == GRF)
1982 remap_table[inst->dst.reg] = 0;
1983
1984 for (int i = 0; i < inst->sources; i++) {
1985 if (inst->src[i].file == GRF)
1986 remap_table[inst->src[i].reg] = 0;
1987 }
1988 }
1989
1990 /* Compact the GRF arrays. */
1991 int new_index = 0;
1992 for (int i = 0; i < this->virtual_grf_count; i++) {
1993 if (remap_table[i] == -1) {
1994 /* We just found an unused register. This means that we are
1995 * actually going to compact something.
1996 */
1997 progress = true;
1998 } else {
1999 remap_table[i] = new_index;
2000 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
2001 invalidate_live_intervals();
2002 ++new_index;
2003 }
2004 }
2005
2006 this->virtual_grf_count = new_index;
2007
2008 /* Patch all the instructions to use the newly renumbered registers */
2009 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2010 if (inst->dst.file == GRF)
2011 inst->dst.reg = remap_table[inst->dst.reg];
2012
2013 for (int i = 0; i < inst->sources; i++) {
2014 if (inst->src[i].file == GRF)
2015 inst->src[i].reg = remap_table[inst->src[i].reg];
2016 }
2017 }
2018
2019 /* Patch all the references to delta_x/delta_y, since they're used in
2020 * register allocation. If they're unused, switch them to BAD_FILE so
2021 * we don't think some random VGRF is delta_x/delta_y.
2022 */
2023 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2024 if (delta_x[i].file == GRF) {
2025 if (remap_table[delta_x[i].reg] != -1) {
2026 delta_x[i].reg = remap_table[delta_x[i].reg];
2027 } else {
2028 delta_x[i].file = BAD_FILE;
2029 }
2030 }
2031 }
2032 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2033 if (delta_y[i].file == GRF) {
2034 if (remap_table[delta_y[i].reg] != -1) {
2035 delta_y[i].reg = remap_table[delta_y[i].reg];
2036 } else {
2037 delta_y[i].file = BAD_FILE;
2038 }
2039 }
2040 }
2041
2042 return progress;
2043 }
2044
2045 /*
2046 * Implements array access of uniforms by inserting a
2047 * PULL_CONSTANT_LOAD instruction.
2048 *
2049  * Unlike temporary GRF array access (which we don't support, due to
2050  * the difficulty of doing relative addressing on instruction
2051  * destinations), we could potentially do array access of uniforms
2052 * that were loaded in GRF space as push constants. In real-world
2053 * usage we've seen, though, the arrays being used are always larger
2054 * than we could load as push constants, so just always move all
2055 * uniform array access out to a pull constant buffer.
2056 */
2057 void
2058 fs_visitor::move_uniform_array_access_to_pull_constants()
2059 {
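   /* As with assign_constant_locations(), only the SIMD8 compile gets to
    * decide where uniforms live; the SIMD16 compile reuses those decisions.
    */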
2060 if (dispatch_width != 8)
2061 return;
2062
2063 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2064 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2065
2066 /* Walk through and find array access of uniforms. Put a copy of that
2067 * uniform in the pull constant buffer.
2068 *
2069 * Note that we don't move constant-indexed accesses to arrays. No
2070 * testing has been done of the performance impact of this choice.
2071 */
2072 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2073 for (int i = 0 ; i < inst->sources; i++) {
2074 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2075 continue;
2076
2077 int uniform = inst->src[i].reg;
2078
2079 /* If this array isn't already present in the pull constant buffer,
2080 * add it.
2081 */
2082 if (pull_constant_loc[uniform] == -1) {
2083 const gl_constant_value **values = &stage_prog_data->param[uniform];
2084
2085 assert(param_size[uniform]);
2086
2087 for (int j = 0; j < param_size[uniform]; j++) {
2088 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2089
2090 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2091 values[j];
2092 }
2093 }
2094 }
2095 }
2096 }
2097
2098 /**
2099 * Assign UNIFORM file registers to either push constants or pull constants.
2100 *
2101  * We allow a fragment shader to have more than the specified minimum
2102  * value of the maximum number of fragment shader uniform components (64).
2103  * If there are too many of these, they'd fill up all of the register space.
2104 * So, this will push some of them out to the pull constant buffer and
2105 * update the program to load them.
2106 */
2107 void
2108 fs_visitor::assign_constant_locations()
2109 {
2110 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2111 if (dispatch_width != 8)
2112 return;
2113
2114 /* Find which UNIFORM registers are still in use. */
2115 bool is_live[uniforms];
2116 for (unsigned int i = 0; i < uniforms; i++) {
2117 is_live[i] = false;
2118 }
2119
2120 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2121 for (int i = 0; i < inst->sources; i++) {
2122 if (inst->src[i].file != UNIFORM)
2123 continue;
2124
2125 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2126 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2127 is_live[constant_nr] = true;
2128 }
2129 }
2130
2131 /* Only allow 16 registers (128 uniform components) as push constants.
2132 *
2133 * Just demote the end of the list. We could probably do better
2134 * here, demoting things that are rarely used in the program first.
2135 *
2136 * If changing this value, note the limitation about total_regs in
2137 * brw_curbe.c.
2138 */
2139 unsigned int max_push_components = 16 * 8;
2140 unsigned int num_push_constants = 0;
2141
2142 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2143
2144 for (unsigned int i = 0; i < uniforms; i++) {
2145 if (!is_live[i] || pull_constant_loc[i] != -1) {
2146 /* This UNIFORM register is either dead, or has already been demoted
2147 * to a pull const. Mark it as no longer living in the param[] array.
2148 */
2149 push_constant_loc[i] = -1;
2150 continue;
2151 }
2152
2153 if (num_push_constants < max_push_components) {
2154          /* Retain as a push constant. Record the location in the param[]
2155 * array.
2156 */
2157 push_constant_loc[i] = num_push_constants++;
2158 } else {
2159 /* Demote to a pull constant. */
2160 push_constant_loc[i] = -1;
2161
2162 int pull_index = stage_prog_data->nr_pull_params++;
2163 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2164 pull_constant_loc[i] = pull_index;
2165 }
2166 }
2167
2168 stage_prog_data->nr_params = num_push_constants;
2169
2170 /* Up until now, the param[] array has been indexed by reg + reg_offset
2171 * of UNIFORM registers. Condense it to only contain the uniforms we
2172 * chose to upload as push constants.
2173 */
2174 for (unsigned int i = 0; i < uniforms; i++) {
2175 int remapped = push_constant_loc[i];
2176
2177 if (remapped == -1)
2178 continue;
2179
2180 assert(remapped <= (int)i);
2181 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2182 }
2183 }
2184
2185 /**
2186 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2187 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2188 */
2189 void
2190 fs_visitor::demote_pull_constants()
2191 {
2192 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2193 for (int i = 0; i < inst->sources; i++) {
2194 if (inst->src[i].file != UNIFORM)
2195 continue;
2196
2197 int pull_index = pull_constant_loc[inst->src[i].reg +
2198 inst->src[i].reg_offset];
2199 if (pull_index == -1)
2200 continue;
2201
2202          /* Set up the annotation tracking for newly generated instructions. */
2203 base_ir = inst->ir;
2204 current_annotation = inst->annotation;
2205
2206 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2207 fs_reg dst = fs_reg(this, glsl_type::float_type);
2208
2209 /* Generate a pull load into dst. */
2210 if (inst->src[i].reladdr) {
2211 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2212 surf_index,
2213 *inst->src[i].reladdr,
2214 pull_index);
2215 inst->insert_before(block, &list);
2216 inst->src[i].reladdr = NULL;
2217 } else {
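            /* Constant index: load the 16-byte-aligned vec4 that contains the
             * value and pick out the right component with a smear below.
             */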
2218 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2219 fs_inst *pull =
2220 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2221 dst, surf_index, offset);
2222 inst->insert_before(block, pull);
2223 inst->src[i].set_smear(pull_index & 3);
2224 }
2225
2226 /* Rewrite the instruction to use the temporary VGRF. */
2227 inst->src[i].file = GRF;
2228 inst->src[i].reg = dst.reg;
2229 inst->src[i].reg_offset = 0;
2230 inst->src[i].width = dispatch_width;
2231 }
2232 }
2233 invalidate_live_intervals();
2234 }
2235
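/**
 * Apply simple algebraic simplifications: multiplies by 1.0 or 0.0, adds of
 * 0.0, and OR/LRP/SEL instructions with identical operands are reduced to
 * plain MOVs, and saturating SELs against immediates that the saturate
 * already clamps are reduced to MOVs as well.
 */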
2236 bool
2237 fs_visitor::opt_algebraic()
2238 {
2239 bool progress = false;
2240
2241 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2242 switch (inst->opcode) {
2243 case BRW_OPCODE_MUL:
2244 if (inst->src[1].file != IMM)
2245 continue;
2246
2247 /* a * 1.0 = a */
2248 if (inst->src[1].is_one()) {
2249 inst->opcode = BRW_OPCODE_MOV;
2250 inst->src[1] = reg_undef;
2251 progress = true;
2252 break;
2253 }
2254
2255 /* a * 0.0 = 0.0 */
2256 if (inst->src[1].is_zero()) {
2257 inst->opcode = BRW_OPCODE_MOV;
2258 inst->src[0] = inst->src[1];
2259 inst->src[1] = reg_undef;
2260 progress = true;
2261 break;
2262 }
2263
2264 break;
2265 case BRW_OPCODE_ADD:
2266 if (inst->src[1].file != IMM)
2267 continue;
2268
2269 /* a + 0.0 = a */
2270 if (inst->src[1].is_zero()) {
2271 inst->opcode = BRW_OPCODE_MOV;
2272 inst->src[1] = reg_undef;
2273 progress = true;
2274 break;
2275 }
2276 break;
2277 case BRW_OPCODE_OR:
2278 if (inst->src[0].equals(inst->src[1])) {
2279 inst->opcode = BRW_OPCODE_MOV;
2280 inst->src[1] = reg_undef;
2281 progress = true;
2282 break;
2283 }
2284 break;
2285 case BRW_OPCODE_LRP:
2286 if (inst->src[1].equals(inst->src[2])) {
2287 inst->opcode = BRW_OPCODE_MOV;
2288 inst->src[0] = inst->src[1];
2289 inst->src[1] = reg_undef;
2290 inst->src[2] = reg_undef;
2291 progress = true;
2292 break;
2293 }
2294 break;
2295 case BRW_OPCODE_SEL:
2296 if (inst->src[0].equals(inst->src[1])) {
2297 inst->opcode = BRW_OPCODE_MOV;
2298 inst->src[1] = reg_undef;
2299 inst->predicate = BRW_PREDICATE_NONE;
2300 inst->predicate_inverse = false;
2301 progress = true;
2302 } else if (inst->saturate && inst->src[1].file == IMM) {
2303 switch (inst->conditional_mod) {
2304 case BRW_CONDITIONAL_LE:
2305 case BRW_CONDITIONAL_L:
2306 switch (inst->src[1].type) {
2307 case BRW_REGISTER_TYPE_F:
2308 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2309 inst->opcode = BRW_OPCODE_MOV;
2310 inst->src[1] = reg_undef;
2311 progress = true;
2312 }
2313 break;
2314 default:
2315 break;
2316 }
2317 break;
2318 case BRW_CONDITIONAL_GE:
2319 case BRW_CONDITIONAL_G:
2320 switch (inst->src[1].type) {
2321 case BRW_REGISTER_TYPE_F:
2322 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2323 inst->opcode = BRW_OPCODE_MOV;
2324 inst->src[1] = reg_undef;
2325 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2326 progress = true;
2327 }
2328 break;
2329 default:
2330 break;
2331 }
2332 default:
2333 break;
2334 }
2335 }
2336 break;
2337 default:
2338 break;
2339 }
2340 }
2341
2342 return progress;
2343 }
2344
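/**
 * Rename registers to shorten live ranges.
 *
 * Outside of control flow, when an instruction completely overwrites a
 * virtual GRF that was already defined, the new value has nothing to do with
 * the old one, so give it a fresh virtual GRF and patch subsequent reads and
 * writes to match.  This breaks up long live intervals and gives register
 * allocation more freedom.
 */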
2345 bool
2346 fs_visitor::opt_register_renaming()
2347 {
2348 bool progress = false;
2349 int depth = 0;
2350
2351 int remap[virtual_grf_count];
2352 memset(remap, -1, sizeof(int) * virtual_grf_count);
2353
2354 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2355 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2356 depth++;
2357 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2358 inst->opcode == BRW_OPCODE_WHILE) {
2359 depth--;
2360 }
2361
2362 /* Rewrite instruction sources. */
2363 for (int i = 0; i < inst->sources; i++) {
2364 if (inst->src[i].file == GRF &&
2365 remap[inst->src[i].reg] != -1 &&
2366 remap[inst->src[i].reg] != inst->src[i].reg) {
2367 inst->src[i].reg = remap[inst->src[i].reg];
2368 progress = true;
2369 }
2370 }
2371
2372 const int dst = inst->dst.reg;
2373
2374 if (depth == 0 &&
2375 inst->dst.file == GRF &&
2376 virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 &&
2377 !inst->is_partial_write()) {
2378 if (remap[dst] == -1) {
2379 remap[dst] = dst;
2380 } else {
2381 remap[dst] = virtual_grf_alloc(inst->dst.width / 8);
2382 inst->dst.reg = remap[dst];
2383 progress = true;
2384 }
2385 } else if (inst->dst.file == GRF &&
2386 remap[dst] != -1 &&
2387 remap[dst] != dst) {
2388 inst->dst.reg = remap[dst];
2389 progress = true;
2390 }
2391 }
2392
2393 if (progress) {
2394 invalidate_live_intervals();
2395
2396 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2397 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2398 delta_x[i].reg = remap[delta_x[i].reg];
2399 }
2400 }
2401 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2402 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2403 delta_y[i].reg = remap[delta_y[i].reg];
2404 }
2405 }
2406 }
2407
2408 return progress;
2409 }
2410
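/**
 * Try to fold a GRF-to-MRF MOV back into the instruction that computed the
 * GRF value, so that instruction writes the MRF directly and the MOV can be
 * removed.
 */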
2411 bool
2412 fs_visitor::compute_to_mrf()
2413 {
2414 bool progress = false;
2415 int next_ip = 0;
2416
2417 calculate_live_intervals();
2418
2419 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2420 int ip = next_ip;
2421 next_ip++;
2422
2423 if (inst->opcode != BRW_OPCODE_MOV ||
2424 inst->is_partial_write() ||
2425 inst->dst.file != MRF || inst->src[0].file != GRF ||
2426 inst->dst.type != inst->src[0].type ||
2427 inst->src[0].abs || inst->src[0].negate ||
2428 !inst->src[0].is_contiguous() ||
2429 inst->src[0].subreg_offset)
2430 continue;
2431
2432 /* Work out which hardware MRF registers are written by this
2433 * instruction.
2434 */
2435 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2436 int mrf_high;
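      /* A COMPR4 destination writes MRFs m and m+4, and a plain SIMD16 write
       * covers two adjacent MRFs; otherwise only a single MRF is written.
       */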
2437 if (inst->dst.reg & BRW_MRF_COMPR4) {
2438 mrf_high = mrf_low + 4;
2439 } else if (inst->exec_size == 16) {
2440 mrf_high = mrf_low + 1;
2441 } else {
2442 mrf_high = mrf_low;
2443 }
2444
2445 /* Can't compute-to-MRF this GRF if someone else was going to
2446 * read it later.
2447 */
2448 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2449 continue;
2450
2451 /* Found a move of a GRF to a MRF. Let's see if we can go
2452 * rewrite the thing that made this GRF to write into the MRF.
2453 */
2454 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2455 if (scan_inst->dst.file == GRF &&
2456 scan_inst->dst.reg == inst->src[0].reg) {
2457 /* Found the last thing to write our reg we want to turn
2458 * into a compute-to-MRF.
2459 */
2460
2461 /* If this one instruction didn't populate all the
2462 * channels, bail. We might be able to rewrite everything
2463 * that writes that reg, but it would require smarter
2464 * tracking to delay the rewriting until complete success.
2465 */
2466 if (scan_inst->is_partial_write())
2467 break;
2468
2469 /* Things returning more than one register would need us to
2470 * understand coalescing out more than one MOV at a time.
2471 */
2472 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2473 break;
2474
2475 /* SEND instructions can't have MRF as a destination. */
2476 if (scan_inst->mlen)
2477 break;
2478
2479 if (brw->gen == 6) {
2480 /* gen6 math instructions must have the destination be
2481 * GRF, so no compute-to-MRF for them.
2482 */
2483 if (scan_inst->is_math()) {
2484 break;
2485 }
2486 }
2487
2488 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2489 /* Found the creator of our MRF's source value. */
2490 scan_inst->dst.file = MRF;
2491 scan_inst->dst.reg = inst->dst.reg;
2492 scan_inst->saturate |= inst->saturate;
2493 inst->remove(block);
2494 progress = true;
2495 }
2496 break;
2497 }
2498
2499 /* We don't handle control flow here. Most computation of
2500          * values that end up in MRFs is shortly before the MRF
2501 * write anyway.
2502 */
2503 if (block->start() == scan_inst)
2504 break;
2505
2506 /* You can't read from an MRF, so if someone else reads our
2507 * MRF's source GRF that we wanted to rewrite, that stops us.
2508 */
2509 bool interfered = false;
2510 for (int i = 0; i < scan_inst->sources; i++) {
2511 if (scan_inst->src[i].file == GRF &&
2512 scan_inst->src[i].reg == inst->src[0].reg &&
2513 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2514 interfered = true;
2515 }
2516 }
2517 if (interfered)
2518 break;
2519
2520 if (scan_inst->dst.file == MRF) {
2521 /* If somebody else writes our MRF here, we can't
2522 * compute-to-MRF before that.
2523 */
2524 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2525 int scan_mrf_high;
2526
2527 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2528 scan_mrf_high = scan_mrf_low + 4;
2529 } else if (scan_inst->exec_size == 16) {
2530 scan_mrf_high = scan_mrf_low + 1;
2531 } else {
2532 scan_mrf_high = scan_mrf_low;
2533 }
2534
2535 if (mrf_low == scan_mrf_low ||
2536 mrf_low == scan_mrf_high ||
2537 mrf_high == scan_mrf_low ||
2538 mrf_high == scan_mrf_high) {
2539 break;
2540 }
2541 }
2542
2543 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2544 /* Found a SEND instruction, which means that there are
2545 * live values in MRFs from base_mrf to base_mrf +
2546 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2547 * above it.
2548 */
2549 if (mrf_low >= scan_inst->base_mrf &&
2550 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2551 break;
2552 }
2553 if (mrf_high >= scan_inst->base_mrf &&
2554 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2555 break;
2556 }
2557 }
2558 }
2559 }
2560
2561 if (progress)
2562 invalidate_live_intervals();
2563
2564 return progress;
2565 }
2566
2567 /**
2568 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2569 * instructions to FS_OPCODE_REP_FB_WRITE.
2570 */
2571 void
2572 fs_visitor::emit_repclear_shader()
2573 {
2574 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2575 int base_mrf = 1;
2576 int color_mrf = base_mrf + 2;
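   /* The FB write header, when present, occupies m1-m2; the replicated clear
    * color is uploaded to m3.
    */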
2577
2578 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2579 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2580 mov->force_writemask_all = true;
2581
2582 fs_inst *write;
2583 if (key->nr_color_regions == 1) {
2584 write = emit(FS_OPCODE_REP_FB_WRITE);
2585 write->saturate = key->clamp_fragment_color;
2586 write->base_mrf = color_mrf;
2587 write->target = 0;
2588 write->header_present = false;
2589 write->mlen = 1;
2590 } else {
2591 for (int i = 0; i < key->nr_color_regions; ++i) {
2592 write = emit(FS_OPCODE_REP_FB_WRITE);
2593 write->saturate = key->clamp_fragment_color;
2594 write->base_mrf = base_mrf;
2595 write->target = i;
2596 write->header_present = true;
2597 write->mlen = 3;
2598 }
2599 }
2600 write->eot = true;
2601
2602 calculate_cfg();
2603
2604 assign_constant_locations();
2605 assign_curb_setup();
2606
2607 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2608 assert(mov->src[0].file == HW_REG);
2609 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2610 }
2611
2612 /**
2613 * Walks through basic blocks, looking for repeated MRF writes and
2614 * removing the later ones.
2615 */
2616 bool
2617 fs_visitor::remove_duplicate_mrf_writes()
2618 {
2619 fs_inst *last_mrf_move[16];
2620 bool progress = false;
2621
2622 /* Need to update the MRF tracking for compressed instructions. */
2623 if (dispatch_width == 16)
2624 return false;
2625
2626 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2627
2628 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2629 if (inst->is_control_flow()) {
2630 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2631 }
2632
2633 if (inst->opcode == BRW_OPCODE_MOV &&
2634 inst->dst.file == MRF) {
2635 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2636 if (prev_inst && inst->equals(prev_inst)) {
2637 inst->remove(block);
2638 progress = true;
2639 continue;
2640 }
2641 }
2642
2643 /* Clear out the last-write records for MRFs that were overwritten. */
2644 if (inst->dst.file == MRF) {
2645 last_mrf_move[inst->dst.reg] = NULL;
2646 }
2647
2648 if (inst->mlen > 0 && inst->base_mrf != -1) {
2649 /* Found a SEND instruction, which will include two or fewer
2650 * implied MRF writes. We could do better here.
2651 */
2652 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2653 last_mrf_move[inst->base_mrf + i] = NULL;
2654 }
2655 }
2656
2657 /* Clear out any MRF move records whose sources got overwritten. */
2658 if (inst->dst.file == GRF) {
2659 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2660 if (last_mrf_move[i] &&
2661 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2662 last_mrf_move[i] = NULL;
2663 }
2664 }
2665 }
2666
2667 if (inst->opcode == BRW_OPCODE_MOV &&
2668 inst->dst.file == MRF &&
2669 inst->src[0].file == GRF &&
2670 !inst->is_partial_write()) {
2671 last_mrf_move[inst->dst.reg] = inst;
2672 }
2673 }
2674
2675 if (progress)
2676 invalidate_live_intervals();
2677
2678 return progress;
2679 }
2680
2681 static void
2682 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2683 int first_grf, int grf_len)
2684 {
2685 /* Clear the flag for registers that actually got read (as expected). */
2686 for (int i = 0; i < inst->sources; i++) {
2687 int grf;
2688 if (inst->src[i].file == GRF) {
2689 grf = inst->src[i].reg;
2690 } else if (inst->src[i].file == HW_REG &&
2691 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2692 grf = inst->src[i].fixed_hw_reg.nr;
2693 } else {
2694 continue;
2695 }
2696
2697 if (grf >= first_grf &&
2698 grf < first_grf + grf_len) {
2699 deps[grf - first_grf] = false;
2700 if (inst->exec_size == 16)
2701 deps[grf - first_grf + 1] = false;
2702 }
2703 }
2704 }
2705
2706 /**
2707 * Implements this workaround for the original 965:
2708 *
2709 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2710 * check for post destination dependencies on this instruction, software
2711 * must ensure that there is no destination hazard for the case of ‘write
2712 * followed by a posted write’ shown in the following example.
2713 *
2714 * 1. mov r3 0
2715 * 2. send r3.xy <rest of send instruction>
2716 * 3. mov r2 r3
2717 *
2718 * Due to no post-destination dependency check on the ‘send’, the above
2719 * code sequence could have two instructions (1 and 2) in flight at the
2720 * same time that both consider ‘r3’ as the target of their final writes.
2721 */
2722 void
2723 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2724 fs_inst *inst)
2725 {
2726 int write_len = inst->regs_written;
2727 int first_write_grf = inst->dst.reg;
2728 bool needs_dep[BRW_MAX_MRF];
2729 assert(write_len < (int)sizeof(needs_dep) - 1);
2730
2731 memset(needs_dep, false, sizeof(needs_dep));
2732 memset(needs_dep, true, write_len);
2733
2734 clear_deps_for_inst_src(inst, dispatch_width,
2735 needs_dep, first_write_grf, write_len);
2736
2737 /* Walk backwards looking for writes to registers we're writing which
2738 * aren't read since being written. If we hit the start of the program,
2739 * we assume that there are no outstanding dependencies on entry to the
2740 * program.
2741 */
2742 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2743 /* If we hit control flow, assume that there *are* outstanding
2744 * dependencies, and force their cleanup before our instruction.
2745 */
2746 if (block->start() == scan_inst) {
2747 for (int i = 0; i < write_len; i++) {
2748 if (needs_dep[i]) {
2749 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2750 }
2751 }
2752 return;
2753 }
2754
2755 /* We insert our reads as late as possible on the assumption that any
2756 * instruction but a MOV that might have left us an outstanding
2757 * dependency has more latency than a MOV.
2758 */
2759 if (scan_inst->dst.file == GRF) {
2760 for (int i = 0; i < scan_inst->regs_written; i++) {
2761 int reg = scan_inst->dst.reg + i;
2762
2763 if (reg >= first_write_grf &&
2764 reg < first_write_grf + write_len &&
2765 needs_dep[reg - first_write_grf]) {
2766 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2767 needs_dep[reg - first_write_grf] = false;
2768 if (scan_inst->exec_size == 16)
2769 needs_dep[reg - first_write_grf + 1] = false;
2770 }
2771 }
2772 }
2773
2774 /* Clear the flag for registers that actually got read (as expected). */
2775 clear_deps_for_inst_src(scan_inst, dispatch_width,
2776 needs_dep, first_write_grf, write_len);
2777
2778 /* Continue the loop only if we haven't resolved all the dependencies */
2779 int i;
2780 for (i = 0; i < write_len; i++) {
2781 if (needs_dep[i])
2782 break;
2783 }
2784 if (i == write_len)
2785 return;
2786 }
2787 }
2788
2789 /**
2790 * Implements this workaround for the original 965:
2791 *
2792 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2793 * used as a destination register until after it has been sourced by an
2794 * instruction with a different destination register.
2795 */
2796 void
2797 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2798 {
2799 int write_len = inst->regs_written;
2800 int first_write_grf = inst->dst.reg;
2801 bool needs_dep[BRW_MAX_MRF];
2802 assert(write_len < (int)sizeof(needs_dep) - 1);
2803
2804 memset(needs_dep, false, sizeof(needs_dep));
2805 memset(needs_dep, true, write_len);
2806 /* Walk forwards looking for writes to registers we're writing which aren't
2807 * read before being written.
2808 */
2809 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2810 /* If we hit control flow, force resolve all remaining dependencies. */
2811 if (block->end() == scan_inst) {
2812 for (int i = 0; i < write_len; i++) {
2813 if (needs_dep[i])
2814 scan_inst->insert_before(block,
2815 DEP_RESOLVE_MOV(first_write_grf + i));
2816 }
2817 return;
2818 }
2819
2820 /* Clear the flag for registers that actually got read (as expected). */
2821 clear_deps_for_inst_src(scan_inst, dispatch_width,
2822 needs_dep, first_write_grf, write_len);
2823
2824 /* We insert our reads as late as possible since they're reading the
2825 * result of a SEND, which has massive latency.
2826 */
2827 if (scan_inst->dst.file == GRF &&
2828 scan_inst->dst.reg >= first_write_grf &&
2829 scan_inst->dst.reg < first_write_grf + write_len &&
2830 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2831 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2832 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2833 }
2834
2835 /* Continue the loop only if we haven't resolved all the dependencies */
2836 int i;
2837 for (i = 0; i < write_len; i++) {
2838 if (needs_dep[i])
2839 break;
2840 }
2841 if (i == write_len)
2842 return;
2843 }
2844
2845 /* If we hit the end of the program, resolve all remaining dependencies out
2846 * of paranoia.
2847 */
2848 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2849 assert(last_inst->eot);
2850 for (int i = 0; i < write_len; i++) {
2851 if (needs_dep[i])
2852 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2853 }
2854 }
2855
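/**
 * Apply both original-Gen4 SEND dependency workarounds above to every
 * message-sending instruction that writes the GRF.
 */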
2856 void
2857 fs_visitor::insert_gen4_send_dependency_workarounds()
2858 {
2859 if (brw->gen != 4 || brw->is_g4x)
2860 return;
2861
2862 bool progress = false;
2863
2864 /* Note that we're done with register allocation, so GRF fs_regs always
2865 * have a .reg_offset of 0.
2866 */
2867
2868 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2869 if (inst->mlen != 0 && inst->dst.file == GRF) {
2870 insert_gen4_pre_send_dependency_workarounds(block, inst);
2871 insert_gen4_post_send_dependency_workarounds(block, inst);
2872 progress = true;
2873 }
2874 }
2875
2876 if (progress)
2877 invalidate_live_intervals();
2878 }
2879
2880 /**
2881 * Turns the generic expression-style uniform pull constant load instruction
2882 * into a hardware-specific series of instructions for loading a pull
2883 * constant.
2884 *
2885 * The expression style allows the CSE pass before this to optimize out
2886 * repeated loads from the same offset, and gives the pre-register-allocation
2887 * scheduling full flexibility, while the conversion to native instructions
2888 * allows the post-register-allocation scheduler the best information
2889 * possible.
2890 *
2891 * Note that execution masking for setting up pull constant loads is special:
2892 * the channels that need to be written are unrelated to the current execution
2893 * mask, since a later instruction will use one of the result channels as a
2894 * source operand for all 8 or 16 of its channels.
2895 */
2896 void
2897 fs_visitor::lower_uniform_pull_constant_loads()
2898 {
2899 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2900 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2901 continue;
2902
2903 if (brw->gen >= 7) {
2904 /* The offset arg before was a vec4-aligned byte offset. We need to
2905 * turn it into a dword offset.
2906 */
2907 fs_reg const_offset_reg = inst->src[1];
2908 assert(const_offset_reg.file == IMM &&
2909 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2910 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2911 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2912
2913 /* This is actually going to be a MOV, but since only the first dword
2914 * is accessed, we have a special opcode to do just that one. Note
2915 * that this needs to be an operation that will be considered a def
2916 * by live variable analysis, or register allocation will explode.
2917 */
2918 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2919 8, payload, const_offset_reg);
2920 setup->force_writemask_all = true;
2921
2922 setup->ir = inst->ir;
2923 setup->annotation = inst->annotation;
2924 inst->insert_before(block, setup);
2925
2926 /* Similarly, this will only populate the first 4 channels of the
2927 * result register (since we only use smear values from 0-3), but we
2928 * don't tell the optimizer.
2929 */
2930 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2931 inst->src[1] = payload;
2932
2933 invalidate_live_intervals();
2934 } else {
2935 /* Before register allocation, we didn't tell the scheduler about the
2936 * MRF we use. We know it's safe to use this MRF because nothing
2937 * else does except for register spill/unspill, which generates and
2938 * uses its MRF within a single IR instruction.
2939 */
2940 inst->base_mrf = 14;
2941 inst->mlen = 1;
2942 }
2943 }
2944 }
2945
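/**
 * Lower SHADER_OPCODE_LOAD_PAYLOAD into a series of MOVs into the
 * destination payload registers, using compressed COMPR4 MOVs for MRF
 * destinations when possible and propagating writemask/sechalf metadata from
 * the instructions that produced each source.
 */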
2946 bool
2947 fs_visitor::lower_load_payload()
2948 {
2949 bool progress = false;
2950
2951 int vgrf_to_reg[virtual_grf_count];
2952 int reg_count = 16; /* Leave room for MRF */
2953 for (int i = 0; i < virtual_grf_count; ++i) {
2954 vgrf_to_reg[i] = reg_count;
2955 reg_count += virtual_grf_sizes[i];
2956 }
2957
2958 struct {
2959 bool written:1; /* Whether this register has ever been written */
2960 bool force_writemask_all:1;
2961 bool force_sechalf:1;
2962 } metadata[reg_count];
2963 memset(metadata, 0, sizeof(metadata));
2964
2965 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2966 int dst_reg;
2967 if (inst->dst.file == MRF) {
2968 dst_reg = inst->dst.reg;
2969 } else if (inst->dst.file == GRF) {
2970 dst_reg = vgrf_to_reg[inst->dst.reg];
2971 }
2972
2973 if (inst->dst.file == MRF || inst->dst.file == GRF) {
2974 bool force_sechalf = inst->force_sechalf;
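         /* A 16-wide write of 32-bit channels spans two registers, so the
          * sechalf flag flips for each successive register written.
          */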
2975 bool toggle_sechalf = inst->dst.width == 16 &&
2976 type_sz(inst->dst.type) == 4;
2977 for (int i = 0; i < inst->regs_written; ++i) {
2978 metadata[dst_reg + i].written = true;
2979 metadata[dst_reg + i].force_sechalf = force_sechalf;
2980 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
2981 force_sechalf = (toggle_sechalf != force_sechalf);
2982 }
2983 }
2984
2985 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
2986 assert(inst->dst.file == MRF || inst->dst.file == GRF);
2987 fs_reg dst = inst->dst;
2988
2989 for (int i = 0; i < inst->sources; i++) {
2990 dst.width = inst->src[i].effective_width;
2991 dst.type = inst->src[i].type;
2992
2993 if (inst->src[i].file == BAD_FILE) {
2994 /* Do nothing but otherwise increment as normal */
2995 } else if (dst.file == MRF &&
2996 dst.width == 8 &&
2997 brw->has_compr4 &&
2998 i + 4 < inst->sources &&
2999 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
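               /* Source i+4 is the second half of source i, so both MRF
                * halves can be written with a single compressed COMPR4 MOV.
                */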
3000 fs_reg compr4_dst = dst;
3001 compr4_dst.reg += BRW_MRF_COMPR4;
3002 compr4_dst.width = 16;
3003 fs_reg compr4_src = inst->src[i];
3004 compr4_src.width = 16;
3005 fs_inst *mov = MOV(compr4_dst, compr4_src);
3006 mov->force_writemask_all = true;
3007 inst->insert_before(block, mov);
3008 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3009 inst->src[i + 4].file = BAD_FILE;
3010 } else {
3011 fs_inst *mov = MOV(dst, inst->src[i]);
3012 if (inst->src[i].file == GRF) {
3013 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3014 inst->src[i].reg_offset;
3015 mov->force_sechalf = metadata[src_reg].force_sechalf;
3016 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3017 metadata[dst_reg] = metadata[src_reg];
3018 if (dst.width * type_sz(dst.type) > 32) {
3019 assert((!metadata[src_reg].written ||
3020 !metadata[src_reg].force_sechalf) &&
3021 (!metadata[src_reg + 1].written ||
3022 metadata[src_reg + 1].force_sechalf));
3023 metadata[dst_reg + 1] = metadata[src_reg + 1];
3024 }
3025 } else {
3026 metadata[dst_reg].force_writemask_all = false;
3027 metadata[dst_reg].force_sechalf = false;
3028 if (dst.width == 16) {
3029 metadata[dst_reg + 1].force_writemask_all = false;
3030 metadata[dst_reg + 1].force_sechalf = true;
3031 }
3032 }
3033 inst->insert_before(block, mov);
3034 }
3035
3036 dst = offset(dst, 1);
3037 }
3038
3039 inst->remove(block);
3040 progress = true;
3041 }
3042 }
3043
3044 if (progress)
3045 invalidate_live_intervals();
3046
3047 return progress;
3048 }
3049
3050 void
3051 fs_visitor::dump_instructions()
3052 {
3053 dump_instructions(NULL);
3054 }
3055
3056 void
3057 fs_visitor::dump_instructions(const char *name)
3058 {
3059 calculate_register_pressure();
3060 FILE *file = stderr;
3061 if (name && geteuid() != 0) {
3062 file = fopen(name, "w");
3063 if (!file)
3064 file = stderr;
3065 }
3066
3067 int ip = 0, max_pressure = 0;
3068 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3069 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3070 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3071 dump_instruction(inst, file);
3072 ++ip;
3073 }
3074 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3075
3076 if (file != stderr) {
3077 fclose(file);
3078 }
3079 }
3080
3081 void
3082 fs_visitor::dump_instruction(backend_instruction *be_inst)
3083 {
3084 dump_instruction(be_inst, stderr);
3085 }
3086
3087 void
3088 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3089 {
3090 fs_inst *inst = (fs_inst *)be_inst;
3091
3092 if (inst->predicate) {
3093 fprintf(file, "(%cf0.%d) ",
3094 inst->predicate_inverse ? '-' : '+',
3095 inst->flag_subreg);
3096 }
3097
3098 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3099 if (inst->saturate)
3100 fprintf(file, ".sat");
3101 if (inst->conditional_mod) {
3102 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3103 if (!inst->predicate &&
3104 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3105 inst->opcode != BRW_OPCODE_IF &&
3106 inst->opcode != BRW_OPCODE_WHILE))) {
3107 fprintf(file, ".f0.%d", inst->flag_subreg);
3108 }
3109 }
3110 fprintf(file, "(%d) ", inst->exec_size);
3111
3112
3113 switch (inst->dst.file) {
3114 case GRF:
3115 fprintf(file, "vgrf%d", inst->dst.reg);
3116 if (inst->dst.width != dispatch_width)
3117 fprintf(file, "@%d", inst->dst.width);
3118 if (virtual_grf_sizes[inst->dst.reg] != inst->dst.width / 8 ||
3119 inst->dst.subreg_offset)
3120 fprintf(file, "+%d.%d",
3121 inst->dst.reg_offset, inst->dst.subreg_offset);
3122 break;
3123 case MRF:
3124 fprintf(file, "m%d", inst->dst.reg);
3125 break;
3126 case BAD_FILE:
3127 fprintf(file, "(null)");
3128 break;
3129 case UNIFORM:
3130 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3131 break;
3132 case HW_REG:
3133 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3134 switch (inst->dst.fixed_hw_reg.nr) {
3135 case BRW_ARF_NULL:
3136 fprintf(file, "null");
3137 break;
3138 case BRW_ARF_ADDRESS:
3139 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3140 break;
3141 case BRW_ARF_ACCUMULATOR:
3142 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3143 break;
3144 case BRW_ARF_FLAG:
3145 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3146 inst->dst.fixed_hw_reg.subnr);
3147 break;
3148 default:
3149 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3150 inst->dst.fixed_hw_reg.subnr);
3151 break;
3152 }
3153 } else {
3154 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3155 }
3156 if (inst->dst.fixed_hw_reg.subnr)
3157 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3158 break;
3159 default:
3160 fprintf(file, "???");
3161 break;
3162 }
3163 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3164
3165 for (int i = 0; i < inst->sources; i++) {
3166 if (inst->src[i].negate)
3167 fprintf(file, "-");
3168 if (inst->src[i].abs)
3169 fprintf(file, "|");
3170 switch (inst->src[i].file) {
3171 case GRF:
3172 fprintf(file, "vgrf%d", inst->src[i].reg);
3173 if (inst->src[i].width != dispatch_width)
3174 fprintf(file, "@%d", inst->src[i].width);
3175 if (virtual_grf_sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3176 inst->src[i].subreg_offset)
3177 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3178 inst->src[i].subreg_offset);
3179 break;
3180 case MRF:
3181 fprintf(file, "***m%d***", inst->src[i].reg);
3182 break;
3183 case UNIFORM:
3184 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3185 if (inst->src[i].reladdr) {
3186 fprintf(file, "+reladdr");
3187 } else if (inst->src[i].subreg_offset) {
3188 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3189 inst->src[i].subreg_offset);
3190 }
3191 break;
3192 case BAD_FILE:
3193 fprintf(file, "(null)");
3194 break;
3195 case IMM:
3196 switch (inst->src[i].type) {
3197 case BRW_REGISTER_TYPE_F:
3198 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3199 break;
3200 case BRW_REGISTER_TYPE_D:
3201 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3202 break;
3203 case BRW_REGISTER_TYPE_UD:
3204 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3205 break;
3206 default:
3207 fprintf(file, "???");
3208 break;
3209 }
3210 break;
3211 case HW_REG:
3212 if (inst->src[i].fixed_hw_reg.negate)
3213 fprintf(file, "-");
3214 if (inst->src[i].fixed_hw_reg.abs)
3215 fprintf(file, "|");
3216 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3217 switch (inst->src[i].fixed_hw_reg.nr) {
3218 case BRW_ARF_NULL:
3219 fprintf(file, "null");
3220 break;
3221 case BRW_ARF_ADDRESS:
3222 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3223 break;
3224 case BRW_ARF_ACCUMULATOR:
3225 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3226 break;
3227 case BRW_ARF_FLAG:
3228 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3229 inst->src[i].fixed_hw_reg.subnr);
3230 break;
3231 default:
3232 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3233 inst->src[i].fixed_hw_reg.subnr);
3234 break;
3235 }
3236 } else {
3237 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3238 }
3239 if (inst->src[i].fixed_hw_reg.subnr)
3240 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3241 if (inst->src[i].fixed_hw_reg.abs)
3242 fprintf(file, "|");
3243 break;
3244 default:
3245 fprintf(file, "???");
3246 break;
3247 }
3248 if (inst->src[i].abs)
3249 fprintf(file, "|");
3250
3251 if (inst->src[i].file != IMM) {
3252 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3253 }
3254
3255 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3256 fprintf(file, ", ");
3257 }
3258
3259 fprintf(file, " ");
3260
3261 if (dispatch_width == 16 && inst->exec_size == 8) {
3262 if (inst->force_sechalf)
3263 fprintf(file, "2ndhalf ");
3264 else
3265 fprintf(file, "1sthalf ");
3266 }
3267
3268 fprintf(file, "\n");
3269 }
3270
3271 /**
3272 * Possibly returns an instruction that set up @param reg.
3273 *
3274 * Sometimes we want to take the result of some expression/variable
3275 * dereference tree and rewrite the instruction generating the result
3276 * of the tree. When processing the tree, we know that the
3277 * instructions generated are all writing temporaries that are dead
3278 * outside of this tree. So, if we have some instructions that write
3279 * a temporary, we're free to point that temp write somewhere else.
3280 *
3281  * Note that this doesn't guarantee that the returned instruction wrote
3282  * only reg -- it might be the size=4 destination of a texture instruction.
3283 */
3284 fs_inst *
3285 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3286 fs_inst *end,
3287 const fs_reg &reg)
3288 {
3289 if (end == start ||
3290 end->is_partial_write() ||
3291 reg.reladdr ||
3292 !reg.equals(end->dst)) {
3293 return NULL;
3294 } else {
3295 return end;
3296 }
3297 }
3298
3299 void
3300 fs_visitor::setup_payload_gen6()
3301 {
3302 bool uses_depth =
3303 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3304 unsigned barycentric_interp_modes =
3305 (stage == MESA_SHADER_FRAGMENT) ?
3306 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3307
3308 assert(brw->gen >= 6);
3309
3310 /* R0-1: masks, pixel X/Y coordinates. */
3311 payload.num_regs = 2;
3312    /* R2: only for 32-pixel dispatch. */
3313
3314 /* R3-26: barycentric interpolation coordinates. These appear in the
3315 * same order that they appear in the brw_wm_barycentric_interp_mode
3316 * enum. Each set of coordinates occupies 2 registers if dispatch width
3317 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3318 * appear if they were enabled using the "Barycentric Interpolation
3319 * Mode" bits in WM_STATE.
3320 */
3321 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3322 if (barycentric_interp_modes & (1 << i)) {
3323 payload.barycentric_coord_reg[i] = payload.num_regs;
3324 payload.num_regs += 2;
3325 if (dispatch_width == 16) {
3326 payload.num_regs += 2;
3327 }
3328 }
3329 }
3330
3331 /* R27: interpolated depth if uses source depth */
3332 if (uses_depth) {
3333 payload.source_depth_reg = payload.num_regs;
3334 payload.num_regs++;
3335 if (dispatch_width == 16) {
3336 /* R28: interpolated depth if not SIMD8. */
3337 payload.num_regs++;
3338 }
3339 }
3340 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3341 if (uses_depth) {
3342 payload.source_w_reg = payload.num_regs;
3343 payload.num_regs++;
3344 if (dispatch_width == 16) {
3345 /* R30: interpolated W if not SIMD8. */
3346 payload.num_regs++;
3347 }
3348 }
3349
3350 if (stage == MESA_SHADER_FRAGMENT) {
3351 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3352 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3353 prog_data->uses_pos_offset = key->compute_pos_offset;
3354 /* R31: MSAA position offsets. */
3355 if (prog_data->uses_pos_offset) {
3356 payload.sample_pos_reg = payload.num_regs;
3357 payload.num_regs++;
3358 }
3359 }
3360
3361 /* R32: MSAA input coverage mask */
3362 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3363 assert(brw->gen >= 7);
3364 payload.sample_mask_in_reg = payload.num_regs;
3365 payload.num_regs++;
3366 if (dispatch_width == 16) {
3367 /* R33: input coverage mask if not SIMD8. */
3368 payload.num_regs++;
3369 }
3370 }
3371
3372 /* R34-: bary for 32-pixel. */
3373 /* R58-59: interp W for 32-pixel. */
3374
3375 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3376 source_depth_to_render_target = true;
3377 }
3378 }
3379
3380 void
3381 fs_visitor::assign_binding_table_offsets()
3382 {
3383 assert(stage == MESA_SHADER_FRAGMENT);
3384 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3385 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3386 uint32_t next_binding_table_offset = 0;
3387
3388 /* If there are no color regions, we still perform an FB write to a null
3389 * renderbuffer, which we place at surface index 0.
3390 */
3391 prog_data->binding_table.render_target_start = next_binding_table_offset;
3392 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3393
3394 assign_common_binding_table_offsets(next_binding_table_offset);
3395 }
3396
3397 void
3398 fs_visitor::calculate_register_pressure()
3399 {
3400 invalidate_live_intervals();
3401 calculate_live_intervals();
3402
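   /* regs_live_at_ip[] is indexed by the same ip numbering the live interval
    * analysis uses, so size it by counting every instruction in the CFG.
    */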
3403 unsigned num_instructions = 0;
3404 foreach_block(block, cfg)
3405 num_instructions += block->instructions.length();
3406
3407 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3408
3409 for (int reg = 0; reg < virtual_grf_count; reg++) {
3410 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3411 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3412 }
3413 }
3414
3415 /**
3416 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3417 *
3418 * The needs_unlit_centroid_workaround ends up producing one of these per
3419 * channel of centroid input, so it's good to clean them up.
3420 *
3421 * An assumption here is that nothing ever modifies the dispatched pixels
3422 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3423 * dictates that anyway.
3424 */
3425 void
3426 fs_visitor::opt_drop_redundant_mov_to_flags()
3427 {
3428 bool flag_mov_found[2] = {false};
3429
3430 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3431 if (inst->is_control_flow()) {
3432 memset(flag_mov_found, 0, sizeof(flag_mov_found));
3433 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3434 if (!flag_mov_found[inst->flag_subreg])
3435 flag_mov_found[inst->flag_subreg] = true;
3436 else
3437 inst->remove(block);
3438 } else if (inst->writes_flag()) {
3439 flag_mov_found[inst->flag_subreg] = false;
3440 }
3441 }
3442 }
3443
3444 bool
3445 fs_visitor::run()
3446 {
3447 sanity_param_count = prog->Parameters->NumParameters;
3448 bool allocated_without_spills;
3449
3450 assign_binding_table_offsets();
3451
3452 if (brw->gen >= 6)
3453 setup_payload_gen6();
3454 else
3455 setup_payload_gen4();
3456
3457 if (0) {
3458 emit_dummy_fs();
3459 } else if (brw->use_rep_send && dispatch_width == 16) {
3460 emit_repclear_shader();
3461 allocated_without_spills = true;
3462 } else {
3463 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3464 emit_shader_time_begin();
3465
3466 calculate_urb_setup();
3467 if (prog->InputsRead > 0) {
3468 if (brw->gen < 6)
3469 emit_interpolation_setup_gen4();
3470 else
3471 emit_interpolation_setup_gen6();
3472 }
3473
3474 /* We handle discards by keeping track of the still-live pixels in f0.1.
3475 * Initialize it with the dispatched pixels.
3476 */
3477 bool uses_kill =
3478 (stage == MESA_SHADER_FRAGMENT) &&
3479 ((brw_wm_prog_data*) this->prog_data)->uses_kill;
3480 bool alpha_test_func =
3481 (stage == MESA_SHADER_FRAGMENT) &&
3482 ((brw_wm_prog_key*) this->key)->alpha_test_func;
3483 if (uses_kill || alpha_test_func) {
3484 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3485 discard_init->flag_subreg = 1;
3486 }
3487
3488 /* Generate FS IR for main(). (the visitor only descends into
3489 * functions called "main").
3490 */
3491 if (shader) {
3492 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3493 base_ir = ir;
3494 this->result = reg_undef;
3495 ir->accept(this);
3496 }
3497 } else {
3498 emit_fragment_program_code();
3499 }
3500 base_ir = NULL;
3501 if (failed)
3502 return false;
3503
3504 emit(FS_OPCODE_PLACEHOLDER_HALT);
3505
3506 if (alpha_test_func)
3507 emit_alpha_test();
3508
3509 emit_fb_writes();
3510
3511 calculate_cfg();
3512
3513 split_virtual_grfs();
3514
3515 move_uniform_array_access_to_pull_constants();
3516 assign_constant_locations();
3517 demote_pull_constants();
3518
3519 opt_drop_redundant_mov_to_flags();
3520
3521 #define OPT(pass, args...) do { \
3522 pass_num++; \
3523 bool this_progress = pass(args); \
3524 \
3525 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3526 char filename[64]; \
3527 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3528 dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3529 \
3530 backend_visitor::dump_instructions(filename); \
3531 } \
3532 \
3533 progress = progress || this_progress; \
3534 } while (false)
3535
3536 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3537 char filename[64];
3538 snprintf(filename, 64, "fs%d-%04d-00-start",
3539 dispatch_width, shader_prog ? shader_prog->Name : 0);
3540
3541 backend_visitor::dump_instructions(filename);
3542 }
3543
3544 bool progress;
3545 int iteration = 0;
3546 do {
3547 progress = false;
3548 iteration++;
3549 int pass_num = 0;
3550
3551 OPT(remove_duplicate_mrf_writes);
3552
3553 OPT(opt_algebraic);
3554 OPT(opt_cse);
3555 OPT(opt_copy_propagate);
3556 OPT(opt_peephole_predicated_break);
3557 OPT(dead_code_eliminate);
3558 OPT(opt_peephole_sel);
3559 OPT(dead_control_flow_eliminate, this);
3560 OPT(opt_register_renaming);
3561 OPT(opt_saturate_propagation);
3562 OPT(register_coalesce);
3563 OPT(compute_to_mrf);
3564
3565 OPT(compact_virtual_grfs);
3566 } while (progress);
3567
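      /* Lowering LOAD_PAYLOAD produces plain MOVs, so give the cleanup passes
       * another shot at the result.
       */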
3568 if (lower_load_payload()) {
3569 split_virtual_grfs();
3570 register_coalesce();
3571 compute_to_mrf();
3572 dead_code_eliminate();
3573 }
3574
3575 lower_uniform_pull_constant_loads();
3576
3577 assign_curb_setup();
3578 assign_urb_setup();
3579
3580 static enum instruction_scheduler_mode pre_modes[] = {
3581 SCHEDULE_PRE,
3582 SCHEDULE_PRE_NON_LIFO,
3583 SCHEDULE_PRE_LIFO,
3584 };
3585
3586 /* Try each scheduling heuristic to see if it can successfully register
3587 * allocate without spilling. They should be ordered by decreasing
3588 * performance but increasing likelihood of allocating.
3589 */
3590 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3591 schedule_instructions(pre_modes[i]);
3592
3593 if (0) {
3594 assign_regs_trivial();
3595 allocated_without_spills = true;
3596 } else {
3597 allocated_without_spills = assign_regs(false);
3598 }
3599 if (allocated_without_spills)
3600 break;
3601 }
3602
3603 if (!allocated_without_spills) {
3604 /* We assume that any spilling is worse than just dropping back to
3605 * SIMD8. There's probably actually some intermediate point where
3606 * SIMD16 with a couple of spills is still better.
3607 */
3608 if (dispatch_width == 16) {
3609 fail("Failure to register allocate. Reduce number of "
3610 "live scalar values to avoid this.");
3611 } else {
3612 perf_debug("Fragment shader triggered register spilling. "
3613 "Try reducing the number of live scalar values to "
3614 "improve performance.\n");
3615 }
3616
3617 /* Since we're out of heuristics, just go spill registers until we
3618 * get an allocation.
3619 */
3620 while (!assign_regs(true)) {
3621 if (failed)
3622 break;
3623 }
3624 }
3625 }
3626 assert(force_uncompressed_stack == 0);
3627
3628 /* This must come after all optimization and register allocation, since
3629 * it inserts dead code that happens to have side effects, and it does
3630 * so based on the actual physical registers in use.
3631 */
3632 insert_gen4_send_dependency_workarounds();
3633
3634 if (failed)
3635 return false;
3636
3637 if (!allocated_without_spills)
3638 schedule_instructions(SCHEDULE_POST);
3639
3640 if (last_scratch > 0) {
3641 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3642 }
3643
3644 if (stage == MESA_SHADER_FRAGMENT) {
3645 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3646 if (dispatch_width == 8)
3647 prog_data->reg_blocks = brw_register_blocks(grf_used);
3648 else
3649 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3650 }
3651
3652 /* If any state parameters were appended, then ParameterValues could have
3653 * been realloced, in which case the driver uniform storage set up by
3654 * _mesa_associate_uniform_storage() would point to freed memory. Make
3655 * sure that didn't happen.
3656 */
3657 assert(sanity_param_count == prog->Parameters->NumParameters);
3658
3659 return !failed;
3660 }
3661
3662 const unsigned *
3663 brw_wm_fs_emit(struct brw_context *brw,
3664 void *mem_ctx,
3665 const struct brw_wm_prog_key *key,
3666 struct brw_wm_prog_data *prog_data,
3667 struct gl_fragment_program *fp,
3668 struct gl_shader_program *prog,
3669 unsigned *final_assembly_size)
3670 {
3671 bool start_busy = false;
3672 double start_time = 0;
3673
3674 if (unlikely(brw->perf_debug)) {
3675 start_busy = (brw->batch.last_bo &&
3676 drm_intel_bo_busy(brw->batch.last_bo));
3677 start_time = get_time();
3678 }
3679
3680 struct brw_shader *shader = NULL;
3681 if (prog)
3682 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3683
3684 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3685 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3686
3687 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3688 */
3689 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3690 if (!v.run()) {
3691 if (prog) {
3692 prog->LinkStatus = false;
3693 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3694 }
3695
3696 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3697 v.fail_msg);
3698
3699 return NULL;
3700 }
3701
   cfg_t *simd16_cfg = NULL;
   fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
   if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
                               brw->use_rep_send)) {
      if (!v.simd16_unsupported) {
         /* Try a SIMD16 compile */
         v2.import_uniforms(&v);
         if (!v2.run()) {
            perf_debug("SIMD16 shader failed to compile, falling back to "
                       "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
         } else {
            simd16_cfg = v2.cfg;
         }
      } else {
         perf_debug("SIMD16 shader unsupported, falling back to "
                    "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
      }
   }

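   /* Decide which programs to hand to the generator. INTEL_DEBUG=no8 (or the
    * per-context no_simd8 flag) drops the SIMD8 program, but only when a
    * SIMD16 program actually exists to take its place.
    */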
   cfg_t *simd8_cfg;
   int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
   if (no_simd8 && simd16_cfg) {
      simd8_cfg = NULL;
      prog_data->no_8 = true;
   } else {
      simd8_cfg = v.cfg;
      prog_data->no_8 = false;
   }

   const unsigned *assembly = NULL;
   fs_generator g(brw, mem_ctx, key, prog_data, prog, fp,
                  v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
   assembly = g.generate_assembly(simd8_cfg, simd16_cfg,
                                  final_assembly_size);

   if (unlikely(brw->perf_debug) && shader) {
      if (shader->compiled_once)
         brw_wm_debug_recompile(brw, prog, key);
      shader->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   return assembly;
}

bool
brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
{
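   /* Precompile at link time using a guessed program key: fill in the most
    * likely non-default state, so that the real compile at draw time is
    * usually just a program cache hit.
    */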
   struct brw_context *brw = brw_context(ctx);
   struct brw_wm_prog_key key;

   if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
      return true;

   struct gl_fragment_program *fp = (struct gl_fragment_program *)
      prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
   struct brw_fragment_program *bfp = brw_fragment_program(fp);
   bool program_uses_dfdy = fp->UsesDFdy;

   memset(&key, 0, sizeof(key));

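   /* On pre-gen6 hardware the depth/stencil-related dispatch state is baked
    * into the program key via the iz_lookup bits; assume the common case of
    * depth testing and depth writes enabled.
    */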
   if (brw->gen < 6) {
      if (fp->UsesKill)
         key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;

      if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
         key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;

      /* Just assume depth testing. */
      key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
      key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
   }

   if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
                                         BRW_FS_VARYING_INPUT_MASK) > 16)
      key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;

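   /* _mesa_fls() returns the 1-based index of the highest set bit, so this is
    * the number of sampler slots we need to look at for shadow swizzles.
    */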
   unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
   for (unsigned i = 0; i < sampler_count; i++) {
      if (fp->Base.ShadowSamplers & (1 << i)) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         key.tex.swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         key.tex.swizzles[i] = SWIZZLE_XYZW;
      }
   }

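   /* gl_FragCoord (and dFdy) depend on the window origin and height, so guess
    * based on the current drawable; a wrong guess just means a recompile at
    * draw time rather than incorrect rendering.
    */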
   if (fp->Base.InputsRead & VARYING_BIT_POS) {
      key.drawable_height = ctx->DrawBuffer->Height;
   }

   key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
         ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
           BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));

   if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
      key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
                          key.nr_color_regions > 1;
   }

   /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
    * quality of the derivatives is likely to be determined by the driconf
    * option.
    */
   key.high_quality_derivatives = brw->disable_derivative_optimization;

   key.program_string_id = bfp->id;

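   /* do_wm_prog() installs its result as the currently-bound WM program
    * (prog_offset and prog_data), so save that state here and restore it
    * after the precompile.
    */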
   uint32_t old_prog_offset = brw->wm.base.prog_offset;
   struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;

   bool success = do_wm_prog(brw, prog, bfp, &key);

   brw->wm.base.prog_offset = old_prog_offset;
   brw->wm.prog_data = old_prog_data;

   return success;
}