i965: Generalize fs_generator further
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_cfg.h"
50 #include "brw_dead_control_flow.h"
51 #include "main/uniforms.h"
52 #include "brw_fs_live_variables.h"
53 #include "glsl/glsl_types.h"
54
55 void
56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
57 fs_reg *src, int sources)
58 {
59 memset(this, 0, sizeof(*this));
60
61 this->opcode = opcode;
62 this->dst = dst;
63 this->src = src;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (int i = 0; i < sources; ++i) {
79 if (src[i].file != GRF)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (int i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 assert(this->src[i].width > 0);
101 if (this->src[i].width == 1) {
102 this->src[i].effective_width = this->exec_size;
103 } else {
104 this->src[i].effective_width = this->src[i].width;
105 }
106 break;
107 case IMM:
108 case UNIFORM:
109 this->src[i].effective_width = this->exec_size;
110 break;
111 default:
112 unreachable("Invalid source register file");
113 }
114 }
115 this->dst.effective_width = this->exec_size;
116
117 this->conditional_mod = BRW_CONDITIONAL_NONE;
118
119 /* This will be the case for almost all instructions. */
120 switch (dst.file) {
121 case GRF:
122 case HW_REG:
123 case MRF:
124 this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
125 break;
126 case BAD_FILE:
127 this->regs_written = 0;
128 break;
129 case IMM:
130 case UNIFORM:
131 unreachable("Invalid destination register file");
132 default:
133 unreachable("Invalid register file");
134 }
135
136 this->writes_accumulator = false;
137 }
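/* Worked example (added for clarity, not in the original source): constructing
 * a MOV with exec_size == 0 and a width-16, stride-1 float GRF destination
 * infers exec_size = 16 from the destination, and regs_written becomes
 * (16 * 1 * 4 + 31) / 32 = 2 GRFs.
 */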
138
139 fs_inst::fs_inst()
140 {
141 fs_reg *src = ralloc_array(this, fs_reg, 3);
142 init(BRW_OPCODE_NOP, 8, dst, src, 0);
143 }
144
145 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
146 {
147 fs_reg *src = ralloc_array(this, fs_reg, 3);
148 init(opcode, exec_size, reg_undef, src, 0);
149 }
150
151 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
152 {
153 fs_reg *src = ralloc_array(this, fs_reg, 3);
154 init(opcode, 0, dst, src, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 fs_reg *src = ralloc_array(this, fs_reg, 3);
161 src[0] = src0;
162 init(opcode, exec_size, dst, src, 1);
163 }
164
165 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
166 {
167 fs_reg *src = ralloc_array(this, fs_reg, 3);
168 src[0] = src0;
169 init(opcode, 0, dst, src, 1);
170 }
171
172 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
173 const fs_reg &src0, const fs_reg &src1)
174 {
175 fs_reg *src = ralloc_array(this, fs_reg, 3);
176 src[0] = src0;
177 src[1] = src1;
178 init(opcode, exec_size, dst, src, 2);
179 }
180
181 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
182 const fs_reg &src1)
183 {
184 fs_reg *src = ralloc_array(this, fs_reg, 3);
185 src[0] = src0;
186 src[1] = src1;
187 init(opcode, 0, dst, src, 2);
188 }
189
190 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
191 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
192 {
193 fs_reg *src = ralloc_array(this, fs_reg, 3);
194 src[0] = src0;
195 src[1] = src1;
196 src[2] = src2;
197 init(opcode, exec_size, dst, src, 3);
198 }
199
200 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
201 const fs_reg &src1, const fs_reg &src2)
202 {
203 fs_reg *src = ralloc_array(this, fs_reg, 3);
204 src[0] = src0;
205 src[1] = src1;
206 src[2] = src2;
207 init(opcode, 0, dst, src, 3);
208 }
209
210 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
211 {
212 init(opcode, 0, dst, src, sources);
213 }
214
215 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
216 fs_reg src[], int sources)
217 {
218 init(opcode, exec_width, dst, src, sources);
219 }
220
221 fs_inst::fs_inst(const fs_inst &that)
222 {
223 memcpy(this, &that, sizeof(that));
224
225 this->src = ralloc_array(this, fs_reg, that.sources);
226
227 for (int i = 0; i < that.sources; i++)
228 this->src[i] = that.src[i];
229 }
230
231 void
232 fs_inst::resize_sources(uint8_t num_sources)
233 {
234 if (this->sources != num_sources) {
235 this->src = reralloc(this, this->src, fs_reg, num_sources);
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
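/* Note (added; not in the original source): each ALUn(op) line above expands
 * to an fs_visitor helper, e.g. ALU2(ADD) defines fs_visitor::ADD(dst, src0,
 * src1), which allocates a new fs_inst(BRW_OPCODE_ADD, ...) out of mem_ctx
 * but does not emit it -- callers pass the result to emit().
 */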
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(brw->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 * gen5 does the comparison on the execution type (resolved source types),
341 * so dst type doesn't matter. gen6 does comparison and then uses the
342 * result as if it was the dst type with no conversion, which happens to
343 * mostly work out for float-interpreted-as-int since our comparisons are
344 * for >0, =0, <0.
345 */
346 if (brw->gen == 4) {
347 dst.type = src0.type;
348 if (dst.file == HW_REG)
349 dst.fixed_hw_reg.type = dst.type;
350 }
351
352 resolve_ud_negate(&src0);
353 resolve_ud_negate(&src1);
354
355 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
356 inst->conditional_mod = condition;
357
358 return inst;
359 }
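/* Usage sketch (illustrative, not from the original source; a and b are
 * stand-in fs_regs):
 *
 *    emit(CMP(reg_null_d, a, b, BRW_CONDITIONAL_GE));
 *    emit(IF(BRW_PREDICATE_NORMAL));
 *    ...
 *    emit(BRW_OPCODE_ENDIF);
 *
 * The CMP updates the flag register, and the IF then predicates on it.
 */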
360
361 fs_inst *
362 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
363 {
364 uint8_t exec_size = dst.width;
365 for (int i = 0; i < sources; ++i) {
366 assert(src[i].width % dst.width == 0);
367 if (src[i].width > exec_size)
368 exec_size = src[i].width;
369 }
370
371 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
372 dst, src, sources);
373 inst->regs_written = 0;
374 for (int i = 0; i < sources; ++i) {
375 /* The LOAD_PAYLOAD instruction only really makes sense if we are
376 * dealing with whole registers. If this ever changes, we can deal
377 * with it later.
378 */
379 int size = src[i].effective_width * type_sz(src[i].type);
380 assert(size % 32 == 0);
381 inst->regs_written += (size + 31) / 32;
382 }
383
384 return inst;
385 }
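/* Example (added, illustrative): with a width-16 destination and two width-16
 * float sources, exec_size stays 16 and regs_written is
 * (16 * 4) / 32 + (16 * 4) / 32 = 4, i.e. the payload fills four GRFs.
 */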
386
387 exec_list
388 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
389 const fs_reg &surf_index,
390 const fs_reg &varying_offset,
391 uint32_t const_offset)
392 {
393 exec_list instructions;
394 fs_inst *inst;
395
396 /* We have our constant surface use a pitch of 4 bytes, so our index can
397 * be any component of a vector, and then we load 4 contiguous
398 * components starting from that.
399 *
400 * We break down the const_offset to a portion added to the variable
401 * offset and a portion done using reg_offset, which means that if you
402 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
403 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
404 * CSE can later notice that those loads are all the same and eliminate
405 * the redundant ones.
406 */
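   /* For instance (illustrative numbers): with const_offset == 22 the ADD
    * below uses 22 & ~3 == 20, and the final MOV reads component
    * (22 & 3) == 2 of the vec4 result (times `scale' on gen4 SIMD8).
    */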
407 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
408 instructions.push_tail(ADD(vec4_offset,
409 varying_offset, fs_reg(const_offset & ~3)));
410
411 int scale = 1;
412 if (brw->gen == 4 && dst.width == 8) {
413 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
414 * u, v, r) as parameters, or we can just use the SIMD16 message
415 * consisting of (header, u). We choose the second, at the cost of a
416 * longer return length.
417 */
418 scale = 2;
419 }
420
421 enum opcode op;
422 if (brw->gen >= 7)
423 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
424 else
425 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
426
427 assert(dst.width % 8 == 0);
428 int regs_written = 4 * (dst.width / 8) * scale;
429 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
430 dst.type, dst.width);
431 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
432 inst->regs_written = regs_written;
433 instructions.push_tail(inst);
434
435 if (brw->gen < 7) {
436 inst->base_mrf = 13;
437 inst->header_present = true;
438 if (brw->gen == 4)
439 inst->mlen = 3;
440 else
441 inst->mlen = 1 + dispatch_width / 8;
442 }
443
444 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
445 instructions.push_tail(MOV(dst, result));
446
447 return instructions;
448 }
449
450 /**
451 * A helper for MOV generation for fixing up broken hardware SEND dependency
452 * handling.
453 */
454 fs_inst *
455 fs_visitor::DEP_RESOLVE_MOV(int grf)
456 {
457 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
458
459 inst->ir = NULL;
460 inst->annotation = "send dependency resolve";
461
462 /* The caller always wants uncompressed to emit the minimal extra
463 * dependencies, and to avoid having to deal with aligning its regs to 2.
464 */
465 inst->exec_size = 8;
466
467 return inst;
468 }
469
470 bool
471 fs_inst::equals(fs_inst *inst) const
472 {
473 return (opcode == inst->opcode &&
474 dst.equals(inst->dst) &&
475 src[0].equals(inst->src[0]) &&
476 src[1].equals(inst->src[1]) &&
477 src[2].equals(inst->src[2]) &&
478 saturate == inst->saturate &&
479 predicate == inst->predicate &&
480 conditional_mod == inst->conditional_mod &&
481 mlen == inst->mlen &&
482 base_mrf == inst->base_mrf &&
483 target == inst->target &&
484 eot == inst->eot &&
485 header_present == inst->header_present &&
486 shadow_compare == inst->shadow_compare &&
487 exec_size == inst->exec_size &&
488 offset == inst->offset);
489 }
490
491 bool
492 fs_inst::overwrites_reg(const fs_reg &reg) const
493 {
494 return (reg.file == dst.file &&
495 reg.reg == dst.reg &&
496 reg.reg_offset >= dst.reg_offset &&
497 reg.reg_offset < dst.reg_offset + regs_written);
498 }
499
500 bool
501 fs_inst::is_send_from_grf() const
502 {
503 switch (opcode) {
504 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
505 case SHADER_OPCODE_SHADER_TIME_ADD:
506 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
507 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
508 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
509 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
510 case SHADER_OPCODE_UNTYPED_ATOMIC:
511 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
512 return true;
513 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
514 return src[1].file == GRF;
515 case FS_OPCODE_FB_WRITE:
516 return src[0].file == GRF;
517 default:
518 if (is_tex())
519 return src[0].file == GRF;
520
521 return false;
522 }
523 }
524
525 bool
526 fs_inst::can_do_source_mods(struct brw_context *brw)
527 {
528 if (brw->gen == 6 && is_math())
529 return false;
530
531 if (is_send_from_grf())
532 return false;
533
534 if (!backend_instruction::can_do_source_mods())
535 return false;
536
537 return true;
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Vector float immediate value constructor. */
585 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
586 {
587 init();
588 this->file = IMM;
589 this->type = BRW_REGISTER_TYPE_VF;
590 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
591 (vf1 << 8) |
592 (vf2 << 16) |
593 (vf3 << 24);
594 }
595
596 /** Fixed brw_reg. */
597 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
598 {
599 init();
600 this->file = HW_REG;
601 this->fixed_hw_reg = fixed_hw_reg;
602 this->type = fixed_hw_reg.type;
603 this->width = 1 << fixed_hw_reg.width;
604 }
605
606 bool
607 fs_reg::equals(const fs_reg &r) const
608 {
609 return (file == r.file &&
610 reg == r.reg &&
611 reg_offset == r.reg_offset &&
612 subreg_offset == r.subreg_offset &&
613 type == r.type &&
614 negate == r.negate &&
615 abs == r.abs &&
616 !reladdr && !r.reladdr &&
617 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
618 width == r.width &&
619 stride == r.stride);
620 }
621
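/* Descriptive note (added): set_smear() restricts the register to a single
 * component -- it offsets to component `subreg' and forces stride 0 so every
 * channel reads the same dword.  get_timestamp() below uses it to pick out
 * individual timestamp fields.
 */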
622 fs_reg &
623 fs_reg::set_smear(unsigned subreg)
624 {
625 assert(file != HW_REG && file != IMM);
626 subreg_offset = subreg * type_sz(type);
627 stride = 0;
628 return *this;
629 }
630
631 bool
632 fs_reg::is_contiguous() const
633 {
634 return stride == 1;
635 }
636
637 int
638 fs_visitor::type_size(const struct glsl_type *type)
639 {
640 unsigned int size, i;
641
642 switch (type->base_type) {
643 case GLSL_TYPE_UINT:
644 case GLSL_TYPE_INT:
645 case GLSL_TYPE_FLOAT:
646 case GLSL_TYPE_BOOL:
647 return type->components();
648 case GLSL_TYPE_ARRAY:
649 return type_size(type->fields.array) * type->length;
650 case GLSL_TYPE_STRUCT:
651 size = 0;
652 for (i = 0; i < type->length; i++) {
653 size += type_size(type->fields.structure[i].type);
654 }
655 return size;
656 case GLSL_TYPE_SAMPLER:
657 /* Samplers take up no register space, since they're baked in at
658 * link time.
659 */
660 return 0;
661 case GLSL_TYPE_ATOMIC_UINT:
662 return 0;
663 case GLSL_TYPE_IMAGE:
664 case GLSL_TYPE_VOID:
665 case GLSL_TYPE_ERROR:
666 case GLSL_TYPE_INTERFACE:
667 unreachable("not reached");
668 }
669
670 return 0;
671 }
672
673 fs_reg
674 fs_visitor::get_timestamp()
675 {
676 assert(brw->gen >= 7);
677
678 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
679 BRW_ARF_TIMESTAMP,
680 0),
681 BRW_REGISTER_TYPE_UD));
682
683 fs_reg dst = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 4);
684
685 fs_inst *mov = emit(MOV(dst, ts));
686 /* We want to read the 3 fields we care about even if it's not enabled in
687 * the dispatch.
688 */
689 mov->force_writemask_all = true;
690
691 /* The caller wants the low 32 bits of the timestamp. Since it's running
692    * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
693 * which is plenty of time for our purposes. It is identical across the
694 * EUs, but since it's tracking GPU core speed it will increment at a
695 * varying rate as render P-states change.
696 *
697 * The caller could also check if render P-states have changed (or anything
698 * else that might disrupt timing) by setting smear to 2 and checking if
699 * that field is != 0.
700 */
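   /* Added note: 2^32 cycles / ~1.2e9 cycles per second is roughly 3.6
    * seconds, hence the ~3 second rollover mentioned above.
    */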
701 dst.set_smear(0);
702
703 return dst;
704 }
705
706 void
707 fs_visitor::emit_shader_time_begin()
708 {
709 current_annotation = "shader time start";
710 shader_start_time = get_timestamp();
711 }
712
713 void
714 fs_visitor::emit_shader_time_end()
715 {
716 current_annotation = "shader time end";
717
718 enum shader_time_shader_type type, written_type, reset_type;
719 if (dispatch_width == 8) {
720 type = ST_FS8;
721 written_type = ST_FS8_WRITTEN;
722 reset_type = ST_FS8_RESET;
723 } else {
724 assert(dispatch_width == 16);
725 type = ST_FS16;
726 written_type = ST_FS16_WRITTEN;
727 reset_type = ST_FS16_RESET;
728 }
729
730 fs_reg shader_end_time = get_timestamp();
731
732 /* Check that there weren't any timestamp reset events (assuming these
733 * were the only two timestamp reads that happened).
734 */
735 fs_reg reset = shader_end_time;
736 reset.set_smear(2);
737 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
738 test->conditional_mod = BRW_CONDITIONAL_Z;
739 emit(IF(BRW_PREDICATE_NORMAL));
740
741 fs_reg start = shader_start_time;
742 start.negate = true;
743 fs_reg diff = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 1);
744 emit(ADD(diff, start, shader_end_time));
745
746 /* If there were no instructions between the two timestamp gets, the diff
747    * is 2 cycles.  Subtract that overhead so it doesn't skew measurements of
748    * the time taken by individual instructions.
749 */
750 emit(ADD(diff, diff, fs_reg(-2u)));
751
752 emit_shader_time_write(type, diff);
753 emit_shader_time_write(written_type, fs_reg(1u));
754 emit(BRW_OPCODE_ELSE);
755 emit_shader_time_write(reset_type, fs_reg(1u));
756 emit(BRW_OPCODE_ENDIF);
757 }
758
759 void
760 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
761 fs_reg value)
762 {
763 int shader_time_index =
764 brw_get_shader_time_index(brw, shader_prog, prog, type);
765 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
766
767 fs_reg payload;
768 if (dispatch_width == 8)
769 payload = fs_reg(this, glsl_type::uvec2_type);
770 else
771 payload = fs_reg(this, glsl_type::uint_type);
772
773 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
774 fs_reg(), payload, offset, value));
775 }
776
777 void
778 fs_visitor::vfail(const char *format, va_list va)
779 {
780 char *msg;
781
782 if (failed)
783 return;
784
785 failed = true;
786
787 msg = ralloc_vasprintf(mem_ctx, format, va);
788 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
789
790 this->fail_msg = msg;
791
792 if (INTEL_DEBUG & DEBUG_WM) {
793 fprintf(stderr, "%s", msg);
794 }
795 }
796
797 void
798 fs_visitor::fail(const char *format, ...)
799 {
800 va_list va;
801
802 va_start(va, format);
803 vfail(format, va);
804 va_end(va);
805 }
806
807 /**
808 * Mark this program as impossible to compile in SIMD16 mode.
809 *
810 * During the SIMD8 compile (which happens first), we can detect and flag
811 * things that are unsupported in SIMD16 mode, so the compiler can skip
812 * the SIMD16 compile altogether.
813 *
814 * During a SIMD16 compile (if one happens anyway), this just calls fail().
815 */
816 void
817 fs_visitor::no16(const char *format, ...)
818 {
819 va_list va;
820
821 va_start(va, format);
822
823 if (dispatch_width == 16) {
824 vfail(format, va);
825 } else {
826 simd16_unsupported = true;
827
828 if (brw->perf_debug) {
829 if (no16_msg)
830 ralloc_vasprintf_append(&no16_msg, format, va);
831 else
832 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
833 }
834 }
835
836 va_end(va);
837 }
838
839 fs_inst *
840 fs_visitor::emit(enum opcode opcode)
841 {
842 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
843 }
844
845 fs_inst *
846 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
847 {
848 return emit(new(mem_ctx) fs_inst(opcode, dst));
849 }
850
851 fs_inst *
852 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
853 {
854 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
855 }
856
857 fs_inst *
858 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
859 const fs_reg &src1)
860 {
861 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
862 }
863
864 fs_inst *
865 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
866 const fs_reg &src1, const fs_reg &src2)
867 {
868 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
869 }
870
871 fs_inst *
872 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
873 fs_reg src[], int sources)
874 {
875 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
876 }
877
878 /**
879 * Returns true if the instruction has a flag that means it won't
880 * update an entire destination register.
881 *
882 * For example, dead code elimination and live variable analysis want to know
883 * when a write to a variable screens off any preceding values that were in
884 * it.
885 */
886 bool
887 fs_inst::is_partial_write() const
888 {
889 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
890 (this->dst.width * type_sz(this->dst.type)) < 32 ||
891 !this->dst.is_contiguous());
892 }
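/* Examples (added, illustrative): a MOV predicated on f0.0, a write to a
 * width-4 float destination (16 bytes < 32), or a destination with stride 2
 * all count as partial writes; an unpredicated SIMD8 float write with
 * stride 1 does not.
 */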
893
894 int
895 fs_inst::regs_read(fs_visitor *v, int arg) const
896 {
897 if (is_tex() && arg == 0 && src[0].file == GRF) {
898 return mlen;
899 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
900 return mlen;
901 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
902 return mlen;
903 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
904 return mlen;
905 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
906 return mlen;
907 }
908
909 switch (src[arg].file) {
910 case BAD_FILE:
911 case UNIFORM:
912 case IMM:
913 return 1;
914 case GRF:
915 case HW_REG:
916 if (src[arg].stride == 0) {
917 return 1;
918 } else {
919 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
920 return (size + 31) / 32;
921 }
922 case MRF:
923 unreachable("MRF registers are not allowed as sources");
924 default:
925 unreachable("Invalid register file");
926 }
927 }
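/* Example (added, illustrative): a width-16, stride-1 float GRF source reads
 * (16 * 1 * 4 + 31) / 32 = 2 registers, while any stride-0 (scalar) source
 * counts as a single register.
 */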
928
929 bool
930 fs_inst::reads_flag() const
931 {
932 return predicate;
933 }
934
935 bool
936 fs_inst::writes_flag() const
937 {
938 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
939 opcode != BRW_OPCODE_IF &&
940 opcode != BRW_OPCODE_WHILE)) ||
941 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
942 }
943
944 /**
945 * Returns how many MRFs an FS opcode will write over.
946 *
947 * Note that this is not the 0 or 1 implied writes in an actual gen
948 * instruction -- the FS opcodes often generate MOVs in addition.
949 */
950 int
951 fs_visitor::implied_mrf_writes(fs_inst *inst)
952 {
953 if (inst->mlen == 0)
954 return 0;
955
956 if (inst->base_mrf == -1)
957 return 0;
958
959 switch (inst->opcode) {
960 case SHADER_OPCODE_RCP:
961 case SHADER_OPCODE_RSQ:
962 case SHADER_OPCODE_SQRT:
963 case SHADER_OPCODE_EXP2:
964 case SHADER_OPCODE_LOG2:
965 case SHADER_OPCODE_SIN:
966 case SHADER_OPCODE_COS:
967 return 1 * dispatch_width / 8;
968 case SHADER_OPCODE_POW:
969 case SHADER_OPCODE_INT_QUOTIENT:
970 case SHADER_OPCODE_INT_REMAINDER:
971 return 2 * dispatch_width / 8;
972 case SHADER_OPCODE_TEX:
973 case FS_OPCODE_TXB:
974 case SHADER_OPCODE_TXD:
975 case SHADER_OPCODE_TXF:
976 case SHADER_OPCODE_TXF_CMS:
977 case SHADER_OPCODE_TXF_MCS:
978 case SHADER_OPCODE_TG4:
979 case SHADER_OPCODE_TG4_OFFSET:
980 case SHADER_OPCODE_TXL:
981 case SHADER_OPCODE_TXS:
982 case SHADER_OPCODE_LOD:
983 return 1;
984 case FS_OPCODE_FB_WRITE:
985 return 2;
986 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
987 case SHADER_OPCODE_GEN4_SCRATCH_READ:
988 return 1;
989 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
990 return inst->mlen;
991 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
992 return 2;
993 case SHADER_OPCODE_UNTYPED_ATOMIC:
994 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
995 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
996 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
997 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
998 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
999 return 0;
1000 default:
1001 unreachable("not reached");
1002 }
1003 }
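/* Example (added, illustrative): a SIMD16 SHADER_OPCODE_POW on Gen4/5, whose
 * payload lives in MRFs, reports 2 * 16 / 8 = 4 MRF registers written.
 */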
1004
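/** Allocate a new virtual GRF of the given size (in registers) and return
 *  its index.  (Descriptive comment added: the size array below grows by
 *  doubling as needed.)
 */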
1005 int
1006 fs_visitor::virtual_grf_alloc(int size)
1007 {
1008 if (virtual_grf_array_size <= virtual_grf_count) {
1009 if (virtual_grf_array_size == 0)
1010 virtual_grf_array_size = 16;
1011 else
1012 virtual_grf_array_size *= 2;
1013 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
1014 virtual_grf_array_size);
1015 }
1016 virtual_grf_sizes[virtual_grf_count] = size;
1017 return virtual_grf_count++;
1018 }
1019
1020 /** Fixed HW reg constructor. */
1021 fs_reg::fs_reg(enum register_file file, int reg)
1022 {
1023 init();
1024 this->file = file;
1025 this->reg = reg;
1026 this->type = BRW_REGISTER_TYPE_F;
1027
1028 switch (file) {
1029 case UNIFORM:
1030 this->width = 1;
1031 break;
1032 default:
1033 this->width = 8;
1034 }
1035 }
1036
1037 /** Fixed HW reg constructor. */
1038 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1039 {
1040 init();
1041 this->file = file;
1042 this->reg = reg;
1043 this->type = type;
1044
1045 switch (file) {
1046 case UNIFORM:
1047 this->width = 1;
1048 break;
1049 default:
1050 this->width = 8;
1051 }
1052 }
1053
1054 /** Fixed HW reg constructor. */
1055 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1056 uint8_t width)
1057 {
1058 init();
1059 this->file = file;
1060 this->reg = reg;
1061 this->type = type;
1062 this->width = width;
1063 }
1064
1065 /** Automatic reg constructor. */
1066 fs_reg::fs_reg(fs_visitor *v, const struct glsl_type *type)
1067 {
1068 init();
1069 int reg_width = v->dispatch_width / 8;
1070
1071 this->file = GRF;
1072 this->reg = v->virtual_grf_alloc(v->type_size(type) * reg_width);
1073 this->reg_offset = 0;
1074 this->type = brw_type_for_base_type(type);
1075 this->width = v->dispatch_width;
1076 assert(this->width == 8 || this->width == 16);
1077 }
1078
1079 fs_reg *
1080 fs_visitor::variable_storage(ir_variable *var)
1081 {
1082 return (fs_reg *)hash_table_find(this->variable_ht, var);
1083 }
1084
1085 void
1086 import_uniforms_callback(const void *key,
1087 void *data,
1088 void *closure)
1089 {
1090 struct hash_table *dst_ht = (struct hash_table *)closure;
1091 const fs_reg *reg = (const fs_reg *)data;
1092
1093 if (reg->file != UNIFORM)
1094 return;
1095
1096 hash_table_insert(dst_ht, data, key);
1097 }
1098
1099 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1100  * This brings in those uniform definitions.
1101 */
1102 void
1103 fs_visitor::import_uniforms(fs_visitor *v)
1104 {
1105 hash_table_call_foreach(v->variable_ht,
1106 import_uniforms_callback,
1107 variable_ht);
1108 this->push_constant_loc = v->push_constant_loc;
1109 this->pull_constant_loc = v->pull_constant_loc;
1110 this->uniforms = v->uniforms;
1111 this->param_size = v->param_size;
1112 }
1113
1114 /* Our support for uniforms is piggy-backed on the struct
1115 * gl_fragment_program, because that's where the values actually
1116 * get stored, rather than in some global gl_shader_program uniform
1117 * store.
1118 */
1119 void
1120 fs_visitor::setup_uniform_values(ir_variable *ir)
1121 {
1122 int namelen = strlen(ir->name);
1123
1124 /* The data for our (non-builtin) uniforms is stored in a series of
1125 * gl_uniform_driver_storage structs for each subcomponent that
1126 * glGetUniformLocation() could name. We know it's been set up in the same
1127 * order we'd walk the type, so walk the list of storage and find anything
1128 * with our name, or the prefix of a component that starts with our name.
1129 */
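   /* For example (added, illustrative): for a uniform named "s", this loop
    * matches storage entries named exactly "s" as well as "s.field" or
    * "s[0]", but skips an unrelated uniform such as "s2", thanks to the
    * terminator check below.
    */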
1130 unsigned params_before = uniforms;
1131 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1132 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1133
1134 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1135 (storage->name[namelen] != 0 &&
1136 storage->name[namelen] != '.' &&
1137 storage->name[namelen] != '[')) {
1138 continue;
1139 }
1140
1141 unsigned slots = storage->type->component_slots();
1142 if (storage->array_elements)
1143 slots *= storage->array_elements;
1144
1145 for (unsigned i = 0; i < slots; i++) {
1146 stage_prog_data->param[uniforms++] = &storage->storage[i];
1147 }
1148 }
1149
1150 /* Make sure we actually initialized the right amount of stuff here. */
1151 assert(params_before + ir->type->component_slots() == uniforms);
1152 (void)params_before;
1153 }
1154
1155
1156 /* Our support for builtin uniforms is even scarier than non-builtin.
1157 * It sits on top of the PROG_STATE_VAR parameters that are
1158 * automatically updated from GL context state.
1159 */
1160 void
1161 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1162 {
1163 const ir_state_slot *const slots = ir->get_state_slots();
1164 assert(slots != NULL);
1165
1166 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1167 /* This state reference has already been setup by ir_to_mesa, but we'll
1168 * get the same index back here.
1169 */
1170 int index = _mesa_add_state_reference(this->prog->Parameters,
1171 (gl_state_index *)slots[i].tokens);
1172
1173 /* Add each of the unique swizzles of the element as a parameter.
1174 * This'll end up matching the expected layout of the
1175 * array/matrix/structure we're trying to fill in.
1176 */
1177 int last_swiz = -1;
1178 for (unsigned int j = 0; j < 4; j++) {
1179 int swiz = GET_SWZ(slots[i].swizzle, j);
1180 if (swiz == last_swiz)
1181 break;
1182 last_swiz = swiz;
1183
1184 stage_prog_data->param[uniforms++] =
1185 &prog->Parameters->ParameterValues[index][swiz];
1186 }
1187 }
1188 }
1189
1190 fs_reg *
1191 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1192 {
1193 assert(stage == MESA_SHADER_FRAGMENT);
1194 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1195 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1196 fs_reg wpos = *reg;
1197 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1198
1199 /* gl_FragCoord.x */
1200 if (ir->data.pixel_center_integer) {
1201 emit(MOV(wpos, this->pixel_x));
1202 } else {
1203 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1204 }
1205 wpos = offset(wpos, 1);
1206
1207 /* gl_FragCoord.y */
1208 if (!flip && ir->data.pixel_center_integer) {
1209 emit(MOV(wpos, this->pixel_y));
1210 } else {
1211 fs_reg pixel_y = this->pixel_y;
1212 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1213
1214 if (flip) {
1215 pixel_y.negate = true;
1216 offset += key->drawable_height - 1.0;
1217 }
1218
1219 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1220 }
1221 wpos = offset(wpos, 1);
1222
1223 /* gl_FragCoord.z */
1224 if (brw->gen >= 6) {
1225 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1226 } else {
1227 emit(FS_OPCODE_LINTERP, wpos,
1228 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1229 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1230 interp_reg(VARYING_SLOT_POS, 2));
1231 }
1232 wpos = offset(wpos, 1);
1233
1234 /* gl_FragCoord.w: Already set up in emit_interpolation */
1235 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1236
1237 return reg;
1238 }
1239
1240 fs_inst *
1241 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1242 glsl_interp_qualifier interpolation_mode,
1243 bool is_centroid, bool is_sample)
1244 {
1245 brw_wm_barycentric_interp_mode barycoord_mode;
1246 if (brw->gen >= 6) {
1247 if (is_centroid) {
1248 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1249 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1250 else
1251 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1252 } else if (is_sample) {
1253 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1254 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1255 else
1256 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1257 } else {
1258 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1259 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1260 else
1261 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1262 }
1263 } else {
1264 /* On Ironlake and below, there is only one interpolation mode.
1265 * Centroid interpolation doesn't mean anything on this hardware --
1266 * there is no multisampling.
1267 */
1268 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1269 }
1270 return emit(FS_OPCODE_LINTERP, attr,
1271 this->delta_x[barycoord_mode],
1272 this->delta_y[barycoord_mode], interp);
1273 }
1274
1275 fs_reg *
1276 fs_visitor::emit_general_interpolation(ir_variable *ir)
1277 {
1278 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1279 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1280 fs_reg attr = *reg;
1281
1282 assert(stage == MESA_SHADER_FRAGMENT);
1283 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1284 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1285
1286 unsigned int array_elements;
1287 const glsl_type *type;
1288
1289 if (ir->type->is_array()) {
1290 array_elements = ir->type->length;
1291 if (array_elements == 0) {
1292 fail("dereferenced array '%s' has length 0\n", ir->name);
1293 }
1294 type = ir->type->fields.array;
1295 } else {
1296 array_elements = 1;
1297 type = ir->type;
1298 }
1299
1300 glsl_interp_qualifier interpolation_mode =
1301 ir->determine_interpolation_mode(key->flat_shade);
1302
1303 int location = ir->data.location;
1304 for (unsigned int i = 0; i < array_elements; i++) {
1305 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1306 if (prog_data->urb_setup[location] == -1) {
1307 /* If there's no incoming setup data for this slot, don't
1308 * emit interpolation for it.
1309 */
1310 attr = offset(attr, type->vector_elements);
1311 location++;
1312 continue;
1313 }
1314
1315 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1316 /* Constant interpolation (flat shading) case. The SF has
1317 * handed us defined values in only the constant offset
1318 * field of the setup reg.
1319 */
1320 for (unsigned int k = 0; k < type->vector_elements; k++) {
1321 struct brw_reg interp = interp_reg(location, k);
1322 interp = suboffset(interp, 3);
1323 interp.type = reg->type;
1324 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1325 attr = offset(attr, 1);
1326 }
1327 } else {
1328 /* Smooth/noperspective interpolation case. */
1329 for (unsigned int k = 0; k < type->vector_elements; k++) {
1330 struct brw_reg interp = interp_reg(location, k);
1331 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1332 /* Get the pixel/sample mask into f0 so that we know
1333 * which pixels are lit. Then, for each channel that is
1334 * unlit, replace the centroid data with non-centroid
1335 * data.
1336 */
1337 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1338
1339 fs_inst *inst;
1340 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1341 false, false);
1342 inst->predicate = BRW_PREDICATE_NORMAL;
1343 inst->predicate_inverse = true;
1344 if (brw->has_pln)
1345 inst->no_dd_clear = true;
1346
1347 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1348 ir->data.centroid && !key->persample_shading,
1349 ir->data.sample || key->persample_shading);
1350 inst->predicate = BRW_PREDICATE_NORMAL;
1351 inst->predicate_inverse = false;
1352 if (brw->has_pln)
1353 inst->no_dd_check = true;
1354
1355 } else {
1356 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1357 ir->data.centroid && !key->persample_shading,
1358 ir->data.sample || key->persample_shading);
1359 }
1360 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1361 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1362 }
1363 attr = offset(attr, 1);
1364 }
1365
1366 }
1367 location++;
1368 }
1369 }
1370
1371 return reg;
1372 }
1373
1374 fs_reg *
1375 fs_visitor::emit_frontfacing_interpolation()
1376 {
1377 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::bool_type);
1378
1379 if (brw->gen >= 6) {
1380 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1381 * a boolean result from this (~0/true or 0/false).
1382 *
1383 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1384 * this task in only one instruction:
1385 * - a negation source modifier will flip the bit; and
1386 * - a W -> D type conversion will sign extend the bit into the high
1387 * word of the destination.
1388 *
1389 * An ASR 15 fills the low word of the destination.
1390 */
1391 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1392 g0.negate = true;
1393
1394 emit(ASR(*reg, g0, fs_reg(15)));
1395 } else {
1396 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1397 * a boolean result from this (1/true or 0/false).
1398 *
1399 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1400 * the negation source modifier to flip it. Unfortunately the SHR
1401 * instruction only operates on UD (or D with an abs source modifier)
1402 * sources without negation.
1403 *
1404 * Instead, use ASR (which will give ~0/true or 0/false).
1405 */
1406 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1407 g1_6.negate = true;
1408
1409 emit(ASR(*reg, g1_6, fs_reg(31)));
1410 }
1411
1412 return reg;
1413 }
1414
1415 void
1416 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1417 {
1418 assert(stage == MESA_SHADER_FRAGMENT);
1419 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1420 assert(dst.type == BRW_REGISTER_TYPE_F);
1421
1422 if (key->compute_pos_offset) {
1423 /* Convert int_sample_pos to floating point */
1424 emit(MOV(dst, int_sample_pos));
1425 /* Scale to the range [0, 1] */
1426 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1427 }
1428 else {
1429 /* From ARB_sample_shading specification:
1430 * "When rendering to a non-multisample buffer, or if multisample
1431 * rasterization is disabled, gl_SamplePosition will always be
1432       *  (0.5, 0.5)."
1433 */
1434 emit(MOV(dst, fs_reg(0.5f)));
1435 }
1436 }
1437
1438 fs_reg *
1439 fs_visitor::emit_samplepos_setup()
1440 {
1441 assert(brw->gen >= 6);
1442
1443 this->current_annotation = "compute sample position";
1444 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::vec2_type);
1445 fs_reg pos = *reg;
1446 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1447 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1448
1449 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1450 * mode will be enabled.
1451 *
1452 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1453 * R31.1:0 Position Offset X/Y for Slot[3:0]
1454 * R31.3:2 Position Offset X/Y for Slot[7:4]
1455 * .....
1456 *
1457 * The X, Y sample positions come in as bytes in thread payload. So, read
1458 * the positions using vstride=16, width=8, hstride=2.
1459 */
1460 struct brw_reg sample_pos_reg =
1461 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1462 BRW_REGISTER_TYPE_B), 16, 8, 2);
1463
1464 if (dispatch_width == 8) {
1465 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1466 } else {
1467 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1468 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1469 ->force_sechalf = true;
1470 }
1471 /* Compute gl_SamplePosition.x */
1472 compute_sample_position(pos, int_sample_x);
1473 pos = offset(pos, 1);
1474 if (dispatch_width == 8) {
1475 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1476 } else {
1477 emit(MOV(half(int_sample_y, 0),
1478 fs_reg(suboffset(sample_pos_reg, 1))));
1479 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1480 ->force_sechalf = true;
1481 }
1482 /* Compute gl_SamplePosition.y */
1483 compute_sample_position(pos, int_sample_y);
1484 return reg;
1485 }
1486
1487 fs_reg *
1488 fs_visitor::emit_sampleid_setup()
1489 {
1490 assert(stage == MESA_SHADER_FRAGMENT);
1491 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1492 assert(brw->gen >= 6);
1493
1494 this->current_annotation = "compute sample id";
1495 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::int_type);
1496
1497 if (key->compute_sample_id) {
1498 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1499 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1500 t2.type = BRW_REGISTER_TYPE_UW;
1501
1502 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1503 * 8x multisampling, subspan 0 will represent sample N (where N
1504 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1505 * 7. We can find the value of N by looking at R0.0 bits 7:6
1506 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1507 * (since samples are always delivered in pairs). That is, we
1508 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1509 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1510 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1511 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1512 * populating a temporary variable with the sequence (0, 1, 2, 3),
1513 * and then reading from it using vstride=1, width=4, hstride=0.
1514 * These computations hold good for 4x multisampling as well.
1515 *
1516 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1517 * the first four slots are sample 0 of subspan 0; the next four
1518 * are sample 1 of subspan 0; the third group is sample 0 of
1519 * subspan 1, and finally sample 1 of subspan 1.
1520 */
1521 fs_inst *inst;
1522 inst = emit(BRW_OPCODE_AND, t1,
1523 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1524 fs_reg(0xc0));
1525 inst->force_writemask_all = true;
1526 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1527 inst->force_writemask_all = true;
1528 /* This works for both SIMD8 and SIMD16 */
1529 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1530 inst->force_writemask_all = true;
1531 /* This special instruction takes care of setting vstride=1,
1532 * width=4, hstride=0 of t2 during an ADD instruction.
1533 */
1534 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1535 } else {
1536 /* As per GL_ARB_sample_shading specification:
1537 * "When rendering to a non-multisample buffer, or if multisample
1538 * rasterization is disabled, gl_SampleID will always be zero."
1539 */
1540 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1541 }
1542
1543 return reg;
1544 }
1545
1546 fs_reg
1547 fs_visitor::fix_math_operand(fs_reg src)
1548 {
1549 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1550 * might be able to do better by doing execsize = 1 math and then
1551 * expanding that result out, but we would need to be careful with
1552 * masking.
1553 *
1554 * The hardware ignores source modifiers (negate and abs) on math
1555 * instructions, so we also move to a temp to set those up.
1556 */
1557 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1558 !src.abs && !src.negate)
1559 return src;
1560
1561 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1562    * operands to math instructions.
1563 */
1564 if (brw->gen >= 7 && src.file != IMM)
1565 return src;
1566
1567 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1568 expanded.type = src.type;
1569 emit(BRW_OPCODE_MOV, expanded, src);
1570 return expanded;
1571 }
1572
1573 fs_inst *
1574 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1575 {
1576 switch (opcode) {
1577 case SHADER_OPCODE_RCP:
1578 case SHADER_OPCODE_RSQ:
1579 case SHADER_OPCODE_SQRT:
1580 case SHADER_OPCODE_EXP2:
1581 case SHADER_OPCODE_LOG2:
1582 case SHADER_OPCODE_SIN:
1583 case SHADER_OPCODE_COS:
1584 break;
1585 default:
1586 unreachable("not reached: bad math opcode");
1587 }
1588
1589 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1590 * might be able to do better by doing execsize = 1 math and then
1591 * expanding that result out, but we would need to be careful with
1592 * masking.
1593 *
1594 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1595 * instructions, so we also move to a temp to set those up.
1596 */
1597 if (brw->gen == 6 || brw->gen == 7)
1598 src = fix_math_operand(src);
1599
1600 fs_inst *inst = emit(opcode, dst, src);
1601
1602 if (brw->gen < 6) {
1603 inst->base_mrf = 2;
1604 inst->mlen = dispatch_width / 8;
1605 }
1606
1607 return inst;
1608 }
1609
1610 fs_inst *
1611 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1612 {
1613 int base_mrf = 2;
1614 fs_inst *inst;
1615
1616 if (brw->gen >= 8) {
1617 inst = emit(opcode, dst, src0, src1);
1618 } else if (brw->gen >= 6) {
1619 src0 = fix_math_operand(src0);
1620 src1 = fix_math_operand(src1);
1621
1622 inst = emit(opcode, dst, src0, src1);
1623 } else {
1624 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1625 * "Message Payload":
1626 *
1627 * "Operand0[7]. For the INT DIV functions, this operand is the
1628 * denominator."
1629 * ...
1630 * "Operand1[7]. For the INT DIV functions, this operand is the
1631 * numerator."
1632 */
1633 bool is_int_div = opcode != SHADER_OPCODE_POW;
1634 fs_reg &op0 = is_int_div ? src1 : src0;
1635 fs_reg &op1 = is_int_div ? src0 : src1;
1636
1637 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1638 inst = emit(opcode, dst, op0, reg_null_f);
1639
1640 inst->base_mrf = base_mrf;
1641 inst->mlen = 2 * dispatch_width / 8;
1642 }
1643 return inst;
1644 }
1645
1646 void
1647 fs_visitor::assign_curb_setup()
1648 {
1649 if (dispatch_width == 8) {
1650 prog_data->dispatch_grf_start_reg = payload.num_regs;
1651 } else {
1652 assert(stage == MESA_SHADER_FRAGMENT);
1653 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1654 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1655 }
1656
1657 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1658
1659 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1660 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1661 for (unsigned int i = 0; i < inst->sources; i++) {
1662 if (inst->src[i].file == UNIFORM) {
1663 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1664 int constant_nr;
1665 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1666 constant_nr = push_constant_loc[uniform_nr];
1667 } else {
1668 /* Section 5.11 of the OpenGL 4.1 spec says:
1669 * "Out-of-bounds reads return undefined values, which include
1670 * values from other variables of the active program or zero."
1671 * Just return the first push constant.
1672 */
1673 constant_nr = 0;
1674 }
1675
1676 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1677 constant_nr / 8,
1678 constant_nr % 8);
1679
1680 inst->src[i].file = HW_REG;
1681 inst->src[i].fixed_hw_reg = byte_offset(
1682 retype(brw_reg, inst->src[i].type),
1683 inst->src[i].subreg_offset);
1684 }
1685 }
1686 }
1687 }
1688
1689 void
1690 fs_visitor::calculate_urb_setup()
1691 {
1692 assert(stage == MESA_SHADER_FRAGMENT);
1693 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1694 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1695
1696 memset(prog_data->urb_setup, -1,
1697 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1698
1699 int urb_next = 0;
1700 /* Figure out where each of the incoming setup attributes lands. */
1701 if (brw->gen >= 6) {
1702 if (_mesa_bitcount_64(prog->InputsRead &
1703 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1704 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1705 * first 16 varying inputs, so we can put them wherever we want.
1706 * Just put them in order.
1707 *
1708 * This is useful because it means that (a) inputs not used by the
1709 * fragment shader won't take up valuable register space, and (b) we
1710 * won't have to recompile the fragment shader if it gets paired with
1711 * a different vertex (or geometry) shader.
1712 */
1713 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1714 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1715 BITFIELD64_BIT(i)) {
1716 prog_data->urb_setup[i] = urb_next++;
1717 }
1718 }
1719 } else {
1720 /* We have enough input varyings that the SF/SBE pipeline stage can't
1721 * arbitrarily rearrange them to suit our whim; we have to put them
1722 * in an order that matches the output of the previous pipeline stage
1723 * (geometry or vertex shader).
1724 */
1725 struct brw_vue_map prev_stage_vue_map;
1726 brw_compute_vue_map(brw, &prev_stage_vue_map,
1727 key->input_slots_valid);
1728 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1729 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1730 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1731 slot++) {
1732 int varying = prev_stage_vue_map.slot_to_varying[slot];
1733 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1734 * unused.
1735 */
1736 if (varying != BRW_VARYING_SLOT_COUNT &&
1737 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1738 BITFIELD64_BIT(varying))) {
1739 prog_data->urb_setup[varying] = slot - first_slot;
1740 }
1741 }
1742 urb_next = prev_stage_vue_map.num_slots - first_slot;
1743 }
1744 } else {
1745 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1746 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1747 /* Point size is packed into the header, not as a general attribute */
1748 if (i == VARYING_SLOT_PSIZ)
1749 continue;
1750
1751 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1752 /* The back color slot is skipped when the front color is
1753 * also written to. In addition, some slots can be
1754 * written in the vertex shader and not read in the
1755 * fragment shader. So the register number must always be
1756 * incremented, mapped or not.
1757 */
1758 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1759 prog_data->urb_setup[i] = urb_next;
1760 urb_next++;
1761 }
1762 }
1763
1764 /*
1765      * It's an FS-only attribute, and we did interpolation for this attribute
1766      * in the SF thread. So, count it here, too.
1767 *
1768 * See compile_sf_prog() for more info.
1769 */
1770 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1771 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1772 }
1773
1774 prog_data->num_varying_inputs = urb_next;
1775 }
1776
1777 void
1778 fs_visitor::assign_urb_setup()
1779 {
1780 assert(stage == MESA_SHADER_FRAGMENT);
1781 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1782
1783 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1784
1785    /* Offset all the urb_setup[] indices by the actual position of the
1786 * setup regs, now that the location of the constants has been chosen.
1787 */
1788 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1789 if (inst->opcode == FS_OPCODE_LINTERP) {
1790 assert(inst->src[2].file == HW_REG);
1791 inst->src[2].fixed_hw_reg.nr += urb_start;
1792 }
1793
1794 if (inst->opcode == FS_OPCODE_CINTERP) {
1795 assert(inst->src[0].file == HW_REG);
1796 inst->src[0].fixed_hw_reg.nr += urb_start;
1797 }
1798 }
1799
1800 /* Each attribute is 4 setup channels, each of which is half a reg. */
1801 this->first_non_payload_grf =
1802 urb_start + prog_data->num_varying_inputs * 2;
1803 }
1804
1805 /**
1806 * Split large virtual GRFs into separate components if we can.
1807 *
1808 * This is mostly duplicated with what brw_fs_vector_splitting does,
1809 * but that's really conservative because it's afraid of doing
1810 * splitting that doesn't result in real progress after the rest of
1811 * the optimization phases, which would cause infinite looping in
1812 * optimization. We can do it once here, safely. This also has the
1813 * opportunity to split interpolated values, or maybe even uniforms,
1814 * which we don't have at the IR level.
1815 *
1816 * We want to split, because virtual GRFs are what we register
1817 * allocate and spill (due to contiguousness requirements for some
1818 * instructions), and they're what we naturally generate in the
1819 * codegen process, but most virtual GRFs don't actually need to be
1820 * contiguous sets of GRFs. If we split, we'll end up with reduced
1821 * live intervals and better dead code elimination and coalescing.
1822 */
1823 void
1824 fs_visitor::split_virtual_grfs()
1825 {
1826 int num_vars = this->virtual_grf_count;
1827
1828 /* Count the total number of registers */
1829 int reg_count = 0;
1830 int vgrf_to_reg[num_vars];
1831 for (int i = 0; i < num_vars; i++) {
1832 vgrf_to_reg[i] = reg_count;
1833 reg_count += virtual_grf_sizes[i];
1834 }
1835
1836 /* An array of "split points". For each register slot, this indicates
1837 * if this slot can be separated from the previous slot. Every time an
1838 * instruction uses multiple elements of a register (as a source or
1839 * destination), we mark the used slots as inseparable. Then we go
1840 * through and split the registers into the smallest pieces we can.
1841 */
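   /* Illustrative example (added): if a 4-slot VGRF is only ever accessed one
    * register at a time, all three interior split points stay set and it is
    * broken into four 1-register VGRFs; an instruction that writes slots 0-1
    * as a pair clears split point 1, keeping those two slots together.
    */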
1842 bool split_points[reg_count];
1843 memset(split_points, 0, sizeof(split_points));
1844
1845 /* Mark all used registers as fully splittable */
1846 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1847 if (inst->dst.file == GRF) {
1848 int reg = vgrf_to_reg[inst->dst.reg];
1849 for (int j = 1; j < this->virtual_grf_sizes[inst->dst.reg]; j++)
1850 split_points[reg + j] = true;
1851 }
1852
1853 for (int i = 0; i < inst->sources; i++) {
1854 if (inst->src[i].file == GRF) {
1855 int reg = vgrf_to_reg[inst->src[i].reg];
1856 for (int j = 1; j < this->virtual_grf_sizes[inst->src[i].reg]; j++)
1857 split_points[reg + j] = true;
1858 }
1859 }
1860 }
1861
1862 if (brw->has_pln &&
1863 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1864 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1865 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1866 * Gen6, that was the only supported interpolation mode, and since Gen6,
1867 * delta_x and delta_y are in fixed hardware registers.
1868 */
1869 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1870 split_points[vgrf_to_reg[vgrf] + 1] = false;
1871 }
1872
1873 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1874 if (inst->dst.file == GRF) {
1875 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1876 for (int j = 1; j < inst->regs_written; j++)
1877 split_points[reg + j] = false;
1878 }
1879 for (int i = 0; i < inst->sources; i++) {
1880 if (inst->src[i].file == GRF) {
1881 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1882 for (int j = 1; j < inst->regs_read(this, i); j++)
1883 split_points[reg + j] = false;
1884 }
1885 }
1886 }
1887
1888 int new_virtual_grf[reg_count];
1889 int new_reg_offset[reg_count];
1890
1891 int reg = 0;
1892 for (int i = 0; i < num_vars; i++) {
1893 /* As a quick sanity check, the first slot of a register can never be a split point. */
1894 assert(split_points[reg] == false);
1895
1896 /* j = 0 case */
1897 new_reg_offset[reg] = 0;
1898 reg++;
1899 int offset = 1;
1900
1901 /* j > 0 case */
1902 for (int j = 1; j < virtual_grf_sizes[i]; j++) {
1903 /* If this is a split point, reset the offset to 0 and allocate a
1904 * new virtual GRF for the 'offset' registers accumulated so far
1905 */
1906 if (split_points[reg]) {
1907 assert(offset <= MAX_VGRF_SIZE);
1908 int grf = virtual_grf_alloc(offset);
1909 for (int k = reg - offset; k < reg; k++)
1910 new_virtual_grf[k] = grf;
1911 offset = 0;
1912 }
1913 new_reg_offset[reg] = offset;
1914 offset++;
1915 reg++;
1916 }
1917
1918 /* The last one gets the original register number */
1919 assert(offset <= MAX_VGRF_SIZE);
1920 virtual_grf_sizes[i] = offset;
1921 for (int k = reg - offset; k < reg; k++)
1922 new_virtual_grf[k] = i;
1923 }
1924 assert(reg == reg_count);
1925
1926 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1927 if (inst->dst.file == GRF) {
1928 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1929 inst->dst.reg = new_virtual_grf[reg];
1930 inst->dst.reg_offset = new_reg_offset[reg];
1931 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
1932 }
1933 for (int i = 0; i < inst->sources; i++) {
1934 if (inst->src[i].file == GRF) {
1935 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1936 inst->src[i].reg = new_virtual_grf[reg];
1937 inst->src[i].reg_offset = new_reg_offset[reg];
1938 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
1939 }
1940 }
1941 }
1942 invalidate_live_intervals();
1943 }
1944
1945 /**
1946 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1947 *
1948 * During code generation, we create tons of temporary variables, many of
1949 * which get immediately killed and are never used again. Yet, in later
1950 * optimization and analysis passes, such as compute_live_intervals, we need
1951 * to loop over all the virtual GRFs. Compacting them can save a lot of
1952 * overhead.
1953 */
1954 bool
1955 fs_visitor::compact_virtual_grfs()
1956 {
1957 bool progress = false;
1958 int remap_table[this->virtual_grf_count];
1959 memset(remap_table, -1, sizeof(remap_table));
1960
1961 /* Mark which virtual GRFs are used. */
1962 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1963 if (inst->dst.file == GRF)
1964 remap_table[inst->dst.reg] = 0;
1965
1966 for (int i = 0; i < inst->sources; i++) {
1967 if (inst->src[i].file == GRF)
1968 remap_table[inst->src[i].reg] = 0;
1969 }
1970 }
1971
1972 /* Compact the GRF arrays. */
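/* Rough sketch with made-up numbers: if only vgrf0 and vgrf2 of four VGRFs
 * are referenced, remap_table becomes {0, -1, 1, -1}, virtual_grf_count
 * drops to 2, and the rewrite loop below points every use of vgrf2 at the
 * new vgrf1.
 */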
1973 int new_index = 0;
1974 for (int i = 0; i < this->virtual_grf_count; i++) {
1975 if (remap_table[i] == -1) {
1976 /* We just found an unused register. This means that we are
1977 * actually going to compact something.
1978 */
1979 progress = true;
1980 } else {
1981 remap_table[i] = new_index;
1982 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1983 invalidate_live_intervals();
1984 ++new_index;
1985 }
1986 }
1987
1988 this->virtual_grf_count = new_index;
1989
1990 /* Patch all the instructions to use the newly renumbered registers */
1991 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1992 if (inst->dst.file == GRF)
1993 inst->dst.reg = remap_table[inst->dst.reg];
1994
1995 for (int i = 0; i < inst->sources; i++) {
1996 if (inst->src[i].file == GRF)
1997 inst->src[i].reg = remap_table[inst->src[i].reg];
1998 }
1999 }
2000
2001 /* Patch all the references to delta_x/delta_y, since they're used in
2002 * register allocation. If they're unused, switch them to BAD_FILE so
2003 * we don't think some random VGRF is delta_x/delta_y.
2004 */
2005 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2006 if (delta_x[i].file == GRF) {
2007 if (remap_table[delta_x[i].reg] != -1) {
2008 delta_x[i].reg = remap_table[delta_x[i].reg];
2009 } else {
2010 delta_x[i].file = BAD_FILE;
2011 }
2012 }
2013 }
2014 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2015 if (delta_y[i].file == GRF) {
2016 if (remap_table[delta_y[i].reg] != -1) {
2017 delta_y[i].reg = remap_table[delta_y[i].reg];
2018 } else {
2019 delta_y[i].file = BAD_FILE;
2020 }
2021 }
2022 }
2023
2024 return progress;
2025 }
2026
2027 /*
2028 * Implements array access of uniforms by inserting a
2029 * PULL_CONSTANT_LOAD instruction.
2030 *
2031 * Unlike temporary GRF array access (which we don't support, due to
2032 * the difficulty of doing relative addressing on instruction
2033 * destinations), we could potentially do array access of uniforms
2034 * that were loaded in GRF space as push constants. In real-world
2035 * usage we've seen, though, the arrays being used are always larger
2036 * than we could load as push constants, so just always move all
2037 * uniform array access out to a pull constant buffer.
2038 */
2039 void
2040 fs_visitor::move_uniform_array_access_to_pull_constants()
2041 {
2042 if (dispatch_width != 8)
2043 return;
2044
2045 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2046 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2047
2048 /* Walk through and find array access of uniforms. Put a copy of that
2049 * uniform in the pull constant buffer.
2050 *
2051 * Note that we don't move constant-indexed accesses to arrays. No
2052 * testing has been done of the performance impact of this choice.
2053 */
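/* Hypothetical example: an access like "gain[i]" into a
 * "uniform float gain[128];" array shows up here as a UNIFORM source with
 * a reladdr, so all 128 elements get copied into pull_param[] and the
 * access is later rewritten by demote_pull_constants().
 */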
2054 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2055 for (int i = 0 ; i < inst->sources; i++) {
2056 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2057 continue;
2058
2059 int uniform = inst->src[i].reg;
2060
2061 /* If this array isn't already present in the pull constant buffer,
2062 * add it.
2063 */
2064 if (pull_constant_loc[uniform] == -1) {
2065 const gl_constant_value **values = &stage_prog_data->param[uniform];
2066
2067 assert(param_size[uniform]);
2068
2069 for (int j = 0; j < param_size[uniform]; j++) {
2070 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2071
2072 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2073 values[j];
2074 }
2075 }
2076 }
2077 }
2078 }
2079
2080 /**
2081 * Assign UNIFORM file registers to either push constants or pull constants.
2082 *
2083 * We allow a fragment shader to have more than the GL-specified minimum
2084 * for the maximum number of fragment shader uniform components (64). If
2085 * there are too many of these, they would fill up all of the register space.
2086 * So, this will push some of them out to the pull constant buffer and
2087 * update the program to load them.
2088 */
2089 void
2090 fs_visitor::assign_constant_locations()
2091 {
2092 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2093 if (dispatch_width != 8)
2094 return;
2095
2096 /* Find which UNIFORM registers are still in use. */
2097 bool is_live[uniforms];
2098 for (unsigned int i = 0; i < uniforms; i++) {
2099 is_live[i] = false;
2100 }
2101
2102 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2103 for (int i = 0; i < inst->sources; i++) {
2104 if (inst->src[i].file != UNIFORM)
2105 continue;
2106
2107 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2108 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2109 is_live[constant_nr] = true;
2110 }
2111 }
2112
2113 /* Only allow 16 registers (128 uniform components) as push constants.
2114 *
2115 * Just demote the end of the list. We could probably do better
2116 * here, demoting things that are rarely used in the program first.
2117 *
2118 * If changing this value, note the limitation about total_regs in
2119 * brw_curbe.c.
2120 */
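/* Worked example (illustrative): with 200 live uniform components and none
 * already demoted, the first 128 get push_constant_loc slots 0-127 and the
 * remaining 72 are appended to pull_param[] for run-time loads.
 */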
2121 unsigned int max_push_components = 16 * 8;
2122 unsigned int num_push_constants = 0;
2123
2124 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2125
2126 for (unsigned int i = 0; i < uniforms; i++) {
2127 if (!is_live[i] || pull_constant_loc[i] != -1) {
2128 /* This UNIFORM register is either dead, or has already been demoted
2129 * to a pull const. Mark it as no longer living in the param[] array.
2130 */
2131 push_constant_loc[i] = -1;
2132 continue;
2133 }
2134
2135 if (num_push_constants < max_push_components) {
2136 /* Retain as a push constant. Record the location in the params[]
2137 * array.
2138 */
2139 push_constant_loc[i] = num_push_constants++;
2140 } else {
2141 /* Demote to a pull constant. */
2142 push_constant_loc[i] = -1;
2143
2144 int pull_index = stage_prog_data->nr_pull_params++;
2145 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2146 pull_constant_loc[i] = pull_index;
2147 }
2148 }
2149
2150 stage_prog_data->nr_params = num_push_constants;
2151
2152 /* Up until now, the param[] array has been indexed by reg + reg_offset
2153 * of UNIFORM registers. Condense it to only contain the uniforms we
2154 * chose to upload as push constants.
2155 */
2156 for (unsigned int i = 0; i < uniforms; i++) {
2157 int remapped = push_constant_loc[i];
2158
2159 if (remapped == -1)
2160 continue;
2161
2162 assert(remapped <= (int)i);
2163 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2164 }
2165 }
2166
2167 /**
2168 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2169 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2170 */
2171 void
2172 fs_visitor::demote_pull_constants()
2173 {
2174 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2175 for (int i = 0; i < inst->sources; i++) {
2176 if (inst->src[i].file != UNIFORM)
2177 continue;
2178
2179 int pull_index = pull_constant_loc[inst->src[i].reg +
2180 inst->src[i].reg_offset];
2181 if (pull_index == -1)
2182 continue;
2183
2184 /* Set up the annotation tracking for newly generated instructions. */
2185 base_ir = inst->ir;
2186 current_annotation = inst->annotation;
2187
2188 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2189 fs_reg dst = fs_reg(this, glsl_type::float_type);
2190
2191 /* Generate a pull load into dst. */
2192 if (inst->src[i].reladdr) {
2193 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2194 surf_index,
2195 *inst->src[i].reladdr,
2196 pull_index);
2197 inst->insert_before(block, &list);
2198 inst->src[i].reladdr = NULL;
2199 } else {
2200 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2201 fs_inst *pull =
2202 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2203 dst, surf_index, offset);
2204 inst->insert_before(block, pull);
2205 inst->src[i].set_smear(pull_index & 3);
2206 }
2207
2208 /* Rewrite the instruction to use the temporary VGRF. */
2209 inst->src[i].file = GRF;
2210 inst->src[i].reg = dst.reg;
2211 inst->src[i].reg_offset = 0;
2212 inst->src[i].width = dispatch_width;
2213 }
2214 }
2215 invalidate_live_intervals();
2216 }
2217
2218 bool
2219 fs_visitor::opt_algebraic()
2220 {
2221 bool progress = false;
2222
2223 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2224 switch (inst->opcode) {
2225 case BRW_OPCODE_MUL:
2226 if (inst->src[1].file != IMM)
2227 continue;
2228
2229 /* a * 1.0 = a */
2230 if (inst->src[1].is_one()) {
2231 inst->opcode = BRW_OPCODE_MOV;
2232 inst->src[1] = reg_undef;
2233 progress = true;
2234 break;
2235 }
2236
2237 /* a * 0.0 = 0.0 */
2238 if (inst->src[1].is_zero()) {
2239 inst->opcode = BRW_OPCODE_MOV;
2240 inst->src[0] = inst->src[1];
2241 inst->src[1] = reg_undef;
2242 progress = true;
2243 break;
2244 }
2245
2246 break;
2247 case BRW_OPCODE_ADD:
2248 if (inst->src[1].file != IMM)
2249 continue;
2250
2251 /* a + 0.0 = a */
2252 if (inst->src[1].is_zero()) {
2253 inst->opcode = BRW_OPCODE_MOV;
2254 inst->src[1] = reg_undef;
2255 progress = true;
2256 break;
2257 }
2258 break;
2259 case BRW_OPCODE_OR:
2260 if (inst->src[0].equals(inst->src[1])) {
2261 inst->opcode = BRW_OPCODE_MOV;
2262 inst->src[1] = reg_undef;
2263 progress = true;
2264 break;
2265 }
2266 break;
2267 case BRW_OPCODE_LRP:
2268 if (inst->src[1].equals(inst->src[2])) {
2269 inst->opcode = BRW_OPCODE_MOV;
2270 inst->src[0] = inst->src[1];
2271 inst->src[1] = reg_undef;
2272 inst->src[2] = reg_undef;
2273 progress = true;
2274 break;
2275 }
2276 break;
2277 case BRW_OPCODE_SEL:
2278 if (inst->src[0].equals(inst->src[1])) {
2279 inst->opcode = BRW_OPCODE_MOV;
2280 inst->src[1] = reg_undef;
2281 inst->predicate = BRW_PREDICATE_NONE;
2282 inst->predicate_inverse = false;
2283 progress = true;
2284 } else if (inst->saturate && inst->src[1].file == IMM) {
2285 switch (inst->conditional_mod) {
2286 case BRW_CONDITIONAL_LE:
2287 case BRW_CONDITIONAL_L:
2288 switch (inst->src[1].type) {
2289 case BRW_REGISTER_TYPE_F:
2290 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2291 inst->opcode = BRW_OPCODE_MOV;
2292 inst->src[1] = reg_undef;
2293 progress = true;
2294 }
2295 break;
2296 default:
2297 break;
2298 }
2299 break;
2300 case BRW_CONDITIONAL_GE:
2301 case BRW_CONDITIONAL_G:
2302 switch (inst->src[1].type) {
2303 case BRW_REGISTER_TYPE_F:
2304 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2305 inst->opcode = BRW_OPCODE_MOV;
2306 inst->src[1] = reg_undef;
2307 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2308 progress = true;
2309 }
2310 break;
2311 default:
2312 break;
2313 }
2314 default:
2315 break;
2316 }
2317 }
2318 break;
2319 case SHADER_OPCODE_RCP: {
2320 fs_inst *prev = (fs_inst *)inst->prev;
2321 if (prev->opcode == SHADER_OPCODE_SQRT) {
2322 if (inst->src[0].equals(prev->dst)) {
2323 inst->opcode = SHADER_OPCODE_RSQ;
2324 inst->src[0] = prev->src[0];
2325 progress = true;
2326 }
2327 }
2328 break;
2329 }
2330 default:
2331 break;
2332 }
2333 }
2334
2335 return progress;
2336 }
2337
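/**
 * Give top-level definitions that completely overwrite a VGRF a fresh
 * register number and rewrite later reads to match.
 *
 * Roughly speaking, this is a lightweight renaming pass: reuses of the same
 * VGRF for unrelated values become separate registers, so later passes see
 * shorter, independent live ranges.
 */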
2338 bool
2339 fs_visitor::opt_register_renaming()
2340 {
2341 bool progress = false;
2342 int depth = 0;
2343
2344 int remap[virtual_grf_count];
2345 memset(remap, -1, sizeof(int) * virtual_grf_count);
2346
2347 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2348 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2349 depth++;
2350 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2351 inst->opcode == BRW_OPCODE_WHILE) {
2352 depth--;
2353 }
2354
2355 /* Rewrite instruction sources. */
2356 for (int i = 0; i < inst->sources; i++) {
2357 if (inst->src[i].file == GRF &&
2358 remap[inst->src[i].reg] != -1 &&
2359 remap[inst->src[i].reg] != inst->src[i].reg) {
2360 inst->src[i].reg = remap[inst->src[i].reg];
2361 progress = true;
2362 }
2363 }
2364
2365 const int dst = inst->dst.reg;
2366
2367 if (depth == 0 &&
2368 inst->dst.file == GRF &&
2369 virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 &&
2370 !inst->is_partial_write()) {
2371 if (remap[dst] == -1) {
2372 remap[dst] = dst;
2373 } else {
2374 remap[dst] = virtual_grf_alloc(inst->dst.width / 8);
2375 inst->dst.reg = remap[dst];
2376 progress = true;
2377 }
2378 } else if (inst->dst.file == GRF &&
2379 remap[dst] != -1 &&
2380 remap[dst] != dst) {
2381 inst->dst.reg = remap[dst];
2382 progress = true;
2383 }
2384 }
2385
2386 if (progress) {
2387 invalidate_live_intervals();
2388
2389 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2390 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2391 delta_x[i].reg = remap[delta_x[i].reg];
2392 }
2393 }
2394 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2395 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2396 delta_y[i].reg = remap[delta_y[i].reg];
2397 }
2398 }
2399 }
2400
2401 return progress;
2402 }
2403
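/**
 * Try to rewrite the instruction that computed a value so that it writes
 * directly into the MRF that a later MOV copies it to, then delete the MOV.
 */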
2404 bool
2405 fs_visitor::compute_to_mrf()
2406 {
2407 bool progress = false;
2408 int next_ip = 0;
2409
2410 /* No MRFs on Gen >= 7. */
2411 if (brw->gen >= 7)
2412 return false;
2413
2414 calculate_live_intervals();
2415
2416 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2417 int ip = next_ip;
2418 next_ip++;
2419
2420 if (inst->opcode != BRW_OPCODE_MOV ||
2421 inst->is_partial_write() ||
2422 inst->dst.file != MRF || inst->src[0].file != GRF ||
2423 inst->dst.type != inst->src[0].type ||
2424 inst->src[0].abs || inst->src[0].negate ||
2425 !inst->src[0].is_contiguous() ||
2426 inst->src[0].subreg_offset)
2427 continue;
2428
2429 /* Work out which hardware MRF registers are written by this
2430 * instruction.
2431 */
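/* Illustrative: a SIMD16 write to m2 is tracked as mrf_low = 2,
 * mrf_high = 3, while a COMPR4 write to m2 is tracked as mrf_low = 2,
 * mrf_high = 6 (the two compressed halves land four registers apart).
 */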
2432 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2433 int mrf_high;
2434 if (inst->dst.reg & BRW_MRF_COMPR4) {
2435 mrf_high = mrf_low + 4;
2436 } else if (inst->exec_size == 16) {
2437 mrf_high = mrf_low + 1;
2438 } else {
2439 mrf_high = mrf_low;
2440 }
2441
2442 /* Can't compute-to-MRF this GRF if someone else was going to
2443 * read it later.
2444 */
2445 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2446 continue;
2447
2448 /* Found a move of a GRF to a MRF. Let's see if we can go
2449 * rewrite the thing that made this GRF to write into the MRF.
2450 */
2451 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2452 if (scan_inst->dst.file == GRF &&
2453 scan_inst->dst.reg == inst->src[0].reg) {
2454 /* Found the last thing to write our reg we want to turn
2455 * into a compute-to-MRF.
2456 */
2457
2458 /* If this one instruction didn't populate all the
2459 * channels, bail. We might be able to rewrite everything
2460 * that writes that reg, but it would require smarter
2461 * tracking to delay the rewriting until complete success.
2462 */
2463 if (scan_inst->is_partial_write())
2464 break;
2465
2466 /* Things returning more than one register would need us to
2467 * understand coalescing out more than one MOV at a time.
2468 */
2469 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2470 break;
2471
2472 /* SEND instructions can't have MRF as a destination. */
2473 if (scan_inst->mlen)
2474 break;
2475
2476 if (brw->gen == 6) {
2477 /* gen6 math instructions must have the destination be
2478 * GRF, so no compute-to-MRF for them.
2479 */
2480 if (scan_inst->is_math()) {
2481 break;
2482 }
2483 }
2484
2485 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2486 /* Found the creator of our MRF's source value. */
2487 scan_inst->dst.file = MRF;
2488 scan_inst->dst.reg = inst->dst.reg;
2489 scan_inst->saturate |= inst->saturate;
2490 inst->remove(block);
2491 progress = true;
2492 }
2493 break;
2494 }
2495
2496 /* We don't handle control flow here. Most computation of
2497 * values that end up in MRFs happens shortly before the MRF
2498 * write anyway.
2499 */
2500 if (block->start() == scan_inst)
2501 break;
2502
2503 /* You can't read from an MRF, so if someone else reads our
2504 * MRF's source GRF that we wanted to rewrite, that stops us.
2505 */
2506 bool interfered = false;
2507 for (int i = 0; i < scan_inst->sources; i++) {
2508 if (scan_inst->src[i].file == GRF &&
2509 scan_inst->src[i].reg == inst->src[0].reg &&
2510 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2511 interfered = true;
2512 }
2513 }
2514 if (interfered)
2515 break;
2516
2517 if (scan_inst->dst.file == MRF) {
2518 /* If somebody else writes our MRF here, we can't
2519 * compute-to-MRF before that.
2520 */
2521 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2522 int scan_mrf_high;
2523
2524 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2525 scan_mrf_high = scan_mrf_low + 4;
2526 } else if (scan_inst->exec_size == 16) {
2527 scan_mrf_high = scan_mrf_low + 1;
2528 } else {
2529 scan_mrf_high = scan_mrf_low;
2530 }
2531
2532 if (mrf_low == scan_mrf_low ||
2533 mrf_low == scan_mrf_high ||
2534 mrf_high == scan_mrf_low ||
2535 mrf_high == scan_mrf_high) {
2536 break;
2537 }
2538 }
2539
2540 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2541 /* Found a SEND instruction, which means that there are
2542 * live values in MRFs from base_mrf to base_mrf +
2543 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2544 * above it.
2545 */
2546 if (mrf_low >= scan_inst->base_mrf &&
2547 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2548 break;
2549 }
2550 if (mrf_high >= scan_inst->base_mrf &&
2551 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2552 break;
2553 }
2554 }
2555 }
2556 }
2557
2558 if (progress)
2559 invalidate_live_intervals();
2560
2561 return progress;
2562 }
2563
2564 /**
2565 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2566 * instructions to FS_OPCODE_REP_FB_WRITE.
2567 */
2568 void
2569 fs_visitor::emit_repclear_shader()
2570 {
2571 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2572 int base_mrf = 1;
2573 int color_mrf = base_mrf + 2;
2574
2575 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2576 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2577 mov->force_writemask_all = true;
2578
2579 fs_inst *write;
2580 if (key->nr_color_regions == 1) {
2581 write = emit(FS_OPCODE_REP_FB_WRITE);
2582 write->saturate = key->clamp_fragment_color;
2583 write->base_mrf = color_mrf;
2584 write->target = 0;
2585 write->header_present = false;
2586 write->mlen = 1;
2587 } else {
2588 assume(key->nr_color_regions > 0);
2589 for (int i = 0; i < key->nr_color_regions; ++i) {
2590 write = emit(FS_OPCODE_REP_FB_WRITE);
2591 write->saturate = key->clamp_fragment_color;
2592 write->base_mrf = base_mrf;
2593 write->target = i;
2594 write->header_present = true;
2595 write->mlen = 3;
2596 }
2597 }
2598 write->eot = true;
2599
2600 calculate_cfg();
2601
2602 assign_constant_locations();
2603 assign_curb_setup();
2604
2605 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2606 assert(mov->src[0].file == HW_REG);
2607 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2608 }
2609
2610 /**
2611 * Walks through basic blocks, looking for repeated MRF writes and
2612 * removing the later ones.
2613 */
2614 bool
2615 fs_visitor::remove_duplicate_mrf_writes()
2616 {
2617 fs_inst *last_mrf_move[16];
2618 bool progress = false;
2619
2620 /* We would need to update the MRF tracking to handle compressed instructions. */
2621 if (dispatch_width == 16)
2622 return false;
2623
2624 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2625
2626 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2627 if (inst->is_control_flow()) {
2628 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2629 }
2630
2631 if (inst->opcode == BRW_OPCODE_MOV &&
2632 inst->dst.file == MRF) {
2633 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2634 if (prev_inst && inst->equals(prev_inst)) {
2635 inst->remove(block);
2636 progress = true;
2637 continue;
2638 }
2639 }
2640
2641 /* Clear out the last-write records for MRFs that were overwritten. */
2642 if (inst->dst.file == MRF) {
2643 last_mrf_move[inst->dst.reg] = NULL;
2644 }
2645
2646 if (inst->mlen > 0 && inst->base_mrf != -1) {
2647 /* Found a SEND instruction, which will include two or fewer
2648 * implied MRF writes. We could do better here.
2649 */
2650 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2651 last_mrf_move[inst->base_mrf + i] = NULL;
2652 }
2653 }
2654
2655 /* Clear out any MRF move records whose sources got overwritten. */
2656 if (inst->dst.file == GRF) {
2657 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2658 if (last_mrf_move[i] &&
2659 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2660 last_mrf_move[i] = NULL;
2661 }
2662 }
2663 }
2664
2665 if (inst->opcode == BRW_OPCODE_MOV &&
2666 inst->dst.file == MRF &&
2667 inst->src[0].file == GRF &&
2668 !inst->is_partial_write()) {
2669 last_mrf_move[inst->dst.reg] = inst;
2670 }
2671 }
2672
2673 if (progress)
2674 invalidate_live_intervals();
2675
2676 return progress;
2677 }
2678
2679 static void
2680 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2681 int first_grf, int grf_len)
2682 {
2683 /* Clear the flag for registers that actually got read (as expected). */
2684 for (int i = 0; i < inst->sources; i++) {
2685 int grf;
2686 if (inst->src[i].file == GRF) {
2687 grf = inst->src[i].reg;
2688 } else if (inst->src[i].file == HW_REG &&
2689 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2690 grf = inst->src[i].fixed_hw_reg.nr;
2691 } else {
2692 continue;
2693 }
2694
2695 if (grf >= first_grf &&
2696 grf < first_grf + grf_len) {
2697 deps[grf - first_grf] = false;
2698 if (inst->exec_size == 16)
2699 deps[grf - first_grf + 1] = false;
2700 }
2701 }
2702 }
2703
2704 /**
2705 * Implements this workaround for the original 965:
2706 *
2707 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2708 * check for post destination dependencies on this instruction, software
2709 * must ensure that there is no destination hazard for the case of ‘write
2710 * followed by a posted write’ shown in the following example.
2711 *
2712 * 1. mov r3 0
2713 * 2. send r3.xy <rest of send instruction>
2714 * 3. mov r2 r3
2715 *
2716 * Due to no post-destination dependency check on the ‘send’, the above
2717 * code sequence could have two instructions (1 and 2) in flight at the
2718 * same time that both consider ‘r3’ as the target of their final writes.
2719 */
2720 void
2721 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2722 fs_inst *inst)
2723 {
2724 int write_len = inst->regs_written;
2725 int first_write_grf = inst->dst.reg;
2726 bool needs_dep[BRW_MAX_MRF];
2727 assert(write_len < (int)sizeof(needs_dep) - 1);
2728
2729 memset(needs_dep, false, sizeof(needs_dep));
2730 memset(needs_dep, true, write_len);
2731
2732 clear_deps_for_inst_src(inst, dispatch_width,
2733 needs_dep, first_write_grf, write_len);
2734
2735 /* Walk backwards looking for writes to registers we're writing which
2736 * aren't read since being written. If we hit the start of the program,
2737 * we assume that there are no outstanding dependencies on entry to the
2738 * program.
2739 */
2740 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2741 /* If we hit control flow, assume that there *are* outstanding
2742 * dependencies, and force their cleanup before our instruction.
2743 */
2744 if (block->start() == scan_inst) {
2745 for (int i = 0; i < write_len; i++) {
2746 if (needs_dep[i]) {
2747 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2748 }
2749 }
2750 return;
2751 }
2752
2753 /* We insert our reads as late as possible on the assumption that any
2754 * instruction other than a MOV that might have left us an outstanding
2755 * dependency has more latency than a MOV.
2756 */
2757 if (scan_inst->dst.file == GRF) {
2758 for (int i = 0; i < scan_inst->regs_written; i++) {
2759 int reg = scan_inst->dst.reg + i;
2760
2761 if (reg >= first_write_grf &&
2762 reg < first_write_grf + write_len &&
2763 needs_dep[reg - first_write_grf]) {
2764 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2765 needs_dep[reg - first_write_grf] = false;
2766 if (scan_inst->exec_size == 16)
2767 needs_dep[reg - first_write_grf + 1] = false;
2768 }
2769 }
2770 }
2771
2772 /* Clear the flag for registers that actually got read (as expected). */
2773 clear_deps_for_inst_src(scan_inst, dispatch_width,
2774 needs_dep, first_write_grf, write_len);
2775
2776 /* Continue the loop only if we haven't resolved all the dependencies */
2777 int i;
2778 for (i = 0; i < write_len; i++) {
2779 if (needs_dep[i])
2780 break;
2781 }
2782 if (i == write_len)
2783 return;
2784 }
2785 }
2786
2787 /**
2788 * Implements this workaround for the original 965:
2789 *
2790 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2791 * used as a destination register until after it has been sourced by an
2792 * instruction with a different destination register.
2793 */
2794 void
2795 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2796 {
2797 int write_len = inst->regs_written;
2798 int first_write_grf = inst->dst.reg;
2799 bool needs_dep[BRW_MAX_MRF];
2800 assert(write_len < (int)sizeof(needs_dep) - 1);
2801
2802 memset(needs_dep, false, sizeof(needs_dep));
2803 memset(needs_dep, true, write_len);
2804 /* Walk forwards looking for writes to registers we're writing which aren't
2805 * read before being written.
2806 */
2807 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2808 /* If we hit control flow, force resolve all remaining dependencies. */
2809 if (block->end() == scan_inst) {
2810 for (int i = 0; i < write_len; i++) {
2811 if (needs_dep[i])
2812 scan_inst->insert_before(block,
2813 DEP_RESOLVE_MOV(first_write_grf + i));
2814 }
2815 return;
2816 }
2817
2818 /* Clear the flag for registers that actually got read (as expected). */
2819 clear_deps_for_inst_src(scan_inst, dispatch_width,
2820 needs_dep, first_write_grf, write_len);
2821
2822 /* We insert our reads as late as possible since they're reading the
2823 * result of a SEND, which has massive latency.
2824 */
2825 if (scan_inst->dst.file == GRF &&
2826 scan_inst->dst.reg >= first_write_grf &&
2827 scan_inst->dst.reg < first_write_grf + write_len &&
2828 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2829 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2830 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2831 }
2832
2833 /* Continue the loop only if we haven't resolved all the dependencies */
2834 int i;
2835 for (i = 0; i < write_len; i++) {
2836 if (needs_dep[i])
2837 break;
2838 }
2839 if (i == write_len)
2840 return;
2841 }
2842
2843 /* If we hit the end of the program, resolve all remaining dependencies out
2844 * of paranoia.
2845 */
2846 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2847 assert(last_inst->eot);
2848 for (int i = 0; i < write_len; i++) {
2849 if (needs_dep[i])
2850 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2851 }
2852 }
2853
2854 void
2855 fs_visitor::insert_gen4_send_dependency_workarounds()
2856 {
2857 if (brw->gen != 4 || brw->is_g4x)
2858 return;
2859
2860 bool progress = false;
2861
2862 /* Note that we're done with register allocation, so GRF fs_regs always
2863 * have a .reg_offset of 0.
2864 */
2865
2866 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2867 if (inst->mlen != 0 && inst->dst.file == GRF) {
2868 insert_gen4_pre_send_dependency_workarounds(block, inst);
2869 insert_gen4_post_send_dependency_workarounds(block, inst);
2870 progress = true;
2871 }
2872 }
2873
2874 if (progress)
2875 invalidate_live_intervals();
2876 }
2877
2878 /**
2879 * Turns the generic expression-style uniform pull constant load instruction
2880 * into a hardware-specific series of instructions for loading a pull
2881 * constant.
2882 *
2883 * The expression style allows the CSE pass before this to optimize out
2884 * repeated loads from the same offset, and gives the pre-register-allocation
2885 * scheduling full flexibility, while the conversion to native instructions
2886 * allows the post-register-allocation scheduler the best information
2887 * possible.
2888 *
2889 * Note that execution masking for setting up pull constant loads is special:
2890 * the channels that need to be written are unrelated to the current execution
2891 * mask, since a later instruction will use one of the result channels as a
2892 * source operand for all 8 or 16 of its channels.
2893 */
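/* On Gen7+ the loop below performs, roughly (illustrative IR, made-up
 * register names):
 *
 *    pull_const_load vgrf4, surf_index, 16      (byte offset, IMM)
 *
 * becomes
 *
 *    set_simd4x2_offset vgrf5, 4                (dword offset)
 *    pull_const_load_gen7 vgrf4, surf_index, vgrf5
 */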
2894 void
2895 fs_visitor::lower_uniform_pull_constant_loads()
2896 {
2897 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2898 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2899 continue;
2900
2901 if (brw->gen >= 7) {
2902 /* The offset arg before was a vec4-aligned byte offset. We need to
2903 * turn it into a dword offset.
2904 */
2905 fs_reg const_offset_reg = inst->src[1];
2906 assert(const_offset_reg.file == IMM &&
2907 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2908 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2909 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2910
2911 /* This is actually going to be a MOV, but since only the first dword
2912 * is accessed, we have a special opcode to do just that one. Note
2913 * that this needs to be an operation that will be considered a def
2914 * by live variable analysis, or register allocation will explode.
2915 */
2916 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2917 8, payload, const_offset_reg);
2918 setup->force_writemask_all = true;
2919
2920 setup->ir = inst->ir;
2921 setup->annotation = inst->annotation;
2922 inst->insert_before(block, setup);
2923
2924 /* Similarly, this will only populate the first 4 channels of the
2925 * result register (since we only use smear values from 0-3), but we
2926 * don't tell the optimizer.
2927 */
2928 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2929 inst->src[1] = payload;
2930
2931 invalidate_live_intervals();
2932 } else {
2933 /* Before register allocation, we didn't tell the scheduler about the
2934 * MRF we use. We know it's safe to use this MRF because nothing
2935 * else does except for register spill/unspill, which generates and
2936 * uses its MRF within a single IR instruction.
2937 */
2938 inst->base_mrf = 14;
2939 inst->mlen = 1;
2940 }
2941 }
2942 }
2943
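/**
 * Lower SHADER_OPCODE_LOAD_PAYLOAD into a sequence of MOVs, one per source,
 * into consecutive registers of the destination, carrying over
 * force_writemask_all / force_sechalf metadata from the instructions that
 * produced each source and using a COMPR4 MOV when a pair of MRF-destined
 * sources can be combined.
 */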
2944 bool
2945 fs_visitor::lower_load_payload()
2946 {
2947 bool progress = false;
2948
2949 int vgrf_to_reg[virtual_grf_count];
2950 int reg_count = 16; /* Leave room for MRF */
2951 for (int i = 0; i < virtual_grf_count; ++i) {
2952 vgrf_to_reg[i] = reg_count;
2953 reg_count += virtual_grf_sizes[i];
2954 }
2955
2956 struct {
2957 bool written:1; /* Whether this register has ever been written */
2958 bool force_writemask_all:1;
2959 bool force_sechalf:1;
2960 } metadata[reg_count];
2961 memset(metadata, 0, sizeof(metadata));
2962
2963 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2964 int dst_reg;
2965 if (inst->dst.file == GRF) {
2966 dst_reg = vgrf_to_reg[inst->dst.reg];
2967 } else {
2968 /* MRF */
2969 dst_reg = inst->dst.reg;
2970 }
2971
2972 if (inst->dst.file == MRF || inst->dst.file == GRF) {
2973 bool force_sechalf = inst->force_sechalf;
2974 bool toggle_sechalf = inst->dst.width == 16 &&
2975 type_sz(inst->dst.type) == 4;
2976 for (int i = 0; i < inst->regs_written; ++i) {
2977 metadata[dst_reg + i].written = true;
2978 metadata[dst_reg + i].force_sechalf = force_sechalf;
2979 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
2980 force_sechalf = (toggle_sechalf != force_sechalf);
2981 }
2982 }
2983
2984 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
2985 assert(inst->dst.file == MRF || inst->dst.file == GRF);
2986 fs_reg dst = inst->dst;
2987
2988 for (int i = 0; i < inst->sources; i++) {
2989 dst.width = inst->src[i].effective_width;
2990 dst.type = inst->src[i].type;
2991
2992 if (inst->src[i].file == BAD_FILE) {
2993 /* Do nothing but otherwise increment as normal */
2994 } else if (dst.file == MRF &&
2995 dst.width == 8 &&
2996 brw->has_compr4 &&
2997 i + 4 < inst->sources &&
2998 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
2999 fs_reg compr4_dst = dst;
3000 compr4_dst.reg += BRW_MRF_COMPR4;
3001 compr4_dst.width = 16;
3002 fs_reg compr4_src = inst->src[i];
3003 compr4_src.width = 16;
3004 fs_inst *mov = MOV(compr4_dst, compr4_src);
3005 mov->force_writemask_all = true;
3006 inst->insert_before(block, mov);
3007 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3008 inst->src[i + 4].file = BAD_FILE;
3009 } else {
3010 fs_inst *mov = MOV(dst, inst->src[i]);
3011 if (inst->src[i].file == GRF) {
3012 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3013 inst->src[i].reg_offset;
3014 mov->force_sechalf = metadata[src_reg].force_sechalf;
3015 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3016 metadata[dst_reg] = metadata[src_reg];
3017 if (dst.width * type_sz(dst.type) > 32) {
3018 assert((!metadata[src_reg].written ||
3019 !metadata[src_reg].force_sechalf) &&
3020 (!metadata[src_reg + 1].written ||
3021 metadata[src_reg + 1].force_sechalf));
3022 metadata[dst_reg + 1] = metadata[src_reg + 1];
3023 }
3024 } else {
3025 metadata[dst_reg].force_writemask_all = false;
3026 metadata[dst_reg].force_sechalf = false;
3027 if (dst.width == 16) {
3028 metadata[dst_reg + 1].force_writemask_all = false;
3029 metadata[dst_reg + 1].force_sechalf = true;
3030 }
3031 }
3032 inst->insert_before(block, mov);
3033 }
3034
3035 dst = offset(dst, 1);
3036 }
3037
3038 inst->remove(block);
3039 progress = true;
3040 }
3041 }
3042
3043 if (progress)
3044 invalidate_live_intervals();
3045
3046 return progress;
3047 }
3048
3049 void
3050 fs_visitor::dump_instructions()
3051 {
3052 dump_instructions(NULL);
3053 }
3054
3055 void
3056 fs_visitor::dump_instructions(const char *name)
3057 {
3058 calculate_register_pressure();
3059 FILE *file = stderr;
3060 if (name && geteuid() != 0) {
3061 file = fopen(name, "w");
3062 if (!file)
3063 file = stderr;
3064 }
3065
3066 int ip = 0, max_pressure = 0;
3067 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3068 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3069 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3070 dump_instruction(inst, file);
3071 ++ip;
3072 }
3073 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3074
3075 if (file != stderr) {
3076 fclose(file);
3077 }
3078 }
3079
3080 void
3081 fs_visitor::dump_instruction(backend_instruction *be_inst)
3082 {
3083 dump_instruction(be_inst, stderr);
3084 }
3085
3086 void
3087 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3088 {
3089 fs_inst *inst = (fs_inst *)be_inst;
3090
3091 if (inst->predicate) {
3092 fprintf(file, "(%cf0.%d) ",
3093 inst->predicate_inverse ? '-' : '+',
3094 inst->flag_subreg);
3095 }
3096
3097 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3098 if (inst->saturate)
3099 fprintf(file, ".sat");
3100 if (inst->conditional_mod) {
3101 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3102 if (!inst->predicate &&
3103 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3104 inst->opcode != BRW_OPCODE_IF &&
3105 inst->opcode != BRW_OPCODE_WHILE))) {
3106 fprintf(file, ".f0.%d", inst->flag_subreg);
3107 }
3108 }
3109 fprintf(file, "(%d) ", inst->exec_size);
3110
3111
3112 switch (inst->dst.file) {
3113 case GRF:
3114 fprintf(file, "vgrf%d", inst->dst.reg);
3115 if (inst->dst.width != dispatch_width)
3116 fprintf(file, "@%d", inst->dst.width);
3117 if (virtual_grf_sizes[inst->dst.reg] != inst->dst.width / 8 ||
3118 inst->dst.subreg_offset)
3119 fprintf(file, "+%d.%d",
3120 inst->dst.reg_offset, inst->dst.subreg_offset);
3121 break;
3122 case MRF:
3123 fprintf(file, "m%d", inst->dst.reg);
3124 break;
3125 case BAD_FILE:
3126 fprintf(file, "(null)");
3127 break;
3128 case UNIFORM:
3129 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3130 break;
3131 case HW_REG:
3132 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3133 switch (inst->dst.fixed_hw_reg.nr) {
3134 case BRW_ARF_NULL:
3135 fprintf(file, "null");
3136 break;
3137 case BRW_ARF_ADDRESS:
3138 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3139 break;
3140 case BRW_ARF_ACCUMULATOR:
3141 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3142 break;
3143 case BRW_ARF_FLAG:
3144 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3145 inst->dst.fixed_hw_reg.subnr);
3146 break;
3147 default:
3148 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3149 inst->dst.fixed_hw_reg.subnr);
3150 break;
3151 }
3152 } else {
3153 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3154 }
3155 if (inst->dst.fixed_hw_reg.subnr)
3156 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3157 break;
3158 default:
3159 fprintf(file, "???");
3160 break;
3161 }
3162 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3163
3164 for (int i = 0; i < inst->sources; i++) {
3165 if (inst->src[i].negate)
3166 fprintf(file, "-");
3167 if (inst->src[i].abs)
3168 fprintf(file, "|");
3169 switch (inst->src[i].file) {
3170 case GRF:
3171 fprintf(file, "vgrf%d", inst->src[i].reg);
3172 if (inst->src[i].width != dispatch_width)
3173 fprintf(file, "@%d", inst->src[i].width);
3174 if (virtual_grf_sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3175 inst->src[i].subreg_offset)
3176 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3177 inst->src[i].subreg_offset);
3178 break;
3179 case MRF:
3180 fprintf(file, "***m%d***", inst->src[i].reg);
3181 break;
3182 case UNIFORM:
3183 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3184 if (inst->src[i].reladdr) {
3185 fprintf(file, "+reladdr");
3186 } else if (inst->src[i].subreg_offset) {
3187 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3188 inst->src[i].subreg_offset);
3189 }
3190 break;
3191 case BAD_FILE:
3192 fprintf(file, "(null)");
3193 break;
3194 case IMM:
3195 switch (inst->src[i].type) {
3196 case BRW_REGISTER_TYPE_F:
3197 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3198 break;
3199 case BRW_REGISTER_TYPE_D:
3200 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3201 break;
3202 case BRW_REGISTER_TYPE_UD:
3203 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3204 break;
3205 case BRW_REGISTER_TYPE_VF:
3206 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3207 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3208 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3209 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3210 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3211 break;
3212 default:
3213 fprintf(file, "???");
3214 break;
3215 }
3216 break;
3217 case HW_REG:
3218 if (inst->src[i].fixed_hw_reg.negate)
3219 fprintf(file, "-");
3220 if (inst->src[i].fixed_hw_reg.abs)
3221 fprintf(file, "|");
3222 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3223 switch (inst->src[i].fixed_hw_reg.nr) {
3224 case BRW_ARF_NULL:
3225 fprintf(file, "null");
3226 break;
3227 case BRW_ARF_ADDRESS:
3228 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3229 break;
3230 case BRW_ARF_ACCUMULATOR:
3231 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3232 break;
3233 case BRW_ARF_FLAG:
3234 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3235 inst->src[i].fixed_hw_reg.subnr);
3236 break;
3237 default:
3238 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3239 inst->src[i].fixed_hw_reg.subnr);
3240 break;
3241 }
3242 } else {
3243 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3244 }
3245 if (inst->src[i].fixed_hw_reg.subnr)
3246 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3247 if (inst->src[i].fixed_hw_reg.abs)
3248 fprintf(file, "|");
3249 break;
3250 default:
3251 fprintf(file, "???");
3252 break;
3253 }
3254 if (inst->src[i].abs)
3255 fprintf(file, "|");
3256
3257 if (inst->src[i].file != IMM) {
3258 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3259 }
3260
3261 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3262 fprintf(file, ", ");
3263 }
3264
3265 fprintf(file, " ");
3266
3267 if (dispatch_width == 16 && inst->exec_size == 8) {
3268 if (inst->force_sechalf)
3269 fprintf(file, "2ndhalf ");
3270 else
3271 fprintf(file, "1sthalf ");
3272 }
3273
3274 fprintf(file, "\n");
3275 }
3276
3277 /**
3278 * Possibly returns an instruction that set up @param reg.
3279 *
3280 * Sometimes we want to take the result of some expression/variable
3281 * dereference tree and rewrite the instruction generating the result
3282 * of the tree. When processing the tree, we know that the
3283 * instructions generated are all writing temporaries that are dead
3284 * outside of this tree. So, if we have some instructions that write
3285 * a temporary, we're free to point that temp write somewhere else.
3286 *
3287 * Note that this doesn't guarantee that the returned instruction wrote
3288 * only reg -- it might be the size=4 destination of a texture instruction.
3289 */
3290 fs_inst *
3291 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3292 fs_inst *end,
3293 const fs_reg &reg)
3294 {
3295 if (end == start ||
3296 end->is_partial_write() ||
3297 reg.reladdr ||
3298 !reg.equals(end->dst)) {
3299 return NULL;
3300 } else {
3301 return end;
3302 }
3303 }
3304
3305 void
3306 fs_visitor::setup_payload_gen6()
3307 {
3308 bool uses_depth =
3309 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3310 unsigned barycentric_interp_modes =
3311 (stage == MESA_SHADER_FRAGMENT) ?
3312 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3313
3314 assert(brw->gen >= 6);
3315
3316 /* R0-1: masks, pixel X/Y coordinates. */
3317 payload.num_regs = 2;
3318 /* R2: only for 32-pixel dispatch. */
3319
3320 /* R3-26: barycentric interpolation coordinates. These appear in the
3321 * same order that they appear in the brw_wm_barycentric_interp_mode
3322 * enum. Each set of coordinates occupies 2 registers if dispatch width
3323 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3324 * appear if they were enabled using the "Barycentric Interpolation
3325 * Mode" bits in WM_STATE.
3326 */
3327 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3328 if (barycentric_interp_modes & (1 << i)) {
3329 payload.barycentric_coord_reg[i] = payload.num_regs;
3330 payload.num_regs += 2;
3331 if (dispatch_width == 16) {
3332 payload.num_regs += 2;
3333 }
3334 }
3335 }
3336
3337 /* R27: interpolated depth if uses source depth */
3338 if (uses_depth) {
3339 payload.source_depth_reg = payload.num_regs;
3340 payload.num_regs++;
3341 if (dispatch_width == 16) {
3342 /* R28: interpolated depth if not SIMD8. */
3343 payload.num_regs++;
3344 }
3345 }
3346 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3347 if (uses_depth) {
3348 payload.source_w_reg = payload.num_regs;
3349 payload.num_regs++;
3350 if (dispatch_width == 16) {
3351 /* R30: interpolated W if not SIMD8. */
3352 payload.num_regs++;
3353 }
3354 }
3355
3356 if (stage == MESA_SHADER_FRAGMENT) {
3357 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3358 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3359 prog_data->uses_pos_offset = key->compute_pos_offset;
3360 /* R31: MSAA position offsets. */
3361 if (prog_data->uses_pos_offset) {
3362 payload.sample_pos_reg = payload.num_regs;
3363 payload.num_regs++;
3364 }
3365 }
3366
3367 /* R32: MSAA input coverage mask */
3368 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3369 assert(brw->gen >= 7);
3370 payload.sample_mask_in_reg = payload.num_regs;
3371 payload.num_regs++;
3372 if (dispatch_width == 16) {
3373 /* R33: input coverage mask if not SIMD8. */
3374 payload.num_regs++;
3375 }
3376 }
3377
3378 /* R34-: bary for 32-pixel. */
3379 /* R58-59: interp W for 32-pixel. */
3380
3381 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3382 source_depth_to_render_target = true;
3383 }
3384 }
3385
3386 void
3387 fs_visitor::assign_binding_table_offsets()
3388 {
3389 assert(stage == MESA_SHADER_FRAGMENT);
3390 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3391 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3392 uint32_t next_binding_table_offset = 0;
3393
3394 /* If there are no color regions, we still perform an FB write to a null
3395 * renderbuffer, which we place at surface index 0.
3396 */
3397 prog_data->binding_table.render_target_start = next_binding_table_offset;
3398 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3399
3400 assign_common_binding_table_offsets(next_binding_table_offset);
3401 }
3402
3403 void
3404 fs_visitor::calculate_register_pressure()
3405 {
3406 invalidate_live_intervals();
3407 calculate_live_intervals();
3408
3409 unsigned num_instructions = 0;
3410 foreach_block(block, cfg)
3411 num_instructions += block->instructions.length();
3412
3413 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3414
3415 for (int reg = 0; reg < virtual_grf_count; reg++) {
3416 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3417 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3418 }
3419 }
3420
3421 void
3422 fs_visitor::optimize()
3423 {
3424 calculate_cfg();
3425
3426 split_virtual_grfs();
3427
3428 move_uniform_array_access_to_pull_constants();
3429 assign_constant_locations();
3430 demote_pull_constants();
3431
3432 #define OPT(pass, args...) do { \
3433 pass_num++; \
3434 bool this_progress = pass(args); \
3435 \
3436 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3437 char filename[64]; \
3438 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3439 dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3440 \
3441 backend_visitor::dump_instructions(filename); \
3442 } \
3443 \
3444 progress = progress || this_progress; \
3445 } while (false)
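/* The OPT() wrapper runs a single pass; when DEBUG_OPTIMIZER is enabled it
 * dumps the instruction list after every pass that reported progress, with
 * the dump file named after the dispatch width, shader name, iteration and
 * pass number.
 */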
3446
3447 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3448 char filename[64];
3449 snprintf(filename, 64, "fs%d-%04d-00-start",
3450 dispatch_width, shader_prog ? shader_prog->Name : 0);
3451
3452 backend_visitor::dump_instructions(filename);
3453 }
3454
3455 bool progress;
3456 int iteration = 0;
3457 do {
3458 progress = false;
3459 iteration++;
3460 int pass_num = 0;
3461
3462 OPT(remove_duplicate_mrf_writes);
3463
3464 OPT(opt_algebraic);
3465 OPT(opt_cse);
3466 OPT(opt_copy_propagate);
3467 OPT(opt_peephole_predicated_break);
3468 OPT(dead_code_eliminate);
3469 OPT(opt_peephole_sel);
3470 OPT(dead_control_flow_eliminate, this);
3471 OPT(opt_register_renaming);
3472 OPT(opt_saturate_propagation);
3473 OPT(register_coalesce);
3474 OPT(compute_to_mrf);
3475
3476 OPT(compact_virtual_grfs);
3477 } while (progress);
3478
3479 if (lower_load_payload()) {
3480 split_virtual_grfs();
3481 register_coalesce();
3482 compute_to_mrf();
3483 dead_code_eliminate();
3484 }
3485
3486 lower_uniform_pull_constant_loads();
3487 }
3488
3489 void
3490 fs_visitor::allocate_registers()
3491 {
3492 bool allocated_without_spills;
3493
3494 static enum instruction_scheduler_mode pre_modes[] = {
3495 SCHEDULE_PRE,
3496 SCHEDULE_PRE_NON_LIFO,
3497 SCHEDULE_PRE_LIFO,
3498 };
3499
3500 /* Try each scheduling heuristic to see if it can successfully register
3501 * allocate without spilling. They should be ordered by decreasing
3502 * performance but increasing likelihood of allocating.
3503 */
3504 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3505 schedule_instructions(pre_modes[i]);
3506
3507 if (0) {
3508 assign_regs_trivial();
3509 allocated_without_spills = true;
3510 } else {
3511 allocated_without_spills = assign_regs(false);
3512 }
3513 if (allocated_without_spills)
3514 break;
3515 }
3516
3517 if (!allocated_without_spills) {
3518 /* We assume that any spilling is worse than just dropping back to
3519 * SIMD8. There's probably actually some intermediate point where
3520 * SIMD16 with a couple of spills is still better.
3521 */
3522 if (dispatch_width == 16) {
3523 fail("Failure to register allocate. Reduce number of "
3524 "live scalar values to avoid this.");
3525 } else {
3526 perf_debug("Fragment shader triggered register spilling. "
3527 "Try reducing the number of live scalar values to "
3528 "improve performance.\n");
3529 }
3530
3531 /* Since we're out of heuristics, just go spill registers until we
3532 * get an allocation.
3533 */
3534 while (!assign_regs(true)) {
3535 if (failed)
3536 break;
3537 }
3538 }
3539
3540 /* This must come after all optimization and register allocation, since
3541 * it inserts dead code that happens to have side effects, and it does
3542 * so based on the actual physical registers in use.
3543 */
3544 insert_gen4_send_dependency_workarounds();
3545
3546 if (failed)
3547 return;
3548
3549 if (!allocated_without_spills)
3550 schedule_instructions(SCHEDULE_POST);
3551
3552 if (last_scratch > 0)
3553 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3554 }
3555
3556 bool
3557 fs_visitor::run()
3558 {
3559 sanity_param_count = prog->Parameters->NumParameters;
3560
3561 assign_binding_table_offsets();
3562
3563 if (brw->gen >= 6)
3564 setup_payload_gen6();
3565 else
3566 setup_payload_gen4();
3567
3568 if (0) {
3569 emit_dummy_fs();
3570 } else if (brw->use_rep_send && dispatch_width == 16) {
3571 emit_repclear_shader();
3572 } else {
3573 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3574 emit_shader_time_begin();
3575
3576 calculate_urb_setup();
3577 if (prog->InputsRead > 0) {
3578 if (brw->gen < 6)
3579 emit_interpolation_setup_gen4();
3580 else
3581 emit_interpolation_setup_gen6();
3582 }
3583
3584 /* We handle discards by keeping track of the still-live pixels in f0.1.
3585 * Initialize it with the dispatched pixels.
3586 */
3587 bool uses_kill =
3588 (stage == MESA_SHADER_FRAGMENT) &&
3589 ((brw_wm_prog_data*) this->prog_data)->uses_kill;
3590 bool alpha_test_func =
3591 (stage == MESA_SHADER_FRAGMENT) &&
3592 ((brw_wm_prog_key*) this->key)->alpha_test_func;
3593 if (uses_kill) {
3594 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3595 discard_init->flag_subreg = 1;
3596 }
3597
3598 /* Generate FS IR for main().  (The visitor only descends into
3599 * functions called "main".)
3600 */
3601 if (shader) {
3602 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3603 base_ir = ir;
3604 this->result = reg_undef;
3605 ir->accept(this);
3606 }
3607 } else {
3608 emit_fragment_program_code();
3609 }
3610 base_ir = NULL;
3611 if (failed)
3612 return false;
3613
3614 emit(FS_OPCODE_PLACEHOLDER_HALT);
3615
3616 if (alpha_test_func)
3617 emit_alpha_test();
3618
3619 emit_fb_writes();
3620
3621 optimize();
3622
3623 assign_curb_setup();
3624 assign_urb_setup();
3625
3626 allocate_registers();
3627
3628 if (failed)
3629 return false;
3630 }
3631
3632 if (stage == MESA_SHADER_FRAGMENT) {
3633 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3634 if (dispatch_width == 8)
3635 prog_data->reg_blocks = brw_register_blocks(grf_used);
3636 else
3637 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3638 }
3639
3640 /* If any state parameters were appended, then ParameterValues could have
3641 * been realloced, in which case the driver uniform storage set up by
3642 * _mesa_associate_uniform_storage() would point to freed memory. Make
3643 * sure that didn't happen.
3644 */
3645 assert(sanity_param_count == prog->Parameters->NumParameters);
3646
3647 return !failed;
3648 }
3649
3650 const unsigned *
3651 brw_wm_fs_emit(struct brw_context *brw,
3652 void *mem_ctx,
3653 const struct brw_wm_prog_key *key,
3654 struct brw_wm_prog_data *prog_data,
3655 struct gl_fragment_program *fp,
3656 struct gl_shader_program *prog,
3657 unsigned *final_assembly_size)
3658 {
3659 bool start_busy = false;
3660 double start_time = 0;
3661
3662 if (unlikely(brw->perf_debug)) {
3663 start_busy = (brw->batch.last_bo &&
3664 drm_intel_bo_busy(brw->batch.last_bo));
3665 start_time = get_time();
3666 }
3667
3668 struct brw_shader *shader = NULL;
3669 if (prog)
3670 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3671
3672 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3673 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3674
3675 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3676 */
3677 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3678 if (!v.run()) {
3679 if (prog) {
3680 prog->LinkStatus = false;
3681 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3682 }
3683
3684 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3685 v.fail_msg);
3686
3687 return NULL;
3688 }
3689
3690 cfg_t *simd16_cfg = NULL;
3691 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3692 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3693 brw->use_rep_send)) {
3694 if (!v.simd16_unsupported) {
3695 /* Try a SIMD16 compile */
3696 v2.import_uniforms(&v);
3697 if (!v2.run()) {
3698 perf_debug("SIMD16 shader failed to compile, falling back to "
3699 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3700 } else {
3701 simd16_cfg = v2.cfg;
3702 }
3703 } else {
3704 perf_debug("SIMD16 shader unsupported, falling back to "
3705 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3706 }
3707 }
3708
3709 cfg_t *simd8_cfg;
3710 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3711 if (no_simd8 && simd16_cfg) {
3712 simd8_cfg = NULL;
3713 prog_data->no_8 = true;
3714 } else {
3715 simd8_cfg = v.cfg;
3716 prog_data->no_8 = false;
3717 }
3718
3719 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base, prog, &fp->Base,
3720 v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3721 if (simd8_cfg)
3722 g.generate_code(simd8_cfg, 8);
3723 if (simd16_cfg)
3724 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
3725
3726 if (unlikely(brw->perf_debug) && shader) {
3727 if (shader->compiled_once)
3728 brw_wm_debug_recompile(brw, prog, key);
3729 shader->compiled_once = true;
3730
3731 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3732 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3733 (get_time() - start_time) * 1000);
3734 }
3735 }
3736
3737 return g.get_assembly(final_assembly_size);
3738 }
3739
3740 extern "C" bool
3741 brw_fs_precompile(struct gl_context *ctx,
3742 struct gl_shader_program *shader_prog,
3743 struct gl_program *prog)
3744 {
3745 struct brw_context *brw = brw_context(ctx);
3746 struct brw_wm_prog_key key;
3747
3748 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
3749 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3750 bool program_uses_dfdy = fp->UsesDFdy;
3751
3752 memset(&key, 0, sizeof(key));
3753
3754 if (brw->gen < 6) {
3755 if (fp->UsesKill)
3756 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3757
3758 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3759 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3760
3761 /* Just assume depth testing. */
3762 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3763 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3764 }
3765
3766 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3767 BRW_FS_VARYING_INPUT_MASK) > 16)
3768 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3769
3770 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3771 for (unsigned i = 0; i < sampler_count; i++) {
3772 if (fp->Base.ShadowSamplers & (1 << i)) {
3773 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3774 key.tex.swizzles[i] =
3775 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3776 } else {
3777 /* Color sampler: assume no swizzling. */
3778 key.tex.swizzles[i] = SWIZZLE_XYZW;
3779 }
3780 }
3781
3782 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3783 key.drawable_height = ctx->DrawBuffer->Height;
3784 }
3785
3786 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3787 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3788 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3789
3790 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3791 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3792 key.nr_color_regions > 1;
3793 }
3794
3795 key.program_string_id = bfp->id;
3796
3797 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3798 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3799
3800 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
3801
3802 brw->wm.base.prog_offset = old_prog_offset;
3803 brw->wm.prog_data = old_prog_data;
3804
3805 return success;
3806 }