i965: Set prog_data->uses_kill if simulating alpha test via discards.
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_cfg.h"
50 #include "brw_dead_control_flow.h"
51 #include "main/uniforms.h"
52 #include "brw_fs_live_variables.h"
53 #include "glsl/glsl_types.h"
54
55 void
56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
57 fs_reg *src, int sources)
58 {
59 memset(this, 0, sizeof(*this));
60
61 this->opcode = opcode;
62 this->dst = dst;
63 this->src = src;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (int i = 0; i < sources; ++i) {
79 if (src[i].file != GRF)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (int i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 assert(this->src[i].width > 0);
101 if (this->src[i].width == 1) {
102 this->src[i].effective_width = this->exec_size;
103 } else {
104 this->src[i].effective_width = this->src[i].width;
105 }
106 break;
107 case IMM:
108 case UNIFORM:
109 this->src[i].effective_width = this->exec_size;
110 break;
111 default:
112 unreachable("Invalid source register file");
113 }
114 }
115 this->dst.effective_width = this->exec_size;
116
117 this->conditional_mod = BRW_CONDITIONAL_NONE;
118
119 /* This will be the case for almost all instructions. */
120 switch (dst.file) {
121 case GRF:
122 case HW_REG:
123 case MRF:
124 this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
125 break;
126 case BAD_FILE:
127 this->regs_written = 0;
128 break;
129 case IMM:
130 case UNIFORM:
131 unreachable("Invalid destination register file");
132 default:
133 unreachable("Invalid register file");
134 }
135
136 this->writes_accumulator = false;
137 }
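/* Editor's note, an illustrative example that is not part of the original
 * source: given a GRF destination of width 16, stride 1 and float type,
 * passing exec_size == 0 to init() infers exec_size = 16 from dst.width,
 * and regs_written = (16 * 1 * 4 + 31) / 32 = 2 full registers.
 */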
138
139 fs_inst::fs_inst()
140 {
141 fs_reg *src = ralloc_array(this, fs_reg, 3);
142 init(BRW_OPCODE_NOP, 8, dst, src, 0);
143 }
144
145 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
146 {
147 fs_reg *src = ralloc_array(this, fs_reg, 3);
148 init(opcode, exec_size, reg_undef, src, 0);
149 }
150
151 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
152 {
153 fs_reg *src = ralloc_array(this, fs_reg, 3);
154 init(opcode, 0, dst, src, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 fs_reg *src = ralloc_array(this, fs_reg, 3);
161 src[0] = src0;
162 init(opcode, exec_size, dst, src, 1);
163 }
164
165 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
166 {
167 fs_reg *src = ralloc_array(this, fs_reg, 3);
168 src[0] = src0;
169 init(opcode, 0, dst, src, 1);
170 }
171
172 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
173 const fs_reg &src0, const fs_reg &src1)
174 {
175 fs_reg *src = ralloc_array(this, fs_reg, 3);
176 src[0] = src0;
177 src[1] = src1;
178 init(opcode, exec_size, dst, src, 2);
179 }
180
181 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
182 const fs_reg &src1)
183 {
184 fs_reg *src = ralloc_array(this, fs_reg, 3);
185 src[0] = src0;
186 src[1] = src1;
187 init(opcode, 0, dst, src, 2);
188 }
189
190 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
191 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
192 {
193 fs_reg *src = ralloc_array(this, fs_reg, 3);
194 src[0] = src0;
195 src[1] = src1;
196 src[2] = src2;
197 init(opcode, exec_size, dst, src, 3);
198 }
199
200 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
201 const fs_reg &src1, const fs_reg &src2)
202 {
203 fs_reg *src = ralloc_array(this, fs_reg, 3);
204 src[0] = src0;
205 src[1] = src1;
206 src[2] = src2;
207 init(opcode, 0, dst, src, 3);
208 }
209
210 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
211 {
212 init(opcode, 0, dst, src, sources);
213 }
214
215 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
216 fs_reg src[], int sources)
217 {
218 init(opcode, exec_width, dst, src, sources);
219 }
220
221 fs_inst::fs_inst(const fs_inst &that)
222 {
223 memcpy(this, &that, sizeof(that));
224
225 this->src = ralloc_array(this, fs_reg, that.sources);
226
227 for (int i = 0; i < that.sources; i++)
228 this->src[i] = that.src[i];
229 }
230
231 void
232 fs_inst::resize_sources(uint8_t num_sources)
233 {
234 if (this->sources != num_sources) {
235 this->src = reralloc(this, this->src, fs_reg, num_sources);
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
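/* Editor's sketch, assuming ordinary preprocessor expansion (not original
 * source text): ALU2(ADD) above expands to roughly
 *
 *    fs_inst *
 *    fs_visitor::ADD(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * so each of these one-line invocations defines a small emitter helper that
 * constructs the corresponding opcode.
 */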
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(brw->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 * gen5 does the comparison on the execution type (resolved source types),
341 * so dst type doesn't matter. gen6 does comparison and then uses the
342 * result as if it was the dst type with no conversion, which happens to
343 * mostly work out for float-interpreted-as-int since our comparisons are
344 * for >0, =0, <0.
345 */
346 if (brw->gen == 4) {
347 dst.type = src0.type;
348 if (dst.file == HW_REG)
349 dst.fixed_hw_reg.type = dst.type;
350 }
351
352 resolve_ud_negate(&src0);
353 resolve_ud_negate(&src1);
354
355 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
356 inst->conditional_mod = condition;
357
358 return inst;
359 }
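/* Editor's note, a hypothetical usage sketch (not original source): a typical
 * caller emits the comparison only for its flag side effect, e.g.
 *
 *    emit(CMP(reg_null_f, value, fs_reg(0.0f), BRW_CONDITIONAL_NZ));
 *
 * and then predicates a following instruction on BRW_PREDICATE_NORMAL, which
 * consumes the per-channel bits CMP packed into the flag register.
 */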
360
361 fs_inst *
362 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
363 {
364 uint8_t exec_size = dst.width;
365 for (int i = 0; i < sources; ++i) {
366 assert(src[i].width % dst.width == 0);
367 if (src[i].width > exec_size)
368 exec_size = src[i].width;
369 }
370
371 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
372 dst, src, sources);
373 inst->regs_written = 0;
374 for (int i = 0; i < sources; ++i) {
375 /* The LOAD_PAYLOAD instruction only really makes sense if we are
376 * dealing with whole registers. If this ever changes, we can deal
377 * with it later.
378 */
379 int size = src[i].effective_width * type_sz(src[i].type);
380 assert(size % 32 == 0);
381 inst->regs_written += (size + 31) / 32;
382 }
383
384 return inst;
385 }
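/* Editor's note (illustrative, not original source): for two SIMD8 float
 * sources, each has effective_width 8, so size = 8 * 4 = 32 bytes per source
 * and the loop above accumulates regs_written = 1 + 1 = 2, i.e. the payload
 * occupies two whole registers, as the assert(size % 32 == 0) requires.
 */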
386
387 exec_list
388 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
389 const fs_reg &surf_index,
390 const fs_reg &varying_offset,
391 uint32_t const_offset)
392 {
393 exec_list instructions;
394 fs_inst *inst;
395
396 /* We have our constant surface use a pitch of 4 bytes, so our index can
397 * be any component of a vector, and then we load 4 contiguous
398 * components starting from that.
399 *
400 * We break down the const_offset to a portion added to the variable
401 * offset and a portion done using reg_offset, which means that if you
402 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
403 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
404 * CSE can later notice that those loads are all the same and eliminate
405 * the redundant ones.
406 */
407 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
408 instructions.push_tail(ADD(vec4_offset,
409 varying_offset, fs_reg(const_offset & ~3)));
410
411 int scale = 1;
412 if (brw->gen == 4 && dst.width == 8) {
413 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
414 * u, v, r) as parameters, or we can just use the SIMD16 message
415 * consisting of (header, u). We choose the second, at the cost of a
416 * longer return length.
417 */
418 scale = 2;
419 }
420
421 enum opcode op;
422 if (brw->gen >= 7)
423 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
424 else
425 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
426
427 assert(dst.width % 8 == 0);
428 int regs_written = 4 * (dst.width / 8) * scale;
429 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
430 dst.type, dst.width);
431 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
432 inst->regs_written = regs_written;
433 instructions.push_tail(inst);
434
435 if (brw->gen < 7) {
436 inst->base_mrf = 13;
437 inst->header_present = true;
438 if (brw->gen == 4)
439 inst->mlen = 3;
440 else
441 inst->mlen = 1 + dispatch_width / 8;
442 }
443
444 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
445 instructions.push_tail(MOV(dst, result));
446
447 return instructions;
448 }
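/* Editor's note (worked example, not original source): with const_offset = 14,
 * the ADD above folds (14 & ~3) = 12 into the variable offset, the pull
 * constant load fetches four contiguous components from there, and the
 * trailing MOV selects component (14 & 3) = 2 of the returned vec4 (scaled by
 * 'scale' in the gen4 SIMD8 case).
 */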
449
450 /**
451 * A helper for MOV generation for fixing up broken hardware SEND dependency
452 * handling.
453 */
454 fs_inst *
455 fs_visitor::DEP_RESOLVE_MOV(int grf)
456 {
457 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
458
459 inst->ir = NULL;
460 inst->annotation = "send dependency resolve";
461
462 /* The caller always wants an uncompressed (SIMD8) MOV, to emit the minimal
463 * extra dependencies and to avoid having to deal with aligning its regs to 2.
464 */
465 inst->exec_size = 8;
466
467 return inst;
468 }
469
470 bool
471 fs_inst::equals(fs_inst *inst) const
472 {
473 return (opcode == inst->opcode &&
474 dst.equals(inst->dst) &&
475 src[0].equals(inst->src[0]) &&
476 src[1].equals(inst->src[1]) &&
477 src[2].equals(inst->src[2]) &&
478 saturate == inst->saturate &&
479 predicate == inst->predicate &&
480 conditional_mod == inst->conditional_mod &&
481 mlen == inst->mlen &&
482 base_mrf == inst->base_mrf &&
483 target == inst->target &&
484 eot == inst->eot &&
485 header_present == inst->header_present &&
486 shadow_compare == inst->shadow_compare &&
487 exec_size == inst->exec_size &&
488 offset == inst->offset);
489 }
490
491 bool
492 fs_inst::overwrites_reg(const fs_reg &reg) const
493 {
494 return (reg.file == dst.file &&
495 reg.reg == dst.reg &&
496 reg.reg_offset >= dst.reg_offset &&
497 reg.reg_offset < dst.reg_offset + regs_written);
498 }
499
500 bool
501 fs_inst::is_send_from_grf() const
502 {
503 switch (opcode) {
504 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
505 case SHADER_OPCODE_SHADER_TIME_ADD:
506 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
507 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
508 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
509 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
510 case SHADER_OPCODE_UNTYPED_ATOMIC:
511 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
512 return true;
513 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
514 return src[1].file == GRF;
515 case FS_OPCODE_FB_WRITE:
516 return src[0].file == GRF;
517 default:
518 if (is_tex())
519 return src[0].file == GRF;
520
521 return false;
522 }
523 }
524
525 bool
526 fs_inst::can_do_source_mods(struct brw_context *brw)
527 {
528 if (brw->gen == 6 && is_math())
529 return false;
530
531 if (is_send_from_grf())
532 return false;
533
534 if (!backend_instruction::can_do_source_mods())
535 return false;
536
537 return true;
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Vector float immediate value constructor. */
585 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
586 {
587 init();
588 this->file = IMM;
589 this->type = BRW_REGISTER_TYPE_VF;
590 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
591 (vf1 << 8) |
592 (vf2 << 16) |
593 (vf3 << 24);
594 }
595
596 /** Fixed brw_reg. */
597 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
598 {
599 init();
600 this->file = HW_REG;
601 this->fixed_hw_reg = fixed_hw_reg;
602 this->type = fixed_hw_reg.type;
603 this->width = 1 << fixed_hw_reg.width;
604 }
605
606 bool
607 fs_reg::equals(const fs_reg &r) const
608 {
609 return (file == r.file &&
610 reg == r.reg &&
611 reg_offset == r.reg_offset &&
612 subreg_offset == r.subreg_offset &&
613 type == r.type &&
614 negate == r.negate &&
615 abs == r.abs &&
616 !reladdr && !r.reladdr &&
617 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
618 width == r.width &&
619 stride == r.stride);
620 }
621
622 fs_reg &
623 fs_reg::set_smear(unsigned subreg)
624 {
625 assert(file != HW_REG && file != IMM);
626 subreg_offset = subreg * type_sz(type);
627 stride = 0;
628 return *this;
629 }
630
631 bool
632 fs_reg::is_contiguous() const
633 {
634 return stride == 1;
635 }
636
637 int
638 fs_visitor::type_size(const struct glsl_type *type)
639 {
640 unsigned int size, i;
641
642 switch (type->base_type) {
643 case GLSL_TYPE_UINT:
644 case GLSL_TYPE_INT:
645 case GLSL_TYPE_FLOAT:
646 case GLSL_TYPE_BOOL:
647 return type->components();
648 case GLSL_TYPE_ARRAY:
649 return type_size(type->fields.array) * type->length;
650 case GLSL_TYPE_STRUCT:
651 size = 0;
652 for (i = 0; i < type->length; i++) {
653 size += type_size(type->fields.structure[i].type);
654 }
655 return size;
656 case GLSL_TYPE_SAMPLER:
657 /* Samplers take up no register space, since they're baked in at
658 * link time.
659 */
660 return 0;
661 case GLSL_TYPE_ATOMIC_UINT:
662 return 0;
663 case GLSL_TYPE_IMAGE:
664 case GLSL_TYPE_VOID:
665 case GLSL_TYPE_ERROR:
666 case GLSL_TYPE_INTERFACE:
667 unreachable("not reached");
668 }
669
670 return 0;
671 }
672
673 fs_reg
674 fs_visitor::get_timestamp()
675 {
676 assert(brw->gen >= 7);
677
678 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
679 BRW_ARF_TIMESTAMP,
680 0),
681 BRW_REGISTER_TYPE_UD));
682
683 fs_reg dst = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 4);
684
685 fs_inst *mov = emit(MOV(dst, ts));
686 /* We want to read the 3 fields we care about even if it's not enabled in
687 * the dispatch.
688 */
689 mov->force_writemask_all = true;
690
691 /* The caller wants the low 32 bits of the timestamp. Since it's running
692 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
693 * which is plenty of time for our purposes. It is identical across the
694 * EUs, but since it's tracking GPU core speed it will increment at a
695 * varying rate as render P-states change.
696 *
697 * The caller could also check if render P-states have changed (or anything
698 * else that might disrupt timing) by setting smear to 2 and checking if
699 * that field is != 0.
700 */
701 dst.set_smear(0);
702
703 return dst;
704 }
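/* Editor's note, a back-of-the-envelope check that is not original source:
 * a 32-bit counter ticking at ~1.2 GHz wraps after 2^32 / 1.2e9, about 3.6
 * seconds, which matches the "~3 seconds" estimate in the comment above.
 */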
705
706 void
707 fs_visitor::emit_shader_time_begin()
708 {
709 current_annotation = "shader time start";
710 shader_start_time = get_timestamp();
711 }
712
713 void
714 fs_visitor::emit_shader_time_end()
715 {
716 current_annotation = "shader time end";
717
718 enum shader_time_shader_type type, written_type, reset_type;
719 if (dispatch_width == 8) {
720 type = ST_FS8;
721 written_type = ST_FS8_WRITTEN;
722 reset_type = ST_FS8_RESET;
723 } else {
724 assert(dispatch_width == 16);
725 type = ST_FS16;
726 written_type = ST_FS16_WRITTEN;
727 reset_type = ST_FS16_RESET;
728 }
729
730 fs_reg shader_end_time = get_timestamp();
731
732 /* Check that there weren't any timestamp reset events (assuming these
733 * were the only two timestamp reads that happened).
734 */
735 fs_reg reset = shader_end_time;
736 reset.set_smear(2);
737 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
738 test->conditional_mod = BRW_CONDITIONAL_Z;
739 emit(IF(BRW_PREDICATE_NORMAL));
740
741 fs_reg start = shader_start_time;
742 start.negate = true;
743 fs_reg diff = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 1);
744 emit(ADD(diff, start, shader_end_time));
745
746 /* If there were no instructions between the two timestamp gets, the diff
747 * is 2 cycles. Remove that overhead, so I can forget about that when
748 * trying to determine the time taken for single instructions.
749 */
750 emit(ADD(diff, diff, fs_reg(-2u)));
751
752 emit_shader_time_write(type, diff);
753 emit_shader_time_write(written_type, fs_reg(1u));
754 emit(BRW_OPCODE_ELSE);
755 emit_shader_time_write(reset_type, fs_reg(1u));
756 emit(BRW_OPCODE_ENDIF);
757 }
758
759 void
760 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
761 fs_reg value)
762 {
763 int shader_time_index =
764 brw_get_shader_time_index(brw, shader_prog, prog, type);
765 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
766
767 fs_reg payload;
768 if (dispatch_width == 8)
769 payload = fs_reg(this, glsl_type::uvec2_type);
770 else
771 payload = fs_reg(this, glsl_type::uint_type);
772
773 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
774 fs_reg(), payload, offset, value));
775 }
776
777 void
778 fs_visitor::vfail(const char *format, va_list va)
779 {
780 char *msg;
781
782 if (failed)
783 return;
784
785 failed = true;
786
787 msg = ralloc_vasprintf(mem_ctx, format, va);
788 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
789
790 this->fail_msg = msg;
791
792 if (INTEL_DEBUG & DEBUG_WM) {
793 fprintf(stderr, "%s", msg);
794 }
795 }
796
797 void
798 fs_visitor::fail(const char *format, ...)
799 {
800 va_list va;
801
802 va_start(va, format);
803 vfail(format, va);
804 va_end(va);
805 }
806
807 /**
808 * Mark this program as impossible to compile in SIMD16 mode.
809 *
810 * During the SIMD8 compile (which happens first), we can detect and flag
811 * things that are unsupported in SIMD16 mode, so the compiler can skip
812 * the SIMD16 compile altogether.
813 *
814 * During a SIMD16 compile (if one happens anyway), this just calls fail().
815 */
816 void
817 fs_visitor::no16(const char *format, ...)
818 {
819 va_list va;
820
821 va_start(va, format);
822
823 if (dispatch_width == 16) {
824 vfail(format, va);
825 } else {
826 simd16_unsupported = true;
827
828 if (brw->perf_debug) {
829 if (no16_msg)
830 ralloc_vasprintf_append(&no16_msg, format, va);
831 else
832 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
833 }
834 }
835
836 va_end(va);
837 }
838
839 fs_inst *
840 fs_visitor::emit(enum opcode opcode)
841 {
842 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
843 }
844
845 fs_inst *
846 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
847 {
848 return emit(new(mem_ctx) fs_inst(opcode, dst));
849 }
850
851 fs_inst *
852 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
853 {
854 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
855 }
856
857 fs_inst *
858 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
859 const fs_reg &src1)
860 {
861 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
862 }
863
864 fs_inst *
865 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
866 const fs_reg &src1, const fs_reg &src2)
867 {
868 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
869 }
870
871 fs_inst *
872 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
873 fs_reg src[], int sources)
874 {
875 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
876 }
877
878 /**
879 * Returns true if the instruction has a flag that means it won't
880 * update an entire destination register.
881 *
882 * For example, dead code elimination and live variable analysis want to know
883 * when a write to a variable screens off any preceding values that were in
884 * it.
885 */
886 bool
887 fs_inst::is_partial_write() const
888 {
889 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
890 (this->dst.width * type_sz(this->dst.type)) < 32 ||
891 !this->dst.is_contiguous());
892 }
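/* Editor's note (illustrative, not original source): a plain SIMD8 float MOV
 * writes dst.width * type_sz = 8 * 4 = 32 bytes, a full register, so it is
 * not a partial write; the same MOV under a non-SEL predicate, or with a
 * destination stride other than 1, is partial and therefore does not screen
 * off earlier definitions for dead code elimination.
 */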
893
894 int
895 fs_inst::regs_read(fs_visitor *v, int arg) const
896 {
897 if (is_tex() && arg == 0 && src[0].file == GRF) {
898 return mlen;
899 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
900 return mlen;
901 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
902 return mlen;
903 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
904 return mlen;
905 }
906
907 switch (src[arg].file) {
908 case BAD_FILE:
909 case UNIFORM:
910 case IMM:
911 return 1;
912 case GRF:
913 case HW_REG:
914 if (src[arg].stride == 0) {
915 return 1;
916 } else {
917 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
918 return (size + 31) / 32;
919 }
920 case MRF:
921 unreachable("MRF registers are not allowed as sources");
922 default:
923 unreachable("Invalid register file");
924 }
925 }
926
927 bool
928 fs_inst::reads_flag() const
929 {
930 return predicate;
931 }
932
933 bool
934 fs_inst::writes_flag() const
935 {
936 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
937 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
938 }
939
940 /**
941 * Returns how many MRFs an FS opcode will write over.
942 *
943 * Note that this is not the 0 or 1 implied writes in an actual gen
944 * instruction -- the FS opcodes often generate MOVs in addition.
945 */
946 int
947 fs_visitor::implied_mrf_writes(fs_inst *inst)
948 {
949 if (inst->mlen == 0)
950 return 0;
951
952 if (inst->base_mrf == -1)
953 return 0;
954
955 switch (inst->opcode) {
956 case SHADER_OPCODE_RCP:
957 case SHADER_OPCODE_RSQ:
958 case SHADER_OPCODE_SQRT:
959 case SHADER_OPCODE_EXP2:
960 case SHADER_OPCODE_LOG2:
961 case SHADER_OPCODE_SIN:
962 case SHADER_OPCODE_COS:
963 return 1 * dispatch_width / 8;
964 case SHADER_OPCODE_POW:
965 case SHADER_OPCODE_INT_QUOTIENT:
966 case SHADER_OPCODE_INT_REMAINDER:
967 return 2 * dispatch_width / 8;
968 case SHADER_OPCODE_TEX:
969 case FS_OPCODE_TXB:
970 case SHADER_OPCODE_TXD:
971 case SHADER_OPCODE_TXF:
972 case SHADER_OPCODE_TXF_CMS:
973 case SHADER_OPCODE_TXF_MCS:
974 case SHADER_OPCODE_TG4:
975 case SHADER_OPCODE_TG4_OFFSET:
976 case SHADER_OPCODE_TXL:
977 case SHADER_OPCODE_TXS:
978 case SHADER_OPCODE_LOD:
979 return 1;
980 case FS_OPCODE_FB_WRITE:
981 return 2;
982 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
983 case SHADER_OPCODE_GEN4_SCRATCH_READ:
984 return 1;
985 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
986 return inst->mlen;
987 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
988 return 2;
989 case SHADER_OPCODE_UNTYPED_ATOMIC:
990 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
991 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
992 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
993 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
994 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
995 return 0;
996 default:
997 unreachable("not reached");
998 }
999 }
1000
1001 int
1002 fs_visitor::virtual_grf_alloc(int size)
1003 {
1004 if (virtual_grf_array_size <= virtual_grf_count) {
1005 if (virtual_grf_array_size == 0)
1006 virtual_grf_array_size = 16;
1007 else
1008 virtual_grf_array_size *= 2;
1009 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
1010 virtual_grf_array_size);
1011 }
1012 virtual_grf_sizes[virtual_grf_count] = size;
1013 return virtual_grf_count++;
1014 }
1015
1016 /** Fixed HW reg constructor. */
1017 fs_reg::fs_reg(enum register_file file, int reg)
1018 {
1019 init();
1020 this->file = file;
1021 this->reg = reg;
1022 this->type = BRW_REGISTER_TYPE_F;
1023
1024 switch (file) {
1025 case UNIFORM:
1026 this->width = 1;
1027 break;
1028 default:
1029 this->width = 8;
1030 }
1031 }
1032
1033 /** Fixed HW reg constructor. */
1034 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1035 {
1036 init();
1037 this->file = file;
1038 this->reg = reg;
1039 this->type = type;
1040
1041 switch (file) {
1042 case UNIFORM:
1043 this->width = 1;
1044 break;
1045 default:
1046 this->width = 8;
1047 }
1048 }
1049
1050 /** Fixed HW reg constructor. */
1051 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1052 uint8_t width)
1053 {
1054 init();
1055 this->file = file;
1056 this->reg = reg;
1057 this->type = type;
1058 this->width = width;
1059 }
1060
1061 /** Automatic reg constructor. */
1062 fs_reg::fs_reg(fs_visitor *v, const struct glsl_type *type)
1063 {
1064 init();
1065 int reg_width = v->dispatch_width / 8;
1066
1067 this->file = GRF;
1068 this->reg = v->virtual_grf_alloc(v->type_size(type) * reg_width);
1069 this->reg_offset = 0;
1070 this->type = brw_type_for_base_type(type);
1071 this->width = v->dispatch_width;
1072 assert(this->width == 8 || this->width == 16);
1073 }
1074
1075 fs_reg *
1076 fs_visitor::variable_storage(ir_variable *var)
1077 {
1078 return (fs_reg *)hash_table_find(this->variable_ht, var);
1079 }
1080
1081 void
1082 import_uniforms_callback(const void *key,
1083 void *data,
1084 void *closure)
1085 {
1086 struct hash_table *dst_ht = (struct hash_table *)closure;
1087 const fs_reg *reg = (const fs_reg *)data;
1088
1089 if (reg->file != UNIFORM)
1090 return;
1091
1092 hash_table_insert(dst_ht, data, key);
1093 }
1094
1095 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1096 * This brings in those uniform definitions.
1097 */
1098 void
1099 fs_visitor::import_uniforms(fs_visitor *v)
1100 {
1101 hash_table_call_foreach(v->variable_ht,
1102 import_uniforms_callback,
1103 variable_ht);
1104 this->push_constant_loc = v->push_constant_loc;
1105 this->pull_constant_loc = v->pull_constant_loc;
1106 this->uniforms = v->uniforms;
1107 this->param_size = v->param_size;
1108 }
1109
1110 /* Our support for uniforms is piggy-backed on the struct
1111 * gl_fragment_program, because that's where the values actually
1112 * get stored, rather than in some global gl_shader_program uniform
1113 * store.
1114 */
1115 void
1116 fs_visitor::setup_uniform_values(ir_variable *ir)
1117 {
1118 int namelen = strlen(ir->name);
1119
1120 /* The data for our (non-builtin) uniforms is stored in a series of
1121 * gl_uniform_driver_storage structs for each subcomponent that
1122 * glGetUniformLocation() could name. We know it's been set up in the same
1123 * order we'd walk the type, so walk the list of storage and find anything
1124 * with our name, or the prefix of a component that starts with our name.
1125 */
1126 unsigned params_before = uniforms;
1127 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1128 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1129
1130 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1131 (storage->name[namelen] != 0 &&
1132 storage->name[namelen] != '.' &&
1133 storage->name[namelen] != '[')) {
1134 continue;
1135 }
1136
1137 unsigned slots = storage->type->component_slots();
1138 if (storage->array_elements)
1139 slots *= storage->array_elements;
1140
1141 for (unsigned i = 0; i < slots; i++) {
1142 stage_prog_data->param[uniforms++] = &storage->storage[i];
1143 }
1144 }
1145
1146 /* Make sure we actually initialized the right amount of stuff here. */
1147 assert(params_before + ir->type->component_slots() == uniforms);
1148 (void)params_before;
1149 }
1150
1151
1152 /* Our support for builtin uniforms is even scarier than non-builtin.
1153 * It sits on top of the PROG_STATE_VAR parameters that are
1154 * automatically updated from GL context state.
1155 */
1156 void
1157 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1158 {
1159 const ir_state_slot *const slots = ir->get_state_slots();
1160 assert(slots != NULL);
1161
1162 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1163 /* This state reference has already been setup by ir_to_mesa, but we'll
1164 * get the same index back here.
1165 */
1166 int index = _mesa_add_state_reference(this->prog->Parameters,
1167 (gl_state_index *)slots[i].tokens);
1168
1169 /* Add each of the unique swizzles of the element as a parameter.
1170 * This'll end up matching the expected layout of the
1171 * array/matrix/structure we're trying to fill in.
1172 */
1173 int last_swiz = -1;
1174 for (unsigned int j = 0; j < 4; j++) {
1175 int swiz = GET_SWZ(slots[i].swizzle, j);
1176 if (swiz == last_swiz)
1177 break;
1178 last_swiz = swiz;
1179
1180 stage_prog_data->param[uniforms++] =
1181 &prog->Parameters->ParameterValues[index][swiz];
1182 }
1183 }
1184 }
1185
1186 fs_reg *
1187 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1188 {
1189 assert(stage == MESA_SHADER_FRAGMENT);
1190 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1191 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1192 fs_reg wpos = *reg;
1193 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1194
1195 /* gl_FragCoord.x */
1196 if (ir->data.pixel_center_integer) {
1197 emit(MOV(wpos, this->pixel_x));
1198 } else {
1199 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1200 }
1201 wpos = offset(wpos, 1);
1202
1203 /* gl_FragCoord.y */
1204 if (!flip && ir->data.pixel_center_integer) {
1205 emit(MOV(wpos, this->pixel_y));
1206 } else {
1207 fs_reg pixel_y = this->pixel_y;
1208 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1209
1210 if (flip) {
1211 pixel_y.negate = true;
1212 offset += key->drawable_height - 1.0;
1213 }
1214
1215 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1216 }
1217 wpos = offset(wpos, 1);
1218
1219 /* gl_FragCoord.z */
1220 if (brw->gen >= 6) {
1221 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1222 } else {
1223 emit(FS_OPCODE_LINTERP, wpos,
1224 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1225 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1226 interp_reg(VARYING_SLOT_POS, 2));
1227 }
1228 wpos = offset(wpos, 1);
1229
1230 /* gl_FragCoord.w: Already set up in emit_interpolation */
1231 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1232
1233 return reg;
1234 }
1235
1236 fs_inst *
1237 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1238 glsl_interp_qualifier interpolation_mode,
1239 bool is_centroid, bool is_sample)
1240 {
1241 brw_wm_barycentric_interp_mode barycoord_mode;
1242 if (brw->gen >= 6) {
1243 if (is_centroid) {
1244 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1245 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1246 else
1247 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1248 } else if (is_sample) {
1249 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1250 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1251 else
1252 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1253 } else {
1254 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1255 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1256 else
1257 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1258 }
1259 } else {
1260 /* On Ironlake and below, there is only one interpolation mode.
1261 * Centroid interpolation doesn't mean anything on this hardware --
1262 * there is no multisampling.
1263 */
1264 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1265 }
1266 return emit(FS_OPCODE_LINTERP, attr,
1267 this->delta_x[barycoord_mode],
1268 this->delta_y[barycoord_mode], interp);
1269 }
1270
1271 fs_reg *
1272 fs_visitor::emit_general_interpolation(ir_variable *ir)
1273 {
1274 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1275 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1276 fs_reg attr = *reg;
1277
1278 assert(stage == MESA_SHADER_FRAGMENT);
1279 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1280 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1281
1282 unsigned int array_elements;
1283 const glsl_type *type;
1284
1285 if (ir->type->is_array()) {
1286 array_elements = ir->type->length;
1287 if (array_elements == 0) {
1288 fail("dereferenced array '%s' has length 0\n", ir->name);
1289 }
1290 type = ir->type->fields.array;
1291 } else {
1292 array_elements = 1;
1293 type = ir->type;
1294 }
1295
1296 glsl_interp_qualifier interpolation_mode =
1297 ir->determine_interpolation_mode(key->flat_shade);
1298
1299 int location = ir->data.location;
1300 for (unsigned int i = 0; i < array_elements; i++) {
1301 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1302 if (prog_data->urb_setup[location] == -1) {
1303 /* If there's no incoming setup data for this slot, don't
1304 * emit interpolation for it.
1305 */
1306 attr = offset(attr, type->vector_elements);
1307 location++;
1308 continue;
1309 }
1310
1311 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1312 /* Constant interpolation (flat shading) case. The SF has
1313 * handed us defined values in only the constant offset
1314 * field of the setup reg.
1315 */
1316 for (unsigned int k = 0; k < type->vector_elements; k++) {
1317 struct brw_reg interp = interp_reg(location, k);
1318 interp = suboffset(interp, 3);
1319 interp.type = reg->type;
1320 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1321 attr = offset(attr, 1);
1322 }
1323 } else {
1324 /* Smooth/noperspective interpolation case. */
1325 for (unsigned int k = 0; k < type->vector_elements; k++) {
1326 struct brw_reg interp = interp_reg(location, k);
1327 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1328 /* Get the pixel/sample mask into f0 so that we know
1329 * which pixels are lit. Then, for each channel that is
1330 * unlit, replace the centroid data with non-centroid
1331 * data.
1332 */
1333 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1334
1335 fs_inst *inst;
1336 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1337 false, false);
1338 inst->predicate = BRW_PREDICATE_NORMAL;
1339 inst->predicate_inverse = true;
1340 if (brw->has_pln)
1341 inst->no_dd_clear = true;
1342
1343 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1344 ir->data.centroid && !key->persample_shading,
1345 ir->data.sample || key->persample_shading);
1346 inst->predicate = BRW_PREDICATE_NORMAL;
1347 inst->predicate_inverse = false;
1348 if (brw->has_pln)
1349 inst->no_dd_check = true;
1350
1351 } else {
1352 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1353 ir->data.centroid && !key->persample_shading,
1354 ir->data.sample || key->persample_shading);
1355 }
1356 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1357 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1358 }
1359 attr = offset(attr, 1);
1360 }
1361
1362 }
1363 location++;
1364 }
1365 }
1366
1367 return reg;
1368 }
1369
1370 fs_reg *
1371 fs_visitor::emit_frontfacing_interpolation()
1372 {
1373 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::bool_type);
1374
1375 if (brw->gen >= 6) {
1376 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1377 * a boolean result from this (~0/true or 0/false).
1378 *
1379 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1380 * this task in only one instruction:
1381 * - a negation source modifier will flip the bit; and
1382 * - a W -> D type conversion will sign extend the bit into the high
1383 * word of the destination.
1384 *
1385 * An ASR 15 fills the low word of the destination.
1386 */
1387 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1388 g0.negate = true;
1389
1390 emit(ASR(*reg, g0, fs_reg(15)));
1391 } else {
1392 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1393 * a boolean result from this (1/true or 0/false).
1394 *
1395 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1396 * the negation source modifier to flip it. Unfortunately the SHR
1397 * instruction only operates on UD (or D with an abs source modifier)
1398 * sources without negation.
1399 *
1400 * Instead, use ASR (which will give ~0/true or 0/false) followed by an
1401 * AND 1.
1402 */
1403 fs_reg asr = fs_reg(this, glsl_type::bool_type);
1404 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1405 g1_6.negate = true;
1406
1407 emit(ASR(asr, g1_6, fs_reg(31)));
1408 emit(AND(*reg, asr, fs_reg(1)));
1409 }
1410
1411 return reg;
1412 }
1413
1414 void
1415 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1416 {
1417 assert(stage == MESA_SHADER_FRAGMENT);
1418 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1419 assert(dst.type == BRW_REGISTER_TYPE_F);
1420
1421 if (key->compute_pos_offset) {
1422 /* Convert int_sample_pos to floating point */
1423 emit(MOV(dst, int_sample_pos));
1424 /* Scale to the range [0, 1] */
1425 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1426 }
1427 else {
1428 /* From ARB_sample_shading specification:
1429 * "When rendering to a non-multisample buffer, or if multisample
1430 * rasterization is disabled, gl_SamplePosition will always be
1431 * (0.5, 0.5)."
1432 */
1433 emit(MOV(dst, fs_reg(0.5f)));
1434 }
1435 }
1436
1437 fs_reg *
1438 fs_visitor::emit_samplepos_setup()
1439 {
1440 assert(brw->gen >= 6);
1441
1442 this->current_annotation = "compute sample position";
1443 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::vec2_type);
1444 fs_reg pos = *reg;
1445 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1446 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1447
1448 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1449 * mode will be enabled.
1450 *
1451 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1452 * R31.1:0 Position Offset X/Y for Slot[3:0]
1453 * R31.3:2 Position Offset X/Y for Slot[7:4]
1454 * .....
1455 *
1456 * The X, Y sample positions come in as bytes in thread payload. So, read
1457 * the positions using vstride=16, width=8, hstride=2.
1458 */
1459 struct brw_reg sample_pos_reg =
1460 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1461 BRW_REGISTER_TYPE_B), 16, 8, 2);
1462
1463 if (dispatch_width == 8) {
1464 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1465 } else {
1466 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1467 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1468 ->force_sechalf = true;
1469 }
1470 /* Compute gl_SamplePosition.x */
1471 compute_sample_position(pos, int_sample_x);
1472 pos = offset(pos, 1);
1473 if (dispatch_width == 8) {
1474 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1475 } else {
1476 emit(MOV(half(int_sample_y, 0),
1477 fs_reg(suboffset(sample_pos_reg, 1))));
1478 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1479 ->force_sechalf = true;
1480 }
1481 /* Compute gl_SamplePosition.y */
1482 compute_sample_position(pos, int_sample_y);
1483 return reg;
1484 }
1485
1486 fs_reg *
1487 fs_visitor::emit_sampleid_setup()
1488 {
1489 assert(stage == MESA_SHADER_FRAGMENT);
1490 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1491 assert(brw->gen >= 6);
1492
1493 this->current_annotation = "compute sample id";
1494 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::int_type);
1495
1496 if (key->compute_sample_id) {
1497 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1498 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1499 t2.type = BRW_REGISTER_TYPE_UW;
1500
1501 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1502 * 8x multisampling, subspan 0 will represent sample N (where N
1503 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1504 * 7. We can find the value of N by looking at R0.0 bits 7:6
1505 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1506 * (since samples are always delivered in pairs). That is, we
1507 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1508 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1509 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1510 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1511 * populating a temporary variable with the sequence (0, 1, 2, 3),
1512 * and then reading from it using vstride=1, width=4, hstride=0.
1513 * These computations hold good for 4x multisampling as well.
1514 *
1515 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1516 * the first four slots are sample 0 of subspan 0; the next four
1517 * are sample 1 of subspan 0; the third group is sample 0 of
1518 * subspan 1, and finally sample 1 of subspan 1.
1519 */
1520 fs_inst *inst;
1521 inst = emit(BRW_OPCODE_AND, t1,
1522 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1523 fs_reg(0xc0));
1524 inst->force_writemask_all = true;
1525 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1526 inst->force_writemask_all = true;
1527 /* This works for both SIMD8 and SIMD16 */
1528 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1529 inst->force_writemask_all = true;
1530 /* This special instruction takes care of setting vstride=1,
1531 * width=4, hstride=0 of t2 during an ADD instruction.
1532 */
1533 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1534 } else {
1535 /* As per GL_ARB_sample_shading specification:
1536 * "When rendering to a non-multisample buffer, or if multisample
1537 * rasterization is disabled, gl_SampleID will always be zero."
1538 */
1539 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1540 }
1541
1542 return reg;
1543 }
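/* Editor's note (worked example, not original source): if R0.0 bits 7:6 hold
 * SSPI = 2, then (R0.0 & 0xc0) >> 5 = 0x80 >> 5 = 4 = 2 * SSPI.  Adding the
 * 0x3210 vector, read so that each element repeats four times, gives sample
 * ids 4 and 5 for the two SIMD8 subspans (4, 5, 6, 7 across a SIMD16 run),
 * matching the description in the comment above.
 */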
1544
1545 fs_reg
1546 fs_visitor::fix_math_operand(fs_reg src)
1547 {
1548 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1549 * might be able to do better by doing execsize = 1 math and then
1550 * expanding that result out, but we would need to be careful with
1551 * masking.
1552 *
1553 * The hardware ignores source modifiers (negate and abs) on math
1554 * instructions, so we also move to a temp to set those up.
1555 */
1556 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1557 !src.abs && !src.negate)
1558 return src;
1559
1560 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1561 * operands to math
1562 */
1563 if (brw->gen >= 7 && src.file != IMM)
1564 return src;
1565
1566 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1567 expanded.type = src.type;
1568 emit(BRW_OPCODE_MOV, expanded, src);
1569 return expanded;
1570 }
1571
1572 fs_inst *
1573 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1574 {
1575 switch (opcode) {
1576 case SHADER_OPCODE_RCP:
1577 case SHADER_OPCODE_RSQ:
1578 case SHADER_OPCODE_SQRT:
1579 case SHADER_OPCODE_EXP2:
1580 case SHADER_OPCODE_LOG2:
1581 case SHADER_OPCODE_SIN:
1582 case SHADER_OPCODE_COS:
1583 break;
1584 default:
1585 unreachable("not reached: bad math opcode");
1586 }
1587
1588 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1589 * might be able to do better by doing execsize = 1 math and then
1590 * expanding that result out, but we would need to be careful with
1591 * masking.
1592 *
1593 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1594 * instructions, so we also move to a temp to set those up.
1595 */
1596 if (brw->gen == 6 || brw->gen == 7)
1597 src = fix_math_operand(src);
1598
1599 fs_inst *inst = emit(opcode, dst, src);
1600
1601 if (brw->gen < 6) {
1602 inst->base_mrf = 2;
1603 inst->mlen = dispatch_width / 8;
1604 }
1605
1606 return inst;
1607 }
1608
1609 fs_inst *
1610 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1611 {
1612 int base_mrf = 2;
1613 fs_inst *inst;
1614
1615 if (brw->gen >= 8) {
1616 inst = emit(opcode, dst, src0, src1);
1617 } else if (brw->gen >= 6) {
1618 src0 = fix_math_operand(src0);
1619 src1 = fix_math_operand(src1);
1620
1621 inst = emit(opcode, dst, src0, src1);
1622 } else {
1623 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1624 * "Message Payload":
1625 *
1626 * "Operand0[7]. For the INT DIV functions, this operand is the
1627 * denominator."
1628 * ...
1629 * "Operand1[7]. For the INT DIV functions, this operand is the
1630 * numerator."
1631 */
1632 bool is_int_div = opcode != SHADER_OPCODE_POW;
1633 fs_reg &op0 = is_int_div ? src1 : src0;
1634 fs_reg &op1 = is_int_div ? src0 : src1;
1635
1636 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1637 inst = emit(opcode, dst, op0, reg_null_f);
1638
1639 inst->base_mrf = base_mrf;
1640 inst->mlen = 2 * dispatch_width / 8;
1641 }
1642 return inst;
1643 }
1644
1645 void
1646 fs_visitor::assign_curb_setup()
1647 {
1648 if (dispatch_width == 8) {
1649 prog_data->dispatch_grf_start_reg = payload.num_regs;
1650 } else {
1651 assert(stage == MESA_SHADER_FRAGMENT);
1652 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1653 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1654 }
1655
1656 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1657
1658 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1659 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1660 for (unsigned int i = 0; i < inst->sources; i++) {
1661 if (inst->src[i].file == UNIFORM) {
1662 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1663 int constant_nr;
1664 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1665 constant_nr = push_constant_loc[uniform_nr];
1666 } else {
1667 /* Section 5.11 of the OpenGL 4.1 spec says:
1668 * "Out-of-bounds reads return undefined values, which include
1669 * values from other variables of the active program or zero."
1670 * Just return the first push constant.
1671 */
1672 constant_nr = 0;
1673 }
1674
1675 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1676 constant_nr / 8,
1677 constant_nr % 8);
1678
1679 inst->src[i].file = HW_REG;
1680 inst->src[i].fixed_hw_reg = byte_offset(
1681 retype(brw_reg, inst->src[i].type),
1682 inst->src[i].subreg_offset);
1683 }
1684 }
1685 }
1686 }
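/* Editor's note (illustrative, not original source): with the mapping above,
 * a push constant at constant_nr = 10 lands in GRF (payload.num_regs + 1),
 * component 2, since 10 / 8 = 1 and 10 % 8 = 2.
 */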
1687
1688 void
1689 fs_visitor::calculate_urb_setup()
1690 {
1691 assert(stage == MESA_SHADER_FRAGMENT);
1692 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1693 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1694
1695 memset(prog_data->urb_setup, -1,
1696 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1697
1698 int urb_next = 0;
1699 /* Figure out where each of the incoming setup attributes lands. */
1700 if (brw->gen >= 6) {
1701 if (_mesa_bitcount_64(prog->InputsRead &
1702 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1703 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1704 * first 16 varying inputs, so we can put them wherever we want.
1705 * Just put them in order.
1706 *
1707 * This is useful because it means that (a) inputs not used by the
1708 * fragment shader won't take up valuable register space, and (b) we
1709 * won't have to recompile the fragment shader if it gets paired with
1710 * a different vertex (or geometry) shader.
1711 */
1712 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1713 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1714 BITFIELD64_BIT(i)) {
1715 prog_data->urb_setup[i] = urb_next++;
1716 }
1717 }
1718 } else {
1719 /* We have enough input varyings that the SF/SBE pipeline stage can't
1720 * arbitrarily rearrange them to suit our whim; we have to put them
1721 * in an order that matches the output of the previous pipeline stage
1722 * (geometry or vertex shader).
1723 */
1724 struct brw_vue_map prev_stage_vue_map;
1725 brw_compute_vue_map(brw, &prev_stage_vue_map,
1726 key->input_slots_valid);
1727 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1728 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1729 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1730 slot++) {
1731 int varying = prev_stage_vue_map.slot_to_varying[slot];
1732 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1733 * unused.
1734 */
1735 if (varying != BRW_VARYING_SLOT_COUNT &&
1736 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1737 BITFIELD64_BIT(varying))) {
1738 prog_data->urb_setup[varying] = slot - first_slot;
1739 }
1740 }
1741 urb_next = prev_stage_vue_map.num_slots - first_slot;
1742 }
1743 } else {
1744 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1745 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1746 /* Point size is packed into the header, not as a general attribute */
1747 if (i == VARYING_SLOT_PSIZ)
1748 continue;
1749
1750 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1751 /* The back color slot is skipped when the front color is
1752 * also written to. In addition, some slots can be
1753 * written in the vertex shader and not read in the
1754 * fragment shader. So the register number must always be
1755 * incremented, mapped or not.
1756 */
1757 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1758 prog_data->urb_setup[i] = urb_next;
1759 urb_next++;
1760 }
1761 }
1762
1763 /*
1764 * It's an FS-only attribute, and the SF thread did the interpolation for
1765 * it, so count it here too.
1766 *
1767 * See compile_sf_prog() for more info.
1768 */
1769 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1770 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1771 }
1772
1773 prog_data->num_varying_inputs = urb_next;
1774 }
1775
1776 void
1777 fs_visitor::assign_urb_setup()
1778 {
1779 assert(stage == MESA_SHADER_FRAGMENT);
1780 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1781
1782 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1783
1784 /* Offset all the urb_setup[] index by the actual position of the
1785 * setup regs, now that the location of the constants has been chosen.
1786 */
1787 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1788 if (inst->opcode == FS_OPCODE_LINTERP) {
1789 assert(inst->src[2].file == HW_REG);
1790 inst->src[2].fixed_hw_reg.nr += urb_start;
1791 }
1792
1793 if (inst->opcode == FS_OPCODE_CINTERP) {
1794 assert(inst->src[0].file == HW_REG);
1795 inst->src[0].fixed_hw_reg.nr += urb_start;
1796 }
1797 }
1798
1799 /* Each attribute is 4 setup channels, each of which is half a reg. */
1800 this->first_non_payload_grf =
1801 urb_start + prog_data->num_varying_inputs * 2;
1802 }
1803
1804 /**
1805 * Split large virtual GRFs into separate components if we can.
1806 *
1807 * This is mostly duplicated with what brw_fs_vector_splitting does,
1808 * but that's really conservative because it's afraid of doing
1809 * splitting that doesn't result in real progress after the rest of
1810 * the optimization phases, which would cause infinite looping in
1811 * optimization. We can do it once here, safely. This also has the
1812 * opportunity to split interpolated values, or maybe even uniforms,
1813 * which we don't have at the IR level.
1814 *
1815 * We want to split, because virtual GRFs are what we register
1816 * allocate and spill (due to contiguousness requirements for some
1817 * instructions), and they're what we naturally generate in the
1818 * codegen process, but most virtual GRFs don't actually need to be
1819 * contiguous sets of GRFs. If we split, we'll end up with reduced
1820 * live intervals and better dead code elimination and coalescing.
1821 */
1822 void
1823 fs_visitor::split_virtual_grfs()
1824 {
1825 int num_vars = this->virtual_grf_count;
1826
1827 /* Count the total number of registers */
1828 int reg_count = 0;
1829 int vgrf_to_reg[num_vars];
1830 for (int i = 0; i < num_vars; i++) {
1831 vgrf_to_reg[i] = reg_count;
1832 reg_count += virtual_grf_sizes[i];
1833 }
1834
1835 /* An array of "split points". For each register slot, this indicates
1836 * if this slot can be separated from the previous slot. Every time an
1837 * instruction uses multiple elements of a register (as a source or
1838 * destination), we mark the used slots as inseparable. Then we go
1839 * through and split the registers into the smallest pieces we can.
1840 */
1841 bool split_points[reg_count];
1842 memset(split_points, 0, sizeof(split_points));
1843
1844 /* Mark all used registers as fully splittable */
1845 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1846 if (inst->dst.file == GRF) {
1847 int reg = vgrf_to_reg[inst->dst.reg];
1848 for (int j = 1; j < this->virtual_grf_sizes[inst->dst.reg]; j++)
1849 split_points[reg + j] = true;
1850 }
1851
1852 for (int i = 0; i < inst->sources; i++) {
1853 if (inst->src[i].file == GRF) {
1854 int reg = vgrf_to_reg[inst->src[i].reg];
1855 for (int j = 1; j < this->virtual_grf_sizes[inst->src[i].reg]; j++)
1856 split_points[reg + j] = true;
1857 }
1858 }
1859 }
1860
1861 if (brw->has_pln &&
1862 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1863 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1864 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1865 * Gen6, that was the only supported interpolation mode, and since Gen6,
1866 * delta_x and delta_y are in fixed hardware registers.
1867 */
1868 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1869 split_points[vgrf_to_reg[vgrf] + 1] = false;
1870 }
1871
1872 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1873 if (inst->dst.file == GRF) {
1874 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1875 for (int j = 1; j < inst->regs_written; j++)
1876 split_points[reg + j] = false;
1877 }
1878 for (int i = 0; i < inst->sources; i++) {
1879 if (inst->src[i].file == GRF) {
1880 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1881 for (int j = 1; j < inst->regs_read(this, i); j++)
1882 split_points[reg + j] = false;
1883 }
1884 }
1885 }
1886
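/* Per-slot remap tables: new_virtual_grf[] records which VGRF each
 * original register slot lands in, and new_reg_offset[] its offset within
 * that VGRF.  The final piece of each original VGRF keeps its old number;
 * earlier pieces get freshly allocated VGRFs.
 */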
1887 int new_virtual_grf[reg_count];
1888 int new_reg_offset[reg_count];
1889
1890 int reg = 0;
1891 for (int i = 0; i < num_vars; i++) {
1892 /* The first one should always be 0 as a quick sanity check. */
1893 assert(split_points[reg] == false);
1894
1895 /* j = 0 case */
1896 new_reg_offset[reg] = 0;
1897 reg++;
1898 int offset = 1;
1899
1900 /* j > 0 case */
1901 for (int j = 1; j < virtual_grf_sizes[i]; j++) {
1902 /* If this is a split point, reset the offset to 0 and allocate a
1903 * new virtual GRF to hold the previous 'offset' registers.
1904 */
1905 if (split_points[reg]) {
1906 assert(offset <= MAX_VGRF_SIZE);
1907 int grf = virtual_grf_alloc(offset);
1908 for (int k = reg - offset; k < reg; k++)
1909 new_virtual_grf[k] = grf;
1910 offset = 0;
1911 }
1912 new_reg_offset[reg] = offset;
1913 offset++;
1914 reg++;
1915 }
1916
1917 /* The last one gets the original register number */
1918 assert(offset <= MAX_VGRF_SIZE);
1919 virtual_grf_sizes[i] = offset;
1920 for (int k = reg - offset; k < reg; k++)
1921 new_virtual_grf[k] = i;
1922 }
1923 assert(reg == reg_count);
1924
1925 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1926 if (inst->dst.file == GRF) {
1927 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1928 inst->dst.reg = new_virtual_grf[reg];
1929 inst->dst.reg_offset = new_reg_offset[reg];
1930 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
1931 }
1932 for (int i = 0; i < inst->sources; i++) {
1933 if (inst->src[i].file == GRF) {
1934 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1935 inst->src[i].reg = new_virtual_grf[reg];
1936 inst->src[i].reg_offset = new_reg_offset[reg];
1937 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
1938 }
1939 }
1940 }
1941 invalidate_live_intervals();
1942 }
1943
1944 /**
1945 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1946 *
1947 * During code generation, we create tons of temporary variables, many of
1948 * which get immediately killed and are never used again. Yet, in later
1949 * optimization and analysis passes, such as compute_live_intervals, we need
1950 * to loop over all the virtual GRFs. Compacting them can save a lot of
1951 * overhead.
1952 */
1953 bool
1954 fs_visitor::compact_virtual_grfs()
1955 {
1956 bool progress = false;
1957 int remap_table[this->virtual_grf_count];
1958 memset(remap_table, -1, sizeof(remap_table));
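/* remap_table[i] stays -1 for virtual GRFs that are never referenced;
 * for live ones it will end up holding the new, densely packed index.
 */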
1959
1960 /* Mark which virtual GRFs are used. */
1961 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1962 if (inst->dst.file == GRF)
1963 remap_table[inst->dst.reg] = 0;
1964
1965 for (int i = 0; i < inst->sources; i++) {
1966 if (inst->src[i].file == GRF)
1967 remap_table[inst->src[i].reg] = 0;
1968 }
1969 }
1970
1971 /* Compact the GRF arrays. */
1972 int new_index = 0;
1973 for (int i = 0; i < this->virtual_grf_count; i++) {
1974 if (remap_table[i] == -1) {
1975 /* We just found an unused register. This means that we are
1976 * actually going to compact something.
1977 */
1978 progress = true;
1979 } else {
1980 remap_table[i] = new_index;
1981 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1982 invalidate_live_intervals();
1983 ++new_index;
1984 }
1985 }
1986
1987 this->virtual_grf_count = new_index;
1988
1989 /* Patch all the instructions to use the newly renumbered registers */
1990 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1991 if (inst->dst.file == GRF)
1992 inst->dst.reg = remap_table[inst->dst.reg];
1993
1994 for (int i = 0; i < inst->sources; i++) {
1995 if (inst->src[i].file == GRF)
1996 inst->src[i].reg = remap_table[inst->src[i].reg];
1997 }
1998 }
1999
2000 /* Patch all the references to delta_x/delta_y, since they're used in
2001 * register allocation. If they're unused, switch them to BAD_FILE so
2002 * we don't think some random VGRF is delta_x/delta_y.
2003 */
2004 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2005 if (delta_x[i].file == GRF) {
2006 if (remap_table[delta_x[i].reg] != -1) {
2007 delta_x[i].reg = remap_table[delta_x[i].reg];
2008 } else {
2009 delta_x[i].file = BAD_FILE;
2010 }
2011 }
2012 }
2013 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2014 if (delta_y[i].file == GRF) {
2015 if (remap_table[delta_y[i].reg] != -1) {
2016 delta_y[i].reg = remap_table[delta_y[i].reg];
2017 } else {
2018 delta_y[i].file = BAD_FILE;
2019 }
2020 }
2021 }
2022
2023 return progress;
2024 }
2025
2026 /*
2027 * Implements array access of uniforms by inserting a
2028 * PULL_CONSTANT_LOAD instruction.
2029 *
2030 * Unlike temporary GRF array access (where we don't support it due to
2031 * the difficulty of doing relative addressing on instruction
2032 * destinations), we could potentially do array access of uniforms
2033 * that were loaded in GRF space as push constants. In real-world
2034 * usage we've seen, though, the arrays being used are always larger
2035 * than we could load as push constants, so just always move all
2036 * uniform array access out to a pull constant buffer.
2037 */
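/* Illustrative example (GLSL, not from any particular shader):
 *
 *    uniform vec4 colors[64];
 *    ...
 *    gl_FragColor = colors[index];
 *
 * The variably-indexed read reaches us as a UNIFORM source carrying a
 * reladdr, and it is that access which gets copied out to the pull
 * constant buffer below.
 */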
2038 void
2039 fs_visitor::move_uniform_array_access_to_pull_constants()
2040 {
2041 if (dispatch_width != 8)
2042 return;
2043
2044 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2045 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2046
2047 /* Walk through and find array access of uniforms. Put a copy of that
2048 * uniform in the pull constant buffer.
2049 *
2050 * Note that we don't move constant-indexed accesses to arrays. No
2051 * testing has been done of the performance impact of this choice.
2052 */
2053 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2054 for (int i = 0 ; i < inst->sources; i++) {
2055 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2056 continue;
2057
2058 int uniform = inst->src[i].reg;
2059
2060 /* If this array isn't already present in the pull constant buffer,
2061 * add it.
2062 */
2063 if (pull_constant_loc[uniform] == -1) {
2064 const gl_constant_value **values = &stage_prog_data->param[uniform];
2065
2066 assert(param_size[uniform]);
2067
2068 for (int j = 0; j < param_size[uniform]; j++) {
2069 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2070
2071 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2072 values[j];
2073 }
2074 }
2075 }
2076 }
2077 }
2078
2079 /**
2080 * Assign UNIFORM file registers to either push constants or pull constants.
2081 *
2082 * We allow a fragment shader to have more than the specified
2083 * minimum-maximum number of fragment shader uniform components (64). If
2084 * there are too many of these, they'd fill up all of register space.
2085 * So, this will push some of them out to the pull constant buffer and
2086 * update the program to load them.
2087 */
2088 void
2089 fs_visitor::assign_constant_locations()
2090 {
2091 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2092 if (dispatch_width != 8)
2093 return;
2094
2095 /* Find which UNIFORM registers are still in use. */
2096 bool is_live[uniforms];
2097 for (unsigned int i = 0; i < uniforms; i++) {
2098 is_live[i] = false;
2099 }
2100
2101 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2102 for (int i = 0; i < inst->sources; i++) {
2103 if (inst->src[i].file != UNIFORM)
2104 continue;
2105
2106 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2107 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2108 is_live[constant_nr] = true;
2109 }
2110 }
2111
2112 /* Only allow 16 registers (128 uniform components) as push constants.
2113 *
2114 * Just demote the end of the list. We could probably do better
2115 * here, demoting things that are rarely used in the program first.
2116 *
2117 * If changing this value, note the limitation about total_regs in
2118 * brw_curbe.c.
2119 */
2120 unsigned int max_push_components = 16 * 8;
2121 unsigned int num_push_constants = 0;
2122
2123 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2124
2125 for (unsigned int i = 0; i < uniforms; i++) {
2126 if (!is_live[i] || pull_constant_loc[i] != -1) {
2127 /* This UNIFORM register is either dead, or has already been demoted
2128 * to a pull const. Mark it as no longer living in the param[] array.
2129 */
2130 push_constant_loc[i] = -1;
2131 continue;
2132 }
2133
2134 if (num_push_constants < max_push_components) {
2135 /* Retain as a push constant. Record the location in the params[]
2136 * array.
2137 */
2138 push_constant_loc[i] = num_push_constants++;
2139 } else {
2140 /* Demote to a pull constant. */
2141 push_constant_loc[i] = -1;
2142
2143 int pull_index = stage_prog_data->nr_pull_params++;
2144 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2145 pull_constant_loc[i] = pull_index;
2146 }
2147 }
2148
2149 stage_prog_data->nr_params = num_push_constants;
2150
2151 /* Up until now, the param[] array has been indexed by reg + reg_offset
2152 * of UNIFORM registers. Condense it to only contain the uniforms we
2153 * chose to upload as push constants.
2154 */
2155 for (unsigned int i = 0; i < uniforms; i++) {
2156 int remapped = push_constant_loc[i];
2157
2158 if (remapped == -1)
2159 continue;
2160
2161 assert(remapped <= (int)i);
2162 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2163 }
2164 }
2165
2166 /**
2167 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2168 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2169 */
2170 void
2171 fs_visitor::demote_pull_constants()
2172 {
2173 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2174 for (int i = 0; i < inst->sources; i++) {
2175 if (inst->src[i].file != UNIFORM)
2176 continue;
2177
2178 int pull_index = pull_constant_loc[inst->src[i].reg +
2179 inst->src[i].reg_offset];
2180 if (pull_index == -1)
2181 continue;
2182
2183 /* Set up the annotation tracking for newly generated instructions. */
2184 base_ir = inst->ir;
2185 current_annotation = inst->annotation;
2186
2187 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2188 fs_reg dst = fs_reg(this, glsl_type::float_type);
2189
2190 /* Generate a pull load into dst. */
2191 if (inst->src[i].reladdr) {
2192 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2193 surf_index,
2194 *inst->src[i].reladdr,
2195 pull_index);
2196 inst->insert_before(block, &list);
2197 inst->src[i].reladdr = NULL;
2198 } else {
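/* The SIMD8 uniform pull load fetches an aligned vec4: round the byte
 * offset (4 bytes per constant) down to a 16-byte boundary, and use
 * set_smear() to select which of the four dwords within it this source
 * actually reads.
 */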
2199 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2200 fs_inst *pull =
2201 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2202 dst, surf_index, offset);
2203 inst->insert_before(block, pull);
2204 inst->src[i].set_smear(pull_index & 3);
2205 }
2206
2207 /* Rewrite the instruction to use the temporary VGRF. */
2208 inst->src[i].file = GRF;
2209 inst->src[i].reg = dst.reg;
2210 inst->src[i].reg_offset = 0;
2211 inst->src[i].width = dispatch_width;
2212 }
2213 }
2214 invalidate_live_intervals();
2215 }
2216
2217 bool
2218 fs_visitor::opt_algebraic()
2219 {
2220 bool progress = false;
2221
2222 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2223 switch (inst->opcode) {
2224 case BRW_OPCODE_MUL:
2225 if (inst->src[1].file != IMM)
2226 continue;
2227
2228 /* a * 1.0 = a */
2229 if (inst->src[1].is_one()) {
2230 inst->opcode = BRW_OPCODE_MOV;
2231 inst->src[1] = reg_undef;
2232 progress = true;
2233 break;
2234 }
2235
2236 /* a * 0.0 = 0.0 */
2237 if (inst->src[1].is_zero()) {
2238 inst->opcode = BRW_OPCODE_MOV;
2239 inst->src[0] = inst->src[1];
2240 inst->src[1] = reg_undef;
2241 progress = true;
2242 break;
2243 }
2244
2245 break;
2246 case BRW_OPCODE_ADD:
2247 if (inst->src[1].file != IMM)
2248 continue;
2249
2250 /* a + 0.0 = a */
2251 if (inst->src[1].is_zero()) {
2252 inst->opcode = BRW_OPCODE_MOV;
2253 inst->src[1] = reg_undef;
2254 progress = true;
2255 break;
2256 }
2257 break;
2258 case BRW_OPCODE_OR:
2259 if (inst->src[0].equals(inst->src[1])) {
2260 inst->opcode = BRW_OPCODE_MOV;
2261 inst->src[1] = reg_undef;
2262 progress = true;
2263 break;
2264 }
2265 break;
2266 case BRW_OPCODE_LRP:
2267 if (inst->src[1].equals(inst->src[2])) {
2268 inst->opcode = BRW_OPCODE_MOV;
2269 inst->src[0] = inst->src[1];
2270 inst->src[1] = reg_undef;
2271 inst->src[2] = reg_undef;
2272 progress = true;
2273 break;
2274 }
2275 break;
2276 case BRW_OPCODE_SEL:
2277 if (inst->src[0].equals(inst->src[1])) {
2278 inst->opcode = BRW_OPCODE_MOV;
2279 inst->src[1] = reg_undef;
2280 inst->predicate = BRW_PREDICATE_NONE;
2281 inst->predicate_inverse = false;
2282 progress = true;
2283 } else if (inst->saturate && inst->src[1].file == IMM) {
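/* With saturate set, a sel.l/sel.le against an immediate >= 1.0 (or
 * sel.g/sel.ge against one <= 0.0) can never affect the result, since
 * the other operand gets clamped to [0, 1] anyway.  Reduce the SEL to a
 * saturating MOV of src0.
 */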
2284 switch (inst->conditional_mod) {
2285 case BRW_CONDITIONAL_LE:
2286 case BRW_CONDITIONAL_L:
2287 switch (inst->src[1].type) {
2288 case BRW_REGISTER_TYPE_F:
2289 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2290 inst->opcode = BRW_OPCODE_MOV;
2291 inst->src[1] = reg_undef;
2292 progress = true;
2293 }
2294 break;
2295 default:
2296 break;
2297 }
2298 break;
2299 case BRW_CONDITIONAL_GE:
2300 case BRW_CONDITIONAL_G:
2301 switch (inst->src[1].type) {
2302 case BRW_REGISTER_TYPE_F:
2303 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2304 inst->opcode = BRW_OPCODE_MOV;
2305 inst->src[1] = reg_undef;
2306 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2307 progress = true;
2308 }
2309 break;
2310 default:
2311 break;
2312 }
2313 default:
2314 break;
2315 }
2316 }
2317 break;
2318 case SHADER_OPCODE_RCP: {
2319 fs_inst *prev = (fs_inst *)inst->prev;
2320 if (prev->opcode == SHADER_OPCODE_SQRT) {
2321 if (inst->src[0].equals(prev->dst)) {
2322 inst->opcode = SHADER_OPCODE_RSQ;
2323 inst->src[0] = prev->src[0];
2324 progress = true;
2325 }
2326 }
2327 break;
2328 }
2329 default:
2330 break;
2331 }
2332 }
2333
2334 return progress;
2335 }
2336
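/**
 * Give values that completely redefine a virtual GRF a fresh register number.
 *
 * Outside of control flow, when an instruction fully overwrites a VGRF that
 * has already been assigned once, the new value is moved to a newly
 * allocated VGRF.  This breaks false dependencies between unrelated values
 * that happen to reuse the same register, shortening live ranges for the
 * register allocator and later coalescing.
 */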
2337 bool
2338 fs_visitor::opt_register_renaming()
2339 {
2340 bool progress = false;
2341 int depth = 0;
2342
2343 int remap[virtual_grf_count];
2344 memset(remap, -1, sizeof(int) * virtual_grf_count);
2345
2346 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2347 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2348 depth++;
2349 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2350 inst->opcode == BRW_OPCODE_WHILE) {
2351 depth--;
2352 }
2353
2354 /* Rewrite instruction sources. */
2355 for (int i = 0; i < inst->sources; i++) {
2356 if (inst->src[i].file == GRF &&
2357 remap[inst->src[i].reg] != -1 &&
2358 remap[inst->src[i].reg] != inst->src[i].reg) {
2359 inst->src[i].reg = remap[inst->src[i].reg];
2360 progress = true;
2361 }
2362 }
2363
2364 const int dst = inst->dst.reg;
2365
2366 if (depth == 0 &&
2367 inst->dst.file == GRF &&
2368 virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 &&
2369 !inst->is_partial_write()) {
2370 if (remap[dst] == -1) {
2371 remap[dst] = dst;
2372 } else {
2373 remap[dst] = virtual_grf_alloc(inst->dst.width / 8);
2374 inst->dst.reg = remap[dst];
2375 progress = true;
2376 }
2377 } else if (inst->dst.file == GRF &&
2378 remap[dst] != -1 &&
2379 remap[dst] != dst) {
2380 inst->dst.reg = remap[dst];
2381 progress = true;
2382 }
2383 }
2384
2385 if (progress) {
2386 invalidate_live_intervals();
2387
2388 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2389 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2390 delta_x[i].reg = remap[delta_x[i].reg];
2391 }
2392 }
2393 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2394 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2395 delta_y[i].reg = remap[delta_y[i].reg];
2396 }
2397 }
2398 }
2399
2400 return progress;
2401 }
2402
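/**
 * Fold MOVs from a GRF into an MRF back into the instruction that produced
 * the GRF value, so the producer writes the MRF directly.
 *
 * Only relevant on Gen4-6; Gen7+ has no MRFs.
 */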
2403 bool
2404 fs_visitor::compute_to_mrf()
2405 {
2406 bool progress = false;
2407 int next_ip = 0;
2408
2409 /* No MRFs on Gen >= 7. */
2410 if (brw->gen >= 7)
2411 return false;
2412
2413 calculate_live_intervals();
2414
2415 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2416 int ip = next_ip;
2417 next_ip++;
2418
2419 if (inst->opcode != BRW_OPCODE_MOV ||
2420 inst->is_partial_write() ||
2421 inst->dst.file != MRF || inst->src[0].file != GRF ||
2422 inst->dst.type != inst->src[0].type ||
2423 inst->src[0].abs || inst->src[0].negate ||
2424 !inst->src[0].is_contiguous() ||
2425 inst->src[0].subreg_offset)
2426 continue;
2427
2428 /* Work out which hardware MRF registers are written by this
2429 * instruction.
2430 */
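/* A COMPR4 destination writes MRF m and m + 4 (one GRF's worth each);
 * an ordinary SIMD16 write covers two consecutive MRFs; a SIMD8 write
 * covers a single MRF.
 */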
2431 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2432 int mrf_high;
2433 if (inst->dst.reg & BRW_MRF_COMPR4) {
2434 mrf_high = mrf_low + 4;
2435 } else if (inst->exec_size == 16) {
2436 mrf_high = mrf_low + 1;
2437 } else {
2438 mrf_high = mrf_low;
2439 }
2440
2441 /* Can't compute-to-MRF this GRF if someone else was going to
2442 * read it later.
2443 */
2444 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2445 continue;
2446
2447 /* Found a move of a GRF to a MRF. Let's see if we can go
2448 * rewrite the thing that made this GRF to write into the MRF.
2449 */
2450 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2451 if (scan_inst->dst.file == GRF &&
2452 scan_inst->dst.reg == inst->src[0].reg) {
2453 /* Found the last instruction to write the reg we want to turn
2454 * into a compute-to-MRF.
2455 */
2456
2457 /* If this one instruction didn't populate all the
2458 * channels, bail. We might be able to rewrite everything
2459 * that writes that reg, but it would require smarter
2460 * tracking to delay the rewriting until complete success.
2461 */
2462 if (scan_inst->is_partial_write())
2463 break;
2464
2465 /* Things returning more than one register would need us to
2466 * understand coalescing out more than one MOV at a time.
2467 */
2468 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2469 break;
2470
2471 /* SEND instructions can't have MRF as a destination. */
2472 if (scan_inst->mlen)
2473 break;
2474
2475 if (brw->gen == 6) {
2476 /* gen6 math instructions must have the destination be
2477 * GRF, so no compute-to-MRF for them.
2478 */
2479 if (scan_inst->is_math()) {
2480 break;
2481 }
2482 }
2483
2484 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2485 /* Found the creator of our MRF's source value. */
2486 scan_inst->dst.file = MRF;
2487 scan_inst->dst.reg = inst->dst.reg;
2488 scan_inst->saturate |= inst->saturate;
2489 inst->remove(block);
2490 progress = true;
2491 }
2492 break;
2493 }
2494
2495 /* We don't handle control flow here. Most computation of
2496 * values that end up in MRFs are shortly before the MRF
2497 * write anyway.
2498 */
2499 if (block->start() == scan_inst)
2500 break;
2501
2502 /* You can't read from an MRF, so if someone else reads our
2503 * MRF's source GRF that we wanted to rewrite, that stops us.
2504 */
2505 bool interfered = false;
2506 for (int i = 0; i < scan_inst->sources; i++) {
2507 if (scan_inst->src[i].file == GRF &&
2508 scan_inst->src[i].reg == inst->src[0].reg &&
2509 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2510 interfered = true;
2511 }
2512 }
2513 if (interfered)
2514 break;
2515
2516 if (scan_inst->dst.file == MRF) {
2517 /* If somebody else writes our MRF here, we can't
2518 * compute-to-MRF before that.
2519 */
2520 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2521 int scan_mrf_high;
2522
2523 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2524 scan_mrf_high = scan_mrf_low + 4;
2525 } else if (scan_inst->exec_size == 16) {
2526 scan_mrf_high = scan_mrf_low + 1;
2527 } else {
2528 scan_mrf_high = scan_mrf_low;
2529 }
2530
2531 if (mrf_low == scan_mrf_low ||
2532 mrf_low == scan_mrf_high ||
2533 mrf_high == scan_mrf_low ||
2534 mrf_high == scan_mrf_high) {
2535 break;
2536 }
2537 }
2538
2539 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2540 /* Found a SEND instruction, which means that there are
2541 * live values in MRFs from base_mrf to base_mrf +
2542 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2543 * above it.
2544 */
2545 if (mrf_low >= scan_inst->base_mrf &&
2546 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2547 break;
2548 }
2549 if (mrf_high >= scan_inst->base_mrf &&
2550 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2551 break;
2552 }
2553 }
2554 }
2555 }
2556
2557 if (progress)
2558 invalidate_live_intervals();
2559
2560 return progress;
2561 }
2562
2563 /**
2564 * Emit a minimal "replicated data" clear shader: a single uniform color is
2565 * broadcast to every enabled render target using FS_OPCODE_REP_FB_WRITE.
2566 */
2567 void
2568 fs_visitor::emit_repclear_shader()
2569 {
2570 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2571 int base_mrf = 1;
2572 int color_mrf = base_mrf + 2;
2573
2574 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2575 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2576 mov->force_writemask_all = true;
2577
2578 fs_inst *write;
2579 if (key->nr_color_regions == 1) {
2580 write = emit(FS_OPCODE_REP_FB_WRITE);
2581 write->saturate = key->clamp_fragment_color;
2582 write->base_mrf = color_mrf;
2583 write->target = 0;
2584 write->header_present = false;
2585 write->mlen = 1;
2586 } else {
2587 assume(key->nr_color_regions > 0);
2588 for (int i = 0; i < key->nr_color_regions; ++i) {
2589 write = emit(FS_OPCODE_REP_FB_WRITE);
2590 write->saturate = key->clamp_fragment_color;
2591 write->base_mrf = base_mrf;
2592 write->target = i;
2593 write->header_present = true;
2594 write->mlen = 3;
2595 }
2596 }
2597 write->eot = true;
2598
2599 calculate_cfg();
2600
2601 assign_constant_locations();
2602 assign_curb_setup();
2603
2604 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2605 assert(mov->src[0].file == HW_REG);
2606 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2607 }
2608
2609 /**
2610 * Walks through basic blocks, looking for repeated MRF writes and
2611 * removing the later ones.
2612 */
2613 bool
2614 fs_visitor::remove_duplicate_mrf_writes()
2615 {
2616 fs_inst *last_mrf_move[16];
2617 bool progress = false;
2618
2619 /* Need to update the MRF tracking for compressed instructions. */
2620 if (dispatch_width == 16)
2621 return false;
2622
2623 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2624
2625 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2626 if (inst->is_control_flow()) {
2627 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2628 }
2629
2630 if (inst->opcode == BRW_OPCODE_MOV &&
2631 inst->dst.file == MRF) {
2632 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2633 if (prev_inst && inst->equals(prev_inst)) {
2634 inst->remove(block);
2635 progress = true;
2636 continue;
2637 }
2638 }
2639
2640 /* Clear out the last-write records for MRFs that were overwritten. */
2641 if (inst->dst.file == MRF) {
2642 last_mrf_move[inst->dst.reg] = NULL;
2643 }
2644
2645 if (inst->mlen > 0 && inst->base_mrf != -1) {
2646 /* Found a SEND instruction, which will include two or fewer
2647 * implied MRF writes. We could do better here.
2648 */
2649 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2650 last_mrf_move[inst->base_mrf + i] = NULL;
2651 }
2652 }
2653
2654 /* Clear out any MRF move records whose sources got overwritten. */
2655 if (inst->dst.file == GRF) {
2656 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2657 if (last_mrf_move[i] &&
2658 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2659 last_mrf_move[i] = NULL;
2660 }
2661 }
2662 }
2663
2664 if (inst->opcode == BRW_OPCODE_MOV &&
2665 inst->dst.file == MRF &&
2666 inst->src[0].file == GRF &&
2667 !inst->is_partial_write()) {
2668 last_mrf_move[inst->dst.reg] = inst;
2669 }
2670 }
2671
2672 if (progress)
2673 invalidate_live_intervals();
2674
2675 return progress;
2676 }
2677
2678 static void
2679 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2680 int first_grf, int grf_len)
2681 {
2682 /* Clear the flag for registers that actually got read (as expected). */
2683 for (int i = 0; i < inst->sources; i++) {
2684 int grf;
2685 if (inst->src[i].file == GRF) {
2686 grf = inst->src[i].reg;
2687 } else if (inst->src[i].file == HW_REG &&
2688 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2689 grf = inst->src[i].fixed_hw_reg.nr;
2690 } else {
2691 continue;
2692 }
2693
2694 if (grf >= first_grf &&
2695 grf < first_grf + grf_len) {
2696 deps[grf - first_grf] = false;
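/* A SIMD16 access reads two consecutive GRFs, so clear the flag for the
 * second register as well.
 */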
2697 if (inst->exec_size == 16)
2698 deps[grf - first_grf + 1] = false;
2699 }
2700 }
2701 }
2702
2703 /**
2704 * Implements this workaround for the original 965:
2705 *
2706 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2707 * check for post destination dependencies on this instruction, software
2708 * must ensure that there is no destination hazard for the case of ‘write
2709 * followed by a posted write’ shown in the following example.
2710 *
2711 * 1. mov r3 0
2712 * 2. send r3.xy <rest of send instruction>
2713 * 3. mov r2 r3
2714 *
2715 * Due to no post-destination dependency check on the ‘send’, the above
2716 * code sequence could have two instructions (1 and 2) in flight at the
2717 * same time that both consider ‘r3’ as the target of their final writes.
2718 */
2719 void
2720 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2721 fs_inst *inst)
2722 {
2723 int write_len = inst->regs_written;
2724 int first_write_grf = inst->dst.reg;
2725 bool needs_dep[BRW_MAX_MRF];
2726 assert(write_len < (int)sizeof(needs_dep) - 1);
2727
2728 memset(needs_dep, false, sizeof(needs_dep));
2729 memset(needs_dep, true, write_len);
2730
2731 clear_deps_for_inst_src(inst, dispatch_width,
2732 needs_dep, first_write_grf, write_len);
2733
2734 /* Walk backwards looking for writes to registers we're writing which
2735 * aren't read since being written. If we hit the start of the program,
2736 * we assume that there are no outstanding dependencies on entry to the
2737 * program.
2738 */
2739 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2740 /* If we hit control flow, assume that there *are* outstanding
2741 * dependencies, and force their cleanup before our instruction.
2742 */
2743 if (block->start() == scan_inst) {
2744 for (int i = 0; i < write_len; i++) {
2745 if (needs_dep[i]) {
2746 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2747 }
2748 }
2749 return;
2750 }
2751
2752 /* We insert our reads as late as possible on the assumption that any
2753 * instruction but a MOV that might have left us an outstanding
2754 * dependency has more latency than a MOV.
2755 */
2756 if (scan_inst->dst.file == GRF) {
2757 for (int i = 0; i < scan_inst->regs_written; i++) {
2758 int reg = scan_inst->dst.reg + i;
2759
2760 if (reg >= first_write_grf &&
2761 reg < first_write_grf + write_len &&
2762 needs_dep[reg - first_write_grf]) {
2763 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2764 needs_dep[reg - first_write_grf] = false;
2765 if (scan_inst->exec_size == 16)
2766 needs_dep[reg - first_write_grf + 1] = false;
2767 }
2768 }
2769 }
2770
2771 /* Clear the flag for registers that actually got read (as expected). */
2772 clear_deps_for_inst_src(scan_inst, dispatch_width,
2773 needs_dep, first_write_grf, write_len);
2774
2775 /* Continue the loop only if we haven't resolved all the dependencies */
2776 int i;
2777 for (i = 0; i < write_len; i++) {
2778 if (needs_dep[i])
2779 break;
2780 }
2781 if (i == write_len)
2782 return;
2783 }
2784 }
2785
2786 /**
2787 * Implements this workaround for the original 965:
2788 *
2789 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2790 * used as a destination register until after it has been sourced by an
2791 * instruction with a different destination register.
2792 */
2793 void
2794 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2795 {
2796 int write_len = inst->regs_written;
2797 int first_write_grf = inst->dst.reg;
2798 bool needs_dep[BRW_MAX_MRF];
2799 assert(write_len < (int)sizeof(needs_dep) - 1);
2800
2801 memset(needs_dep, false, sizeof(needs_dep));
2802 memset(needs_dep, true, write_len);
2803 /* Walk forwards looking for writes to registers we're writing which aren't
2804 * read before being written.
2805 */
2806 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2807 /* If we hit control flow, force resolve all remaining dependencies. */
2808 if (block->end() == scan_inst) {
2809 for (int i = 0; i < write_len; i++) {
2810 if (needs_dep[i])
2811 scan_inst->insert_before(block,
2812 DEP_RESOLVE_MOV(first_write_grf + i));
2813 }
2814 return;
2815 }
2816
2817 /* Clear the flag for registers that actually got read (as expected). */
2818 clear_deps_for_inst_src(scan_inst, dispatch_width,
2819 needs_dep, first_write_grf, write_len);
2820
2821 /* We insert our reads as late as possible since they're reading the
2822 * result of a SEND, which has massive latency.
2823 */
2824 if (scan_inst->dst.file == GRF &&
2825 scan_inst->dst.reg >= first_write_grf &&
2826 scan_inst->dst.reg < first_write_grf + write_len &&
2827 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2828 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2829 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2830 }
2831
2832 /* Continue the loop only if we haven't resolved all the dependencies */
2833 int i;
2834 for (i = 0; i < write_len; i++) {
2835 if (needs_dep[i])
2836 break;
2837 }
2838 if (i == write_len)
2839 return;
2840 }
2841
2842 /* If we hit the end of the program, resolve all remaining dependencies out
2843 * of paranoia.
2844 */
2845 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2846 assert(last_inst->eot);
2847 for (int i = 0; i < write_len; i++) {
2848 if (needs_dep[i])
2849 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2850 }
2851 }
2852
2853 void
2854 fs_visitor::insert_gen4_send_dependency_workarounds()
2855 {
2856 if (brw->gen != 4 || brw->is_g4x)
2857 return;
2858
2859 bool progress = false;
2860
2861 /* Note that we're done with register allocation, so GRF fs_regs always
2862 * have a .reg_offset of 0.
2863 */
2864
2865 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2866 if (inst->mlen != 0 && inst->dst.file == GRF) {
2867 insert_gen4_pre_send_dependency_workarounds(block, inst);
2868 insert_gen4_post_send_dependency_workarounds(block, inst);
2869 progress = true;
2870 }
2871 }
2872
2873 if (progress)
2874 invalidate_live_intervals();
2875 }
2876
2877 /**
2878 * Turns the generic expression-style uniform pull constant load instruction
2879 * into a hardware-specific series of instructions for loading a pull
2880 * constant.
2881 *
2882 * The expression style allows the CSE pass before this to optimize out
2883 * repeated loads from the same offset, and gives the pre-register-allocation
2884 * scheduling full flexibility, while the conversion to native instructions
2885 * allows the post-register-allocation scheduler the best information
2886 * possible.
2887 *
2888 * Note that execution masking for setting up pull constant loads is special:
2889 * the channels that need to be written are unrelated to the current execution
2890 * mask, since a later instruction will use one of the result channels as a
2891 * source operand for all 8 or 16 of its channels.
2892 */
2893 void
2894 fs_visitor::lower_uniform_pull_constant_loads()
2895 {
2896 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2897 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2898 continue;
2899
2900 if (brw->gen >= 7) {
2901 /* The offset arg before was a vec4-aligned byte offset. We need to
2902 * turn it into a dword offset.
2903 */
2904 fs_reg const_offset_reg = inst->src[1];
2905 assert(const_offset_reg.file == IMM &&
2906 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2907 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2908 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2909
2910 /* This is actually going to be a MOV, but since only the first dword
2911 * is accessed, we have a special opcode to do just that one. Note
2912 * that this needs to be an operation that will be considered a def
2913 * by live variable analysis, or register allocation will explode.
2914 */
2915 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2916 8, payload, const_offset_reg);
2917 setup->force_writemask_all = true;
2918
2919 setup->ir = inst->ir;
2920 setup->annotation = inst->annotation;
2921 inst->insert_before(block, setup);
2922
2923 /* Similarly, this will only populate the first 4 channels of the
2924 * result register (since we only use smear values from 0-3), but we
2925 * don't tell the optimizer.
2926 */
2927 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2928 inst->src[1] = payload;
2929
2930 invalidate_live_intervals();
2931 } else {
2932 /* Before register allocation, we didn't tell the scheduler about the
2933 * MRF we use. We know it's safe to use this MRF because nothing
2934 * else does except for register spill/unspill, which generates and
2935 * uses its MRF within a single IR instruction.
2936 */
2937 inst->base_mrf = 14;
2938 inst->mlen = 1;
2939 }
2940 }
2941 }
2942
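/**
 * Lower SHADER_OPCODE_LOAD_PAYLOAD into a sequence of MOVs that copy each
 * source into consecutive registers of the destination payload.
 *
 * Per-register metadata (force_writemask_all / force_sechalf) is tracked so
 * that every generated MOV inherits the execution controls of whatever wrote
 * its source.  When the destination is an MRF and the hardware has COMPR4,
 * two matching SIMD8 halves are fused into one compressed SIMD16 MOV.
 */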
2943 bool
2944 fs_visitor::lower_load_payload()
2945 {
2946 bool progress = false;
2947
2948 int vgrf_to_reg[virtual_grf_count];
2949 int reg_count = 16; /* Leave room for MRF */
2950 for (int i = 0; i < virtual_grf_count; ++i) {
2951 vgrf_to_reg[i] = reg_count;
2952 reg_count += virtual_grf_sizes[i];
2953 }
2954
2955 struct {
2956 bool written:1; /* Whether this register has ever been written */
2957 bool force_writemask_all:1;
2958 bool force_sechalf:1;
2959 } metadata[reg_count];
2960 memset(metadata, 0, sizeof(metadata));
2961
2962 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2963 int dst_reg;
2964 if (inst->dst.file == GRF) {
2965 dst_reg = vgrf_to_reg[inst->dst.reg];
2966 } else {
2967 /* MRF */
2968 dst_reg = inst->dst.reg;
2969 }
2970
2971 if (inst->dst.file == MRF || inst->dst.file == GRF) {
2972 bool force_sechalf = inst->force_sechalf;
2973 bool toggle_sechalf = inst->dst.width == 16 &&
2974 type_sz(inst->dst.type) == 4;
2975 for (int i = 0; i < inst->regs_written; ++i) {
2976 metadata[dst_reg + i].written = true;
2977 metadata[dst_reg + i].force_sechalf = force_sechalf;
2978 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
2979 force_sechalf = (toggle_sechalf != force_sechalf);
2980 }
2981 }
2982
2983 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
2984 assert(inst->dst.file == MRF || inst->dst.file == GRF);
2985 fs_reg dst = inst->dst;
2986
2987 for (int i = 0; i < inst->sources; i++) {
2988 dst.width = inst->src[i].effective_width;
2989 dst.type = inst->src[i].type;
2990
2991 if (inst->src[i].file == BAD_FILE) {
2992 /* Do nothing but otherwise increment as normal */
2993 } else if (dst.file == MRF &&
2994 dst.width == 8 &&
2995 brw->has_compr4 &&
2996 i + 4 < inst->sources &&
2997 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
2998 fs_reg compr4_dst = dst;
2999 compr4_dst.reg += BRW_MRF_COMPR4;
3000 compr4_dst.width = 16;
3001 fs_reg compr4_src = inst->src[i];
3002 compr4_src.width = 16;
3003 fs_inst *mov = MOV(compr4_dst, compr4_src);
3004 mov->force_writemask_all = true;
3005 inst->insert_before(block, mov);
3006 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3007 inst->src[i + 4].file = BAD_FILE;
3008 } else {
3009 fs_inst *mov = MOV(dst, inst->src[i]);
3010 if (inst->src[i].file == GRF) {
3011 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3012 inst->src[i].reg_offset;
3013 mov->force_sechalf = metadata[src_reg].force_sechalf;
3014 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3015 metadata[dst_reg] = metadata[src_reg];
3016 if (dst.width * type_sz(dst.type) > 32) {
3017 assert((!metadata[src_reg].written ||
3018 !metadata[src_reg].force_sechalf) &&
3019 (!metadata[src_reg + 1].written ||
3020 metadata[src_reg + 1].force_sechalf));
3021 metadata[dst_reg + 1] = metadata[src_reg + 1];
3022 }
3023 } else {
3024 metadata[dst_reg].force_writemask_all = false;
3025 metadata[dst_reg].force_sechalf = false;
3026 if (dst.width == 16) {
3027 metadata[dst_reg + 1].force_writemask_all = false;
3028 metadata[dst_reg + 1].force_sechalf = true;
3029 }
3030 }
3031 inst->insert_before(block, mov);
3032 }
3033
3034 dst = offset(dst, 1);
3035 }
3036
3037 inst->remove(block);
3038 progress = true;
3039 }
3040 }
3041
3042 if (progress)
3043 invalidate_live_intervals();
3044
3045 return progress;
3046 }
3047
3048 void
3049 fs_visitor::dump_instructions()
3050 {
3051 dump_instructions(NULL);
3052 }
3053
3054 void
3055 fs_visitor::dump_instructions(const char *name)
3056 {
3057 calculate_register_pressure();
3058 FILE *file = stderr;
3059 if (name && geteuid() != 0) {
3060 file = fopen(name, "w");
3061 if (!file)
3062 file = stderr;
3063 }
3064
3065 int ip = 0, max_pressure = 0;
3066 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3067 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3068 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3069 dump_instruction(inst, file);
3070 ++ip;
3071 }
3072 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3073
3074 if (file != stderr) {
3075 fclose(file);
3076 }
3077 }
3078
3079 void
3080 fs_visitor::dump_instruction(backend_instruction *be_inst)
3081 {
3082 dump_instruction(be_inst, stderr);
3083 }
3084
3085 void
3086 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3087 {
3088 fs_inst *inst = (fs_inst *)be_inst;
3089
3090 if (inst->predicate) {
3091 fprintf(file, "(%cf0.%d) ",
3092 inst->predicate_inverse ? '-' : '+',
3093 inst->flag_subreg);
3094 }
3095
3096 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3097 if (inst->saturate)
3098 fprintf(file, ".sat");
3099 if (inst->conditional_mod) {
3100 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3101 if (!inst->predicate &&
3102 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3103 inst->opcode != BRW_OPCODE_IF &&
3104 inst->opcode != BRW_OPCODE_WHILE))) {
3105 fprintf(file, ".f0.%d", inst->flag_subreg);
3106 }
3107 }
3108 fprintf(file, "(%d) ", inst->exec_size);
3109
3110
3111 switch (inst->dst.file) {
3112 case GRF:
3113 fprintf(file, "vgrf%d", inst->dst.reg);
3114 if (inst->dst.width != dispatch_width)
3115 fprintf(file, "@%d", inst->dst.width);
3116 if (virtual_grf_sizes[inst->dst.reg] != inst->dst.width / 8 ||
3117 inst->dst.subreg_offset)
3118 fprintf(file, "+%d.%d",
3119 inst->dst.reg_offset, inst->dst.subreg_offset);
3120 break;
3121 case MRF:
3122 fprintf(file, "m%d", inst->dst.reg);
3123 break;
3124 case BAD_FILE:
3125 fprintf(file, "(null)");
3126 break;
3127 case UNIFORM:
3128 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3129 break;
3130 case HW_REG:
3131 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3132 switch (inst->dst.fixed_hw_reg.nr) {
3133 case BRW_ARF_NULL:
3134 fprintf(file, "null");
3135 break;
3136 case BRW_ARF_ADDRESS:
3137 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3138 break;
3139 case BRW_ARF_ACCUMULATOR:
3140 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3141 break;
3142 case BRW_ARF_FLAG:
3143 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3144 inst->dst.fixed_hw_reg.subnr);
3145 break;
3146 default:
3147 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3148 inst->dst.fixed_hw_reg.subnr);
3149 break;
3150 }
3151 } else {
3152 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3153 }
3154 if (inst->dst.fixed_hw_reg.subnr)
3155 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3156 break;
3157 default:
3158 fprintf(file, "???");
3159 break;
3160 }
3161 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3162
3163 for (int i = 0; i < inst->sources; i++) {
3164 if (inst->src[i].negate)
3165 fprintf(file, "-");
3166 if (inst->src[i].abs)
3167 fprintf(file, "|");
3168 switch (inst->src[i].file) {
3169 case GRF:
3170 fprintf(file, "vgrf%d", inst->src[i].reg);
3171 if (inst->src[i].width != dispatch_width)
3172 fprintf(file, "@%d", inst->src[i].width);
3173 if (virtual_grf_sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3174 inst->src[i].subreg_offset)
3175 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3176 inst->src[i].subreg_offset);
3177 break;
3178 case MRF:
3179 fprintf(file, "***m%d***", inst->src[i].reg);
3180 break;
3181 case UNIFORM:
3182 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3183 if (inst->src[i].reladdr) {
3184 fprintf(file, "+reladdr");
3185 } else if (inst->src[i].subreg_offset) {
3186 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3187 inst->src[i].subreg_offset);
3188 }
3189 break;
3190 case BAD_FILE:
3191 fprintf(file, "(null)");
3192 break;
3193 case IMM:
3194 switch (inst->src[i].type) {
3195 case BRW_REGISTER_TYPE_F:
3196 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3197 break;
3198 case BRW_REGISTER_TYPE_D:
3199 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3200 break;
3201 case BRW_REGISTER_TYPE_UD:
3202 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3203 break;
3204 case BRW_REGISTER_TYPE_VF:
3205 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3206 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3207 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3208 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3209 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3210 break;
3211 default:
3212 fprintf(file, "???");
3213 break;
3214 }
3215 break;
3216 case HW_REG:
3217 if (inst->src[i].fixed_hw_reg.negate)
3218 fprintf(file, "-");
3219 if (inst->src[i].fixed_hw_reg.abs)
3220 fprintf(file, "|");
3221 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3222 switch (inst->src[i].fixed_hw_reg.nr) {
3223 case BRW_ARF_NULL:
3224 fprintf(file, "null");
3225 break;
3226 case BRW_ARF_ADDRESS:
3227 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3228 break;
3229 case BRW_ARF_ACCUMULATOR:
3230 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3231 break;
3232 case BRW_ARF_FLAG:
3233 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3234 inst->src[i].fixed_hw_reg.subnr);
3235 break;
3236 default:
3237 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3238 inst->src[i].fixed_hw_reg.subnr);
3239 break;
3240 }
3241 } else {
3242 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3243 }
3244 if (inst->src[i].fixed_hw_reg.subnr)
3245 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3246 if (inst->src[i].fixed_hw_reg.abs)
3247 fprintf(file, "|");
3248 break;
3249 default:
3250 fprintf(file, "???");
3251 break;
3252 }
3253 if (inst->src[i].abs)
3254 fprintf(file, "|");
3255
3256 if (inst->src[i].file != IMM) {
3257 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3258 }
3259
3260 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3261 fprintf(file, ", ");
3262 }
3263
3264 fprintf(file, " ");
3265
3266 if (dispatch_width == 16 && inst->exec_size == 8) {
3267 if (inst->force_sechalf)
3268 fprintf(file, "2ndhalf ");
3269 else
3270 fprintf(file, "1sthalf ");
3271 }
3272
3273 fprintf(file, "\n");
3274 }
3275
3276 /**
3277 * Possibly returns an instruction that set up @param reg.
3278 *
3279 * Sometimes we want to take the result of some expression/variable
3280 * dereference tree and rewrite the instruction generating the result
3281 * of the tree. When processing the tree, we know that the
3282 * instructions generated are all writing temporaries that are dead
3283 * outside of this tree. So, if we have some instructions that write
3284 * a temporary, we're free to point that temp write somewhere else.
3285 *
3286 * Note that this doesn't guarantee that the returned instruction wrote
3287 * only reg -- it might be the size=4 destination of a texture instruction.
3288 */
3289 fs_inst *
3290 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3291 fs_inst *end,
3292 const fs_reg &reg)
3293 {
3294 if (end == start ||
3295 end->is_partial_write() ||
3296 reg.reladdr ||
3297 !reg.equals(end->dst)) {
3298 return NULL;
3299 } else {
3300 return end;
3301 }
3302 }
3303
3304 void
3305 fs_visitor::setup_payload_gen6()
3306 {
3307 bool uses_depth =
3308 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3309 unsigned barycentric_interp_modes =
3310 (stage == MESA_SHADER_FRAGMENT) ?
3311 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3312
3313 assert(brw->gen >= 6);
3314
3315 /* R0-1: masks, pixel X/Y coordinates. */
3316 payload.num_regs = 2;
3317 /* R2: only for 32-pixel dispatch. */
3318
3319 /* R3-26: barycentric interpolation coordinates. These appear in the
3320 * same order that they appear in the brw_wm_barycentric_interp_mode
3321 * enum. Each set of coordinates occupies 2 registers if dispatch width
3322 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3323 * appear if they were enabled using the "Barycentric Interpolation
3324 * Mode" bits in WM_STATE.
3325 */
3326 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3327 if (barycentric_interp_modes & (1 << i)) {
3328 payload.barycentric_coord_reg[i] = payload.num_regs;
3329 payload.num_regs += 2;
3330 if (dispatch_width == 16) {
3331 payload.num_regs += 2;
3332 }
3333 }
3334 }
3335
3336 /* R27: interpolated depth if uses source depth */
3337 if (uses_depth) {
3338 payload.source_depth_reg = payload.num_regs;
3339 payload.num_regs++;
3340 if (dispatch_width == 16) {
3341 /* R28: interpolated depth if not SIMD8. */
3342 payload.num_regs++;
3343 }
3344 }
3345 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3346 if (uses_depth) {
3347 payload.source_w_reg = payload.num_regs;
3348 payload.num_regs++;
3349 if (dispatch_width == 16) {
3350 /* R30: interpolated W if not SIMD8. */
3351 payload.num_regs++;
3352 }
3353 }
3354
3355 if (stage == MESA_SHADER_FRAGMENT) {
3356 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3357 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3358 prog_data->uses_pos_offset = key->compute_pos_offset;
3359 /* R31: MSAA position offsets. */
3360 if (prog_data->uses_pos_offset) {
3361 payload.sample_pos_reg = payload.num_regs;
3362 payload.num_regs++;
3363 }
3364 }
3365
3366 /* R32: MSAA input coverage mask */
3367 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3368 assert(brw->gen >= 7);
3369 payload.sample_mask_in_reg = payload.num_regs;
3370 payload.num_regs++;
3371 if (dispatch_width == 16) {
3372 /* R33: input coverage mask if not SIMD8. */
3373 payload.num_regs++;
3374 }
3375 }
3376
3377 /* R34-: bary for 32-pixel. */
3378 /* R58-59: interp W for 32-pixel. */
3379
3380 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3381 source_depth_to_render_target = true;
3382 }
3383 }
3384
3385 void
3386 fs_visitor::assign_binding_table_offsets()
3387 {
3388 assert(stage == MESA_SHADER_FRAGMENT);
3389 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3390 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3391 uint32_t next_binding_table_offset = 0;
3392
3393 /* If there are no color regions, we still perform an FB write to a null
3394 * renderbuffer, which we place at surface index 0.
3395 */
3396 prog_data->binding_table.render_target_start = next_binding_table_offset;
3397 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3398
3399 assign_common_binding_table_offsets(next_binding_table_offset);
3400 }
3401
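/**
 * Compute regs_live_at_ip[]: for each instruction IP, the total size in GRFs
 * of all virtual GRFs whose live ranges cover that IP.  Used by
 * dump_instructions() to annotate the listing with register pressure.
 */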
3402 void
3403 fs_visitor::calculate_register_pressure()
3404 {
3405 invalidate_live_intervals();
3406 calculate_live_intervals();
3407
3408 unsigned num_instructions = 0;
3409 foreach_block(block, cfg)
3410 num_instructions += block->instructions.length();
3411
3412 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3413
3414 for (int reg = 0; reg < virtual_grf_count; reg++) {
3415 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3416 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3417 }
3418 }
3419
3420 /**
3421 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3422 *
3423 * The needs_unlit_centroid_workaround ends up producing one of these per
3424 * channel of centroid input, so it's good to clean them up.
3425 *
3426 * An assumption here is that nothing ever modifies the dispatched pixels
3427 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3428 * dictates that anyway.
3429 */
3430 void
3431 fs_visitor::opt_drop_redundant_mov_to_flags()
3432 {
3433 bool flag_mov_found[2] = {false};
3434
3435 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3436 if (inst->is_control_flow()) {
3437 memset(flag_mov_found, 0, sizeof(flag_mov_found));
3438 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3439 if (!flag_mov_found[inst->flag_subreg])
3440 flag_mov_found[inst->flag_subreg] = true;
3441 else
3442 inst->remove(block);
3443 } else if (inst->writes_flag()) {
3444 flag_mov_found[inst->flag_subreg] = false;
3445 }
3446 }
3447 }
3448
3449 void
3450 fs_visitor::optimize()
3451 {
3452 calculate_cfg();
3453
3454 split_virtual_grfs();
3455
3456 move_uniform_array_access_to_pull_constants();
3457 assign_constant_locations();
3458 demote_pull_constants();
3459
3460 opt_drop_redundant_mov_to_flags();
3461
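/* Helper for the optimization loop below: run one pass, and when
 * INTEL_DEBUG=optimizer is set, dump the instruction list to a per-pass
 * file whenever that pass made progress.
 */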
3462 #define OPT(pass, args...) do { \
3463 pass_num++; \
3464 bool this_progress = pass(args); \
3465 \
3466 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3467 char filename[64]; \
3468 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3469 dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3470 \
3471 backend_visitor::dump_instructions(filename); \
3472 } \
3473 \
3474 progress = progress || this_progress; \
3475 } while (false)
3476
3477 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3478 char filename[64];
3479 snprintf(filename, 64, "fs%d-%04d-00-start",
3480 dispatch_width, shader_prog ? shader_prog->Name : 0);
3481
3482 backend_visitor::dump_instructions(filename);
3483 }
3484
3485 bool progress;
3486 int iteration = 0;
3487 do {
3488 progress = false;
3489 iteration++;
3490 int pass_num = 0;
3491
3492 OPT(remove_duplicate_mrf_writes);
3493
3494 OPT(opt_algebraic);
3495 OPT(opt_cse);
3496 OPT(opt_copy_propagate);
3497 OPT(opt_peephole_predicated_break);
3498 OPT(dead_code_eliminate);
3499 OPT(opt_peephole_sel);
3500 OPT(dead_control_flow_eliminate, this);
3501 OPT(opt_register_renaming);
3502 OPT(opt_saturate_propagation);
3503 OPT(register_coalesce);
3504 OPT(compute_to_mrf);
3505
3506 OPT(compact_virtual_grfs);
3507 } while (progress);
3508
3509 if (lower_load_payload()) {
3510 split_virtual_grfs();
3511 register_coalesce();
3512 compute_to_mrf();
3513 dead_code_eliminate();
3514 }
3515
3516 lower_uniform_pull_constant_loads();
3517 }
3518
3519 void
3520 fs_visitor::allocate_registers()
3521 {
3522 bool allocated_without_spills;
3523
3524 static enum instruction_scheduler_mode pre_modes[] = {
3525 SCHEDULE_PRE,
3526 SCHEDULE_PRE_NON_LIFO,
3527 SCHEDULE_PRE_LIFO,
3528 };
3529
3530 /* Try each scheduling heuristic to see if it can successfully register
3531 * allocate without spilling. They should be ordered by decreasing
3532 * performance but increasing likelihood of allocating.
3533 */
3534 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3535 schedule_instructions(pre_modes[i]);
3536
3537 if (0) {
3538 assign_regs_trivial();
3539 allocated_without_spills = true;
3540 } else {
3541 allocated_without_spills = assign_regs(false);
3542 }
3543 if (allocated_without_spills)
3544 break;
3545 }
3546
3547 if (!allocated_without_spills) {
3548 /* We assume that any spilling is worse than just dropping back to
3549 * SIMD8. There's probably actually some intermediate point where
3550 * SIMD16 with a couple of spills is still better.
3551 */
3552 if (dispatch_width == 16) {
3553 fail("Failure to register allocate. Reduce number of "
3554 "live scalar values to avoid this.");
3555 } else {
3556 perf_debug("Fragment shader triggered register spilling. "
3557 "Try reducing the number of live scalar values to "
3558 "improve performance.\n");
3559 }
3560
3561 /* Since we're out of heuristics, just go spill registers until we
3562 * get an allocation.
3563 */
3564 while (!assign_regs(true)) {
3565 if (failed)
3566 break;
3567 }
3568 }
3569
3570 /* This must come after all optimization and register allocation, since
3571 * it inserts dead code that happens to have side effects, and it does
3572 * so based on the actual physical registers in use.
3573 */
3574 insert_gen4_send_dependency_workarounds();
3575
3576 if (failed)
3577 return;
3578
3579 if (!allocated_without_spills)
3580 schedule_instructions(SCHEDULE_POST);
3581
3582 if (last_scratch > 0)
3583 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3584 }
3585
3586 bool
3587 fs_visitor::run()
3588 {
3589 sanity_param_count = prog->Parameters->NumParameters;
3590
3591 assign_binding_table_offsets();
3592
3593 if (brw->gen >= 6)
3594 setup_payload_gen6();
3595 else
3596 setup_payload_gen4();
3597
3598 if (0) {
3599 emit_dummy_fs();
3600 } else if (brw->use_rep_send && dispatch_width == 16) {
3601 emit_repclear_shader();
3602 } else {
3603 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3604 emit_shader_time_begin();
3605
3606 calculate_urb_setup();
3607 if (prog->InputsRead > 0) {
3608 if (brw->gen < 6)
3609 emit_interpolation_setup_gen4();
3610 else
3611 emit_interpolation_setup_gen6();
3612 }
3613
3614 /* We handle discards by keeping track of the still-live pixels in f0.1.
3615 * Initialize it with the dispatched pixels.
3616 */
3617 bool uses_kill =
3618 (stage == MESA_SHADER_FRAGMENT) &&
3619 ((brw_wm_prog_data*) this->prog_data)->uses_kill;
3620 bool alpha_test_func =
3621 (stage == MESA_SHADER_FRAGMENT) &&
3622 ((brw_wm_prog_key*) this->key)->alpha_test_func;
3623 if (uses_kill) {
3624 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3625 discard_init->flag_subreg = 1;
3626 }
3627
3628 /* Generate FS IR for main(). (the visitor only descends into
3629 * functions called "main").
3630 */
3631 if (shader) {
3632 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3633 base_ir = ir;
3634 this->result = reg_undef;
3635 ir->accept(this);
3636 }
3637 } else {
3638 emit_fragment_program_code();
3639 }
3640 base_ir = NULL;
3641 if (failed)
3642 return false;
3643
3644 emit(FS_OPCODE_PLACEHOLDER_HALT);
3645
3646 if (alpha_test_func)
3647 emit_alpha_test();
3648
3649 emit_fb_writes();
3650
3651 optimize();
3652
3653 assign_curb_setup();
3654 assign_urb_setup();
3655
3656 allocate_registers();
3657
3658 if (failed)
3659 return false;
3660 }
3661
3662 if (stage == MESA_SHADER_FRAGMENT) {
3663 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3664 if (dispatch_width == 8)
3665 prog_data->reg_blocks = brw_register_blocks(grf_used);
3666 else
3667 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3668 }
3669
3670 /* If any state parameters were appended, then ParameterValues could have
3671 * been realloced, in which case the driver uniform storage set up by
3672 * _mesa_associate_uniform_storage() would point to freed memory. Make
3673 * sure that didn't happen.
3674 */
3675 assert(sanity_param_count == prog->Parameters->NumParameters);
3676
3677 return !failed;
3678 }
3679
3680 const unsigned *
3681 brw_wm_fs_emit(struct brw_context *brw,
3682 void *mem_ctx,
3683 const struct brw_wm_prog_key *key,
3684 struct brw_wm_prog_data *prog_data,
3685 struct gl_fragment_program *fp,
3686 struct gl_shader_program *prog,
3687 unsigned *final_assembly_size)
3688 {
3689 bool start_busy = false;
3690 double start_time = 0;
3691
3692 if (unlikely(brw->perf_debug)) {
3693 start_busy = (brw->batch.last_bo &&
3694 drm_intel_bo_busy(brw->batch.last_bo));
3695 start_time = get_time();
3696 }
3697
3698 struct brw_shader *shader = NULL;
3699 if (prog)
3700 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3701
3702 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3703 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3704
3705 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3706 */
3707 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3708 if (!v.run()) {
3709 if (prog) {
3710 prog->LinkStatus = false;
3711 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3712 }
3713
3714 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3715 v.fail_msg);
3716
3717 return NULL;
3718 }
3719
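   /* In addition to the SIMD8 program above, try a SIMD16 variant whenever
    * the hardware and debug flags allow it and the SIMD8 visitor didn't
    * flag SIMD16 as unsupported; if the SIMD16 compile fails, we simply
    * keep the SIMD8 program.
    */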
3720 cfg_t *simd16_cfg = NULL;
3721 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3722 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3723 brw->use_rep_send)) {
3724 if (!v.simd16_unsupported) {
3725 /* Try a SIMD16 compile */
3726 v2.import_uniforms(&v);
3727 if (!v2.run()) {
3728 perf_debug("SIMD16 shader failed to compile, falling back to "
3729 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3730 } else {
3731 simd16_cfg = v2.cfg;
3732 }
3733 } else {
3734 perf_debug("SIMD16 shader unsupported, falling back to "
3735 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3736 }
3737 }
3738
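   /* DEBUG_NO8 or the per-context no_simd8 workaround drops the SIMD8
    * program, but only if a SIMD16 program exists to take its place;
    * prog_data->no_8 records that decision for later state setup.
    */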
3739 cfg_t *simd8_cfg;
3740 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3741 if (no_simd8 && simd16_cfg) {
3742 simd8_cfg = NULL;
3743 prog_data->no_8 = true;
3744 } else {
3745 simd8_cfg = v.cfg;
3746 prog_data->no_8 = false;
3747 }
3748
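   /* Both variants are generated into one program store; prog_offset_16
    * records where the SIMD16 code begins so that state setup can point
    * the hardware at the right kernel.
    */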
3749 fs_generator g(brw, mem_ctx, key, prog_data, prog, fp,
3750 v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3751 if (simd8_cfg)
3752 g.generate_code(simd8_cfg, 8);
3753 if (simd16_cfg)
3754 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
3755
3756 if (unlikely(brw->perf_debug) && shader) {
3757 if (shader->compiled_once)
3758 brw_wm_debug_recompile(brw, prog, key);
3759 shader->compiled_once = true;
3760
3761 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3762 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3763 (get_time() - start_time) * 1000);
3764 }
3765 }
3766
3767 return g.get_assembly(final_assembly_size);
3768 }
3769
3770 extern "C" bool
3771 brw_fs_precompile(struct gl_context *ctx,
3772 struct gl_shader_program *shader_prog,
3773 struct gl_program *prog)
3774 {
3775 struct brw_context *brw = brw_context(ctx);
3776 struct brw_wm_prog_key key;
3777
3778 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
3779 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3780 bool program_uses_dfdy = fp->UsesDFdy;
3781
3782 memset(&key, 0, sizeof(key));
3783
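   /* Gen4-5 bake depth/stencil ("IZ") state into the WM program key, so the
    * precompile has to guess the most common configuration.
    */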
3784 if (brw->gen < 6) {
3785 if (fp->UsesKill)
3786 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3787
3788 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3789 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3790
3791 /* Just assume depth testing. */
3792 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3793 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3794 }
3795
3796 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3797 BRW_FS_VARYING_INPUT_MASK) > 16)
3798 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3799
3800 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3801 for (unsigned i = 0; i < sampler_count; i++) {
3802 if (fp->Base.ShadowSamplers & (1 << i)) {
3803 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3804 key.tex.swizzles[i] =
3805 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3806 } else {
3807 /* Color sampler: assume no swizzling. */
3808 key.tex.swizzles[i] = SWIZZLE_XYZW;
3809 }
3810 }
3811
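   /* gl_FragCoord and dFdy results depend on the drawable's height and Y
    * orientation, so guess those key fields from the currently bound draw
    * buffer.
    */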
3812 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3813 key.drawable_height = ctx->DrawBuffer->Height;
3814 }
3815
3816 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3817 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3818 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3819
3820 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3821 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3822 key.nr_color_regions > 1;
3823 }
3824
3825 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3826 * quality of the derivatives is likely to be determined by the driconf
3827 * option.
3828 */
3829 key.high_quality_derivatives = brw->disable_derivative_optimization;
3830
3831 key.program_string_id = bfp->id;
3832
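   /* Compile through the normal path, then restore the previous WM program
    * state so that precompiling doesn't disturb whatever is currently bound.
    */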
3833 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3834 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3835
3836 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
3837
3838 brw->wm.base.prog_offset = old_prog_offset;
3839 brw->wm.prog_data = old_prog_data;
3840
3841 return success;
3842 }