i965/fs: Fix the build
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_cfg.h"
50 #include "brw_dead_control_flow.h"
51 #include "main/uniforms.h"
52 #include "brw_fs_live_variables.h"
53 #include "glsl/glsl_types.h"
54
55 void
56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
57 fs_reg *src, int sources)
58 {
59 memset(this, 0, sizeof(*this));
60
61 this->opcode = opcode;
62 this->dst = dst;
63 this->src = src;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (int i = 0; i < sources; ++i) {
79 if (src[i].file != GRF)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (int i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 assert(this->src[i].width > 0);
101 if (this->src[i].width == 1) {
102 this->src[i].effective_width = this->exec_size;
103 } else {
104 this->src[i].effective_width = this->src[i].width;
105 }
106 break;
107 case IMM:
108 case UNIFORM:
109 this->src[i].effective_width = this->exec_size;
110 break;
111 default:
112 unreachable("Invalid source register file");
113 }
114 }
115 this->dst.effective_width = this->exec_size;
116
117 this->conditional_mod = BRW_CONDITIONAL_NONE;
118
119 /* This will be the case for almost all instructions. */
120 switch (dst.file) {
121 case GRF:
122 case HW_REG:
123 case MRF:
124 this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
125 break;
126 case BAD_FILE:
127 this->regs_written = 0;
128 break;
129 case IMM:
130 case UNIFORM:
131 unreachable("Invalid destination register file");
132 default:
133 unreachable("Invalid register file");
134 }
135
136 this->writes_accumulator = false;
137 }
138
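/* Convenience constructors.  Each one rallocs a three-source array off of
 * this instruction and forwards to init(); the variants that don't take an
 * exec_size pass 0 so that init() derives the width from the registers as
 * described above.
 */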
139 fs_inst::fs_inst()
140 {
141 fs_reg *src = ralloc_array(this, fs_reg, 3);
142 init(BRW_OPCODE_NOP, 8, dst, src, 0);
143 }
144
145 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
146 {
147 fs_reg *src = ralloc_array(this, fs_reg, 3);
148 init(opcode, exec_size, reg_undef, src, 0);
149 }
150
151 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
152 {
153 fs_reg *src = ralloc_array(this, fs_reg, 3);
154 init(opcode, 0, dst, src, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 fs_reg *src = ralloc_array(this, fs_reg, 3);
161 src[0] = src0;
162 init(opcode, exec_size, dst, src, 1);
163 }
164
165 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
166 {
167 fs_reg *src = ralloc_array(this, fs_reg, 3);
168 src[0] = src0;
169 init(opcode, 0, dst, src, 1);
170 }
171
172 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
173 const fs_reg &src0, const fs_reg &src1)
174 {
175 fs_reg *src = ralloc_array(this, fs_reg, 3);
176 src[0] = src0;
177 src[1] = src1;
178 init(opcode, exec_size, dst, src, 2);
179 }
180
181 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
182 const fs_reg &src1)
183 {
184 fs_reg *src = ralloc_array(this, fs_reg, 3);
185 src[0] = src0;
186 src[1] = src1;
187 init(opcode, 0, dst, src, 2);
188 }
189
190 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
191 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
192 {
193 fs_reg *src = ralloc_array(this, fs_reg, 3);
194 src[0] = src0;
195 src[1] = src1;
196 src[2] = src2;
197 init(opcode, exec_size, dst, src, 3);
198 }
199
200 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
201 const fs_reg &src1, const fs_reg &src2)
202 {
203 fs_reg *src = ralloc_array(this, fs_reg, 3);
204 src[0] = src0;
205 src[1] = src1;
206 src[2] = src2;
207 init(opcode, 0, dst, src, 3);
208 }
209
210 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
211 {
212 init(opcode, 0, dst, src, sources);
213 }
214
215 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
216 fs_reg src[], int sources)
217 {
218 init(opcode, exec_width, dst, src, sources);
219 }
220
221 fs_inst::fs_inst(const fs_inst &that)
222 {
223 memcpy(this, &that, sizeof(that));
224
225 this->src = ralloc_array(this, fs_reg, that.sources);
226
227 for (int i = 0; i < that.sources; i++)
228 this->src[i] = that.src[i];
229 }
230
231 void
232 fs_inst::resize_sources(uint8_t num_sources)
233 {
234 if (this->sources != num_sources) {
235 this->src = reralloc(this, this->src, fs_reg, num_sources);
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
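/* Each invocation below defines an fs_visitor emitter of the same name; for
 * example, ALU2(ADD) expands to roughly:
 *
 *    fs_inst *
 *    fs_visitor::ADD(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */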
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(brw->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
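/* A typical caller only cares about the flag result and uses a null
 * destination, e.g. (illustrative only):
 *
 *    emit(CMP(reg_null_d, value, fs_reg(0.0f), BRW_CONDITIONAL_NZ));
 */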
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 * gen5 does the comparison on the execution type (resolved source types),
341 * so dst type doesn't matter. gen6 does comparison and then uses the
342 * result as if it was the dst type with no conversion, which happens to
343 * mostly work out for float-interpreted-as-int since our comparisons are
344 * for >0, =0, <0.
345 */
346 if (brw->gen == 4) {
347 dst.type = src0.type;
348 if (dst.file == HW_REG)
349 dst.fixed_hw_reg.type = dst.type;
350 }
351
352 resolve_ud_negate(&src0);
353 resolve_ud_negate(&src1);
354
355 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
356 inst->conditional_mod = condition;
357
358 return inst;
359 }
360
361 fs_inst *
362 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
363 {
364 uint8_t exec_size = dst.width;
365 for (int i = 0; i < sources; ++i) {
366 assert(src[i].width % dst.width == 0);
367 if (src[i].width > exec_size)
368 exec_size = src[i].width;
369 }
370
371 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
372 dst, src, sources);
373 inst->regs_written = 0;
374 for (int i = 0; i < sources; ++i) {
375 /* The LOAD_PAYLOAD instruction only really makes sense if we are
376 * dealing with whole registers. If this ever changes, we can deal
377 * with it later.
378 */
379 int size = src[i].effective_width * type_sz(src[i].type);
380 assert(size % 32 == 0);
381 inst->regs_written += (size + 31) / 32;
382 }
383
384 return inst;
385 }
386
387 exec_list
388 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
389 const fs_reg &surf_index,
390 const fs_reg &varying_offset,
391 uint32_t const_offset)
392 {
393 exec_list instructions;
394 fs_inst *inst;
395
396 /* We have our constant surface use a pitch of 4 bytes, so our index can
397 * be any component of a vector, and then we load 4 contiguous
398 * components starting from that.
399 *
400 * We break down the const_offset to a portion added to the variable
401 * offset and a portion done using reg_offset, which means that if you
402 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
403 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
404 * CSE can later notice that those loads are all the same and eliminate
405 * the redundant ones.
406 */
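   /* Illustration: a const_offset of 14 (in units of the 4-byte pitch above)
    * splits into a vec4_offset of varying_offset + 12 and a remainder of 2,
    * so the final MOV picks out component 2 of the loaded vec4 (times the
    * message scale).
    */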
407 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
408 instructions.push_tail(ADD(vec4_offset,
409 varying_offset, fs_reg(const_offset & ~3)));
410
411 int scale = 1;
412 if (brw->gen == 4 && dst.width == 8) {
413 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
414 * u, v, r) as parameters, or we can just use the SIMD16 message
415 * consisting of (header, u). We choose the second, at the cost of a
416 * longer return length.
417 */
418 scale = 2;
419 }
420
421 enum opcode op;
422 if (brw->gen >= 7)
423 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
424 else
425 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
426
427 assert(dst.width % 8 == 0);
428 int regs_written = 4 * (dst.width / 8) * scale;
429 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
430 dst.type, dst.width);
431 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
432 inst->regs_written = regs_written;
433 instructions.push_tail(inst);
434
435 if (brw->gen < 7) {
436 inst->base_mrf = 13;
437 inst->header_present = true;
438 if (brw->gen == 4)
439 inst->mlen = 3;
440 else
441 inst->mlen = 1 + dispatch_width / 8;
442 }
443
444 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
445 instructions.push_tail(MOV(dst, result));
446
447 return instructions;
448 }
449
450 /**
451 * A helper for MOV generation for fixing up broken hardware SEND dependency
452 * handling.
453 */
454 fs_inst *
455 fs_visitor::DEP_RESOLVE_MOV(int grf)
456 {
457 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
458
459 inst->ir = NULL;
460 inst->annotation = "send dependency resolve";
461
462 /* The caller always wants uncompressed to emit the minimal extra
463 * dependencies, and to avoid having to deal with aligning its regs to 2.
464 */
465 inst->exec_size = 8;
466
467 return inst;
468 }
469
470 bool
471 fs_inst::equals(fs_inst *inst) const
472 {
473 return (opcode == inst->opcode &&
474 dst.equals(inst->dst) &&
475 src[0].equals(inst->src[0]) &&
476 src[1].equals(inst->src[1]) &&
477 src[2].equals(inst->src[2]) &&
478 saturate == inst->saturate &&
479 predicate == inst->predicate &&
480 conditional_mod == inst->conditional_mod &&
481 mlen == inst->mlen &&
482 base_mrf == inst->base_mrf &&
483 target == inst->target &&
484 eot == inst->eot &&
485 header_present == inst->header_present &&
486 shadow_compare == inst->shadow_compare &&
487 exec_size == inst->exec_size &&
488 offset == inst->offset);
489 }
490
491 bool
492 fs_inst::overwrites_reg(const fs_reg &reg) const
493 {
494 return (reg.file == dst.file &&
495 reg.reg == dst.reg &&
496 reg.reg_offset >= dst.reg_offset &&
497 reg.reg_offset < dst.reg_offset + regs_written);
498 }
499
500 bool
501 fs_inst::is_send_from_grf() const
502 {
503 switch (opcode) {
504 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
505 case SHADER_OPCODE_SHADER_TIME_ADD:
506 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
507 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
508 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
509 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
510 case SHADER_OPCODE_UNTYPED_ATOMIC:
511 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
512 return true;
513 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
514 return src[1].file == GRF;
515 case FS_OPCODE_FB_WRITE:
516 return src[0].file == GRF;
517 default:
518 if (is_tex())
519 return src[0].file == GRF;
520
521 return false;
522 }
523 }
524
525 bool
526 fs_inst::can_do_source_mods(struct brw_context *brw)
527 {
528 if (brw->gen == 6 && is_math())
529 return false;
530
531 if (is_send_from_grf())
532 return false;
533
534 if (!backend_instruction::can_do_source_mods())
535 return false;
536
537 return true;
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Fixed brw_reg. */
585 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
586 {
587 init();
588 this->file = HW_REG;
589 this->fixed_hw_reg = fixed_hw_reg;
590 this->type = fixed_hw_reg.type;
591 this->width = 1 << fixed_hw_reg.width;
592 }
593
594 bool
595 fs_reg::equals(const fs_reg &r) const
596 {
597 return (file == r.file &&
598 reg == r.reg &&
599 reg_offset == r.reg_offset &&
600 subreg_offset == r.subreg_offset &&
601 type == r.type &&
602 negate == r.negate &&
603 abs == r.abs &&
604 !reladdr && !r.reladdr &&
605 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
606 width == r.width &&
607 stride == r.stride);
608 }
609
610 fs_reg &
611 fs_reg::apply_stride(unsigned stride)
612 {
613 assert((this->stride * stride) <= 4 &&
614 (is_power_of_two(stride) || stride == 0) &&
615 file != HW_REG && file != IMM);
616 this->stride *= stride;
617 return *this;
618 }
619
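/* Make every channel read the single component `subreg' of the register:
 * point subreg_offset at it and use a zero stride so it is broadcast.
 */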
620 fs_reg &
621 fs_reg::set_smear(unsigned subreg)
622 {
623 assert(file != HW_REG && file != IMM);
624 subreg_offset = subreg * type_sz(type);
625 stride = 0;
626 return *this;
627 }
628
629 bool
630 fs_reg::is_contiguous() const
631 {
632 return stride == 1;
633 }
634
635 bool
636 fs_reg::is_valid_3src() const
637 {
638 return file == GRF || file == UNIFORM;
639 }
640
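/* Returns the size of a GLSL type in scalar (float-sized) components; for
 * example a vec4 is 4, a mat4 is 16, and a mat4[2] array is 32.
 */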
641 int
642 fs_visitor::type_size(const struct glsl_type *type)
643 {
644 unsigned int size, i;
645
646 switch (type->base_type) {
647 case GLSL_TYPE_UINT:
648 case GLSL_TYPE_INT:
649 case GLSL_TYPE_FLOAT:
650 case GLSL_TYPE_BOOL:
651 return type->components();
652 case GLSL_TYPE_ARRAY:
653 return type_size(type->fields.array) * type->length;
654 case GLSL_TYPE_STRUCT:
655 size = 0;
656 for (i = 0; i < type->length; i++) {
657 size += type_size(type->fields.structure[i].type);
658 }
659 return size;
660 case GLSL_TYPE_SAMPLER:
661 /* Samplers take up no register space, since they're baked in at
662 * link time.
663 */
664 return 0;
665 case GLSL_TYPE_ATOMIC_UINT:
666 return 0;
667 case GLSL_TYPE_IMAGE:
668 case GLSL_TYPE_VOID:
669 case GLSL_TYPE_ERROR:
670 case GLSL_TYPE_INTERFACE:
671 unreachable("not reached");
672 }
673
674 return 0;
675 }
676
677 fs_reg
678 fs_visitor::get_timestamp()
679 {
680 assert(brw->gen >= 7);
681
682 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
683 BRW_ARF_TIMESTAMP,
684 0),
685 BRW_REGISTER_TYPE_UD));
686
687 fs_reg dst = fs_reg(this, glsl_type::uint_type);
688
689 fs_inst *mov = emit(MOV(dst, ts));
690 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
691 * even if it's not enabled in the dispatch.
692 */
693 mov->force_writemask_all = true;
694 mov->exec_size = 8;
695
696 /* The caller wants the low 32 bits of the timestamp. Since it's running
 697  * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
698 * which is plenty of time for our purposes. It is identical across the
699 * EUs, but since it's tracking GPU core speed it will increment at a
700 * varying rate as render P-states change.
701 *
702 * The caller could also check if render P-states have changed (or anything
703 * else that might disrupt timing) by setting smear to 2 and checking if
704 * that field is != 0.
705 */
706 dst.set_smear(0);
707
708 return dst;
709 }
710
711 void
712 fs_visitor::emit_shader_time_begin()
713 {
714 current_annotation = "shader time start";
715 shader_start_time = get_timestamp();
716 }
717
718 void
719 fs_visitor::emit_shader_time_end()
720 {
721 current_annotation = "shader time end";
722
723 enum shader_time_shader_type type, written_type, reset_type;
724 if (dispatch_width == 8) {
725 type = ST_FS8;
726 written_type = ST_FS8_WRITTEN;
727 reset_type = ST_FS8_RESET;
728 } else {
729 assert(dispatch_width == 16);
730 type = ST_FS16;
731 written_type = ST_FS16_WRITTEN;
732 reset_type = ST_FS16_RESET;
733 }
734
735 fs_reg shader_end_time = get_timestamp();
736
737 /* Check that there weren't any timestamp reset events (assuming these
738 * were the only two timestamp reads that happened).
739 */
740 fs_reg reset = shader_end_time;
741 reset.set_smear(2);
742 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
743 test->conditional_mod = BRW_CONDITIONAL_Z;
744 emit(IF(BRW_PREDICATE_NORMAL));
745
746 push_force_uncompressed();
747 fs_reg start = shader_start_time;
748 start.negate = true;
749 fs_reg diff = fs_reg(this, glsl_type::uint_type);
750 emit(ADD(diff, start, shader_end_time));
751
752 /* If there were no instructions between the two timestamp gets, the diff
753 * is 2 cycles. Remove that overhead, so I can forget about that when
754 * trying to determine the time taken for single instructions.
755 */
756 emit(ADD(diff, diff, fs_reg(-2u)));
757
758 emit_shader_time_write(type, diff);
759 emit_shader_time_write(written_type, fs_reg(1u));
760 emit(BRW_OPCODE_ELSE);
761 emit_shader_time_write(reset_type, fs_reg(1u));
762 emit(BRW_OPCODE_ENDIF);
763
764 pop_force_uncompressed();
765 }
766
767 void
768 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
769 fs_reg value)
770 {
771 int shader_time_index =
772 brw_get_shader_time_index(brw, shader_prog, prog, type);
773 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
774
775 fs_reg payload;
776 if (dispatch_width == 8)
777 payload = fs_reg(this, glsl_type::uvec2_type);
778 else
779 payload = fs_reg(this, glsl_type::uint_type);
780
781 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
782 fs_reg(), payload, offset, value));
783 }
784
785 void
786 fs_visitor::vfail(const char *format, va_list va)
787 {
788 char *msg;
789
790 if (failed)
791 return;
792
793 failed = true;
794
795 msg = ralloc_vasprintf(mem_ctx, format, va);
796 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
797
798 this->fail_msg = msg;
799
800 if (INTEL_DEBUG & DEBUG_WM) {
801 fprintf(stderr, "%s", msg);
802 }
803 }
804
805 void
806 fs_visitor::fail(const char *format, ...)
807 {
808 va_list va;
809
810 va_start(va, format);
811 vfail(format, va);
812 va_end(va);
813 }
814
815 /**
816 * Mark this program as impossible to compile in SIMD16 mode.
817 *
818 * During the SIMD8 compile (which happens first), we can detect and flag
819 * things that are unsupported in SIMD16 mode, so the compiler can skip
820 * the SIMD16 compile altogether.
821 *
822 * During a SIMD16 compile (if one happens anyway), this just calls fail().
823 */
824 void
825 fs_visitor::no16(const char *format, ...)
826 {
827 va_list va;
828
829 va_start(va, format);
830
831 if (dispatch_width == 16) {
832 vfail(format, va);
833 } else {
834 simd16_unsupported = true;
835
836 if (brw->perf_debug) {
837 if (no16_msg)
838 ralloc_vasprintf_append(&no16_msg, format, va);
839 else
840 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
841 }
842 }
843
844 va_end(va);
845 }
846
847 fs_inst *
848 fs_visitor::emit(enum opcode opcode)
849 {
850 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
851 }
852
853 fs_inst *
854 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
855 {
856 return emit(new(mem_ctx) fs_inst(opcode, dst));
857 }
858
859 fs_inst *
860 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
861 {
862 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
863 }
864
865 fs_inst *
866 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
867 const fs_reg &src1)
868 {
869 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
870 }
871
872 fs_inst *
873 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
874 const fs_reg &src1, const fs_reg &src2)
875 {
876 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
877 }
878
879 fs_inst *
880 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
881 fs_reg src[], int sources)
882 {
883 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
884 }
885
886 void
887 fs_visitor::push_force_uncompressed()
888 {
889 force_uncompressed_stack++;
890 }
891
892 void
893 fs_visitor::pop_force_uncompressed()
894 {
895 force_uncompressed_stack--;
896 assert(force_uncompressed_stack >= 0);
897 }
898
899 /**
900 * Returns true if the instruction has a flag that means it won't
901 * update an entire destination register.
902 *
903 * For example, dead code elimination and live variable analysis want to know
904 * when a write to a variable screens off any preceding values that were in
905 * it.
906 */
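/* Concretely: a predicated write (other than SEL), a write narrower than a
 * full 32-byte register, or a non-contiguous (strided) destination all count
 * as partial.
 */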
907 bool
908 fs_inst::is_partial_write() const
909 {
910 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
911 (this->dst.width * type_sz(this->dst.type)) < 32 ||
912 !this->dst.is_contiguous());
913 }
914
915 int
916 fs_inst::regs_read(fs_visitor *v, int arg) const
917 {
918 if (is_tex() && arg == 0 && src[0].file == GRF) {
919 return mlen;
920 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
921 return mlen;
922 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
923 return mlen;
924 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
925 return mlen;
926 }
927
928 switch (src[arg].file) {
929 case BAD_FILE:
930 case UNIFORM:
931 case IMM:
932 return 1;
933 case GRF:
934 case HW_REG:
935 if (src[arg].stride == 0) {
936 return 1;
937 } else {
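         /* Round the byte footprint up to whole 32-byte registers; e.g. a
          * width-16 float source with stride 1 covers 16 * 1 * 4 = 64 bytes,
          * i.e. two registers.
          */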
938 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
939 return (size + 31) / 32;
940 }
941 case MRF:
942 unreachable("MRF registers are not allowed as sources");
943 default:
944 unreachable("Invalid register file");
945 }
946 }
947
948 bool
949 fs_inst::reads_flag() const
950 {
951 return predicate;
952 }
953
954 bool
955 fs_inst::writes_flag() const
956 {
957 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
958 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
959 }
960
961 /**
962 * Returns how many MRFs an FS opcode will write over.
963 *
964 * Note that this is not the 0 or 1 implied writes in an actual gen
965 * instruction -- the FS opcodes often generate MOVs in addition.
966 */
967 int
968 fs_visitor::implied_mrf_writes(fs_inst *inst)
969 {
970 if (inst->mlen == 0)
971 return 0;
972
973 if (inst->base_mrf == -1)
974 return 0;
975
976 switch (inst->opcode) {
977 case SHADER_OPCODE_RCP:
978 case SHADER_OPCODE_RSQ:
979 case SHADER_OPCODE_SQRT:
980 case SHADER_OPCODE_EXP2:
981 case SHADER_OPCODE_LOG2:
982 case SHADER_OPCODE_SIN:
983 case SHADER_OPCODE_COS:
984 return 1 * dispatch_width / 8;
985 case SHADER_OPCODE_POW:
986 case SHADER_OPCODE_INT_QUOTIENT:
987 case SHADER_OPCODE_INT_REMAINDER:
988 return 2 * dispatch_width / 8;
989 case SHADER_OPCODE_TEX:
990 case FS_OPCODE_TXB:
991 case SHADER_OPCODE_TXD:
992 case SHADER_OPCODE_TXF:
993 case SHADER_OPCODE_TXF_CMS:
994 case SHADER_OPCODE_TXF_MCS:
995 case SHADER_OPCODE_TG4:
996 case SHADER_OPCODE_TG4_OFFSET:
997 case SHADER_OPCODE_TXL:
998 case SHADER_OPCODE_TXS:
999 case SHADER_OPCODE_LOD:
1000 return 1;
1001 case FS_OPCODE_FB_WRITE:
1002 return 2;
1003 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1004 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1005 return 1;
1006 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1007 return inst->mlen;
1008 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1009 return 2;
1010 case SHADER_OPCODE_UNTYPED_ATOMIC:
1011 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1012 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1013 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1014 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1015 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1016 return 0;
1017 default:
1018 unreachable("not reached");
1019 }
1020 }
1021
1022 int
1023 fs_visitor::virtual_grf_alloc(int size)
1024 {
1025 if (virtual_grf_array_size <= virtual_grf_count) {
1026 if (virtual_grf_array_size == 0)
1027 virtual_grf_array_size = 16;
1028 else
1029 virtual_grf_array_size *= 2;
1030 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
1031 virtual_grf_array_size);
1032 }
1033 virtual_grf_sizes[virtual_grf_count] = size;
1034 return virtual_grf_count++;
1035 }
1036
1037 /** Fixed HW reg constructor. */
1038 fs_reg::fs_reg(enum register_file file, int reg)
1039 {
1040 init();
1041 this->file = file;
1042 this->reg = reg;
1043 this->type = BRW_REGISTER_TYPE_F;
1044
1045 switch (file) {
1046 case UNIFORM:
1047 this->width = 1;
1048 break;
1049 default:
1050 this->width = 8;
1051 }
1052 }
1053
1054 /** Fixed HW reg constructor. */
1055 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1056 {
1057 init();
1058 this->file = file;
1059 this->reg = reg;
1060 this->type = type;
1061
1062 switch (file) {
1063 case UNIFORM:
1064 this->width = 1;
1065 break;
1066 default:
1067 this->width = 8;
1068 }
1069 }
1070
1071 /** Fixed HW reg constructor. */
1072 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1073 uint8_t width)
1074 {
1075 init();
1076 this->file = file;
1077 this->reg = reg;
1078 this->type = type;
1079 this->width = width;
1080 }
1081
1082 /** Automatic reg constructor. */
1083 fs_reg::fs_reg(fs_visitor *v, const struct glsl_type *type)
1084 {
1085 init();
1086 int reg_width = v->dispatch_width / 8;
1087
1088 this->file = GRF;
1089 this->reg = v->virtual_grf_alloc(v->type_size(type) * reg_width);
1090 this->reg_offset = 0;
1091 this->type = brw_type_for_base_type(type);
1092 this->width = v->dispatch_width;
1093 assert(this->width == 8 || this->width == 16);
1094 }
1095
1096 fs_reg *
1097 fs_visitor::variable_storage(ir_variable *var)
1098 {
1099 return (fs_reg *)hash_table_find(this->variable_ht, var);
1100 }
1101
1102 void
1103 import_uniforms_callback(const void *key,
1104 void *data,
1105 void *closure)
1106 {
1107 struct hash_table *dst_ht = (struct hash_table *)closure;
1108 const fs_reg *reg = (const fs_reg *)data;
1109
1110 if (reg->file != UNIFORM)
1111 return;
1112
1113 hash_table_insert(dst_ht, data, key);
1114 }
1115
1116 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1117  * This brings in those uniform definitions.
1118  */
1119 void
1120 fs_visitor::import_uniforms(fs_visitor *v)
1121 {
1122 hash_table_call_foreach(v->variable_ht,
1123 import_uniforms_callback,
1124 variable_ht);
1125 this->push_constant_loc = v->push_constant_loc;
1126 this->pull_constant_loc = v->pull_constant_loc;
1127 this->uniforms = v->uniforms;
1128 this->param_size = v->param_size;
1129 }
1130
1131 /* Our support for uniforms is piggy-backed on the struct
1132 * gl_fragment_program, because that's where the values actually
1133 * get stored, rather than in some global gl_shader_program uniform
1134 * store.
1135 */
1136 void
1137 fs_visitor::setup_uniform_values(ir_variable *ir)
1138 {
1139 int namelen = strlen(ir->name);
1140
1141 /* The data for our (non-builtin) uniforms is stored in a series of
1142 * gl_uniform_driver_storage structs for each subcomponent that
1143 * glGetUniformLocation() could name. We know it's been set up in the same
1144 * order we'd walk the type, so walk the list of storage and find anything
1145 * with our name, or the prefix of a component that starts with our name.
1146 */
1147 unsigned params_before = uniforms;
1148 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1149 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1150
1151 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1152 (storage->name[namelen] != 0 &&
1153 storage->name[namelen] != '.' &&
1154 storage->name[namelen] != '[')) {
1155 continue;
1156 }
1157
1158 unsigned slots = storage->type->component_slots();
1159 if (storage->array_elements)
1160 slots *= storage->array_elements;
1161
1162 for (unsigned i = 0; i < slots; i++) {
1163 stage_prog_data->param[uniforms++] = &storage->storage[i];
1164 }
1165 }
1166
1167 /* Make sure we actually initialized the right amount of stuff here. */
1168 assert(params_before + ir->type->component_slots() == uniforms);
1169 (void)params_before;
1170 }
1171
1172
1173 /* Our support for builtin uniforms is even scarier than non-builtin.
1174 * It sits on top of the PROG_STATE_VAR parameters that are
1175 * automatically updated from GL context state.
1176 */
1177 void
1178 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1179 {
1180 const ir_state_slot *const slots = ir->get_state_slots();
1181 assert(slots != NULL);
1182
1183 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1184 /* This state reference has already been setup by ir_to_mesa, but we'll
1185 * get the same index back here.
1186 */
1187 int index = _mesa_add_state_reference(this->prog->Parameters,
1188 (gl_state_index *)slots[i].tokens);
1189
1190 /* Add each of the unique swizzles of the element as a parameter.
1191 * This'll end up matching the expected layout of the
1192 * array/matrix/structure we're trying to fill in.
1193 */
1194 int last_swiz = -1;
1195 for (unsigned int j = 0; j < 4; j++) {
1196 int swiz = GET_SWZ(slots[i].swizzle, j);
1197 if (swiz == last_swiz)
1198 break;
1199 last_swiz = swiz;
1200
1201 stage_prog_data->param[uniforms++] =
1202 &prog->Parameters->ParameterValues[index][swiz];
1203 }
1204 }
1205 }
1206
1207 fs_reg *
1208 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1209 {
1210 assert(stage == MESA_SHADER_FRAGMENT);
1211 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1212 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1213 fs_reg wpos = *reg;
1214 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1215
1216 /* gl_FragCoord.x */
1217 if (ir->data.pixel_center_integer) {
1218 emit(MOV(wpos, this->pixel_x));
1219 } else {
1220 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1221 }
1222 wpos = offset(wpos, 1);
1223
1224 /* gl_FragCoord.y */
1225 if (!flip && ir->data.pixel_center_integer) {
1226 emit(MOV(wpos, this->pixel_y));
1227 } else {
1228 fs_reg pixel_y = this->pixel_y;
1229 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1230
1231 if (flip) {
1232 pixel_y.negate = true;
1233 offset += key->drawable_height - 1.0;
1234 }
1235
1236 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1237 }
1238 wpos = offset(wpos, 1);
1239
1240 /* gl_FragCoord.z */
1241 if (brw->gen >= 6) {
1242 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1243 } else {
1244 emit(FS_OPCODE_LINTERP, wpos,
1245 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1246 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1247 interp_reg(VARYING_SLOT_POS, 2));
1248 }
1249 wpos = offset(wpos, 1);
1250
1251 /* gl_FragCoord.w: Already set up in emit_interpolation */
1252 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1253
1254 return reg;
1255 }
1256
1257 fs_inst *
1258 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1259 glsl_interp_qualifier interpolation_mode,
1260 bool is_centroid, bool is_sample)
1261 {
1262 brw_wm_barycentric_interp_mode barycoord_mode;
1263 if (brw->gen >= 6) {
1264 if (is_centroid) {
1265 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1266 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1267 else
1268 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1269 } else if (is_sample) {
1270 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1271 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1272 else
1273 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1274 } else {
1275 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1276 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1277 else
1278 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1279 }
1280 } else {
1281 /* On Ironlake and below, there is only one interpolation mode.
1282 * Centroid interpolation doesn't mean anything on this hardware --
1283 * there is no multisampling.
1284 */
1285 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1286 }
1287 return emit(FS_OPCODE_LINTERP, attr,
1288 this->delta_x[barycoord_mode],
1289 this->delta_y[barycoord_mode], interp);
1290 }
1291
1292 fs_reg *
1293 fs_visitor::emit_general_interpolation(ir_variable *ir)
1294 {
1295 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1296 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1297 fs_reg attr = *reg;
1298
1299 assert(stage == MESA_SHADER_FRAGMENT);
1300 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1301 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1302
1303 unsigned int array_elements;
1304 const glsl_type *type;
1305
1306 if (ir->type->is_array()) {
1307 array_elements = ir->type->length;
1308 if (array_elements == 0) {
1309 fail("dereferenced array '%s' has length 0\n", ir->name);
1310 }
1311 type = ir->type->fields.array;
1312 } else {
1313 array_elements = 1;
1314 type = ir->type;
1315 }
1316
1317 glsl_interp_qualifier interpolation_mode =
1318 ir->determine_interpolation_mode(key->flat_shade);
1319
1320 int location = ir->data.location;
1321 for (unsigned int i = 0; i < array_elements; i++) {
1322 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1323 if (prog_data->urb_setup[location] == -1) {
1324 /* If there's no incoming setup data for this slot, don't
1325 * emit interpolation for it.
1326 */
1327 attr = offset(attr, type->vector_elements);
1328 location++;
1329 continue;
1330 }
1331
1332 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1333 /* Constant interpolation (flat shading) case. The SF has
1334 * handed us defined values in only the constant offset
1335 * field of the setup reg.
1336 */
1337 for (unsigned int k = 0; k < type->vector_elements; k++) {
1338 struct brw_reg interp = interp_reg(location, k);
1339 interp = suboffset(interp, 3);
1340 interp.type = reg->type;
1341 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1342 attr = offset(attr, 1);
1343 }
1344 } else {
1345 /* Smooth/noperspective interpolation case. */
1346 for (unsigned int k = 0; k < type->vector_elements; k++) {
1347 struct brw_reg interp = interp_reg(location, k);
1348 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1349 /* Get the pixel/sample mask into f0 so that we know
1350 * which pixels are lit. Then, for each channel that is
1351 * unlit, replace the centroid data with non-centroid
1352 * data.
1353 */
1354 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1355
1356 fs_inst *inst;
1357 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1358 false, false);
1359 inst->predicate = BRW_PREDICATE_NORMAL;
1360 inst->predicate_inverse = true;
1361 if (brw->has_pln)
1362 inst->no_dd_clear = true;
1363
1364 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1365 ir->data.centroid && !key->persample_shading,
1366 ir->data.sample || key->persample_shading);
1367 inst->predicate = BRW_PREDICATE_NORMAL;
1368 inst->predicate_inverse = false;
1369 if (brw->has_pln)
1370 inst->no_dd_check = true;
1371
1372 } else {
1373 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1374 ir->data.centroid && !key->persample_shading,
1375 ir->data.sample || key->persample_shading);
1376 }
1377 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1378 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1379 }
1380 attr = offset(attr, 1);
1381 }
1382
1383 }
1384 location++;
1385 }
1386 }
1387
1388 return reg;
1389 }
1390
1391 fs_reg *
1392 fs_visitor::emit_frontfacing_interpolation()
1393 {
1394 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::bool_type);
1395
1396 if (brw->gen >= 6) {
1397 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1398 * a boolean result from this (~0/true or 0/false).
1399 *
1400 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1401 * this task in only one instruction:
1402 * - a negation source modifier will flip the bit; and
1403 * - a W -> D type conversion will sign extend the bit into the high
1404 * word of the destination.
1405 *
1406 * An ASR 15 fills the low word of the destination.
1407 */
1408 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1409 g0.negate = true;
1410
1411 emit(ASR(*reg, g0, fs_reg(15)));
1412 } else {
1413 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1414 * a boolean result from this (1/true or 0/false).
1415 *
1416 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1417 * the negation source modifier to flip it. Unfortunately the SHR
1418 * instruction only operates on UD (or D with an abs source modifier)
1419 * sources without negation.
1420 *
1421 * Instead, use ASR (which will give ~0/true or 0/false) followed by an
1422 * AND 1.
1423 */
1424 fs_reg asr = fs_reg(this, glsl_type::bool_type);
1425 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1426 g1_6.negate = true;
1427
1428 emit(ASR(asr, g1_6, fs_reg(31)));
1429 emit(AND(*reg, asr, fs_reg(1)));
1430 }
1431
1432 return reg;
1433 }
1434
1435 void
1436 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1437 {
1438 assert(stage == MESA_SHADER_FRAGMENT);
1439 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1440 assert(dst.type == BRW_REGISTER_TYPE_F);
1441
1442 if (key->compute_pos_offset) {
1443 /* Convert int_sample_pos to floating point */
1444 emit(MOV(dst, int_sample_pos));
1445 /* Scale to the range [0, 1] */
1446 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1447 }
1448 else {
1449 /* From ARB_sample_shading specification:
1450 * "When rendering to a non-multisample buffer, or if multisample
1451 * rasterization is disabled, gl_SamplePosition will always be
1452        *  (0.5, 0.5)."
1453 */
1454 emit(MOV(dst, fs_reg(0.5f)));
1455 }
1456 }
1457
1458 fs_reg *
1459 fs_visitor::emit_samplepos_setup()
1460 {
1461 assert(brw->gen >= 6);
1462
1463 this->current_annotation = "compute sample position";
1464 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::vec2_type);
1465 fs_reg pos = *reg;
1466 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1467 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1468
1469 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1470 * mode will be enabled.
1471 *
1472 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1473 * R31.1:0 Position Offset X/Y for Slot[3:0]
1474 * R31.3:2 Position Offset X/Y for Slot[7:4]
1475 * .....
1476 *
1477 * The X, Y sample positions come in as bytes in thread payload. So, read
1478 * the positions using vstride=16, width=8, hstride=2.
1479 */
1480 struct brw_reg sample_pos_reg =
1481 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1482 BRW_REGISTER_TYPE_B), 16, 8, 2);
1483
1484 if (dispatch_width == 8) {
1485 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1486 } else {
1487 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1488 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1489 ->force_sechalf = true;
1490 }
1491 /* Compute gl_SamplePosition.x */
1492 compute_sample_position(pos, int_sample_x);
1493 pos = offset(pos, 1);
1494 if (dispatch_width == 8) {
1495 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1496 } else {
1497 emit(MOV(half(int_sample_y, 0),
1498 fs_reg(suboffset(sample_pos_reg, 1))));
1499 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1500 ->force_sechalf = true;
1501 }
1502 /* Compute gl_SamplePosition.y */
1503 compute_sample_position(pos, int_sample_y);
1504 return reg;
1505 }
1506
1507 fs_reg *
1508 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1509 {
1510 assert(stage == MESA_SHADER_FRAGMENT);
1511 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1512 assert(brw->gen >= 6);
1513
1514 this->current_annotation = "compute sample id";
1515 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1516
1517 if (key->compute_sample_id) {
1518 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1519 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1520 t2.type = BRW_REGISTER_TYPE_UW;
1521
1522 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1523 * 8x multisampling, subspan 0 will represent sample N (where N
1524 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1525 * 7. We can find the value of N by looking at R0.0 bits 7:6
1526 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1527 * (since samples are always delivered in pairs). That is, we
1528 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1529 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1530 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1531 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1532 * populating a temporary variable with the sequence (0, 1, 2, 3),
1533 * and then reading from it using vstride=1, width=4, hstride=0.
1534 * These computations hold good for 4x multisampling as well.
1535 *
1536 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1537 * the first four slots are sample 0 of subspan 0; the next four
1538 * are sample 1 of subspan 0; the third group is sample 0 of
1539 * subspan 1, and finally sample 1 of subspan 1.
1540 */
1541 fs_inst *inst;
1542 inst = emit(BRW_OPCODE_AND, t1,
1543 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1544 fs_reg(0xc0));
1545 inst->force_writemask_all = true;
1546 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1547 inst->force_writemask_all = true;
1548 /* This works for both SIMD8 and SIMD16 */
1549 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1550 inst->force_writemask_all = true;
1551 /* This special instruction takes care of setting vstride=1,
1552 * width=4, hstride=0 of t2 during an ADD instruction.
1553 */
1554 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1555 } else {
1556 /* As per GL_ARB_sample_shading specification:
1557 * "When rendering to a non-multisample buffer, or if multisample
1558 * rasterization is disabled, gl_SampleID will always be zero."
1559 */
1560 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1561 }
1562
1563 return reg;
1564 }
1565
1566 fs_reg
1567 fs_visitor::fix_math_operand(fs_reg src)
1568 {
1569 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1570 * might be able to do better by doing execsize = 1 math and then
1571 * expanding that result out, but we would need to be careful with
1572 * masking.
1573 *
1574 * The hardware ignores source modifiers (negate and abs) on math
1575 * instructions, so we also move to a temp to set those up.
1576 */
1577 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1578 !src.abs && !src.negate)
1579 return src;
1580
1581 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1582     * operands to math instructions.
1583 */
1584 if (brw->gen >= 7 && src.file != IMM)
1585 return src;
1586
1587 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1588 expanded.type = src.type;
1589 emit(BRW_OPCODE_MOV, expanded, src);
1590 return expanded;
1591 }
1592
1593 fs_inst *
1594 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1595 {
1596 switch (opcode) {
1597 case SHADER_OPCODE_RCP:
1598 case SHADER_OPCODE_RSQ:
1599 case SHADER_OPCODE_SQRT:
1600 case SHADER_OPCODE_EXP2:
1601 case SHADER_OPCODE_LOG2:
1602 case SHADER_OPCODE_SIN:
1603 case SHADER_OPCODE_COS:
1604 break;
1605 default:
1606 unreachable("not reached: bad math opcode");
1607 }
1608
1609 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1610 * might be able to do better by doing execsize = 1 math and then
1611 * expanding that result out, but we would need to be careful with
1612 * masking.
1613 *
1614 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1615 * instructions, so we also move to a temp to set those up.
1616 */
1617 if (brw->gen == 6 || brw->gen == 7)
1618 src = fix_math_operand(src);
1619
1620 fs_inst *inst = emit(opcode, dst, src);
1621
1622 if (brw->gen < 6) {
1623 inst->base_mrf = 2;
1624 inst->mlen = dispatch_width / 8;
1625 }
1626
1627 return inst;
1628 }
1629
1630 fs_inst *
1631 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1632 {
1633 int base_mrf = 2;
1634 fs_inst *inst;
1635
1636 if (brw->gen >= 8) {
1637 inst = emit(opcode, dst, src0, src1);
1638 } else if (brw->gen >= 6) {
1639 src0 = fix_math_operand(src0);
1640 src1 = fix_math_operand(src1);
1641
1642 inst = emit(opcode, dst, src0, src1);
1643 } else {
1644 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1645 * "Message Payload":
1646 *
1647 * "Operand0[7]. For the INT DIV functions, this operand is the
1648 * denominator."
1649 * ...
1650 * "Operand1[7]. For the INT DIV functions, this operand is the
1651 * numerator."
1652 */
1653 bool is_int_div = opcode != SHADER_OPCODE_POW;
1654 fs_reg &op0 = is_int_div ? src1 : src0;
1655 fs_reg &op1 = is_int_div ? src0 : src1;
1656
1657 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1658 inst = emit(opcode, dst, op0, reg_null_f);
1659
1660 inst->base_mrf = base_mrf;
1661 inst->mlen = 2 * dispatch_width / 8;
1662 }
1663 return inst;
1664 }
1665
1666 void
1667 fs_visitor::assign_curb_setup()
1668 {
1669 if (dispatch_width == 8) {
1670 prog_data->dispatch_grf_start_reg = payload.num_regs;
1671 } else {
1672 assert(stage == MESA_SHADER_FRAGMENT);
1673 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1674 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1675 }
1676
1677 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1678
1679 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1680 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1681 for (unsigned int i = 0; i < inst->sources; i++) {
1682 if (inst->src[i].file == UNIFORM) {
1683 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1684 int constant_nr;
1685 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1686 constant_nr = push_constant_loc[uniform_nr];
1687 } else {
1688 /* Section 5.11 of the OpenGL 4.1 spec says:
1689 * "Out-of-bounds reads return undefined values, which include
1690 * values from other variables of the active program or zero."
1691 * Just return the first push constant.
1692 */
1693 constant_nr = 0;
1694 }
1695
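            /* Eight 32-bit constants fit in one GRF, so constant_nr / 8
             * selects the register after the payload and constant_nr % 8 the
             * channel; e.g. constant_nr == 11 lands in channel 3 of the
             * second CURB register.
             */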
1696 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1697 constant_nr / 8,
1698 constant_nr % 8);
1699
1700 inst->src[i].file = HW_REG;
1701 inst->src[i].fixed_hw_reg = byte_offset(
1702 retype(brw_reg, inst->src[i].type),
1703 inst->src[i].subreg_offset);
1704 }
1705 }
1706 }
1707 }
1708
1709 void
1710 fs_visitor::calculate_urb_setup()
1711 {
1712 assert(stage == MESA_SHADER_FRAGMENT);
1713 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1714 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1715
1716 memset(prog_data->urb_setup, -1,
1717 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1718
1719 int urb_next = 0;
1720 /* Figure out where each of the incoming setup attributes lands. */
1721 if (brw->gen >= 6) {
1722 if (_mesa_bitcount_64(prog->InputsRead &
1723 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1724 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1725 * first 16 varying inputs, so we can put them wherever we want.
1726 * Just put them in order.
1727 *
1728 * This is useful because it means that (a) inputs not used by the
1729 * fragment shader won't take up valuable register space, and (b) we
1730 * won't have to recompile the fragment shader if it gets paired with
1731 * a different vertex (or geometry) shader.
1732 */
1733 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1734 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1735 BITFIELD64_BIT(i)) {
1736 prog_data->urb_setup[i] = urb_next++;
1737 }
1738 }
1739 } else {
1740 /* We have enough input varyings that the SF/SBE pipeline stage can't
1741 * arbitrarily rearrange them to suit our whim; we have to put them
1742 * in an order that matches the output of the previous pipeline stage
1743 * (geometry or vertex shader).
1744 */
1745 struct brw_vue_map prev_stage_vue_map;
1746 brw_compute_vue_map(brw, &prev_stage_vue_map,
1747 key->input_slots_valid);
1748 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1749 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1750 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1751 slot++) {
1752 int varying = prev_stage_vue_map.slot_to_varying[slot];
1753 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1754 * unused.
1755 */
1756 if (varying != BRW_VARYING_SLOT_COUNT &&
1757 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1758 BITFIELD64_BIT(varying))) {
1759 prog_data->urb_setup[varying] = slot - first_slot;
1760 }
1761 }
1762 urb_next = prev_stage_vue_map.num_slots - first_slot;
1763 }
1764 } else {
1765 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1766 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1767 /* Point size is packed into the header, not as a general attribute */
1768 if (i == VARYING_SLOT_PSIZ)
1769 continue;
1770
1771 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1772 /* The back color slot is skipped when the front color is
1773 * also written to. In addition, some slots can be
1774 * written in the vertex shader and not read in the
1775 * fragment shader. So the register number must always be
1776 * incremented, mapped or not.
1777 */
1778 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1779 prog_data->urb_setup[i] = urb_next;
1780 urb_next++;
1781 }
1782 }
1783
1784 /*
1785     * It's an FS-only attribute, and we did interpolation for this attribute
1786     * in the SF thread, so count it here, too.
1787 *
1788 * See compile_sf_prog() for more info.
1789 */
1790 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1791 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1792 }
1793
1794 prog_data->num_varying_inputs = urb_next;
1795 }
1796
1797 void
1798 fs_visitor::assign_urb_setup()
1799 {
1800 assert(stage == MESA_SHADER_FRAGMENT);
1801 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1802
1803 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1804
1805    /* Offset all the urb_setup[] indices by the actual position of the
1806     * setup regs, now that the location of the constants has been chosen.
1807 */
1808 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1809 if (inst->opcode == FS_OPCODE_LINTERP) {
1810 assert(inst->src[2].file == HW_REG);
1811 inst->src[2].fixed_hw_reg.nr += urb_start;
1812 }
1813
1814 if (inst->opcode == FS_OPCODE_CINTERP) {
1815 assert(inst->src[0].file == HW_REG);
1816 inst->src[0].fixed_hw_reg.nr += urb_start;
1817 }
1818 }
1819
1820 /* Each attribute is 4 setup channels, each of which is half a reg. */
1821 this->first_non_payload_grf =
1822 urb_start + prog_data->num_varying_inputs * 2;
1823 }
1824
1825 /**
1826 * Split large virtual GRFs into separate components if we can.
1827 *
1828 * This is mostly duplicated with what brw_fs_vector_splitting does,
1829 * but that's really conservative because it's afraid of doing
1830 * splitting that doesn't result in real progress after the rest of
1831 * the optimization phases, which would cause infinite looping in
1832 * optimization. We can do it once here, safely. This also has the
1833 * opportunity to split interpolated values, or maybe even uniforms,
1834 * which we don't have at the IR level.
1835 *
1836 * We want to split, because virtual GRFs are what we register
1837 * allocate and spill (due to contiguousness requirements for some
1838 * instructions), and they're what we naturally generate in the
1839 * codegen process, but most virtual GRFs don't actually need to be
1840 * contiguous sets of GRFs. If we split, we'll end up with reduced
1841 * live intervals and better dead code elimination and coalescing.
1842 */
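/* As a rough example of the marking below: a size-4 virtual GRF in which one
 * instruction reads registers 1 and 2 as a unit ends up split into three new
 * virtual GRFs of sizes 1, 2 and 1.
 */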
1843 void
1844 fs_visitor::split_virtual_grfs()
1845 {
1846 int num_vars = this->virtual_grf_count;
1847
1848 /* Count the total number of registers */
1849 int reg_count = 0;
1850 int vgrf_to_reg[num_vars];
1851 for (int i = 0; i < num_vars; i++) {
1852 vgrf_to_reg[i] = reg_count;
1853 reg_count += virtual_grf_sizes[i];
1854 }
1855
1856 /* An array of "split points". For each register slot, this indicates
1857 * if this slot can be separated from the previous slot. Every time an
1858 * instruction uses multiple elements of a register (as a source or
1859 * destination), we mark the used slots as inseparable. Then we go
1860 * through and split the registers into the smallest pieces we can.
1861 */
1862 bool split_points[reg_count];
1863 memset(split_points, 0, sizeof(split_points));
1864
1865 /* Mark all used registers as fully splittable */
1866 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1867 if (inst->dst.file == GRF) {
1868 int reg = vgrf_to_reg[inst->dst.reg];
1869 for (int j = 1; j < this->virtual_grf_sizes[inst->dst.reg]; j++)
1870 split_points[reg + j] = true;
1871 }
1872
1873 for (int i = 0; i < inst->sources; i++) {
1874 if (inst->src[i].file == GRF) {
1875 int reg = vgrf_to_reg[inst->src[i].reg];
1876 for (int j = 1; j < this->virtual_grf_sizes[inst->src[i].reg]; j++)
1877 split_points[reg + j] = true;
1878 }
1879 }
1880 }
1881
1882 if (brw->has_pln &&
1883 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1884 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1885 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1886 * Gen6, that was the only supported interpolation mode, and since Gen6,
1887 * delta_x and delta_y are in fixed hardware registers.
1888 */
1889 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1890 split_points[vgrf_to_reg[vgrf] + 1] = false;
1891 }
1892
1893 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1894 if (inst->dst.file == GRF) {
1895 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1896 for (int j = 1; j < inst->regs_written; j++)
1897 split_points[reg + j] = false;
1898 }
1899 for (int i = 0; i < inst->sources; i++) {
1900 if (inst->src[i].file == GRF) {
1901 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1902 for (int j = 1; j < inst->regs_read(this, i); j++)
1903 split_points[reg + j] = false;
1904 }
1905 }
1906 }
1907
1908 int new_virtual_grf[reg_count];
1909 int new_reg_offset[reg_count];
1910
1911 int reg = 0;
1912 for (int i = 0; i < num_vars; i++) {
1913 /* The first one should always be 0 as a quick sanity check. */
1914 assert(split_points[reg] == false);
1915
1916 /* j = 0 case */
1917 new_reg_offset[reg] = 0;
1918 reg++;
1919 int offset = 1;
1920
1921 /* j > 0 case */
1922 for (int j = 1; j < virtual_grf_sizes[i]; j++) {
1923 /* If this is a split point, reset the offset to 0 and allocate a
1924 * new virtual GRF covering the previous `offset` registers.
1925 */
1926 if (split_points[reg]) {
1927 int grf = virtual_grf_alloc(offset);
1928 for (int k = reg - offset; k < reg; k++)
1929 new_virtual_grf[k] = grf;
1930 offset = 0;
1931 }
1932 new_reg_offset[reg] = offset;
1933 offset++;
1934 reg++;
1935 }
1936
1937 /* The last one gets the original register number */
1938 virtual_grf_sizes[i] = offset;
1939 for (int k = reg - offset; k < reg; k++)
1940 new_virtual_grf[k] = i;
1941 }
1942 assert(reg == reg_count);
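/* new_virtual_grf[] and new_reg_offset[] now map every old (vgrf, reg_offset)
* slot to its post-split VGRF and offset, so rewrite all instructions to use
* them.
*/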
1943
1944 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1945 if (inst->dst.file == GRF) {
1946 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1947 inst->dst.reg = new_virtual_grf[reg];
1948 inst->dst.reg_offset = new_reg_offset[reg];
1949 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
1950 }
1951 for (int i = 0; i < inst->sources; i++) {
1952 if (inst->src[i].file == GRF) {
1953 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1954 inst->src[i].reg = new_virtual_grf[reg];
1955 inst->src[i].reg_offset = new_reg_offset[reg];
1956 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
1957 }
1958 }
1959 }
1960 invalidate_live_intervals();
1961 }
1962
1963 /**
1964 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1965 *
1966 * During code generation, we create tons of temporary variables, many of
1967 * which get immediately killed and are never used again. Yet, in later
1968 * optimization and analysis passes, such as compute_live_intervals, we need
1969 * to loop over all the virtual GRFs. Compacting them can save a lot of
1970 * overhead.
1971 */
1972 bool
1973 fs_visitor::compact_virtual_grfs()
1974 {
1975 bool progress = false;
1976 int remap_table[this->virtual_grf_count];
1977 memset(remap_table, -1, sizeof(remap_table));
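/* remap_table[i] stays -1 for VGRFs that are never referenced; referenced
* ones get a new, compacted index assigned below.
*/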
1978
1979 /* Mark which virtual GRFs are used. */
1980 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1981 if (inst->dst.file == GRF)
1982 remap_table[inst->dst.reg] = 0;
1983
1984 for (int i = 0; i < inst->sources; i++) {
1985 if (inst->src[i].file == GRF)
1986 remap_table[inst->src[i].reg] = 0;
1987 }
1988 }
1989
1990 /* Compact the GRF arrays. */
1991 int new_index = 0;
1992 for (int i = 0; i < this->virtual_grf_count; i++) {
1993 if (remap_table[i] == -1) {
1994 /* We just found an unused register. This means that we are
1995 * actually going to compact something.
1996 */
1997 progress = true;
1998 } else {
1999 remap_table[i] = new_index;
2000 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
2001 invalidate_live_intervals();
2002 ++new_index;
2003 }
2004 }
2005
2006 this->virtual_grf_count = new_index;
2007
2008 /* Patch all the instructions to use the newly renumbered registers */
2009 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2010 if (inst->dst.file == GRF)
2011 inst->dst.reg = remap_table[inst->dst.reg];
2012
2013 for (int i = 0; i < inst->sources; i++) {
2014 if (inst->src[i].file == GRF)
2015 inst->src[i].reg = remap_table[inst->src[i].reg];
2016 }
2017 }
2018
2019 /* Patch all the references to delta_x/delta_y, since they're used in
2020 * register allocation. If they're unused, switch them to BAD_FILE so
2021 * we don't think some random VGRF is delta_x/delta_y.
2022 */
2023 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2024 if (delta_x[i].file == GRF) {
2025 if (remap_table[delta_x[i].reg] != -1) {
2026 delta_x[i].reg = remap_table[delta_x[i].reg];
2027 } else {
2028 delta_x[i].file = BAD_FILE;
2029 }
2030 }
2031 }
2032 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2033 if (delta_y[i].file == GRF) {
2034 if (remap_table[delta_y[i].reg] != -1) {
2035 delta_y[i].reg = remap_table[delta_y[i].reg];
2036 } else {
2037 delta_y[i].file = BAD_FILE;
2038 }
2039 }
2040 }
2041
2042 return progress;
2043 }
2044
2045 /*
2046 * Implements array access of uniforms by inserting a
2047 * PULL_CONSTANT_LOAD instruction.
2048 *
2049 * Unlike temporary GRF array access (which we don't support due to
2050 * the difficulty of doing relative addressing on instruction
2051 * destinations), we could potentially do array access of uniforms
2052 * that were loaded in GRF space as push constants. In real-world
2053 * usage we've seen, though, the arrays being used are always larger
2054 * than we could load as push constants, so just always move all
2055 * uniform array access out to a pull constant buffer.
2056 */
2057 void
2058 fs_visitor::move_uniform_array_access_to_pull_constants()
2059 {
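/* Only the first (SIMD8) compile decides where uniforms live; the SIMD16
* compile is expected to reuse those decisions via import_uniforms().
*/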
2060 if (dispatch_width != 8)
2061 return;
2062
2063 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2064 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2065
2066 /* Walk through and find array access of uniforms. Put a copy of that
2067 * uniform in the pull constant buffer.
2068 *
2069 * Note that we don't move constant-indexed accesses to arrays. No
2070 * testing has been done of the performance impact of this choice.
2071 */
2072 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2073 for (int i = 0 ; i < inst->sources; i++) {
2074 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2075 continue;
2076
2077 int uniform = inst->src[i].reg;
2078
2079 /* If this array isn't already present in the pull constant buffer,
2080 * add it.
2081 */
2082 if (pull_constant_loc[uniform] == -1) {
2083 const gl_constant_value **values = &stage_prog_data->param[uniform];
2084
2085 assert(param_size[uniform]);
2086
2087 for (int j = 0; j < param_size[uniform]; j++) {
2088 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2089
2090 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2091 values[j];
2092 }
2093 }
2094 }
2095 }
2096 }
2097
2098 /**
2099 * Assign UNIFORM file registers to either push constants or pull constants.
2100 *
2101 * We allow a fragment shader to have more than the GL-specified minimum
2102 * maximum number of fragment shader uniform components (64). If there
2103 * are too many of these, they'd fill up all of register space.
2104 * So, this will push some of them out to the pull constant buffer and
2105 * update the program to load them.
2106 */
2107 void
2108 fs_visitor::assign_constant_locations()
2109 {
2110 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2111 if (dispatch_width != 8)
2112 return;
2113
2114 /* Find which UNIFORM registers are still in use. */
2115 bool is_live[uniforms];
2116 for (unsigned int i = 0; i < uniforms; i++) {
2117 is_live[i] = false;
2118 }
2119
2120 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2121 for (int i = 0; i < inst->sources; i++) {
2122 if (inst->src[i].file != UNIFORM)
2123 continue;
2124
2125 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2126 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2127 is_live[constant_nr] = true;
2128 }
2129 }
2130
2131 /* Only allow 16 registers (128 uniform components) as push constants.
2132 *
2133 * Just demote the end of the list. We could probably do better
2134 * here, demoting things that are rarely used in the program first.
2135 *
2136 * If changing this value, note the limitation about total_regs in
2137 * brw_curbe.c.
2138 */
2139 unsigned int max_push_components = 16 * 8;
2140 unsigned int num_push_constants = 0;
2141
2142 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2143
2144 for (unsigned int i = 0; i < uniforms; i++) {
2145 if (!is_live[i] || pull_constant_loc[i] != -1) {
2146 /* This UNIFORM register is either dead, or has already been demoted
2147 * to a pull const. Mark it as no longer living in the param[] array.
2148 */
2149 push_constant_loc[i] = -1;
2150 continue;
2151 }
2152
2153 if (num_push_constants < max_push_components) {
2154 /* Retain as a push constant. Record the location in the params[]
2155 * array.
2156 */
2157 push_constant_loc[i] = num_push_constants++;
2158 } else {
2159 /* Demote to a pull constant. */
2160 push_constant_loc[i] = -1;
2161
2162 int pull_index = stage_prog_data->nr_pull_params++;
2163 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2164 pull_constant_loc[i] = pull_index;
2165 }
2166 }
2167
2168 stage_prog_data->nr_params = num_push_constants;
2169
2170 /* Up until now, the param[] array has been indexed by reg + reg_offset
2171 * of UNIFORM registers. Condense it to only contain the uniforms we
2172 * chose to upload as push constants.
2173 */
2174 for (unsigned int i = 0; i < uniforms; i++) {
2175 int remapped = push_constant_loc[i];
2176
2177 if (remapped == -1)
2178 continue;
2179
2180 assert(remapped <= (int)i);
2181 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2182 }
2183 }
2184
2185 /**
2186 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2187 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2188 */
2189 void
2190 fs_visitor::demote_pull_constants()
2191 {
2192 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2193 for (int i = 0; i < inst->sources; i++) {
2194 if (inst->src[i].file != UNIFORM)
2195 continue;
2196
2197 int pull_index = pull_constant_loc[inst->src[i].reg +
2198 inst->src[i].reg_offset];
2199 if (pull_index == -1)
2200 continue;
2201
2202 /* Set up the annotation tracking for newly generated instructions. */
2203 base_ir = inst->ir;
2204 current_annotation = inst->annotation;
2205
2206 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2207 fs_reg dst = fs_reg(this, glsl_type::float_type);
2208
2209 /* Generate a pull load into dst. */
2210 if (inst->src[i].reladdr) {
2211 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2212 surf_index,
2213 *inst->src[i].reladdr,
2214 pull_index);
2215 inst->insert_before(block, &list);
2216 inst->src[i].reladdr = NULL;
2217 } else {
2218 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
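/* The load fetches an entire 16-byte-aligned vec4; set_smear() below then
* selects the single component of it that we actually want.
*/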
2219 fs_inst *pull =
2220 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2221 dst, surf_index, offset);
2222 inst->insert_before(block, pull);
2223 inst->src[i].set_smear(pull_index & 3);
2224 }
2225
2226 /* Rewrite the instruction to use the temporary VGRF. */
2227 inst->src[i].file = GRF;
2228 inst->src[i].reg = dst.reg;
2229 inst->src[i].reg_offset = 0;
2230 inst->src[i].width = dispatch_width;
2231 }
2232 }
2233 invalidate_live_intervals();
2234 }
2235
2236 bool
2237 fs_visitor::opt_algebraic()
2238 {
2239 bool progress = false;
2240
2241 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2242 switch (inst->opcode) {
2243 case BRW_OPCODE_MUL:
2244 if (inst->src[1].file != IMM)
2245 continue;
2246
2247 /* a * 1.0 = a */
2248 if (inst->src[1].is_one()) {
2249 inst->opcode = BRW_OPCODE_MOV;
2250 inst->src[1] = reg_undef;
2251 progress = true;
2252 break;
2253 }
2254
2255 /* a * 0.0 = 0.0 */
2256 if (inst->src[1].is_zero()) {
2257 inst->opcode = BRW_OPCODE_MOV;
2258 inst->src[0] = inst->src[1];
2259 inst->src[1] = reg_undef;
2260 progress = true;
2261 break;
2262 }
2263
2264 break;
2265 case BRW_OPCODE_ADD:
2266 if (inst->src[1].file != IMM)
2267 continue;
2268
2269 /* a + 0.0 = a */
2270 if (inst->src[1].is_zero()) {
2271 inst->opcode = BRW_OPCODE_MOV;
2272 inst->src[1] = reg_undef;
2273 progress = true;
2274 break;
2275 }
2276 break;
2277 case BRW_OPCODE_OR:
2278 if (inst->src[0].equals(inst->src[1])) {
2279 inst->opcode = BRW_OPCODE_MOV;
2280 inst->src[1] = reg_undef;
2281 progress = true;
2282 break;
2283 }
2284 break;
2285 case BRW_OPCODE_LRP:
2286 if (inst->src[1].equals(inst->src[2])) {
2287 inst->opcode = BRW_OPCODE_MOV;
2288 inst->src[0] = inst->src[1];
2289 inst->src[1] = reg_undef;
2290 inst->src[2] = reg_undef;
2291 progress = true;
2292 break;
2293 }
2294 break;
2295 case BRW_OPCODE_SEL:
2296 if (inst->src[0].equals(inst->src[1])) {
2297 inst->opcode = BRW_OPCODE_MOV;
2298 inst->src[1] = reg_undef;
2299 inst->predicate = BRW_PREDICATE_NONE;
2300 inst->predicate_inverse = false;
2301 progress = true;
2302 } else if (inst->saturate && inst->src[1].file == IMM) {
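/* With saturate, a SEL against an immediate bound that the saturate already
* enforces is redundant: e.g. sat(min(x, 1.0)) == sat(x) and
* sat(max(x, 0.0)) == sat(x), so the SEL can be turned into a plain MOV.
*/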
2303 switch (inst->conditional_mod) {
2304 case BRW_CONDITIONAL_LE:
2305 case BRW_CONDITIONAL_L:
2306 switch (inst->src[1].type) {
2307 case BRW_REGISTER_TYPE_F:
2308 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2309 inst->opcode = BRW_OPCODE_MOV;
2310 inst->src[1] = reg_undef;
2311 progress = true;
2312 }
2313 break;
2314 default:
2315 break;
2316 }
2317 break;
2318 case BRW_CONDITIONAL_GE:
2319 case BRW_CONDITIONAL_G:
2320 switch (inst->src[1].type) {
2321 case BRW_REGISTER_TYPE_F:
2322 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2323 inst->opcode = BRW_OPCODE_MOV;
2324 inst->src[1] = reg_undef;
2325 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2326 progress = true;
2327 }
2328 break;
2329 default:
2330 break;
2331 }
2332 default:
2333 break;
2334 }
2335 }
2336 break;
2337 case SHADER_OPCODE_RCP: {
2338 fs_inst *prev = (fs_inst *)inst->prev;
2339 if (prev->opcode == SHADER_OPCODE_SQRT) {
2340 if (inst->src[0].equals(prev->dst)) {
2341 inst->opcode = SHADER_OPCODE_RSQ;
2342 inst->src[0] = prev->src[0];
2343 progress = true;
2344 }
2345 }
2346 break;
2347 }
2348 default:
2349 break;
2350 }
2351 }
2352
2353 return progress;
2354 }
2355
2356 bool
2357 fs_visitor::opt_register_renaming()
2358 {
2359 bool progress = false;
2360 int depth = 0;
2361
2362 int remap[virtual_grf_count];
2363 memset(remap, -1, sizeof(int) * virtual_grf_count);
2364
2365 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2366 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2367 depth++;
2368 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2369 inst->opcode == BRW_OPCODE_WHILE) {
2370 depth--;
2371 }
2372
2373 /* Rewrite instruction sources. */
2374 for (int i = 0; i < inst->sources; i++) {
2375 if (inst->src[i].file == GRF &&
2376 remap[inst->src[i].reg] != -1 &&
2377 remap[inst->src[i].reg] != inst->src[i].reg) {
2378 inst->src[i].reg = remap[inst->src[i].reg];
2379 progress = true;
2380 }
2381 }
2382
2383 const int dst = inst->dst.reg;
2384
2385 if (depth == 0 &&
2386 inst->dst.file == GRF &&
2387 virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 &&
2388 !inst->is_partial_write()) {
2389 if (remap[dst] == -1) {
2390 remap[dst] = dst;
2391 } else {
2392 remap[dst] = virtual_grf_alloc(inst->dst.width / 8);
2393 inst->dst.reg = remap[dst];
2394 progress = true;
2395 }
2396 } else if (inst->dst.file == GRF &&
2397 remap[dst] != -1 &&
2398 remap[dst] != dst) {
2399 inst->dst.reg = remap[dst];
2400 progress = true;
2401 }
2402 }
2403
2404 if (progress) {
2405 invalidate_live_intervals();
2406
2407 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2408 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2409 delta_x[i].reg = remap[delta_x[i].reg];
2410 }
2411 }
2412 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2413 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2414 delta_y[i].reg = remap[delta_y[i].reg];
2415 }
2416 }
2417 }
2418
2419 return progress;
2420 }
2421
2422 bool
2423 fs_visitor::compute_to_mrf()
2424 {
2425 bool progress = false;
2426 int next_ip = 0;
2427
2428 calculate_live_intervals();
2429
2430 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2431 int ip = next_ip;
2432 next_ip++;
2433
2434 if (inst->opcode != BRW_OPCODE_MOV ||
2435 inst->is_partial_write() ||
2436 inst->dst.file != MRF || inst->src[0].file != GRF ||
2437 inst->dst.type != inst->src[0].type ||
2438 inst->src[0].abs || inst->src[0].negate ||
2439 !inst->src[0].is_contiguous() ||
2440 inst->src[0].subreg_offset)
2441 continue;
2442
2443 /* Work out which hardware MRF registers are written by this
2444 * instruction.
2445 */
2446 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2447 int mrf_high;
2448 if (inst->dst.reg & BRW_MRF_COMPR4) {
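/* In COMPR4 mode a compressed write lands in mrf_low and mrf_low + 4
* rather than in two adjacent MRFs.
*/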
2449 mrf_high = mrf_low + 4;
2450 } else if (inst->exec_size == 16) {
2451 mrf_high = mrf_low + 1;
2452 } else {
2453 mrf_high = mrf_low;
2454 }
2455
2456 /* Can't compute-to-MRF this GRF if someone else was going to
2457 * read it later.
2458 */
2459 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2460 continue;
2461
2462 /* Found a move of a GRF to a MRF. Let's see if we can go
2463 * rewrite the thing that made this GRF to write into the MRF.
2464 */
2465 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2466 if (scan_inst->dst.file == GRF &&
2467 scan_inst->dst.reg == inst->src[0].reg) {
2468 /* Found the last thing to write our reg we want to turn
2469 * into a compute-to-MRF.
2470 */
2471
2472 /* If this one instruction didn't populate all the
2473 * channels, bail. We might be able to rewrite everything
2474 * that writes that reg, but it would require smarter
2475 * tracking to delay the rewriting until complete success.
2476 */
2477 if (scan_inst->is_partial_write())
2478 break;
2479
2480 /* Instructions that write more than one register would require us to
2481 * understand coalescing more than one MOV at a time.
2482 */
2483 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2484 break;
2485
2486 /* SEND instructions can't have MRF as a destination. */
2487 if (scan_inst->mlen)
2488 break;
2489
2490 if (brw->gen == 6) {
2491 /* gen6 math instructions must have the destination be
2492 * GRF, so no compute-to-MRF for them.
2493 */
2494 if (scan_inst->is_math()) {
2495 break;
2496 }
2497 }
2498
2499 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2500 /* Found the creator of our MRF's source value. */
2501 scan_inst->dst.file = MRF;
2502 scan_inst->dst.reg = inst->dst.reg;
2503 scan_inst->saturate |= inst->saturate;
2504 inst->remove(block);
2505 progress = true;
2506 }
2507 break;
2508 }
2509
2510 /* We don't handle control flow here. Most computation of
2511 * values that end up in MRFs happens shortly before the MRF
2512 * write anyway.
2513 */
2514 if (block->start() == scan_inst)
2515 break;
2516
2517 /* You can't read from an MRF, so if someone else reads our
2518 * MRF's source GRF that we wanted to rewrite, that stops us.
2519 */
2520 bool interfered = false;
2521 for (int i = 0; i < scan_inst->sources; i++) {
2522 if (scan_inst->src[i].file == GRF &&
2523 scan_inst->src[i].reg == inst->src[0].reg &&
2524 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2525 interfered = true;
2526 }
2527 }
2528 if (interfered)
2529 break;
2530
2531 if (scan_inst->dst.file == MRF) {
2532 /* If somebody else writes our MRF here, we can't
2533 * compute-to-MRF before that.
2534 */
2535 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2536 int scan_mrf_high;
2537
2538 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2539 scan_mrf_high = scan_mrf_low + 4;
2540 } else if (scan_inst->exec_size == 16) {
2541 scan_mrf_high = scan_mrf_low + 1;
2542 } else {
2543 scan_mrf_high = scan_mrf_low;
2544 }
2545
2546 if (mrf_low == scan_mrf_low ||
2547 mrf_low == scan_mrf_high ||
2548 mrf_high == scan_mrf_low ||
2549 mrf_high == scan_mrf_high) {
2550 break;
2551 }
2552 }
2553
2554 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2555 /* Found a SEND instruction, which means that there are
2556 * live values in MRFs from base_mrf to base_mrf +
2557 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2558 * above it.
2559 */
2560 if (mrf_low >= scan_inst->base_mrf &&
2561 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2562 break;
2563 }
2564 if (mrf_high >= scan_inst->base_mrf &&
2565 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2566 break;
2567 }
2568 }
2569 }
2570 }
2571
2572 if (progress)
2573 invalidate_live_intervals();
2574
2575 return progress;
2576 }
2577
2578 /**
2579 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2580 * instructions to FS_OPCODE_REP_FB_WRITE.
2581 */
2582 void
2583 fs_visitor::emit_repclear_shader()
2584 {
2585 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2586 int base_mrf = 1;
2587 int color_mrf = base_mrf + 2;
2588
2589 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2590 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2591 mov->force_writemask_all = true;
2592
2593 fs_inst *write;
2594 if (key->nr_color_regions == 1) {
2595 write = emit(FS_OPCODE_REP_FB_WRITE);
2596 write->saturate = key->clamp_fragment_color;
2597 write->base_mrf = color_mrf;
2598 write->target = 0;
2599 write->header_present = false;
2600 write->mlen = 1;
2601 } else {
2602 for (int i = 0; i < key->nr_color_regions; ++i) {
2603 write = emit(FS_OPCODE_REP_FB_WRITE);
2604 write->saturate = key->clamp_fragment_color;
2605 write->base_mrf = base_mrf;
2606 write->target = i;
2607 write->header_present = true;
2608 write->mlen = 3;
2609 }
2610 }
2611 write->eot = true;
2612
2613 calculate_cfg();
2614
2615 assign_constant_locations();
2616 assign_curb_setup();
2617
2618 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2619 assert(mov->src[0].file == HW_REG);
2620 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2621 }
2622
2623 /**
2624 * Walks through basic blocks, looking for repeated MRF writes and
2625 * removing the later ones.
2626 */
2627 bool
2628 fs_visitor::remove_duplicate_mrf_writes()
2629 {
2630 fs_inst *last_mrf_move[16];
2631 bool progress = false;
2632
2633 /* We'd need to update the MRF tracking for compressed instructions, so skip SIMD16 for now. */
2634 if (dispatch_width == 16)
2635 return false;
2636
2637 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2638
2639 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2640 if (inst->is_control_flow()) {
2641 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2642 }
2643
2644 if (inst->opcode == BRW_OPCODE_MOV &&
2645 inst->dst.file == MRF) {
2646 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2647 if (prev_inst && inst->equals(prev_inst)) {
2648 inst->remove(block);
2649 progress = true;
2650 continue;
2651 }
2652 }
2653
2654 /* Clear out the last-write records for MRFs that were overwritten. */
2655 if (inst->dst.file == MRF) {
2656 last_mrf_move[inst->dst.reg] = NULL;
2657 }
2658
2659 if (inst->mlen > 0 && inst->base_mrf != -1) {
2660 /* Found a SEND instruction, which will include two or fewer
2661 * implied MRF writes. We could do better here.
2662 */
2663 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2664 last_mrf_move[inst->base_mrf + i] = NULL;
2665 }
2666 }
2667
2668 /* Clear out any MRF move records whose sources got overwritten. */
2669 if (inst->dst.file == GRF) {
2670 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2671 if (last_mrf_move[i] &&
2672 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2673 last_mrf_move[i] = NULL;
2674 }
2675 }
2676 }
2677
2678 if (inst->opcode == BRW_OPCODE_MOV &&
2679 inst->dst.file == MRF &&
2680 inst->src[0].file == GRF &&
2681 !inst->is_partial_write()) {
2682 last_mrf_move[inst->dst.reg] = inst;
2683 }
2684 }
2685
2686 if (progress)
2687 invalidate_live_intervals();
2688
2689 return progress;
2690 }
2691
2692 static void
2693 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2694 int first_grf, int grf_len)
2695 {
2696 /* Clear the flag for registers that actually got read (as expected). */
2697 for (int i = 0; i < inst->sources; i++) {
2698 int grf;
2699 if (inst->src[i].file == GRF) {
2700 grf = inst->src[i].reg;
2701 } else if (inst->src[i].file == HW_REG &&
2702 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2703 grf = inst->src[i].fixed_hw_reg.nr;
2704 } else {
2705 continue;
2706 }
2707
2708 if (grf >= first_grf &&
2709 grf < first_grf + grf_len) {
2710 deps[grf - first_grf] = false;
2711 if (inst->exec_size == 16)
2712 deps[grf - first_grf + 1] = false;
2713 }
2714 }
2715 }
2716
2717 /**
2718 * Implements this workaround for the original 965:
2719 *
2720 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2721 * check for post destination dependencies on this instruction, software
2722 * must ensure that there is no destination hazard for the case of ‘write
2723 * followed by a posted write’ shown in the following example.
2724 *
2725 * 1. mov r3 0
2726 * 2. send r3.xy <rest of send instruction>
2727 * 3. mov r2 r3
2728 *
2729 * Due to no post-destination dependency check on the ‘send’, the above
2730 * code sequence could have two instructions (1 and 2) in flight at the
2731 * same time that both consider ‘r3’ as the target of their final writes."
2732 */
2733 void
2734 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2735 fs_inst *inst)
2736 {
2737 int write_len = inst->regs_written;
2738 int first_write_grf = inst->dst.reg;
2739 bool needs_dep[BRW_MAX_MRF];
2740 assert(write_len < (int)sizeof(needs_dep) - 1);
2741
2742 memset(needs_dep, false, sizeof(needs_dep));
2743 memset(needs_dep, true, write_len);
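/* Start by assuming every register we are about to write needs a dependency
* resolve, then clear entries as reads that already provide one are found.
*/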
2744
2745 clear_deps_for_inst_src(inst, dispatch_width,
2746 needs_dep, first_write_grf, write_len);
2747
2748 /* Walk backwards looking for writes to registers we're writing which
2749 * aren't read since being written. If we hit the start of the program,
2750 * we assume that there are no outstanding dependencies on entry to the
2751 * program.
2752 */
2753 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2754 /* If we hit control flow, assume that there *are* outstanding
2755 * dependencies, and force their cleanup before our instruction.
2756 */
2757 if (block->start() == scan_inst) {
2758 for (int i = 0; i < write_len; i++) {
2759 if (needs_dep[i]) {
2760 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2761 }
2762 }
2763 return;
2764 }
2765
2766 /* We insert our reads as late as possible on the assumption that any
2767 * instruction but a MOV that might have left us an outstanding
2768 * dependency has more latency than a MOV.
2769 */
2770 if (scan_inst->dst.file == GRF) {
2771 for (int i = 0; i < scan_inst->regs_written; i++) {
2772 int reg = scan_inst->dst.reg + i;
2773
2774 if (reg >= first_write_grf &&
2775 reg < first_write_grf + write_len &&
2776 needs_dep[reg - first_write_grf]) {
2777 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2778 needs_dep[reg - first_write_grf] = false;
2779 if (scan_inst->exec_size == 16)
2780 needs_dep[reg - first_write_grf + 1] = false;
2781 }
2782 }
2783 }
2784
2785 /* Clear the flag for registers that actually got read (as expected). */
2786 clear_deps_for_inst_src(scan_inst, dispatch_width,
2787 needs_dep, first_write_grf, write_len);
2788
2789 /* Continue the loop only if we haven't resolved all the dependencies */
2790 int i;
2791 for (i = 0; i < write_len; i++) {
2792 if (needs_dep[i])
2793 break;
2794 }
2795 if (i == write_len)
2796 return;
2797 }
2798 }
2799
2800 /**
2801 * Implements this workaround for the original 965:
2802 *
2803 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2804 * used as a destination register until after it has been sourced by an
2805 * instruction with a different destination register."
2806 */
2807 void
2808 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2809 {
2810 int write_len = inst->regs_written;
2811 int first_write_grf = inst->dst.reg;
2812 bool needs_dep[BRW_MAX_MRF];
2813 assert(write_len < (int)sizeof(needs_dep) - 1);
2814
2815 memset(needs_dep, false, sizeof(needs_dep));
2816 memset(needs_dep, true, write_len);
2817 /* Walk forwards looking for writes to registers we're writing which aren't
2818 * read before being written.
2819 */
2820 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2821 /* If we hit control flow, force resolve all remaining dependencies. */
2822 if (block->end() == scan_inst) {
2823 for (int i = 0; i < write_len; i++) {
2824 if (needs_dep[i])
2825 scan_inst->insert_before(block,
2826 DEP_RESOLVE_MOV(first_write_grf + i));
2827 }
2828 return;
2829 }
2830
2831 /* Clear the flag for registers that actually got read (as expected). */
2832 clear_deps_for_inst_src(scan_inst, dispatch_width,
2833 needs_dep, first_write_grf, write_len);
2834
2835 /* We insert our reads as late as possible since they're reading the
2836 * result of a SEND, which has massive latency.
2837 */
2838 if (scan_inst->dst.file == GRF &&
2839 scan_inst->dst.reg >= first_write_grf &&
2840 scan_inst->dst.reg < first_write_grf + write_len &&
2841 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2842 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2843 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2844 }
2845
2846 /* Continue the loop only if we haven't resolved all the dependencies */
2847 int i;
2848 for (i = 0; i < write_len; i++) {
2849 if (needs_dep[i])
2850 break;
2851 }
2852 if (i == write_len)
2853 return;
2854 }
2855
2856 /* If we hit the end of the program, resolve all remaining dependencies out
2857 * of paranoia.
2858 */
2859 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2860 assert(last_inst->eot);
2861 for (int i = 0; i < write_len; i++) {
2862 if (needs_dep[i])
2863 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2864 }
2865 }
2866
2867 void
2868 fs_visitor::insert_gen4_send_dependency_workarounds()
2869 {
2870 if (brw->gen != 4 || brw->is_g4x)
2871 return;
2872
2873 bool progress = false;
2874
2875 /* Note that we're done with register allocation, so GRF fs_regs always
2876 * have a .reg_offset of 0.
2877 */
2878
2879 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2880 if (inst->mlen != 0 && inst->dst.file == GRF) {
2881 insert_gen4_pre_send_dependency_workarounds(block, inst);
2882 insert_gen4_post_send_dependency_workarounds(block, inst);
2883 progress = true;
2884 }
2885 }
2886
2887 if (progress)
2888 invalidate_live_intervals();
2889 }
2890
2891 /**
2892 * Turns the generic expression-style uniform pull constant load instruction
2893 * into a hardware-specific series of instructions for loading a pull
2894 * constant.
2895 *
2896 * The expression style allows the CSE pass before this to optimize out
2897 * repeated loads from the same offset, and gives the pre-register-allocation
2898 * scheduling full flexibility, while the conversion to native instructions
2899 * allows the post-register-allocation scheduler the best information
2900 * possible.
2901 *
2902 * Note that execution masking for setting up pull constant loads is special:
2903 * the channels that need to be written are unrelated to the current execution
2904 * mask, since a later instruction will use one of the result channels as a
2905 * source operand for all 8 or 16 of its channels.
2906 */
2907 void
2908 fs_visitor::lower_uniform_pull_constant_loads()
2909 {
2910 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2911 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2912 continue;
2913
2914 if (brw->gen >= 7) {
2915 /* The offset arg before was a vec4-aligned byte offset. We need to
2916 * turn it into a dword offset.
2917 */
2918 fs_reg const_offset_reg = inst->src[1];
2919 assert(const_offset_reg.file == IMM &&
2920 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2921 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2922 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2923
2924 /* This is actually going to be a MOV, but since only the first dword
2925 * is accessed, we have a special opcode to do just that one. Note
2926 * that this needs to be an operation that will be considered a def
2927 * by live variable analysis, or register allocation will explode.
2928 */
2929 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2930 8, payload, const_offset_reg);
2931 setup->force_writemask_all = true;
2932
2933 setup->ir = inst->ir;
2934 setup->annotation = inst->annotation;
2935 inst->insert_before(block, setup);
2936
2937 /* Similarly, this will only populate the first 4 channels of the
2938 * result register (since we only use smear values from 0-3), but we
2939 * don't tell the optimizer.
2940 */
2941 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2942 inst->src[1] = payload;
2943
2944 invalidate_live_intervals();
2945 } else {
2946 /* Before register allocation, we didn't tell the scheduler about the
2947 * MRF we use. We know it's safe to use this MRF because nothing
2948 * else does except for register spill/unspill, which generates and
2949 * uses its MRF within a single IR instruction.
2950 */
2951 inst->base_mrf = 14;
2952 inst->mlen = 1;
2953 }
2954 }
2955 }
2956
2957 bool
2958 fs_visitor::lower_load_payload()
2959 {
2960 bool progress = false;
2961
2962 int vgrf_to_reg[virtual_grf_count];
2963 int reg_count = 16; /* Leave room for MRF */
2964 for (int i = 0; i < virtual_grf_count; ++i) {
2965 vgrf_to_reg[i] = reg_count;
2966 reg_count += virtual_grf_sizes[i];
2967 }
2968
2969 struct {
2970 bool written:1; /* Whether this register has ever been written */
2971 bool force_writemask_all:1;
2972 bool force_sechalf:1;
2973 } metadata[reg_count];
2974 memset(metadata, 0, sizeof(metadata));
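/* metadata[] records, per register slot, whether the slot has been written
* and with which execution controls, so the MOVs emitted below can inherit
* force_sechalf/force_writemask_all from the instructions that produced
* their sources.
*/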
2975
2976 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2977 int dst_reg;
2978 if (inst->dst.file == GRF) {
2979 dst_reg = vgrf_to_reg[inst->dst.reg];
2980 } else {
2981 /* MRF */
2982 dst_reg = inst->dst.reg;
2983 }
2984
2985 if (inst->dst.file == MRF || inst->dst.file == GRF) {
2986 bool force_sechalf = inst->force_sechalf;
2987 bool toggle_sechalf = inst->dst.width == 16 &&
2988 type_sz(inst->dst.type) == 4;
2989 for (int i = 0; i < inst->regs_written; ++i) {
2990 metadata[dst_reg + i].written = true;
2991 metadata[dst_reg + i].force_sechalf = force_sechalf;
2992 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
2993 force_sechalf = (toggle_sechalf != force_sechalf);
2994 }
2995 }
2996
2997 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
2998 assert(inst->dst.file == MRF || inst->dst.file == GRF);
2999 fs_reg dst = inst->dst;
3000
3001 for (int i = 0; i < inst->sources; i++) {
3002 dst.width = inst->src[i].effective_width;
3003 dst.type = inst->src[i].type;
3004
3005 if (inst->src[i].file == BAD_FILE) {
3006 /* Nothing to copy, but still advance dst below as normal. */
3007 } else if (dst.file == MRF &&
3008 dst.width == 8 &&
3009 brw->has_compr4 &&
3010 i + 4 < inst->sources &&
3011 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3012 fs_reg compr4_dst = dst;
3013 compr4_dst.reg += BRW_MRF_COMPR4;
3014 compr4_dst.width = 16;
3015 fs_reg compr4_src = inst->src[i];
3016 compr4_src.width = 16;
3017 fs_inst *mov = MOV(compr4_dst, compr4_src);
3018 mov->force_writemask_all = true;
3019 inst->insert_before(block, mov);
3020 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3021 inst->src[i + 4].file = BAD_FILE;
3022 } else {
3023 fs_inst *mov = MOV(dst, inst->src[i]);
3024 if (inst->src[i].file == GRF) {
3025 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3026 inst->src[i].reg_offset;
3027 mov->force_sechalf = metadata[src_reg].force_sechalf;
3028 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3029 metadata[dst_reg] = metadata[src_reg];
3030 if (dst.width * type_sz(dst.type) > 32) {
3031 assert((!metadata[src_reg].written ||
3032 !metadata[src_reg].force_sechalf) &&
3033 (!metadata[src_reg + 1].written ||
3034 metadata[src_reg + 1].force_sechalf));
3035 metadata[dst_reg + 1] = metadata[src_reg + 1];
3036 }
3037 } else {
3038 metadata[dst_reg].force_writemask_all = false;
3039 metadata[dst_reg].force_sechalf = false;
3040 if (dst.width == 16) {
3041 metadata[dst_reg + 1].force_writemask_all = false;
3042 metadata[dst_reg + 1].force_sechalf = true;
3043 }
3044 }
3045 inst->insert_before(block, mov);
3046 }
3047
3048 dst = offset(dst, 1);
3049 }
3050
3051 inst->remove(block);
3052 progress = true;
3053 }
3054 }
3055
3056 if (progress)
3057 invalidate_live_intervals();
3058
3059 return progress;
3060 }
3061
3062 void
3063 fs_visitor::dump_instructions()
3064 {
3065 dump_instructions(NULL);
3066 }
3067
3068 void
3069 fs_visitor::dump_instructions(const char *name)
3070 {
3071 calculate_register_pressure();
3072 FILE *file = stderr;
3073 if (name && geteuid() != 0) {
3074 file = fopen(name, "w");
3075 if (!file)
3076 file = stderr;
3077 }
3078
3079 int ip = 0, max_pressure = 0;
3080 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3081 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3082 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3083 dump_instruction(inst, file);
3084 ++ip;
3085 }
3086 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3087
3088 if (file != stderr) {
3089 fclose(file);
3090 }
3091 }
3092
3093 void
3094 fs_visitor::dump_instruction(backend_instruction *be_inst)
3095 {
3096 dump_instruction(be_inst, stderr);
3097 }
3098
3099 void
3100 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3101 {
3102 fs_inst *inst = (fs_inst *)be_inst;
3103
3104 if (inst->predicate) {
3105 fprintf(file, "(%cf0.%d) ",
3106 inst->predicate_inverse ? '-' : '+',
3107 inst->flag_subreg);
3108 }
3109
3110 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3111 if (inst->saturate)
3112 fprintf(file, ".sat");
3113 if (inst->conditional_mod) {
3114 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3115 if (!inst->predicate &&
3116 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3117 inst->opcode != BRW_OPCODE_IF &&
3118 inst->opcode != BRW_OPCODE_WHILE))) {
3119 fprintf(file, ".f0.%d", inst->flag_subreg);
3120 }
3121 }
3122 fprintf(file, "(%d) ", inst->exec_size);
3123
3124
3125 switch (inst->dst.file) {
3126 case GRF:
3127 fprintf(file, "vgrf%d", inst->dst.reg);
3128 if (inst->dst.width != dispatch_width)
3129 fprintf(file, "@%d", inst->dst.width);
3130 if (virtual_grf_sizes[inst->dst.reg] != inst->dst.width / 8 ||
3131 inst->dst.subreg_offset)
3132 fprintf(file, "+%d.%d",
3133 inst->dst.reg_offset, inst->dst.subreg_offset);
3134 break;
3135 case MRF:
3136 fprintf(file, "m%d", inst->dst.reg);
3137 break;
3138 case BAD_FILE:
3139 fprintf(file, "(null)");
3140 break;
3141 case UNIFORM:
3142 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3143 break;
3144 case HW_REG:
3145 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3146 switch (inst->dst.fixed_hw_reg.nr) {
3147 case BRW_ARF_NULL:
3148 fprintf(file, "null");
3149 break;
3150 case BRW_ARF_ADDRESS:
3151 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3152 break;
3153 case BRW_ARF_ACCUMULATOR:
3154 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3155 break;
3156 case BRW_ARF_FLAG:
3157 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3158 inst->dst.fixed_hw_reg.subnr);
3159 break;
3160 default:
3161 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3162 inst->dst.fixed_hw_reg.subnr);
3163 break;
3164 }
3165 } else {
3166 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3167 }
3168 if (inst->dst.fixed_hw_reg.subnr)
3169 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3170 break;
3171 default:
3172 fprintf(file, "???");
3173 break;
3174 }
3175 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3176
3177 for (int i = 0; i < inst->sources; i++) {
3178 if (inst->src[i].negate)
3179 fprintf(file, "-");
3180 if (inst->src[i].abs)
3181 fprintf(file, "|");
3182 switch (inst->src[i].file) {
3183 case GRF:
3184 fprintf(file, "vgrf%d", inst->src[i].reg);
3185 if (inst->src[i].width != dispatch_width)
3186 fprintf(file, "@%d", inst->src[i].width);
3187 if (virtual_grf_sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3188 inst->src[i].subreg_offset)
3189 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3190 inst->src[i].subreg_offset);
3191 break;
3192 case MRF:
3193 fprintf(file, "***m%d***", inst->src[i].reg);
3194 break;
3195 case UNIFORM:
3196 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3197 if (inst->src[i].reladdr) {
3198 fprintf(file, "+reladdr");
3199 } else if (inst->src[i].subreg_offset) {
3200 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3201 inst->src[i].subreg_offset);
3202 }
3203 break;
3204 case BAD_FILE:
3205 fprintf(file, "(null)");
3206 break;
3207 case IMM:
3208 switch (inst->src[i].type) {
3209 case BRW_REGISTER_TYPE_F:
3210 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3211 break;
3212 case BRW_REGISTER_TYPE_D:
3213 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3214 break;
3215 case BRW_REGISTER_TYPE_UD:
3216 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3217 break;
3218 default:
3219 fprintf(file, "???");
3220 break;
3221 }
3222 break;
3223 case HW_REG:
3224 if (inst->src[i].fixed_hw_reg.negate)
3225 fprintf(file, "-");
3226 if (inst->src[i].fixed_hw_reg.abs)
3227 fprintf(file, "|");
3228 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3229 switch (inst->src[i].fixed_hw_reg.nr) {
3230 case BRW_ARF_NULL:
3231 fprintf(file, "null");
3232 break;
3233 case BRW_ARF_ADDRESS:
3234 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3235 break;
3236 case BRW_ARF_ACCUMULATOR:
3237 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3238 break;
3239 case BRW_ARF_FLAG:
3240 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3241 inst->src[i].fixed_hw_reg.subnr);
3242 break;
3243 default:
3244 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3245 inst->src[i].fixed_hw_reg.subnr);
3246 break;
3247 }
3248 } else {
3249 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3250 }
3251 if (inst->src[i].fixed_hw_reg.subnr)
3252 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3253 if (inst->src[i].fixed_hw_reg.abs)
3254 fprintf(file, "|");
3255 break;
3256 default:
3257 fprintf(file, "???");
3258 break;
3259 }
3260 if (inst->src[i].abs)
3261 fprintf(file, "|");
3262
3263 if (inst->src[i].file != IMM) {
3264 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3265 }
3266
3267 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3268 fprintf(file, ", ");
3269 }
3270
3271 fprintf(file, " ");
3272
3273 if (dispatch_width == 16 && inst->exec_size == 8) {
3274 if (inst->force_sechalf)
3275 fprintf(file, "2ndhalf ");
3276 else
3277 fprintf(file, "1sthalf ");
3278 }
3279
3280 fprintf(file, "\n");
3281 }
3282
3283 /**
3284 * Possibly returns an instruction that set up @param reg.
3285 *
3286 * Sometimes we want to take the result of some expression/variable
3287 * dereference tree and rewrite the instruction generating the result
3288 * of the tree. When processing the tree, we know that the
3289 * instructions generated are all writing temporaries that are dead
3290 * outside of this tree. So, if we have some instructions that write
3291 * a temporary, we're free to point that temp write somewhere else.
3292 *
3293 * Note that this doesn't guarantee that the instruction generated
3294 * only reg -- it might be the size=4 destination of a texture instruction.
3295 */
3296 fs_inst *
3297 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3298 fs_inst *end,
3299 const fs_reg &reg)
3300 {
3301 if (end == start ||
3302 end->is_partial_write() ||
3303 reg.reladdr ||
3304 !reg.equals(end->dst)) {
3305 return NULL;
3306 } else {
3307 return end;
3308 }
3309 }
3310
3311 void
3312 fs_visitor::setup_payload_gen6()
3313 {
3314 bool uses_depth =
3315 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3316 unsigned barycentric_interp_modes =
3317 (stage == MESA_SHADER_FRAGMENT) ?
3318 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3319
3320 assert(brw->gen >= 6);
3321
3322 /* R0-1: masks, pixel X/Y coordinates. */
3323 payload.num_regs = 2;
3324 /* R2: only for 32-pixel dispatch. */
3325
3326 /* R3-26: barycentric interpolation coordinates. These appear in the
3327 * same order that they appear in the brw_wm_barycentric_interp_mode
3328 * enum. Each set of coordinates occupies 2 registers if dispatch width
3329 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3330 * appear if they were enabled using the "Barycentric Interpolation
3331 * Mode" bits in WM_STATE.
3332 */
3333 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3334 if (barycentric_interp_modes & (1 << i)) {
3335 payload.barycentric_coord_reg[i] = payload.num_regs;
3336 payload.num_regs += 2;
3337 if (dispatch_width == 16) {
3338 payload.num_regs += 2;
3339 }
3340 }
3341 }
3342
3343 /* R27: interpolated depth if uses source depth */
3344 if (uses_depth) {
3345 payload.source_depth_reg = payload.num_regs;
3346 payload.num_regs++;
3347 if (dispatch_width == 16) {
3348 /* R28: interpolated depth if not SIMD8. */
3349 payload.num_regs++;
3350 }
3351 }
3352 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3353 if (uses_depth) {
3354 payload.source_w_reg = payload.num_regs;
3355 payload.num_regs++;
3356 if (dispatch_width == 16) {
3357 /* R30: interpolated W if not SIMD8. */
3358 payload.num_regs++;
3359 }
3360 }
3361
3362 if (stage == MESA_SHADER_FRAGMENT) {
3363 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3364 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3365 prog_data->uses_pos_offset = key->compute_pos_offset;
3366 /* R31: MSAA position offsets. */
3367 if (prog_data->uses_pos_offset) {
3368 payload.sample_pos_reg = payload.num_regs;
3369 payload.num_regs++;
3370 }
3371 }
3372
3373 /* R32: MSAA input coverage mask */
3374 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3375 assert(brw->gen >= 7);
3376 payload.sample_mask_in_reg = payload.num_regs;
3377 payload.num_regs++;
3378 if (dispatch_width == 16) {
3379 /* R33: input coverage mask if not SIMD8. */
3380 payload.num_regs++;
3381 }
3382 }
3383
3384 /* R34-: bary for 32-pixel. */
3385 /* R58-59: interp W for 32-pixel. */
3386
3387 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3388 source_depth_to_render_target = true;
3389 }
3390 }
3391
3392 void
3393 fs_visitor::assign_binding_table_offsets()
3394 {
3395 assert(stage == MESA_SHADER_FRAGMENT);
3396 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3397 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3398 uint32_t next_binding_table_offset = 0;
3399
3400 /* If there are no color regions, we still perform an FB write to a null
3401 * renderbuffer, which we place at surface index 0.
3402 */
3403 prog_data->binding_table.render_target_start = next_binding_table_offset;
3404 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3405
3406 assign_common_binding_table_offsets(next_binding_table_offset);
3407 }
3408
3409 void
3410 fs_visitor::calculate_register_pressure()
3411 {
3412 invalidate_live_intervals();
3413 calculate_live_intervals();
3414
3415 unsigned num_instructions = 0;
3416 foreach_block(block, cfg)
3417 num_instructions += block->instructions.length();
3418
3419 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3420
3421 for (int reg = 0; reg < virtual_grf_count; reg++) {
3422 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3423 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3424 }
3425 }
3426
3427 /**
3428 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3429 *
3430 * The needs_unlit_centroid_workaround ends up producing one of these per
3431 * channel of centroid input, so it's good to clean them up.
3432 *
3433 * An assumption here is that nothing ever modifies the dispatched pixels
3434 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3435 * dictates that anyway.
3436 */
3437 void
3438 fs_visitor::opt_drop_redundant_mov_to_flags()
3439 {
3440 bool flag_mov_found[2] = {false};
3441
3442 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3443 if (inst->is_control_flow()) {
3444 memset(flag_mov_found, 0, sizeof(flag_mov_found));
3445 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3446 if (!flag_mov_found[inst->flag_subreg])
3447 flag_mov_found[inst->flag_subreg] = true;
3448 else
3449 inst->remove(block);
3450 } else if (inst->writes_flag()) {
3451 flag_mov_found[inst->flag_subreg] = false;
3452 }
3453 }
3454 }
3455
3456 bool
3457 fs_visitor::run()
3458 {
3459 sanity_param_count = prog->Parameters->NumParameters;
3460 bool allocated_without_spills;
3461
3462 assign_binding_table_offsets();
3463
3464 if (brw->gen >= 6)
3465 setup_payload_gen6();
3466 else
3467 setup_payload_gen4();
3468
3469 if (0) {
3470 emit_dummy_fs();
3471 } else if (brw->use_rep_send && dispatch_width == 16) {
3472 emit_repclear_shader();
3473 allocated_without_spills = true;
3474 } else {
3475 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3476 emit_shader_time_begin();
3477
3478 calculate_urb_setup();
3479 if (prog->InputsRead > 0) {
3480 if (brw->gen < 6)
3481 emit_interpolation_setup_gen4();
3482 else
3483 emit_interpolation_setup_gen6();
3484 }
3485
3486 /* We handle discards by keeping track of the still-live pixels in f0.1.
3487 * Initialize it with the dispatched pixels.
3488 */
3489 bool uses_kill =
3490 (stage == MESA_SHADER_FRAGMENT) &&
3491 ((brw_wm_prog_data*) this->prog_data)->uses_kill;
3492 bool alpha_test_func =
3493 (stage == MESA_SHADER_FRAGMENT) &&
3494 ((brw_wm_prog_key*) this->key)->alpha_test_func;
3495 if (uses_kill || alpha_test_func) {
3496 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3497 discard_init->flag_subreg = 1;
3498 }
3499
3500 /* Generate FS IR for main(). (The visitor only descends into
3501 * functions called "main".)
3502 */
3503 if (shader) {
3504 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3505 base_ir = ir;
3506 this->result = reg_undef;
3507 ir->accept(this);
3508 }
3509 } else {
3510 emit_fragment_program_code();
3511 }
3512 base_ir = NULL;
3513 if (failed)
3514 return false;
3515
3516 emit(FS_OPCODE_PLACEHOLDER_HALT);
3517
3518 if (alpha_test_func)
3519 emit_alpha_test();
3520
3521 emit_fb_writes();
3522
3523 calculate_cfg();
3524
3525 split_virtual_grfs();
3526
3527 move_uniform_array_access_to_pull_constants();
3528 assign_constant_locations();
3529 demote_pull_constants();
3530
3531 opt_drop_redundant_mov_to_flags();
3532
3533 #define OPT(pass, args...) do { \
3534 pass_num++; \
3535 bool this_progress = pass(args); \
3536 \
3537 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3538 char filename[64]; \
3539 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3540 dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3541 \
3542 backend_visitor::dump_instructions(filename); \
3543 } \
3544 \
3545 progress = progress || this_progress; \
3546 } while (false)
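/* Each OPT() invocation below runs one pass; with INTEL_DEBUG=optimizer it
* also dumps the IR whenever that pass reported progress.
*/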
3547
3548 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3549 char filename[64];
3550 snprintf(filename, 64, "fs%d-%04d-00-start",
3551 dispatch_width, shader_prog ? shader_prog->Name : 0);
3552
3553 backend_visitor::dump_instructions(filename);
3554 }
3555
3556 bool progress;
3557 int iteration = 0;
3558 do {
3559 progress = false;
3560 iteration++;
3561 int pass_num = 0;
3562
3563 OPT(remove_duplicate_mrf_writes);
3564
3565 OPT(opt_algebraic);
3566 OPT(opt_cse);
3567 OPT(opt_copy_propagate);
3568 OPT(opt_peephole_predicated_break);
3569 OPT(dead_code_eliminate);
3570 OPT(opt_peephole_sel);
3571 OPT(dead_control_flow_eliminate, this);
3572 OPT(opt_register_renaming);
3573 OPT(opt_saturate_propagation);
3574 OPT(register_coalesce);
3575 OPT(compute_to_mrf);
3576
3577 OPT(compact_virtual_grfs);
3578 } while (progress);
3579
3580 if (lower_load_payload()) {
3581 split_virtual_grfs();
3582 register_coalesce();
3583 compute_to_mrf();
3584 dead_code_eliminate();
3585 }
3586
3587 lower_uniform_pull_constant_loads();
3588
3589 assign_curb_setup();
3590 assign_urb_setup();
3591
3592 static enum instruction_scheduler_mode pre_modes[] = {
3593 SCHEDULE_PRE,
3594 SCHEDULE_PRE_NON_LIFO,
3595 SCHEDULE_PRE_LIFO,
3596 };
3597
3598 /* Try each scheduling heuristic to see if it can successfully register
3599 * allocate without spilling. They should be ordered by decreasing
3600 * performance but increasing likelihood of allocating.
3601 */
3602 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3603 schedule_instructions(pre_modes[i]);
3604
3605 if (0) {
3606 assign_regs_trivial();
3607 allocated_without_spills = true;
3608 } else {
3609 allocated_without_spills = assign_regs(false);
3610 }
3611 if (allocated_without_spills)
3612 break;
3613 }
3614
3615 if (!allocated_without_spills) {
3616 /* We assume that any spilling is worse than just dropping back to
3617 * SIMD8. There's probably actually some intermediate point where
3618 * SIMD16 with a couple of spills is still better.
3619 */
3620 if (dispatch_width == 16) {
3621 fail("Failure to register allocate. Reduce number of "
3622 "live scalar values to avoid this.");
3623 } else {
3624 perf_debug("Fragment shader triggered register spilling. "
3625 "Try reducing the number of live scalar values to "
3626 "improve performance.\n");
3627 }
3628
3629 /* Since we're out of heuristics, just go spill registers until we
3630 * get an allocation.
3631 */
3632 while (!assign_regs(true)) {
3633 if (failed)
3634 break;
3635 }
3636 }
3637 }
3638 assert(force_uncompressed_stack == 0);
3639
3640 /* This must come after all optimization and register allocation, since
3641 * it inserts dead code that happens to have side effects, and it does
3642 * so based on the actual physical registers in use.
3643 */
3644 insert_gen4_send_dependency_workarounds();
3645
3646 if (failed)
3647 return false;
3648
3649 if (!allocated_without_spills)
3650 schedule_instructions(SCHEDULE_POST);
3651
3652 if (last_scratch > 0) {
3653 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3654 }
3655
3656 if (stage == MESA_SHADER_FRAGMENT) {
3657 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3658 if (dispatch_width == 8)
3659 prog_data->reg_blocks = brw_register_blocks(grf_used);
3660 else
3661 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3662 }
3663
3664 /* If any state parameters were appended, then ParameterValues could have
3665 * been realloced, in which case the driver uniform storage set up by
3666 * _mesa_associate_uniform_storage() would point to freed memory. Make
3667 * sure that didn't happen.
3668 */
3669 assert(sanity_param_count == prog->Parameters->NumParameters);
3670
3671 return !failed;
3672 }
3673
3674 const unsigned *
3675 brw_wm_fs_emit(struct brw_context *brw,
3676 void *mem_ctx,
3677 const struct brw_wm_prog_key *key,
3678 struct brw_wm_prog_data *prog_data,
3679 struct gl_fragment_program *fp,
3680 struct gl_shader_program *prog,
3681 unsigned *final_assembly_size)
3682 {
3683 bool start_busy = false;
3684 double start_time = 0;
3685
3686 if (unlikely(brw->perf_debug)) {
3687 start_busy = (brw->batch.last_bo &&
3688 drm_intel_bo_busy(brw->batch.last_bo));
3689 start_time = get_time();
3690 }
3691
3692 struct brw_shader *shader = NULL;
3693 if (prog)
3694 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3695
3696 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3697 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3698
3699 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3700 */
3701 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3702 if (!v.run()) {
3703 if (prog) {
3704 prog->LinkStatus = false;
3705 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3706 }
3707
3708 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3709 v.fail_msg);
3710
3711 return NULL;
3712 }
3713
3714 cfg_t *simd16_cfg = NULL;
3715 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3716 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3717 brw->use_rep_send)) {
3718 if (!v.simd16_unsupported) {
3719 /* Try a SIMD16 compile */
3720 v2.import_uniforms(&v);
3721 if (!v2.run()) {
3722 perf_debug("SIMD16 shader failed to compile, falling back to "
3723 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3724 } else {
3725 simd16_cfg = v2.cfg;
3726 }
3727 } else {
3728 perf_debug("SIMD16 shader unsupported, falling back to "
3729 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3730 }
3731 }
3732
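/* Pick which programs to hand to the generator: the SIMD8 program can only
 * be dropped (no_8) when a SIMD16 program was successfully compiled to take
 * its place.
 */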
3733 cfg_t *simd8_cfg;
3734 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3735 if (no_simd8 && simd16_cfg) {
3736 simd8_cfg = NULL;
3737 prog_data->no_8 = true;
3738 } else {
3739 simd8_cfg = v.cfg;
3740 prog_data->no_8 = false;
3741 }
3742
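/* The generator lowers the chosen CFGs to native code; when both are
 * present, the SIMD8 and SIMD16 programs are presumably emitted into a
 * single assembly block returned here.
 */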
3743 const unsigned *assembly = NULL;
3744 fs_generator g(brw, mem_ctx, key, prog_data, prog, fp,
3745 v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3746 assembly = g.generate_assembly(simd8_cfg, simd16_cfg,
3747 final_assembly_size);
3748
3749 if (unlikely(brw->perf_debug) && shader) {
3750 if (shader->compiled_once)
3751 brw_wm_debug_recompile(brw, prog, key);
3752 shader->compiled_once = true;
3753
3754 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3755 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3756 (get_time() - start_time) * 1000);
3757 }
3758 }
3759
3760 return assembly;
3761 }
3762
3763 bool
3764 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3765 {
3766 struct brw_context *brw = brw_context(ctx);
3767 struct brw_wm_prog_key key;
3768
3769 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3770 return true;
3771
3772 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3773 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3774 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3775 bool program_uses_dfdy = fp->UsesDFdy;
3776
3777 memset(&key, 0, sizeof(key));
3778
3779 if (brw->gen < 6) {
3780 if (fp->UsesKill)
3781 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3782
3783 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3784 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3785
3786 /* Just assume depth testing. */
3787 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3788 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3789 }
3790
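/* Pre-Gen6, or with more than 16 varying inputs, the FS input layout
 * presumably depends on exactly which slots upstream stages write, so guess
 * the eventual VUE map from the shader's own inputs (plus position) for this
 * precompile key.
 */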
3791 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3792 BRW_FS_VARYING_INPUT_MASK) > 16)
3793 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3794
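/* _mesa_fls() finds the highest set bit, so sampler_count is one past the
 * last sampler unit in use and the loop below walks every bound sampler.
 */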
3795 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3796 for (unsigned i = 0; i < sampler_count; i++) {
3797 if (fp->Base.ShadowSamplers & (1 << i)) {
3798 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3799 key.tex.swizzles[i] =
3800 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3801 } else {
3802 /* Color sampler: assume no swizzling. */
3803 key.tex.swizzles[i] = SWIZZLE_XYZW;
3804 }
3805 }
3806
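/* gl_FragCoord has its origin at the lower left, while window-system
 * framebuffers are y-flipped, so the drawable height is needed to invert the
 * Y coordinate when not rendering to a user FBO.
 */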
3807 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3808 key.drawable_height = ctx->DrawBuffer->Height;
3809 }
3810
3811 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3812 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3813 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3814
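/* The Y-axis orientation also affects the sign of dFdy(), so record whether
 * we are drawing to a user FBO; treating more than one color region as FBO
 * rendering is presumably a precompile guess, since MRT almost always means
 * a user FBO.
 */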
3815 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3816 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3817 key.nr_color_regions > 1;
3818 }
3819
3820 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3821 * quality of the derivatives is likely to be determined by the driconf
3822 * option.
3823 */
3824 key.high_quality_derivatives = brw->disable_derivative_optimization;
3825
3826 key.program_string_id = bfp->id;
3827
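/* do_wm_prog() updates brw->wm.base.prog_offset and brw->wm.prog_data as a
 * side effect of compiling; save and restore them so this precompile does
 * not disturb the currently bound program state.
 */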
3828 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3829 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3830
3831 bool success = do_wm_prog(brw, prog, bfp, &key);
3832
3833 brw->wm.base.prog_offset = old_prog_offset;
3834 brw->wm.prog_data = old_prog_data;
3835
3836 return success;
3837 }