i965/fs: Optimize sqrt+inv into rsq.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_cfg.h"
50 #include "brw_dead_control_flow.h"
51 #include "main/uniforms.h"
52 #include "brw_fs_live_variables.h"
53 #include "glsl/glsl_types.h"
54
55 void
56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
57 fs_reg *src, int sources)
58 {
59 memset(this, 0, sizeof(*this));
60
61 this->opcode = opcode;
62 this->dst = dst;
63 this->src = src;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (int i = 0; i < sources; ++i) {
79 if (src[i].file != GRF)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (int i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 assert(this->src[i].width > 0);
101 if (this->src[i].width == 1) {
102 this->src[i].effective_width = this->exec_size;
103 } else {
104 this->src[i].effective_width = this->src[i].width;
105 }
106 break;
107 case IMM:
108 case UNIFORM:
109 this->src[i].effective_width = this->exec_size;
110 break;
111 default:
112 unreachable("Invalid source register file");
113 }
114 }
115 this->dst.effective_width = this->exec_size;
116
117 this->conditional_mod = BRW_CONDITIONAL_NONE;
118
119 /* This will be the case for almost all instructions. */
120 switch (dst.file) {
121 case GRF:
122 case HW_REG:
123 case MRF:
124 this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
125 break;
126 case BAD_FILE:
127 this->regs_written = 0;
128 break;
129 case IMM:
130 case UNIFORM:
131 unreachable("Invalid destination register file");
132 default:
133 unreachable("Invalid register file");
134 }
135
136 this->writes_accumulator = false;
137 }
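/* Editor's note (illustrative, not part of the upstream file): a worked
 * example of the regs_written computation above. For a SIMD16 float
 * destination with stride 1:
 *
 *    regs_written = (16 * 1 * 4 + 31) / 32 = 2   // two full 32-byte GRFs
 *
 * while a SIMD8 float destination covers exactly one 32-byte GRF.
 */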
138
139 fs_inst::fs_inst()
140 {
141 fs_reg *src = ralloc_array(this, fs_reg, 3);
142 init(BRW_OPCODE_NOP, 8, dst, src, 0);
143 }
144
145 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
146 {
147 fs_reg *src = ralloc_array(this, fs_reg, 3);
148 init(opcode, exec_size, reg_undef, src, 0);
149 }
150
151 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
152 {
153 fs_reg *src = ralloc_array(this, fs_reg, 3);
154 init(opcode, 0, dst, src, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 fs_reg *src = ralloc_array(this, fs_reg, 3);
161 src[0] = src0;
162 init(opcode, exec_size, dst, src, 1);
163 }
164
165 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
166 {
167 fs_reg *src = ralloc_array(this, fs_reg, 3);
168 src[0] = src0;
169 init(opcode, 0, dst, src, 1);
170 }
171
172 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
173 const fs_reg &src0, const fs_reg &src1)
174 {
175 fs_reg *src = ralloc_array(this, fs_reg, 3);
176 src[0] = src0;
177 src[1] = src1;
178 init(opcode, exec_size, dst, src, 2);
179 }
180
181 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
182 const fs_reg &src1)
183 {
184 fs_reg *src = ralloc_array(this, fs_reg, 3);
185 src[0] = src0;
186 src[1] = src1;
187 init(opcode, 0, dst, src, 2);
188 }
189
190 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
191 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
192 {
193 fs_reg *src = ralloc_array(this, fs_reg, 3);
194 src[0] = src0;
195 src[1] = src1;
196 src[2] = src2;
197 init(opcode, exec_size, dst, src, 3);
198 }
199
200 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
201 const fs_reg &src1, const fs_reg &src2)
202 {
203 fs_reg *src = ralloc_array(this, fs_reg, 3);
204 src[0] = src0;
205 src[1] = src1;
206 src[2] = src2;
207 init(opcode, 0, dst, src, 3);
208 }
209
210 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
211 {
212 init(opcode, 0, dst, src, sources);
213 }
214
215 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
216 fs_reg src[], int sources)
217 {
218 init(opcode, exec_width, dst, src, sources);
219 }
220
221 fs_inst::fs_inst(const fs_inst &that)
222 {
223 memcpy(this, &that, sizeof(that));
224
225 this->src = ralloc_array(this, fs_reg, that.sources);
226
227 for (int i = 0; i < that.sources; i++)
228 this->src[i] = that.src[i];
229 }
230
231 void
232 fs_inst::resize_sources(uint8_t num_sources)
233 {
234 if (this->sources != num_sources) {
235 this->src = reralloc(this, this->src, fs_reg, num_sources);
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(brw->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 * gen5 does the comparison on the execution type (resolved source types),
341 * so dst type doesn't matter. gen6 does comparison and then uses the
342 * result as if it was the dst type with no conversion, which happens to
343 * mostly work out for float-interpreted-as-int since our comparisons are
344 * for >0, =0, <0.
345 */
346 if (brw->gen == 4) {
347 dst.type = src0.type;
348 if (dst.file == HW_REG)
349 dst.fixed_hw_reg.type = dst.type;
350 }
351
352 resolve_ud_negate(&src0);
353 resolve_ud_negate(&src1);
354
355 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
356 inst->conditional_mod = condition;
357
358 return inst;
359 }
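/* Editor's note (illustrative sketch, not upstream code): the gen4 dst.type
 * override above matters because gen4 converts to the destination type
 * before comparing. For example, a float CMP into a D-typed null register
 * would truncate 0.4f and 0.6f both to 0 before the compare, so a
 * less-than test between them would incorrectly fail.
 */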
360
361 fs_inst *
362 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
363 {
364 uint8_t exec_size = dst.width;
365 for (int i = 0; i < sources; ++i) {
366 assert(src[i].width % dst.width == 0);
367 if (src[i].width > exec_size)
368 exec_size = src[i].width;
369 }
370
371 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
372 dst, src, sources);
373 inst->regs_written = 0;
374 for (int i = 0; i < sources; ++i) {
375 /* The LOAD_PAYLOAD instruction only really makes sense if we are
376 * dealing with whole registers. If this ever changes, we can deal
377 * with it later.
378 */
379 int size = src[i].effective_width * type_sz(src[i].type);
380 assert(size % 32 == 0);
381 inst->regs_written += (size + 31) / 32;
382 }
383
384 return inst;
385 }
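/* Editor's note (illustrative, not upstream code): each LOAD_PAYLOAD source
 * above contributes whole registers. E.g. for two SIMD8 float sources:
 *
 *    size = 8 * 4 = 32 bytes each  ->  regs_written = 1 + 1 = 2
 */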
386
387 exec_list
388 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
389 const fs_reg &surf_index,
390 const fs_reg &varying_offset,
391 uint32_t const_offset)
392 {
393 exec_list instructions;
394 fs_inst *inst;
395
396 /* We have our constant surface use a pitch of 4 bytes, so our index can
397 * be any component of a vector, and then we load 4 contiguous
398 * components starting from that.
399 *
400 * We break down the const_offset to a portion added to the variable
401 * offset and a portion done using reg_offset, which means that if you
402 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
403 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
404 * CSE can later notice that those loads are all the same and eliminate
405 * the redundant ones.
406 */
407 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
408 instructions.push_tail(ADD(vec4_offset,
409 varying_offset, fs_reg(const_offset & ~3)));
410
411 int scale = 1;
412 if (brw->gen == 4 && dst.width == 8) {
413 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
414 * u, v, r) as parameters, or we can just use the SIMD16 message
415 * consisting of (header, u). We choose the second, at the cost of a
416 * longer return length.
417 */
418 scale = 2;
419 }
420
421 enum opcode op;
422 if (brw->gen >= 7)
423 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
424 else
425 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
426
427 assert(dst.width % 8 == 0);
428 int regs_written = 4 * (dst.width / 8) * scale;
429 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
430 dst.type, dst.width);
431 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
432 inst->regs_written = regs_written;
433 instructions.push_tail(inst);
434
435 if (brw->gen < 7) {
436 inst->base_mrf = 13;
437 inst->header_present = true;
438 if (brw->gen == 4)
439 inst->mlen = 3;
440 else
441 inst->mlen = 1 + dispatch_width / 8;
442 }
443
444 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
445 instructions.push_tail(MOV(dst, result));
446
447 return instructions;
448 }
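/* Editor's note (worked example, not upstream code): for const_offset = 6
 * the split above becomes:
 *
 *    vec4_offset = varying_offset + (6 & ~3)             // varying_offset + 4
 *    result      = offset(vec4_result, (6 & 3) * scale)  // component 2
 *
 * so the whole aligned vec4 is fetched and the final MOV picks out the
 * requested component, letting CSE merge loads that share the same vec4.
 */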
449
450 /**
451 * A helper for MOV generation for fixing up broken hardware SEND dependency
452 * handling.
453 */
454 fs_inst *
455 fs_visitor::DEP_RESOLVE_MOV(int grf)
456 {
457 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
458
459 inst->ir = NULL;
460 inst->annotation = "send dependency resolve";
461
462 /* The caller always wants this uncompressed, to emit the minimal extra
463 * dependencies and to avoid having to deal with aligning its regs to 2.
464 */
465 inst->exec_size = 8;
466
467 return inst;
468 }
469
470 bool
471 fs_inst::equals(fs_inst *inst) const
472 {
473 return (opcode == inst->opcode &&
474 dst.equals(inst->dst) &&
475 src[0].equals(inst->src[0]) &&
476 src[1].equals(inst->src[1]) &&
477 src[2].equals(inst->src[2]) &&
478 saturate == inst->saturate &&
479 predicate == inst->predicate &&
480 conditional_mod == inst->conditional_mod &&
481 mlen == inst->mlen &&
482 base_mrf == inst->base_mrf &&
483 target == inst->target &&
484 eot == inst->eot &&
485 header_present == inst->header_present &&
486 shadow_compare == inst->shadow_compare &&
487 exec_size == inst->exec_size &&
488 offset == inst->offset);
489 }
490
491 bool
492 fs_inst::overwrites_reg(const fs_reg &reg) const
493 {
494 return (reg.file == dst.file &&
495 reg.reg == dst.reg &&
496 reg.reg_offset >= dst.reg_offset &&
497 reg.reg_offset < dst.reg_offset + regs_written);
498 }
499
500 bool
501 fs_inst::is_send_from_grf() const
502 {
503 switch (opcode) {
504 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
505 case SHADER_OPCODE_SHADER_TIME_ADD:
506 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
507 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
508 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
509 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
510 case SHADER_OPCODE_UNTYPED_ATOMIC:
511 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
512 return true;
513 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
514 return src[1].file == GRF;
515 case FS_OPCODE_FB_WRITE:
516 return src[0].file == GRF;
517 default:
518 if (is_tex())
519 return src[0].file == GRF;
520
521 return false;
522 }
523 }
524
525 bool
526 fs_inst::can_do_source_mods(struct brw_context *brw)
527 {
528 if (brw->gen == 6 && is_math())
529 return false;
530
531 if (is_send_from_grf())
532 return false;
533
534 if (!backend_instruction::can_do_source_mods())
535 return false;
536
537 return true;
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Fixed brw_reg. */
585 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
586 {
587 init();
588 this->file = HW_REG;
589 this->fixed_hw_reg = fixed_hw_reg;
590 this->type = fixed_hw_reg.type;
591 this->width = 1 << fixed_hw_reg.width;
592 }
593
594 bool
595 fs_reg::equals(const fs_reg &r) const
596 {
597 return (file == r.file &&
598 reg == r.reg &&
599 reg_offset == r.reg_offset &&
600 subreg_offset == r.subreg_offset &&
601 type == r.type &&
602 negate == r.negate &&
603 abs == r.abs &&
604 !reladdr && !r.reladdr &&
605 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
606 width == r.width &&
607 stride == r.stride);
608 }
609
610 fs_reg &
611 fs_reg::apply_stride(unsigned stride)
612 {
613 assert((this->stride * stride) <= 4 &&
614 (is_power_of_two(stride) || stride == 0) &&
615 file != HW_REG && file != IMM);
616 this->stride *= stride;
617 return *this;
618 }
619
620 fs_reg &
621 fs_reg::set_smear(unsigned subreg)
622 {
623 assert(file != HW_REG && file != IMM);
624 subreg_offset = subreg * type_sz(type);
625 stride = 0;
626 return *this;
627 }
628
629 bool
630 fs_reg::is_contiguous() const
631 {
632 return stride == 1;
633 }
634
635 bool
636 fs_reg::is_valid_3src() const
637 {
638 return file == GRF || file == UNIFORM;
639 }
640
641 int
642 fs_visitor::type_size(const struct glsl_type *type)
643 {
644 unsigned int size, i;
645
646 switch (type->base_type) {
647 case GLSL_TYPE_UINT:
648 case GLSL_TYPE_INT:
649 case GLSL_TYPE_FLOAT:
650 case GLSL_TYPE_BOOL:
651 return type->components();
652 case GLSL_TYPE_ARRAY:
653 return type_size(type->fields.array) * type->length;
654 case GLSL_TYPE_STRUCT:
655 size = 0;
656 for (i = 0; i < type->length; i++) {
657 size += type_size(type->fields.structure[i].type);
658 }
659 return size;
660 case GLSL_TYPE_SAMPLER:
661 /* Samplers take up no register space, since they're baked in at
662 * link time.
663 */
664 return 0;
665 case GLSL_TYPE_ATOMIC_UINT:
666 return 0;
667 case GLSL_TYPE_IMAGE:
668 case GLSL_TYPE_VOID:
669 case GLSL_TYPE_ERROR:
670 case GLSL_TYPE_INTERFACE:
671 unreachable("not reached");
672 }
673
674 return 0;
675 }
676
677 fs_reg
678 fs_visitor::get_timestamp()
679 {
680 assert(brw->gen >= 7);
681
682 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
683 BRW_ARF_TIMESTAMP,
684 0),
685 BRW_REGISTER_TYPE_UD));
686
687 fs_reg dst = fs_reg(this, glsl_type::uint_type);
688
689 fs_inst *mov = emit(MOV(dst, ts));
690 /* We want to read the 3 fields we care about (mostly field 0, but also
691 * field 2) even if they're not enabled in the dispatch.
692 */
693 mov->force_writemask_all = true;
694 mov->exec_size = 8;
695
696 /* The caller wants the low 32 bits of the timestamp. Since it's running
697 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
698 * which is plenty of time for our purposes. It is identical across the
699 * EUs, but since it's tracking GPU core speed it will increment at a
700 * varying rate as render P-states change.
701 *
702 * The caller could also check if render P-states have changed (or anything
703 * else that might disrupt timing) by setting smear to 2 and checking if
704 * that field is != 0.
705 */
706 dst.set_smear(0);
707
708 return dst;
709 }
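/* Editor's note (back-of-the-envelope check, not upstream code): the
 * rollover estimate above follows from the counter width and clock rate:
 *
 *    2^32 cycles / ~1.2e9 cycles per second ~= 3.6 seconds
 */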
710
711 void
712 fs_visitor::emit_shader_time_begin()
713 {
714 current_annotation = "shader time start";
715 shader_start_time = get_timestamp();
716 }
717
718 void
719 fs_visitor::emit_shader_time_end()
720 {
721 current_annotation = "shader time end";
722
723 enum shader_time_shader_type type, written_type, reset_type;
724 if (dispatch_width == 8) {
725 type = ST_FS8;
726 written_type = ST_FS8_WRITTEN;
727 reset_type = ST_FS8_RESET;
728 } else {
729 assert(dispatch_width == 16);
730 type = ST_FS16;
731 written_type = ST_FS16_WRITTEN;
732 reset_type = ST_FS16_RESET;
733 }
734
735 fs_reg shader_end_time = get_timestamp();
736
737 /* Check that there weren't any timestamp reset events (assuming these
738 * were the only two timestamp reads that happened).
739 */
740 fs_reg reset = shader_end_time;
741 reset.set_smear(2);
742 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
743 test->conditional_mod = BRW_CONDITIONAL_Z;
744 emit(IF(BRW_PREDICATE_NORMAL));
745
746 push_force_uncompressed();
747 fs_reg start = shader_start_time;
748 start.negate = true;
749 fs_reg diff = fs_reg(this, glsl_type::uint_type);
750 emit(ADD(diff, start, shader_end_time));
751
752 /* If there were no instructions between the two timestamp gets, the diff
753 * is 2 cycles. Remove that overhead, so I can forget about that when
754 * trying to determine the time taken for single instructions.
755 */
756 emit(ADD(diff, diff, fs_reg(-2u)));
757
758 emit_shader_time_write(type, diff);
759 emit_shader_time_write(written_type, fs_reg(1u));
760 emit(BRW_OPCODE_ELSE);
761 emit_shader_time_write(reset_type, fs_reg(1u));
762 emit(BRW_OPCODE_ENDIF);
763
764 pop_force_uncompressed();
765 }
766
767 void
768 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
769 fs_reg value)
770 {
771 int shader_time_index =
772 brw_get_shader_time_index(brw, shader_prog, prog, type);
773 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
774
775 fs_reg payload;
776 if (dispatch_width == 8)
777 payload = fs_reg(this, glsl_type::uvec2_type);
778 else
779 payload = fs_reg(this, glsl_type::uint_type);
780
781 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
782 fs_reg(), payload, offset, value));
783 }
784
785 void
786 fs_visitor::vfail(const char *format, va_list va)
787 {
788 char *msg;
789
790 if (failed)
791 return;
792
793 failed = true;
794
795 msg = ralloc_vasprintf(mem_ctx, format, va);
796 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
797
798 this->fail_msg = msg;
799
800 if (INTEL_DEBUG & DEBUG_WM) {
801 fprintf(stderr, "%s", msg);
802 }
803 }
804
805 void
806 fs_visitor::fail(const char *format, ...)
807 {
808 va_list va;
809
810 va_start(va, format);
811 vfail(format, va);
812 va_end(va);
813 }
814
815 /**
816 * Mark this program as impossible to compile in SIMD16 mode.
817 *
818 * During the SIMD8 compile (which happens first), we can detect and flag
819 * things that are unsupported in SIMD16 mode, so the compiler can skip
820 * the SIMD16 compile altogether.
821 *
822 * During a SIMD16 compile (if one happens anyway), this just calls fail().
823 */
824 void
825 fs_visitor::no16(const char *format, ...)
826 {
827 va_list va;
828
829 va_start(va, format);
830
831 if (dispatch_width == 16) {
832 vfail(format, va);
833 } else {
834 simd16_unsupported = true;
835
836 if (brw->perf_debug) {
837 if (no16_msg)
838 ralloc_vasprintf_append(&no16_msg, format, va);
839 else
840 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
841 }
842 }
843
844 va_end(va);
845 }
846
847 fs_inst *
848 fs_visitor::emit(enum opcode opcode)
849 {
850 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
851 }
852
853 fs_inst *
854 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
855 {
856 return emit(new(mem_ctx) fs_inst(opcode, dst));
857 }
858
859 fs_inst *
860 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
861 {
862 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
863 }
864
865 fs_inst *
866 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
867 const fs_reg &src1)
868 {
869 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
870 }
871
872 fs_inst *
873 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
874 const fs_reg &src1, const fs_reg &src2)
875 {
876 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
877 }
878
879 fs_inst *
880 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
881 fs_reg src[], int sources)
882 {
883 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
884 }
885
886 void
887 fs_visitor::push_force_uncompressed()
888 {
889 force_uncompressed_stack++;
890 }
891
892 void
893 fs_visitor::pop_force_uncompressed()
894 {
895 force_uncompressed_stack--;
896 assert(force_uncompressed_stack >= 0);
897 }
898
899 /**
900 * Returns true if the instruction has a flag that means it won't
901 * update an entire destination register.
902 *
903 * For example, dead code elimination and live variable analysis want to know
904 * when a write to a variable screens off any preceding values that were in
905 * it.
906 */
907 bool
908 fs_inst::is_partial_write() const
909 {
910 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
911 (this->dst.width * type_sz(this->dst.type)) < 32 ||
912 !this->dst.is_contiguous());
913 }
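/* Editor's note (illustrative, not upstream code): examples of partial
 * writes as defined above: a predicated (non-SEL) write leaves unselected
 * channels untouched; a SIMD8 write of a 16-bit type covers only
 * 8 * 2 = 16 bytes of the 32-byte GRF; and a strided destination skips
 * bytes between channels.
 */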
914
915 int
916 fs_inst::regs_read(fs_visitor *v, int arg) const
917 {
918 if (is_tex() && arg == 0 && src[0].file == GRF) {
919 return mlen;
920 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
921 return mlen;
922 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
923 return mlen;
924 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
925 return mlen;
926 }
927
928 switch (src[arg].file) {
929 case BAD_FILE:
930 case UNIFORM:
931 case IMM:
932 return 1;
933 case GRF:
934 case HW_REG:
935 if (src[arg].stride == 0) {
936 return 1;
937 } else {
938 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
939 return (size + 31) / 32;
940 }
941 case MRF:
942 unreachable("MRF registers are not allowed as sources");
943 default:
944 unreachable("Invalid register file");
945 }
946 }
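/* Editor's note (worked example, not upstream code): for a SIMD16 float
 * GRF source with stride 1, regs_read() above computes
 *
 *    (16 * 1 * 4 + 31) / 32 = 2 registers,
 *
 * while a stride-0 (smeared scalar) source always counts as 1.
 */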
947
948 bool
949 fs_inst::reads_flag() const
950 {
951 return predicate;
952 }
953
954 bool
955 fs_inst::writes_flag() const
956 {
957 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
958 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
959 }
960
961 /**
962 * Returns how many MRFs an FS opcode will write over.
963 *
964 * Note that this is not the 0 or 1 implied writes in an actual gen
965 * instruction -- the FS opcodes often generate MOVs in addition.
966 */
967 int
968 fs_visitor::implied_mrf_writes(fs_inst *inst)
969 {
970 if (inst->mlen == 0)
971 return 0;
972
973 if (inst->base_mrf == -1)
974 return 0;
975
976 switch (inst->opcode) {
977 case SHADER_OPCODE_RCP:
978 case SHADER_OPCODE_RSQ:
979 case SHADER_OPCODE_SQRT:
980 case SHADER_OPCODE_EXP2:
981 case SHADER_OPCODE_LOG2:
982 case SHADER_OPCODE_SIN:
983 case SHADER_OPCODE_COS:
984 return 1 * dispatch_width / 8;
985 case SHADER_OPCODE_POW:
986 case SHADER_OPCODE_INT_QUOTIENT:
987 case SHADER_OPCODE_INT_REMAINDER:
988 return 2 * dispatch_width / 8;
989 case SHADER_OPCODE_TEX:
990 case FS_OPCODE_TXB:
991 case SHADER_OPCODE_TXD:
992 case SHADER_OPCODE_TXF:
993 case SHADER_OPCODE_TXF_CMS:
994 case SHADER_OPCODE_TXF_MCS:
995 case SHADER_OPCODE_TG4:
996 case SHADER_OPCODE_TG4_OFFSET:
997 case SHADER_OPCODE_TXL:
998 case SHADER_OPCODE_TXS:
999 case SHADER_OPCODE_LOD:
1000 return 1;
1001 case FS_OPCODE_FB_WRITE:
1002 return 2;
1003 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1004 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1005 return 1;
1006 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1007 return inst->mlen;
1008 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1009 return 2;
1010 case SHADER_OPCODE_UNTYPED_ATOMIC:
1011 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1012 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1013 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1014 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1015 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1016 return 0;
1017 default:
1018 unreachable("not reached");
1019 }
1020 }
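/* Editor's note (illustrative, not upstream code): e.g. a SIMD16 SIN sends
 * one operand and so implies 1 * 16 / 8 = 2 MRF writes, while a SIMD16 POW
 * sends two operands and implies 2 * 16 / 8 = 4.
 */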
1021
1022 int
1023 fs_visitor::virtual_grf_alloc(int size)
1024 {
1025 if (virtual_grf_array_size <= virtual_grf_count) {
1026 if (virtual_grf_array_size == 0)
1027 virtual_grf_array_size = 16;
1028 else
1029 virtual_grf_array_size *= 2;
1030 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
1031 virtual_grf_array_size);
1032 }
1033 virtual_grf_sizes[virtual_grf_count] = size;
1034 return virtual_grf_count++;
1035 }
1036
1037 /** Fixed HW reg constructor. */
1038 fs_reg::fs_reg(enum register_file file, int reg)
1039 {
1040 init();
1041 this->file = file;
1042 this->reg = reg;
1043 this->type = BRW_REGISTER_TYPE_F;
1044
1045 switch (file) {
1046 case UNIFORM:
1047 this->width = 1;
1048 break;
1049 default:
1050 this->width = 8;
1051 }
1052 }
1053
1054 /** Fixed HW reg constructor. */
1055 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1056 {
1057 init();
1058 this->file = file;
1059 this->reg = reg;
1060 this->type = type;
1061
1062 switch (file) {
1063 case UNIFORM:
1064 this->width = 1;
1065 break;
1066 default:
1067 this->width = 8;
1068 }
1069 }
1070
1071 /** Fixed HW reg constructor. */
1072 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1073 uint8_t width)
1074 {
1075 init();
1076 this->file = file;
1077 this->reg = reg;
1078 this->type = type;
1079 this->width = width;
1080 }
1081
1082 /** Automatic reg constructor. */
1083 fs_reg::fs_reg(fs_visitor *v, const struct glsl_type *type)
1084 {
1085 init();
1086 int reg_width = v->dispatch_width / 8;
1087
1088 this->file = GRF;
1089 this->reg = v->virtual_grf_alloc(v->type_size(type) * reg_width);
1090 this->reg_offset = 0;
1091 this->type = brw_type_for_base_type(type);
1092 this->width = v->dispatch_width;
1093 assert(this->width == 8 || this->width == 16);
1094 }
1095
1096 fs_reg *
1097 fs_visitor::variable_storage(ir_variable *var)
1098 {
1099 return (fs_reg *)hash_table_find(this->variable_ht, var);
1100 }
1101
1102 void
1103 import_uniforms_callback(const void *key,
1104 void *data,
1105 void *closure)
1106 {
1107 struct hash_table *dst_ht = (struct hash_table *)closure;
1108 const fs_reg *reg = (const fs_reg *)data;
1109
1110 if (reg->file != UNIFORM)
1111 return;
1112
1113 hash_table_insert(dst_ht, data, key);
1114 }
1115
1116 /* For SIMD16, we need to follow the uniform setup from the SIMD8 dispatch.
1117 * This brings in those uniform definitions.
1118 */
1119 void
1120 fs_visitor::import_uniforms(fs_visitor *v)
1121 {
1122 hash_table_call_foreach(v->variable_ht,
1123 import_uniforms_callback,
1124 variable_ht);
1125 this->push_constant_loc = v->push_constant_loc;
1126 this->pull_constant_loc = v->pull_constant_loc;
1127 this->uniforms = v->uniforms;
1128 this->param_size = v->param_size;
1129 }
1130
1131 /* Our support for uniforms is piggy-backed on the struct
1132 * gl_fragment_program, because that's where the values actually
1133 * get stored, rather than in some global gl_shader_program uniform
1134 * store.
1135 */
1136 void
1137 fs_visitor::setup_uniform_values(ir_variable *ir)
1138 {
1139 int namelen = strlen(ir->name);
1140
1141 /* The data for our (non-builtin) uniforms is stored in a series of
1142 * gl_uniform_driver_storage structs for each subcomponent that
1143 * glGetUniformLocation() could name. We know it's been set up in the same
1144 * order we'd walk the type, so walk the list of storage and find anything
1145 * with our name, or the prefix of a component that starts with our name.
1146 */
1147 unsigned params_before = uniforms;
1148 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1149 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1150
1151 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1152 (storage->name[namelen] != 0 &&
1153 storage->name[namelen] != '.' &&
1154 storage->name[namelen] != '[')) {
1155 continue;
1156 }
1157
1158 unsigned slots = storage->type->component_slots();
1159 if (storage->array_elements)
1160 slots *= storage->array_elements;
1161
1162 for (unsigned i = 0; i < slots; i++) {
1163 stage_prog_data->param[uniforms++] = &storage->storage[i];
1164 }
1165 }
1166
1167 /* Make sure we actually initialized the right amount of stuff here. */
1168 assert(params_before + ir->type->component_slots() == uniforms);
1169 (void)params_before;
1170 }
1171
1172
1173 /* Our support for builtin uniforms is even scarier than non-builtin.
1174 * It sits on top of the PROG_STATE_VAR parameters that are
1175 * automatically updated from GL context state.
1176 */
1177 void
1178 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1179 {
1180 const ir_state_slot *const slots = ir->get_state_slots();
1181 assert(slots != NULL);
1182
1183 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1184 /* This state reference has already been setup by ir_to_mesa, but we'll
1185 * get the same index back here.
1186 */
1187 int index = _mesa_add_state_reference(this->prog->Parameters,
1188 (gl_state_index *)slots[i].tokens);
1189
1190 /* Add each of the unique swizzles of the element as a parameter.
1191 * This'll end up matching the expected layout of the
1192 * array/matrix/structure we're trying to fill in.
1193 */
1194 int last_swiz = -1;
1195 for (unsigned int j = 0; j < 4; j++) {
1196 int swiz = GET_SWZ(slots[i].swizzle, j);
1197 if (swiz == last_swiz)
1198 break;
1199 last_swiz = swiz;
1200
1201 stage_prog_data->param[uniforms++] =
1202 &prog->Parameters->ParameterValues[index][swiz];
1203 }
1204 }
1205 }
1206
1207 fs_reg *
1208 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1209 {
1210 assert(stage == MESA_SHADER_FRAGMENT);
1211 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1212 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1213 fs_reg wpos = *reg;
1214 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1215
1216 /* gl_FragCoord.x */
1217 if (ir->data.pixel_center_integer) {
1218 emit(MOV(wpos, this->pixel_x));
1219 } else {
1220 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1221 }
1222 wpos = offset(wpos, 1);
1223
1224 /* gl_FragCoord.y */
1225 if (!flip && ir->data.pixel_center_integer) {
1226 emit(MOV(wpos, this->pixel_y));
1227 } else {
1228 fs_reg pixel_y = this->pixel_y;
1229 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1230
1231 if (flip) {
1232 pixel_y.negate = true;
1233 offset += key->drawable_height - 1.0;
1234 }
1235
1236 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1237 }
1238 wpos = offset(wpos, 1);
1239
1240 /* gl_FragCoord.z */
1241 if (brw->gen >= 6) {
1242 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1243 } else {
1244 emit(FS_OPCODE_LINTERP, wpos,
1245 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1246 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1247 interp_reg(VARYING_SLOT_POS, 2));
1248 }
1249 wpos = offset(wpos, 1);
1250
1251 /* gl_FragCoord.w: Already set up in emit_interpolation */
1252 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1253
1254 return reg;
1255 }
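/* Editor's note (worked example, not upstream code): when flip is set above,
 * gl_FragCoord.y becomes -pixel_y + (drawable_height - 1 + c), where c is 0
 * for integer pixel centers and 0.5 otherwise. For a 600-pixel-tall window
 * drawable with integer centers, pixel_y = 0 maps to 599 and pixel_y = 599
 * maps to 0.
 */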
1256
1257 fs_inst *
1258 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1259 glsl_interp_qualifier interpolation_mode,
1260 bool is_centroid, bool is_sample)
1261 {
1262 brw_wm_barycentric_interp_mode barycoord_mode;
1263 if (brw->gen >= 6) {
1264 if (is_centroid) {
1265 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1266 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1267 else
1268 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1269 } else if (is_sample) {
1270 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1271 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1272 else
1273 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1274 } else {
1275 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1276 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1277 else
1278 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1279 }
1280 } else {
1281 /* On Ironlake and below, there is only one interpolation mode.
1282 * Centroid interpolation doesn't mean anything on this hardware --
1283 * there is no multisampling.
1284 */
1285 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1286 }
1287 return emit(FS_OPCODE_LINTERP, attr,
1288 this->delta_x[barycoord_mode],
1289 this->delta_y[barycoord_mode], interp);
1290 }
1291
1292 fs_reg *
1293 fs_visitor::emit_general_interpolation(ir_variable *ir)
1294 {
1295 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1296 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1297 fs_reg attr = *reg;
1298
1299 assert(stage == MESA_SHADER_FRAGMENT);
1300 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1301 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1302
1303 unsigned int array_elements;
1304 const glsl_type *type;
1305
1306 if (ir->type->is_array()) {
1307 array_elements = ir->type->length;
1308 if (array_elements == 0) {
1309 fail("dereferenced array '%s' has length 0\n", ir->name);
1310 }
1311 type = ir->type->fields.array;
1312 } else {
1313 array_elements = 1;
1314 type = ir->type;
1315 }
1316
1317 glsl_interp_qualifier interpolation_mode =
1318 ir->determine_interpolation_mode(key->flat_shade);
1319
1320 int location = ir->data.location;
1321 for (unsigned int i = 0; i < array_elements; i++) {
1322 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1323 if (prog_data->urb_setup[location] == -1) {
1324 /* If there's no incoming setup data for this slot, don't
1325 * emit interpolation for it.
1326 */
1327 attr = offset(attr, type->vector_elements);
1328 location++;
1329 continue;
1330 }
1331
1332 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1333 /* Constant interpolation (flat shading) case. The SF has
1334 * handed us defined values in only the constant offset
1335 * field of the setup reg.
1336 */
1337 for (unsigned int k = 0; k < type->vector_elements; k++) {
1338 struct brw_reg interp = interp_reg(location, k);
1339 interp = suboffset(interp, 3);
1340 interp.type = reg->type;
1341 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1342 attr = offset(attr, 1);
1343 }
1344 } else {
1345 /* Smooth/noperspective interpolation case. */
1346 for (unsigned int k = 0; k < type->vector_elements; k++) {
1347 struct brw_reg interp = interp_reg(location, k);
1348 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1349 /* Get the pixel/sample mask into f0 so that we know
1350 * which pixels are lit. Then, for each channel that is
1351 * unlit, replace the centroid data with non-centroid
1352 * data.
1353 */
1354 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1355
1356 fs_inst *inst;
1357 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1358 false, false);
1359 inst->predicate = BRW_PREDICATE_NORMAL;
1360 inst->predicate_inverse = true;
1361 if (brw->has_pln)
1362 inst->no_dd_clear = true;
1363
1364 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1365 ir->data.centroid && !key->persample_shading,
1366 ir->data.sample || key->persample_shading);
1367 inst->predicate = BRW_PREDICATE_NORMAL;
1368 inst->predicate_inverse = false;
1369 if (brw->has_pln)
1370 inst->no_dd_check = true;
1371
1372 } else {
1373 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1374 ir->data.centroid && !key->persample_shading,
1375 ir->data.sample || key->persample_shading);
1376 }
1377 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1378 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1379 }
1380 attr = offset(attr, 1);
1381 }
1382
1383 }
1384 location++;
1385 }
1386 }
1387
1388 return reg;
1389 }
1390
1391 fs_reg *
1392 fs_visitor::emit_frontfacing_interpolation()
1393 {
1394 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::bool_type);
1395
1396 if (brw->gen >= 6) {
1397 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1398 * a boolean result from this (~0/true or 0/false).
1399 *
1400 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1401 * this task in only one instruction:
1402 * - a negation source modifier will flip the bit; and
1403 * - a W -> D type conversion will sign extend the bit into the high
1404 * word of the destination.
1405 *
1406 * An ASR 15 fills the low word of the destination.
1407 */
1408 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1409 g0.negate = true;
1410
1411 emit(ASR(*reg, g0, fs_reg(15)));
1412 } else {
1413 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1414 * a boolean result from this (1/true or 0/false).
1415 *
1416 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1417 * the negation source modifier to flip it. Unfortunately the SHR
1418 * instruction only operates on UD (or D with an abs source modifier)
1419 * sources without negation.
1420 *
1421 * Instead, use ASR (which will give ~0/true or 0/false) followed by an
1422 * AND 1.
1423 */
1424 fs_reg asr = fs_reg(this, glsl_type::bool_type);
1425 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1426 g1_6.negate = true;
1427
1428 emit(ASR(asr, g1_6, fs_reg(31)));
1429 emit(AND(*reg, asr, fs_reg(1)));
1430 }
1431
1432 return reg;
1433 }
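/* Editor's note (trace of the gen6+ path above, not upstream code): for a
 * front-facing polygon bit 15 of g0.0:W is 0; the negation modifier flips
 * it to 1, the W -> D conversion sign-extends that bit into the high word,
 * and ASR 15 smears it across the low word, yielding ~0 (true). For a
 * back-facing polygon the flipped bit is 0 and the result is 0 (false).
 */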
1434
1435 void
1436 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1437 {
1438 assert(stage == MESA_SHADER_FRAGMENT);
1439 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1440 assert(dst.type == BRW_REGISTER_TYPE_F);
1441
1442 if (key->compute_pos_offset) {
1443 /* Convert int_sample_pos to floating point */
1444 emit(MOV(dst, int_sample_pos));
1445 /* Scale to the range [0, 1] */
1446 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1447 }
1448 else {
1449 /* From ARB_sample_shading specification:
1450 * "When rendering to a non-multisample buffer, or if multisample
1451 * rasterization is disabled, gl_SamplePosition will always be
1452 * (0.5, 0.5)."
1453 */
1454 emit(MOV(dst, fs_reg(0.5f)));
1455 }
1456 }
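/* Editor's note (illustrative, not upstream code): the payload sample
 * positions are in 1/16-pixel units, so the MUL by 1/16 above maps e.g. a
 * payload byte of 8 to the gl_SamplePosition value 8 / 16 = 0.5.
 */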
1457
1458 fs_reg *
1459 fs_visitor::emit_samplepos_setup()
1460 {
1461 assert(brw->gen >= 6);
1462
1463 this->current_annotation = "compute sample position";
1464 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::vec2_type);
1465 fs_reg pos = *reg;
1466 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1467 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1468
1469 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1470 * mode will be enabled.
1471 *
1472 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1473 * R31.1:0 Position Offset X/Y for Slot[3:0]
1474 * R31.3:2 Position Offset X/Y for Slot[7:4]
1475 * .....
1476 *
1477 * The X, Y sample positions come in as bytes in thread payload. So, read
1478 * the positions using vstride=16, width=8, hstride=2.
1479 */
1480 struct brw_reg sample_pos_reg =
1481 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1482 BRW_REGISTER_TYPE_B), 16, 8, 2);
1483
1484 if (dispatch_width == 8) {
1485 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1486 } else {
1487 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1488 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1489 ->force_sechalf = true;
1490 }
1491 /* Compute gl_SamplePosition.x */
1492 compute_sample_position(pos, int_sample_x);
1493 pos = offset(pos, 1);
1494 if (dispatch_width == 8) {
1495 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1496 } else {
1497 emit(MOV(half(int_sample_y, 0),
1498 fs_reg(suboffset(sample_pos_reg, 1))));
1499 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1500 ->force_sechalf = true;
1501 }
1502 /* Compute gl_SamplePosition.y */
1503 compute_sample_position(pos, int_sample_y);
1504 return reg;
1505 }
1506
1507 fs_reg *
1508 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1509 {
1510 assert(stage == MESA_SHADER_FRAGMENT);
1511 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1512 assert(brw->gen >= 6);
1513
1514 this->current_annotation = "compute sample id";
1515 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1516
1517 if (key->compute_sample_id) {
1518 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1519 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1520 t2.type = BRW_REGISTER_TYPE_UW;
1521
1522 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1523 * 8x multisampling, subspan 0 will represent sample N (where N
1524 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1525 * 7. We can find the value of N by looking at R0.0 bits 7:6
1526 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1527 * (since samples are always delivered in pairs). That is, we
1528 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1529 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1530 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1531 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1532 * populating a temporary variable with the sequence (0, 1, 2, 3),
1533 * and then reading from it using vstride=1, width=4, hstride=0.
1534 * These computations hold good for 4x multisampling as well.
1535 *
1536 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1537 * the first four slots are sample 0 of subspan 0; the next four
1538 * are sample 1 of subspan 0; the third group is sample 0 of
1539 * subspan 1, and finally sample 1 of subspan 1.
1540 */
1541 fs_inst *inst;
1542 inst = emit(BRW_OPCODE_AND, t1,
1543 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1544 fs_reg(0xc0));
1545 inst->force_writemask_all = true;
1546 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1547 inst->force_writemask_all = true;
1548 /* This works for both SIMD8 and SIMD16 */
1549 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1550 inst->force_writemask_all = true;
1551 /* This special instruction takes care of setting vstride=1,
1552 * width=4, hstride=0 of t2 during an ADD instruction.
1553 */
1554 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1555 } else {
1556 /* As per GL_ARB_sample_shading specification:
1557 * "When rendering to a non-multisample buffer, or if multisample
1558 * rasterization is disabled, gl_SampleID will always be zero."
1559 */
1560 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1561 }
1562
1563 return reg;
1564 }
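/* Editor's note (worked example, not upstream code): suppose R0.0 bits 7:6
 * (SSPI) read 0b10, i.e. the dispatch starts at sample pair 2. Then
 *
 *    t1 = (R0.0 & 0xc0) >> 5 = 0x80 >> 5 = 4 = 2 * SSPI
 *
 * and adding the SIMD8 sequence (0, 0, 0, 0, 1, 1, 1, 1) from t2 yields
 * sample IDs 4,4,4,4,5,5,5,5 for the two subspans.
 */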
1565
1566 fs_reg
1567 fs_visitor::fix_math_operand(fs_reg src)
1568 {
1569 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1570 * might be able to do better by doing execsize = 1 math and then
1571 * expanding that result out, but we would need to be careful with
1572 * masking.
1573 *
1574 * The hardware ignores source modifiers (negate and abs) on math
1575 * instructions, so we also move to a temp to set those up.
1576 */
1577 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1578 !src.abs && !src.negate)
1579 return src;
1580
1581 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1582 * operands to math
1583 */
1584 if (brw->gen >= 7 && src.file != IMM)
1585 return src;
1586
1587 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1588 expanded.type = src.type;
1589 emit(BRW_OPCODE_MOV, expanded, src);
1590 return expanded;
1591 }
1592
1593 fs_inst *
1594 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1595 {
1596 switch (opcode) {
1597 case SHADER_OPCODE_RCP:
1598 case SHADER_OPCODE_RSQ:
1599 case SHADER_OPCODE_SQRT:
1600 case SHADER_OPCODE_EXP2:
1601 case SHADER_OPCODE_LOG2:
1602 case SHADER_OPCODE_SIN:
1603 case SHADER_OPCODE_COS:
1604 break;
1605 default:
1606 unreachable("not reached: bad math opcode");
1607 }
1608
1609 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1610 * might be able to do better by doing execsize = 1 math and then
1611 * expanding that result out, but we would need to be careful with
1612 * masking.
1613 *
1614 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1615 * instructions, so we also move to a temp to set those up.
1616 */
1617 if (brw->gen == 6 || brw->gen == 7)
1618 src = fix_math_operand(src);
1619
1620 fs_inst *inst = emit(opcode, dst, src);
1621
1622 if (brw->gen < 6) {
1623 inst->base_mrf = 2;
1624 inst->mlen = dispatch_width / 8;
1625 }
1626
1627 return inst;
1628 }
1629
1630 fs_inst *
1631 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1632 {
1633 int base_mrf = 2;
1634 fs_inst *inst;
1635
1636 if (brw->gen >= 8) {
1637 inst = emit(opcode, dst, src0, src1);
1638 } else if (brw->gen >= 6) {
1639 src0 = fix_math_operand(src0);
1640 src1 = fix_math_operand(src1);
1641
1642 inst = emit(opcode, dst, src0, src1);
1643 } else {
1644 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1645 * "Message Payload":
1646 *
1647 * "Operand0[7]. For the INT DIV functions, this operand is the
1648 * denominator."
1649 * ...
1650 * "Operand1[7]. For the INT DIV functions, this operand is the
1651 * numerator."
1652 */
1653 bool is_int_div = opcode != SHADER_OPCODE_POW;
1654 fs_reg &op0 = is_int_div ? src1 : src0;
1655 fs_reg &op1 = is_int_div ? src0 : src1;
1656
1657 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1658 inst = emit(opcode, dst, op0, reg_null_f);
1659
1660 inst->base_mrf = base_mrf;
1661 inst->mlen = 2 * dispatch_width / 8;
1662 }
1663 return inst;
1664 }
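/* Editor's note (illustrative, not upstream code): on the gen4/5 path above
 * the operands of an integer division are swapped so that the denominator
 * (src1) becomes Operand0 and the numerator (src0) is written to the message
 * register as Operand1, matching the PRM quote; POW keeps the natural
 * src0/src1 order.
 */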
1665
1666 void
1667 fs_visitor::assign_curb_setup()
1668 {
1669 if (dispatch_width == 8) {
1670 prog_data->dispatch_grf_start_reg = payload.num_regs;
1671 } else {
1672 assert(stage == MESA_SHADER_FRAGMENT);
1673 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1674 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1675 }
1676
1677 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1678
1679 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1680 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1681 for (unsigned int i = 0; i < inst->sources; i++) {
1682 if (inst->src[i].file == UNIFORM) {
1683 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1684 int constant_nr;
1685 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1686 constant_nr = push_constant_loc[uniform_nr];
1687 } else {
1688 /* Section 5.11 of the OpenGL 4.1 spec says:
1689 * "Out-of-bounds reads return undefined values, which include
1690 * values from other variables of the active program or zero."
1691 * Just return the first push constant.
1692 */
1693 constant_nr = 0;
1694 }
1695
1696 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1697 constant_nr / 8,
1698 constant_nr % 8);
1699
1700 inst->src[i].file = HW_REG;
1701 inst->src[i].fixed_hw_reg = byte_offset(
1702 retype(brw_reg, inst->src[i].type),
1703 inst->src[i].subreg_offset);
1704 }
1705 }
1706 }
1707 }
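/* Editor's note (worked example, not upstream code): the push-constant
 * mapping above packs eight floats per GRF, so e.g. constant_nr = 11 lands
 * at
 *
 *    brw_vec1_grf(payload.num_regs + 11 / 8, 11 % 8)
 *
 * i.e. channel 3 of the second CURB register after the payload.
 */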
1708
1709 void
1710 fs_visitor::calculate_urb_setup()
1711 {
1712 assert(stage == MESA_SHADER_FRAGMENT);
1713 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1714 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1715
1716 memset(prog_data->urb_setup, -1,
1717 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1718
1719 int urb_next = 0;
1720 /* Figure out where each of the incoming setup attributes lands. */
1721 if (brw->gen >= 6) {
1722 if (_mesa_bitcount_64(prog->InputsRead &
1723 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1724 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1725 * first 16 varying inputs, so we can put them wherever we want.
1726 * Just put them in order.
1727 *
1728 * This is useful because it means that (a) inputs not used by the
1729 * fragment shader won't take up valuable register space, and (b) we
1730 * won't have to recompile the fragment shader if it gets paired with
1731 * a different vertex (or geometry) shader.
1732 */
1733 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1734 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1735 BITFIELD64_BIT(i)) {
1736 prog_data->urb_setup[i] = urb_next++;
1737 }
1738 }
1739 } else {
1740 /* We have enough input varyings that the SF/SBE pipeline stage can't
1741 * arbitrarily rearrange them to suit our whim; we have to put them
1742 * in an order that matches the output of the previous pipeline stage
1743 * (geometry or vertex shader).
1744 */
1745 struct brw_vue_map prev_stage_vue_map;
1746 brw_compute_vue_map(brw, &prev_stage_vue_map,
1747 key->input_slots_valid);
1748 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1749 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1750 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1751 slot++) {
1752 int varying = prev_stage_vue_map.slot_to_varying[slot];
1753 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1754 * unused.
1755 */
1756 if (varying != BRW_VARYING_SLOT_COUNT &&
1757 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1758 BITFIELD64_BIT(varying))) {
1759 prog_data->urb_setup[varying] = slot - first_slot;
1760 }
1761 }
1762 urb_next = prev_stage_vue_map.num_slots - first_slot;
1763 }
1764 } else {
1765 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1766 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1767 /* Point size is packed into the header, not as a general attribute */
1768 if (i == VARYING_SLOT_PSIZ)
1769 continue;
1770
1771 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1772 /* The back color slot is skipped when the front color is
1773 * also written to. In addition, some slots can be
1774 * written in the vertex shader and not read in the
1775 * fragment shader. So the register number must always be
1776 * incremented, mapped or not.
1777 */
1778 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1779 prog_data->urb_setup[i] = urb_next;
1780 urb_next++;
1781 }
1782 }
1783
1784 /*
1785 * It's an FS-only attribute, and we did the interpolation for this attribute
1786 * in the SF thread. So count it here, too.
1787 *
1788 * See compile_sf_prog() for more info.
1789 */
1790 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1791 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1792 }
1793
1794 prog_data->num_varying_inputs = urb_next;
1795 }
1796
1797 void
1798 fs_visitor::assign_urb_setup()
1799 {
1800 assert(stage == MESA_SHADER_FRAGMENT);
1801 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1802
1803 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1804
1805 /* Offset all the urb_setup[] index by the actual position of the
1806 * setup regs, now that the location of the constants has been chosen.
1807 */
1808 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1809 if (inst->opcode == FS_OPCODE_LINTERP) {
1810 assert(inst->src[2].file == HW_REG);
1811 inst->src[2].fixed_hw_reg.nr += urb_start;
1812 }
1813
1814 if (inst->opcode == FS_OPCODE_CINTERP) {
1815 assert(inst->src[0].file == HW_REG);
1816 inst->src[0].fixed_hw_reg.nr += urb_start;
1817 }
1818 }
1819
1820 /* Each attribute is 4 setup channels, each of which is half a reg. */
1821 this->first_non_payload_grf =
1822 urb_start + prog_data->num_varying_inputs * 2;
1823 }
1824
1825 /**
1826 * Split large virtual GRFs into separate components if we can.
1827 *
1828 * This is mostly duplicated with what brw_fs_vector_splitting does,
1829 * but that's really conservative because it's afraid of doing
1830 * splitting that doesn't result in real progress after the rest of
1831 * the optimization phases, which would cause infinite looping in
1832 * optimization. We can do it once here, safely. This also has the
1833 * opportunity to split interpolated values, or maybe even uniforms,
1834 * which we don't have at the IR level.
1835 *
1836 * We want to split, because virtual GRFs are what we register
1837 * allocate and spill (due to contiguousness requirements for some
1838 * instructions), and they're what we naturally generate in the
1839 * codegen process, but most virtual GRFs don't actually need to be
1840 * contiguous sets of GRFs. If we split, we'll end up with reduced
1841 * live intervals and better dead code elimination and coalescing.
1842 */
1843 void
1844 fs_visitor::split_virtual_grfs()
1845 {
1846 int num_vars = this->virtual_grf_count;
1847
1848 /* Count the total number of registers */
1849 int reg_count = 0;
1850 int vgrf_to_reg[num_vars];
1851 for (int i = 0; i < num_vars; i++) {
1852 vgrf_to_reg[i] = reg_count;
1853 reg_count += virtual_grf_sizes[i];
1854 }
1855
1856 /* An array of "split points". For each register slot, this indicates
1857 * if this slot can be separated from the previous slot. Every time an
1858 * instruction uses multiple elements of a register (as a source or
1859 * destination), we mark the used slots as inseparable. Then we go
1860 * through and split the registers into the smallest pieces we can.
1861 */
1862 bool split_points[reg_count];
1863 memset(split_points, 0, sizeof(split_points));
1864
1865 /* Mark all used registers as fully splittable */
1866 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1867 if (inst->dst.file == GRF) {
1868 int reg = vgrf_to_reg[inst->dst.reg];
1869 for (int j = 1; j < this->virtual_grf_sizes[inst->dst.reg]; j++)
1870 split_points[reg + j] = true;
1871 }
1872
1873 for (int i = 0; i < inst->sources; i++) {
1874 if (inst->src[i].file == GRF) {
1875 int reg = vgrf_to_reg[inst->src[i].reg];
1876 for (int j = 1; j < this->virtual_grf_sizes[inst->src[i].reg]; j++)
1877 split_points[reg + j] = true;
1878 }
1879 }
1880 }
1881
1882 if (brw->has_pln &&
1883 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1884 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1885 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1886 * Gen6, that was the only supported interpolation mode, and since Gen6,
1887 * delta_x and delta_y are in fixed hardware registers.
1888 */
1889 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1890 split_points[vgrf_to_reg[vgrf] + 1] = false;
1891 }
1892
1893 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1894 if (inst->dst.file == GRF) {
1895 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1896 for (int j = 1; j < inst->regs_written; j++)
1897 split_points[reg + j] = false;
1898 }
1899 for (int i = 0; i < inst->sources; i++) {
1900 if (inst->src[i].file == GRF) {
1901 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1902 for (int j = 1; j < inst->regs_read(this, i); j++)
1903 split_points[reg + j] = false;
1904 }
1905 }
1906 }
1907
1908 int new_virtual_grf[reg_count];
1909 int new_reg_offset[reg_count];
1910
1911 int reg = 0;
1912 for (int i = 0; i < num_vars; i++) {
1913 /* The first slot is never a split point; assert that as a quick sanity check. */
1914 assert(split_points[reg] == false);
1915
1916 /* j = 0 case */
1917 new_reg_offset[reg] = 0;
1918 reg++;
1919 int offset = 1;
1920
1921 /* j > 0 case */
1922 for (int j = 1; j < virtual_grf_sizes[i]; j++) {
1923 /* If this is a split point, reset the offset to 0 and allocate a
1924 * new virtual GRF for the preceding 'offset' registers
1925 */
1926 if (split_points[reg]) {
1927 int grf = virtual_grf_alloc(offset);
1928 for (int k = reg - offset; k < reg; k++)
1929 new_virtual_grf[k] = grf;
1930 offset = 0;
1931 }
1932 new_reg_offset[reg] = offset;
1933 offset++;
1934 reg++;
1935 }
1936
1937 /* The last one gets the original register number */
1938 virtual_grf_sizes[i] = offset;
1939 for (int k = reg - offset; k < reg; k++)
1940 new_virtual_grf[k] = i;
1941 }
1942 assert(reg == reg_count);
1943
1944 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1945 if (inst->dst.file == GRF) {
1946 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1947 inst->dst.reg = new_virtual_grf[reg];
1948 inst->dst.reg_offset = new_reg_offset[reg];
1949 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
1950 }
1951 for (int i = 0; i < inst->sources; i++) {
1952 if (inst->src[i].file == GRF) {
1953 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1954 inst->src[i].reg = new_virtual_grf[reg];
1955 inst->src[i].reg_offset = new_reg_offset[reg];
1956 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
1957 }
1958 }
1959 }
1960 invalidate_live_intervals();
1961 }
1962
1963 /**
1964 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1965 *
1966 * During code generation, we create tons of temporary variables, many of
1967 * which get immediately killed and are never used again. Yet, in later
1968 * optimization and analysis passes, such as compute_live_intervals, we need
1969 * to loop over all the virtual GRFs. Compacting them can save a lot of
1970 * overhead.
1971 */
1972 bool
1973 fs_visitor::compact_virtual_grfs()
1974 {
1975 bool progress = false;
1976 int remap_table[this->virtual_grf_count];
1977 memset(remap_table, -1, sizeof(remap_table));
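/* remap_table[v] stays -1 for vgrfs that no instruction reads or writes;
 * for the live ones it becomes the new, densely packed index.  E.g. if
 * only vgrfs 0, 2 and 5 are used, they are renumbered to 0, 1 and 2 and
 * virtual_grf_count drops to 3.
 */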
1978
1979 /* Mark which virtual GRFs are used. */
1980 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1981 if (inst->dst.file == GRF)
1982 remap_table[inst->dst.reg] = 0;
1983
1984 for (int i = 0; i < inst->sources; i++) {
1985 if (inst->src[i].file == GRF)
1986 remap_table[inst->src[i].reg] = 0;
1987 }
1988 }
1989
1990 /* Compact the GRF arrays. */
1991 int new_index = 0;
1992 for (int i = 0; i < this->virtual_grf_count; i++) {
1993 if (remap_table[i] == -1) {
1994 /* We just found an unused register. This means that we are
1995 * actually going to compact something.
1996 */
1997 progress = true;
1998 } else {
1999 remap_table[i] = new_index;
2000 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
2001 invalidate_live_intervals();
2002 ++new_index;
2003 }
2004 }
2005
2006 this->virtual_grf_count = new_index;
2007
2008 /* Patch all the instructions to use the newly renumbered registers */
2009 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2010 if (inst->dst.file == GRF)
2011 inst->dst.reg = remap_table[inst->dst.reg];
2012
2013 for (int i = 0; i < inst->sources; i++) {
2014 if (inst->src[i].file == GRF)
2015 inst->src[i].reg = remap_table[inst->src[i].reg];
2016 }
2017 }
2018
2019 /* Patch all the references to delta_x/delta_y, since they're used in
2020 * register allocation. If they're unused, switch them to BAD_FILE so
2021 * we don't think some random VGRF is delta_x/delta_y.
2022 */
2023 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2024 if (delta_x[i].file == GRF) {
2025 if (remap_table[delta_x[i].reg] != -1) {
2026 delta_x[i].reg = remap_table[delta_x[i].reg];
2027 } else {
2028 delta_x[i].file = BAD_FILE;
2029 }
2030 }
2031 }
2032 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2033 if (delta_y[i].file == GRF) {
2034 if (remap_table[delta_y[i].reg] != -1) {
2035 delta_y[i].reg = remap_table[delta_y[i].reg];
2036 } else {
2037 delta_y[i].file = BAD_FILE;
2038 }
2039 }
2040 }
2041
2042 return progress;
2043 }
2044
2045 /*
2046 * Implements array access of uniforms by inserting a
2047 * PULL_CONSTANT_LOAD instruction.
2048 *
2049 * Unlike temporary GRF array access (which we don't support, due to
2050 * the difficulty of doing relative addressing on instruction
2051 * destinations), we could potentially do array access of uniforms
2052 * that were loaded in GRF space as push constants. In real-world
2053 * usage we've seen, though, the arrays being used are always larger
2054 * than we could load as push constants, so just always move all
2055 * uniform array access out to a pull constant buffer.
2056 */
2057 void
2058 fs_visitor::move_uniform_array_access_to_pull_constants()
2059 {
2060 if (dispatch_width != 8)
2061 return;
2062
2063 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2064 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
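/* pull_constant_loc[u] == -1 means uniform slot u has not been demoted;
 * once an indirectly addressed array is found below, every element of
 * that array records the index of its copy in pull_param[].
 */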
2065
2066 /* Walk through and find array access of uniforms. Put a copy of that
2067 * uniform in the pull constant buffer.
2068 *
2069 * Note that we don't move constant-indexed accesses to arrays. No
2070 * testing has been done of the performance impact of this choice.
2071 */
2072 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2073 for (int i = 0 ; i < inst->sources; i++) {
2074 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2075 continue;
2076
2077 int uniform = inst->src[i].reg;
2078
2079 /* If this array isn't already present in the pull constant buffer,
2080 * add it.
2081 */
2082 if (pull_constant_loc[uniform] == -1) {
2083 const gl_constant_value **values = &stage_prog_data->param[uniform];
2084
2085 assert(param_size[uniform]);
2086
2087 for (int j = 0; j < param_size[uniform]; j++) {
2088 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2089
2090 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2091 values[j];
2092 }
2093 }
2094 }
2095 }
2096 }
2097
2098 /**
2099 * Assign UNIFORM file registers to either push constants or pull constants.
2100 *
2101 * We allow a fragment shader to have more than the spec's minimum maximum
2102 * number of fragment shader uniform components (64).  If there are too
2103 * many of these, they'd fill up all of the register space.
2104 * So, this will push some of them out to the pull constant buffer and
2105 * update the program to load them.
2106 */
2107 void
2108 fs_visitor::assign_constant_locations()
2109 {
2110 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2111 if (dispatch_width != 8)
2112 return;
2113
2114 /* Find which UNIFORM registers are still in use. */
2115 bool is_live[uniforms];
2116 for (unsigned int i = 0; i < uniforms; i++) {
2117 is_live[i] = false;
2118 }
2119
2120 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2121 for (int i = 0; i < inst->sources; i++) {
2122 if (inst->src[i].file != UNIFORM)
2123 continue;
2124
2125 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2126 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2127 is_live[constant_nr] = true;
2128 }
2129 }
2130
2131 /* Only allow 16 registers (128 uniform components) as push constants.
2132 *
2133 * Just demote the end of the list. We could probably do better
2134 * here, demoting things that are rarely used in the program first.
2135 *
2136 * If changing this value, note the limitation about total_regs in
2137 * brw_curbe.c.
2138 */
2139 unsigned int max_push_components = 16 * 8;
2140 unsigned int num_push_constants = 0;
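/* E.g. a shader with 200 live uniform components keeps the first 128 it
 * walks over below as push constants and demotes the remaining 72 to the
 * pull constant buffer.
 */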
2141
2142 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2143
2144 for (unsigned int i = 0; i < uniforms; i++) {
2145 if (!is_live[i] || pull_constant_loc[i] != -1) {
2146 /* This UNIFORM register is either dead, or has already been demoted
2147 * to a pull const. Mark it as no longer living in the param[] array.
2148 */
2149 push_constant_loc[i] = -1;
2150 continue;
2151 }
2152
2153 if (num_push_constants < max_push_components) {
2154 /* Retain as a push constant.  Record the location in the param[]
2155 * array.
2156 */
2157 push_constant_loc[i] = num_push_constants++;
2158 } else {
2159 /* Demote to a pull constant. */
2160 push_constant_loc[i] = -1;
2161
2162 int pull_index = stage_prog_data->nr_pull_params++;
2163 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2164 pull_constant_loc[i] = pull_index;
2165 }
2166 }
2167
2168 stage_prog_data->nr_params = num_push_constants;
2169
2170 /* Up until now, the param[] array has been indexed by reg + reg_offset
2171 * of UNIFORM registers. Condense it to only contain the uniforms we
2172 * chose to upload as push constants.
2173 */
2174 for (unsigned int i = 0; i < uniforms; i++) {
2175 int remapped = push_constant_loc[i];
2176
2177 if (remapped == -1)
2178 continue;
2179
2180 assert(remapped <= (int)i);
2181 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2182 }
2183 }
2184
2185 /**
2186 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2187 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2188 */
2189 void
2190 fs_visitor::demote_pull_constants()
2191 {
2192 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2193 for (int i = 0; i < inst->sources; i++) {
2194 if (inst->src[i].file != UNIFORM)
2195 continue;
2196
2197 int pull_index = pull_constant_loc[inst->src[i].reg +
2198 inst->src[i].reg_offset];
2199 if (pull_index == -1)
2200 continue;
2201
2202 /* Set up the annotation tracking for newly generated instructions. */
2203 base_ir = inst->ir;
2204 current_annotation = inst->annotation;
2205
2206 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2207 fs_reg dst = fs_reg(this, glsl_type::float_type);
2208
2209 /* Generate a pull load into dst. */
2210 if (inst->src[i].reladdr) {
2211 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2212 surf_index,
2213 *inst->src[i].reladdr,
2214 pull_index);
2215 inst->insert_before(block, &list);
2216 inst->src[i].reladdr = NULL;
2217 } else {
2218 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
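/* The pull constant buffer is read in vec4-aligned chunks.  E.g. for
 * pull_index 5 this computes byte offset 16 (the aligned vec4 holding
 * components 4..7), and the set_smear(5 & 3) below then selects
 * component 1 of that vec4, i.e. component 5 overall.
 */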
2219 fs_inst *pull =
2220 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2221 dst, surf_index, offset);
2222 inst->insert_before(block, pull);
2223 inst->src[i].set_smear(pull_index & 3);
2224 }
2225
2226 /* Rewrite the instruction to use the temporary VGRF. */
2227 inst->src[i].file = GRF;
2228 inst->src[i].reg = dst.reg;
2229 inst->src[i].reg_offset = 0;
2230 inst->src[i].width = dispatch_width;
2231 }
2232 }
2233 invalidate_live_intervals();
2234 }
2235
2236 bool
2237 fs_visitor::opt_algebraic()
2238 {
2239 bool progress = false;
2240
2241 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2242 switch (inst->opcode) {
2243 case BRW_OPCODE_MUL:
2244 if (inst->src[1].file != IMM)
2245 continue;
2246
2247 /* a * 1.0 = a */
2248 if (inst->src[1].is_one()) {
2249 inst->opcode = BRW_OPCODE_MOV;
2250 inst->src[1] = reg_undef;
2251 progress = true;
2252 break;
2253 }
2254
2255 /* a * 0.0 = 0.0 */
2256 if (inst->src[1].is_zero()) {
2257 inst->opcode = BRW_OPCODE_MOV;
2258 inst->src[0] = inst->src[1];
2259 inst->src[1] = reg_undef;
2260 progress = true;
2261 break;
2262 }
2263
2264 break;
2265 case BRW_OPCODE_ADD:
2266 if (inst->src[1].file != IMM)
2267 continue;
2268
2269 /* a + 0.0 = a */
2270 if (inst->src[1].is_zero()) {
2271 inst->opcode = BRW_OPCODE_MOV;
2272 inst->src[1] = reg_undef;
2273 progress = true;
2274 break;
2275 }
2276 break;
2277 case BRW_OPCODE_OR:
2278 if (inst->src[0].equals(inst->src[1])) {
2279 inst->opcode = BRW_OPCODE_MOV;
2280 inst->src[1] = reg_undef;
2281 progress = true;
2282 break;
2283 }
2284 break;
2285 case BRW_OPCODE_LRP:
2286 if (inst->src[1].equals(inst->src[2])) {
2287 inst->opcode = BRW_OPCODE_MOV;
2288 inst->src[0] = inst->src[1];
2289 inst->src[1] = reg_undef;
2290 inst->src[2] = reg_undef;
2291 progress = true;
2292 break;
2293 }
2294 break;
2295 case BRW_OPCODE_SEL:
2296 if (inst->src[0].equals(inst->src[1])) {
2297 inst->opcode = BRW_OPCODE_MOV;
2298 inst->src[1] = reg_undef;
2299 inst->predicate = BRW_PREDICATE_NONE;
2300 inst->predicate_inverse = false;
2301 progress = true;
2302 } else if (inst->saturate && inst->src[1].file == IMM) {
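/* With saturate set, a SEL against a constant can be a no-op: sel.l/le
 * against an immediate >= 1.0 (or sel.ge/g against one <= 0.0) clamps to
 * the same [0, 1] result as a plain saturating MOV of src0, which is
 * what the rewrites below produce.
 */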
2303 switch (inst->conditional_mod) {
2304 case BRW_CONDITIONAL_LE:
2305 case BRW_CONDITIONAL_L:
2306 switch (inst->src[1].type) {
2307 case BRW_REGISTER_TYPE_F:
2308 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2309 inst->opcode = BRW_OPCODE_MOV;
2310 inst->src[1] = reg_undef;
2311 progress = true;
2312 }
2313 break;
2314 default:
2315 break;
2316 }
2317 break;
2318 case BRW_CONDITIONAL_GE:
2319 case BRW_CONDITIONAL_G:
2320 switch (inst->src[1].type) {
2321 case BRW_REGISTER_TYPE_F:
2322 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2323 inst->opcode = BRW_OPCODE_MOV;
2324 inst->src[1] = reg_undef;
2325 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2326 progress = true;
2327 }
2328 break;
2329 default:
2330 break;
2331 }
2332 default:
2333 break;
2334 }
2335 }
2336 break;
2337 case SHADER_OPCODE_RCP: {
2338 fs_inst *prev = (fs_inst *)inst->prev;
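/* A SQRT immediately followed by an RCP of its result is 1/sqrt(x), so
 * turn the RCP into a single RSQ reading the SQRT's source.  Only the
 * directly preceding instruction is considered, and the SQRT itself is
 * left for dead code elimination if nothing else uses it.
 */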
2339 if (prev->opcode == SHADER_OPCODE_SQRT) {
2340 if (inst->src[0].equals(prev->dst)) {
2341 inst->opcode = SHADER_OPCODE_RSQ;
2342 inst->src[0] = prev->src[0];
2343 progress = true;
2344 }
2345 }
2346 break;
2347 }
2348 default:
2349 break;
2350 }
2351 }
2352
2353 return progress;
2354 }
2355
2356 bool
2357 fs_visitor::opt_register_renaming()
2358 {
2359 bool progress = false;
2360 int depth = 0;
2361
2362 int remap[virtual_grf_count];
2363 memset(remap, -1, sizeof(int) * virtual_grf_count);
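/* remap[v] == -1 means no qualifying whole-register def of vgrf v has
 * been seen yet; remap[v] == v means exactly one has; any other value is
 * the fresh vgrf that the most recent def was renamed to and that later
 * reads and writes of v should use instead.
 */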
2364
2365 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2366 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2367 depth++;
2368 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2369 inst->opcode == BRW_OPCODE_WHILE) {
2370 depth--;
2371 }
2372
2373 /* Rewrite instruction sources. */
2374 for (int i = 0; i < inst->sources; i++) {
2375 if (inst->src[i].file == GRF &&
2376 remap[inst->src[i].reg] != -1 &&
2377 remap[inst->src[i].reg] != inst->src[i].reg) {
2378 inst->src[i].reg = remap[inst->src[i].reg];
2379 progress = true;
2380 }
2381 }
2382
2383 const int dst = inst->dst.reg;
2384
2385 if (depth == 0 &&
2386 inst->dst.file == GRF &&
2387 virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 &&
2388 !inst->is_partial_write()) {
2389 if (remap[dst] == -1) {
2390 remap[dst] = dst;
2391 } else {
2392 remap[dst] = virtual_grf_alloc(inst->dst.width / 8);
2393 inst->dst.reg = remap[dst];
2394 progress = true;
2395 }
2396 } else if (inst->dst.file == GRF &&
2397 remap[dst] != -1 &&
2398 remap[dst] != dst) {
2399 inst->dst.reg = remap[dst];
2400 progress = true;
2401 }
2402 }
2403
2404 if (progress) {
2405 invalidate_live_intervals();
2406
2407 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2408 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2409 delta_x[i].reg = remap[delta_x[i].reg];
2410 }
2411 }
2412 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2413 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2414 delta_y[i].reg = remap[delta_y[i].reg];
2415 }
2416 }
2417 }
2418
2419 return progress;
2420 }
2421
2422 bool
2423 fs_visitor::compute_to_mrf()
2424 {
2425 bool progress = false;
2426 int next_ip = 0;
2427
2428 calculate_live_intervals();
2429
2430 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2431 int ip = next_ip;
2432 next_ip++;
2433
2434 if (inst->opcode != BRW_OPCODE_MOV ||
2435 inst->is_partial_write() ||
2436 inst->dst.file != MRF || inst->src[0].file != GRF ||
2437 inst->dst.type != inst->src[0].type ||
2438 inst->src[0].abs || inst->src[0].negate ||
2439 !inst->src[0].is_contiguous() ||
2440 inst->src[0].subreg_offset)
2441 continue;
2442
2443 /* Work out which hardware MRF registers are written by this
2444 * instruction.
2445 */
2446 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2447 int mrf_high;
2448 if (inst->dst.reg & BRW_MRF_COMPR4) {
2449 mrf_high = mrf_low + 4;
2450 } else if (inst->exec_size == 16) {
2451 mrf_high = mrf_low + 1;
2452 } else {
2453 mrf_high = mrf_low;
2454 }
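/* The two endpoints track the MRFs this instruction touches: a SIMD16
 * write to m(n) also hits m(n+1), while a COMPR4 write hits m(n) and
 * m(n+4).
 */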
2455
2456 /* Can't compute-to-MRF this GRF if someone else was going to
2457 * read it later.
2458 */
2459 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2460 continue;
2461
2462 /* Found a move of a GRF to an MRF.  Let's see if we can rewrite
2463 * the instruction that generated this GRF to write into the MRF instead.
2464 */
2465 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2466 if (scan_inst->dst.file == GRF &&
2467 scan_inst->dst.reg == inst->src[0].reg) {
2468 /* Found the last thing to write our reg we want to turn
2469 * into a compute-to-MRF.
2470 */
2471
2472 /* If this one instruction didn't populate all the
2473 * channels, bail. We might be able to rewrite everything
2474 * that writes that reg, but it would require smarter
2475 * tracking to delay the rewriting until complete success.
2476 */
2477 if (scan_inst->is_partial_write())
2478 break;
2479
2480 /* Things returning more than one register would need us to
2481 * understand coalescing out more than one MOV at a time.
2482 */
2483 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2484 break;
2485
2486 /* SEND instructions can't have MRF as a destination. */
2487 if (scan_inst->mlen)
2488 break;
2489
2490 if (brw->gen == 6) {
2491 /* gen6 math instructions must have the destination be
2492 * GRF, so no compute-to-MRF for them.
2493 */
2494 if (scan_inst->is_math()) {
2495 break;
2496 }
2497 }
2498
2499 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2500 /* Found the creator of our MRF's source value. */
2501 scan_inst->dst.file = MRF;
2502 scan_inst->dst.reg = inst->dst.reg;
2503 scan_inst->saturate |= inst->saturate;
2504 inst->remove(block);
2505 progress = true;
2506 }
2507 break;
2508 }
2509
2510 /* We don't handle control flow here. Most computation of
2511 * values that end up in MRFs happens shortly before the MRF
2512 * write anyway.
2513 */
2514 if (block->start() == scan_inst)
2515 break;
2516
2517 /* You can't read from an MRF, so if someone else reads our
2518 * MRF's source GRF that we wanted to rewrite, that stops us.
2519 */
2520 bool interfered = false;
2521 for (int i = 0; i < scan_inst->sources; i++) {
2522 if (scan_inst->src[i].file == GRF &&
2523 scan_inst->src[i].reg == inst->src[0].reg &&
2524 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2525 interfered = true;
2526 }
2527 }
2528 if (interfered)
2529 break;
2530
2531 if (scan_inst->dst.file == MRF) {
2532 /* If somebody else writes our MRF here, we can't
2533 * compute-to-MRF before that.
2534 */
2535 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2536 int scan_mrf_high;
2537
2538 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2539 scan_mrf_high = scan_mrf_low + 4;
2540 } else if (scan_inst->exec_size == 16) {
2541 scan_mrf_high = scan_mrf_low + 1;
2542 } else {
2543 scan_mrf_high = scan_mrf_low;
2544 }
2545
2546 if (mrf_low == scan_mrf_low ||
2547 mrf_low == scan_mrf_high ||
2548 mrf_high == scan_mrf_low ||
2549 mrf_high == scan_mrf_high) {
2550 break;
2551 }
2552 }
2553
2554 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2555 /* Found a SEND instruction, which means that there are
2556 * live values in MRFs from base_mrf to base_mrf +
2557 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2558 * above it.
2559 */
2560 if (mrf_low >= scan_inst->base_mrf &&
2561 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2562 break;
2563 }
2564 if (mrf_high >= scan_inst->base_mrf &&
2565 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2566 break;
2567 }
2568 }
2569 }
2570 }
2571
2572 if (progress)
2573 invalidate_live_intervals();
2574
2575 return progress;
2576 }
2577
2578 /**
2579 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2580 * instructions to FS_OPCODE_REP_FB_WRITE.
2581 */
2582 void
2583 fs_visitor::emit_repclear_shader()
2584 {
2585 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2586 int base_mrf = 1;
2587 int color_mrf = base_mrf + 2;
2588
2589 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2590 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2591 mov->force_writemask_all = true;
2592
2593 fs_inst *write;
2594 if (key->nr_color_regions == 1) {
2595 write = emit(FS_OPCODE_REP_FB_WRITE);
2596 write->saturate = key->clamp_fragment_color;
2597 write->base_mrf = color_mrf;
2598 write->target = 0;
2599 write->header_present = false;
2600 write->mlen = 1;
2601 } else {
2602 for (int i = 0; i < key->nr_color_regions; ++i) {
2603 write = emit(FS_OPCODE_REP_FB_WRITE);
2604 write->saturate = key->clamp_fragment_color;
2605 write->base_mrf = base_mrf;
2606 write->target = i;
2607 write->header_present = true;
2608 write->mlen = 3;
2609 }
2610 }
2611 write->eot = true;
2612
2613 calculate_cfg();
2614
2615 assign_constant_locations();
2616 assign_curb_setup();
2617
2618 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2619 assert(mov->src[0].file == HW_REG);
2620 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2621 }
2622
2623 /**
2624 * Walks through basic blocks, looking for repeated MRF writes and
2625 * removing the later ones.
2626 */
2627 bool
2628 fs_visitor::remove_duplicate_mrf_writes()
2629 {
2630 fs_inst *last_mrf_move[16];
2631 bool progress = false;
2632
2633 /* We'd need to update the MRF tracking for compressed instructions. */
2634 if (dispatch_width == 16)
2635 return false;
2636
2637 memset(last_mrf_move, 0, sizeof(last_mrf_move));
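/* last_mrf_move[m] points at the most recent complete GRF-to-MRF MOV
 * targeting m.  It is cleared on control flow, on any other write to m,
 * on implied MRF writes from SEND messages, and when the MOV's source
 * GRF is overwritten.
 */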
2638
2639 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2640 if (inst->is_control_flow()) {
2641 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2642 }
2643
2644 if (inst->opcode == BRW_OPCODE_MOV &&
2645 inst->dst.file == MRF) {
2646 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2647 if (prev_inst && inst->equals(prev_inst)) {
2648 inst->remove(block);
2649 progress = true;
2650 continue;
2651 }
2652 }
2653
2654 /* Clear out the last-write records for MRFs that were overwritten. */
2655 if (inst->dst.file == MRF) {
2656 last_mrf_move[inst->dst.reg] = NULL;
2657 }
2658
2659 if (inst->mlen > 0 && inst->base_mrf != -1) {
2660 /* Found a SEND instruction, which will include two or fewer
2661 * implied MRF writes. We could do better here.
2662 */
2663 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2664 last_mrf_move[inst->base_mrf + i] = NULL;
2665 }
2666 }
2667
2668 /* Clear out any MRF move records whose sources got overwritten. */
2669 if (inst->dst.file == GRF) {
2670 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2671 if (last_mrf_move[i] &&
2672 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2673 last_mrf_move[i] = NULL;
2674 }
2675 }
2676 }
2677
2678 if (inst->opcode == BRW_OPCODE_MOV &&
2679 inst->dst.file == MRF &&
2680 inst->src[0].file == GRF &&
2681 !inst->is_partial_write()) {
2682 last_mrf_move[inst->dst.reg] = inst;
2683 }
2684 }
2685
2686 if (progress)
2687 invalidate_live_intervals();
2688
2689 return progress;
2690 }
2691
2692 static void
2693 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2694 int first_grf, int grf_len)
2695 {
2696 /* Clear the flag for registers that actually got read (as expected). */
2697 for (int i = 0; i < inst->sources; i++) {
2698 int grf;
2699 if (inst->src[i].file == GRF) {
2700 grf = inst->src[i].reg;
2701 } else if (inst->src[i].file == HW_REG &&
2702 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2703 grf = inst->src[i].fixed_hw_reg.nr;
2704 } else {
2705 continue;
2706 }
2707
2708 if (grf >= first_grf &&
2709 grf < first_grf + grf_len) {
2710 deps[grf - first_grf] = false;
2711 if (inst->exec_size == 16)
2712 deps[grf - first_grf + 1] = false;
2713 }
2714 }
2715 }
2716
2717 /**
2718 * Implements this workaround for the original 965:
2719 *
2720 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2721 * check for post destination dependencies on this instruction, software
2722 * must ensure that there is no destination hazard for the case of ‘write
2723 * followed by a posted write’ shown in the following example.
2724 *
2725 * 1. mov r3 0
2726 * 2. send r3.xy <rest of send instruction>
2727 * 3. mov r2 r3
2728 *
2729 * Due to no post-destination dependency check on the ‘send’, the above
2730 * code sequence could have two instructions (1 and 2) in flight at the
2731 * same time that both consider ‘r3’ as the target of their final writes.
2732 */
2733 void
2734 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2735 fs_inst *inst)
2736 {
2737 int write_len = inst->regs_written;
2738 int first_write_grf = inst->dst.reg;
2739 bool needs_dep[BRW_MAX_MRF];
2740 assert(write_len < (int)sizeof(needs_dep) - 1);
2741
2742 memset(needs_dep, false, sizeof(needs_dep));
2743 memset(needs_dep, true, write_len);
2744
2745 clear_deps_for_inst_src(inst, dispatch_width,
2746 needs_dep, first_write_grf, write_len);
2747
2748 /* Walk backwards looking for writes to registers we're writing which
2749 * aren't read since being written. If we hit the start of the program,
2750 * we assume that there are no outstanding dependencies on entry to the
2751 * program.
2752 */
2753 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2754 /* If we hit control flow, assume that there *are* outstanding
2755 * dependencies, and force their cleanup before our instruction.
2756 */
2757 if (block->start() == scan_inst) {
2758 for (int i = 0; i < write_len; i++) {
2759 if (needs_dep[i]) {
2760 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2761 }
2762 }
2763 return;
2764 }
2765
2766 /* We insert our reads as late as possible on the assumption that any
2767 * instruction but a MOV that might have left us an outstanding
2768 * dependency has more latency than a MOV.
2769 */
2770 if (scan_inst->dst.file == GRF) {
2771 for (int i = 0; i < scan_inst->regs_written; i++) {
2772 int reg = scan_inst->dst.reg + i;
2773
2774 if (reg >= first_write_grf &&
2775 reg < first_write_grf + write_len &&
2776 needs_dep[reg - first_write_grf]) {
2777 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2778 needs_dep[reg - first_write_grf] = false;
2779 if (scan_inst->exec_size == 16)
2780 needs_dep[reg - first_write_grf + 1] = false;
2781 }
2782 }
2783 }
2784
2785 /* Clear the flag for registers that actually got read (as expected). */
2786 clear_deps_for_inst_src(scan_inst, dispatch_width,
2787 needs_dep, first_write_grf, write_len);
2788
2789 /* Continue the loop only if we haven't resolved all the dependencies */
2790 int i;
2791 for (i = 0; i < write_len; i++) {
2792 if (needs_dep[i])
2793 break;
2794 }
2795 if (i == write_len)
2796 return;
2797 }
2798 }
2799
2800 /**
2801 * Implements this workaround for the original 965:
2802 *
2803 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2804 * used as a destination register until after it has been sourced by an
2805 * instruction with a different destination register.
2806 */
2807 void
2808 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2809 {
2810 int write_len = inst->regs_written;
2811 int first_write_grf = inst->dst.reg;
2812 bool needs_dep[BRW_MAX_MRF];
2813 assert(write_len < (int)sizeof(needs_dep) - 1);
2814
2815 memset(needs_dep, false, sizeof(needs_dep));
2816 memset(needs_dep, true, write_len);
2817 /* Walk forwards looking for writes to registers we're writing which aren't
2818 * read before being written.
2819 */
2820 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2821 /* If we hit control flow, force resolve all remaining dependencies. */
2822 if (block->end() == scan_inst) {
2823 for (int i = 0; i < write_len; i++) {
2824 if (needs_dep[i])
2825 scan_inst->insert_before(block,
2826 DEP_RESOLVE_MOV(first_write_grf + i));
2827 }
2828 return;
2829 }
2830
2831 /* Clear the flag for registers that actually got read (as expected). */
2832 clear_deps_for_inst_src(scan_inst, dispatch_width,
2833 needs_dep, first_write_grf, write_len);
2834
2835 /* We insert our reads as late as possible since they're reading the
2836 * result of a SEND, which has massive latency.
2837 */
2838 if (scan_inst->dst.file == GRF &&
2839 scan_inst->dst.reg >= first_write_grf &&
2840 scan_inst->dst.reg < first_write_grf + write_len &&
2841 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2842 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2843 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2844 }
2845
2846 /* Continue the loop only if we haven't resolved all the dependencies */
2847 int i;
2848 for (i = 0; i < write_len; i++) {
2849 if (needs_dep[i])
2850 break;
2851 }
2852 if (i == write_len)
2853 return;
2854 }
2855
2856 /* If we hit the end of the program, resolve all remaining dependencies out
2857 * of paranoia.
2858 */
2859 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2860 assert(last_inst->eot);
2861 for (int i = 0; i < write_len; i++) {
2862 if (needs_dep[i])
2863 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2864 }
2865 }
2866
2867 void
2868 fs_visitor::insert_gen4_send_dependency_workarounds()
2869 {
2870 if (brw->gen != 4 || brw->is_g4x)
2871 return;
2872
2873 bool progress = false;
2874
2875 /* Note that we're done with register allocation, so GRF fs_regs always
2876 * have a .reg_offset of 0.
2877 */
2878
2879 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2880 if (inst->mlen != 0 && inst->dst.file == GRF) {
2881 insert_gen4_pre_send_dependency_workarounds(block, inst);
2882 insert_gen4_post_send_dependency_workarounds(block, inst);
2883 progress = true;
2884 }
2885 }
2886
2887 if (progress)
2888 invalidate_live_intervals();
2889 }
2890
2891 /**
2892 * Turns the generic expression-style uniform pull constant load instruction
2893 * into a hardware-specific series of instructions for loading a pull
2894 * constant.
2895 *
2896 * The expression style allows the CSE pass before this to optimize out
2897 * repeated loads from the same offset, and gives the pre-register-allocation
2898 * scheduling full flexibility, while the conversion to native instructions
2899 * allows the post-register-allocation scheduler the best information
2900 * possible.
2901 *
2902 * Note that execution masking for setting up pull constant loads is special:
2903 * the channels that need to be written are unrelated to the current execution
2904 * mask, since a later instruction will use one of the result channels as a
2905 * source operand for all 8 or 16 of its channels.
2906 */
2907 void
2908 fs_visitor::lower_uniform_pull_constant_loads()
2909 {
2910 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2911 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2912 continue;
2913
2914 if (brw->gen >= 7) {
2915 /* The offset arg before was a vec4-aligned byte offset. We need to
2916 * turn it into a dword offset.
2917 */
2918 fs_reg const_offset_reg = inst->src[1];
2919 assert(const_offset_reg.file == IMM &&
2920 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2921 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
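/* E.g. a vec4-aligned byte offset of 32 becomes a dword offset of 8. */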
2922 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2923
2924 /* This is actually going to be a MOV, but since only the first dword
2925 * is accessed, we have a special opcode to do just that one. Note
2926 * that this needs to be an operation that will be considered a def
2927 * by live variable analysis, or register allocation will explode.
2928 */
2929 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2930 8, payload, const_offset_reg);
2931 setup->force_writemask_all = true;
2932
2933 setup->ir = inst->ir;
2934 setup->annotation = inst->annotation;
2935 inst->insert_before(block, setup);
2936
2937 /* Similarly, this will only populate the first 4 channels of the
2938 * result register (since we only use smear values from 0-3), but we
2939 * don't tell the optimizer.
2940 */
2941 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2942 inst->src[1] = payload;
2943
2944 invalidate_live_intervals();
2945 } else {
2946 /* Before register allocation, we didn't tell the scheduler about the
2947 * MRF we use. We know it's safe to use this MRF because nothing
2948 * else does except for register spill/unspill, which generates and
2949 * uses its MRF within a single IR instruction.
2950 */
2951 inst->base_mrf = 14;
2952 inst->mlen = 1;
2953 }
2954 }
2955 }
2956
2957 bool
2958 fs_visitor::lower_load_payload()
2959 {
2960 bool progress = false;
2961
2962 int vgrf_to_reg[virtual_grf_count];
2963 int reg_count = 16; /* Leave room for MRF */
2964 for (int i = 0; i < virtual_grf_count; ++i) {
2965 vgrf_to_reg[i] = reg_count;
2966 reg_count += virtual_grf_sizes[i];
2967 }
2968
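/* Flat slot numbering: MRF destinations use their own register number
 * (slots 0..15 are reserved for them above), while vgrf v's slot o maps
 * to vgrf_to_reg[v] + o, so both destination files can share the
 * metadata[] array below.
 */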
2969 struct {
2970 bool written:1; /* Whether this register has ever been written */
2971 bool force_writemask_all:1;
2972 bool force_sechalf:1;
2973 } metadata[reg_count];
2974 memset(metadata, 0, sizeof(metadata));
2975
2976 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2977 int dst_reg;
2978 if (inst->dst.file == MRF) {
2979 dst_reg = inst->dst.reg;
2980 } else if (inst->dst.file == GRF) {
2981 dst_reg = vgrf_to_reg[inst->dst.reg];
2982 }
2983
2984 if (inst->dst.file == MRF || inst->dst.file == GRF) {
2985 bool force_sechalf = inst->force_sechalf;
2986 bool toggle_sechalf = inst->dst.width == 16 &&
2987 type_sz(inst->dst.type) == 4;
2988 for (int i = 0; i < inst->regs_written; ++i) {
2989 metadata[dst_reg + i].written = true;
2990 metadata[dst_reg + i].force_sechalf = force_sechalf;
2991 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
2992 force_sechalf = (toggle_sechalf != force_sechalf);
2993 }
2994 }
2995
2996 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
2997 assert(inst->dst.file == MRF || inst->dst.file == GRF);
2998 fs_reg dst = inst->dst;
2999
3000 for (int i = 0; i < inst->sources; i++) {
3001 dst.width = inst->src[i].effective_width;
3002 dst.type = inst->src[i].type;
3003
3004 if (inst->src[i].file == BAD_FILE) {
3005 /* Do nothing but otherwise increment as normal */
3006 } else if (dst.file == MRF &&
3007 dst.width == 8 &&
3008 brw->has_compr4 &&
3009 i + 4 < inst->sources &&
3010 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3011 fs_reg compr4_dst = dst;
3012 compr4_dst.reg += BRW_MRF_COMPR4;
3013 compr4_dst.width = 16;
3014 fs_reg compr4_src = inst->src[i];
3015 compr4_src.width = 16;
3016 fs_inst *mov = MOV(compr4_dst, compr4_src);
3017 mov->force_writemask_all = true;
3018 inst->insert_before(block, mov);
3019 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3020 inst->src[i + 4].file = BAD_FILE;
3021 } else {
3022 fs_inst *mov = MOV(dst, inst->src[i]);
3023 if (inst->src[i].file == GRF) {
3024 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3025 inst->src[i].reg_offset;
3026 mov->force_sechalf = metadata[src_reg].force_sechalf;
3027 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3028 metadata[dst_reg] = metadata[src_reg];
3029 if (dst.width * type_sz(dst.type) > 32) {
3030 assert((!metadata[src_reg].written ||
3031 !metadata[src_reg].force_sechalf) &&
3032 (!metadata[src_reg + 1].written ||
3033 metadata[src_reg + 1].force_sechalf));
3034 metadata[dst_reg + 1] = metadata[src_reg + 1];
3035 }
3036 } else {
3037 metadata[dst_reg].force_writemask_all = false;
3038 metadata[dst_reg].force_sechalf = false;
3039 if (dst.width == 16) {
3040 metadata[dst_reg + 1].force_writemask_all = false;
3041 metadata[dst_reg + 1].force_sechalf = true;
3042 }
3043 }
3044 inst->insert_before(block, mov);
3045 }
3046
3047 dst = offset(dst, 1);
3048 }
3049
3050 inst->remove(block);
3051 progress = true;
3052 }
3053 }
3054
3055 if (progress)
3056 invalidate_live_intervals();
3057
3058 return progress;
3059 }
3060
3061 void
3062 fs_visitor::dump_instructions()
3063 {
3064 dump_instructions(NULL);
3065 }
3066
3067 void
3068 fs_visitor::dump_instructions(const char *name)
3069 {
3070 calculate_register_pressure();
3071 FILE *file = stderr;
3072 if (name && geteuid() != 0) {
3073 file = fopen(name, "w");
3074 if (!file)
3075 file = stderr;
3076 }
3077
3078 int ip = 0, max_pressure = 0;
3079 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3080 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3081 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3082 dump_instruction(inst, file);
3083 ++ip;
3084 }
3085 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3086
3087 if (file != stderr) {
3088 fclose(file);
3089 }
3090 }
3091
3092 void
3093 fs_visitor::dump_instruction(backend_instruction *be_inst)
3094 {
3095 dump_instruction(be_inst, stderr);
3096 }
3097
3098 void
3099 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3100 {
3101 fs_inst *inst = (fs_inst *)be_inst;
3102
3103 if (inst->predicate) {
3104 fprintf(file, "(%cf0.%d) ",
3105 inst->predicate_inverse ? '-' : '+',
3106 inst->flag_subreg);
3107 }
3108
3109 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3110 if (inst->saturate)
3111 fprintf(file, ".sat");
3112 if (inst->conditional_mod) {
3113 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3114 if (!inst->predicate &&
3115 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3116 inst->opcode != BRW_OPCODE_IF &&
3117 inst->opcode != BRW_OPCODE_WHILE))) {
3118 fprintf(file, ".f0.%d", inst->flag_subreg);
3119 }
3120 }
3121 fprintf(file, "(%d) ", inst->exec_size);
3122
3123
3124 switch (inst->dst.file) {
3125 case GRF:
3126 fprintf(file, "vgrf%d", inst->dst.reg);
3127 if (inst->dst.width != dispatch_width)
3128 fprintf(file, "@%d", inst->dst.width);
3129 if (virtual_grf_sizes[inst->dst.reg] != inst->dst.width / 8 ||
3130 inst->dst.subreg_offset)
3131 fprintf(file, "+%d.%d",
3132 inst->dst.reg_offset, inst->dst.subreg_offset);
3133 break;
3134 case MRF:
3135 fprintf(file, "m%d", inst->dst.reg);
3136 break;
3137 case BAD_FILE:
3138 fprintf(file, "(null)");
3139 break;
3140 case UNIFORM:
3141 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3142 break;
3143 case HW_REG:
3144 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3145 switch (inst->dst.fixed_hw_reg.nr) {
3146 case BRW_ARF_NULL:
3147 fprintf(file, "null");
3148 break;
3149 case BRW_ARF_ADDRESS:
3150 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3151 break;
3152 case BRW_ARF_ACCUMULATOR:
3153 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3154 break;
3155 case BRW_ARF_FLAG:
3156 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3157 inst->dst.fixed_hw_reg.subnr);
3158 break;
3159 default:
3160 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3161 inst->dst.fixed_hw_reg.subnr);
3162 break;
3163 }
3164 } else {
3165 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3166 }
3167 if (inst->dst.fixed_hw_reg.subnr)
3168 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3169 break;
3170 default:
3171 fprintf(file, "???");
3172 break;
3173 }
3174 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3175
3176 for (int i = 0; i < inst->sources; i++) {
3177 if (inst->src[i].negate)
3178 fprintf(file, "-");
3179 if (inst->src[i].abs)
3180 fprintf(file, "|");
3181 switch (inst->src[i].file) {
3182 case GRF:
3183 fprintf(file, "vgrf%d", inst->src[i].reg);
3184 if (inst->src[i].width != dispatch_width)
3185 fprintf(file, "@%d", inst->src[i].width);
3186 if (virtual_grf_sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3187 inst->src[i].subreg_offset)
3188 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3189 inst->src[i].subreg_offset);
3190 break;
3191 case MRF:
3192 fprintf(file, "***m%d***", inst->src[i].reg);
3193 break;
3194 case UNIFORM:
3195 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3196 if (inst->src[i].reladdr) {
3197 fprintf(file, "+reladdr");
3198 } else if (inst->src[i].subreg_offset) {
3199 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3200 inst->src[i].subreg_offset);
3201 }
3202 break;
3203 case BAD_FILE:
3204 fprintf(file, "(null)");
3205 break;
3206 case IMM:
3207 switch (inst->src[i].type) {
3208 case BRW_REGISTER_TYPE_F:
3209 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3210 break;
3211 case BRW_REGISTER_TYPE_D:
3212 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3213 break;
3214 case BRW_REGISTER_TYPE_UD:
3215 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3216 break;
3217 default:
3218 fprintf(file, "???");
3219 break;
3220 }
3221 break;
3222 case HW_REG:
3223 if (inst->src[i].fixed_hw_reg.negate)
3224 fprintf(file, "-");
3225 if (inst->src[i].fixed_hw_reg.abs)
3226 fprintf(file, "|");
3227 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3228 switch (inst->src[i].fixed_hw_reg.nr) {
3229 case BRW_ARF_NULL:
3230 fprintf(file, "null");
3231 break;
3232 case BRW_ARF_ADDRESS:
3233 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3234 break;
3235 case BRW_ARF_ACCUMULATOR:
3236 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3237 break;
3238 case BRW_ARF_FLAG:
3239 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3240 inst->src[i].fixed_hw_reg.subnr);
3241 break;
3242 default:
3243 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3244 inst->src[i].fixed_hw_reg.subnr);
3245 break;
3246 }
3247 } else {
3248 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3249 }
3250 if (inst->src[i].fixed_hw_reg.subnr)
3251 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3252 if (inst->src[i].fixed_hw_reg.abs)
3253 fprintf(file, "|");
3254 break;
3255 default:
3256 fprintf(file, "???");
3257 break;
3258 }
3259 if (inst->src[i].abs)
3260 fprintf(file, "|");
3261
3262 if (inst->src[i].file != IMM) {
3263 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3264 }
3265
3266 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3267 fprintf(file, ", ");
3268 }
3269
3270 fprintf(file, " ");
3271
3272 if (dispatch_width == 16 && inst->exec_size == 8) {
3273 if (inst->force_sechalf)
3274 fprintf(file, "2ndhalf ");
3275 else
3276 fprintf(file, "1sthalf ");
3277 }
3278
3279 fprintf(file, "\n");
3280 }
3281
3282 /**
3283 * Possibly returns an instruction that set up @param reg.
3284 *
3285 * Sometimes we want to take the result of some expression/variable
3286 * dereference tree and rewrite the instruction generating the result
3287 * of the tree. When processing the tree, we know that the
3288 * instructions generated are all writing temporaries that are dead
3289 * outside of this tree. So, if we have some instructions that write
3290 * a temporary, we're free to point that temp write somewhere else.
3291 *
3292 * Note that this doesn't guarantee that the instruction generated writes
3293 * only reg -- it might be the size=4 destination of a texture instruction.
3294 */
3295 fs_inst *
3296 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3297 fs_inst *end,
3298 const fs_reg &reg)
3299 {
3300 if (end == start ||
3301 end->is_partial_write() ||
3302 reg.reladdr ||
3303 !reg.equals(end->dst)) {
3304 return NULL;
3305 } else {
3306 return end;
3307 }
3308 }
3309
3310 void
3311 fs_visitor::setup_payload_gen6()
3312 {
3313 bool uses_depth =
3314 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3315 unsigned barycentric_interp_modes =
3316 (stage == MESA_SHADER_FRAGMENT) ?
3317 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3318
3319 assert(brw->gen >= 6);
3320
3321 /* R0-1: masks, pixel X/Y coordinates. */
3322 payload.num_regs = 2;
3323 /* R2: only for 32-pixel dispatch. */
3324
3325 /* R3-26: barycentric interpolation coordinates. These appear in the
3326 * same order that they appear in the brw_wm_barycentric_interp_mode
3327 * enum. Each set of coordinates occupies 2 registers if dispatch width
3328 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3329 * appear if they were enabled using the "Barycentric Interpolation
3330 * Mode" bits in WM_STATE.
3331 */
3332 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3333 if (barycentric_interp_modes & (1 << i)) {
3334 payload.barycentric_coord_reg[i] = payload.num_regs;
3335 payload.num_regs += 2;
3336 if (dispatch_width == 16) {
3337 payload.num_regs += 2;
3338 }
3339 }
3340 }
3341
3342 /* R27: interpolated depth if uses source depth */
3343 if (uses_depth) {
3344 payload.source_depth_reg = payload.num_regs;
3345 payload.num_regs++;
3346 if (dispatch_width == 16) {
3347 /* R28: interpolated depth if not SIMD8. */
3348 payload.num_regs++;
3349 }
3350 }
3351 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3352 if (uses_depth) {
3353 payload.source_w_reg = payload.num_regs;
3354 payload.num_regs++;
3355 if (dispatch_width == 16) {
3356 /* R30: interpolated W if not SIMD8. */
3357 payload.num_regs++;
3358 }
3359 }
3360
3361 if (stage == MESA_SHADER_FRAGMENT) {
3362 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3363 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3364 prog_data->uses_pos_offset = key->compute_pos_offset;
3365 /* R31: MSAA position offsets. */
3366 if (prog_data->uses_pos_offset) {
3367 payload.sample_pos_reg = payload.num_regs;
3368 payload.num_regs++;
3369 }
3370 }
3371
3372 /* R32: MSAA input coverage mask */
3373 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3374 assert(brw->gen >= 7);
3375 payload.sample_mask_in_reg = payload.num_regs;
3376 payload.num_regs++;
3377 if (dispatch_width == 16) {
3378 /* R33: input coverage mask if not SIMD8. */
3379 payload.num_regs++;
3380 }
3381 }
3382
3383 /* R34-: bary for 32-pixel. */
3384 /* R58-59: interp W for 32-pixel. */
3385
3386 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3387 source_depth_to_render_target = true;
3388 }
3389 }
3390
3391 void
3392 fs_visitor::assign_binding_table_offsets()
3393 {
3394 assert(stage == MESA_SHADER_FRAGMENT);
3395 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3396 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3397 uint32_t next_binding_table_offset = 0;
3398
3399 /* If there are no color regions, we still perform an FB write to a null
3400 * renderbuffer, which we place at surface index 0.
3401 */
3402 prog_data->binding_table.render_target_start = next_binding_table_offset;
3403 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3404
3405 assign_common_binding_table_offsets(next_binding_table_offset);
3406 }
3407
3408 void
3409 fs_visitor::calculate_register_pressure()
3410 {
3411 invalidate_live_intervals();
3412 calculate_live_intervals();
3413
3414 unsigned num_instructions = 0;
3415 foreach_block(block, cfg)
3416 num_instructions += block->instructions.length();
3417
3418 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3419
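/* Sum the size of every vgrf whose live range covers each instruction:
 * after this, regs_live_at_ip[ip] is a per-instruction estimate of how
 * many GRFs are live at ip.
 */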
3420 for (int reg = 0; reg < virtual_grf_count; reg++) {
3421 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3422 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3423 }
3424 }
3425
3426 /**
3427 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3428 *
3429 * The needs_unlit_centroid_workaround ends up producing one of these per
3430 * channel of centroid input, so it's good to clean them up.
3431 *
3432 * An assumption here is that nothing ever modifies the dispatched pixels
3433 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3434 * dictates that anyway.
3435 */
3436 void
3437 fs_visitor::opt_drop_redundant_mov_to_flags()
3438 {
3439 bool flag_mov_found[2] = {false};
3440
3441 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3442 if (inst->is_control_flow()) {
3443 memset(flag_mov_found, 0, sizeof(flag_mov_found));
3444 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3445 if (!flag_mov_found[inst->flag_subreg])
3446 flag_mov_found[inst->flag_subreg] = true;
3447 else
3448 inst->remove(block);
3449 } else if (inst->writes_flag()) {
3450 flag_mov_found[inst->flag_subreg] = false;
3451 }
3452 }
3453 }
3454
3455 bool
3456 fs_visitor::run()
3457 {
3458 sanity_param_count = prog->Parameters->NumParameters;
3459 bool allocated_without_spills;
3460
3461 assign_binding_table_offsets();
3462
3463 if (brw->gen >= 6)
3464 setup_payload_gen6();
3465 else
3466 setup_payload_gen4();
3467
3468 if (0) {
3469 emit_dummy_fs();
3470 } else if (brw->use_rep_send && dispatch_width == 16) {
3471 emit_repclear_shader();
3472 allocated_without_spills = true;
3473 } else {
3474 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3475 emit_shader_time_begin();
3476
3477 calculate_urb_setup();
3478 if (prog->InputsRead > 0) {
3479 if (brw->gen < 6)
3480 emit_interpolation_setup_gen4();
3481 else
3482 emit_interpolation_setup_gen6();
3483 }
3484
3485 /* We handle discards by keeping track of the still-live pixels in f0.1.
3486 * Initialize it with the dispatched pixels.
3487 */
3488 bool uses_kill =
3489 (stage == MESA_SHADER_FRAGMENT) &&
3490 ((brw_wm_prog_data*) this->prog_data)->uses_kill;
3491 bool alpha_test_func =
3492 (stage == MESA_SHADER_FRAGMENT) &&
3493 ((brw_wm_prog_key*) this->key)->alpha_test_func;
3494 if (uses_kill || alpha_test_func) {
3495 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3496 discard_init->flag_subreg = 1;
3497 }
3498
3499 /* Generate FS IR for main().  (The visitor only descends into
3500 * functions called "main".)
3501 */
3502 if (shader) {
3503 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3504 base_ir = ir;
3505 this->result = reg_undef;
3506 ir->accept(this);
3507 }
3508 } else {
3509 emit_fragment_program_code();
3510 }
3511 base_ir = NULL;
3512 if (failed)
3513 return false;
3514
3515 emit(FS_OPCODE_PLACEHOLDER_HALT);
3516
3517 if (alpha_test_func)
3518 emit_alpha_test();
3519
3520 emit_fb_writes();
3521
3522 calculate_cfg();
3523
3524 split_virtual_grfs();
3525
3526 move_uniform_array_access_to_pull_constants();
3527 assign_constant_locations();
3528 demote_pull_constants();
3529
3530 opt_drop_redundant_mov_to_flags();
3531
3532 #define OPT(pass, args...) do { \
3533 pass_num++; \
3534 bool this_progress = pass(args); \
3535 \
3536 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3537 char filename[64]; \
3538 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3539 dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3540 \
3541 backend_visitor::dump_instructions(filename); \
3542 } \
3543 \
3544 progress = progress || this_progress; \
3545 } while (false)
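/* Each OPT() invocation runs one pass, accumulates whether it made
 * progress, and, when DEBUG_OPTIMIZER is set in INTEL_DEBUG, dumps the
 * IR to a file named after the dispatch width, shader program, iteration
 * and pass number whenever that pass changed something.
 */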
3546
3547 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3548 char filename[64];
3549 snprintf(filename, 64, "fs%d-%04d-00-start",
3550 dispatch_width, shader_prog ? shader_prog->Name : 0);
3551
3552 backend_visitor::dump_instructions(filename);
3553 }
3554
3555 bool progress;
3556 int iteration = 0;
3557 do {
3558 progress = false;
3559 iteration++;
3560 int pass_num = 0;
3561
3562 OPT(remove_duplicate_mrf_writes);
3563
3564 OPT(opt_algebraic);
3565 OPT(opt_cse);
3566 OPT(opt_copy_propagate);
3567 OPT(opt_peephole_predicated_break);
3568 OPT(dead_code_eliminate);
3569 OPT(opt_peephole_sel);
3570 OPT(dead_control_flow_eliminate, this);
3571 OPT(opt_register_renaming);
3572 OPT(opt_saturate_propagation);
3573 OPT(register_coalesce);
3574 OPT(compute_to_mrf);
3575
3576 OPT(compact_virtual_grfs);
3577 } while (progress);
3578
3579 if (lower_load_payload()) {
3580 split_virtual_grfs();
3581 register_coalesce();
3582 compute_to_mrf();
3583 dead_code_eliminate();
3584 }
3585
3586 lower_uniform_pull_constant_loads();
3587
3588 assign_curb_setup();
3589 assign_urb_setup();
3590
3591 static enum instruction_scheduler_mode pre_modes[] = {
3592 SCHEDULE_PRE,
3593 SCHEDULE_PRE_NON_LIFO,
3594 SCHEDULE_PRE_LIFO,
3595 };
3596
3597 /* Try each scheduling heuristic to see if it can successfully register
3598 * allocate without spilling. They should be ordered by decreasing
3599 * performance but increasing likelihood of allocating.
3600 */
3601 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3602 schedule_instructions(pre_modes[i]);
3603
3604 if (0) {
3605 assign_regs_trivial();
3606 allocated_without_spills = true;
3607 } else {
3608 allocated_without_spills = assign_regs(false);
3609 }
3610 if (allocated_without_spills)
3611 break;
3612 }
3613
3614 if (!allocated_without_spills) {
3615 /* We assume that any spilling is worse than just dropping back to
3616 * SIMD8. There's probably actually some intermediate point where
3617 * SIMD16 with a couple of spills is still better.
3618 */
3619 if (dispatch_width == 16) {
3620 fail("Failure to register allocate. Reduce number of "
3621 "live scalar values to avoid this.");
3622 } else {
3623 perf_debug("Fragment shader triggered register spilling. "
3624 "Try reducing the number of live scalar values to "
3625 "improve performance.\n");
3626 }
3627
3628 /* Since we're out of heuristics, just go spill registers until we
3629 * get an allocation.
3630 */
3631 while (!assign_regs(true)) {
3632 if (failed)
3633 break;
3634 }
3635 }
3636 }
3637 assert(force_uncompressed_stack == 0);
3638
3639 /* This must come after all optimization and register allocation, since
3640 * it inserts dead code that happens to have side effects, and it does
3641 * so based on the actual physical registers in use.
3642 */
3643 insert_gen4_send_dependency_workarounds();
3644
3645 if (failed)
3646 return false;
3647
3648 if (!allocated_without_spills)
3649 schedule_instructions(SCHEDULE_POST);
3650
3651 if (last_scratch > 0) {
3652 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3653 }
3654
3655 if (stage == MESA_SHADER_FRAGMENT) {
3656 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3657 if (dispatch_width == 8)
3658 prog_data->reg_blocks = brw_register_blocks(grf_used);
3659 else
3660 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3661 }
3662
3663 /* If any state parameters were appended, then ParameterValues could have
3664 * been realloced, in which case the driver uniform storage set up by
3665 * _mesa_associate_uniform_storage() would point to freed memory. Make
3666 * sure that didn't happen.
3667 */
3668 assert(sanity_param_count == prog->Parameters->NumParameters);
3669
3670 return !failed;
3671 }
3672
3673 const unsigned *
3674 brw_wm_fs_emit(struct brw_context *brw,
3675 void *mem_ctx,
3676 const struct brw_wm_prog_key *key,
3677 struct brw_wm_prog_data *prog_data,
3678 struct gl_fragment_program *fp,
3679 struct gl_shader_program *prog,
3680 unsigned *final_assembly_size)
3681 {
3682 bool start_busy = false;
3683 double start_time = 0;
3684
3685 if (unlikely(brw->perf_debug)) {
3686 start_busy = (brw->batch.last_bo &&
3687 drm_intel_bo_busy(brw->batch.last_bo));
3688 start_time = get_time();
3689 }
3690
3691 struct brw_shader *shader = NULL;
3692 if (prog)
3693 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3694
3695 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3696 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3697
3698 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3699 */
3700 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3701 if (!v.run()) {
3702 if (prog) {
3703 prog->LinkStatus = false;
3704 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3705 }
3706
3707 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3708 v.fail_msg);
3709
3710 return NULL;
3711 }
3712
3713 cfg_t *simd16_cfg = NULL;
3714 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3715 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3716 brw->use_rep_send)) {
3717 if (!v.simd16_unsupported) {
3718 /* Try a SIMD16 compile */
3719 v2.import_uniforms(&v);
3720 if (!v2.run()) {
3721 perf_debug("SIMD16 shader failed to compile, falling back to "
3722 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3723 } else {
3724 simd16_cfg = v2.cfg;
3725 }
3726 } else {
3727 perf_debug("SIMD16 shader unsupported, falling back to "
3728 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3729 }
3730 }
3731
3732 cfg_t *simd8_cfg;
3733 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3734 if (no_simd8 && simd16_cfg) {
3735 simd8_cfg = NULL;
3736 prog_data->no_8 = true;
3737 } else {
3738 simd8_cfg = v.cfg;
3739 prog_data->no_8 = false;
3740 }
3741
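/* generate_assembly() is assumed to emit the SIMD8 program first (when
 * simd8_cfg is non-NULL) and append the SIMD16 program after it, recording
 * the latter's start in prog_data->prog_offset_16 so state setup can enable
 * both dispatch modes from a single kernel upload.
 */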
3742 const unsigned *assembly = NULL;
3743 fs_generator g(brw, mem_ctx, key, prog_data, prog, fp,
3744 v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3745 assembly = g.generate_assembly(simd8_cfg, simd16_cfg,
3746 final_assembly_size);
3747
3748 if (unlikely(brw->perf_debug) && shader) {
3749 if (shader->compiled_once)
3750 brw_wm_debug_recompile(brw, prog, key);
3751 shader->compiled_once = true;
3752
3753 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3754 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3755 (get_time() - start_time) * 1000);
3756 }
3757 }
3758
3759 return assembly;
3760 }
3761
3762 bool
3763 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3764 {
3765 struct brw_context *brw = brw_context(ctx);
3766 struct brw_wm_prog_key key;
3767
3768 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3769 return true;
3770
3771 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3772 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3773 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3774 bool program_uses_dfdy = fp->UsesDFdy;
3775
3776 memset(&key, 0, sizeof(key));
3777
3778 if (brw->gen < 6) {
3779 if (fp->UsesKill)
3780 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3781
3782 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3783 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3784
3785 /* Just assume depth testing. */
3786 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3787 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3788 }
3789
3790 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3791 BRW_FS_VARYING_INPUT_MASK) > 16)
3792 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3793
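/* _mesa_fls() returns one more than the index of the highest set bit, so
 * sampler_count covers every sampler index up to the highest one the
 * program uses.
 */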
3794 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3795 for (unsigned i = 0; i < sampler_count; i++) {
3796 if (fp->Base.ShadowSamplers & (1 << i)) {
3797 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3798 key.tex.swizzles[i] =
3799 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3800 } else {
3801 /* Color sampler: assume no swizzling. */
3802 key.tex.swizzles[i] = SWIZZLE_XYZW;
3803 }
3804 }
3805
3806 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3807 key.drawable_height = ctx->DrawBuffer->Height;
3808 }
3809
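/* Count only the color outputs actually written; depth and sample-mask
 * writes do not occupy a color region.
 */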
3810 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3811 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3812 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3813
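/* render_to_fbo affects the Y orientation the compiled shader assumes for
 * gl_FragCoord and dFdy(): window-system buffers are rendered y-flipped
 * while user FBOs are not. At precompile time we can only guess, so use the
 * currently bound draw buffer, and treat MRT as implying an FBO since only
 * FBOs support multiple color attachments.
 */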
3814 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3815 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3816 key.nr_color_regions > 1;
3817 }
3818
3819 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE, so in
3820 * practice the quality of the derivatives is determined by the driconf
3821 * option.
3822 */
3823 key.high_quality_derivatives = brw->disable_derivative_optimization;
3824
3825 key.program_string_id = bfp->id;
3826
3827 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3828 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3829
3830 bool success = do_wm_prog(brw, prog, bfp, &key);
3831
3832 brw->wm.base.prog_offset = old_prog_offset;
3833 brw->wm.prog_data = old_prog_data;
3834
3835 return success;
3836 }
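/* For reference, this precompile path is assumed to be reached from the
 * driver's link-time shader hook, roughly:
 *
 *    bool
 *    brw_shader_precompile(struct gl_context *ctx,
 *                          struct gl_shader_program *prog)
 *    {
 *       ...
 *       if (!brw_fs_precompile(ctx, prog))
 *          return false;
 *       ...
 *    }
 *
 * which is why the key is built from guesses about likely draw-time state
 * rather than from actual state.
 */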