i965: Make precompile functions accessible from C.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_cfg.h"
50 #include "brw_dead_control_flow.h"
51 #include "main/uniforms.h"
52 #include "brw_fs_live_variables.h"
53 #include "glsl/glsl_types.h"
54
55 void
56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
57 fs_reg *src, int sources)
58 {
59 memset(this, 0, sizeof(*this));
60
61 this->opcode = opcode;
62 this->dst = dst;
63 this->src = src;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (int i = 0; i < sources; ++i) {
79 if (src[i].file != GRF)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (int i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 assert(this->src[i].width > 0);
101 if (this->src[i].width == 1) {
102 this->src[i].effective_width = this->exec_size;
103 } else {
104 this->src[i].effective_width = this->src[i].width;
105 }
106 break;
107 case IMM:
108 case UNIFORM:
109 this->src[i].effective_width = this->exec_size;
110 break;
111 default:
112 unreachable("Invalid source register file");
113 }
114 }
115 this->dst.effective_width = this->exec_size;
116
117 this->conditional_mod = BRW_CONDITIONAL_NONE;
118
119 /* This will be the case for almost all instructions. */
120 switch (dst.file) {
121 case GRF:
122 case HW_REG:
123 case MRF:
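/* width * stride * type size is the byte footprint of the write; a GRF is
 * 32 bytes, so round up to whole registers.
 */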
124 this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
125 break;
126 case BAD_FILE:
127 this->regs_written = 0;
128 break;
129 case IMM:
130 case UNIFORM:
131 unreachable("Invalid destination register file");
132 default:
133 unreachable("Invalid register file");
134 }
135
136 this->writes_accumulator = false;
137 }
138
139 fs_inst::fs_inst()
140 {
141 fs_reg *src = ralloc_array(this, fs_reg, 3);
142 init(BRW_OPCODE_NOP, 8, dst, src, 0);
143 }
144
145 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
146 {
147 fs_reg *src = ralloc_array(this, fs_reg, 3);
148 init(opcode, exec_size, reg_undef, src, 0);
149 }
150
151 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
152 {
153 fs_reg *src = ralloc_array(this, fs_reg, 3);
154 init(opcode, 0, dst, src, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 fs_reg *src = ralloc_array(this, fs_reg, 3);
161 src[0] = src0;
162 init(opcode, exec_size, dst, src, 1);
163 }
164
165 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
166 {
167 fs_reg *src = ralloc_array(this, fs_reg, 3);
168 src[0] = src0;
169 init(opcode, 0, dst, src, 1);
170 }
171
172 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
173 const fs_reg &src0, const fs_reg &src1)
174 {
175 fs_reg *src = ralloc_array(this, fs_reg, 3);
176 src[0] = src0;
177 src[1] = src1;
178 init(opcode, exec_size, dst, src, 2);
179 }
180
181 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
182 const fs_reg &src1)
183 {
184 fs_reg *src = ralloc_array(this, fs_reg, 3);
185 src[0] = src0;
186 src[1] = src1;
187 init(opcode, 0, dst, src, 2);
188 }
189
190 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
191 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
192 {
193 fs_reg *src = ralloc_array(this, fs_reg, 3);
194 src[0] = src0;
195 src[1] = src1;
196 src[2] = src2;
197 init(opcode, exec_size, dst, src, 3);
198 }
199
200 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
201 const fs_reg &src1, const fs_reg &src2)
202 {
203 fs_reg *src = ralloc_array(this, fs_reg, 3);
204 src[0] = src0;
205 src[1] = src1;
206 src[2] = src2;
207 init(opcode, 0, dst, src, 3);
208 }
209
210 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
211 {
212 init(opcode, 0, dst, src, sources);
213 }
214
215 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
216 fs_reg src[], int sources)
217 {
218 init(opcode, exec_width, dst, src, sources);
219 }
220
221 fs_inst::fs_inst(const fs_inst &that)
222 {
223 memcpy(this, &that, sizeof(that));
224
225 this->src = ralloc_array(this, fs_reg, that.sources);
226
227 for (int i = 0; i < that.sources; i++)
228 this->src[i] = that.src[i];
229 }
230
231 void
232 fs_inst::resize_sources(uint8_t num_sources)
233 {
234 if (this->sources != num_sources) {
235 this->src = reralloc(this, this->src, fs_reg, num_sources);
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(brw->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 * gen5 does the comparison on the execution type (resolved source types),
341 * so dst type doesn't matter. gen6 does comparison and then uses the
342 * result as if it was the dst type with no conversion, which happens to
343 * mostly work out for float-interpreted-as-int since our comparisons are
344 * for >0, =0, <0.
345 */
346 if (brw->gen == 4) {
347 dst.type = src0.type;
348 if (dst.file == HW_REG)
349 dst.fixed_hw_reg.type = dst.type;
350 }
351
352 resolve_ud_negate(&src0);
353 resolve_ud_negate(&src1);
354
355 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
356 inst->conditional_mod = condition;
357
358 return inst;
359 }
360
361 fs_inst *
362 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
363 {
364 uint8_t exec_size = dst.width;
365 for (int i = 0; i < sources; ++i) {
366 assert(src[i].width % dst.width == 0);
367 if (src[i].width > exec_size)
368 exec_size = src[i].width;
369 }
370
371 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
372 dst, src, sources);
373 inst->regs_written = 0;
374 for (int i = 0; i < sources; ++i) {
375 /* The LOAD_PAYLOAD instruction only really makes sense if we are
376 * dealing with whole registers. If this ever changes, we can deal
377 * with it later.
378 */
379 int size = src[i].effective_width * type_sz(src[i].type);
380 assert(size % 32 == 0);
381 inst->regs_written += (size + 31) / 32;
382 }
383
384 return inst;
385 }
386
387 exec_list
388 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
389 const fs_reg &surf_index,
390 const fs_reg &varying_offset,
391 uint32_t const_offset)
392 {
393 exec_list instructions;
394 fs_inst *inst;
395
396 /* We have our constant surface use a pitch of 4 bytes, so our index can
397 * be any component of a vector, and then we load 4 contiguous
398 * components starting from that.
399 *
400 * We break down the const_offset to a portion added to the variable
401 * offset and a portion done using reg_offset, which means that if you
402 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
403 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
404 * CSE can later notice that those loads are all the same and eliminate
405 * the redundant ones.
406 */
407 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
408 instructions.push_tail(ADD(vec4_offset,
409 varying_offset, fs_reg(const_offset & ~3)));
410
411 int scale = 1;
412 if (brw->gen == 4 && dst.width == 8) {
413 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
414 * u, v, r) as parameters, or we can just use the SIMD16 message
415 * consisting of (header, u). We choose the second, at the cost of a
416 * longer return length.
417 */
418 scale = 2;
419 }
420
421 enum opcode op;
422 if (brw->gen >= 7)
423 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
424 else
425 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
426
427 assert(dst.width % 8 == 0);
428 int regs_written = 4 * (dst.width / 8) * scale;
429 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
430 dst.type, dst.width);
431 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
432 inst->regs_written = regs_written;
433 instructions.push_tail(inst);
434
435 if (brw->gen < 7) {
436 inst->base_mrf = 13;
437 inst->header_present = true;
438 if (brw->gen == 4)
439 inst->mlen = 3;
440 else
441 inst->mlen = 1 + dispatch_width / 8;
442 }
443
444 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
445 instructions.push_tail(MOV(dst, result));
446
447 return instructions;
448 }
449
450 /**
451 * A helper for MOV generation for fixing up broken hardware SEND dependency
452 * handling.
453 */
454 fs_inst *
455 fs_visitor::DEP_RESOLVE_MOV(int grf)
456 {
457 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
458
459 inst->ir = NULL;
460 inst->annotation = "send dependency resolve";
461
462 /* The caller always wants an uncompressed instruction, to emit the minimal
463 * extra dependencies and to avoid having to align its regs to 2.
464 */
465 inst->exec_size = 8;
466
467 return inst;
468 }
469
470 bool
471 fs_inst::equals(fs_inst *inst) const
472 {
473 return (opcode == inst->opcode &&
474 dst.equals(inst->dst) &&
475 src[0].equals(inst->src[0]) &&
476 src[1].equals(inst->src[1]) &&
477 src[2].equals(inst->src[2]) &&
478 saturate == inst->saturate &&
479 predicate == inst->predicate &&
480 conditional_mod == inst->conditional_mod &&
481 mlen == inst->mlen &&
482 base_mrf == inst->base_mrf &&
483 target == inst->target &&
484 eot == inst->eot &&
485 header_present == inst->header_present &&
486 shadow_compare == inst->shadow_compare &&
487 exec_size == inst->exec_size &&
488 offset == inst->offset);
489 }
490
491 bool
492 fs_inst::overwrites_reg(const fs_reg &reg) const
493 {
494 return (reg.file == dst.file &&
495 reg.reg == dst.reg &&
496 reg.reg_offset >= dst.reg_offset &&
497 reg.reg_offset < dst.reg_offset + regs_written);
498 }
499
500 bool
501 fs_inst::is_send_from_grf() const
502 {
503 switch (opcode) {
504 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
505 case SHADER_OPCODE_SHADER_TIME_ADD:
506 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
507 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
508 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
509 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
510 case SHADER_OPCODE_UNTYPED_ATOMIC:
511 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
512 return true;
513 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
514 return src[1].file == GRF;
515 case FS_OPCODE_FB_WRITE:
516 return src[0].file == GRF;
517 default:
518 if (is_tex())
519 return src[0].file == GRF;
520
521 return false;
522 }
523 }
524
525 bool
526 fs_inst::can_do_source_mods(struct brw_context *brw)
527 {
528 if (brw->gen == 6 && is_math())
529 return false;
530
531 if (is_send_from_grf())
532 return false;
533
534 if (!backend_instruction::can_do_source_mods())
535 return false;
536
537 return true;
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Fixed brw_reg. */
585 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
586 {
587 init();
588 this->file = HW_REG;
589 this->fixed_hw_reg = fixed_hw_reg;
590 this->type = fixed_hw_reg.type;
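/* brw_reg stores its width as a log2 encoding (BRW_WIDTH_*), so decode it
 * back to a channel count.
 */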
591 this->width = 1 << fixed_hw_reg.width;
592 }
593
594 bool
595 fs_reg::equals(const fs_reg &r) const
596 {
597 return (file == r.file &&
598 reg == r.reg &&
599 reg_offset == r.reg_offset &&
600 subreg_offset == r.subreg_offset &&
601 type == r.type &&
602 negate == r.negate &&
603 abs == r.abs &&
604 !reladdr && !r.reladdr &&
605 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
606 width == r.width &&
607 stride == r.stride);
608 }
609
610 fs_reg &
611 fs_reg::set_smear(unsigned subreg)
612 {
613 assert(file != HW_REG && file != IMM);
614 subreg_offset = subreg * type_sz(type);
615 stride = 0;
616 return *this;
617 }
618
619 bool
620 fs_reg::is_contiguous() const
621 {
622 return stride == 1;
623 }
624
625 int
626 fs_visitor::type_size(const struct glsl_type *type)
627 {
628 unsigned int size, i;
629
630 switch (type->base_type) {
631 case GLSL_TYPE_UINT:
632 case GLSL_TYPE_INT:
633 case GLSL_TYPE_FLOAT:
634 case GLSL_TYPE_BOOL:
635 return type->components();
636 case GLSL_TYPE_ARRAY:
637 return type_size(type->fields.array) * type->length;
638 case GLSL_TYPE_STRUCT:
639 size = 0;
640 for (i = 0; i < type->length; i++) {
641 size += type_size(type->fields.structure[i].type);
642 }
643 return size;
644 case GLSL_TYPE_SAMPLER:
645 /* Samplers take up no register space, since they're baked in at
646 * link time.
647 */
648 return 0;
649 case GLSL_TYPE_ATOMIC_UINT:
650 return 0;
651 case GLSL_TYPE_IMAGE:
652 case GLSL_TYPE_VOID:
653 case GLSL_TYPE_ERROR:
654 case GLSL_TYPE_INTERFACE:
655 unreachable("not reached");
656 }
657
658 return 0;
659 }
660
661 fs_reg
662 fs_visitor::get_timestamp()
663 {
664 assert(brw->gen >= 7);
665
666 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
667 BRW_ARF_TIMESTAMP,
668 0),
669 BRW_REGISTER_TYPE_UD));
670
671 fs_reg dst = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 4);
672
673 fs_inst *mov = emit(MOV(dst, ts));
674 /* We want to read the 3 fields we care about even if the channel isn't
675 * enabled in the dispatch.
676 */
677 mov->force_writemask_all = true;
678
679 /* The caller wants the low 32 bits of the timestamp. Since it's running
680 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
681 * which is plenty of time for our purposes. It is identical across the
682 * EUs, but since it's tracking GPU core speed it will increment at a
683 * varying rate as render P-states change.
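 * (At ~1.2 GHz, 2^32 cycles is roughly 3.6 seconds, hence the ~3 second figure.)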
684 *
685 * The caller could also check if render P-states have changed (or anything
686 * else that might disrupt timing) by setting smear to 2 and checking if
687 * that field is != 0.
688 */
689 dst.set_smear(0);
690
691 return dst;
692 }
693
694 void
695 fs_visitor::emit_shader_time_begin()
696 {
697 current_annotation = "shader time start";
698 shader_start_time = get_timestamp();
699 }
700
701 void
702 fs_visitor::emit_shader_time_end()
703 {
704 current_annotation = "shader time end";
705
706 enum shader_time_shader_type type, written_type, reset_type;
707 if (dispatch_width == 8) {
708 type = ST_FS8;
709 written_type = ST_FS8_WRITTEN;
710 reset_type = ST_FS8_RESET;
711 } else {
712 assert(dispatch_width == 16);
713 type = ST_FS16;
714 written_type = ST_FS16_WRITTEN;
715 reset_type = ST_FS16_RESET;
716 }
717
718 fs_reg shader_end_time = get_timestamp();
719
720 /* Check that there weren't any timestamp reset events (assuming these
721 * were the only two timestamp reads that happened).
722 */
723 fs_reg reset = shader_end_time;
724 reset.set_smear(2);
725 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
726 test->conditional_mod = BRW_CONDITIONAL_Z;
727 emit(IF(BRW_PREDICATE_NORMAL));
728
729 fs_reg start = shader_start_time;
730 start.negate = true;
731 fs_reg diff = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 1);
732 emit(ADD(diff, start, shader_end_time));
733
734 /* If there were no instructions between the two timestamp gets, the diff
735 * is 2 cycles. Remove that overhead so that we can ignore it when trying
736 * to determine the time taken by individual instructions.
737 */
738 emit(ADD(diff, diff, fs_reg(-2u)));
739
740 emit_shader_time_write(type, diff);
741 emit_shader_time_write(written_type, fs_reg(1u));
742 emit(BRW_OPCODE_ELSE);
743 emit_shader_time_write(reset_type, fs_reg(1u));
744 emit(BRW_OPCODE_ENDIF);
745 }
746
747 void
748 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
749 fs_reg value)
750 {
751 int shader_time_index =
752 brw_get_shader_time_index(brw, shader_prog, prog, type);
753 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
754
755 fs_reg payload;
756 if (dispatch_width == 8)
757 payload = fs_reg(this, glsl_type::uvec2_type);
758 else
759 payload = fs_reg(this, glsl_type::uint_type);
760
761 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
762 fs_reg(), payload, offset, value));
763 }
764
765 void
766 fs_visitor::vfail(const char *format, va_list va)
767 {
768 char *msg;
769
770 if (failed)
771 return;
772
773 failed = true;
774
775 msg = ralloc_vasprintf(mem_ctx, format, va);
776 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
777
778 this->fail_msg = msg;
779
780 if (INTEL_DEBUG & DEBUG_WM) {
781 fprintf(stderr, "%s", msg);
782 }
783 }
784
785 void
786 fs_visitor::fail(const char *format, ...)
787 {
788 va_list va;
789
790 va_start(va, format);
791 vfail(format, va);
792 va_end(va);
793 }
794
795 /**
796 * Mark this program as impossible to compile in SIMD16 mode.
797 *
798 * During the SIMD8 compile (which happens first), we can detect and flag
799 * things that are unsupported in SIMD16 mode, so the compiler can skip
800 * the SIMD16 compile altogether.
801 *
802 * During a SIMD16 compile (if one happens anyway), this just calls fail().
803 */
804 void
805 fs_visitor::no16(const char *format, ...)
806 {
807 va_list va;
808
809 va_start(va, format);
810
811 if (dispatch_width == 16) {
812 vfail(format, va);
813 } else {
814 simd16_unsupported = true;
815
816 if (brw->perf_debug) {
817 if (no16_msg)
818 ralloc_vasprintf_append(&no16_msg, format, va);
819 else
820 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
821 }
822 }
823
824 va_end(va);
825 }
826
827 fs_inst *
828 fs_visitor::emit(enum opcode opcode)
829 {
830 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
831 }
832
833 fs_inst *
834 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
835 {
836 return emit(new(mem_ctx) fs_inst(opcode, dst));
837 }
838
839 fs_inst *
840 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
841 {
842 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
843 }
844
845 fs_inst *
846 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
847 const fs_reg &src1)
848 {
849 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
850 }
851
852 fs_inst *
853 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
854 const fs_reg &src1, const fs_reg &src2)
855 {
856 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
857 }
858
859 fs_inst *
860 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
861 fs_reg src[], int sources)
862 {
863 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
864 }
865
866 /**
867 * Returns true if the instruction has a flag that means it won't
868 * update an entire destination register.
869 *
870 * For example, dead code elimination and live variable analysis want to know
871 * when a write to a variable screens off any preceding values that were in
872 * it.
873 */
874 bool
875 fs_inst::is_partial_write() const
876 {
877 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
878 (this->dst.width * type_sz(this->dst.type)) < 32 ||
879 !this->dst.is_contiguous());
880 }
881
882 int
883 fs_inst::regs_read(fs_visitor *v, int arg) const
884 {
885 if (is_tex() && arg == 0 && src[0].file == GRF) {
886 return mlen;
887 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
888 return mlen;
889 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
890 return mlen;
891 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
892 return mlen;
893 }
894
895 switch (src[arg].file) {
896 case BAD_FILE:
897 case UNIFORM:
898 case IMM:
899 return 1;
900 case GRF:
901 case HW_REG:
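/* A stride of 0 is a scalar replicated across the channels, so only a
 * single register is read.
 */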
902 if (src[arg].stride == 0) {
903 return 1;
904 } else {
905 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
906 return (size + 31) / 32;
907 }
908 case MRF:
909 unreachable("MRF registers are not allowed as sources");
910 default:
911 unreachable("Invalid register file");
912 }
913 }
914
915 bool
916 fs_inst::reads_flag() const
917 {
918 return predicate;
919 }
920
921 bool
922 fs_inst::writes_flag() const
923 {
924 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
925 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
926 }
927
928 /**
929 * Returns how many MRFs an FS opcode will write over.
930 *
931 * Note that this is not the 0 or 1 implied writes in an actual gen
932 * instruction -- the FS opcodes often generate MOVs in addition.
933 */
934 int
935 fs_visitor::implied_mrf_writes(fs_inst *inst)
936 {
937 if (inst->mlen == 0)
938 return 0;
939
940 if (inst->base_mrf == -1)
941 return 0;
942
943 switch (inst->opcode) {
944 case SHADER_OPCODE_RCP:
945 case SHADER_OPCODE_RSQ:
946 case SHADER_OPCODE_SQRT:
947 case SHADER_OPCODE_EXP2:
948 case SHADER_OPCODE_LOG2:
949 case SHADER_OPCODE_SIN:
950 case SHADER_OPCODE_COS:
951 return 1 * dispatch_width / 8;
952 case SHADER_OPCODE_POW:
953 case SHADER_OPCODE_INT_QUOTIENT:
954 case SHADER_OPCODE_INT_REMAINDER:
955 return 2 * dispatch_width / 8;
956 case SHADER_OPCODE_TEX:
957 case FS_OPCODE_TXB:
958 case SHADER_OPCODE_TXD:
959 case SHADER_OPCODE_TXF:
960 case SHADER_OPCODE_TXF_CMS:
961 case SHADER_OPCODE_TXF_MCS:
962 case SHADER_OPCODE_TG4:
963 case SHADER_OPCODE_TG4_OFFSET:
964 case SHADER_OPCODE_TXL:
965 case SHADER_OPCODE_TXS:
966 case SHADER_OPCODE_LOD:
967 return 1;
968 case FS_OPCODE_FB_WRITE:
969 return 2;
970 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
971 case SHADER_OPCODE_GEN4_SCRATCH_READ:
972 return 1;
973 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
974 return inst->mlen;
975 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
976 return 2;
977 case SHADER_OPCODE_UNTYPED_ATOMIC:
978 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
979 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
980 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
981 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
982 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
983 return 0;
984 default:
985 unreachable("not reached");
986 }
987 }
988
989 int
990 fs_visitor::virtual_grf_alloc(int size)
991 {
992 if (virtual_grf_array_size <= virtual_grf_count) {
993 if (virtual_grf_array_size == 0)
994 virtual_grf_array_size = 16;
995 else
996 virtual_grf_array_size *= 2;
997 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
998 virtual_grf_array_size);
999 }
1000 virtual_grf_sizes[virtual_grf_count] = size;
1001 return virtual_grf_count++;
1002 }
1003
1004 /** Register constructor specifying the file and register number. */
1005 fs_reg::fs_reg(enum register_file file, int reg)
1006 {
1007 init();
1008 this->file = file;
1009 this->reg = reg;
1010 this->type = BRW_REGISTER_TYPE_F;
1011
1012 switch (file) {
1013 case UNIFORM:
1014 this->width = 1;
1015 break;
1016 default:
1017 this->width = 8;
1018 }
1019 }
1020
1021 /** Register constructor specifying the file, register number and type. */
1022 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1023 {
1024 init();
1025 this->file = file;
1026 this->reg = reg;
1027 this->type = type;
1028
1029 switch (file) {
1030 case UNIFORM:
1031 this->width = 1;
1032 break;
1033 default:
1034 this->width = 8;
1035 }
1036 }
1037
1038 /** Register constructor specifying the file, register number, type and width. */
1039 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1040 uint8_t width)
1041 {
1042 init();
1043 this->file = file;
1044 this->reg = reg;
1045 this->type = type;
1046 this->width = width;
1047 }
1048
1049 /** Automatic reg constructor. */
1050 fs_reg::fs_reg(fs_visitor *v, const struct glsl_type *type)
1051 {
1052 init();
1053 int reg_width = v->dispatch_width / 8;
1054
1055 this->file = GRF;
1056 this->reg = v->virtual_grf_alloc(v->type_size(type) * reg_width);
1057 this->reg_offset = 0;
1058 this->type = brw_type_for_base_type(type);
1059 this->width = v->dispatch_width;
1060 assert(this->width == 8 || this->width == 16);
1061 }
1062
1063 fs_reg *
1064 fs_visitor::variable_storage(ir_variable *var)
1065 {
1066 return (fs_reg *)hash_table_find(this->variable_ht, var);
1067 }
1068
1069 void
1070 import_uniforms_callback(const void *key,
1071 void *data,
1072 void *closure)
1073 {
1074 struct hash_table *dst_ht = (struct hash_table *)closure;
1075 const fs_reg *reg = (const fs_reg *)data;
1076
1077 if (reg->file != UNIFORM)
1078 return;
1079
1080 hash_table_insert(dst_ht, data, key);
1081 }
1082
1083 /* For SIMD16, we need to follow the uniform setup done for the SIMD8
1084 * dispatch. This brings in those uniform definitions.
1085 */
1086 void
1087 fs_visitor::import_uniforms(fs_visitor *v)
1088 {
1089 hash_table_call_foreach(v->variable_ht,
1090 import_uniforms_callback,
1091 variable_ht);
1092 this->push_constant_loc = v->push_constant_loc;
1093 this->pull_constant_loc = v->pull_constant_loc;
1094 this->uniforms = v->uniforms;
1095 this->param_size = v->param_size;
1096 }
1097
1098 /* Our support for uniforms is piggy-backed on the struct
1099 * gl_fragment_program, because that's where the values actually
1100 * get stored, rather than in some global gl_shader_program uniform
1101 * store.
1102 */
1103 void
1104 fs_visitor::setup_uniform_values(ir_variable *ir)
1105 {
1106 int namelen = strlen(ir->name);
1107
1108 /* The data for our (non-builtin) uniforms is stored in a series of
1109 * gl_uniform_driver_storage structs for each subcomponent that
1110 * glGetUniformLocation() could name. We know it's been set up in the same
1111 * order we'd walk the type, so walk the list of storage and find anything
1112 * with our name, or the prefix of a component that starts with our name.
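 * For example, "light" matches storage named "light", "light.color" or
 * "light[2]", but not "lightmap".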
1113 */
1114 unsigned params_before = uniforms;
1115 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1116 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1117
1118 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1119 (storage->name[namelen] != 0 &&
1120 storage->name[namelen] != '.' &&
1121 storage->name[namelen] != '[')) {
1122 continue;
1123 }
1124
1125 unsigned slots = storage->type->component_slots();
1126 if (storage->array_elements)
1127 slots *= storage->array_elements;
1128
1129 for (unsigned i = 0; i < slots; i++) {
1130 stage_prog_data->param[uniforms++] = &storage->storage[i];
1131 }
1132 }
1133
1134 /* Make sure we actually initialized the right amount of stuff here. */
1135 assert(params_before + ir->type->component_slots() == uniforms);
1136 (void)params_before;
1137 }
1138
1139
1140 /* Our support for builtin uniforms is even scarier than non-builtin.
1141 * It sits on top of the PROG_STATE_VAR parameters that are
1142 * automatically updated from GL context state.
1143 */
1144 void
1145 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1146 {
1147 const ir_state_slot *const slots = ir->get_state_slots();
1148 assert(slots != NULL);
1149
1150 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1151 /* This state reference has already been setup by ir_to_mesa, but we'll
1152 * get the same index back here.
1153 */
1154 int index = _mesa_add_state_reference(this->prog->Parameters,
1155 (gl_state_index *)slots[i].tokens);
1156
1157 /* Add each of the unique swizzles of the element as a parameter.
1158 * This'll end up matching the expected layout of the
1159 * array/matrix/structure we're trying to fill in.
1160 */
1161 int last_swiz = -1;
1162 for (unsigned int j = 0; j < 4; j++) {
1163 int swiz = GET_SWZ(slots[i].swizzle, j);
1164 if (swiz == last_swiz)
1165 break;
1166 last_swiz = swiz;
1167
1168 stage_prog_data->param[uniforms++] =
1169 &prog->Parameters->ParameterValues[index][swiz];
1170 }
1171 }
1172 }
1173
1174 fs_reg *
1175 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1176 {
1177 assert(stage == MESA_SHADER_FRAGMENT);
1178 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1179 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1180 fs_reg wpos = *reg;
1181 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1182
1183 /* gl_FragCoord.x */
1184 if (ir->data.pixel_center_integer) {
1185 emit(MOV(wpos, this->pixel_x));
1186 } else {
1187 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1188 }
1189 wpos = offset(wpos, 1);
1190
1191 /* gl_FragCoord.y */
1192 if (!flip && ir->data.pixel_center_integer) {
1193 emit(MOV(wpos, this->pixel_y));
1194 } else {
1195 fs_reg pixel_y = this->pixel_y;
1196 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1197
1198 if (flip) {
1199 pixel_y.negate = true;
1200 offset += key->drawable_height - 1.0;
1201 }
1202
1203 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1204 }
1205 wpos = offset(wpos, 1);
1206
1207 /* gl_FragCoord.z */
1208 if (brw->gen >= 6) {
1209 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1210 } else {
1211 emit(FS_OPCODE_LINTERP, wpos,
1212 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1213 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1214 interp_reg(VARYING_SLOT_POS, 2));
1215 }
1216 wpos = offset(wpos, 1);
1217
1218 /* gl_FragCoord.w: Already set up in emit_interpolation */
1219 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1220
1221 return reg;
1222 }
1223
1224 fs_inst *
1225 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1226 glsl_interp_qualifier interpolation_mode,
1227 bool is_centroid, bool is_sample)
1228 {
1229 brw_wm_barycentric_interp_mode barycoord_mode;
1230 if (brw->gen >= 6) {
1231 if (is_centroid) {
1232 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1233 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1234 else
1235 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1236 } else if (is_sample) {
1237 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1238 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1239 else
1240 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1241 } else {
1242 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1243 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1244 else
1245 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1246 }
1247 } else {
1248 /* On Ironlake and below, there is only one interpolation mode.
1249 * Centroid interpolation doesn't mean anything on this hardware --
1250 * there is no multisampling.
1251 */
1252 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1253 }
1254 return emit(FS_OPCODE_LINTERP, attr,
1255 this->delta_x[barycoord_mode],
1256 this->delta_y[barycoord_mode], interp);
1257 }
1258
1259 fs_reg *
1260 fs_visitor::emit_general_interpolation(ir_variable *ir)
1261 {
1262 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1263 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1264 fs_reg attr = *reg;
1265
1266 assert(stage == MESA_SHADER_FRAGMENT);
1267 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1268 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1269
1270 unsigned int array_elements;
1271 const glsl_type *type;
1272
1273 if (ir->type->is_array()) {
1274 array_elements = ir->type->length;
1275 if (array_elements == 0) {
1276 fail("dereferenced array '%s' has length 0\n", ir->name);
1277 }
1278 type = ir->type->fields.array;
1279 } else {
1280 array_elements = 1;
1281 type = ir->type;
1282 }
1283
1284 glsl_interp_qualifier interpolation_mode =
1285 ir->determine_interpolation_mode(key->flat_shade);
1286
1287 int location = ir->data.location;
1288 for (unsigned int i = 0; i < array_elements; i++) {
1289 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1290 if (prog_data->urb_setup[location] == -1) {
1291 /* If there's no incoming setup data for this slot, don't
1292 * emit interpolation for it.
1293 */
1294 attr = offset(attr, type->vector_elements);
1295 location++;
1296 continue;
1297 }
1298
1299 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1300 /* Constant interpolation (flat shading) case. The SF has
1301 * handed us defined values in only the constant offset
1302 * field of the setup reg.
1303 */
1304 for (unsigned int k = 0; k < type->vector_elements; k++) {
1305 struct brw_reg interp = interp_reg(location, k);
1306 interp = suboffset(interp, 3);
1307 interp.type = reg->type;
1308 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1309 attr = offset(attr, 1);
1310 }
1311 } else {
1312 /* Smooth/noperspective interpolation case. */
1313 for (unsigned int k = 0; k < type->vector_elements; k++) {
1314 struct brw_reg interp = interp_reg(location, k);
1315 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1316 /* Get the pixel/sample mask into f0 so that we know
1317 * which pixels are lit. Then, for each channel that is
1318 * unlit, replace the centroid data with non-centroid
1319 * data.
1320 */
1321 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1322
1323 fs_inst *inst;
1324 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1325 false, false);
1326 inst->predicate = BRW_PREDICATE_NORMAL;
1327 inst->predicate_inverse = true;
1328 if (brw->has_pln)
1329 inst->no_dd_clear = true;
1330
1331 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1332 ir->data.centroid && !key->persample_shading,
1333 ir->data.sample || key->persample_shading);
1334 inst->predicate = BRW_PREDICATE_NORMAL;
1335 inst->predicate_inverse = false;
1336 if (brw->has_pln)
1337 inst->no_dd_check = true;
1338
1339 } else {
1340 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1341 ir->data.centroid && !key->persample_shading,
1342 ir->data.sample || key->persample_shading);
1343 }
1344 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1345 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1346 }
1347 attr = offset(attr, 1);
1348 }
1349
1350 }
1351 location++;
1352 }
1353 }
1354
1355 return reg;
1356 }
1357
1358 fs_reg *
1359 fs_visitor::emit_frontfacing_interpolation()
1360 {
1361 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::bool_type);
1362
1363 if (brw->gen >= 6) {
1364 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1365 * a boolean result from this (~0/true or 0/false).
1366 *
1367 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1368 * this task in only one instruction:
1369 * - a negation source modifier will flip the bit; and
1370 * - a W -> D type conversion will sign extend the bit into the high
1371 * word of the destination.
1372 *
1373 * An ASR 15 fills the low word of the destination.
1374 */
1375 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1376 g0.negate = true;
1377
1378 emit(ASR(*reg, g0, fs_reg(15)));
1379 } else {
1380 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1381 * a boolean result from this (1/true or 0/false).
1382 *
1383 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1384 * the negation source modifier to flip it. Unfortunately the SHR
1385 * instruction only operates on UD (or D with an abs source modifier)
1386 * sources without negation.
1387 *
1388 * Instead, use ASR (which will give ~0/true or 0/false) followed by an
1389 * AND 1.
1390 */
1391 fs_reg asr = fs_reg(this, glsl_type::bool_type);
1392 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1393 g1_6.negate = true;
1394
1395 emit(ASR(asr, g1_6, fs_reg(31)));
1396 emit(AND(*reg, asr, fs_reg(1)));
1397 }
1398
1399 return reg;
1400 }
1401
1402 void
1403 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1404 {
1405 assert(stage == MESA_SHADER_FRAGMENT);
1406 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1407 assert(dst.type == BRW_REGISTER_TYPE_F);
1408
1409 if (key->compute_pos_offset) {
1410 /* Convert int_sample_pos to floating point */
1411 emit(MOV(dst, int_sample_pos));
1412 /* Scale to the range [0, 1] */
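/* (The payload encodes each offset in 1/16ths of a pixel, hence the 1/16 factor.) */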
1413 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1414 }
1415 else {
1416 /* From ARB_sample_shading specification:
1417 * "When rendering to a non-multisample buffer, or if multisample
1418 * rasterization is disabled, gl_SamplePosition will always be
1419 * (0.5, 0.5)."
1420 */
1421 emit(MOV(dst, fs_reg(0.5f)));
1422 }
1423 }
1424
1425 fs_reg *
1426 fs_visitor::emit_samplepos_setup()
1427 {
1428 assert(brw->gen >= 6);
1429
1430 this->current_annotation = "compute sample position";
1431 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::vec2_type);
1432 fs_reg pos = *reg;
1433 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1434 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1435
1436 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1437 * mode will be enabled.
1438 *
1439 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1440 * R31.1:0 Position Offset X/Y for Slot[3:0]
1441 * R31.3:2 Position Offset X/Y for Slot[7:4]
1442 * .....
1443 *
1444 * The X, Y sample positions come in as bytes in thread payload. So, read
1445 * the positions using vstride=16, width=8, hstride=2.
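 * That region walks every other byte: the X offsets land at bytes 0, 2, ..., 14
 * and, via suboffset 1, the Y offsets at bytes 1, 3, ..., 15.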
1446 */
1447 struct brw_reg sample_pos_reg =
1448 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1449 BRW_REGISTER_TYPE_B), 16, 8, 2);
1450
1451 if (dispatch_width == 8) {
1452 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1453 } else {
1454 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1455 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1456 ->force_sechalf = true;
1457 }
1458 /* Compute gl_SamplePosition.x */
1459 compute_sample_position(pos, int_sample_x);
1460 pos = offset(pos, 1);
1461 if (dispatch_width == 8) {
1462 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1463 } else {
1464 emit(MOV(half(int_sample_y, 0),
1465 fs_reg(suboffset(sample_pos_reg, 1))));
1466 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1467 ->force_sechalf = true;
1468 }
1469 /* Compute gl_SamplePosition.y */
1470 compute_sample_position(pos, int_sample_y);
1471 return reg;
1472 }
1473
1474 fs_reg *
1475 fs_visitor::emit_sampleid_setup()
1476 {
1477 assert(stage == MESA_SHADER_FRAGMENT);
1478 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1479 assert(brw->gen >= 6);
1480
1481 this->current_annotation = "compute sample id";
1482 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::int_type);
1483
1484 if (key->compute_sample_id) {
1485 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1486 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1487 t2.type = BRW_REGISTER_TYPE_UW;
1488
1489 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1490 * 8x multisampling, subspan 0 will represent sample N (where N
1491 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1492 * 7. We can find the value of N by looking at R0.0 bits 7:6
1493 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1494 * (since samples are always delivered in pairs). That is, we
1495 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1496 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1497 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1498 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1499 * populating a temporary variable with the sequence (0, 1, 2, 3),
1500 * and then reading from it using vstride=1, width=4, hstride=0.
1501 * These computations hold good for 4x multisampling as well.
1502 *
1503 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1504 * the first four slots are sample 0 of subspan 0; the next four
1505 * are sample 1 of subspan 0; the third group is sample 0 of
1506 * subspan 1, and finally sample 1 of subspan 1.
1507 */
1508 fs_inst *inst;
1509 inst = emit(BRW_OPCODE_AND, t1,
1510 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1511 fs_reg(0xc0));
1512 inst->force_writemask_all = true;
1513 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1514 inst->force_writemask_all = true;
1515 /* This works for both SIMD8 and SIMD16 */
1516 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1517 inst->force_writemask_all = true;
1518 /* This special instruction takes care of setting vstride=1,
1519 * width=4, hstride=0 of t2 during an ADD instruction.
1520 */
1521 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1522 } else {
1523 /* As per GL_ARB_sample_shading specification:
1524 * "When rendering to a non-multisample buffer, or if multisample
1525 * rasterization is disabled, gl_SampleID will always be zero."
1526 */
1527 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1528 }
1529
1530 return reg;
1531 }
1532
1533 fs_reg
1534 fs_visitor::fix_math_operand(fs_reg src)
1535 {
1536 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1537 * might be able to do better by doing execsize = 1 math and then
1538 * expanding that result out, but we would need to be careful with
1539 * masking.
1540 *
1541 * The hardware ignores source modifiers (negate and abs) on math
1542 * instructions, so we also move to a temp to set those up.
1543 */
1544 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1545 !src.abs && !src.negate)
1546 return src;
1547
1548 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1549 * operands to math
1550 */
1551 if (brw->gen >= 7 && src.file != IMM)
1552 return src;
1553
1554 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1555 expanded.type = src.type;
1556 emit(BRW_OPCODE_MOV, expanded, src);
1557 return expanded;
1558 }
1559
1560 fs_inst *
1561 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1562 {
1563 switch (opcode) {
1564 case SHADER_OPCODE_RCP:
1565 case SHADER_OPCODE_RSQ:
1566 case SHADER_OPCODE_SQRT:
1567 case SHADER_OPCODE_EXP2:
1568 case SHADER_OPCODE_LOG2:
1569 case SHADER_OPCODE_SIN:
1570 case SHADER_OPCODE_COS:
1571 break;
1572 default:
1573 unreachable("not reached: bad math opcode");
1574 }
1575
1576 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1577 * might be able to do better by doing execsize = 1 math and then
1578 * expanding that result out, but we would need to be careful with
1579 * masking.
1580 *
1581 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1582 * instructions, so we also move to a temp to set those up.
1583 */
1584 if (brw->gen == 6 || brw->gen == 7)
1585 src = fix_math_operand(src);
1586
1587 fs_inst *inst = emit(opcode, dst, src);
1588
1589 if (brw->gen < 6) {
1590 inst->base_mrf = 2;
1591 inst->mlen = dispatch_width / 8;
1592 }
1593
1594 return inst;
1595 }
1596
1597 fs_inst *
1598 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1599 {
1600 int base_mrf = 2;
1601 fs_inst *inst;
1602
1603 if (brw->gen >= 8) {
1604 inst = emit(opcode, dst, src0, src1);
1605 } else if (brw->gen >= 6) {
1606 src0 = fix_math_operand(src0);
1607 src1 = fix_math_operand(src1);
1608
1609 inst = emit(opcode, dst, src0, src1);
1610 } else {
1611 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1612 * "Message Payload":
1613 *
1614 * "Operand0[7]. For the INT DIV functions, this operand is the
1615 * denominator."
1616 * ...
1617 * "Operand1[7]. For the INT DIV functions, this operand is the
1618 * numerator."
1619 */
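/* Our src0 is the numerator and src1 the denominator, so swap them for the
 * INT DIV opcodes: the denominator has to land in Operand0 of the message
 * and the numerator in Operand1.
 */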
1620 bool is_int_div = opcode != SHADER_OPCODE_POW;
1621 fs_reg &op0 = is_int_div ? src1 : src0;
1622 fs_reg &op1 = is_int_div ? src0 : src1;
1623
1624 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1625 inst = emit(opcode, dst, op0, reg_null_f);
1626
1627 inst->base_mrf = base_mrf;
1628 inst->mlen = 2 * dispatch_width / 8;
1629 }
1630 return inst;
1631 }
1632
1633 void
1634 fs_visitor::assign_curb_setup()
1635 {
1636 if (dispatch_width == 8) {
1637 prog_data->dispatch_grf_start_reg = payload.num_regs;
1638 } else {
1639 assert(stage == MESA_SHADER_FRAGMENT);
1640 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1641 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1642 }
1643
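/* Push constants are uploaded in whole GRFs of 8 floats each, so round the
 * parameter count up and express the read length in registers.
 */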
1644 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1645
1646 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1647 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1648 for (unsigned int i = 0; i < inst->sources; i++) {
1649 if (inst->src[i].file == UNIFORM) {
1650 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1651 int constant_nr;
1652 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1653 constant_nr = push_constant_loc[uniform_nr];
1654 } else {
1655 /* Section 5.11 of the OpenGL 4.1 spec says:
1656 * "Out-of-bounds reads return undefined values, which include
1657 * values from other variables of the active program or zero."
1658 * Just return the first push constant.
1659 */
1660 constant_nr = 0;
1661 }
1662
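/* Eight 32-bit constants fit in one GRF, so constant_nr / 8 picks the
 * register and constant_nr % 8 the component within it.
 */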
1663 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1664 constant_nr / 8,
1665 constant_nr % 8);
1666
1667 inst->src[i].file = HW_REG;
1668 inst->src[i].fixed_hw_reg = byte_offset(
1669 retype(brw_reg, inst->src[i].type),
1670 inst->src[i].subreg_offset);
1671 }
1672 }
1673 }
1674 }
1675
1676 void
1677 fs_visitor::calculate_urb_setup()
1678 {
1679 assert(stage == MESA_SHADER_FRAGMENT);
1680 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1681 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1682
1683 memset(prog_data->urb_setup, -1,
1684 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1685
1686 int urb_next = 0;
1687 /* Figure out where each of the incoming setup attributes lands. */
1688 if (brw->gen >= 6) {
1689 if (_mesa_bitcount_64(prog->InputsRead &
1690 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1691 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1692 * first 16 varying inputs, so we can put them wherever we want.
1693 * Just put them in order.
1694 *
1695 * This is useful because it means that (a) inputs not used by the
1696 * fragment shader won't take up valuable register space, and (b) we
1697 * won't have to recompile the fragment shader if it gets paired with
1698 * a different vertex (or geometry) shader.
1699 */
1700 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1701 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1702 BITFIELD64_BIT(i)) {
1703 prog_data->urb_setup[i] = urb_next++;
1704 }
1705 }
1706 } else {
1707 /* We have enough input varyings that the SF/SBE pipeline stage can't
1708 * arbitrarily rearrange them to suit our whim; we have to put them
1709 * in an order that matches the output of the previous pipeline stage
1710 * (geometry or vertex shader).
1711 */
1712 struct brw_vue_map prev_stage_vue_map;
1713 brw_compute_vue_map(brw, &prev_stage_vue_map,
1714 key->input_slots_valid);
1715 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1716 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1717 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1718 slot++) {
1719 int varying = prev_stage_vue_map.slot_to_varying[slot];
1720 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1721 * unused.
1722 */
1723 if (varying != BRW_VARYING_SLOT_COUNT &&
1724 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1725 BITFIELD64_BIT(varying))) {
1726 prog_data->urb_setup[varying] = slot - first_slot;
1727 }
1728 }
1729 urb_next = prev_stage_vue_map.num_slots - first_slot;
1730 }
1731 } else {
1732 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1733 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1734 /* Point size is packed into the header, not as a general attribute */
1735 if (i == VARYING_SLOT_PSIZ)
1736 continue;
1737
1738 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1739 /* The back color slot is skipped when the front color is
1740 * also written to. In addition, some slots can be
1741 * written in the vertex shader and not read in the
1742 * fragment shader. So the register number must always be
1743 * incremented, mapped or not.
1744 */
1745 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1746 prog_data->urb_setup[i] = urb_next;
1747 urb_next++;
1748 }
1749 }
1750
1751 /*
1752 * It's an FS-only attribute, and we did the interpolation for this
1753 * attribute in the SF thread. So, count it here, too.
1754 *
1755 * See compile_sf_prog() for more info.
1756 */
1757 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1758 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1759 }
1760
1761 prog_data->num_varying_inputs = urb_next;
1762 }
1763
1764 void
1765 fs_visitor::assign_urb_setup()
1766 {
1767 assert(stage == MESA_SHADER_FRAGMENT);
1768 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1769
1770 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1771
1772 /* Offset all the urb_setup[] index by the actual position of the
1773 * setup regs, now that the location of the constants has been chosen.
1774 */
1775 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1776 if (inst->opcode == FS_OPCODE_LINTERP) {
1777 assert(inst->src[2].file == HW_REG);
1778 inst->src[2].fixed_hw_reg.nr += urb_start;
1779 }
1780
1781 if (inst->opcode == FS_OPCODE_CINTERP) {
1782 assert(inst->src[0].file == HW_REG);
1783 inst->src[0].fixed_hw_reg.nr += urb_start;
1784 }
1785 }
1786
1787 /* Each attribute is 4 setup channels, each of which is half a reg. */
1788 this->first_non_payload_grf =
1789 urb_start + prog_data->num_varying_inputs * 2;
1790 }
1791
1792 /**
1793 * Split large virtual GRFs into separate components if we can.
1794 *
1795 * This is mostly duplicated with what brw_fs_vector_splitting does,
1796 * but that's really conservative because it's afraid of doing
1797 * splitting that doesn't result in real progress after the rest of
1798 * the optimization phases, which would cause infinite looping in
1799 * optimization. We can do it once here, safely. This also has the
1800 * opportunity to split interpolated values, or maybe even uniforms,
1801 * which we don't have at the IR level.
1802 *
1803 * We want to split, because virtual GRFs are what we register
1804 * allocate and spill (due to contiguousness requirements for some
1805 * instructions), and they're what we naturally generate in the
1806 * codegen process, but most virtual GRFs don't actually need to be
1807 * contiguous sets of GRFs. If we split, we'll end up with reduced
1808 * live intervals and better dead code elimination and coalescing.
1809 */
1810 void
1811 fs_visitor::split_virtual_grfs()
1812 {
1813 int num_vars = this->virtual_grf_count;
1814
1815 /* Count the total number of registers */
1816 int reg_count = 0;
1817 int vgrf_to_reg[num_vars];
1818 for (int i = 0; i < num_vars; i++) {
1819 vgrf_to_reg[i] = reg_count;
1820 reg_count += virtual_grf_sizes[i];
1821 }
1822
1823 /* An array of "split points". For each register slot, this indicates
1824 * if this slot can be separated from the previous slot. Every time an
1825 * instruction uses multiple elements of a register (as a source or
1826 * destination), we mark the used slots as inseparable. Then we go
1827 * through and split the registers into the smallest pieces we can.
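 * For example, a virtual GRF that is only ever written and read one register
 * at a time splits into single-register pieces, while one written by a
 * multi-register send stays intact.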
1828 */
1829 bool split_points[reg_count];
1830 memset(split_points, 0, sizeof(split_points));
1831
1832 /* Mark all used registers as fully splittable */
1833 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1834 if (inst->dst.file == GRF) {
1835 int reg = vgrf_to_reg[inst->dst.reg];
1836 for (int j = 1; j < this->virtual_grf_sizes[inst->dst.reg]; j++)
1837 split_points[reg + j] = true;
1838 }
1839
1840 for (int i = 0; i < inst->sources; i++) {
1841 if (inst->src[i].file == GRF) {
1842 int reg = vgrf_to_reg[inst->src[i].reg];
1843 for (int j = 1; j < this->virtual_grf_sizes[inst->src[i].reg]; j++)
1844 split_points[reg + j] = true;
1845 }
1846 }
1847 }
1848
1849 if (brw->has_pln &&
1850 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1851 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1852 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1853 * Gen6, that was the only supported interpolation mode, and since Gen6,
1854 * delta_x and delta_y are in fixed hardware registers.
1855 */
1856 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1857 split_points[vgrf_to_reg[vgrf] + 1] = false;
1858 }
1859
1860 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1861 if (inst->dst.file == GRF) {
1862 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1863 for (int j = 1; j < inst->regs_written; j++)
1864 split_points[reg + j] = false;
1865 }
1866 for (int i = 0; i < inst->sources; i++) {
1867 if (inst->src[i].file == GRF) {
1868 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1869 for (int j = 1; j < inst->regs_read(this, i); j++)
1870 split_points[reg + j] = false;
1871 }
1872 }
1873 }
1874
1875 int new_virtual_grf[reg_count];
1876 int new_reg_offset[reg_count];
1877
1878 int reg = 0;
1879 for (int i = 0; i < num_vars; i++) {
1880 /* The first one should always be 0 as a quick sanity check. */
1881 assert(split_points[reg] == false);
1882
1883 /* j = 0 case */
1884 new_reg_offset[reg] = 0;
1885 reg++;
1886 int offset = 1;
1887
1888 /* j > 0 case */
1889 for (int j = 1; j < virtual_grf_sizes[i]; j++) {
1890 /* If this is a split point, reset the offset to 0 and allocate a
1891 * new virtual GRF covering the previous `offset` registers.
1892 */
1893 if (split_points[reg]) {
1894 assert(offset <= MAX_VGRF_SIZE);
1895 int grf = virtual_grf_alloc(offset);
1896 for (int k = reg - offset; k < reg; k++)
1897 new_virtual_grf[k] = grf;
1898 offset = 0;
1899 }
1900 new_reg_offset[reg] = offset;
1901 offset++;
1902 reg++;
1903 }
1904
1905 /* The last one gets the original register number */
1906 assert(offset <= MAX_VGRF_SIZE);
1907 virtual_grf_sizes[i] = offset;
1908 for (int k = reg - offset; k < reg; k++)
1909 new_virtual_grf[k] = i;
1910 }
1911 assert(reg == reg_count);
1912
1913 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1914 if (inst->dst.file == GRF) {
1915 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1916 inst->dst.reg = new_virtual_grf[reg];
1917 inst->dst.reg_offset = new_reg_offset[reg];
1918 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
1919 }
1920 for (int i = 0; i < inst->sources; i++) {
1921 if (inst->src[i].file == GRF) {
1922 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1923 inst->src[i].reg = new_virtual_grf[reg];
1924 inst->src[i].reg_offset = new_reg_offset[reg];
1925 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
1926 }
1927 }
1928 }
1929 invalidate_live_intervals();
1930 }
1931
1932 /**
1933 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1934 *
1935 * During code generation, we create tons of temporary variables, many of
1936 * which get immediately killed and are never used again. Yet, in later
1937 * optimization and analysis passes, such as compute_live_intervals, we need
1938 * to loop over all the virtual GRFs. Compacting them can save a lot of
1939 * overhead.
1940 */
1941 bool
1942 fs_visitor::compact_virtual_grfs()
1943 {
1944 bool progress = false;
1945 int remap_table[this->virtual_grf_count];
1946 memset(remap_table, -1, sizeof(remap_table));
1947
1948 /* Mark which virtual GRFs are used. */
1949 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1950 if (inst->dst.file == GRF)
1951 remap_table[inst->dst.reg] = 0;
1952
1953 for (int i = 0; i < inst->sources; i++) {
1954 if (inst->src[i].file == GRF)
1955 remap_table[inst->src[i].reg] = 0;
1956 }
1957 }
1958
1959 /* Compact the GRF arrays. */
1960 int new_index = 0;
1961 for (int i = 0; i < this->virtual_grf_count; i++) {
1962 if (remap_table[i] == -1) {
1963 /* We just found an unused register. This means that we are
1964 * actually going to compact something.
1965 */
1966 progress = true;
1967 } else {
1968 remap_table[i] = new_index;
1969 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1970 invalidate_live_intervals();
1971 ++new_index;
1972 }
1973 }
1974
1975 this->virtual_grf_count = new_index;
1976
1977 /* Patch all the instructions to use the newly renumbered registers */
1978 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1979 if (inst->dst.file == GRF)
1980 inst->dst.reg = remap_table[inst->dst.reg];
1981
1982 for (int i = 0; i < inst->sources; i++) {
1983 if (inst->src[i].file == GRF)
1984 inst->src[i].reg = remap_table[inst->src[i].reg];
1985 }
1986 }
1987
1988 /* Patch all the references to delta_x/delta_y, since they're used in
1989 * register allocation. If they're unused, switch them to BAD_FILE so
1990 * we don't think some random VGRF is delta_x/delta_y.
1991 */
1992 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
1993 if (delta_x[i].file == GRF) {
1994 if (remap_table[delta_x[i].reg] != -1) {
1995 delta_x[i].reg = remap_table[delta_x[i].reg];
1996 } else {
1997 delta_x[i].file = BAD_FILE;
1998 }
1999 }
2000 }
2001 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2002 if (delta_y[i].file == GRF) {
2003 if (remap_table[delta_y[i].reg] != -1) {
2004 delta_y[i].reg = remap_table[delta_y[i].reg];
2005 } else {
2006 delta_y[i].file = BAD_FILE;
2007 }
2008 }
2009 }
2010
2011 return progress;
2012 }
2013
2014 /*
2015 * Implements array access of uniforms by inserting a
2016 * PULL_CONSTANT_LOAD instruction.
2017 *
2018 * Unlike temporary GRF array access (which we don't support, due to
2019 * the difficulty of doing relative addressing on instruction
2020 * destinations), we could potentially do array access of uniforms
2021 * that were loaded into GRF space as push constants. In the
2022 * real-world usage we've seen, though, the arrays involved are always
2023 * larger than we could load as push constants, so just always move all
2024 * uniform array access out to a pull constant buffer.
2025 */
2026 void
2027 fs_visitor::move_uniform_array_access_to_pull_constants()
2028 {
2029 if (dispatch_width != 8)
2030 return;
2031
2032 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2033 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2034
2035 /* Walk through and find array access of uniforms. Put a copy of that
2036 * uniform in the pull constant buffer.
2037 *
2038 * Note that we don't move constant-indexed accesses to arrays. No
2039 * testing has been done of the performance impact of this choice.
2040 */
2041 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2042 for (int i = 0 ; i < inst->sources; i++) {
2043 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2044 continue;
2045
2046 int uniform = inst->src[i].reg;
2047
2048 /* If this array isn't already present in the pull constant buffer,
2049 * add it.
2050 */
2051 if (pull_constant_loc[uniform] == -1) {
2052 const gl_constant_value **values = &stage_prog_data->param[uniform];
2053
2054 assert(param_size[uniform]);
2055
2056 for (int j = 0; j < param_size[uniform]; j++) {
2057 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2058
2059 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2060 values[j];
2061 }
2062 }
2063 }
2064 }
2065 }
2066
2067 /**
2068 * Assign UNIFORM file registers to either push constants or pull constants.
2069 *
2070 * We allow a fragment shader to use more than the GL-specified minimum
2071 * maximum number of fragment shader uniform components (64). If there
2072 * are too many of them, they would fill up the entire register file, so
2073 * this pass pushes the excess out to the pull constant buffer and
2074 * updates the program to load them from there.
2075 */
2076 void
2077 fs_visitor::assign_constant_locations()
2078 {
2079 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2080 if (dispatch_width != 8)
2081 return;
2082
2083 /* Find which UNIFORM registers are still in use. */
2084 bool is_live[uniforms];
2085 for (unsigned int i = 0; i < uniforms; i++) {
2086 is_live[i] = false;
2087 }
2088
2089 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2090 for (int i = 0; i < inst->sources; i++) {
2091 if (inst->src[i].file != UNIFORM)
2092 continue;
2093
2094 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2095 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2096 is_live[constant_nr] = true;
2097 }
2098 }
2099
2100 /* Only allow 16 registers (128 uniform components) as push constants.
2101 *
2102 * Just demote the end of the list. We could probably do better
2103 * here, demoting things that are rarely used in the program first.
2104 *
2105 * If changing this value, note the limitation about total_regs in
2106 * brw_curbe.c.
2107 */
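/* 16 GRFs, at 8 32-bit components per register, is 128 scalar uniform
* components.
*/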
2108 unsigned int max_push_components = 16 * 8;
2109 unsigned int num_push_constants = 0;
2110
2111 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2112
2113 for (unsigned int i = 0; i < uniforms; i++) {
2114 if (!is_live[i] || pull_constant_loc[i] != -1) {
2115 /* This UNIFORM register is either dead, or has already been demoted
2116 * to a pull const. Mark it as no longer living in the param[] array.
2117 */
2118 push_constant_loc[i] = -1;
2119 continue;
2120 }
2121
2122 if (num_push_constants < max_push_components) {
2123 /* Retain as a push constant. Record the location in the params[]
2124 * array.
2125 */
2126 push_constant_loc[i] = num_push_constants++;
2127 } else {
2128 /* Demote to a pull constant. */
2129 push_constant_loc[i] = -1;
2130
2131 int pull_index = stage_prog_data->nr_pull_params++;
2132 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2133 pull_constant_loc[i] = pull_index;
2134 }
2135 }
2136
2137 stage_prog_data->nr_params = num_push_constants;
2138
2139 /* Up until now, the param[] array has been indexed by reg + reg_offset
2140 * of UNIFORM registers. Condense it to only contain the uniforms we
2141 * chose to upload as push constants.
2142 */
2143 for (unsigned int i = 0; i < uniforms; i++) {
2144 int remapped = push_constant_loc[i];
2145
2146 if (remapped == -1)
2147 continue;
2148
2149 assert(remapped <= (int)i);
2150 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2151 }
2152 }
2153
2154 /**
2155 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2156 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2157 */
2158 void
2159 fs_visitor::demote_pull_constants()
2160 {
2161 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2162 for (int i = 0; i < inst->sources; i++) {
2163 if (inst->src[i].file != UNIFORM)
2164 continue;
2165
2166 int pull_index = pull_constant_loc[inst->src[i].reg +
2167 inst->src[i].reg_offset];
2168 if (pull_index == -1)
2169 continue;
2170
2171 /* Set up the annotation tracking for newly generated instructions. */
2172 base_ir = inst->ir;
2173 current_annotation = inst->annotation;
2174
2175 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2176 fs_reg dst = fs_reg(this, glsl_type::float_type);
2177
2178 /* Generate a pull load into dst. */
2179 if (inst->src[i].reladdr) {
2180 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2181 surf_index,
2182 *inst->src[i].reladdr,
2183 pull_index);
2184 inst->insert_before(block, &list);
2185 inst->src[i].reladdr = NULL;
2186 } else {
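/* The uniform pull constant load fetches a whole aligned vec4, so round
* the scalar's byte offset down to a 16-byte boundary and use a smear to
* select the dword we actually want within it.
*/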
2187 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2188 fs_inst *pull =
2189 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2190 dst, surf_index, offset);
2191 inst->insert_before(block, pull);
2192 inst->src[i].set_smear(pull_index & 3);
2193 }
2194
2195 /* Rewrite the instruction to use the temporary VGRF. */
2196 inst->src[i].file = GRF;
2197 inst->src[i].reg = dst.reg;
2198 inst->src[i].reg_offset = 0;
2199 inst->src[i].width = dispatch_width;
2200 }
2201 }
2202 invalidate_live_intervals();
2203 }
2204
2205 bool
2206 fs_visitor::opt_algebraic()
2207 {
2208 bool progress = false;
2209
2210 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2211 switch (inst->opcode) {
2212 case BRW_OPCODE_MUL:
2213 if (inst->src[1].file != IMM)
2214 continue;
2215
2216 /* a * 1.0 = a */
2217 if (inst->src[1].is_one()) {
2218 inst->opcode = BRW_OPCODE_MOV;
2219 inst->src[1] = reg_undef;
2220 progress = true;
2221 break;
2222 }
2223
2224 /* a * 0.0 = 0.0 */
2225 if (inst->src[1].is_zero()) {
2226 inst->opcode = BRW_OPCODE_MOV;
2227 inst->src[0] = inst->src[1];
2228 inst->src[1] = reg_undef;
2229 progress = true;
2230 break;
2231 }
2232
2233 break;
2234 case BRW_OPCODE_ADD:
2235 if (inst->src[1].file != IMM)
2236 continue;
2237
2238 /* a + 0.0 = a */
2239 if (inst->src[1].is_zero()) {
2240 inst->opcode = BRW_OPCODE_MOV;
2241 inst->src[1] = reg_undef;
2242 progress = true;
2243 break;
2244 }
2245 break;
2246 case BRW_OPCODE_OR:
2247 if (inst->src[0].equals(inst->src[1])) {
2248 inst->opcode = BRW_OPCODE_MOV;
2249 inst->src[1] = reg_undef;
2250 progress = true;
2251 break;
2252 }
2253 break;
2254 case BRW_OPCODE_LRP:
2255 if (inst->src[1].equals(inst->src[2])) {
2256 inst->opcode = BRW_OPCODE_MOV;
2257 inst->src[0] = inst->src[1];
2258 inst->src[1] = reg_undef;
2259 inst->src[2] = reg_undef;
2260 progress = true;
2261 break;
2262 }
2263 break;
2264 case BRW_OPCODE_SEL:
2265 if (inst->src[0].equals(inst->src[1])) {
2266 inst->opcode = BRW_OPCODE_MOV;
2267 inst->src[1] = reg_undef;
2268 inst->predicate = BRW_PREDICATE_NONE;
2269 inst->predicate_inverse = false;
2270 progress = true;
2271 } else if (inst->saturate && inst->src[1].file == IMM) {
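/* A saturating SEL against an immediate is a clamped min/max: with
* .sat the result already lands in [0.0, 1.0], so a min (L/LE) against
* an immediate >= 1.0 or a max (G/GE) against an immediate <= 0.0
* changes nothing and the SEL can become a MOV.
*/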
2272 switch (inst->conditional_mod) {
2273 case BRW_CONDITIONAL_LE:
2274 case BRW_CONDITIONAL_L:
2275 switch (inst->src[1].type) {
2276 case BRW_REGISTER_TYPE_F:
2277 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2278 inst->opcode = BRW_OPCODE_MOV;
2279 inst->src[1] = reg_undef;
2280 progress = true;
2281 }
2282 break;
2283 default:
2284 break;
2285 }
2286 break;
2287 case BRW_CONDITIONAL_GE:
2288 case BRW_CONDITIONAL_G:
2289 switch (inst->src[1].type) {
2290 case BRW_REGISTER_TYPE_F:
2291 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2292 inst->opcode = BRW_OPCODE_MOV;
2293 inst->src[1] = reg_undef;
2294 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2295 progress = true;
2296 }
2297 break;
2298 default:
2299 break;
2300 }
2301 default:
2302 break;
2303 }
2304 }
2305 break;
2306 case SHADER_OPCODE_RCP: {
2307 fs_inst *prev = (fs_inst *)inst->prev;
2308 if (prev->opcode == SHADER_OPCODE_SQRT) {
2309 if (inst->src[0].equals(prev->dst)) {
2310 inst->opcode = SHADER_OPCODE_RSQ;
2311 inst->src[0] = prev->src[0];
2312 progress = true;
2313 }
2314 }
2315 break;
2316 }
2317 default:
2318 break;
2319 }
2320 }
2321
2322 return progress;
2323 }
2324
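/**
* Register renaming: when a virtual GRF is completely overwritten (outside
* of control flow) after having already been written, give the new value a
* fresh VGRF number and point later reads at it. Splitting reuses of a
* register apart like this shortens live ranges, which helps later
* coalescing and register allocation.
*/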
2325 bool
2326 fs_visitor::opt_register_renaming()
2327 {
2328 bool progress = false;
2329 int depth = 0;
2330
2331 int remap[virtual_grf_count];
2332 memset(remap, -1, sizeof(int) * virtual_grf_count);
2333
2334 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2335 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2336 depth++;
2337 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2338 inst->opcode == BRW_OPCODE_WHILE) {
2339 depth--;
2340 }
2341
2342 /* Rewrite instruction sources. */
2343 for (int i = 0; i < inst->sources; i++) {
2344 if (inst->src[i].file == GRF &&
2345 remap[inst->src[i].reg] != -1 &&
2346 remap[inst->src[i].reg] != inst->src[i].reg) {
2347 inst->src[i].reg = remap[inst->src[i].reg];
2348 progress = true;
2349 }
2350 }
2351
2352 const int dst = inst->dst.reg;
2353
2354 if (depth == 0 &&
2355 inst->dst.file == GRF &&
2356 virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 &&
2357 !inst->is_partial_write()) {
2358 if (remap[dst] == -1) {
2359 remap[dst] = dst;
2360 } else {
2361 remap[dst] = virtual_grf_alloc(inst->dst.width / 8);
2362 inst->dst.reg = remap[dst];
2363 progress = true;
2364 }
2365 } else if (inst->dst.file == GRF &&
2366 remap[dst] != -1 &&
2367 remap[dst] != dst) {
2368 inst->dst.reg = remap[dst];
2369 progress = true;
2370 }
2371 }
2372
2373 if (progress) {
2374 invalidate_live_intervals();
2375
2376 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2377 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2378 delta_x[i].reg = remap[delta_x[i].reg];
2379 }
2380 }
2381 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2382 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2383 delta_y[i].reg = remap[delta_y[i].reg];
2384 }
2385 }
2386 }
2387
2388 return progress;
2389 }
2390
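/**
* Looks for MOVs from a GRF to an MRF and, where it is safe, rewrites the
* instruction that computed the GRF to write straight into the MRF instead,
* so the MOV can be removed (only useful on Gens that still have MRFs).
*/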
2391 bool
2392 fs_visitor::compute_to_mrf()
2393 {
2394 bool progress = false;
2395 int next_ip = 0;
2396
2397 /* No MRFs on Gen >= 7. */
2398 if (brw->gen >= 7)
2399 return false;
2400
2401 calculate_live_intervals();
2402
2403 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2404 int ip = next_ip;
2405 next_ip++;
2406
2407 if (inst->opcode != BRW_OPCODE_MOV ||
2408 inst->is_partial_write() ||
2409 inst->dst.file != MRF || inst->src[0].file != GRF ||
2410 inst->dst.type != inst->src[0].type ||
2411 inst->src[0].abs || inst->src[0].negate ||
2412 !inst->src[0].is_contiguous() ||
2413 inst->src[0].subreg_offset)
2414 continue;
2415
2416 /* Work out which hardware MRF registers are written by this
2417 * instruction.
2418 */
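/* With COMPR4 addressing, a compressed write lands in mrf and mrf + 4;
* an ordinary SIMD16 write covers two consecutive MRFs.
*/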
2419 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2420 int mrf_high;
2421 if (inst->dst.reg & BRW_MRF_COMPR4) {
2422 mrf_high = mrf_low + 4;
2423 } else if (inst->exec_size == 16) {
2424 mrf_high = mrf_low + 1;
2425 } else {
2426 mrf_high = mrf_low;
2427 }
2428
2429 /* Can't compute-to-MRF this GRF if someone else was going to
2430 * read it later.
2431 */
2432 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2433 continue;
2434
2435 /* Found a move of a GRF to a MRF. Let's see if we can rewrite
2436 * the instruction that produced this GRF to write into the MRF instead.
2437 */
2438 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2439 if (scan_inst->dst.file == GRF &&
2440 scan_inst->dst.reg == inst->src[0].reg) {
2441 /* Found the last instruction to write the register we want to turn
2442 * into a compute-to-MRF.
2443 */
2444
2445 /* If this one instruction didn't populate all the
2446 * channels, bail. We might be able to rewrite everything
2447 * that writes that reg, but it would require smarter
2448 * tracking to delay the rewriting until complete success.
2449 */
2450 if (scan_inst->is_partial_write())
2451 break;
2452
2453 /* Things returning more than one register would need us to
2454 * understand coalescing out more than one MOV at a time.
2455 */
2456 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2457 break;
2458
2459 /* SEND instructions can't have MRF as a destination. */
2460 if (scan_inst->mlen)
2461 break;
2462
2463 if (brw->gen == 6) {
2464 /* gen6 math instructions must have the destination be
2465 * GRF, so no compute-to-MRF for them.
2466 */
2467 if (scan_inst->is_math()) {
2468 break;
2469 }
2470 }
2471
2472 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2473 /* Found the creator of our MRF's source value. */
2474 scan_inst->dst.file = MRF;
2475 scan_inst->dst.reg = inst->dst.reg;
2476 scan_inst->saturate |= inst->saturate;
2477 inst->remove(block);
2478 progress = true;
2479 }
2480 break;
2481 }
2482
2483 /* We don't handle control flow here. Most computation of
2484 * values that end up in MRFs are shortly before the MRF
2485 * write anyway.
2486 */
2487 if (block->start() == scan_inst)
2488 break;
2489
2490 /* You can't read from an MRF, so if someone else reads our
2491 * MRF's source GRF that we wanted to rewrite, that stops us.
2492 */
2493 bool interfered = false;
2494 for (int i = 0; i < scan_inst->sources; i++) {
2495 if (scan_inst->src[i].file == GRF &&
2496 scan_inst->src[i].reg == inst->src[0].reg &&
2497 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2498 interfered = true;
2499 }
2500 }
2501 if (interfered)
2502 break;
2503
2504 if (scan_inst->dst.file == MRF) {
2505 /* If somebody else writes our MRF here, we can't
2506 * compute-to-MRF before that.
2507 */
2508 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2509 int scan_mrf_high;
2510
2511 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2512 scan_mrf_high = scan_mrf_low + 4;
2513 } else if (scan_inst->exec_size == 16) {
2514 scan_mrf_high = scan_mrf_low + 1;
2515 } else {
2516 scan_mrf_high = scan_mrf_low;
2517 }
2518
2519 if (mrf_low == scan_mrf_low ||
2520 mrf_low == scan_mrf_high ||
2521 mrf_high == scan_mrf_low ||
2522 mrf_high == scan_mrf_high) {
2523 break;
2524 }
2525 }
2526
2527 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2528 /* Found a SEND instruction, which means that there are
2529 * live values in MRFs from base_mrf to base_mrf +
2530 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2531 * above it.
2532 */
2533 if (mrf_low >= scan_inst->base_mrf &&
2534 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2535 break;
2536 }
2537 if (mrf_high >= scan_inst->base_mrf &&
2538 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2539 break;
2540 }
2541 }
2542 }
2543 }
2544
2545 if (progress)
2546 invalidate_live_intervals();
2547
2548 return progress;
2549 }
2550
2551 /**
2552 * Emit a dedicated "replicated data" clear shader: a single uniform color
2553 * is broadcast to every render target using FS_OPCODE_REP_FB_WRITE.
2554 */
2555 void
2556 fs_visitor::emit_repclear_shader()
2557 {
2558 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2559 int base_mrf = 1;
2560 int color_mrf = base_mrf + 2;
2561
2562 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2563 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2564 mov->force_writemask_all = true;
2565
2566 fs_inst *write;
2567 if (key->nr_color_regions == 1) {
2568 write = emit(FS_OPCODE_REP_FB_WRITE);
2569 write->saturate = key->clamp_fragment_color;
2570 write->base_mrf = color_mrf;
2571 write->target = 0;
2572 write->header_present = false;
2573 write->mlen = 1;
2574 } else {
2575 assume(key->nr_color_regions > 0);
2576 for (int i = 0; i < key->nr_color_regions; ++i) {
2577 write = emit(FS_OPCODE_REP_FB_WRITE);
2578 write->saturate = key->clamp_fragment_color;
2579 write->base_mrf = base_mrf;
2580 write->target = i;
2581 write->header_present = true;
2582 write->mlen = 3;
2583 }
2584 }
2585 write->eot = true;
2586
2587 calculate_cfg();
2588
2589 assign_constant_locations();
2590 assign_curb_setup();
2591
2592 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2593 assert(mov->src[0].file == HW_REG);
2594 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2595 }
2596
2597 /**
2598 * Walks through basic blocks, looking for repeated MRF writes and
2599 * removing the later ones.
2600 */
2601 bool
2602 fs_visitor::remove_duplicate_mrf_writes()
2603 {
2604 fs_inst *last_mrf_move[16];
2605 bool progress = false;
2606
2607 /* We'd need to update the MRF tracking to handle compressed instructions; skip SIMD16. */
2608 if (dispatch_width == 16)
2609 return false;
2610
2611 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2612
2613 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2614 if (inst->is_control_flow()) {
2615 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2616 }
2617
2618 if (inst->opcode == BRW_OPCODE_MOV &&
2619 inst->dst.file == MRF) {
2620 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2621 if (prev_inst && inst->equals(prev_inst)) {
2622 inst->remove(block);
2623 progress = true;
2624 continue;
2625 }
2626 }
2627
2628 /* Clear out the last-write records for MRFs that were overwritten. */
2629 if (inst->dst.file == MRF) {
2630 last_mrf_move[inst->dst.reg] = NULL;
2631 }
2632
2633 if (inst->mlen > 0 && inst->base_mrf != -1) {
2634 /* Found a SEND instruction, which will include two or fewer
2635 * implied MRF writes. We could do better here.
2636 */
2637 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2638 last_mrf_move[inst->base_mrf + i] = NULL;
2639 }
2640 }
2641
2642 /* Clear out any MRF move records whose sources got overwritten. */
2643 if (inst->dst.file == GRF) {
2644 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2645 if (last_mrf_move[i] &&
2646 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2647 last_mrf_move[i] = NULL;
2648 }
2649 }
2650 }
2651
2652 if (inst->opcode == BRW_OPCODE_MOV &&
2653 inst->dst.file == MRF &&
2654 inst->src[0].file == GRF &&
2655 !inst->is_partial_write()) {
2656 last_mrf_move[inst->dst.reg] = inst;
2657 }
2658 }
2659
2660 if (progress)
2661 invalidate_live_intervals();
2662
2663 return progress;
2664 }
2665
2666 static void
2667 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2668 int first_grf, int grf_len)
2669 {
2670 /* Clear the flag for registers that actually got read (as expected). */
2671 for (int i = 0; i < inst->sources; i++) {
2672 int grf;
2673 if (inst->src[i].file == GRF) {
2674 grf = inst->src[i].reg;
2675 } else if (inst->src[i].file == HW_REG &&
2676 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2677 grf = inst->src[i].fixed_hw_reg.nr;
2678 } else {
2679 continue;
2680 }
2681
2682 if (grf >= first_grf &&
2683 grf < first_grf + grf_len) {
2684 deps[grf - first_grf] = false;
2685 if (inst->exec_size == 16)
2686 deps[grf - first_grf + 1] = false;
2687 }
2688 }
2689 }
2690
2691 /**
2692 * Implements this workaround for the original 965:
2693 *
2694 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2695 * check for post destination dependencies on this instruction, software
2696 * must ensure that there is no destination hazard for the case of ‘write
2697 * followed by a posted write’ shown in the following example.
2698 *
2699 * 1. mov r3 0
2700 * 2. send r3.xy <rest of send instruction>
2701 * 3. mov r2 r3
2702 *
2703 * Due to no post-destination dependency check on the ‘send’, the above
2704 * code sequence could have two instructions (1 and 2) in flight at the
2705 * same time that both consider ‘r3’ as the target of their final writes.
2706 */
2707 void
2708 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2709 fs_inst *inst)
2710 {
2711 int write_len = inst->regs_written;
2712 int first_write_grf = inst->dst.reg;
2713 bool needs_dep[BRW_MAX_MRF];
2714 assert(write_len < (int)sizeof(needs_dep) - 1);
2715
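/* Flag every register this instruction writes, then immediately clear the
* flags for any of those registers that the instruction also reads itself.
*/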
2716 memset(needs_dep, false, sizeof(needs_dep));
2717 memset(needs_dep, true, write_len);
2718
2719 clear_deps_for_inst_src(inst, dispatch_width,
2720 needs_dep, first_write_grf, write_len);
2721
2722 /* Walk backwards looking for writes to registers we're writing which
2723 * aren't read since being written. If we hit the start of the program,
2724 * we assume that there are no outstanding dependencies on entry to the
2725 * program.
2726 */
2727 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2728 /* If we hit control flow, assume that there *are* outstanding
2729 * dependencies, and force their cleanup before our instruction.
2730 */
2731 if (block->start() == scan_inst) {
2732 for (int i = 0; i < write_len; i++) {
2733 if (needs_dep[i]) {
2734 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2735 }
2736 }
2737 return;
2738 }
2739
2740 /* We insert our reads as late as possible, on the assumption that any
2741 * instruction other than a MOV that might have left us an outstanding
2742 * dependency has more latency than a MOV does.
2743 */
2744 if (scan_inst->dst.file == GRF) {
2745 for (int i = 0; i < scan_inst->regs_written; i++) {
2746 int reg = scan_inst->dst.reg + i;
2747
2748 if (reg >= first_write_grf &&
2749 reg < first_write_grf + write_len &&
2750 needs_dep[reg - first_write_grf]) {
2751 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2752 needs_dep[reg - first_write_grf] = false;
2753 if (scan_inst->exec_size == 16)
2754 needs_dep[reg - first_write_grf + 1] = false;
2755 }
2756 }
2757 }
2758
2759 /* Clear the flag for registers that actually got read (as expected). */
2760 clear_deps_for_inst_src(scan_inst, dispatch_width,
2761 needs_dep, first_write_grf, write_len);
2762
2763 /* Continue the loop only if we haven't resolved all the dependencies */
2764 int i;
2765 for (i = 0; i < write_len; i++) {
2766 if (needs_dep[i])
2767 break;
2768 }
2769 if (i == write_len)
2770 return;
2771 }
2772 }
2773
2774 /**
2775 * Implements this workaround for the original 965:
2776 *
2777 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2778 * used as a destination register until after it has been sourced by an
2779 * instruction with a different destination register.
2780 */
2781 void
2782 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2783 {
2784 int write_len = inst->regs_written;
2785 int first_write_grf = inst->dst.reg;
2786 bool needs_dep[BRW_MAX_MRF];
2787 assert(write_len < (int)sizeof(needs_dep) - 1);
2788
2789 memset(needs_dep, false, sizeof(needs_dep));
2790 memset(needs_dep, true, write_len);
2791 /* Walk forwards looking for writes to registers we're writing which aren't
2792 * read before being written.
2793 */
2794 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2795 /* If we hit control flow, force resolve all remaining dependencies. */
2796 if (block->end() == scan_inst) {
2797 for (int i = 0; i < write_len; i++) {
2798 if (needs_dep[i])
2799 scan_inst->insert_before(block,
2800 DEP_RESOLVE_MOV(first_write_grf + i));
2801 }
2802 return;
2803 }
2804
2805 /* Clear the flag for registers that actually got read (as expected). */
2806 clear_deps_for_inst_src(scan_inst, dispatch_width,
2807 needs_dep, first_write_grf, write_len);
2808
2809 /* We insert our reads as late as possible since they're reading the
2810 * result of a SEND, which has massive latency.
2811 */
2812 if (scan_inst->dst.file == GRF &&
2813 scan_inst->dst.reg >= first_write_grf &&
2814 scan_inst->dst.reg < first_write_grf + write_len &&
2815 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2816 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2817 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2818 }
2819
2820 /* Continue the loop only if we haven't resolved all the dependencies */
2821 int i;
2822 for (i = 0; i < write_len; i++) {
2823 if (needs_dep[i])
2824 break;
2825 }
2826 if (i == write_len)
2827 return;
2828 }
2829
2830 /* If we hit the end of the program, resolve all remaining dependencies out
2831 * of paranoia.
2832 */
2833 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2834 assert(last_inst->eot);
2835 for (int i = 0; i < write_len; i++) {
2836 if (needs_dep[i])
2837 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2838 }
2839 }
2840
2841 void
2842 fs_visitor::insert_gen4_send_dependency_workarounds()
2843 {
2844 if (brw->gen != 4 || brw->is_g4x)
2845 return;
2846
2847 bool progress = false;
2848
2849 /* Note that we're done with register allocation, so GRF fs_regs always
2850 * have a .reg_offset of 0.
2851 */
2852
2853 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2854 if (inst->mlen != 0 && inst->dst.file == GRF) {
2855 insert_gen4_pre_send_dependency_workarounds(block, inst);
2856 insert_gen4_post_send_dependency_workarounds(block, inst);
2857 progress = true;
2858 }
2859 }
2860
2861 if (progress)
2862 invalidate_live_intervals();
2863 }
2864
2865 /**
2866 * Turns the generic expression-style uniform pull constant load instruction
2867 * into a hardware-specific series of instructions for loading a pull
2868 * constant.
2869 *
2870 * The expression style allows the CSE pass before this to optimize out
2871 * repeated loads from the same offset, and gives the pre-register-allocation
2872 * scheduling full flexibility, while the conversion to native instructions
2873 * allows the post-register-allocation scheduler the best information
2874 * possible.
2875 *
2876 * Note that execution masking for setting up pull constant loads is special:
2877 * the channels that need to be written are unrelated to the current execution
2878 * mask, since a later instruction will use one of the result channels as a
2879 * source operand for all 8 or 16 of its channels.
2880 */
2881 void
2882 fs_visitor::lower_uniform_pull_constant_loads()
2883 {
2884 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2885 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2886 continue;
2887
2888 if (brw->gen >= 7) {
2889 /* The offset arg before was a vec4-aligned byte offset. We need to
2890 * turn it into a dword offset.
2891 */
2892 fs_reg const_offset_reg = inst->src[1];
2893 assert(const_offset_reg.file == IMM &&
2894 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2895 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2896 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2897
2898 /* This is actually going to be a MOV, but since only the first dword
2899 * is accessed, we have a special opcode to do just that one. Note
2900 * that this needs to be an operation that will be considered a def
2901 * by live variable analysis, or register allocation will explode.
2902 */
2903 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2904 8, payload, const_offset_reg);
2905 setup->force_writemask_all = true;
2906
2907 setup->ir = inst->ir;
2908 setup->annotation = inst->annotation;
2909 inst->insert_before(block, setup);
2910
2911 /* Similarly, this will only populate the first 4 channels of the
2912 * result register (since we only use smear values from 0-3), but we
2913 * don't tell the optimizer.
2914 */
2915 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2916 inst->src[1] = payload;
2917
2918 invalidate_live_intervals();
2919 } else {
2920 /* Before register allocation, we didn't tell the scheduler about the
2921 * MRF we use. We know it's safe to use this MRF because nothing
2922 * else does except for register spill/unspill, which generates and
2923 * uses its MRF within a single IR instruction.
2924 */
2925 inst->base_mrf = 14;
2926 inst->mlen = 1;
2927 }
2928 }
2929 }
2930
2931 bool
2932 fs_visitor::lower_load_payload()
2933 {
2934 bool progress = false;
2935
2936 int vgrf_to_reg[virtual_grf_count];
2937 int reg_count = 16; /* Leave room for MRF */
2938 for (int i = 0; i < virtual_grf_count; ++i) {
2939 vgrf_to_reg[i] = reg_count;
2940 reg_count += virtual_grf_sizes[i];
2941 }
2942
2943 struct {
2944 bool written:1; /* Whether this register has ever been written */
2945 bool force_writemask_all:1;
2946 bool force_sechalf:1;
2947 } metadata[reg_count];
2948 memset(metadata, 0, sizeof(metadata));
2949
2950 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2951 int dst_reg;
2952 if (inst->dst.file == GRF) {
2953 dst_reg = vgrf_to_reg[inst->dst.reg];
2954 } else {
2955 /* MRF */
2956 dst_reg = inst->dst.reg;
2957 }
2958
2959 if (inst->dst.file == MRF || inst->dst.file == GRF) {
2960 bool force_sechalf = inst->force_sechalf;
2961 bool toggle_sechalf = inst->dst.width == 16 &&
2962 type_sz(inst->dst.type) == 4;
2963 for (int i = 0; i < inst->regs_written; ++i) {
2964 metadata[dst_reg + i].written = true;
2965 metadata[dst_reg + i].force_sechalf = force_sechalf;
2966 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
2967 force_sechalf = (toggle_sechalf != force_sechalf);
2968 }
2969 }
2970
2971 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
2972 assert(inst->dst.file == MRF || inst->dst.file == GRF);
2973 fs_reg dst = inst->dst;
2974
2975 for (int i = 0; i < inst->sources; i++) {
2976 dst.width = inst->src[i].effective_width;
2977 dst.type = inst->src[i].type;
2978
2979 if (inst->src[i].file == BAD_FILE) {
2980 /* Do nothing but otherwise increment as normal */
2981 } else if (dst.file == MRF &&
2982 dst.width == 8 &&
2983 brw->has_compr4 &&
2984 i + 4 < inst->sources &&
2985 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
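/* src[i] and src[i + 4] are the low and high halves of one SIMD16
* value, so emit a single SIMD16 MOV with COMPR4 MRF addressing
* instead of two SIMD8 MOVs.
*/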
2986 fs_reg compr4_dst = dst;
2987 compr4_dst.reg += BRW_MRF_COMPR4;
2988 compr4_dst.width = 16;
2989 fs_reg compr4_src = inst->src[i];
2990 compr4_src.width = 16;
2991 fs_inst *mov = MOV(compr4_dst, compr4_src);
2992 mov->force_writemask_all = true;
2993 inst->insert_before(block, mov);
2994 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
2995 inst->src[i + 4].file = BAD_FILE;
2996 } else {
2997 fs_inst *mov = MOV(dst, inst->src[i]);
2998 if (inst->src[i].file == GRF) {
2999 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3000 inst->src[i].reg_offset;
3001 mov->force_sechalf = metadata[src_reg].force_sechalf;
3002 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3003 metadata[dst_reg] = metadata[src_reg];
3004 if (dst.width * type_sz(dst.type) > 32) {
3005 assert((!metadata[src_reg].written ||
3006 !metadata[src_reg].force_sechalf) &&
3007 (!metadata[src_reg + 1].written ||
3008 metadata[src_reg + 1].force_sechalf));
3009 metadata[dst_reg + 1] = metadata[src_reg + 1];
3010 }
3011 } else {
3012 metadata[dst_reg].force_writemask_all = false;
3013 metadata[dst_reg].force_sechalf = false;
3014 if (dst.width == 16) {
3015 metadata[dst_reg + 1].force_writemask_all = false;
3016 metadata[dst_reg + 1].force_sechalf = true;
3017 }
3018 }
3019 inst->insert_before(block, mov);
3020 }
3021
3022 dst = offset(dst, 1);
3023 }
3024
3025 inst->remove(block);
3026 progress = true;
3027 }
3028 }
3029
3030 if (progress)
3031 invalidate_live_intervals();
3032
3033 return progress;
3034 }
3035
3036 void
3037 fs_visitor::dump_instructions()
3038 {
3039 dump_instructions(NULL);
3040 }
3041
3042 void
3043 fs_visitor::dump_instructions(const char *name)
3044 {
3045 calculate_register_pressure();
3046 FILE *file = stderr;
3047 if (name && geteuid() != 0) {
3048 file = fopen(name, "w");
3049 if (!file)
3050 file = stderr;
3051 }
3052
3053 int ip = 0, max_pressure = 0;
3054 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3055 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3056 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3057 dump_instruction(inst, file);
3058 ++ip;
3059 }
3060 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3061
3062 if (file != stderr) {
3063 fclose(file);
3064 }
3065 }
3066
3067 void
3068 fs_visitor::dump_instruction(backend_instruction *be_inst)
3069 {
3070 dump_instruction(be_inst, stderr);
3071 }
3072
3073 void
3074 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3075 {
3076 fs_inst *inst = (fs_inst *)be_inst;
3077
3078 if (inst->predicate) {
3079 fprintf(file, "(%cf0.%d) ",
3080 inst->predicate_inverse ? '-' : '+',
3081 inst->flag_subreg);
3082 }
3083
3084 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3085 if (inst->saturate)
3086 fprintf(file, ".sat");
3087 if (inst->conditional_mod) {
3088 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3089 if (!inst->predicate &&
3090 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3091 inst->opcode != BRW_OPCODE_IF &&
3092 inst->opcode != BRW_OPCODE_WHILE))) {
3093 fprintf(file, ".f0.%d", inst->flag_subreg);
3094 }
3095 }
3096 fprintf(file, "(%d) ", inst->exec_size);
3097
3098
3099 switch (inst->dst.file) {
3100 case GRF:
3101 fprintf(file, "vgrf%d", inst->dst.reg);
3102 if (inst->dst.width != dispatch_width)
3103 fprintf(file, "@%d", inst->dst.width);
3104 if (virtual_grf_sizes[inst->dst.reg] != inst->dst.width / 8 ||
3105 inst->dst.subreg_offset)
3106 fprintf(file, "+%d.%d",
3107 inst->dst.reg_offset, inst->dst.subreg_offset);
3108 break;
3109 case MRF:
3110 fprintf(file, "m%d", inst->dst.reg);
3111 break;
3112 case BAD_FILE:
3113 fprintf(file, "(null)");
3114 break;
3115 case UNIFORM:
3116 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3117 break;
3118 case HW_REG:
3119 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3120 switch (inst->dst.fixed_hw_reg.nr) {
3121 case BRW_ARF_NULL:
3122 fprintf(file, "null");
3123 break;
3124 case BRW_ARF_ADDRESS:
3125 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3126 break;
3127 case BRW_ARF_ACCUMULATOR:
3128 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3129 break;
3130 case BRW_ARF_FLAG:
3131 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3132 inst->dst.fixed_hw_reg.subnr);
3133 break;
3134 default:
3135 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3136 inst->dst.fixed_hw_reg.subnr);
3137 break;
3138 }
3139 } else {
3140 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3141 }
3142 if (inst->dst.fixed_hw_reg.subnr)
3143 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3144 break;
3145 default:
3146 fprintf(file, "???");
3147 break;
3148 }
3149 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3150
3151 for (int i = 0; i < inst->sources; i++) {
3152 if (inst->src[i].negate)
3153 fprintf(file, "-");
3154 if (inst->src[i].abs)
3155 fprintf(file, "|");
3156 switch (inst->src[i].file) {
3157 case GRF:
3158 fprintf(file, "vgrf%d", inst->src[i].reg);
3159 if (inst->src[i].width != dispatch_width)
3160 fprintf(file, "@%d", inst->src[i].width);
3161 if (virtual_grf_sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3162 inst->src[i].subreg_offset)
3163 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3164 inst->src[i].subreg_offset);
3165 break;
3166 case MRF:
3167 fprintf(file, "***m%d***", inst->src[i].reg);
3168 break;
3169 case UNIFORM:
3170 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3171 if (inst->src[i].reladdr) {
3172 fprintf(file, "+reladdr");
3173 } else if (inst->src[i].subreg_offset) {
3174 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3175 inst->src[i].subreg_offset);
3176 }
3177 break;
3178 case BAD_FILE:
3179 fprintf(file, "(null)");
3180 break;
3181 case IMM:
3182 switch (inst->src[i].type) {
3183 case BRW_REGISTER_TYPE_F:
3184 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3185 break;
3186 case BRW_REGISTER_TYPE_D:
3187 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3188 break;
3189 case BRW_REGISTER_TYPE_UD:
3190 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3191 break;
3192 default:
3193 fprintf(file, "???");
3194 break;
3195 }
3196 break;
3197 case HW_REG:
3198 if (inst->src[i].fixed_hw_reg.negate)
3199 fprintf(file, "-");
3200 if (inst->src[i].fixed_hw_reg.abs)
3201 fprintf(file, "|");
3202 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3203 switch (inst->src[i].fixed_hw_reg.nr) {
3204 case BRW_ARF_NULL:
3205 fprintf(file, "null");
3206 break;
3207 case BRW_ARF_ADDRESS:
3208 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3209 break;
3210 case BRW_ARF_ACCUMULATOR:
3211 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3212 break;
3213 case BRW_ARF_FLAG:
3214 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3215 inst->src[i].fixed_hw_reg.subnr);
3216 break;
3217 default:
3218 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3219 inst->src[i].fixed_hw_reg.subnr);
3220 break;
3221 }
3222 } else {
3223 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3224 }
3225 if (inst->src[i].fixed_hw_reg.subnr)
3226 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3227 if (inst->src[i].fixed_hw_reg.abs)
3228 fprintf(file, "|");
3229 break;
3230 default:
3231 fprintf(file, "???");
3232 break;
3233 }
3234 if (inst->src[i].abs)
3235 fprintf(file, "|");
3236
3237 if (inst->src[i].file != IMM) {
3238 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3239 }
3240
3241 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3242 fprintf(file, ", ");
3243 }
3244
3245 fprintf(file, " ");
3246
3247 if (dispatch_width == 16 && inst->exec_size == 8) {
3248 if (inst->force_sechalf)
3249 fprintf(file, "2ndhalf ");
3250 else
3251 fprintf(file, "1sthalf ");
3252 }
3253
3254 fprintf(file, "\n");
3255 }
3256
3257 /**
3258 * Possibly returns an instruction that set up @param reg.
3259 *
3260 * Sometimes we want to take the result of some expression/variable
3261 * dereference tree and rewrite the instruction generating the result
3262 * of the tree. When processing the tree, we know that the
3263 * instructions generated are all writing temporaries that are dead
3264 * outside of this tree. So, if we have some instructions that write
3265 * a temporary, we're free to point that temp write somewhere else.
3266 *
3267 * Note that this doesn't guarantee that the returned instruction wrote
3268 * only reg -- it might be the size=4 destination of a texture instruction.
3269 */
3270 fs_inst *
3271 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3272 fs_inst *end,
3273 const fs_reg &reg)
3274 {
3275 if (end == start ||
3276 end->is_partial_write() ||
3277 reg.reladdr ||
3278 !reg.equals(end->dst)) {
3279 return NULL;
3280 } else {
3281 return end;
3282 }
3283 }
3284
3285 void
3286 fs_visitor::setup_payload_gen6()
3287 {
3288 bool uses_depth =
3289 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3290 unsigned barycentric_interp_modes =
3291 (stage == MESA_SHADER_FRAGMENT) ?
3292 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3293
3294 assert(brw->gen >= 6);
3295
3296 /* R0-1: masks, pixel X/Y coordinates. */
3297 payload.num_regs = 2;
3298 /* R2: only for 32-pixel dispatch. */
3299
3300 /* R3-26: barycentric interpolation coordinates. These appear in the
3301 * same order that they appear in the brw_wm_barycentric_interp_mode
3302 * enum. Each set of coordinates occupies 2 registers if dispatch width
3303 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3304 * appear if they were enabled using the "Barycentric Interpolation
3305 * Mode" bits in WM_STATE.
3306 */
3307 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3308 if (barycentric_interp_modes & (1 << i)) {
3309 payload.barycentric_coord_reg[i] = payload.num_regs;
3310 payload.num_regs += 2;
3311 if (dispatch_width == 16) {
3312 payload.num_regs += 2;
3313 }
3314 }
3315 }
3316
3317 /* R27: interpolated depth if uses source depth */
3318 if (uses_depth) {
3319 payload.source_depth_reg = payload.num_regs;
3320 payload.num_regs++;
3321 if (dispatch_width == 16) {
3322 /* R28: interpolated depth if not SIMD8. */
3323 payload.num_regs++;
3324 }
3325 }
3326 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3327 if (uses_depth) {
3328 payload.source_w_reg = payload.num_regs;
3329 payload.num_regs++;
3330 if (dispatch_width == 16) {
3331 /* R30: interpolated W if not SIMD8. */
3332 payload.num_regs++;
3333 }
3334 }
3335
3336 if (stage == MESA_SHADER_FRAGMENT) {
3337 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3338 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3339 prog_data->uses_pos_offset = key->compute_pos_offset;
3340 /* R31: MSAA position offsets. */
3341 if (prog_data->uses_pos_offset) {
3342 payload.sample_pos_reg = payload.num_regs;
3343 payload.num_regs++;
3344 }
3345 }
3346
3347 /* R32: MSAA input coverage mask */
3348 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3349 assert(brw->gen >= 7);
3350 payload.sample_mask_in_reg = payload.num_regs;
3351 payload.num_regs++;
3352 if (dispatch_width == 16) {
3353 /* R33: input coverage mask if not SIMD8. */
3354 payload.num_regs++;
3355 }
3356 }
3357
3358 /* R34-: bary for 32-pixel. */
3359 /* R58-59: interp W for 32-pixel. */
3360
3361 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3362 source_depth_to_render_target = true;
3363 }
3364 }
3365
3366 void
3367 fs_visitor::assign_binding_table_offsets()
3368 {
3369 assert(stage == MESA_SHADER_FRAGMENT);
3370 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3371 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3372 uint32_t next_binding_table_offset = 0;
3373
3374 /* If there are no color regions, we still perform an FB write to a null
3375 * renderbuffer, which we place at surface index 0.
3376 */
3377 prog_data->binding_table.render_target_start = next_binding_table_offset;
3378 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3379
3380 assign_common_binding_table_offsets(next_binding_table_offset);
3381 }
3382
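/**
* Fills in regs_live_at_ip[]: for each instruction, the number of GRFs worth
* of virtual registers whose live intervals cover that IP.
*/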
3383 void
3384 fs_visitor::calculate_register_pressure()
3385 {
3386 invalidate_live_intervals();
3387 calculate_live_intervals();
3388
3389 unsigned num_instructions = 0;
3390 foreach_block(block, cfg)
3391 num_instructions += block->instructions.length();
3392
3393 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3394
3395 for (int reg = 0; reg < virtual_grf_count; reg++) {
3396 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3397 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3398 }
3399 }
3400
3401 /**
3402 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3403 *
3404 * The needs_unlit_centroid_workaround ends up producing one of these per
3405 * channel of centroid input, so it's good to clean them up.
3406 *
3407 * An assumption here is that nothing ever modifies the dispatched pixels
3408 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3409 * dictates that anyway.
3410 */
3411 void
3412 fs_visitor::opt_drop_redundant_mov_to_flags()
3413 {
3414 bool flag_mov_found[2] = {false};
3415
3416 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3417 if (inst->is_control_flow()) {
3418 memset(flag_mov_found, 0, sizeof(flag_mov_found));
3419 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3420 if (!flag_mov_found[inst->flag_subreg])
3421 flag_mov_found[inst->flag_subreg] = true;
3422 else
3423 inst->remove(block);
3424 } else if (inst->writes_flag()) {
3425 flag_mov_found[inst->flag_subreg] = false;
3426 }
3427 }
3428 }
3429
3430 void
3431 fs_visitor::optimize()
3432 {
3433 calculate_cfg();
3434
3435 split_virtual_grfs();
3436
3437 move_uniform_array_access_to_pull_constants();
3438 assign_constant_locations();
3439 demote_pull_constants();
3440
3441 opt_drop_redundant_mov_to_flags();
3442
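/* OPT() runs a pass, folds its result into the per-iteration progress flag,
* and, when DEBUG_OPTIMIZER is enabled, dumps the IR after any pass that
* reported progress so its effect can be inspected.
*/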
3443 #define OPT(pass, args...) do { \
3444 pass_num++; \
3445 bool this_progress = pass(args); \
3446 \
3447 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3448 char filename[64]; \
3449 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3450 dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3451 \
3452 backend_visitor::dump_instructions(filename); \
3453 } \
3454 \
3455 progress = progress || this_progress; \
3456 } while (false)
3457
3458 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3459 char filename[64];
3460 snprintf(filename, 64, "fs%d-%04d-00-start",
3461 dispatch_width, shader_prog ? shader_prog->Name : 0);
3462
3463 backend_visitor::dump_instructions(filename);
3464 }
3465
3466 bool progress;
3467 int iteration = 0;
3468 do {
3469 progress = false;
3470 iteration++;
3471 int pass_num = 0;
3472
3473 OPT(remove_duplicate_mrf_writes);
3474
3475 OPT(opt_algebraic);
3476 OPT(opt_cse);
3477 OPT(opt_copy_propagate);
3478 OPT(opt_peephole_predicated_break);
3479 OPT(dead_code_eliminate);
3480 OPT(opt_peephole_sel);
3481 OPT(dead_control_flow_eliminate, this);
3482 OPT(opt_register_renaming);
3483 OPT(opt_saturate_propagation);
3484 OPT(register_coalesce);
3485 OPT(compute_to_mrf);
3486
3487 OPT(compact_virtual_grfs);
3488 } while (progress);
3489
3490 if (lower_load_payload()) {
3491 split_virtual_grfs();
3492 register_coalesce();
3493 compute_to_mrf();
3494 dead_code_eliminate();
3495 }
3496
3497 lower_uniform_pull_constant_loads();
3498 }
3499
3500 void
3501 fs_visitor::allocate_registers()
3502 {
3503 bool allocated_without_spills;
3504
3505 static enum instruction_scheduler_mode pre_modes[] = {
3506 SCHEDULE_PRE,
3507 SCHEDULE_PRE_NON_LIFO,
3508 SCHEDULE_PRE_LIFO,
3509 };
3510
3511 /* Try each scheduling heuristic to see if it can successfully register
3512 * allocate without spilling. They should be ordered by decreasing
3513 * performance but increasing likelihood of allocating.
3514 */
3515 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3516 schedule_instructions(pre_modes[i]);
3517
3518 if (0) {
3519 assign_regs_trivial();
3520 allocated_without_spills = true;
3521 } else {
3522 allocated_without_spills = assign_regs(false);
3523 }
3524 if (allocated_without_spills)
3525 break;
3526 }
3527
3528 if (!allocated_without_spills) {
3529 /* We assume that any spilling is worse than just dropping back to
3530 * SIMD8. There's probably actually some intermediate point where
3531 * SIMD16 with a couple of spills is still better.
3532 */
3533 if (dispatch_width == 16) {
3534 fail("Failure to register allocate. Reduce number of "
3535 "live scalar values to avoid this.");
3536 } else {
3537 perf_debug("Fragment shader triggered register spilling. "
3538 "Try reducing the number of live scalar values to "
3539 "improve performance.\n");
3540 }
3541
3542 /* Since we're out of heuristics, just go spill registers until we
3543 * get an allocation.
3544 */
3545 while (!assign_regs(true)) {
3546 if (failed)
3547 break;
3548 }
3549 }
3550
3551 /* This must come after all optimization and register allocation, since
3552 * it inserts dead code that happens to have side effects, and it does
3553 * so based on the actual physical registers in use.
3554 */
3555 insert_gen4_send_dependency_workarounds();
3556
3557 if (failed)
3558 return;
3559
3560 if (!allocated_without_spills)
3561 schedule_instructions(SCHEDULE_POST);
3562
3563 if (last_scratch > 0)
3564 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3565 }
3566
3567 bool
3568 fs_visitor::run()
3569 {
3570 sanity_param_count = prog->Parameters->NumParameters;
3571
3572 assign_binding_table_offsets();
3573
3574 if (brw->gen >= 6)
3575 setup_payload_gen6();
3576 else
3577 setup_payload_gen4();
3578
3579 if (0) {
3580 emit_dummy_fs();
3581 } else if (brw->use_rep_send && dispatch_width == 16) {
3582 emit_repclear_shader();
3583 } else {
3584 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3585 emit_shader_time_begin();
3586
3587 calculate_urb_setup();
3588 if (prog->InputsRead > 0) {
3589 if (brw->gen < 6)
3590 emit_interpolation_setup_gen4();
3591 else
3592 emit_interpolation_setup_gen6();
3593 }
3594
3595 /* We handle discards by keeping track of the still-live pixels in f0.1.
3596 * Initialize it with the dispatched pixels.
3597 */
3598 bool uses_kill =
3599 (stage == MESA_SHADER_FRAGMENT) &&
3600 ((brw_wm_prog_data*) this->prog_data)->uses_kill;
3601 bool alpha_test_func =
3602 (stage == MESA_SHADER_FRAGMENT) &&
3603 ((brw_wm_prog_key*) this->key)->alpha_test_func;
3604 if (uses_kill || alpha_test_func) {
3605 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3606 discard_init->flag_subreg = 1;
3607 }
3608
3609 /* Generate FS IR for main(). (the visitor only descends into
3610 * functions called "main").
3611 */
3612 if (shader) {
3613 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3614 base_ir = ir;
3615 this->result = reg_undef;
3616 ir->accept(this);
3617 }
3618 } else {
3619 emit_fragment_program_code();
3620 }
3621 base_ir = NULL;
3622 if (failed)
3623 return false;
3624
3625 emit(FS_OPCODE_PLACEHOLDER_HALT);
3626
3627 if (alpha_test_func)
3628 emit_alpha_test();
3629
3630 emit_fb_writes();
3631
3632 optimize();
3633
3634 assign_curb_setup();
3635 assign_urb_setup();
3636
3637 allocate_registers();
3638
3639 if (failed)
3640 return false;
3641 }
3642
3643 if (stage == MESA_SHADER_FRAGMENT) {
3644 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3645 if (dispatch_width == 8)
3646 prog_data->reg_blocks = brw_register_blocks(grf_used);
3647 else
3648 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3649 }
3650
3651 /* If any state parameters were appended, then ParameterValues could have
3652 * been realloced, in which case the driver uniform storage set up by
3653 * _mesa_associate_uniform_storage() would point to freed memory. Make
3654 * sure that didn't happen.
3655 */
3656 assert(sanity_param_count == prog->Parameters->NumParameters);
3657
3658 return !failed;
3659 }
3660
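/**
* Compile the fragment shader to native code: always run a SIMD8 compile,
* additionally attempt a SIMD16 compile when the hardware and shader allow
* it, and hand the surviving CFGs to the generator.
*/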
3661 const unsigned *
3662 brw_wm_fs_emit(struct brw_context *brw,
3663 void *mem_ctx,
3664 const struct brw_wm_prog_key *key,
3665 struct brw_wm_prog_data *prog_data,
3666 struct gl_fragment_program *fp,
3667 struct gl_shader_program *prog,
3668 unsigned *final_assembly_size)
3669 {
3670 bool start_busy = false;
3671 double start_time = 0;
3672
3673 if (unlikely(brw->perf_debug)) {
3674 start_busy = (brw->batch.last_bo &&
3675 drm_intel_bo_busy(brw->batch.last_bo));
3676 start_time = get_time();
3677 }
3678
3679 struct brw_shader *shader = NULL;
3680 if (prog)
3681 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3682
3683 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3684 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3685
3686 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3687 */
3688 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3689 if (!v.run()) {
3690 if (prog) {
3691 prog->LinkStatus = false;
3692 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3693 }
3694
3695 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3696 v.fail_msg);
3697
3698 return NULL;
3699 }
3700
3701 cfg_t *simd16_cfg = NULL;
3702 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3703 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3704 brw->use_rep_send)) {
3705 if (!v.simd16_unsupported) {
3706 /* Try a SIMD16 compile */
3707 v2.import_uniforms(&v);
3708 if (!v2.run()) {
3709 perf_debug("SIMD16 shader failed to compile, falling back to "
3710 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3711 } else {
3712 simd16_cfg = v2.cfg;
3713 }
3714 } else {
3715 perf_debug("SIMD16 shader unsupported, falling back to "
3716 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3717 }
3718 }
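   /* A SIMD16 kernel shades twice as many pixels per thread as SIMD8, so it
    * is worth attempting whenever the SIMD8 compile did not rule it out; if
    * the 16-wide compile fails, the SIMD8 kernel alone is used at the
    * performance cost mentioned in the perf_debug messages above.
    */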
3719
3720 cfg_t *simd8_cfg;
3721 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3722 if (no_simd8 && simd16_cfg) {
3723 simd8_cfg = NULL;
3724 prog_data->no_8 = true;
3725 } else {
3726 simd8_cfg = v.cfg;
3727 prog_data->no_8 = false;
3728 }
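   /* prog_data->no_8 tells the hardware state setup that only the SIMD16
    * kernel exists for this program, so 8-wide dispatch must not be
    * enabled.
    */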
3729
3730 fs_generator g(brw, mem_ctx, key, prog_data, prog, fp,
3731 v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3732 if (simd8_cfg)
3733 g.generate_code(simd8_cfg, 8);
3734 if (simd16_cfg)
3735 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
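   /* Both kernels end up in a single assembly blob: the SIMD8 code (if any)
    * starts at offset 0 and the SIMD16 code starts at
    * prog_data->prog_offset_16, which the state setup later uses as the
    * second kernel start pointer.
    */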
3736
3737 if (unlikely(brw->perf_debug) && shader) {
3738 if (shader->compiled_once)
3739 brw_wm_debug_recompile(brw, prog, key);
3740 shader->compiled_once = true;
3741
3742 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3743 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3744 (get_time() - start_time) * 1000);
3745 }
3746 }
3747
3748 return g.get_assembly(final_assembly_size);
3749 }
3750
3751 extern "C" bool
3752 brw_fs_precompile(struct gl_context *ctx,
3753 struct gl_shader_program *shader_prog,
3754 struct gl_program *prog)
3755 {
3756 struct brw_context *brw = brw_context(ctx);
3757 struct brw_wm_prog_key key;
3758
3759 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
3760 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3761 bool program_uses_dfdy = fp->UsesDFdy;
3762
3763 memset(&key, 0, sizeof(key));
3764
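   /* The precompile guesses the most likely draw-time state below (depth
    * test enabled, default texture swizzles, and so on) so that the key
    * built here usually matches the one computed at draw time, and the
    * shader is already sitting in the program cache by the first draw.
    */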
3765 if (brw->gen < 6) {
3766 if (fp->UsesKill)
3767 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3768
3769 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3770 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3771
3772 /* Just assume depth testing. */
3773 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3774 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3775 }
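   /* On Gen4-5 the depth/stencil configuration affects the pixel shader
    * thread payload (the "IZ" setup), so it has to be part of the compile
    * key; the bits above guess the common case.
    */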
3776
3777 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3778 BRW_FS_VARYING_INPUT_MASK) > 16)
3779 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3780
3781 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3782 for (unsigned i = 0; i < sampler_count; i++) {
3783 if (fp->Base.ShadowSamplers & (1 << i)) {
3784 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3785 key.tex.swizzles[i] =
3786 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3787 } else {
3788 /* Color sampler: assume no swizzling. */
3789 key.tex.swizzles[i] = SWIZZLE_XYZW;
3790 }
3791 }
3792
3793 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3794 key.drawable_height = ctx->DrawBuffer->Height;
3795 }
3796
3797 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3798 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3799 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3800
3801 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3802 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3803 key.nr_color_regions > 1;
3804 }
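   /* drawable_height and render_to_fbo exist because window-system
    * framebuffers are y-flipped relative to user FBOs: gl_FragCoord and
    * dFdy() need the orientation (and, when flipped, the surface height)
    * baked into the compiled code.
    */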
3805
3806 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3807 * quality of the derivatives is likely to be determined by the driconf
3808 * option.
3809 */
3810 key.high_quality_derivatives = brw->disable_derivative_optimization;
3811
3812 key.program_string_id = bfp->id;
3813
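   /* do_wm_prog() updates brw->wm.base.prog_offset and brw->wm.prog_data as
    * a side effect, so save and restore them: a precompile must not disturb
    * whatever program state is currently bound.
    */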
3814 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3815 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3816
3817 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
3818
3819 brw->wm.base.prog_offset = old_prog_offset;
3820 brw->wm.prog_data = old_prog_data;
3821
3822 return success;
3823 }
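
/* With the extern "C" linkage above, C code elsewhere in the driver can call
 * the precompile directly at link time.  A hypothetical caller, for
 * illustration only (only brw_fs_precompile() itself is taken from this
 * file):
 *
 *    if (prog->Target == GL_FRAGMENT_PROGRAM_ARB &&
 *        !brw_fs_precompile(ctx, shader_prog, prog))
 *       return false;
 */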