i965: add SHADER_OPCODE_TG4
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/hash_table.h"
#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "main/uniforms.h"
#include "glsl/glsl_types.h"

void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;

   /* This will be the case for almost all instructions. */
   this->regs_written = 1;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

#define ALU1(op)                                                        \
fs_inst *                                                               \
fs_visitor::op(fs_reg dst, fs_reg src0)                                 \
{                                                                       \
   return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);             \
}

#define ALU2(op)                                                        \
fs_inst *                                                               \
fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                    \
{                                                                       \
   return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);       \
}

#define ALU3(op)                                                        \
fs_inst *                                                               \
fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)       \
{                                                                       \
   return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2); \
}

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
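
/* Illustrative expansion (not part of the original file): ALU2(ADD) above
 * generates the two-source emit helper
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * and likewise for the other opcodes listed.
 */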

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(brw->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (brw->gen == 4) {
      dst.type = src0.type;
      if (dst.file == HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
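
/* Usage sketch (illustrative, not part of the original file): a CMP to the
 * null register sets the flag, which a later instruction can predicate on:
 *
 *    emit(CMP(reg_null_d, x, fs_reg(0.0f), BRW_CONDITIONAL_GE));
 *    fs_inst *sel = emit(BRW_OPCODE_SEL, dst, src0, src1);
 *    sel->predicate = BRW_PREDICATE_NORMAL;
 *
 * Here x, dst, src0 and src1 stand for arbitrary fs_regs.
 */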

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg varying_offset,
                                       uint32_t const_offset)
{
   exec_list instructions;
   fs_inst *inst;

   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable
    * offset and a portion done using reg_offset, which means that if you
    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
    * CSE can later notice that those loads are all the same and eliminate
    * the redundant ones.
    */
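   /* Worked example (illustrative): for const_offset == 6, the code below
    * computes vec4_offset = varying_offset + 4, loads the four components
    * starting there, and the final MOV selects component 6 & 3 == 2 of the
    * returned vec4.
    */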
   fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
   instructions.push_tail(ADD(vec4_offset,
                              varying_offset, const_offset & ~3));

   int scale = 1;
   if (brw->gen == 4 && dispatch_width == 8) {
      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
       * u, v, r) as parameters, or we can just use the SIMD16 message
       * consisting of (header, u).  We choose the second, at the cost of a
       * longer return length.
       */
      scale = 2;
   }

   enum opcode op;
   if (brw->gen >= 7)
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   else
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
   fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
   inst->regs_written = 4 * scale;
   instructions.push_tail(inst);

   if (brw->gen < 7) {
      inst->base_mrf = 13;
      inst->header_present = true;
      if (brw->gen == 4)
         inst->mlen = 3;
      else
         inst->mlen = 1 + dispatch_width / 8;
   }

   vec4_result.reg_offset += (const_offset & 3) * scale;
   instructions.push_tail(MOV(dst, vec4_result));

   return instructions;
}

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   inst->ir = NULL;
   inst->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   inst->force_uncompressed = true;

   return inst;
}

bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written);
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (brw->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

bool
fs_reg::is_valid_3src() const
{
   return file == GRF || file == UNIFORM;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(!"not reached");
      break;
   }

   return 0;
}

fs_reg
fs_visitor::get_timestamp()
{
   assert(brw->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds
    * (2^32 cycles / 1.2e9 Hz ~= 3.6 s), which is plenty of time for our
    * purposes.  It is identical across the EUs, but since it's tracking GPU
    * core speed it will increment at a varying rate as render P-states
    * change.
    *
    * The caller could also check if render P-states have changed (or
    * anything else that might disrupt timing) by setting smear to 2 and
    * checking if that field is != 0.
    */
   dst.smear = 0;

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   int shader_time_index =
      brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);

   fs_reg payload;
   if (dispatch_width == 8)
      payload = fs_reg(this, glsl_type::uvec2_type);
   else
      payload = fs_reg(this, glsl_type::uint_type);

   emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
                fs_reg(), payload, offset, value));
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns true if the instruction has a flag that means it won't
 * update an entire destination register.
 *
 * For example, dead code elimination and live variable analysis want to know
 * when a write to a variable screens off any preceding values that were in
 * it.
 */
bool
fs_inst::is_partial_write()
{
   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
           this->force_uncompressed ||
           this->force_sechalf);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_MS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->mlen;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
   this->nr_params_remap = v->nr_params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the
    * same order we'd walk the type, so walk the list of storage and find
    * anything with our name, or the prefix of a component that starts with
    * our name.
    */
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
   (void)params_before;
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}
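
/* Example (illustrative): a vec4 state slot swizzled XYZW adds four params,
 * one per component, while a scalar slot swizzled XXXX stops after the first
 * iteration, since the second swizzle repeats the first.
 */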

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
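   /* Flip math (illustrative): with flip set and half-integer pixel centers,
    * the ADD above computes wpos.y = (drawable_height - 1.0 + 0.5) - pixel_y,
    * so a pixel at the top of the window lands on the bottom row of GL's
    * lower-left-origin coordinate system.
    */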
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (brw->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(VARYING_SLOT_POS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (brw->gen >= 6) {
      if (is_centroid) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
      } else {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
      }
   } else {
      /* On Ironlake and below, there is only one interpolation mode.
       * Centroid interpolation doesn't mean anything on this hardware --
       * there is no multisampling.
       */
      barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (c->prog_data.urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               struct brw_reg interp = interp_reg(location, k);
               emit_linterp(attr, fs_reg(interp), interpolation_mode,
                            ir->centroid);
               if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                  /* Get the pixel/sample mask into f0 so that we know
                   * which pixels are lit.  Then, for each channel that is
                   * unlit, replace the centroid data with non-centroid
                   * data.
                   */
                  emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
                  fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                               interpolation_mode, false);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = true;
               }
               if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (brw->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }
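
   /* What the gen6+ sequence computes (illustrative): result =
    * ~(g0.0:D >> 15) & 1, i.e. the inverse of payload bit 15, so the
    * register holds 1 exactly when the polygon is front-facing.
    */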

   return reg;
}

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (brw->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (brw->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (brw->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (brw->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (brw->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
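   /* Mapping sketch (illustrative): with nr_payload_regs == 2, UNIFORM
    * slot 11 becomes brw_vec1_grf(2 + 11 / 8, 11 % 8), i.e. channel 3 of
    * the second CURBE register, g3.
    */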
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
      c->prog_data.urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (brw->gen >= 6) {
      if (_mesa_bitcount_64(fp->Base.InputsRead &
                            BRW_FS_VARYING_INPUT_MASK) <= 16) {
         /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
          * first 16 varying inputs, so we can put them wherever we want.
          * Just put them in order.
          *
          * This is useful because it means that (a) inputs not used by the
          * fragment shader won't take up valuable register space, and (b) we
          * won't have to recompile the fragment shader if it gets paired with
          * a different vertex (or geometry) shader.
          */
         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
            if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
                BITFIELD64_BIT(i)) {
               c->prog_data.urb_setup[i] = urb_next++;
            }
         }
      } else {
         /* We have enough input varyings that the SF/SBE pipeline stage can't
          * arbitrarily rearrange them to suit our whim; we have to put them
          * in an order that matches the output of the previous pipeline stage
          * (geometry or vertex shader).
          */
         struct brw_vue_map prev_stage_vue_map;
         brw_compute_vue_map(brw, &prev_stage_vue_map,
                             c->key.input_slots_valid);
         int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
         assert(prev_stage_vue_map.num_slots <= first_slot + 32);
         for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
              slot++) {
            int varying = prev_stage_vue_map.slot_to_varying[slot];
            /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
             * unused.
             */
            if (varying != BRW_VARYING_SLOT_COUNT &&
                (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
                 BITFIELD64_BIT(varying))) {
               c->prog_data.urb_setup[varying] = slot - first_slot;
            }
         }
         urb_next = prev_stage_vue_map.num_slots - first_slot;
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               c->prog_data.urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's an FS-only attribute, and we did interpolation for this
       * attribute in the SF thread.  So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   c->prog_data.num_varying_inputs = urb_next;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   this->first_non_payload_grf =
      urb_start + c->prog_data.num_varying_inputs * 2;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
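/* Example (illustrative): a size-3 VGRF keeps its own number for
 * reg_offset 0 and gets two fresh size-1 VGRFs for offsets 1 and 2; a use
 * at reg_offset 2 is then rewritten to the second new register at offset 0.
 */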
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written > 1) {
         split_grf[inst->dst.reg] = false;
      }

      /* If we're sending from a GRF, don't split it, on the assumption that
       * the send is reading the whole thing.
       */
      if (inst->is_send_from_grf()) {
         for (int i = 0; i < 3; i++) {
            if (inst->src[i].file == GRF) {
               split_grf[inst->src[i].reg] = false;
            }
         }
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_start[new_index] = virtual_grf_start[i];
            virtual_grf_end[new_index] = virtual_grf_end[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
      this->nr_params_remap = c->prog_data.nr_params;

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            /* Section 5.11 of the OpenGL 4.3 spec says:
             *
             *    "Out-of-bounds reads return undefined values, which include
             *     values from other variables of the active program or zero."
             */
            if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
               constant_nr = 0;
            }

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         /* As above, alias out-of-bounds reads to constant 0. */
         if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
            constant_nr = 0;
         }
         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for newly generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index,
                                                     *inst->src[i].reladdr,
                                                     pull_constant_loc[uniform] +
                                                     inst->src[i].reg_offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
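   /* Worked example (illustrative): with pull_uniform_base == 128, param 130
    * lands in pull_param slot 2; the load emitted below then reads from the
    * 16-byte-aligned offset (2 * 4) & ~15 == 0 and smears component
    * 2 & 3 == 2 across the destination.
    */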
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}

bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}

/**
 * Removes any instructions writing a VGRF where that VGRF is not used by any
 * later instruction.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF) {
         assert(this->virtual_grf_end[inst->dst.reg] >= pc);
         if (this->virtual_grf_end[inst->dst.reg] == pc) {
            inst->remove();
            progress = true;
         }
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

struct dead_code_hash_key
{
   int vgrf;
   int reg_offset;
};

static bool
dead_code_hash_compare(const void *a, const void *b)
{
   return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
}

static void
clear_dead_code_hash(struct hash_table *ht)
{
   struct hash_entry *entry;

   hash_table_foreach(ht, entry) {
      _mesa_hash_table_remove(ht, entry);
   }
}

static void
insert_dead_code_hash(struct hash_table *ht,
                      int vgrf, int reg_offset, fs_inst *inst)
{
   /* We don't bother freeing keys, because they'll be GCed with the ht. */
   struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);

   key->vgrf = vgrf;
   key->reg_offset = reg_offset;

   _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
}

static struct hash_entry *
get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
{
   struct dead_code_hash_key key;

   key.vgrf = vgrf;
   key.reg_offset = reg_offset;

   return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
}

static void
remove_dead_code_hash(struct hash_table *ht,
                      int vgrf, int reg_offset)
{
   struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
   if (!entry)
      return;

   _mesa_hash_table_remove(ht, entry);
}

/**
 * Walks basic blocks, removing any regs that are written but not read before
 * being redefined.
 *
 * The dead_code_eliminate() function implements a global dead code
 * elimination, but it only handles removing the last write to a register
 * if it's never read.  This one can handle intermediate writes, but only
 * within a basic block.
 */
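/* Approach sketch (illustrative): the hash table maps (vgrf, reg_offset) to
 * the most recent unread write.  Within one block, "MOV a, x; MOV a, y"
 * kills the first MOV; a read of a, or any control-flow instruction,
 * clears the corresponding entries instead.
 */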
1927 bool
1928 fs_visitor::dead_code_eliminate_local()
1929 {
1930 struct hash_table *ht;
1931 bool progress = false;
1932
1933 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1934
1935 foreach_list_safe(node, &this->instructions) {
1936 fs_inst *inst = (fs_inst *)node;
1937
1938 /* At a basic block boundary, empty the HT, since we don't track
1939 * dataflow across blocks.
1940 */
1941 if (inst->is_control_flow()) {
1942 clear_dead_code_hash(ht);
1943 continue;
1944 }
1945
1946 /* Clear the HT of any instructions that got read. */
1947 for (int i = 0; i < 3; i++) {
1948 fs_reg src = inst->src[i];
1949 if (src.file != GRF)
1950 continue;
1951
1952 int read = 1;
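/* Send-from-GRF messages are treated as reading their payload through the
 * end of the VGRF, not just the named register. */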
1953 if (inst->is_send_from_grf())
1954 read = virtual_grf_sizes[src.reg] - src.reg_offset;
1955
1956 for (int reg_offset = src.reg_offset;
1957 reg_offset < src.reg_offset + read;
1958 reg_offset++) {
1959 remove_dead_code_hash(ht, src.reg, reg_offset);
1960 }
1961 }
1962
1963 /* Add any update of a GRF to the HT, removing a previous write if it
1964 * wasn't read.
1965 */
1966 if (inst->dst.file == GRF) {
1967 if (inst->regs_written > 1) {
1968 /* We don't know how to trim channels from an instruction's
1969 * writes, so we can't incrementally remove unread channels from
1970 * it.  Just remove whatever it overwrites from the table.
1971 */
1972 for (int i = 0; i < inst->regs_written; i++) {
1973 remove_dead_code_hash(ht,
1974 inst->dst.reg,
1975 inst->dst.reg_offset + i);
1976 }
1977 } else {
1978 struct hash_entry *entry =
1979 get_dead_code_hash_entry(ht, inst->dst.reg,
1980 inst->dst.reg_offset);
1981
1982 if (inst->is_partial_write()) {
1983 /* For a partial write, we can't remove any previous dead code
1984 * candidate, since we're just modifying its result, but we can
1985 * be dead code eliminated ourselves.
1986 */
1987 if (entry) {
1988 entry->data = inst;
1989 } else {
1990 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1991 inst);
1992 }
1993 } else {
1994 if (entry) {
1995 /* We're completely updating a channel, and there was a
1996 * previous write to the channel that wasn't read. Kill it!
1997 */
1998 fs_inst *dead_inst = (fs_inst *)entry->data;
1999 dead_inst->remove();
2000 progress = true;
2001 _mesa_hash_table_remove(ht, entry);
2002 }
2003
2004 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2005 inst);
2006 }
2007 }
2008 }
2009 }
2010
2011 _mesa_hash_table_destroy(ht, NULL);
2012
2013 if (progress)
2014 live_intervals_valid = false;
2015
2016 return progress;
2017 }
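/* A hypothetical block this pass cleans up and the global pass would not:
 *
 *    mov vgrf5, vgrf1    <- dead: fully overwritten below before any read
 *    mov vgrf5, vgrf2
 *    add vgrf6, vgrf5, vgrf3
 */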
2018
2019 /**
2020 * Implements a second type of register coalescing: This one checks if
2021 * the two regs involved in a raw move don't interfere, in which case
2022 * they can both be stored in the same place and the MOV removed.
2023 */
2024 bool
2025 fs_visitor::register_coalesce_2()
2026 {
2027 bool progress = false;
2028
2029 calculate_live_intervals();
2030
2031 foreach_list_safe(node, &this->instructions) {
2032 fs_inst *inst = (fs_inst *)node;
2033
2034 if (inst->opcode != BRW_OPCODE_MOV ||
2035 inst->is_partial_write() ||
2036 inst->saturate ||
2037 inst->src[0].file != GRF ||
2038 inst->src[0].negate ||
2039 inst->src[0].abs ||
2040 inst->src[0].smear != -1 ||
2041 inst->dst.file != GRF ||
2042 inst->dst.type != inst->src[0].type ||
2043 virtual_grf_sizes[inst->src[0].reg] != 1 ||
2044 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
2045 continue;
2046 }
2047
2048 int reg_from = inst->src[0].reg;
2049 assert(inst->src[0].reg_offset == 0);
2050 int reg_to = inst->dst.reg;
2051 int reg_to_offset = inst->dst.reg_offset;
2052
2053 foreach_list(node, &this->instructions) {
2054 fs_inst *scan_inst = (fs_inst *)node;
2055
2056 if (scan_inst->dst.file == GRF &&
2057 scan_inst->dst.reg == reg_from) {
2058 scan_inst->dst.reg = reg_to;
2059 scan_inst->dst.reg_offset = reg_to_offset;
2060 }
2061 for (int i = 0; i < 3; i++) {
2062 if (scan_inst->src[i].file == GRF &&
2063 scan_inst->src[i].reg == reg_from) {
2064 scan_inst->src[i].reg = reg_to;
2065 scan_inst->src[i].reg_offset = reg_to_offset;
2066 }
2067 }
2068 }
2069
2070 inst->remove();
2071
2072 /* We don't need to recalculate live intervals inside the loop despite
2073 * flagging live_intervals_valid because we only use live intervals for
2074 * the interferes test, and we must have had a situation where the
2075 * intervals were:
2076 *
2077 *    from        to
2078 *     ^
2079 *     |
2080 *     v
2081 *                  ^
2082 *                  |
2083 *                  v
2084 *
2085 * Some register R that might get coalesced with one of these two could
2086 * only be referencing "to", otherwise "from"'s range would have been
2087 * longer. R's range could also only start at the end of "to" or later,
2088 * otherwise it will conflict with "to" when we try to coalesce "to"
2089 * into R anyway.
2090 */
2091 live_intervals_valid = false;
2092
2093 progress = true;
2094 continue;
2095 }
2096
2097 return progress;
2098 }
2099
2100 bool
2101 fs_visitor::register_coalesce()
2102 {
2103 bool progress = false;
2104 int if_depth = 0;
2105 int loop_depth = 0;
2106
2107 foreach_list_safe(node, &this->instructions) {
2108 fs_inst *inst = (fs_inst *)node;
2109
2110 /* Make sure that we dominate the instructions we're going to
2111 * scan for interfering with our coalescing, or we won't have
2112 * scanned enough to see if anything interferes with our
2113 * coalescing. We don't dominate the following instructions if
2114 * we're in a loop or an if block.
2115 */
2116 switch (inst->opcode) {
2117 case BRW_OPCODE_DO:
2118 loop_depth++;
2119 break;
2120 case BRW_OPCODE_WHILE:
2121 loop_depth--;
2122 break;
2123 case BRW_OPCODE_IF:
2124 if_depth++;
2125 break;
2126 case BRW_OPCODE_ENDIF:
2127 if_depth--;
2128 break;
2129 default:
2130 break;
2131 }
2132 if (loop_depth || if_depth)
2133 continue;
2134
2135 if (inst->opcode != BRW_OPCODE_MOV ||
2136 inst->is_partial_write() ||
2137 inst->saturate ||
2138 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2139 inst->src[0].file != UNIFORM) ||
2140 inst->dst.type != inst->src[0].type)
2141 continue;
2142
2143 bool has_source_modifiers = (inst->src[0].abs ||
2144 inst->src[0].negate ||
2145 inst->src[0].smear != -1 ||
2146 inst->src[0].file == UNIFORM);
2147
2148 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2149 * them: check for no writes to either one until the exit of the
2150 * program.
2151 */
2152 bool interfered = false;
2153
2154 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2155 !scan_inst->is_tail_sentinel();
2156 scan_inst = (fs_inst *)scan_inst->next) {
2157 if (scan_inst->dst.file == GRF) {
2158 if (scan_inst->overwrites_reg(inst->dst) ||
2159 scan_inst->overwrites_reg(inst->src[0])) {
2160 interfered = true;
2161 break;
2162 }
2163 }
2164
2165 if (has_source_modifiers) {
2166 for (int i = 0; i < 3; i++) {
2167 if (scan_inst->src[i].file == GRF &&
2168 scan_inst->src[i].reg == inst->dst.reg &&
2169 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2170 inst->dst.type != scan_inst->src[i].type)
2171 {
2172 interfered = true;
2173 break;
2174 }
2175 }
2176 }
2177
2179 /* The gen6 MATH instruction can't handle source modifiers or
2180 * unusual register regions, so avoid coalescing those for
2181 * now. We should do something more specific.
2182 */
2183 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2184 interfered = true;
2185 break;
2186 }
2187
2188 /* The accumulator result appears to get used for the
2189 * conditional modifier generation. When negating a UD
2190 * value, there is a 33rd bit generated for the sign in the
2191 * accumulator value, so you can no longer check, for example,
2192 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2193 */
2194 if (scan_inst->conditional_mod &&
2195 inst->src[0].negate &&
2196 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2197 interfered = true;
2198 break;
2199 }
2200 }
2201 if (interfered) {
2202 continue;
2203 }
2204
2205 /* Rewrite the later usage to point at the source of the move to
2206 * be removed.
2207 */
2208 for (fs_inst *scan_inst = inst;
2209 !scan_inst->is_tail_sentinel();
2210 scan_inst = (fs_inst *)scan_inst->next) {
2211 for (int i = 0; i < 3; i++) {
2212 if (scan_inst->src[i].file == GRF &&
2213 scan_inst->src[i].reg == inst->dst.reg &&
2214 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
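/* Compose the modifiers: an abs at the use discards any negate
 * carried by the MOV's source, and the use's own negate is then
 * XORed in. */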
2215 fs_reg new_src = inst->src[0];
2216 if (scan_inst->src[i].abs) {
2217 new_src.negate = 0;
2218 new_src.abs = 1;
2219 }
2220 new_src.negate ^= scan_inst->src[i].negate;
2221 scan_inst->src[i] = new_src;
2222 }
2223 }
2224 }
2225
2226 inst->remove();
2227 progress = true;
2228 }
2229
2230 if (progress)
2231 live_intervals_valid = false;
2232
2233 return progress;
2234 }
2235
2236
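/**
 * Tries to rewrite the instruction computing a GRF value so that it writes
 * directly into the MRF that a later raw MOV copies it to, removing the MOV.
 * With made-up registers:
 *
 *    add vgrf7, vgrf1, vgrf2
 *    mov m4, vgrf7
 *
 * becomes
 *
 *    add m4, vgrf1, vgrf2
 */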
2237 bool
2238 fs_visitor::compute_to_mrf()
2239 {
2240 bool progress = false;
2241 int next_ip = 0;
2242
2243 calculate_live_intervals();
2244
2245 foreach_list_safe(node, &this->instructions) {
2246 fs_inst *inst = (fs_inst *)node;
2247
2248 int ip = next_ip;
2249 next_ip++;
2250
2251 if (inst->opcode != BRW_OPCODE_MOV ||
2252 inst->is_partial_write() ||
2253 inst->dst.file != MRF || inst->src[0].file != GRF ||
2254 inst->dst.type != inst->src[0].type ||
2255 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2256 continue;
2257
2258 /* Work out which hardware MRF registers are written by this
2259 * instruction.
2260 */
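/* COMPR4 addressing writes m(n) and m(n+4); an ordinary compressed
 * (16-wide) write covers two adjacent MRFs; otherwise a single MRF
 * is written. */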
2261 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2262 int mrf_high;
2263 if (inst->dst.reg & BRW_MRF_COMPR4) {
2264 mrf_high = mrf_low + 4;
2265 } else if (dispatch_width == 16 &&
2266 (!inst->force_uncompressed && !inst->force_sechalf)) {
2267 mrf_high = mrf_low + 1;
2268 } else {
2269 mrf_high = mrf_low;
2270 }
2271
2272 /* Can't compute-to-MRF this GRF if someone else was going to
2273 * read it later.
2274 */
2275 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2276 continue;
2277
2278 /* Found a move of a GRF to a MRF. Let's see if we can go
2279 * rewrite the thing that made this GRF to write into the MRF.
2280 */
2281 fs_inst *scan_inst;
2282 for (scan_inst = (fs_inst *)inst->prev;
2283 scan_inst->prev != NULL;
2284 scan_inst = (fs_inst *)scan_inst->prev) {
2285 if (scan_inst->dst.file == GRF &&
2286 scan_inst->dst.reg == inst->src[0].reg) {
2287 /* Found the last thing to write our reg we want to turn
2288 * into a compute-to-MRF.
2289 */
2290
2291 /* If this one instruction didn't populate all the
2292 * channels, bail. We might be able to rewrite everything
2293 * that writes that reg, but it would require smarter
2294 * tracking to delay the rewriting until complete success.
2295 */
2296 if (scan_inst->is_partial_write())
2297 break;
2298
2299 /* Things returning more than one register would need us to
2300 * understand coalescing out more than one MOV at a time.
2301 */
2302 if (scan_inst->regs_written > 1)
2303 break;
2304
2305 /* SEND instructions can't have MRF as a destination. */
2306 if (scan_inst->mlen)
2307 break;
2308
2309 if (brw->gen == 6) {
2310 /* gen6 math instructions must have the destination be
2311 * GRF, so no compute-to-MRF for them.
2312 */
2313 if (scan_inst->is_math()) {
2314 break;
2315 }
2316 }
2317
2318 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2319 /* Found the creator of our MRF's source value. */
2320 scan_inst->dst.file = MRF;
2321 scan_inst->dst.reg = inst->dst.reg;
2322 scan_inst->saturate |= inst->saturate;
2323 inst->remove();
2324 progress = true;
2325 }
2326 break;
2327 }
2328
2329 /* We don't handle control flow here. Most computation of
2330 * values that end up in MRFs happens shortly before the MRF
2331 * write anyway.
2332 */
2333 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2334 break;
2335
2336 /* You can't read from an MRF, so if someone else reads our
2337 * MRF's source GRF that we wanted to rewrite, that stops us.
2338 */
2339 bool interfered = false;
2340 for (int i = 0; i < 3; i++) {
2341 if (scan_inst->src[i].file == GRF &&
2342 scan_inst->src[i].reg == inst->src[0].reg &&
2343 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2344 interfered = true;
2345 }
2346 }
2347 if (interfered)
2348 break;
2349
2350 if (scan_inst->dst.file == MRF) {
2351 /* If somebody else writes our MRF here, we can't
2352 * compute-to-MRF before that.
2353 */
2354 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2355 int scan_mrf_high;
2356
2357 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2358 scan_mrf_high = scan_mrf_low + 4;
2359 } else if (dispatch_width == 16 &&
2360 (!scan_inst->force_uncompressed &&
2361 !scan_inst->force_sechalf)) {
2362 scan_mrf_high = scan_mrf_low + 1;
2363 } else {
2364 scan_mrf_high = scan_mrf_low;
2365 }
2366
2367 if (mrf_low == scan_mrf_low ||
2368 mrf_low == scan_mrf_high ||
2369 mrf_high == scan_mrf_low ||
2370 mrf_high == scan_mrf_high) {
2371 break;
2372 }
2373 }
2374
2375 if (scan_inst->mlen > 0) {
2376 /* Found a SEND instruction, which means that there are
2377 * live values in MRFs from base_mrf to base_mrf +
2378 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2379 * above it.
2380 */
2381 if (mrf_low >= scan_inst->base_mrf &&
2382 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2383 break;
2384 }
2385 if (mrf_high >= scan_inst->base_mrf &&
2386 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2387 break;
2388 }
2389 }
2390 }
2391 }
2392
2393 if (progress)
2394 live_intervals_valid = false;
2395
2396 return progress;
2397 }
2398
2399 /**
2400 * Walks through basic blocks, looking for repeated MRF writes and
2401 * removing the later ones.
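 *
 * For example, a second identical "mov m3, vgrf5" with no intervening write
 * to m3 or vgrf5 (and no SEND touching m3) is redundant and gets removed.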
2402 */
2403 bool
2404 fs_visitor::remove_duplicate_mrf_writes()
2405 {
2406 fs_inst *last_mrf_move[16];
2407 bool progress = false;
2408
2409 /* This pass would need its MRF tracking updated to handle compressed (16-wide) instructions, so skip it for now. */
2410 if (dispatch_width == 16)
2411 return false;
2412
2413 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2414
2415 foreach_list_safe(node, &this->instructions) {
2416 fs_inst *inst = (fs_inst *)node;
2417
2418 if (inst->is_control_flow()) {
2419 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2420 }
2421
2422 if (inst->opcode == BRW_OPCODE_MOV &&
2423 inst->dst.file == MRF) {
2424 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2425 if (prev_inst && inst->equals(prev_inst)) {
2426 inst->remove();
2427 progress = true;
2428 continue;
2429 }
2430 }
2431
2432 /* Clear out the last-write records for MRFs that were overwritten. */
2433 if (inst->dst.file == MRF) {
2434 last_mrf_move[inst->dst.reg] = NULL;
2435 }
2436
2437 if (inst->mlen > 0) {
2438 /* Found a SEND instruction, which will include two or fewer
2439 * implied MRF writes. We could do better here.
2440 */
2441 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2442 last_mrf_move[inst->base_mrf + i] = NULL;
2443 }
2444 }
2445
2446 /* Clear out any MRF move records whose sources got overwritten. */
2447 if (inst->dst.file == GRF) {
2448 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2449 if (last_mrf_move[i] &&
2450 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2451 last_mrf_move[i] = NULL;
2452 }
2453 }
2454 }
2455
2456 if (inst->opcode == BRW_OPCODE_MOV &&
2457 inst->dst.file == MRF &&
2458 inst->src[0].file == GRF &&
2459 !inst->is_partial_write()) {
2460 last_mrf_move[inst->dst.reg] = inst;
2461 }
2462 }
2463
2464 if (progress)
2465 live_intervals_valid = false;
2466
2467 return progress;
2468 }
2469
2470 static void
2471 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2472 int first_grf, int grf_len)
2473 {
2474 bool inst_16wide = (dispatch_width > 8 &&
2475 !inst->force_uncompressed &&
2476 !inst->force_sechalf);
2477
2478 /* Clear the flag for registers that actually got read (as expected). */
2479 for (int i = 0; i < 3; i++) {
2480 int grf;
2481 if (inst->src[i].file == GRF) {
2482 grf = inst->src[i].reg;
2483 } else if (inst->src[i].file == HW_REG &&
2484 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2485 grf = inst->src[i].fixed_hw_reg.nr;
2486 } else {
2487 continue;
2488 }
2489
2490 if (grf >= first_grf &&
2491 grf < first_grf + grf_len) {
2492 deps[grf - first_grf] = false;
2493 if (inst_16wide)
2494 deps[grf - first_grf + 1] = false;
2495 }
2496 }
2497 }
2498
2499 /**
2500 * Implements this workaround for the original 965:
2501 *
2502 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2503 * check for post destination dependencies on this instruction, software
2504 * must ensure that there is no destination hazard for the case of ‘write
2505 * followed by a posted write’ shown in the following example.
2506 *
2507 * 1. mov r3 0
2508 * 2. send r3.xy <rest of send instruction>
2509 * 3. mov r2 r3
2510 *
2511 * Due to no post-destination dependency check on the ‘send’, the above
2512 * code sequence could have two instructions (1 and 2) in flight at the
2513 * same time that both consider ‘r3’ as the target of their final writes."
2514 */
2515 void
2516 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2517 {
2518 int reg_size = dispatch_width / 8;
2519 int write_len = inst->regs_written * reg_size;
2520 int first_write_grf = inst->dst.reg;
2521 bool needs_dep[BRW_MAX_MRF];
2522 assert(write_len < (int)sizeof(needs_dep) - 1);
2523
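/* Start by assuming every GRF this instruction writes has a potential
 * outstanding dependency; the backwards walk below clears entries as the
 * corresponding reads and writes are found. */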
2524 memset(needs_dep, false, sizeof(needs_dep));
2525 memset(needs_dep, true, write_len);
2526
2527 clear_deps_for_inst_src(inst, dispatch_width,
2528 needs_dep, first_write_grf, write_len);
2529
2530 /* Walk backwards looking for writes to registers we're writing which
2531 * aren't read since being written. If we hit the start of the program,
2532 * we assume that there are no outstanding dependencies on entry to the
2533 * program.
2534 */
2535 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2536 scan_inst != NULL;
2537 scan_inst = (fs_inst *)scan_inst->prev) {
2538
2539 /* If we hit control flow, assume that there *are* outstanding
2540 * dependencies, and force their cleanup before our instruction.
2541 */
2542 if (scan_inst->is_control_flow()) {
2543 for (int i = 0; i < write_len; i++) {
2544 if (needs_dep[i]) {
2545 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2546 }
2547 }
2548 return;
2549 }
2550
2551 bool scan_inst_16wide = (dispatch_width > 8 &&
2552 !scan_inst->force_uncompressed &&
2553 !scan_inst->force_sechalf);
2554
2555 /* We insert our reads as late as possible on the assumption that any
2556 * instruction but a MOV that might have left us an outstanding
2557 * dependency has more latency than a MOV.
2558 */
2559 if (scan_inst->dst.file == GRF) {
2560 for (int i = 0; i < scan_inst->regs_written; i++) {
2561 int reg = scan_inst->dst.reg + i * reg_size;
2562
2563 if (reg >= first_write_grf &&
2564 reg < first_write_grf + write_len &&
2565 needs_dep[reg - first_write_grf]) {
2566 inst->insert_before(DEP_RESOLVE_MOV(reg));
2567 needs_dep[reg - first_write_grf] = false;
2568 if (scan_inst_16wide)
2569 needs_dep[reg - first_write_grf + 1] = false;
2570 }
2571 }
2572 }
2573
2574 /* Clear the flag for registers that actually got read (as expected). */
2575 clear_deps_for_inst_src(scan_inst, dispatch_width,
2576 needs_dep, first_write_grf, write_len);
2577
2578 /* Continue the loop only if we haven't resolved all the dependencies */
2579 int i;
2580 for (i = 0; i < write_len; i++) {
2581 if (needs_dep[i])
2582 break;
2583 }
2584 if (i == write_len)
2585 return;
2586 }
2587 }
2588
2589 /**
2590 * Implements this workaround for the original 965:
2591 *
2592 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2593 * used as a destination register until after it has been sourced by an
2594 * instruction with a different destination register."
2595 */
2596 void
2597 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2598 {
2599 int write_len = inst->regs_written * dispatch_width / 8;
2600 int first_write_grf = inst->dst.reg;
2601 bool needs_dep[BRW_MAX_MRF];
2602 assert(write_len < (int)sizeof(needs_dep) - 1);
2603
2604 memset(needs_dep, false, sizeof(needs_dep));
2605 memset(needs_dep, true, write_len);
2606 /* Walk forwards looking for writes to registers we're writing which aren't
2607 * read before being written.
2608 */
2609 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2610 !scan_inst->is_tail_sentinel();
2611 scan_inst = (fs_inst *)scan_inst->next) {
2612 /* If we hit control flow, force resolve all remaining dependencies. */
2613 if (scan_inst->is_control_flow()) {
2614 for (int i = 0; i < write_len; i++) {
2615 if (needs_dep[i])
2616 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2617 }
2618 return;
2619 }
2620
2621 /* Clear the flag for registers that actually got read (as expected). */
2622 clear_deps_for_inst_src(scan_inst, dispatch_width,
2623 needs_dep, first_write_grf, write_len);
2624
2625 /* We insert our reads as late as possible since they're reading the
2626 * result of a SEND, which has massive latency.
2627 */
2628 if (scan_inst->dst.file == GRF &&
2629 scan_inst->dst.reg >= first_write_grf &&
2630 scan_inst->dst.reg < first_write_grf + write_len &&
2631 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2632 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2633 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2634 }
2635
2636 /* Continue the loop only if we haven't resolved all the dependencies */
2637 int i;
2638 for (i = 0; i < write_len; i++) {
2639 if (needs_dep[i])
2640 break;
2641 }
2642 if (i == write_len)
2643 return;
2644 }
2645
2646 /* If we hit the end of the program, resolve all remaining dependencies out
2647 * of paranoia.
2648 */
2649 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2650 assert(last_inst->eot);
2651 for (int i = 0; i < write_len; i++) {
2652 if (needs_dep[i])
2653 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2654 }
2655 }
2656
2657 void
2658 fs_visitor::insert_gen4_send_dependency_workarounds()
2659 {
2660 if (brw->gen != 4 || brw->is_g4x)
2661 return;
2662
2663 /* Note that we're done with register allocation, so GRF fs_regs always
2664 * have a .reg_offset of 0.
2665 */
2666
2667 foreach_list_safe(node, &this->instructions) {
2668 fs_inst *inst = (fs_inst *)node;
2669
2670 if (inst->mlen != 0 && inst->dst.file == GRF) {
2671 insert_gen4_pre_send_dependency_workarounds(inst);
2672 insert_gen4_post_send_dependency_workarounds(inst);
2673 }
2674 }
2675 }
2676
2677 /**
2678 * Turns the generic expression-style uniform pull constant load instruction
2679 * into a hardware-specific series of instructions for loading a pull
2680 * constant.
2681 *
2682 * The expression style allows the CSE pass before this to optimize out
2683 * repeated loads from the same offset, and gives the pre-register-allocation
2684 * scheduling full flexibility, while the conversion to native instructions
2685 * allows the post-register-allocation scheduler the best information
2686 * possible.
2687 *
2688 * Note that execution masking for setting up pull constant loads is special:
2689 * the channels that need to be written are unrelated to the current execution
2690 * mask, since a later instruction will use one of the result channels as a
2691 * source operand for all 8 or 16 of its channels.
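 *
 * Concretely, on gen7 the logical load becomes a SET_SIMD4X2_OFFSET writing
 * a dword offset into a payload register plus the GEN7 variant of the load
 * sourcing that payload; on older gens the instruction just gets an MRF
 * (base_mrf 14) and a message length assigned.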
2692 */
2693 void
2694 fs_visitor::lower_uniform_pull_constant_loads()
2695 {
2696 foreach_list(node, &this->instructions) {
2697 fs_inst *inst = (fs_inst *)node;
2698
2699 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2700 continue;
2701
2702 if (brw->gen >= 7) {
2703 /* The offset arg before was a vec4-aligned byte offset. We need to
2704 * turn it into a dword offset.
2705 */
2706 fs_reg const_offset_reg = inst->src[1];
2707 assert(const_offset_reg.file == IMM &&
2708 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2709 const_offset_reg.imm.u /= 4;
2710 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2711
2712 /* This is actually going to be a MOV, but since only the first dword
2713 * is accessed, we have a special opcode to do just that one. Note
2714 * that this needs to be an operation that will be considered a def
2715 * by live variable analysis, or register allocation will explode.
2716 */
2717 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2718 payload, const_offset_reg);
2719 setup->force_writemask_all = true;
2720
2721 setup->ir = inst->ir;
2722 setup->annotation = inst->annotation;
2723 inst->insert_before(setup);
2724
2725 /* Similarly, this will only populate the first 4 channels of the
2726 * result register (since we only use smear values from 0-3), but we
2727 * don't tell the optimizer.
2728 */
2729 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2730 inst->src[1] = payload;
2731
2732 this->live_intervals_valid = false;
2733 } else {
2734 /* Before register allocation, we didn't tell the scheduler about the
2735 * MRF we use. We know it's safe to use this MRF because nothing
2736 * else does except for register spill/unspill, which generates and
2737 * uses its MRF within a single IR instruction.
2738 */
2739 inst->base_mrf = 14;
2740 inst->mlen = 1;
2741 }
2742 }
2743 }
2744
2745 void
2746 fs_visitor::dump_instruction(backend_instruction *be_inst)
2747 {
2748 fs_inst *inst = (fs_inst *)be_inst;
2749
2750 if (inst->predicate) {
2751 printf("(%cf0.%d) ",
2752 inst->predicate_inverse ? '-' : '+',
2753 inst->flag_subreg);
2754 }
2755
2756 printf("%s", brw_instruction_name(inst->opcode));
2757 if (inst->saturate)
2758 printf(".sat");
2759 if (inst->conditional_mod) {
2760 printf(".cmod");
2761 if (!inst->predicate &&
2762 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2763 inst->opcode != BRW_OPCODE_IF &&
2764 inst->opcode != BRW_OPCODE_WHILE))) {
2765 printf(".f0.%d", inst->flag_subreg);
2766 }
2767 }
2768 printf(" ");
2769
2771 switch (inst->dst.file) {
2772 case GRF:
2773 printf("vgrf%d", inst->dst.reg);
2774 if (inst->dst.reg_offset)
2775 printf("+%d", inst->dst.reg_offset);
2776 break;
2777 case MRF:
2778 printf("m%d", inst->dst.reg);
2779 break;
2780 case BAD_FILE:
2781 printf("(null)");
2782 break;
2783 case UNIFORM:
2784 printf("***u%d***", inst->dst.reg);
2785 break;
2786 case ARF:
2787 if (inst->dst.reg == BRW_ARF_NULL)
2788 printf("(null)");
2789 else
2790 printf("arf%d", inst->dst.reg);
2791 break;
2792 default:
2793 printf("???");
2794 break;
2795 }
2796 printf(", ");
2797
2798 for (int i = 0; i < 3; i++) {
2799 if (inst->src[i].negate)
2800 printf("-");
2801 if (inst->src[i].abs)
2802 printf("|");
2803 switch (inst->src[i].file) {
2804 case GRF:
2805 printf("vgrf%d", inst->src[i].reg);
2806 if (inst->src[i].reg_offset)
2807 printf("+%d", inst->src[i].reg_offset);
2808 break;
2809 case MRF:
2810 printf("***m%d***", inst->src[i].reg);
2811 break;
2812 case UNIFORM:
2813 printf("u%d", inst->src[i].reg);
2814 if (inst->src[i].reg_offset)
2815 printf(".%d", inst->src[i].reg_offset);
2816 break;
2817 case BAD_FILE:
2818 printf("(null)");
2819 break;
2820 case IMM:
2821 switch (inst->src[i].type) {
2822 case BRW_REGISTER_TYPE_F:
2823 printf("%ff", inst->src[i].imm.f);
2824 break;
2825 case BRW_REGISTER_TYPE_D:
2826 printf("%dd", inst->src[i].imm.i);
2827 break;
2828 case BRW_REGISTER_TYPE_UD:
2829 printf("%uu", inst->src[i].imm.u);
2830 break;
2831 default:
2832 printf("???");
2833 break;
2834 }
2835 break;
2836 default:
2837 printf("???");
2838 break;
2839 }
2840 if (inst->src[i].abs)
2841 printf("|");
2842
2843 if (i < 2) /* don't print a separator after the last source */
2844 printf(", ");
2845 }
2846
2847 printf(" ");
2848
2849 if (inst->force_uncompressed)
2850 printf("1sthalf ");
2851
2852 if (inst->force_sechalf)
2853 printf("2ndhalf ");
2854
2855 printf("\n");
2856 }
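/* A hypothetical line of output from the above, for an instruction that is
 * predicated, saturating, and runs on the second half of a 16-wide dispatch:
 *
 *    (+f0.0) add.sat vgrf7+1, vgrf3, -vgrf4, (null) 2ndhalf
 */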
2857
2858 /**
2859 * Possibly returns an instruction that set up @param reg.
2860 *
2861 * Sometimes we want to take the result of some expression/variable
2862 * dereference tree and rewrite the instruction generating the result
2863 * of the tree. When processing the tree, we know that the
2864 * instructions generated are all writing temporaries that are dead
2865 * outside of this tree. So, if we have some instructions that write
2866 * a temporary, we're free to point that temp write somewhere else.
2867 *
2868 * Note that this doesn't guarantee that the returned instruction wrote only
2869 * reg -- it might be the size=4 destination of a texture instruction.
2870 */
2871 fs_inst *
2872 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2873 fs_inst *end,
2874 fs_reg reg)
2875 {
2876 if (end == start ||
2877 end->is_partial_write() ||
2878 reg.reladdr ||
2879 !reg.equals(end->dst)) {
2880 return NULL;
2881 } else {
2882 return end;
2883 }
2884 }
2885
2886 void
2887 fs_visitor::setup_payload_gen6()
2888 {
2889 bool uses_depth =
2890 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2891 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2892
2893 assert(brw->gen >= 6);
2894
2895 /* R0-1: masks, pixel X/Y coordinates. */
2896 c->nr_payload_regs = 2;
2897 /* R2: only for 32-pixel dispatch. */
2898
2899 /* R3-26: barycentric interpolation coordinates. These appear in the
2900 * same order that they appear in the brw_wm_barycentric_interp_mode
2901 * enum. Each set of coordinates occupies 2 registers if dispatch width
2902 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2903 * appear if they were enabled using the "Barycentric Interpolation
2904 * Mode" bits in WM_STATE.
2905 */
2906 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2907 if (barycentric_interp_modes & (1 << i)) {
2908 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2909 c->nr_payload_regs += 2;
2910 if (dispatch_width == 16) {
2911 c->nr_payload_regs += 2;
2912 }
2913 }
2914 }
2915
2916 /* R27: interpolated depth if uses source depth */
2917 if (uses_depth) {
2918 c->source_depth_reg = c->nr_payload_regs;
2919 c->nr_payload_regs++;
2920 if (dispatch_width == 16) {
2921 /* R28: interpolated depth if not 8-wide. */
2922 c->nr_payload_regs++;
2923 }
2924 }
2925 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2926 if (uses_depth) {
2927 c->source_w_reg = c->nr_payload_regs;
2928 c->nr_payload_regs++;
2929 if (dispatch_width == 16) {
2930 /* R30: interpolated W if not 8-wide. */
2931 c->nr_payload_regs++;
2932 }
2933 }
2934 /* R31: MSAA position offsets. */
2935 /* R32-: bary for 32-pixel. */
2936 /* R58-59: interp W for 32-pixel. */
2937
2938 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2939 c->source_depth_to_render_target = true;
2940 }
2941 }
2942
2943 bool
2944 fs_visitor::run()
2945 {
2946 sanity_param_count = fp->Base.Parameters->NumParameters;
2947 uint32_t orig_nr_params = c->prog_data.nr_params;
2948
2949 if (brw->gen >= 6)
2950 setup_payload_gen6();
2951 else
2952 setup_payload_gen4();
2953
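/* Debug hook: flipping this to 1 compiles a trivial dummy shader instead of
 * the real program. */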
2954 if (0) {
2955 emit_dummy_fs();
2956 } else {
2957 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2958 emit_shader_time_begin();
2959
2960 calculate_urb_setup();
2961 if (brw->gen < 6)
2962 emit_interpolation_setup_gen4();
2963 else
2964 emit_interpolation_setup_gen6();
2965
2966 /* We handle discards by keeping track of the still-live pixels in f0.1.
2967 * Initialize it with the dispatched pixels.
2968 */
2969 if (fp->UsesKill) {
2970 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2971 discard_init->flag_subreg = 1;
2972 }
2973
2974 /* Generate FS IR for main().  (The visitor only descends into
2975 * functions called "main".)
2976 */
2977 if (shader) {
2978 foreach_list(node, &*shader->ir) {
2979 ir_instruction *ir = (ir_instruction *)node;
2980 base_ir = ir;
2981 this->result = reg_undef;
2982 ir->accept(this);
2983 }
2984 } else {
2985 emit_fragment_program_code();
2986 }
2987 base_ir = NULL;
2988 if (failed)
2989 return false;
2990
2991 emit(FS_OPCODE_PLACEHOLDER_HALT);
2992
2993 emit_fb_writes();
2994
2995 split_virtual_grfs();
2996
2997 move_uniform_array_access_to_pull_constants();
2998 setup_pull_constants();
2999
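/* Run the optimization passes to a fixed point: each pass reports whether
 * it changed anything, and the loop repeats until a full round makes no
 * progress. */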
3000 bool progress;
3001 do {
3002 progress = false;
3003
3004 compact_virtual_grfs();
3005
3006 progress = remove_duplicate_mrf_writes() || progress;
3007
3008 progress = opt_algebraic() || progress;
3009 progress = opt_cse() || progress;
3010 progress = opt_copy_propagate() || progress;
3011 progress = dead_code_eliminate() || progress;
3012 progress = dead_code_eliminate_local() || progress;
3013 progress = register_coalesce() || progress;
3014 progress = register_coalesce_2() || progress;
3015 progress = compute_to_mrf() || progress;
3016 } while (progress);
3017
3018 remove_dead_constants();
3019
3020 schedule_instructions(false);
3021
3022 lower_uniform_pull_constant_loads();
3023
3024 assign_curb_setup();
3025 assign_urb_setup();
3026
3027 if (0) {
3028 /* Debug of register spilling: Go spill everything. */
3029 for (int i = 0; i < virtual_grf_count; i++) {
3030 spill_reg(i);
3031 }
3032 }
3033
3034 if (0)
3035 assign_regs_trivial();
3036 else {
3037 while (!assign_regs()) {
3038 if (failed)
3039 break;
3040 }
3041 }
3042 }
3043 assert(force_uncompressed_stack == 0);
3044 assert(force_sechalf_stack == 0);
3045
3046 /* This must come after all optimization and register allocation, since
3047 * it inserts dead code that happens to have side effects, and it does
3048 * so based on the actual physical registers in use.
3049 */
3050 insert_gen4_send_dependency_workarounds();
3051
3052 if (failed)
3053 return false;
3054
3055 schedule_instructions(true);
3056
3057 if (dispatch_width == 8) {
3058 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3059 } else {
3060 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3061
3062 /* Make sure we didn't try to sneak in an extra uniform */
3063 assert(orig_nr_params == c->prog_data.nr_params);
3064 (void) orig_nr_params;
3065 }
3066
3067 /* If any state parameters were appended, then ParameterValues could have
3068 * been realloced, in which case the driver uniform storage set up by
3069 * _mesa_associate_uniform_storage() would point to freed memory. Make
3070 * sure that didn't happen.
3071 */
3072 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3073
3074 return !failed;
3075 }
3076
3077 const unsigned *
3078 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3079 struct gl_fragment_program *fp,
3080 struct gl_shader_program *prog,
3081 unsigned *final_assembly_size)
3082 {
3083 bool start_busy = false;
3084 float start_time = 0;
3085
3086 if (unlikely(brw->perf_debug)) {
3087 start_busy = (brw->batch.last_bo &&
3088 drm_intel_bo_busy(brw->batch.last_bo));
3089 start_time = get_time();
3090 }
3091
3092 struct brw_shader *shader = NULL;
3093 if (prog)
3094 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3095
3096 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3097 if (prog) {
3098 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3099 _mesa_print_ir(shader->ir, NULL);
3100 printf("\n\n");
3101 } else {
3102 printf("ARB_fragment_program %d ir for native fragment shader\n",
3103 fp->Base.Id);
3104 _mesa_print_program(&fp->Base);
3105 }
3106 }
3107
3108 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3109 */
3110 fs_visitor v(brw, c, prog, fp, 8);
3111 if (!v.run()) {
3112 if (prog) {
3113 prog->LinkStatus = false;
3114 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3115 }
3116
3117 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3118 v.fail_msg);
3119
3120 return NULL;
3121 }
3122
3123 exec_list *simd16_instructions = NULL;
3124 fs_visitor v2(brw, c, prog, fp, 16);
3125 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3126 if (c->prog_data.nr_pull_params == 0) {
3127 /* Try a 16-wide compile */
3128 v2.import_uniforms(&v);
3129 if (!v2.run()) {
3130 perf_debug("16-wide shader failed to compile, falling back to "
3131 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3132 } else {
3133 simd16_instructions = &v2.instructions;
3134 }
3135 } else {
3136 perf_debug("Skipping 16-wide due to pull parameters.\n");
3137 }
3138 }
3139
3140 c->prog_data.dispatch_width = 8;
3141
3142 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3143 const unsigned *generated = g.generate_assembly(&v.instructions,
3144 simd16_instructions,
3145 final_assembly_size);
3146
3147 if (unlikely(brw->perf_debug) && shader) {
3148 if (shader->compiled_once)
3149 brw_wm_debug_recompile(brw, prog, &c->key);
3150 shader->compiled_once = true;
3151
3152 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3153 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3154 (get_time() - start_time) * 1000);
3155 }
3156 }
3157
3158 return generated;
3159 }
3160
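/**
 * Precompiles the fragment shader at link time using a guessed default
 * program key, in the hope that the real draw-time key matches and no
 * recompile is needed then.
 */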
3161 bool
3162 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3163 {
3164 struct brw_context *brw = brw_context(ctx);
3165 struct brw_wm_prog_key key;
3166
3167 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3168 return true;
3169
3170 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3171 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3172 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3173 bool program_uses_dfdy = fp->UsesDFdy;
3174
3175 memset(&key, 0, sizeof(key));
3176
3177 if (brw->gen < 6) {
3178 if (fp->UsesKill)
3179 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3180
3181 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3182 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3183
3184 /* Just assume depth testing. */
3185 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3186 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3187 }
3188
3189 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3190 BRW_FS_VARYING_INPUT_MASK) > 16)
3191 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3192
3193 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3194
3195 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3196 for (unsigned i = 0; i < sampler_count; i++) {
3197 if (fp->Base.ShadowSamplers & (1 << i)) {
3198 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3199 key.tex.swizzles[i] =
3200 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3201 } else {
3202 /* Color sampler: assume no swizzling. */
3203 key.tex.swizzles[i] = SWIZZLE_XYZW;
3204 }
3205 }
3206
3207 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3208 key.drawable_height = ctx->DrawBuffer->Height;
3209 }
3210
3211 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3212 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3213 }
3214
3215 key.nr_color_regions = 1;
3216
3217 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3218 * quality of the derivatives is likely to be determined by the driconf
3219 * option.
3220 */
3221 key.high_quality_derivatives = brw->disable_derivative_optimization;
3222
3223 key.program_string_id = bfp->id;
3224
3225 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3226 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3227
3228 bool success = do_wm_prog(brw, prog, bfp, &key);
3229
3230 brw->wm.base.prog_offset = old_prog_offset;
3231 brw->wm.prog_data = old_prog_data;
3232
3233 return success;
3234 }