i965/fs: Add support for translating ir_triop_fma into MAD.
src/mesa/drivers/dri/i965/brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/hash_table.h"
#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"

void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;

   /* This will be the case for almost all instructions. */
   this->regs_written = 1;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }

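/* Each line below expands the matching macro into a one-line factory
 * method.  For example, ALU3(MAD) defines:
 *
 *    fs_inst *
 *    fs_visitor::MAD(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_MAD, dst, src0, src1, src2);
 *    }
 *
 * giving the visitor a MAD() helper for emitting the three-source
 * multiply-add used when translating ir_triop_fma.
 */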
ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(brw->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (brw->gen == 4) {
      dst.type = src0.type;
      if (dst.file == HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
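
/* A typical use, sketched with a placeholder operand: compare a value
 * against zero and let the conditional mod update the flag register for a
 * following predicated instruction:
 *
 *    emit(CMP(reg_null_d, value, fs_reg(0.0f), BRW_CONDITIONAL_NZ));
 *
 * where "value" stands in for whatever fs_reg the caller is testing.
 */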

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg varying_offset,
                                       uint32_t const_offset)
{
   exec_list instructions;
   fs_inst *inst;

   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable
    * offset and a portion done using reg_offset, which means that if you
    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
    * CSE can later notice that those loads are all the same and eliminate
    * the redundant ones.
    */
   fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
   instructions.push_tail(ADD(vec4_offset,
                              varying_offset, const_offset & ~3));

   int scale = 1;
   if (brw->gen == 4 && dispatch_width == 8) {
      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
       * u, v, r) as parameters, or we can just use the SIMD16 message
       * consisting of (header, u).  We choose the second, at the cost of a
       * longer return length.
       */
      scale = 2;
   }

   enum opcode op;
   if (brw->gen >= 7)
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   else
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
   fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
   inst->regs_written = 4 * scale;
   instructions.push_tail(inst);

   if (brw->gen < 7) {
      inst->base_mrf = 13;
      inst->header_present = true;
      if (brw->gen == 4)
         inst->mlen = 3;
      else
         inst->mlen = 1 + dispatch_width / 8;
   }

   vec4_result.reg_offset += (const_offset & 3) * scale;
   instructions.push_tail(MOV(dst, vec4_result));

   return instructions;
}
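
/* Working through the offset split above with hypothetical numbers: for
 * const_offset = 7 and scale = 1, the ADD computes varying_offset + 4
 * (7 & ~3), the send fills 4 contiguous components, and the trailing MOV
 * reads component 3 (7 & 3) of the result.  Loads whose const_offsets
 * differ only in the low two bits produce identical vec4_offset values,
 * which is exactly what lets CSE merge them later.
 */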

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   inst->ir = NULL;
   inst->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   inst->force_uncompressed = true;

   return inst;
}

bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written);
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (brw->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

bool
fs_reg::is_valid_3src() const
{
   return file == GRF || file == UNIFORM;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(!"not reached");
      break;
   }

   return 0;
}
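
/* type_size() above counts scalar components, not hardware registers: a
 * float is 1, a vec4 is 4, a mat3 is 9 (a FLOAT base type with 9
 * components), "vec4 a[20]" is 80, and a struct is the sum of its members.
 */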

fs_reg
fs_visitor::get_timestamp()
{
   assert(brw->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   int shader_time_index =
      brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);

   fs_reg payload;
   if (dispatch_width == 8)
      payload = fs_reg(this, glsl_type::uvec2_type);
   else
      payload = fs_reg(this, glsl_type::uint_type);

   emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
                fs_reg(), payload, offset, value));
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns true if the instruction has a flag that means it won't
 * update an entire destination register.
 *
 * For example, dead code elimination and live variable analysis want to know
 * when a write to a variable screens off any preceding values that were in
 * it.
 */
bool
fs_inst::is_partial_write()
{
   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
           this->force_uncompressed ||
           this->force_sechalf);
}
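
/* For example, a predicated MOV (when the opcode isn't SEL) only updates
 * the enabled channels, so the destination's previous contents survive in
 * the rest and the write can't screen off earlier definitions.
 */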

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_MS:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->mlen;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

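/**
 * Allocates a new virtual GRF of /size/ contiguous registers and returns
 * its index, doubling the virtual_grf_sizes array whenever it fills up.
 */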
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
   this->nr_params_remap = v->nr_params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
   (void)params_before;
}
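
/* The prefix match above means a uniform named "light" also picks up
 * storage entries such as "light.position" or "light[2]", while a separate
 * uniform named "lighting" is skipped, since the character after the
 * prefix ('i') is not '.', '[', or the terminator.
 */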


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been set up by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (brw->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(VARYING_SLOT_POS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (brw->gen >= 6) {
      if (is_centroid) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
      } else {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
      }
   } else {
      /* On Ironlake and below, there is only one interpolation mode.
       * Centroid interpolation doesn't mean anything on this hardware --
       * there is no multisampling.
       */
      barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case.  The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               struct brw_reg interp = interp_reg(location, k);
               emit_linterp(attr, fs_reg(interp), interpolation_mode,
                            ir->centroid);
               if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                  /* Get the pixel/sample mask into f0 so that we know
                   * which pixels are lit.  Then, for each channel that is
                   * unlit, replace the centroid data with non-centroid
                   * data.
                   */
                  emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
                  fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                               interpolation_mode, false);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = true;
               }
               if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (brw->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (brw->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (brw->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (brw->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (brw->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (brw->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}
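
/* As a concrete mapping, assuming c->nr_payload_regs == 2: UNIFORM slot 10
 * becomes g3.2 -- payload register 2 plus 10 / 8, component 10 % 8 -- as a
 * vec1 so every execution channel reads the same scalar.
 */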

void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (brw->gen >= 6) {
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's an FS-only attribute, and we did the interpolation for this
       * attribute in the SF thread.  So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] indices by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written > 1) {
         split_grf[inst->dst.reg] = false;
      }

      /* If we're sending from a GRF, don't split it, on the assumption that
       * the send is reading the whole thing.
       */
      if (inst->is_send_from_grf()) {
         split_grf[inst->src[0].reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}
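
/* The net effect: a 4-register virtual GRF holding a vec4 temporary turns
 * into four independent 1-register VGRFs, so a component that dies early
 * no longer extends the live interval of its siblings.
 */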

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_start[new_index] = virtual_grf_start[i];
            virtual_grf_end[new_index] = virtual_grf_end[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
      this->nr_params_remap = c->prog_data.nr_params;

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            /* Section 5.11 of the OpenGL 4.3 spec says:
             *
             *     "Out-of-bounds reads return undefined values, which include
             *      values from other variables of the active program or zero."
             */
            if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
               constant_nr = 0;
            }

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         /* As above, alias out-of-bounds reads to constant 0. */
         if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
            constant_nr = 0;
         }
         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index,
                                                     *inst->src[i].reladdr,
                                                     pull_constant_loc[uniform] +
                                                     inst->src[i].reg_offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}
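
/* For GLSL like "uniform vec4 a[20]; ... a[i]" (the example from
 * VARYING_PULL_CONSTANT_LOAD above), each UNIFORM source carrying a
 * reladdr is rewritten here: the example's 80 float params of "a" are
 * appended to pull_param once, and the access becomes a
 * VARYING_PULL_CONSTANT_LOAD indexed by the reladdr register.
 */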

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}
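
/* Note the addressing at the end: pull constants are fetched as aligned
 * vec4s, so a demoted param at pull_index 13 loads the block at byte
 * offset 48 ((13 * 4) & ~15) and smears component 1 (13 & 3) across the
 * execution channels.
 */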

bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}
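
/* Both patterns above strength-reduce to a MOV: "MUL dst, a, 1.0F" becomes
 * "MOV dst, a", and "MUL dst, a, 0.0F" becomes "MOV dst, 0.0F", leaving
 * copy propagation and dead code elimination to clean up the rest.
 */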
1785
1786 /**
1787 * Removes any instructions writing a VGRF where that VGRF is not used by any
1788 * later instruction.
1789 */
1790 bool
1791 fs_visitor::dead_code_eliminate()
1792 {
1793 bool progress = false;
1794 int pc = 0;
1795
1796 calculate_live_intervals();
1797
1798 foreach_list_safe(node, &this->instructions) {
1799 fs_inst *inst = (fs_inst *)node;
1800
1801 if (inst->dst.file == GRF) {
1802 assert(this->virtual_grf_end[inst->dst.reg] >= pc);
1803 if (this->virtual_grf_end[inst->dst.reg] == pc) {
1804 inst->remove();
1805 progress = true;
1806 }
1807 }
1808
1809 pc++;
1810 }
1811
1812 if (progress)
1813 live_intervals_valid = false;
1814
1815 return progress;
1816 }
1817
1818 struct dead_code_hash_key
1819 {
1820 int vgrf;
1821 int reg_offset;
1822 };
1823
1824 static bool
1825 dead_code_hash_compare(const void *a, const void *b)
1826 {
1827 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1828 }
1829
1830 static void
1831 clear_dead_code_hash(struct hash_table *ht)
1832 {
1833 struct hash_entry *entry;
1834
1835 hash_table_foreach(ht, entry) {
1836 _mesa_hash_table_remove(ht, entry);
1837 }
1838 }
1839
1840 static void
1841 insert_dead_code_hash(struct hash_table *ht,
1842 int vgrf, int reg_offset, fs_inst *inst)
1843 {
1844 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1845 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1846
1847 key->vgrf = vgrf;
1848 key->reg_offset = reg_offset;
1849
1850 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1851 }
1852
1853 static struct hash_entry *
1854 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1855 {
1856 struct dead_code_hash_key key;
1857
1858 key.vgrf = vgrf;
1859 key.reg_offset = reg_offset;
1860
1861 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1862 }
1863
1864 static void
1865 remove_dead_code_hash(struct hash_table *ht,
1866 int vgrf, int reg_offset)
1867 {
1868 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1869 if (!entry)
1870 return;
1871
1872 _mesa_hash_table_remove(ht, entry);
1873 }
1874
1875 /**
1876 * Walks basic blocks, removing any regs that are written but not read before
1877 * being redefined.
1878 *
1879 * The dead_code_eliminate() function implements a global dead code
1880 * elimination, but it only handles the removing the last write to a register
1881 * if it's never read. This one can handle intermediate writes, but only
1882 * within a basic block.
1883 */
bool
fs_visitor::dead_code_eliminate_local()
{
   struct hash_table *ht;
   bool progress = false;

   ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* At a basic block boundary, empty the HT since we don't track
       * dataflow across blocks here.
       */
      if (inst->is_control_flow()) {
         clear_dead_code_hash(ht);
         continue;
      }

      /* Clear the HT of any instructions that got read. */
      for (int i = 0; i < 3; i++) {
         fs_reg src = inst->src[i];
         if (src.file != GRF)
            continue;

         int read = 1;
         if (inst->is_send_from_grf())
            read = virtual_grf_sizes[src.reg] - src.reg_offset;

         for (int reg_offset = src.reg_offset;
              reg_offset < src.reg_offset + read;
              reg_offset++) {
            remove_dead_code_hash(ht, src.reg, reg_offset);
         }
      }

      /* Add any update of a GRF to the HT, removing a previous write if it
       * wasn't read.
       */
      if (inst->dst.file == GRF) {
         if (inst->regs_written > 1) {
            /* We don't know how to trim channels from an instruction's
             * writes, so we can't incrementally remove unread channels from
             * it.  Just remove whatever it overwrites from the table.
             */
            for (int i = 0; i < inst->regs_written; i++) {
               remove_dead_code_hash(ht,
                                     inst->dst.reg,
                                     inst->dst.reg_offset + i);
            }
         } else {
            struct hash_entry *entry =
               get_dead_code_hash_entry(ht, inst->dst.reg,
                                        inst->dst.reg_offset);

            if (inst->is_partial_write()) {
               /* For a partial write, we can't remove any previous dead code
                * candidate, since we're just modifying its result, but we
                * can be dead code eliminated ourselves.
                */
               if (entry) {
                  entry->data = inst;
               } else {
                  insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
                                        inst);
               }
            } else {
               if (entry) {
                  /* We're completely updating a channel, and there was a
                   * previous write to the channel that wasn't read.  Kill it!
                   */
                  fs_inst *dead_inst = (fs_inst *)entry->data;
                  dead_inst->remove();
                  progress = true;
                  _mesa_hash_table_remove(ht, entry);
               }

               insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
                                     inst);
            }
         }
      }
   }

   _mesa_hash_table_destroy(ht, NULL);

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
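/* A minimal sketch of the pattern (hypothetical register numbering): if
 * the live ranges of vgrf0 and vgrf1 don't interfere, every reference to
 * vgrf0 is renamed to vgrf1 and the copy disappears:
 *
 *    add vgrf0, vgrf2, vgrf3   ->   add vgrf1, vgrf2, vgrf3
 *    mov vgrf1, vgrf0               (removed)
 */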
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->is_partial_write() ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      foreach_list(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();

      /* We don't need to recalculate live intervals inside the loop despite
       * clearing live_intervals_valid, because we only use live intervals
       * for the interferes test, and we must have had a situation where the
       * intervals were:
       *
       *  from  to
       *  ^
       *  |
       *  v
       *        ^
       *        |
       *        v
       *
       * Some register R that might get coalesced with one of these two could
       * only be referencing "to", otherwise "from"'s range would have been
       * longer.  R's range could also only start at the end of "to" or later,
       * otherwise it will conflict with "to" when we try to coalesce "to"
       * into R anyway.
       */
      live_intervals_valid = false;

      progress = true;
      continue;
   }

   return progress;
}

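/**
 * Coalesces a raw MOV from a GRF (or uniform) into its later readers, for
 * code the MOV dominates: if neither register involved is overwritten
 * before the end of the program, later reads of the MOV's destination are
 * rewritten to read its source instead and the MOV is removed.
 *
 * A hedged sketch (hypothetical register numbering):
 *
 *    mov vgrf1, vgrf0          ->   (removed)
 *    mul vgrf2, vgrf1, vgrf3        mul vgrf2, vgrf0, vgrf3
 */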
bool
fs_visitor::register_coalesce()
{
   bool progress = false;
   int if_depth = 0;
   int loop_depth = 0;

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Make sure that we dominate the instructions we're going to
       * scan for interfering with our coalescing, or we won't have
       * scanned enough to see if anything interferes with our
       * coalescing.  We don't dominate the following instructions if
       * we're in a loop or an if block.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
         loop_depth++;
         break;
      case BRW_OPCODE_WHILE:
         loop_depth--;
         break;
      case BRW_OPCODE_IF:
         if_depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if_depth--;
         break;
      default:
         break;
      }
      if (loop_depth || if_depth)
         continue;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->is_partial_write() ||
          inst->saturate ||
          inst->dst.file != GRF || (inst->src[0].file != GRF &&
                                    inst->src[0].file != UNIFORM) ||
          inst->dst.type != inst->src[0].type)
         continue;

      bool has_source_modifiers = (inst->src[0].abs ||
                                   inst->src[0].negate ||
                                   inst->src[0].smear != -1 ||
                                   inst->src[0].file == UNIFORM);

      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
       * them: check for no writes to either one until the exit of the
       * program.
       */
      bool interfered = false;

      for (fs_inst *scan_inst = (fs_inst *)inst->next;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         if (scan_inst->dst.file == GRF) {
            if (scan_inst->overwrites_reg(inst->dst) ||
                scan_inst->overwrites_reg(inst->src[0])) {
               interfered = true;
               break;
            }
         }

         if (has_source_modifiers) {
            for (int i = 0; i < 3; i++) {
               if (scan_inst->src[i].file == GRF &&
                   scan_inst->src[i].reg == inst->dst.reg &&
                   scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
                   inst->dst.type != scan_inst->src[i].type)
               {
                  interfered = true;
                  break;
               }
            }
         }

         /* The gen6 MATH instruction can't handle source modifiers or
          * unusual register regions, so avoid coalescing those for
          * now.  We should do something more specific.
          */
         if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
            interfered = true;
            break;
         }

         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uint.
          */
         if (scan_inst->conditional_mod &&
             inst->src[0].negate &&
             inst->src[0].type == BRW_REGISTER_TYPE_UD) {
            interfered = true;
            break;
         }
      }
      if (interfered) {
         continue;
      }

      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (fs_inst *scan_inst = inst;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->dst.reg &&
                scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
               fs_reg new_src = inst->src[0];
               if (scan_inst->src[i].abs) {
                  new_src.negate = 0;
                  new_src.abs = 1;
               }
               new_src.negate ^= scan_inst->src[i].negate;
               scan_inst->src[i] = new_src;
            }
         }
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

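/**
 * Attempts to rewrite the instruction generating a GRF value to write it
 * directly into the MRF that a following raw MOV copies it to, so the MOV
 * can be removed.
 *
 * A hedged sketch (hypothetical registers):
 *
 *    add vgrf0, vgrf1, vgrf2   ->   add m3, vgrf1, vgrf2
 *    mov m3, vgrf0                  (removed)
 */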
bool
fs_visitor::compute_to_mrf()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      int ip = next_ip;
      next_ip++;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->is_partial_write() ||
          inst->dst.file != MRF || inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
         continue;

      /* Work out which hardware MRF registers are written by this
       * instruction.
       */
      int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
      int mrf_high;
      if (inst->dst.reg & BRW_MRF_COMPR4) {
         mrf_high = mrf_low + 4;
      } else if (dispatch_width == 16 &&
                 (!inst->force_uncompressed && !inst->force_sechalf)) {
         mrf_high = mrf_low + 1;
      } else {
         mrf_high = mrf_low;
      }

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_end[inst->src[0].reg] > ip)
         continue;

      /* Found a move of a GRF to an MRF.  Let's see if we can rewrite
       * the instruction that generated this GRF to write into the MRF
       * instead.
       */
      fs_inst *scan_inst;
      for (scan_inst = (fs_inst *)inst->prev;
           scan_inst->prev != NULL;
           scan_inst = (fs_inst *)scan_inst->prev) {
         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->src[0].reg) {
            /* Found the last write to the reg we want to turn into a
             * compute-to-MRF.
             */

            /* If this one instruction didn't populate all the
             * channels, bail.  We might be able to rewrite everything
             * that writes that reg, but it would require smarter
             * tracking to delay the rewriting until complete success.
             */
            if (scan_inst->is_partial_write())
               break;

            /* Things returning more than one register would need us to
             * understand coalescing out more than one MOV at a time.
             */
            if (scan_inst->regs_written > 1)
               break;

            /* SEND instructions can't have MRF as a destination. */
            if (scan_inst->mlen)
               break;

            if (brw->gen == 6) {
               /* gen6 math instructions must have the destination be
                * GRF, so no compute-to-MRF for them.
                */
               if (scan_inst->is_math()) {
                  break;
               }
            }

            if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               /* Found the creator of our MRF's source value. */
               scan_inst->dst.file = MRF;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->saturate |= inst->saturate;
               inst->remove();
               progress = true;
            }
            break;
         }

         /* We don't handle control flow here.  Most computation of
          * values that end up in MRFs are shortly before the MRF
          * write anyway.
          */
         if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
            break;

         /* You can't read from an MRF, so if someone else reads our
          * MRF's source GRF that we wanted to rewrite, that stops us.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->src[0].reg &&
                scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         if (scan_inst->dst.file == MRF) {
            /* If somebody else writes our MRF here, we can't
             * compute-to-MRF before that.
             */
            int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
            int scan_mrf_high;

            if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
               scan_mrf_high = scan_mrf_low + 4;
            } else if (dispatch_width == 16 &&
                       (!scan_inst->force_uncompressed &&
                        !scan_inst->force_sechalf)) {
               scan_mrf_high = scan_mrf_low + 1;
            } else {
               scan_mrf_high = scan_mrf_low;
            }

            if (mrf_low == scan_mrf_low ||
                mrf_low == scan_mrf_high ||
                mrf_high == scan_mrf_low ||
                mrf_high == scan_mrf_high) {
               break;
            }
         }

         if (scan_inst->mlen > 0) {
            /* Found a SEND instruction, which means that there are
             * live values in MRFs from base_mrf to base_mrf +
             * scan_inst->mlen - 1.  Don't go pushing our MRF write up
             * above it.
             */
            if (mrf_low >= scan_inst->base_mrf &&
                mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
            if (mrf_high >= scan_inst->base_mrf &&
                mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         }
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 */
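/* For example (hypothetical registers), if nothing between the two MOVs
 * in a basic block writes m2 or vgrf3:
 *
 *    mov m2, vgrf3
 *    ...
 *    mov m2, vgrf3     <- equals the tracked last write to m2; removed
 */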
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* We'd need to update the MRF tracking for compressed instructions;
    * bail in 16-wide dispatch for now.
    */
   if (dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->is_control_flow()) {
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
         if (prev_inst && inst->equals(prev_inst)) {
            inst->remove();
            progress = true;
            continue;
         }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
            }
         }
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file == GRF &&
          !inst->is_partial_write()) {
         last_mrf_move[inst->dst.reg] = inst;
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

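/**
 * Helper for the gen4 SEND dependency workarounds below: clears the
 * needs-dep flag for any GRF in [first_grf, first_grf + grf_len) that
 * inst reads, since a read of the register resolves its outstanding
 * dependency.
 */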
static void
clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
                        int first_grf, int grf_len)
{
   bool inst_16wide = (dispatch_width > 8 &&
                       !inst->force_uncompressed &&
                       !inst->force_sechalf);

   /* Clear the flag for registers that actually got read (as expected). */
   for (int i = 0; i < 3; i++) {
      int grf;
      if (inst->src[i].file == GRF) {
         grf = inst->src[i].reg;
      } else if (inst->src[i].file == HW_REG &&
                 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
         grf = inst->src[i].fixed_hw_reg.nr;
      } else {
         continue;
      }

      if (grf >= first_grf &&
          grf < first_grf + grf_len) {
         deps[grf - first_grf] = false;
         if (inst_16wide)
            deps[grf - first_grf + 1] = false;
      }
   }
}

/**
 * Implements this workaround for the original 965:
 *
 *    "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
 *     check for post destination dependencies on this instruction, software
 *     must ensure that there is no destination hazard for the case of ‘write
 *     followed by a posted write’ shown in the following example.
 *
 *     1. mov r3 0
 *     2. send r3.xy <rest of send instruction>
 *     3. mov r2 r3
 *
 *     Due to no post-destination dependency check on the ‘send’, the above
 *     code sequence could have two instructions (1 and 2) in flight at the
 *     same time that both consider ‘r3’ as the target of their final writes."
 */
void
fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
{
   int reg_size = dispatch_width / 8;
   int write_len = inst->regs_written * reg_size;
   int first_write_grf = inst->dst.reg;
   bool needs_dep[BRW_MAX_MRF];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);

   clear_deps_for_inst_src(inst, dispatch_width,
                           needs_dep, first_write_grf, write_len);

   /* Walk backwards looking for writes to registers we're writing which
    * aren't read since being written.  If we hit the start of the program,
    * we assume that there are no outstanding dependencies on entry to the
    * program.
    */
   for (fs_inst *scan_inst = (fs_inst *)inst->prev;
        scan_inst != NULL;
        scan_inst = (fs_inst *)scan_inst->prev) {

      /* If we hit control flow, assume that there *are* outstanding
       * dependencies, and force their cleanup before our instruction.
       */
      if (scan_inst->is_control_flow()) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i]) {
               inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
            }
         }
         return;
      }

      bool scan_inst_16wide = (dispatch_width > 8 &&
                               !scan_inst->force_uncompressed &&
                               !scan_inst->force_sechalf);

      /* We insert our reads as late as possible on the assumption that any
       * instruction but a MOV that might have left us an outstanding
       * dependency has more latency than a MOV.
       */
      if (scan_inst->dst.file == GRF) {
         for (int i = 0; i < scan_inst->regs_written; i++) {
            int reg = scan_inst->dst.reg + i * reg_size;

            if (reg >= first_write_grf &&
                reg < first_write_grf + write_len &&
                needs_dep[reg - first_write_grf]) {
               inst->insert_before(DEP_RESOLVE_MOV(reg));
               needs_dep[reg - first_write_grf] = false;
               if (scan_inst_16wide)
                  needs_dep[reg - first_write_grf + 1] = false;
            }
         }
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, dispatch_width,
                              needs_dep, first_write_grf, write_len);

      /* Continue the loop only if we haven't resolved all the dependencies. */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }
}

/**
 * Implements this workaround for the original 965:
 *
 *    "[DevBW, DevCL] Errata: A destination register from a send can not be
 *     used as a destination register until after it has been sourced by an
 *     instruction with a different destination register."
 */
void
fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
{
   int write_len = inst->regs_written * dispatch_width / 8;
   int first_write_grf = inst->dst.reg;
   bool needs_dep[BRW_MAX_MRF];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);
   /* Walk forwards looking for writes to registers we're writing which aren't
    * read before being written.
    */
   for (fs_inst *scan_inst = (fs_inst *)inst->next;
        !scan_inst->is_tail_sentinel();
        scan_inst = (fs_inst *)scan_inst->next) {
      /* If we hit control flow, force resolve all remaining dependencies. */
      if (scan_inst->is_control_flow()) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i])
               scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
         }
         return;
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, dispatch_width,
                              needs_dep, first_write_grf, write_len);

      /* We insert our reads as late as possible since they're reading the
       * result of a SEND, which has massive latency.
       */
      if (scan_inst->dst.file == GRF &&
          scan_inst->dst.reg >= first_write_grf &&
          scan_inst->dst.reg < first_write_grf + write_len &&
          needs_dep[scan_inst->dst.reg - first_write_grf]) {
         scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
         needs_dep[scan_inst->dst.reg - first_write_grf] = false;
      }

      /* Continue the loop only if we haven't resolved all the dependencies. */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }

   /* If we hit the end of the program, resolve all remaining dependencies out
    * of paranoia.
    */
   fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
   assert(last_inst->eot);
   for (int i = 0; i < write_len; i++) {
      if (needs_dep[i])
         last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
   }
}

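/**
 * Applies both of the gen4 SEND dependency workarounds above to each
 * message-sending instruction that writes a GRF.  The guard below limits
 * this to the original 965; G4X and later don't need it.
 */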
void
fs_visitor::insert_gen4_send_dependency_workarounds()
{
   if (brw->gen != 4 || brw->is_g4x)
      return;

   /* Note that we're done with register allocation, so GRF fs_regs always
    * have a .reg_offset of 0.
    */

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->mlen != 0 && inst->dst.file == GRF) {
         insert_gen4_pre_send_dependency_workarounds(inst);
         insert_gen4_post_send_dependency_workarounds(inst);
      }
   }
}

/**
 * Turns the generic expression-style uniform pull constant load instruction
 * into a hardware-specific series of instructions for loading a pull
 * constant.
 *
 * The expression style allows the CSE pass before this to optimize out
 * repeated loads from the same offset, and gives the pre-register-allocation
 * scheduling full flexibility, while the conversion to native instructions
 * allows the post-register-allocation scheduler the best information
 * possible.
 *
 * Note that execution masking for setting up pull constant loads is special:
 * the channels that need to be written are unrelated to the current execution
 * mask, since a later instruction will use one of the result channels as a
 * source operand for all 8 or 16 of its channels.
 */
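/* On gen7, for example, the generic load below (operand layout is
 * illustrative; src[1] is the vec4-aligned byte offset) is rewritten in
 * place, with a SET_SIMD4X2_OFFSET setup MOV inserted before it:
 *
 *    UNIFORM_PULL_CONSTANT_LOAD vgrf1, surf_index, byte_offset
 * ->
 *    SET_SIMD4X2_OFFSET vgrf2, byte_offset / 4
 *    UNIFORM_PULL_CONSTANT_LOAD_GEN7 vgrf1, surf_index, vgrf2
 */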
void
fs_visitor::lower_uniform_pull_constant_loads()
{
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
         continue;

      if (brw->gen >= 7) {
         /* The offset arg before was a vec4-aligned byte offset.  We need to
          * turn it into a dword offset.
          */
         fs_reg const_offset_reg = inst->src[1];
         assert(const_offset_reg.file == IMM &&
                const_offset_reg.type == BRW_REGISTER_TYPE_UD);
         const_offset_reg.imm.u /= 4;
         fs_reg payload = fs_reg(this, glsl_type::uint_type);

         /* This is actually going to be a MOV, but since only the first dword
          * is accessed, we have a special opcode to do just that one.  Note
          * that this needs to be an operation that will be considered a def
          * by live variable analysis, or register allocation will explode.
          */
         fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
                                               payload, const_offset_reg);
         setup->force_writemask_all = true;

         setup->ir = inst->ir;
         setup->annotation = inst->annotation;
         inst->insert_before(setup);

         /* Similarly, this will only populate the first 4 channels of the
          * result register (since we only use smear values from 0-3), but we
          * don't tell the optimizer.
          */
         inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
         inst->src[1] = payload;

         this->live_intervals_valid = false;
      } else {
         /* Before register allocation, we didn't tell the scheduler about the
          * MRF we use.  We know it's safe to use this MRF because nothing
          * else does except for register spill/unspill, which generates and
          * uses its MRF within a single IR instruction.
          */
         inst->base_mrf = 14;
         inst->mlen = 1;
      }
   }
}

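/**
 * Prints a human-readable listing of a single FS IR instruction for
 * debugging, along the lines of (illustrative):
 *
 *    (+f0.0) add.sat vgrf4, vgrf2, u1
 */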
void
fs_visitor::dump_instruction(backend_instruction *be_inst)
{
   fs_inst *inst = (fs_inst *)be_inst;

   if (inst->predicate) {
      printf("(%cf0.%d) ",
             inst->predicate_inverse ? '-' : '+',
             inst->flag_subreg);
   }

   printf("%s", brw_instruction_name(inst->opcode));
   if (inst->saturate)
      printf(".sat");
   if (inst->conditional_mod) {
      printf(".cmod");
      if (!inst->predicate &&
          (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                            inst->opcode != BRW_OPCODE_IF &&
                            inst->opcode != BRW_OPCODE_WHILE))) {
         printf(".f0.%d", inst->flag_subreg);
      }
   }
   printf(" ");

   switch (inst->dst.file) {
   case GRF:
      printf("vgrf%d", inst->dst.reg);
      if (inst->dst.reg_offset)
         printf("+%d", inst->dst.reg_offset);
      break;
   case MRF:
      printf("m%d", inst->dst.reg);
      break;
   case BAD_FILE:
      printf("(null)");
      break;
   case UNIFORM:
      printf("***u%d***", inst->dst.reg);
      break;
   case ARF:
      if (inst->dst.reg == BRW_ARF_NULL)
         printf("(null)");
      else
         printf("arf%d", inst->dst.reg);
      break;
   default:
      printf("???");
      break;
   }
   printf(", ");

   for (int i = 0; i < 3; i++) {
      if (inst->src[i].negate)
         printf("-");
      if (inst->src[i].abs)
         printf("|");
      switch (inst->src[i].file) {
      case GRF:
         printf("vgrf%d", inst->src[i].reg);
         if (inst->src[i].reg_offset)
            printf("+%d", inst->src[i].reg_offset);
         break;
      case MRF:
         printf("***m%d***", inst->src[i].reg);
         break;
      case UNIFORM:
         printf("u%d", inst->src[i].reg);
         if (inst->src[i].reg_offset)
            printf(".%d", inst->src[i].reg_offset);
         break;
      case BAD_FILE:
         printf("(null)");
         break;
      case IMM:
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            printf("%ff", inst->src[i].imm.f);
            break;
         case BRW_REGISTER_TYPE_D:
            printf("%dd", inst->src[i].imm.i);
            break;
         case BRW_REGISTER_TYPE_UD:
            printf("%uu", inst->src[i].imm.u);
            break;
         default:
            printf("???");
            break;
         }
         break;
      default:
         printf("???");
         break;
      }
      if (inst->src[i].abs)
         printf("|");

      if (i < 2)
         printf(", ");
   }

   printf(" ");

   if (inst->force_uncompressed)
      printf("1sthalf ");

   if (inst->force_sechalf)
      printf("2ndhalf ");

   printf("\n");
}

/**
 * Possibly returns an instruction that set up @param reg.
 *
 * Sometimes we want to take the result of some expression/variable
 * dereference tree and rewrite the instruction generating the result
 * of the tree.  When processing the tree, we know that the
 * instructions generated are all writing temporaries that are dead
 * outside of this tree.  So, if we have some instructions that write
 * a temporary, we're free to point that temp write somewhere else.
 *
 * Note that this doesn't guarantee that the returned instruction wrote only
 * reg -- it might be the size=4 destination of a texture instruction.
 */
fs_inst *
fs_visitor::get_instruction_generating_reg(fs_inst *start,
                                           fs_inst *end,
                                           fs_reg reg)
{
   if (end == start ||
       end->is_partial_write() ||
       reg.reladdr ||
       !reg.equals(end->dst)) {
      return NULL;
   } else {
      return end;
   }
}

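/**
 * Assigns payload register numbers for the gen6+ fragment shader thread
 * payload: fixed mask/coordinate registers first, then one set of
 * barycentric coordinates per enabled interpolation mode, then optional
 * source depth and W values.
 */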
void
fs_visitor::setup_payload_gen6()
{
   bool uses_depth =
      (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
   unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;

   assert(brw->gen >= 6);

   /* R0-1: masks, pixel X/Y coordinates. */
   c->nr_payload_regs = 2;
   /* R2: only for 32-pixel dispatch. */

   /* R3-26: barycentric interpolation coordinates.  These appear in the
    * same order that they appear in the brw_wm_barycentric_interp_mode
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
    * appear if they were enabled using the "Barycentric Interpolation
    * Mode" bits in WM_STATE.
    */
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      if (barycentric_interp_modes & (1 << i)) {
         c->barycentric_coord_reg[i] = c->nr_payload_regs;
         c->nr_payload_regs += 2;
         if (dispatch_width == 16) {
            c->nr_payload_regs += 2;
         }
      }
   }

   /* R27: interpolated depth if uses source depth */
   if (uses_depth) {
      c->source_depth_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R28: interpolated depth if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
   if (uses_depth) {
      c->source_w_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R30: interpolated W if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R31: MSAA position offsets. */
   /* R32-: bary for 32-pixel. */
   /* R58-59: interp W for 32-pixel. */

   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      c->source_depth_to_render_target = true;
   }
}

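/**
 * Drives the compile for one dispatch width: sets up the payload, visits
 * the shader IR to build FS IR, runs the optimization loop to a fixed
 * point, and performs scheduling and register allocation.  Returns false
 * if the compile failed.
 */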
bool
fs_visitor::run()
{
   sanity_param_count = fp->Base.Parameters->NumParameters;
   uint32_t orig_nr_params = c->prog_data.nr_params;

   if (brw->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   if (0) {
      emit_dummy_fs();
   } else {
      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_begin();

      calculate_urb_setup();
      if (brw->gen < 6)
         emit_interpolation_setup_gen4();
      else
         emit_interpolation_setup_gen6();

      /* We handle discards by keeping track of the still-live pixels in f0.1.
       * Initialize it with the dispatched pixels.
       */
      if (fp->UsesKill) {
         fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
         discard_init->flag_subreg = 1;
      }

      /* Generate FS IR for main().  (The visitor only descends into
       * functions called "main".)
       */
      if (shader) {
         foreach_list(node, &*shader->ir) {
            ir_instruction *ir = (ir_instruction *)node;
            base_ir = ir;
            this->result = reg_undef;
            ir->accept(this);
         }
      } else {
         emit_fragment_program_code();
      }
      base_ir = NULL;
      if (failed)
         return false;

      emit(FS_OPCODE_PLACEHOLDER_HALT);

      emit_fb_writes();

      split_virtual_grfs();

      move_uniform_array_access_to_pull_constants();
      setup_pull_constants();

      bool progress;
      do {
         progress = false;

         compact_virtual_grfs();

         progress = remove_duplicate_mrf_writes() || progress;

         progress = opt_algebraic() || progress;
         progress = opt_cse() || progress;
         progress = opt_copy_propagate() || progress;
         progress = dead_code_eliminate() || progress;
         progress = dead_code_eliminate_local() || progress;
         progress = register_coalesce() || progress;
         progress = register_coalesce_2() || progress;
         progress = compute_to_mrf() || progress;
      } while (progress);

      remove_dead_constants();

      schedule_instructions(false);

      lower_uniform_pull_constant_loads();

      assign_curb_setup();
      assign_urb_setup();

      if (0) {
         /* Debug of register spilling: Go spill everything. */
         for (int i = 0; i < virtual_grf_count; i++) {
            spill_reg(i);
         }
      }

      if (0)
         assign_regs_trivial();
      else {
         while (!assign_regs()) {
            if (failed)
               break;
         }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   /* This must come after all optimization and register allocation, since
    * it inserts dead code that happens to have side effects, and it does
    * so based on the actual physical registers in use.
    */
   insert_gen4_send_dependency_workarounds();

   if (failed)
      return false;

   schedule_instructions(true);

   if (dispatch_width == 8) {
      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   } else {
      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);

      /* Make sure we didn't try to sneak in an extra uniform. */
      assert(orig_nr_params == c->prog_data.nr_params);
      (void) orig_nr_params;
   }

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory.  Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == fp->Base.Parameters->NumParameters);

   return !failed;
}

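/**
 * Compiles a fragment program to native code: always runs an 8-wide
 * compile, additionally attempts a 16-wide compile where supported, and
 * hands both instruction lists to the generator.
 */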
const unsigned *
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
               struct gl_fragment_program *fp,
               struct gl_shader_program *prog,
               unsigned *final_assembly_size)
{
   bool start_busy = false;
   float start_time = 0;

   if (unlikely(brw->perf_debug)) {
      start_busy = (brw->batch.last_bo &&
                    drm_intel_bo_busy(brw->batch.last_bo));
      start_time = get_time();
   }

   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      if (prog) {
         printf("GLSL IR for native fragment shader %d:\n", prog->Name);
         _mesa_print_ir(shader->ir, NULL);
         printf("\n\n");
      } else {
         printf("ARB_fragment_program %d ir for native fragment shader\n",
                fp->Base.Id);
         _mesa_print_program(&fp->Base);
      }
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(brw, c, prog, fp, 8);
   if (!v.run()) {
      if (prog) {
         prog->LinkStatus = false;
         ralloc_strcat(&prog->InfoLog, v.fail_msg);
      }

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
                    v.fail_msg);

      return NULL;
   }

   exec_list *simd16_instructions = NULL;
   fs_visitor v2(brw, c, prog, fp, 16);
   if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
      if (c->prog_data.nr_pull_params == 0) {
         /* Try a 16-wide compile */
         v2.import_uniforms(&v);
         if (!v2.run()) {
            perf_debug("16-wide shader failed to compile, falling back to "
                       "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
         } else {
            simd16_instructions = &v2.instructions;
         }
      } else {
         perf_debug("Skipping 16-wide due to pull parameters.\n");
      }
   }

   c->prog_data.dispatch_width = 8;

   fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
   const unsigned *generated = g.generate_assembly(&v.instructions,
                                                   simd16_instructions,
                                                   final_assembly_size);

   if (unlikely(brw->perf_debug) && shader) {
      if (shader->compiled_once)
         brw_wm_debug_recompile(brw, prog, &c->key);
      shader->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   return generated;
}

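/**
 * Precompiles the fragment shader at link time with a guessed program key,
 * so that a program matching the common draw-time state is already built
 * when it's first needed.
 */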
bool
brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_wm_prog_key key;

   if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
      return true;

   struct gl_fragment_program *fp = (struct gl_fragment_program *)
      prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
   struct brw_fragment_program *bfp = brw_fragment_program(fp);
   bool program_uses_dfdy = fp->UsesDFdy;

   memset(&key, 0, sizeof(key));

   if (brw->gen < 6) {
      if (fp->UsesKill)
         key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;

      if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
         key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;

      /* Just assume depth testing. */
      key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
      key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
   }

   if (brw->gen < 6)
      key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);

   for (int i = 0; i < VARYING_SLOT_MAX; i++) {
      if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
         continue;

      if (brw->gen < 6) {
         if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
            key.input_slots_valid |= BITFIELD64_BIT(i);
      }
   }

   key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;

   unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
   for (unsigned i = 0; i < sampler_count; i++) {
      if (fp->Base.ShadowSamplers & (1 << i)) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         key.tex.swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         key.tex.swizzles[i] = SWIZZLE_XYZW;
      }
   }

   if (fp->Base.InputsRead & VARYING_BIT_POS) {
      key.drawable_height = ctx->DrawBuffer->Height;
   }

   if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
      key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
   }

   key.nr_color_regions = 1;

   key.program_string_id = bfp->id;

   uint32_t old_prog_offset = brw->wm.prog_offset;
   struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;

   bool success = do_wm_prog(brw, prog, bfp, &key);

   brw->wm.prog_offset = old_prog_offset;
   brw->wm.prog_data = old_prog_data;

   return success;
}