i965/fs: Add dump_instruction() support for ARF destinations.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/uniforms.h"
39 #include "main/fbobject.h"
40 #include "program/prog_parameter.h"
41 #include "program/prog_print.h"
42 #include "program/register_allocate.h"
43 #include "program/sampler.h"
44 #include "program/hash_table.h"
45 #include "brw_context.h"
46 #include "brw_eu.h"
47 #include "brw_wm.h"
48 }
49 #include "brw_fs.h"
50 #include "glsl/glsl_types.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63
64 /* This will be the case for almost all instructions. */
65 this->regs_written = 1;
66 }
67
68 fs_inst::fs_inst()
69 {
70 init();
71 }
72
73 fs_inst::fs_inst(enum opcode opcode)
74 {
75 init();
76 this->opcode = opcode;
77 }
78
79 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
80 {
81 init();
82 this->opcode = opcode;
83 this->dst = dst;
84
85 if (dst.file == GRF)
86 assert(dst.reg_offset >= 0);
87 }
88
89 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
90 {
91 init();
92 this->opcode = opcode;
93 this->dst = dst;
94 this->src[0] = src0;
95
96 if (dst.file == GRF)
97 assert(dst.reg_offset >= 0);
98 if (src[0].file == GRF)
99 assert(src[0].reg_offset >= 0);
100 }
101
102 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
103 {
104 init();
105 this->opcode = opcode;
106 this->dst = dst;
107 this->src[0] = src0;
108 this->src[1] = src1;
109
110 if (dst.file == GRF)
111 assert(dst.reg_offset >= 0);
112 if (src[0].file == GRF)
113 assert(src[0].reg_offset >= 0);
114 if (src[1].file == GRF)
115 assert(src[1].reg_offset >= 0);
116 }
117
118 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
119 fs_reg src0, fs_reg src1, fs_reg src2)
120 {
121 init();
122 this->opcode = opcode;
123 this->dst = dst;
124 this->src[0] = src0;
125 this->src[1] = src1;
126 this->src[2] = src2;
127
128 if (dst.file == GRF)
129 assert(dst.reg_offset >= 0);
130 if (src[0].file == GRF)
131 assert(src[0].reg_offset >= 0);
132 if (src[1].file == GRF)
133 assert(src[1].reg_offset >= 0);
134 if (src[2].file == GRF)
135 assert(src[2].reg_offset >= 0);
136 }
137
138 #define ALU1(op) \
139 fs_inst * \
140 fs_visitor::op(fs_reg dst, fs_reg src0) \
141 { \
142 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
143 }
144
145 #define ALU2(op) \
146 fs_inst * \
147 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
148 { \
149 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
150 }
151
152 #define ALU3(op) \
153 fs_inst * \
154 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
155 { \
156 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
157 }
158
159 ALU1(NOT)
160 ALU1(MOV)
161 ALU1(FRC)
162 ALU1(RNDD)
163 ALU1(RNDE)
164 ALU1(RNDZ)
165 ALU2(ADD)
166 ALU2(MUL)
167 ALU2(MACH)
168 ALU2(AND)
169 ALU2(OR)
170 ALU2(XOR)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
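/* For reference, each ALUn() invocation above expands to a trivial factory
 * method; a sketch of what ALU2(ADD) produces:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * so emit(ADD(dst, a, b)) allocates the instruction out of mem_ctx and
 * appends it to the instruction stream.
 */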
182
183 /** Gen4 predicated IF. */
184 fs_inst *
185 fs_visitor::IF(uint32_t predicate)
186 {
187 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
188 inst->predicate = predicate;
189 return inst;
190 }
191
192 /** Gen6+ IF with embedded comparison. */
193 fs_inst *
194 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
195 {
196 assert(brw->gen >= 6);
197 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
198 reg_null_d, src0, src1);
199 inst->conditional_mod = condition;
200 return inst;
201 }
202
203 /**
 204  * CMP: Sets the low bit of each destination channel to the result
 205  * of the comparison, leaves the upper bits undefined, and updates
206 * the flag register with the packed 16 bits of the result.
207 */
208 fs_inst *
209 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
210 {
211 fs_inst *inst;
212
213 /* Take the instruction:
214 *
215 * CMP null<d> src0<f> src1<f>
216 *
217 * Original gen4 does type conversion to the destination type before
218 * comparison, producing garbage results for floating point comparisons.
219 * gen5 does the comparison on the execution type (resolved source types),
220 * so dst type doesn't matter. gen6 does comparison and then uses the
221 * result as if it was the dst type with no conversion, which happens to
222 * mostly work out for float-interpreted-as-int since our comparisons are
223 * for >0, =0, <0.
224 */
225 if (brw->gen == 4) {
226 dst.type = src0.type;
227 if (dst.file == HW_REG)
228 dst.fixed_hw_reg.type = dst.type;
229 }
230
231 resolve_ud_negate(&src0);
232 resolve_ud_negate(&src1);
233
234 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
235 inst->conditional_mod = condition;
236
237 return inst;
238 }
239
240 exec_list
241 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
242 fs_reg varying_offset,
243 uint32_t const_offset)
244 {
245 exec_list instructions;
246 fs_inst *inst;
247
248 /* We have our constant surface use a pitch of 4 bytes, so our index can
249 * be any component of a vector, and then we load 4 contiguous
250 * components starting from that.
251 *
252 * We break down the const_offset to a portion added to the variable
253 * offset and a portion done using reg_offset, which means that if you
254 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
255 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
256 * CSE can later notice that those loads are all the same and eliminate
257 * the redundant ones.
258 */
259 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
260 instructions.push_tail(ADD(vec4_offset,
261 varying_offset, const_offset & ~3));
262
263 int scale = 1;
264 if (brw->gen == 4 && dispatch_width == 8) {
265 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
266 * u, v, r) as parameters, or we can just use the SIMD16 message
267 * consisting of (header, u). We choose the second, at the cost of a
268 * longer return length.
269 */
270 scale = 2;
271 }
272
273 enum opcode op;
274 if (brw->gen >= 7)
275 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
276 else
277 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
278 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
279 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
280 inst->regs_written = 4 * scale;
281 instructions.push_tail(inst);
282
283 if (brw->gen < 7) {
284 inst->base_mrf = 13;
285 inst->header_present = true;
286 if (brw->gen == 4)
287 inst->mlen = 3;
288 else
289 inst->mlen = 1 + dispatch_width / 8;
290 }
291
292 vec4_result.reg_offset += (const_offset & 3) * scale;
293 instructions.push_tail(MOV(dst, vec4_result));
294
295 return instructions;
296 }
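/* A sketch (not an actual IR dump) of the list this builds for the "a[i]"
 * example above, with varying_offset proportional to i and const_offset
 * selecting the component being loaded:
 *
 *    ADD   vec4_offset, varying_offset, (const_offset & ~3)
 *    LOAD  vec4_result (4 regs), surf_index, vec4_offset
 *    MOV   dst, vec4_result + (const_offset & 3)
 *
 * The four per-component loads of a[i] share their first two instructions,
 * which CSE can then merge.
 */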
297
298 /**
299 * A helper for MOV generation for fixing up broken hardware SEND dependency
300 * handling.
301 */
302 fs_inst *
303 fs_visitor::DEP_RESOLVE_MOV(int grf)
304 {
305 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
306
307 inst->ir = NULL;
308 inst->annotation = "send dependency resolve";
309
 310    /* The caller always wants this MOV uncompressed, to emit the minimal extra
 311     * dependencies and to avoid having to align its registers to 2.
312 */
313 inst->force_uncompressed = true;
314
315 return inst;
316 }
317
318 bool
319 fs_inst::equals(fs_inst *inst)
320 {
321 return (opcode == inst->opcode &&
322 dst.equals(inst->dst) &&
323 src[0].equals(inst->src[0]) &&
324 src[1].equals(inst->src[1]) &&
325 src[2].equals(inst->src[2]) &&
326 saturate == inst->saturate &&
327 predicate == inst->predicate &&
328 conditional_mod == inst->conditional_mod &&
329 mlen == inst->mlen &&
330 base_mrf == inst->base_mrf &&
331 sampler == inst->sampler &&
332 target == inst->target &&
333 eot == inst->eot &&
334 header_present == inst->header_present &&
335 shadow_compare == inst->shadow_compare &&
336 offset == inst->offset);
337 }
338
339 bool
340 fs_inst::overwrites_reg(const fs_reg &reg)
341 {
342 return (reg.file == dst.file &&
343 reg.reg == dst.reg &&
344 reg.reg_offset >= dst.reg_offset &&
345 reg.reg_offset < dst.reg_offset + regs_written);
346 }
347
348 bool
349 fs_inst::is_send_from_grf()
350 {
351 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
352 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
353 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
354 src[1].file == GRF));
355 }
356
357 bool
358 fs_visitor::can_do_source_mods(fs_inst *inst)
359 {
360 if (brw->gen == 6 && inst->is_math())
361 return false;
362
363 if (inst->is_send_from_grf())
364 return false;
365
366 return true;
367 }
368
369 void
370 fs_reg::init()
371 {
372 memset(this, 0, sizeof(*this));
373 this->smear = -1;
374 }
375
376 /** Generic unset register constructor. */
377 fs_reg::fs_reg()
378 {
379 init();
380 this->file = BAD_FILE;
381 }
382
383 /** Immediate value constructor. */
384 fs_reg::fs_reg(float f)
385 {
386 init();
387 this->file = IMM;
388 this->type = BRW_REGISTER_TYPE_F;
389 this->imm.f = f;
390 }
391
392 /** Immediate value constructor. */
393 fs_reg::fs_reg(int32_t i)
394 {
395 init();
396 this->file = IMM;
397 this->type = BRW_REGISTER_TYPE_D;
398 this->imm.i = i;
399 }
400
401 /** Immediate value constructor. */
402 fs_reg::fs_reg(uint32_t u)
403 {
404 init();
405 this->file = IMM;
406 this->type = BRW_REGISTER_TYPE_UD;
407 this->imm.u = u;
408 }
409
410 /** Fixed brw_reg Immediate value constructor. */
411 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
412 {
413 init();
414 this->file = HW_REG;
415 this->fixed_hw_reg = fixed_hw_reg;
416 this->type = fixed_hw_reg.type;
417 }
418
419 bool
420 fs_reg::equals(const fs_reg &r) const
421 {
422 return (file == r.file &&
423 reg == r.reg &&
424 reg_offset == r.reg_offset &&
425 type == r.type &&
426 negate == r.negate &&
427 abs == r.abs &&
428 !reladdr && !r.reladdr &&
429 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
430 sizeof(fixed_hw_reg)) == 0 &&
431 smear == r.smear &&
432 imm.u == r.imm.u);
433 }
434
435 bool
436 fs_reg::is_zero() const
437 {
438 if (file != IMM)
439 return false;
440
441 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
442 }
443
444 bool
445 fs_reg::is_one() const
446 {
447 if (file != IMM)
448 return false;
449
450 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
451 }
452
453 bool
454 fs_reg::is_valid_3src() const
455 {
456 return file == GRF || file == UNIFORM;
457 }
458
459 int
460 fs_visitor::type_size(const struct glsl_type *type)
461 {
462 unsigned int size, i;
463
464 switch (type->base_type) {
465 case GLSL_TYPE_UINT:
466 case GLSL_TYPE_INT:
467 case GLSL_TYPE_FLOAT:
468 case GLSL_TYPE_BOOL:
469 return type->components();
470 case GLSL_TYPE_ARRAY:
471 return type_size(type->fields.array) * type->length;
472 case GLSL_TYPE_STRUCT:
473 size = 0;
474 for (i = 0; i < type->length; i++) {
475 size += type_size(type->fields.structure[i].type);
476 }
477 return size;
478 case GLSL_TYPE_SAMPLER:
479 /* Samplers take up no register space, since they're baked in at
480 * link time.
481 */
482 return 0;
483 case GLSL_TYPE_VOID:
484 case GLSL_TYPE_ERROR:
485 case GLSL_TYPE_INTERFACE:
486 assert(!"not reached");
487 break;
488 }
489
490 return 0;
491 }
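/* Some example sizes, in float-sized components: float -> 1, vec4 -> 4,
 * mat3 -> 9, vec4[20] -> 80, struct { vec3 v; float f; } -> 4, and any
 * sampler -> 0, since samplers consume no register space.
 */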
492
493 fs_reg
494 fs_visitor::get_timestamp()
495 {
496 assert(brw->gen >= 7);
497
498 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
499 BRW_ARF_TIMESTAMP,
500 0),
501 BRW_REGISTER_TYPE_UD));
502
503 fs_reg dst = fs_reg(this, glsl_type::uint_type);
504
505 fs_inst *mov = emit(MOV(dst, ts));
506 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
 507     * even if those channels aren't enabled in the dispatch.
508 */
509 mov->force_writemask_all = true;
510 mov->force_uncompressed = true;
511
512 /* The caller wants the low 32 bits of the timestamp. Since it's running
 512     * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
514 * which is plenty of time for our purposes. It is identical across the
515 * EUs, but since it's tracking GPU core speed it will increment at a
516 * varying rate as render P-states change.
517 *
518 * The caller could also check if render P-states have changed (or anything
519 * else that might disrupt timing) by setting smear to 2 and checking if
520 * that field is != 0.
521 */
522 dst.smear = 0;
523
524 return dst;
525 }
526
527 void
528 fs_visitor::emit_shader_time_begin()
529 {
530 current_annotation = "shader time start";
531 shader_start_time = get_timestamp();
532 }
533
534 void
535 fs_visitor::emit_shader_time_end()
536 {
537 current_annotation = "shader time end";
538
539 enum shader_time_shader_type type, written_type, reset_type;
540 if (dispatch_width == 8) {
541 type = ST_FS8;
542 written_type = ST_FS8_WRITTEN;
543 reset_type = ST_FS8_RESET;
544 } else {
545 assert(dispatch_width == 16);
546 type = ST_FS16;
547 written_type = ST_FS16_WRITTEN;
548 reset_type = ST_FS16_RESET;
549 }
550
551 fs_reg shader_end_time = get_timestamp();
552
553 /* Check that there weren't any timestamp reset events (assuming these
554 * were the only two timestamp reads that happened).
555 */
556 fs_reg reset = shader_end_time;
557 reset.smear = 2;
558 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
559 test->conditional_mod = BRW_CONDITIONAL_Z;
560 emit(IF(BRW_PREDICATE_NORMAL));
561
562 push_force_uncompressed();
563 fs_reg start = shader_start_time;
564 start.negate = true;
565 fs_reg diff = fs_reg(this, glsl_type::uint_type);
566 emit(ADD(diff, start, shader_end_time));
567
568 /* If there were no instructions between the two timestamp gets, the diff
569 * is 2 cycles. Remove that overhead, so I can forget about that when
570 * trying to determine the time taken for single instructions.
571 */
572 emit(ADD(diff, diff, fs_reg(-2u)));
573
574 emit_shader_time_write(type, diff);
575 emit_shader_time_write(written_type, fs_reg(1u));
576 emit(BRW_OPCODE_ELSE);
577 emit_shader_time_write(reset_type, fs_reg(1u));
578 emit(BRW_OPCODE_ENDIF);
579
580 pop_force_uncompressed();
581 }
582
583 void
584 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
585 fs_reg value)
586 {
587 int shader_time_index =
588 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
589 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
590
591 fs_reg payload;
592 if (dispatch_width == 8)
593 payload = fs_reg(this, glsl_type::uvec2_type);
594 else
595 payload = fs_reg(this, glsl_type::uint_type);
596
597 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
598 fs_reg(), payload, offset, value));
599 }
600
601 void
602 fs_visitor::fail(const char *format, ...)
603 {
604 va_list va;
605 char *msg;
606
607 if (failed)
608 return;
609
610 failed = true;
611
612 va_start(va, format);
613 msg = ralloc_vasprintf(mem_ctx, format, va);
614 va_end(va);
615 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
616
617 this->fail_msg = msg;
618
619 if (INTEL_DEBUG & DEBUG_WM) {
620 fprintf(stderr, "%s", msg);
621 }
622 }
623
624 fs_inst *
625 fs_visitor::emit(enum opcode opcode)
626 {
627 return emit(fs_inst(opcode));
628 }
629
630 fs_inst *
631 fs_visitor::emit(enum opcode opcode, fs_reg dst)
632 {
633 return emit(fs_inst(opcode, dst));
634 }
635
636 fs_inst *
637 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
638 {
639 return emit(fs_inst(opcode, dst, src0));
640 }
641
642 fs_inst *
643 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
644 {
645 return emit(fs_inst(opcode, dst, src0, src1));
646 }
647
648 fs_inst *
649 fs_visitor::emit(enum opcode opcode, fs_reg dst,
650 fs_reg src0, fs_reg src1, fs_reg src2)
651 {
652 return emit(fs_inst(opcode, dst, src0, src1, src2));
653 }
654
655 void
656 fs_visitor::push_force_uncompressed()
657 {
658 force_uncompressed_stack++;
659 }
660
661 void
662 fs_visitor::pop_force_uncompressed()
663 {
664 force_uncompressed_stack--;
665 assert(force_uncompressed_stack >= 0);
666 }
667
668 void
669 fs_visitor::push_force_sechalf()
670 {
671 force_sechalf_stack++;
672 }
673
674 void
675 fs_visitor::pop_force_sechalf()
676 {
677 force_sechalf_stack--;
678 assert(force_sechalf_stack >= 0);
679 }
680
681 /**
682 * Returns true if the instruction has a flag that means it won't
683 * update an entire destination register.
684 *
685 * For example, dead code elimination and live variable analysis want to know
686 * when a write to a variable screens off any preceding values that were in
687 * it.
688 */
689 bool
690 fs_inst::is_partial_write()
691 {
692 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
693 this->force_uncompressed ||
694 this->force_sechalf);
695 }
696
697 /**
698 * Returns how many MRFs an FS opcode will write over.
699 *
700 * Note that this is not the 0 or 1 implied writes in an actual gen
701 * instruction -- the FS opcodes often generate MOVs in addition.
702 */
703 int
704 fs_visitor::implied_mrf_writes(fs_inst *inst)
705 {
706 if (inst->mlen == 0)
707 return 0;
708
709 switch (inst->opcode) {
710 case SHADER_OPCODE_RCP:
711 case SHADER_OPCODE_RSQ:
712 case SHADER_OPCODE_SQRT:
713 case SHADER_OPCODE_EXP2:
714 case SHADER_OPCODE_LOG2:
715 case SHADER_OPCODE_SIN:
716 case SHADER_OPCODE_COS:
717 return 1 * dispatch_width / 8;
718 case SHADER_OPCODE_POW:
719 case SHADER_OPCODE_INT_QUOTIENT:
720 case SHADER_OPCODE_INT_REMAINDER:
721 return 2 * dispatch_width / 8;
722 case SHADER_OPCODE_TEX:
723 case FS_OPCODE_TXB:
724 case SHADER_OPCODE_TXD:
725 case SHADER_OPCODE_TXF:
726 case SHADER_OPCODE_TXF_MS:
727 case SHADER_OPCODE_TXL:
728 case SHADER_OPCODE_TXS:
729 case SHADER_OPCODE_LOD:
730 return 1;
731 case FS_OPCODE_FB_WRITE:
732 return 2;
733 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
734 case FS_OPCODE_UNSPILL:
735 return 1;
736 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
737 return inst->mlen;
738 case FS_OPCODE_SPILL:
739 return 2;
740 default:
741 assert(!"not reached");
742 return inst->mlen;
743 }
744 }
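/* For example, on pre-gen6 a SIMD16 POW (which has a nonzero mlen) reports
 * 2 * 16 / 8 = 4 MRFs written, while a sampler message such as
 * SHADER_OPCODE_TXL always reports 1 regardless of dispatch width.
 */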
745
746 int
747 fs_visitor::virtual_grf_alloc(int size)
748 {
749 if (virtual_grf_array_size <= virtual_grf_count) {
750 if (virtual_grf_array_size == 0)
751 virtual_grf_array_size = 16;
752 else
753 virtual_grf_array_size *= 2;
754 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
755 virtual_grf_array_size);
756 }
757 virtual_grf_sizes[virtual_grf_count] = size;
758 return virtual_grf_count++;
759 }
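/* E.g. successive calls virtual_grf_alloc(4), virtual_grf_alloc(1) and
 * virtual_grf_alloc(2) return registers 0, 1 and 2, leaving
 * virtual_grf_sizes[] = { 4, 1, 2 } with the array grown to 16 entries.
 */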
760
761 /** Fixed HW reg constructor. */
762 fs_reg::fs_reg(enum register_file file, int reg)
763 {
764 init();
765 this->file = file;
766 this->reg = reg;
767 this->type = BRW_REGISTER_TYPE_F;
768 }
769
770 /** Fixed HW reg constructor. */
771 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
772 {
773 init();
774 this->file = file;
775 this->reg = reg;
776 this->type = type;
777 }
778
779 /** Automatic reg constructor. */
780 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
781 {
782 init();
783
784 this->file = GRF;
785 this->reg = v->virtual_grf_alloc(v->type_size(type));
786 this->reg_offset = 0;
787 this->type = brw_type_for_base_type(type);
788 }
789
790 fs_reg *
791 fs_visitor::variable_storage(ir_variable *var)
792 {
793 return (fs_reg *)hash_table_find(this->variable_ht, var);
794 }
795
796 void
797 import_uniforms_callback(const void *key,
798 void *data,
799 void *closure)
800 {
801 struct hash_table *dst_ht = (struct hash_table *)closure;
802 const fs_reg *reg = (const fs_reg *)data;
803
804 if (reg->file != UNIFORM)
805 return;
806
807 hash_table_insert(dst_ht, data, key);
808 }
809
 810 /* For 16-wide, we need to follow the uniform setup of the 8-wide dispatch.
 811  * This brings in those uniform definitions.
812 */
813 void
814 fs_visitor::import_uniforms(fs_visitor *v)
815 {
816 hash_table_call_foreach(v->variable_ht,
817 import_uniforms_callback,
818 variable_ht);
819 this->params_remap = v->params_remap;
820 this->nr_params_remap = v->nr_params_remap;
821 }
822
823 /* Our support for uniforms is piggy-backed on the struct
824 * gl_fragment_program, because that's where the values actually
825 * get stored, rather than in some global gl_shader_program uniform
826 * store.
827 */
828 void
829 fs_visitor::setup_uniform_values(ir_variable *ir)
830 {
831 int namelen = strlen(ir->name);
832
833 /* The data for our (non-builtin) uniforms is stored in a series of
834 * gl_uniform_driver_storage structs for each subcomponent that
835 * glGetUniformLocation() could name. We know it's been set up in the same
836 * order we'd walk the type, so walk the list of storage and find anything
837 * with our name, or the prefix of a component that starts with our name.
838 */
839 unsigned params_before = c->prog_data.nr_params;
840 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
841 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
842
843 if (strncmp(ir->name, storage->name, namelen) != 0 ||
844 (storage->name[namelen] != 0 &&
845 storage->name[namelen] != '.' &&
846 storage->name[namelen] != '[')) {
847 continue;
848 }
849
850 unsigned slots = storage->type->component_slots();
851 if (storage->array_elements)
852 slots *= storage->array_elements;
853
854 for (unsigned i = 0; i < slots; i++) {
855 c->prog_data.param[c->prog_data.nr_params++] =
856 &storage->storage[i].f;
857 }
858 }
859
860 /* Make sure we actually initialized the right amount of stuff here. */
861 assert(params_before + ir->type->component_slots() ==
862 c->prog_data.nr_params);
863 (void)params_before;
864 }
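/* An illustrative example (the uniform name is hypothetical): for
 * "uniform vec3 light[2]", the matching gl_uniform_storage entry has
 * component_slots() == 3 and array_elements == 2, so the loop appends six
 * param pointers, one per float of light[0].xyz and light[1].xyz.
 */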
865
866
867 /* Our support for builtin uniforms is even scarier than non-builtin.
868 * It sits on top of the PROG_STATE_VAR parameters that are
869 * automatically updated from GL context state.
870 */
871 void
872 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
873 {
874 const ir_state_slot *const slots = ir->state_slots;
875 assert(ir->state_slots != NULL);
876
877 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
878 /* This state reference has already been setup by ir_to_mesa, but we'll
879 * get the same index back here.
880 */
881 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
882 (gl_state_index *)slots[i].tokens);
883
884 /* Add each of the unique swizzles of the element as a parameter.
885 * This'll end up matching the expected layout of the
886 * array/matrix/structure we're trying to fill in.
887 */
888 int last_swiz = -1;
889 for (unsigned int j = 0; j < 4; j++) {
890 int swiz = GET_SWZ(slots[i].swizzle, j);
891 if (swiz == last_swiz)
892 break;
893 last_swiz = swiz;
894
895 c->prog_data.param[c->prog_data.nr_params++] =
896 &fp->Base.Parameters->ParameterValues[index][swiz].f;
897 }
898 }
899 }
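/* The last_swiz check above deduplicates repeated components: a state slot
 * referenced as .xyzw contributes four params, while one referenced as
 * .xxxx contributes only one, since GET_SWZ() returns the same component
 * for j = 1 as for j = 0 and the loop breaks immediately.
 */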
900
901 fs_reg *
902 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
903 {
904 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
905 fs_reg wpos = *reg;
906 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
907
908 /* gl_FragCoord.x */
909 if (ir->pixel_center_integer) {
910 emit(MOV(wpos, this->pixel_x));
911 } else {
912 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
913 }
914 wpos.reg_offset++;
915
916 /* gl_FragCoord.y */
917 if (!flip && ir->pixel_center_integer) {
918 emit(MOV(wpos, this->pixel_y));
919 } else {
920 fs_reg pixel_y = this->pixel_y;
921 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
922
923 if (flip) {
924 pixel_y.negate = true;
925 offset += c->key.drawable_height - 1.0;
926 }
927
928 emit(ADD(wpos, pixel_y, fs_reg(offset)));
929 }
930 wpos.reg_offset++;
931
932 /* gl_FragCoord.z */
933 if (brw->gen >= 6) {
934 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
935 } else {
936 emit(FS_OPCODE_LINTERP, wpos,
937 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
938 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
939 interp_reg(VARYING_SLOT_POS, 2));
940 }
941 wpos.reg_offset++;
942
943 /* gl_FragCoord.w: Already set up in emit_interpolation */
944 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
945
946 return reg;
947 }
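/* A sketch of the flipped-y math above: when rendering to the window system
 * (flip == true) with half-integer pixel centers, the emitted ADD computes
 *
 *    gl_FragCoord.y = -pixel_y + (0.5 + drawable_height - 1.0)
 *
 * converting pixel_y, which counts down from the top of the drawable, to
 * the GL convention of y increasing upward from the bottom.
 */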
948
949 fs_inst *
950 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
951 glsl_interp_qualifier interpolation_mode,
952 bool is_centroid)
953 {
954 brw_wm_barycentric_interp_mode barycoord_mode;
955 if (brw->gen >= 6) {
956 if (is_centroid) {
957 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
958 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
959 else
960 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
961 } else {
962 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
963 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
964 else
965 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
966 }
967 } else {
968 /* On Ironlake and below, there is only one interpolation mode.
969 * Centroid interpolation doesn't mean anything on this hardware --
970 * there is no multisampling.
971 */
972 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
973 }
974 return emit(FS_OPCODE_LINTERP, attr,
975 this->delta_x[barycoord_mode],
976 this->delta_y[barycoord_mode], interp);
977 }
978
979 fs_reg *
980 fs_visitor::emit_general_interpolation(ir_variable *ir)
981 {
982 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
983 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
984 fs_reg attr = *reg;
985
986 unsigned int array_elements;
987 const glsl_type *type;
988
989 if (ir->type->is_array()) {
990 array_elements = ir->type->length;
991 if (array_elements == 0) {
992 fail("dereferenced array '%s' has length 0\n", ir->name);
993 }
994 type = ir->type->fields.array;
995 } else {
996 array_elements = 1;
997 type = ir->type;
998 }
999
1000 glsl_interp_qualifier interpolation_mode =
1001 ir->determine_interpolation_mode(c->key.flat_shade);
1002
1003 int location = ir->location;
1004 for (unsigned int i = 0; i < array_elements; i++) {
1005 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1006 if (urb_setup[location] == -1) {
1007 /* If there's no incoming setup data for this slot, don't
1008 * emit interpolation for it.
1009 */
1010 attr.reg_offset += type->vector_elements;
1011 location++;
1012 continue;
1013 }
1014
1015 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1016 /* Constant interpolation (flat shading) case. The SF has
1017 * handed us defined values in only the constant offset
1018 * field of the setup reg.
1019 */
1020 for (unsigned int k = 0; k < type->vector_elements; k++) {
1021 struct brw_reg interp = interp_reg(location, k);
1022 interp = suboffset(interp, 3);
1023 interp.type = reg->type;
1024 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1025 attr.reg_offset++;
1026 }
1027 } else {
1028 /* Smooth/noperspective interpolation case. */
1029 for (unsigned int k = 0; k < type->vector_elements; k++) {
1030 /* FINISHME: At some point we probably want to push
1031 * this farther by giving similar treatment to the
1032 * other potentially constant components of the
1033 * attribute, as well as making brw_vs_constval.c
1034 * handle varyings other than gl_TexCoord.
1035 */
1036 struct brw_reg interp = interp_reg(location, k);
1037 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1038 ir->centroid);
1039 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1040 /* Get the pixel/sample mask into f0 so that we know
1041 * which pixels are lit. Then, for each channel that is
1042 * unlit, replace the centroid data with non-centroid
1043 * data.
1044 */
1045 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1046 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1047 interpolation_mode, false);
1048 inst->predicate = BRW_PREDICATE_NORMAL;
1049 inst->predicate_inverse = true;
1050 }
1051 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1052 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1053 }
1054 attr.reg_offset++;
1055 }
1056
1057 }
1058 location++;
1059 }
1060 }
1061
1062 return reg;
1063 }
1064
1065 fs_reg *
1066 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1067 {
1068 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1069
1070 /* The frontfacing comes in as a bit in the thread payload. */
1071 if (brw->gen >= 6) {
1072 emit(BRW_OPCODE_ASR, *reg,
1073 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1074 fs_reg(15));
1075 emit(BRW_OPCODE_NOT, *reg, *reg);
1076 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1077 } else {
1078 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1079 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1080 * us front face
1081 */
1082 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1083 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1084 }
1085
1086 return reg;
1087 }
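/* A worked example of the gen6+ bit math above, assuming bit 15 of R0.0
 * is the back-facing bit: for a back-facing primitive, (R0.0<d> >> 15) has
 * bit 0 set, NOT clears it, and AND with 1 yields 0; for a front-facing
 * primitive the same sequence yields 1.
 */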
1088
1089 fs_reg
1090 fs_visitor::fix_math_operand(fs_reg src)
1091 {
1092 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1093 * might be able to do better by doing execsize = 1 math and then
1094 * expanding that result out, but we would need to be careful with
1095 * masking.
1096 *
1097 * The hardware ignores source modifiers (negate and abs) on math
1098 * instructions, so we also move to a temp to set those up.
1099 */
1100 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1101 !src.abs && !src.negate)
1102 return src;
1103
1104 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1105 * operands to math
1106 */
1107 if (brw->gen >= 7 && src.file != IMM)
1108 return src;
1109
1110 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1111 expanded.type = src.type;
1112 emit(BRW_OPCODE_MOV, expanded, src);
1113 return expanded;
1114 }
1115
1116 fs_inst *
1117 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1118 {
1119 switch (opcode) {
1120 case SHADER_OPCODE_RCP:
1121 case SHADER_OPCODE_RSQ:
1122 case SHADER_OPCODE_SQRT:
1123 case SHADER_OPCODE_EXP2:
1124 case SHADER_OPCODE_LOG2:
1125 case SHADER_OPCODE_SIN:
1126 case SHADER_OPCODE_COS:
1127 break;
1128 default:
1129 assert(!"not reached: bad math opcode");
1130 return NULL;
1131 }
1132
1133 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1134 * might be able to do better by doing execsize = 1 math and then
1135 * expanding that result out, but we would need to be careful with
1136 * masking.
1137 *
1138 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1139 * instructions, so we also move to a temp to set those up.
1140 */
1141 if (brw->gen >= 6)
1142 src = fix_math_operand(src);
1143
1144 fs_inst *inst = emit(opcode, dst, src);
1145
1146 if (brw->gen < 6) {
1147 inst->base_mrf = 2;
1148 inst->mlen = dispatch_width / 8;
1149 }
1150
1151 return inst;
1152 }
1153
1154 fs_inst *
1155 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1156 {
1157 int base_mrf = 2;
1158 fs_inst *inst;
1159
1160 switch (opcode) {
1161 case SHADER_OPCODE_INT_QUOTIENT:
1162 case SHADER_OPCODE_INT_REMAINDER:
1163 if (brw->gen >= 7 && dispatch_width == 16)
1164 fail("16-wide INTDIV unsupported\n");
1165 break;
1166 case SHADER_OPCODE_POW:
1167 break;
1168 default:
1169 assert(!"not reached: unsupported binary math opcode.");
1170 return NULL;
1171 }
1172
1173 if (brw->gen >= 6) {
1174 src0 = fix_math_operand(src0);
1175 src1 = fix_math_operand(src1);
1176
1177 inst = emit(opcode, dst, src0, src1);
1178 } else {
1179 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1180 * "Message Payload":
1181 *
1182 * "Operand0[7]. For the INT DIV functions, this operand is the
1183 * denominator."
1184 * ...
1185 * "Operand1[7]. For the INT DIV functions, this operand is the
1186 * numerator."
1187 */
1188 bool is_int_div = opcode != SHADER_OPCODE_POW;
1189 fs_reg &op0 = is_int_div ? src1 : src0;
1190 fs_reg &op1 = is_int_div ? src0 : src1;
1191
1192 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1193 inst = emit(opcode, dst, op0, reg_null_f);
1194
1195 inst->base_mrf = base_mrf;
1196 inst->mlen = 2 * dispatch_width / 8;
1197 }
1198 return inst;
1199 }
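/* A sketch of the pre-gen6 operand swap above: the INT DIV message wants
 * the denominator as Operand0 and the numerator as Operand1, so for
 * dst = src0 / src1 we emit roughly
 *
 *    MOV   m(base_mrf + 1), src0     // numerator into the payload
 *    MATH  dst, src1                 // denominator as the direct operand
 *
 * whereas POW keeps src0 and src1 in their natural order.
 */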
1200
1201 void
1202 fs_visitor::assign_curb_setup()
1203 {
1204 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1205 if (dispatch_width == 8) {
1206 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1207 } else {
1208 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1209 }
1210
1211 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1212 foreach_list(node, &this->instructions) {
1213 fs_inst *inst = (fs_inst *)node;
1214
1215 for (unsigned int i = 0; i < 3; i++) {
1216 if (inst->src[i].file == UNIFORM) {
1217 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1218 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1219 constant_nr / 8,
1220 constant_nr % 8);
1221
1222 inst->src[i].file = HW_REG;
1223 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1224 }
1225 }
1226 }
1227 }
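/* An example of the mapping above, assuming nr_payload_regs == 2: the
 * uniform at constant_nr == 11 becomes fixed register g3.3 -- GRF
 * 2 + 11 / 8 = 3, subregister 11 % 8 = 3 -- since eight float constants
 * pack into each GRF.
 */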
1228
1229 void
1230 fs_visitor::calculate_urb_setup()
1231 {
1232 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1233 urb_setup[i] = -1;
1234 }
1235
1236 int urb_next = 0;
1237 /* Figure out where each of the incoming setup attributes lands. */
1238 if (brw->gen >= 6) {
1239 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1240 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1241 urb_setup[i] = urb_next++;
1242 }
1243 }
1244 } else {
1245 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1246 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1247 /* Point size is packed into the header, not as a general attribute */
1248 if (i == VARYING_SLOT_PSIZ)
1249 continue;
1250
1251 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1252 /* The back color slot is skipped when the front color is
1253 * also written to. In addition, some slots can be
1254 * written in the vertex shader and not read in the
1255 * fragment shader. So the register number must always be
1256 * incremented, mapped or not.
1257 */
1258 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1259 urb_setup[i] = urb_next;
1260 urb_next++;
1261 }
1262 }
1263
1264 /*
 1265     * gl_PointCoord is a FS-only attribute, and the SF thread did the
 1266     * interpolation for it, so count it here, too.
1267 *
1268 * See compile_sf_prog() for more info.
1269 */
1270 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1271 urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1272 }
1273
1274 /* Each attribute is 4 setup channels, each of which is half a reg. */
1275 c->prog_data.urb_read_length = urb_next * 2;
1276 }
1277
1278 void
1279 fs_visitor::assign_urb_setup()
1280 {
1281 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1282
 1283    /* Offset all the urb_setup[] indices by the actual position of the
1284 * setup regs, now that the location of the constants has been chosen.
1285 */
1286 foreach_list(node, &this->instructions) {
1287 fs_inst *inst = (fs_inst *)node;
1288
1289 if (inst->opcode == FS_OPCODE_LINTERP) {
1290 assert(inst->src[2].file == HW_REG);
1291 inst->src[2].fixed_hw_reg.nr += urb_start;
1292 }
1293
1294 if (inst->opcode == FS_OPCODE_CINTERP) {
1295 assert(inst->src[0].file == HW_REG);
1296 inst->src[0].fixed_hw_reg.nr += urb_start;
1297 }
1298 }
1299
1300 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1301 }
1302
1303 /**
1304 * Split large virtual GRFs into separate components if we can.
1305 *
1306 * This is mostly duplicated with what brw_fs_vector_splitting does,
1307 * but that's really conservative because it's afraid of doing
1308 * splitting that doesn't result in real progress after the rest of
1309 * the optimization phases, which would cause infinite looping in
1310 * optimization. We can do it once here, safely. This also has the
1311 * opportunity to split interpolated values, or maybe even uniforms,
1312 * which we don't have at the IR level.
1313 *
1314 * We want to split, because virtual GRFs are what we register
1315 * allocate and spill (due to contiguousness requirements for some
1316 * instructions), and they're what we naturally generate in the
1317 * codegen process, but most virtual GRFs don't actually need to be
1318 * contiguous sets of GRFs. If we split, we'll end up with reduced
1319 * live intervals and better dead code elimination and coalescing.
1320 */
1321 void
1322 fs_visitor::split_virtual_grfs()
1323 {
1324 int num_vars = this->virtual_grf_count;
1325 bool split_grf[num_vars];
1326 int new_virtual_grf[num_vars];
1327
1328 /* Try to split anything > 0 sized. */
1329 for (int i = 0; i < num_vars; i++) {
1330 if (this->virtual_grf_sizes[i] != 1)
1331 split_grf[i] = true;
1332 else
1333 split_grf[i] = false;
1334 }
1335
1336 if (brw->has_pln &&
1337 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1338 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1339 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1340 * Gen6, that was the only supported interpolation mode, and since Gen6,
1341 * delta_x and delta_y are in fixed hardware registers.
1342 */
1343 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1344 false;
1345 }
1346
1347 foreach_list(node, &this->instructions) {
1348 fs_inst *inst = (fs_inst *)node;
1349
1350 /* If there's a SEND message that requires contiguous destination
1351 * registers, no splitting is allowed.
1352 */
1353 if (inst->regs_written > 1) {
1354 split_grf[inst->dst.reg] = false;
1355 }
1356
1357 /* If we're sending from a GRF, don't split it, on the assumption that
1358 * the send is reading the whole thing.
1359 */
1360 if (inst->is_send_from_grf()) {
1361 split_grf[inst->src[0].reg] = false;
1362 }
1363 }
1364
1365 /* Allocate new space for split regs. Note that the virtual
1366 * numbers will be contiguous.
1367 */
1368 for (int i = 0; i < num_vars; i++) {
1369 if (split_grf[i]) {
1370 new_virtual_grf[i] = virtual_grf_alloc(1);
1371 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1372 int reg = virtual_grf_alloc(1);
1373 assert(reg == new_virtual_grf[i] + j - 1);
1374 (void) reg;
1375 }
1376 this->virtual_grf_sizes[i] = 1;
1377 }
1378 }
1379
1380 foreach_list(node, &this->instructions) {
1381 fs_inst *inst = (fs_inst *)node;
1382
1383 if (inst->dst.file == GRF &&
1384 split_grf[inst->dst.reg] &&
1385 inst->dst.reg_offset != 0) {
1386 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1387 inst->dst.reg_offset - 1);
1388 inst->dst.reg_offset = 0;
1389 }
1390 for (int i = 0; i < 3; i++) {
1391 if (inst->src[i].file == GRF &&
1392 split_grf[inst->src[i].reg] &&
1393 inst->src[i].reg_offset != 0) {
1394 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1395 inst->src[i].reg_offset - 1);
1396 inst->src[i].reg_offset = 0;
1397 }
1398 }
1399 }
1400 this->live_intervals_valid = false;
1401 }
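/* A sketch of the renumbering above: a splittable VGRF 5 of size 4 keeps
 * reg_offset 0 under register 5, while offsets 1, 2 and 3 move to three
 * freshly allocated size-1 registers n, n + 1 and n + 2 (contiguous by
 * construction), each with reg_offset reset to 0.
 */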
1402
1403 /**
1404 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1405 *
1406 * During code generation, we create tons of temporary variables, many of
1407 * which get immediately killed and are never used again. Yet, in later
1408 * optimization and analysis passes, such as compute_live_intervals, we need
1409 * to loop over all the virtual GRFs. Compacting them can save a lot of
1410 * overhead.
1411 */
1412 void
1413 fs_visitor::compact_virtual_grfs()
1414 {
1415 /* Mark which virtual GRFs are used, and count how many. */
1416 int remap_table[this->virtual_grf_count];
1417 memset(remap_table, -1, sizeof(remap_table));
1418
1419 foreach_list(node, &this->instructions) {
1420 const fs_inst *inst = (const fs_inst *) node;
1421
1422 if (inst->dst.file == GRF)
1423 remap_table[inst->dst.reg] = 0;
1424
1425 for (int i = 0; i < 3; i++) {
1426 if (inst->src[i].file == GRF)
1427 remap_table[inst->src[i].reg] = 0;
1428 }
1429 }
1430
1431 /* In addition to registers used in instructions, fs_visitor keeps
1432 * direct references to certain special values which must be patched:
1433 */
1434 fs_reg *special[] = {
1435 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1436 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1437 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1438 &delta_x[0], &delta_x[1], &delta_x[2],
1439 &delta_x[3], &delta_x[4], &delta_x[5],
1440 &delta_y[0], &delta_y[1], &delta_y[2],
1441 &delta_y[3], &delta_y[4], &delta_y[5],
1442 };
1443 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1444 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1445
1446 /* Treat all special values as used, to be conservative */
1447 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1448 if (special[i]->file == GRF)
1449 remap_table[special[i]->reg] = 0;
1450 }
1451
1452 /* Compact the GRF arrays. */
1453 int new_index = 0;
1454 for (int i = 0; i < this->virtual_grf_count; i++) {
1455 if (remap_table[i] != -1) {
1456 remap_table[i] = new_index;
1457 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1458 if (live_intervals_valid) {
1459 virtual_grf_start[new_index] = virtual_grf_start[i];
1460 virtual_grf_end[new_index] = virtual_grf_end[i];
1461 }
1462 ++new_index;
1463 }
1464 }
1465
1466 this->virtual_grf_count = new_index;
1467
1468 /* Patch all the instructions to use the newly renumbered registers */
1469 foreach_list(node, &this->instructions) {
1470 fs_inst *inst = (fs_inst *) node;
1471
1472 if (inst->dst.file == GRF)
1473 inst->dst.reg = remap_table[inst->dst.reg];
1474
1475 for (int i = 0; i < 3; i++) {
1476 if (inst->src[i].file == GRF)
1477 inst->src[i].reg = remap_table[inst->src[i].reg];
1478 }
1479 }
1480
1481 /* Patch all the references to special values */
1482 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1483 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1484 special[i]->reg = remap_table[special[i]->reg];
1485 }
1486 }
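/* An example of the compaction above: if only VGRFs 0, 3 and 4 of six are
 * referenced, remap_table becomes { 0, -1, -1, 1, 2, -1 }, virtual_grf_count
 * drops to 3, and all instructions and special values are rewritten to the
 * new indices.
 */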
1487
1488 bool
1489 fs_visitor::remove_dead_constants()
1490 {
1491 if (dispatch_width == 8) {
1492 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1493 this->nr_params_remap = c->prog_data.nr_params;
1494
1495 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1496 this->params_remap[i] = -1;
1497
1498 /* Find which params are still in use. */
1499 foreach_list(node, &this->instructions) {
1500 fs_inst *inst = (fs_inst *)node;
1501
1502 for (int i = 0; i < 3; i++) {
1503 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1504
1505 if (inst->src[i].file != UNIFORM)
1506 continue;
1507
1508 /* Section 5.11 of the OpenGL 4.3 spec says:
1509 *
1510 * "Out-of-bounds reads return undefined values, which include
1511 * values from other variables of the active program or zero."
1512 */
1513 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1514 constant_nr = 0;
1515 }
1516
1517 /* For now, set this to non-negative. We'll give it the
1518 * actual new number in a moment, in order to keep the
1519 * register numbers nicely ordered.
1520 */
1521 this->params_remap[constant_nr] = 0;
1522 }
1523 }
1524
1525 /* Figure out what the new numbers for the params will be. At some
1526 * point when we're doing uniform array access, we're going to want
1527 * to keep the distinction between .reg and .reg_offset, but for
1528 * now we don't care.
1529 */
1530 unsigned int new_nr_params = 0;
1531 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1532 if (this->params_remap[i] != -1) {
1533 this->params_remap[i] = new_nr_params++;
1534 }
1535 }
1536
1537 /* Update the list of params to be uploaded to match our new numbering. */
1538 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1539 int remapped = this->params_remap[i];
1540
1541 if (remapped == -1)
1542 continue;
1543
1544 c->prog_data.param[remapped] = c->prog_data.param[i];
1545 }
1546
1547 c->prog_data.nr_params = new_nr_params;
1548 } else {
1549 /* This should have been generated in the 8-wide pass already. */
1550 assert(this->params_remap);
1551 }
1552
1553 /* Now do the renumbering of the shader to remove unused params. */
1554 foreach_list(node, &this->instructions) {
1555 fs_inst *inst = (fs_inst *)node;
1556
1557 for (int i = 0; i < 3; i++) {
1558 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1559
1560 if (inst->src[i].file != UNIFORM)
1561 continue;
1562
 1563          /* As above, alias out-of-bounds accesses to constant 0. */
1564 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1565 constant_nr = 0;
1566 }
1567 assert(this->params_remap[constant_nr] != -1);
1568 inst->src[i].reg = this->params_remap[constant_nr];
1569 inst->src[i].reg_offset = 0;
1570 }
1571 }
1572
1573 return true;
1574 }
1575
1576 /*
1577 * Implements array access of uniforms by inserting a
1578 * PULL_CONSTANT_LOAD instruction.
1579 *
1580 * Unlike temporary GRF array access (where we don't support it due to
1581 * the difficulty of doing relative addressing on instruction
1582 * destinations), we could potentially do array access of uniforms
1583 * that were loaded in GRF space as push constants. In real-world
1584 * usage we've seen, though, the arrays being used are always larger
1585 * than we could load as push constants, so just always move all
1586 * uniform array access out to a pull constant buffer.
1587 */
1588 void
1589 fs_visitor::move_uniform_array_access_to_pull_constants()
1590 {
1591 int pull_constant_loc[c->prog_data.nr_params];
1592
1593 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1594 pull_constant_loc[i] = -1;
1595 }
1596
1597 /* Walk through and find array access of uniforms. Put a copy of that
1598 * uniform in the pull constant buffer.
1599 *
1600 * Note that we don't move constant-indexed accesses to arrays. No
1601 * testing has been done of the performance impact of this choice.
1602 */
1603 foreach_list_safe(node, &this->instructions) {
1604 fs_inst *inst = (fs_inst *)node;
1605
1606 for (int i = 0 ; i < 3; i++) {
1607 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1608 continue;
1609
1610 int uniform = inst->src[i].reg;
1611
1612 /* If this array isn't already present in the pull constant buffer,
1613 * add it.
1614 */
1615 if (pull_constant_loc[uniform] == -1) {
1616 const float **values = &c->prog_data.param[uniform];
1617
1618 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1619
1620 assert(param_size[uniform]);
1621
1622 for (int j = 0; j < param_size[uniform]; j++) {
1623 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1624 values[j];
1625 }
1626 }
1627
1628 /* Set up the annotation tracking for new generated instructions. */
1629 base_ir = inst->ir;
1630 current_annotation = inst->annotation;
1631
1632 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1633 fs_reg temp = fs_reg(this, glsl_type::float_type);
1634 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1635 surf_index,
1636 *inst->src[i].reladdr,
1637 pull_constant_loc[uniform] +
1638 inst->src[i].reg_offset);
1639 inst->insert_before(&list);
1640
1641 inst->src[i].file = temp.file;
1642 inst->src[i].reg = temp.reg;
1643 inst->src[i].reg_offset = temp.reg_offset;
1644 inst->src[i].reladdr = NULL;
1645 }
1646 }
1647 }
1648
1649 /**
1650 * Choose accesses from the UNIFORM file to demote to using the pull
1651 * constant buffer.
1652 *
1653 * We allow a fragment shader to have more than the specified minimum
1654 * maximum number of fragment shader uniform components (64). If
 1655  * there are too many of these, they'd fill up all of the register space.
1656 * So, this will push some of them out to the pull constant buffer and
1657 * update the program to load them.
1658 */
1659 void
1660 fs_visitor::setup_pull_constants()
1661 {
1662 /* Only allow 16 registers (128 uniform components) as push constants. */
1663 unsigned int max_uniform_components = 16 * 8;
1664 if (c->prog_data.nr_params <= max_uniform_components)
1665 return;
1666
1667 if (dispatch_width == 16) {
1668 fail("Pull constants not supported in 16-wide\n");
1669 return;
1670 }
1671
1672 /* Just demote the end of the list. We could probably do better
1673 * here, demoting things that are rarely used in the program first.
1674 */
1675 unsigned int pull_uniform_base = max_uniform_components;
1676
1677 int pull_constant_loc[c->prog_data.nr_params];
1678 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1679 if (i < pull_uniform_base) {
1680 pull_constant_loc[i] = -1;
1681 } else {
1682 pull_constant_loc[i] = -1;
1683 /* If our constant is already being uploaded for reladdr purposes,
1684 * reuse it.
1685 */
1686 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1687 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1688 pull_constant_loc[i] = j;
1689 break;
1690 }
1691 }
1692 if (pull_constant_loc[i] == -1) {
1693 int pull_index = c->prog_data.nr_pull_params++;
1694 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
 1695             pull_constant_loc[i] = pull_index;
1696 }
1697 }
1698 }
1699 c->prog_data.nr_params = pull_uniform_base;
1700
1701 foreach_list(node, &this->instructions) {
1702 fs_inst *inst = (fs_inst *)node;
1703
1704 for (int i = 0; i < 3; i++) {
1705 if (inst->src[i].file != UNIFORM)
1706 continue;
1707
1708 int pull_index = pull_constant_loc[inst->src[i].reg +
1709 inst->src[i].reg_offset];
1710 if (pull_index == -1)
1711 continue;
1712
1713 assert(!inst->src[i].reladdr);
1714
1715 fs_reg dst = fs_reg(this, glsl_type::float_type);
1716 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1717 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1718 fs_inst *pull =
1719 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1720 dst, index, offset);
1721 pull->ir = inst->ir;
1722 pull->annotation = inst->annotation;
1723
1724 inst->insert_before(pull);
1725
1726 inst->src[i].file = GRF;
1727 inst->src[i].reg = dst.reg;
1728 inst->src[i].reg_offset = 0;
1729 inst->src[i].smear = pull_index & 3;
1730 }
1731 }
1732 }
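/* An example of the offset/smear math above: pull_index == 5 yields
 * offset = (5 * 4) & ~15 = 16 (the 16-byte-aligned slot holding floats
 * 4..7) and smear = 5 & 3 = 1, so the pull load fetches that vec4 and the
 * patched instruction reads its second component.
 */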
1733
1734 bool
1735 fs_visitor::opt_algebraic()
1736 {
1737 bool progress = false;
1738
1739 foreach_list(node, &this->instructions) {
1740 fs_inst *inst = (fs_inst *)node;
1741
1742 switch (inst->opcode) {
1743 case BRW_OPCODE_MUL:
1744 if (inst->src[1].file != IMM)
1745 continue;
1746
1747 /* a * 1.0 = a */
1748 if (inst->src[1].is_one()) {
1749 inst->opcode = BRW_OPCODE_MOV;
1750 inst->src[1] = reg_undef;
1751 progress = true;
1752 break;
1753 }
1754
1755 /* a * 0.0 = 0.0 */
1756 if (inst->src[1].is_zero()) {
1757 inst->opcode = BRW_OPCODE_MOV;
1758 inst->src[0] = inst->src[1];
1759 inst->src[1] = reg_undef;
1760 progress = true;
1761 break;
1762 }
1763
1764 break;
1765 case BRW_OPCODE_ADD:
1766 if (inst->src[1].file != IMM)
1767 continue;
1768
1769 /* a + 0.0 = a */
1770 if (inst->src[1].is_zero()) {
1771 inst->opcode = BRW_OPCODE_MOV;
1772 inst->src[1] = reg_undef;
1773 progress = true;
1774 break;
1775 }
1776 break;
1777 default:
1778 break;
1779 }
1780 }
1781
1782 return progress;
1783 }
1784
1785 /**
1786 * Removes any instructions writing a VGRF where that VGRF is not used by any
1787 * later instruction.
1788 */
1789 bool
1790 fs_visitor::dead_code_eliminate()
1791 {
1792 bool progress = false;
1793 int pc = 0;
1794
1795 calculate_live_intervals();
1796
1797 foreach_list_safe(node, &this->instructions) {
1798 fs_inst *inst = (fs_inst *)node;
1799
1800 if (inst->dst.file == GRF) {
1801 assert(this->virtual_grf_end[inst->dst.reg] >= pc);
1802 if (this->virtual_grf_end[inst->dst.reg] == pc) {
1803 inst->remove();
1804 progress = true;
1805 }
1806 }
1807
1808 pc++;
1809 }
1810
1811 if (progress)
1812 live_intervals_valid = false;
1813
1814 return progress;
1815 }
1816
1817 struct dead_code_hash_key
1818 {
1819 int vgrf;
1820 int reg_offset;
1821 };
1822
1823 static bool
1824 dead_code_hash_compare(const void *a, const void *b)
1825 {
1826 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1827 }
1828
1829 static void
1830 clear_dead_code_hash(struct hash_table *ht)
1831 {
1832 struct hash_entry *entry;
1833
1834 hash_table_foreach(ht, entry) {
1835 _mesa_hash_table_remove(ht, entry);
1836 }
1837 }
1838
1839 static void
1840 insert_dead_code_hash(struct hash_table *ht,
1841 int vgrf, int reg_offset, fs_inst *inst)
1842 {
1843 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1844 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1845
1846 key->vgrf = vgrf;
1847 key->reg_offset = reg_offset;
1848
1849 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1850 }
1851
1852 static struct hash_entry *
1853 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1854 {
1855 struct dead_code_hash_key key;
1856
1857 key.vgrf = vgrf;
1858 key.reg_offset = reg_offset;
1859
1860 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1861 }
1862
1863 static void
1864 remove_dead_code_hash(struct hash_table *ht,
1865 int vgrf, int reg_offset)
1866 {
1867 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1868 if (!entry)
1869 return;
1870
1871 _mesa_hash_table_remove(ht, entry);
1872 }
1873
1874 /**
1875 * Walks basic blocks, removing any regs that are written but not read before
1876 * being redefined.
1877 *
1878 * The dead_code_eliminate() function implements a global dead code
 1879  * elimination, but it only handles removing the last write to a register
1880 * if it's never read. This one can handle intermediate writes, but only
1881 * within a basic block.
1882 */
1883 bool
1884 fs_visitor::dead_code_eliminate_local()
1885 {
1886 struct hash_table *ht;
1887 bool progress = false;
1888
1889 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1890
1891 foreach_list_safe(node, &this->instructions) {
1892 fs_inst *inst = (fs_inst *)node;
1893
 1894       /* At a basic block boundary, empty the HT since we don't track
 1895        * dataflow across blocks.
1896 */
1897 if (inst->is_control_flow()) {
1898 clear_dead_code_hash(ht);
1899 continue;
1900 }
1901
1902 /* Clear the HT of any instructions that got read. */
1903 for (int i = 0; i < 3; i++) {
1904 fs_reg src = inst->src[i];
1905 if (src.file != GRF)
1906 continue;
1907
1908 int read = 1;
1909 if (inst->is_send_from_grf())
1910 read = virtual_grf_sizes[src.reg] - src.reg_offset;
1911
1912 for (int reg_offset = src.reg_offset;
1913 reg_offset < src.reg_offset + read;
1914 reg_offset++) {
1915 remove_dead_code_hash(ht, src.reg, reg_offset);
1916 }
1917 }
1918
1919 /* Add any update of a GRF to the HT, removing a previous write if it
1920 * wasn't read.
1921 */
1922 if (inst->dst.file == GRF) {
1923 if (inst->regs_written > 1) {
1924 /* We don't know how to trim channels from an instruction's
1925 * writes, so we can't incrementally remove unread channels from
1926 * it. Just remove whatever it overwrites from the table
1927 */
1928 for (int i = 0; i < inst->regs_written; i++) {
1929 remove_dead_code_hash(ht,
1930 inst->dst.reg,
1931 inst->dst.reg_offset + i);
1932 }
1933 } else {
1934 struct hash_entry *entry =
1935 get_dead_code_hash_entry(ht, inst->dst.reg,
1936 inst->dst.reg_offset);
1937
1938 if (inst->is_partial_write()) {
1939 /* For a partial write, we can't remove any previous dead code
 1940              * candidate, since we're just modifying its result, but we can
 1941              * be dead-code eliminated ourselves.
1942 */
1943 if (entry) {
1944 entry->data = inst;
1945 } else {
1946 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1947 inst);
1948 }
1949 } else {
1950 if (entry) {
1951 /* We're completely updating a channel, and there was a
1952 * previous write to the channel that wasn't read. Kill it!
1953 */
1954 fs_inst *dead_inst = (fs_inst *)entry->data;
1955 dead_inst->remove();
1956 progress = true;
1957 _mesa_hash_table_remove(ht, entry);
1958 }
1959
1960 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1961 inst);
1962 }
1963 }
1964 }
1965 }
1966
1967 _mesa_hash_table_destroy(ht, NULL);
1968
1969 if (progress)
1970 live_intervals_valid = false;
1971
1972 return progress;
1973 }
1974
1975 /**
1976 * Implements a second type of register coalescing: this one checks if
1977 * the two regs involved in a raw move don't interfere, in which case
1978 * they can both be stored in the same place and the MOV removed.
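 *
 * For example, in this made-up sequence (illustrative only):
 *
 *    add vgrf1, vgrf2, vgrf3
 *    mov vgrf4, vgrf1
 *
 * if vgrf1 and vgrf4 don't interfere, every reference to vgrf1 is
 * rewritten to vgrf4 and the MOV is deleted.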
1979 */
1980 bool
1981 fs_visitor::register_coalesce_2()
1982 {
1983 bool progress = false;
1984
1985 calculate_live_intervals();
1986
1987 foreach_list_safe(node, &this->instructions) {
1988 fs_inst *inst = (fs_inst *)node;
1989
1990 if (inst->opcode != BRW_OPCODE_MOV ||
1991 inst->is_partial_write() ||
1992 inst->saturate ||
1993 inst->src[0].file != GRF ||
1994 inst->src[0].negate ||
1995 inst->src[0].abs ||
1996 inst->src[0].smear != -1 ||
1997 inst->dst.file != GRF ||
1998 inst->dst.type != inst->src[0].type ||
1999 virtual_grf_sizes[inst->src[0].reg] != 1 ||
2000 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
2001 continue;
2002 }
2003
2004 int reg_from = inst->src[0].reg;
2005 assert(inst->src[0].reg_offset == 0);
2006 int reg_to = inst->dst.reg;
2007 int reg_to_offset = inst->dst.reg_offset;
2008
2009 foreach_list(node, &this->instructions) {
2010 fs_inst *scan_inst = (fs_inst *)node;
2011
2012 if (scan_inst->dst.file == GRF &&
2013 scan_inst->dst.reg == reg_from) {
2014 scan_inst->dst.reg = reg_to;
2015 scan_inst->dst.reg_offset = reg_to_offset;
2016 }
2017 for (int i = 0; i < 3; i++) {
2018 if (scan_inst->src[i].file == GRF &&
2019 scan_inst->src[i].reg == reg_from) {
2020 scan_inst->src[i].reg = reg_to;
2021 scan_inst->src[i].reg_offset = reg_to_offset;
2022 }
2023 }
2024 }
2025
2026 inst->remove();
2027
2028 /* We don't need to recalculate live intervals inside the loop despite
2029 * flagging live_intervals_valid because we only use live intervals for
2030 * the interferes test, and we must have had a situation where the
2031 * intervals were:
2032 *
2033 * from to
2034 * ^
2035 * |
2036 * v
2037 * ^
2038 * |
2039 * v
2040 *
2041 * Some register R that might get coalesced with one of these two could
2042 * only be referencing "to", otherwise "from"'s range would have been
2043 * longer. R's range could also only start at the end of "to" or later,
2044 * otherwise it would conflict with "to" when we try to coalesce "to"
2045 * into R anyway.
2046 */
2047 live_intervals_valid = false;
2048
2049 progress = true;
2050 continue;
2051 }
2052
2053 return progress;
2054 }
2055
2056 bool
2057 fs_visitor::register_coalesce()
2058 {
2059 bool progress = false;
2060 int if_depth = 0;
2061 int loop_depth = 0;
2062
2063 foreach_list_safe(node, &this->instructions) {
2064 fs_inst *inst = (fs_inst *)node;
2065
2066 /* Make sure that we dominate the instructions we're going to
2067 * scan for interference with our coalescing, or we won't have
2068 * scanned far enough to notice the interference. We don't
2069 * dominate the following instructions if we're inside a loop or
2070 * an if block.
2071 */
2072 switch (inst->opcode) {
2073 case BRW_OPCODE_DO:
2074 loop_depth++;
2075 break;
2076 case BRW_OPCODE_WHILE:
2077 loop_depth--;
2078 break;
2079 case BRW_OPCODE_IF:
2080 if_depth++;
2081 break;
2082 case BRW_OPCODE_ENDIF:
2083 if_depth--;
2084 break;
2085 default:
2086 break;
2087 }
2088 if (loop_depth || if_depth)
2089 continue;
2090
2091 if (inst->opcode != BRW_OPCODE_MOV ||
2092 inst->is_partial_write() ||
2093 inst->saturate ||
2094 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2095 inst->src[0].file != UNIFORM) ||
2096 inst->dst.type != inst->src[0].type)
2097 continue;
2098
2099 bool has_source_modifiers = (inst->src[0].abs ||
2100 inst->src[0].negate ||
2101 inst->src[0].smear != -1 ||
2102 inst->src[0].file == UNIFORM);
2103
2104 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2105 * them: check for no writes to either one until the exit of the
2106 * program.
2107 */
2108 bool interfered = false;
2109
2110 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2111 !scan_inst->is_tail_sentinel();
2112 scan_inst = (fs_inst *)scan_inst->next) {
2113 if (scan_inst->dst.file == GRF) {
2114 if (scan_inst->overwrites_reg(inst->dst) ||
2115 scan_inst->overwrites_reg(inst->src[0])) {
2116 interfered = true;
2117 break;
2118 }
2119 }
2120
2121 /* The gen6 MATH instruction can't handle source modifiers or
2122 * unusual register regions, so avoid coalescing those for
2123 * now. We should do something more specific.
2124 */
2125 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2126 interfered = true;
2127 break;
2128 }
2129
2130 /* The accumulator result appears to get used for the
2131 * conditional modifier generation. When negating a UD
2132 * value, there is a 33rd bit generated for the sign in the
2133 * accumulator value, so now you can't check, for example,
2134 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2135 */
2136 if (scan_inst->conditional_mod &&
2137 inst->src[0].negate &&
2138 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2139 interfered = true;
2140 break;
2141 }
2142 }
2143 if (interfered) {
2144 continue;
2145 }
2146
2147 /* Rewrite the later usages to point at the source of the MOV
2148 * being removed.
2149 */
2150 for (fs_inst *scan_inst = inst;
2151 !scan_inst->is_tail_sentinel();
2152 scan_inst = (fs_inst *)scan_inst->next) {
2153 for (int i = 0; i < 3; i++) {
2154 if (scan_inst->src[i].file == GRF &&
2155 scan_inst->src[i].reg == inst->dst.reg &&
2156 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2157 fs_reg new_src = inst->src[0];
2158 if (scan_inst->src[i].abs) {
2159 new_src.negate = 0;
2160 new_src.abs = 1;
2161 }
2162 new_src.negate ^= scan_inst->src[i].negate;
2163 scan_inst->src[i] = new_src;
2164 }
2165 }
2166 }
2167
2168 inst->remove();
2169 progress = true;
2170 }
2171
2172 if (progress)
2173 live_intervals_valid = false;
2174
2175 return progress;
2176 }
2177
2178
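/**
 * Tries to rewrite the instruction that produced a GRF, when that GRF is
 * only MOVed into an MRF, so that it writes the MRF directly and the MOV
 * can be removed.
 *
 * For example, in this made-up sequence (illustrative only):
 *
 *    add vgrf4, vgrf2, vgrf3
 *    mov m1, vgrf4
 *
 * becomes:
 *
 *    add m1, vgrf2, vgrf3
 */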
2179 bool
2180 fs_visitor::compute_to_mrf()
2181 {
2182 bool progress = false;
2183 int next_ip = 0;
2184
2185 calculate_live_intervals();
2186
2187 foreach_list_safe(node, &this->instructions) {
2188 fs_inst *inst = (fs_inst *)node;
2189
2190 int ip = next_ip;
2191 next_ip++;
2192
2193 if (inst->opcode != BRW_OPCODE_MOV ||
2194 inst->is_partial_write() ||
2195 inst->dst.file != MRF || inst->src[0].file != GRF ||
2196 inst->dst.type != inst->src[0].type ||
2197 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2198 continue;
2199
2200 /* Work out which hardware MRF registers are written by this
2201 * instruction.
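 *
 * A COMPR4 write touches mrf_low and mrf_low + 4; a compressed 16-wide
 * write touches mrf_low and mrf_low + 1; anything else touches only
 * mrf_low.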
2202 */
2203 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2204 int mrf_high;
2205 if (inst->dst.reg & BRW_MRF_COMPR4) {
2206 mrf_high = mrf_low + 4;
2207 } else if (dispatch_width == 16 &&
2208 (!inst->force_uncompressed && !inst->force_sechalf)) {
2209 mrf_high = mrf_low + 1;
2210 } else {
2211 mrf_high = mrf_low;
2212 }
2213
2214 /* Can't compute-to-MRF this GRF if someone else was going to
2215 * read it later.
2216 */
2217 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2218 continue;
2219
2220 /* Found a move of a GRF to a MRF. Let's see if we can go
2221 * rewrite the thing that made this GRF to write into the MRF.
2222 */
2223 fs_inst *scan_inst;
2224 for (scan_inst = (fs_inst *)inst->prev;
2225 scan_inst->prev != NULL;
2226 scan_inst = (fs_inst *)scan_inst->prev) {
2227 if (scan_inst->dst.file == GRF &&
2228 scan_inst->dst.reg == inst->src[0].reg) {
2229 /* Found the last instruction to write the reg we want to
2230 * turn into a compute-to-MRF.
2231 */
2232
2233 /* If this one instruction didn't populate all the
2234 * channels, bail. We might be able to rewrite everything
2235 * that writes that reg, but it would require smarter
2236 * tracking to delay the rewriting until complete success.
2237 */
2238 if (scan_inst->is_partial_write())
2239 break;
2240
2241 /* Things returning more than one register would need us to
2242 * understand coalescing out more than one MOV at a time.
2243 */
2244 if (scan_inst->regs_written > 1)
2245 break;
2246
2247 /* SEND instructions can't have MRF as a destination. */
2248 if (scan_inst->mlen)
2249 break;
2250
2251 if (brw->gen == 6) {
2252 /* gen6 math instructions must have the destination be
2253 * GRF, so no compute-to-MRF for them.
2254 */
2255 if (scan_inst->is_math()) {
2256 break;
2257 }
2258 }
2259
2260 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2261 /* Found the creator of our MRF's source value. */
2262 scan_inst->dst.file = MRF;
2263 scan_inst->dst.reg = inst->dst.reg;
2264 scan_inst->saturate |= inst->saturate;
2265 inst->remove();
2266 progress = true;
2267 }
2268 break;
2269 }
2270
2271 /* We don't handle control flow here. Most computation of
2272 * values that end up in MRFs happens shortly before the MRF
2273 * write anyway.
2274 */
2275 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2276 break;
2277
2278 /* You can't read from an MRF, so if someone else reads our
2279 * MRF's source GRF that we wanted to rewrite, that stops us.
2280 */
2281 bool interfered = false;
2282 for (int i = 0; i < 3; i++) {
2283 if (scan_inst->src[i].file == GRF &&
2284 scan_inst->src[i].reg == inst->src[0].reg &&
2285 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2286 interfered = true;
2287 }
2288 }
2289 if (interfered)
2290 break;
2291
2292 if (scan_inst->dst.file == MRF) {
2293 /* If somebody else writes our MRF here, we can't
2294 * compute-to-MRF before that.
2295 */
2296 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2297 int scan_mrf_high;
2298
2299 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2300 scan_mrf_high = scan_mrf_low + 4;
2301 } else if (dispatch_width == 16 &&
2302 (!scan_inst->force_uncompressed &&
2303 !scan_inst->force_sechalf)) {
2304 scan_mrf_high = scan_mrf_low + 1;
2305 } else {
2306 scan_mrf_high = scan_mrf_low;
2307 }
2308
2309 if (mrf_low == scan_mrf_low ||
2310 mrf_low == scan_mrf_high ||
2311 mrf_high == scan_mrf_low ||
2312 mrf_high == scan_mrf_high) {
2313 break;
2314 }
2315 }
2316
2317 if (scan_inst->mlen > 0) {
2318 /* Found a SEND instruction, which means that there are
2319 * live values in MRFs from base_mrf to base_mrf +
2320 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2321 * above it.
2322 */
2323 if (mrf_low >= scan_inst->base_mrf &&
2324 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2325 break;
2326 }
2327 if (mrf_high >= scan_inst->base_mrf &&
2328 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2329 break;
2330 }
2331 }
2332 }
2333 }
2334
2335 if (progress)
2336 live_intervals_valid = false;
2337
2338 return progress;
2339 }
2340
2341 /**
2342 * Walks through basic blocks, looking for repeated MRF writes and
2343 * removing the redundant later ones.
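 *
 * For example, in this made-up sequence (illustrative only), the second
 * MOV is removed because nothing has overwritten m3 or vgrf2 in between:
 *
 *    mov m3, vgrf2
 *    ...
 *    mov m3, vgrf2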
2344 */
2345 bool
2346 fs_visitor::remove_duplicate_mrf_writes()
2347 {
2348 fs_inst *last_mrf_move[16];
2349 bool progress = false;
2350
2351 /* We'd need to update the MRF tracking for compressed instructions, so bail for now. */
2352 if (dispatch_width == 16)
2353 return false;
2354
2355 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2356
2357 foreach_list_safe(node, &this->instructions) {
2358 fs_inst *inst = (fs_inst *)node;
2359
2360 if (inst->is_control_flow()) {
2361 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2362 }
2363
2364 if (inst->opcode == BRW_OPCODE_MOV &&
2365 inst->dst.file == MRF) {
2366 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2367 if (prev_inst && inst->equals(prev_inst)) {
2368 inst->remove();
2369 progress = true;
2370 continue;
2371 }
2372 }
2373
2374 /* Clear out the last-write records for MRFs that were overwritten. */
2375 if (inst->dst.file == MRF) {
2376 last_mrf_move[inst->dst.reg] = NULL;
2377 }
2378
2379 if (inst->mlen > 0) {
2380 /* Found a SEND instruction, which will include two or fewer
2381 * implied MRF writes. We could do better here.
2382 */
2383 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2384 last_mrf_move[inst->base_mrf + i] = NULL;
2385 }
2386 }
2387
2388 /* Clear out any MRF move records whose sources got overwritten. */
2389 if (inst->dst.file == GRF) {
2390 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2391 if (last_mrf_move[i] &&
2392 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2393 last_mrf_move[i] = NULL;
2394 }
2395 }
2396 }
2397
2398 if (inst->opcode == BRW_OPCODE_MOV &&
2399 inst->dst.file == MRF &&
2400 inst->src[0].file == GRF &&
2401 !inst->is_partial_write()) {
2402 last_mrf_move[inst->dst.reg] = inst;
2403 }
2404 }
2405
2406 if (progress)
2407 live_intervals_valid = false;
2408
2409 return progress;
2410 }
2411
2412 static void
2413 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2414 int first_grf, int grf_len)
2415 {
2416 bool inst_16wide = (dispatch_width > 8 &&
2417 !inst->force_uncompressed &&
2418 !inst->force_sechalf);
2419
2420 /* Clear the flag for registers that actually got read (as expected). */
2421 for (int i = 0; i < 3; i++) {
2422 int grf;
2423 if (inst->src[i].file == GRF) {
2424 grf = inst->src[i].reg;
2425 } else if (inst->src[i].file == HW_REG &&
2426 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2427 grf = inst->src[i].fixed_hw_reg.nr;
2428 } else {
2429 continue;
2430 }
2431
2432 if (grf >= first_grf &&
2433 grf < first_grf + grf_len) {
2434 deps[grf - first_grf] = false;
2435 if (inst_16wide)
2436 deps[grf - first_grf + 1] = false;
2437 }
2438 }
2439 }
2440
2441 /**
2442 * Implements this workaround for the original 965:
2443 *
2444 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2445 * check for post destination dependencies on this instruction, software
2446 * must ensure that there is no destination hazard for the case of ‘write
2447 * followed by a posted write’ shown in the following example.
2448 *
2449 * 1. mov r3 0
2450 * 2. send r3.xy <rest of send instruction>
2451 * 3. mov r2 r3
2452 *
2453 * Due to no post-destination dependency check on the ‘send’, the above
2454 * code sequence could have two instructions (1 and 2) in flight at the
2455 * same time that both consider ‘r3’ as the target of their final writes."
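 *
 * The pass below resolves this by inserting a dependency-resolving MOV on
 * the hazardous register between the two writers, e.g. (the self-MOV form
 * is purely illustrative):
 *
 *    1.  mov r3 0
 *    1a. mov r3 r3
 *    2.  send r3.xy <rest of send instruction>
 *    3.  mov r2 r3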
2456 */
2457 void
2458 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2459 {
2460 int reg_size = dispatch_width / 8;
2461 int write_len = inst->regs_written * reg_size;
2462 int first_write_grf = inst->dst.reg;
2463 bool needs_dep[BRW_MAX_MRF];
2464 assert(write_len < (int)sizeof(needs_dep) - 1);
2465
2466 memset(needs_dep, false, sizeof(needs_dep));
2467 memset(needs_dep, true, write_len);
2468
2469 clear_deps_for_inst_src(inst, dispatch_width,
2470 needs_dep, first_write_grf, write_len);
2471
2472 /* Walk backwards looking for writes to registers we're writing which
2473 * aren't read since being written. If we hit the start of the program,
2474 * we assume that there are no outstanding dependencies on entry to the
2475 * program.
2476 */
2477 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2478 scan_inst != NULL;
2479 scan_inst = (fs_inst *)scan_inst->prev) {
2480
2481 /* If we hit control flow, assume that there *are* outstanding
2482 * dependencies, and force their cleanup before our instruction.
2483 */
2484 if (scan_inst->is_control_flow()) {
2485 for (int i = 0; i < write_len; i++) {
2486 if (needs_dep[i]) {
2487 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2488 }
2489 }
2490 return;
2491 }
2492
2493 bool scan_inst_16wide = (dispatch_width > 8 &&
2494 !scan_inst->force_uncompressed &&
2495 !scan_inst->force_sechalf);
2496
2497 /* We insert our reads as late as possible on the assumption that any
2498 * non-MOV instruction that might have left us an outstanding
2499 * dependency has more latency than a MOV.
2500 */
2501 if (scan_inst->dst.file == GRF) {
2502 for (int i = 0; i < scan_inst->regs_written; i++) {
2503 int reg = scan_inst->dst.reg + i * reg_size;
2504
2505 if (reg >= first_write_grf &&
2506 reg < first_write_grf + write_len &&
2507 needs_dep[reg - first_write_grf]) {
2508 inst->insert_before(DEP_RESOLVE_MOV(reg));
2509 needs_dep[reg - first_write_grf] = false;
2510 if (scan_inst_16wide)
2511 needs_dep[reg - first_write_grf + 1] = false;
2512 }
2513 }
2514 }
2515
2516 /* Clear the flag for registers that actually got read (as expected). */
2517 clear_deps_for_inst_src(scan_inst, dispatch_width,
2518 needs_dep, first_write_grf, write_len);
2519
2520 /* Continue the loop only if we haven't resolved all the dependencies */
2521 int i;
2522 for (i = 0; i < write_len; i++) {
2523 if (needs_dep[i])
2524 break;
2525 }
2526 if (i == write_len)
2527 return;
2528 }
2529 }
2530
2531 /**
2532 * Implements this workaround for the original 965:
2533 *
2534 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2535 * used as a destination register until after it has been sourced by an
2536 * instruction with a different destination register."
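 *
 * E.g. (illustrative):
 *
 *    1. send r3.xy <rest of send instruction>
 *    2. mov r3 r2
 *
 * The pass below inserts a MOV sourcing r3 between 1 and 2, so that r3
 * has been read before it is reused as a destination.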
2537 */
2538 void
2539 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2540 {
2541 int write_len = inst->regs_written * dispatch_width / 8;
2542 int first_write_grf = inst->dst.reg;
2543 bool needs_dep[BRW_MAX_MRF];
2544 assert(write_len < (int)sizeof(needs_dep) - 1);
2545
2546 memset(needs_dep, false, sizeof(needs_dep));
2547 memset(needs_dep, true, write_len);
2548 /* Walk forwards looking for writes to registers we're writing which aren't
2549 * read before being written.
2550 */
2551 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2552 !scan_inst->is_tail_sentinel();
2553 scan_inst = (fs_inst *)scan_inst->next) {
2554 /* If we hit control flow, force resolve all remaining dependencies. */
2555 if (scan_inst->is_control_flow()) {
2556 for (int i = 0; i < write_len; i++) {
2557 if (needs_dep[i])
2558 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2559 }
2560 return;
2561 }
2562
2563 /* Clear the flag for registers that actually got read (as expected). */
2564 clear_deps_for_inst_src(scan_inst, dispatch_width,
2565 needs_dep, first_write_grf, write_len);
2566
2567 /* We insert our reads as late as possible since they're reading the
2568 * result of a SEND, which has massive latency.
2569 */
2570 if (scan_inst->dst.file == GRF &&
2571 scan_inst->dst.reg >= first_write_grf &&
2572 scan_inst->dst.reg < first_write_grf + write_len &&
2573 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2574 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2575 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2576 }
2577
2578 /* Continue the loop only if we haven't resolved all the dependencies */
2579 int i;
2580 for (i = 0; i < write_len; i++) {
2581 if (needs_dep[i])
2582 break;
2583 }
2584 if (i == write_len)
2585 return;
2586 }
2587
2588 /* If we hit the end of the program, resolve all remaining dependencies out
2589 * of paranoia.
2590 */
2591 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2592 assert(last_inst->eot);
2593 for (int i = 0; i < write_len; i++) {
2594 if (needs_dep[i])
2595 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2596 }
2597 }
2598
2599 void
2600 fs_visitor::insert_gen4_send_dependency_workarounds()
2601 {
2602 if (brw->gen != 4 || brw->is_g4x)
2603 return;
2604
2605 /* Note that we're done with register allocation, so GRF fs_regs always
2606 * have a .reg_offset of 0.
2607 */
2608
2609 foreach_list_safe(node, &this->instructions) {
2610 fs_inst *inst = (fs_inst *)node;
2611
2612 if (inst->mlen != 0 && inst->dst.file == GRF) {
2613 insert_gen4_pre_send_dependency_workarounds(inst);
2614 insert_gen4_post_send_dependency_workarounds(inst);
2615 }
2616 }
2617 }
2618
2619 /**
2620 * Turns the generic expression-style uniform pull constant load instruction
2621 * into a hardware-specific series of instructions for loading a pull
2622 * constant.
2623 *
2624 * The expression style allows the CSE pass before this to optimize out
2625 * repeated loads from the same offset, and gives the pre-register-allocation
2626 * scheduling full flexibility, while the conversion to native instructions
2627 * allows the post-register-allocation scheduler the best information
2628 * possible.
2629 *
2630 * Note that execution masking for setting up pull constant loads is special:
2631 * the channels that need to be written are unrelated to the current execution
2632 * mask, since a later instruction will use one of the result channels as a
2633 * source operand for all 8 or 16 of its channels.
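 *
 * On gen7, for example, a generic load like (illustrative pseudo-IR, with
 * src0 assumed to be the surface index):
 *
 *    vgrf1 = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD surf, byte_offset
 *
 * is lowered to:
 *
 *    payload = FS_OPCODE_SET_SIMD4X2_OFFSET byte_offset/4
 *    vgrf1 = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7 surf, payload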
2634 */
2635 void
2636 fs_visitor::lower_uniform_pull_constant_loads()
2637 {
2638 foreach_list(node, &this->instructions) {
2639 fs_inst *inst = (fs_inst *)node;
2640
2641 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2642 continue;
2643
2644 if (brw->gen >= 7) {
2645 /* The offset arg before was a vec4-aligned byte offset. We need to
2646 * turn it into a dword offset.
2647 */
2648 fs_reg const_offset_reg = inst->src[1];
2649 assert(const_offset_reg.file == IMM &&
2650 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2651 const_offset_reg.imm.u /= 4;
2652 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2653
2654 /* This is actually going to be a MOV, but since only the first dword
2655 * is accessed, we have a special opcode to do just that one. Note
2656 * that this needs to be an operation that will be considered a def
2657 * by live variable analysis, or register allocation will explode.
2658 */
2659 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2660 payload, const_offset_reg);
2661 setup->force_writemask_all = true;
2662
2663 setup->ir = inst->ir;
2664 setup->annotation = inst->annotation;
2665 inst->insert_before(setup);
2666
2667 /* Similarly, this will only populate the first 4 channels of the
2668 * result register (since we only use smear values from 0-3), but we
2669 * don't tell the optimizer.
2670 */
2671 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2672 inst->src[1] = payload;
2673
2674 this->live_intervals_valid = false;
2675 } else {
2676 /* Before register allocation, we didn't tell the scheduler about the
2677 * MRF we use. We know it's safe to use this MRF because nothing
2678 * else does except for register spill/unspill, which generates and
2679 * uses its MRF within a single IR instruction.
2680 */
2681 inst->base_mrf = 14;
2682 inst->mlen = 1;
2683 }
2684 }
2685 }
2686
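/**
 * Prints a human-readable form of a single FS IR instruction for debugging.
 *
 * The output looks roughly like the following made-up line (the exact
 * fields depend on the instruction):
 *
 *    (+f0.0) add.sat vgrf7, vgrf5, u2, (null) 2ndhalf
 */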
2687 void
2688 fs_visitor::dump_instruction(backend_instruction *be_inst)
2689 {
2690 fs_inst *inst = (fs_inst *)be_inst;
2691
2692 if (inst->predicate) {
2693 printf("(%cf0.%d) ",
2694 inst->predicate_inverse ? '-' : '+',
2695 inst->flag_subreg);
2696 }
2697
2698 printf("%s", brw_instruction_name(inst->opcode));
2699 if (inst->saturate)
2700 printf(".sat");
2701 if (inst->conditional_mod) {
2702 printf(".cmod");
2703 if (!inst->predicate &&
2704 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2705 inst->opcode != BRW_OPCODE_IF &&
2706 inst->opcode != BRW_OPCODE_WHILE))) {
2707 printf(".f0.%d", inst->flag_subreg);
2708 }
2709 }
2710 printf(" ");
2711
2713 switch (inst->dst.file) {
2714 case GRF:
2715 printf("vgrf%d", inst->dst.reg);
2716 if (inst->dst.reg_offset)
2717 printf("+%d", inst->dst.reg_offset);
2718 break;
2719 case MRF:
2720 printf("m%d", inst->dst.reg);
2721 break;
2722 case BAD_FILE:
2723 printf("(null)");
2724 break;
2725 case UNIFORM:
2726 printf("***u%d***", inst->dst.reg);
2727 break;
2728 case ARF:
2729 if (inst->dst.reg == BRW_ARF_NULL)
2730 printf("(null)");
2731 else
2732 printf("arf%d", inst->dst.reg);
2733 break;
2734 default:
2735 printf("???");
2736 break;
2737 }
2738 printf(", ");
2739
2740 for (int i = 0; i < 3; i++) {
2741 if (inst->src[i].negate)
2742 printf("-");
2743 if (inst->src[i].abs)
2744 printf("|");
2745 switch (inst->src[i].file) {
2746 case GRF:
2747 printf("vgrf%d", inst->src[i].reg);
2748 if (inst->src[i].reg_offset)
2749 printf("+%d", inst->src[i].reg_offset);
2750 break;
2751 case MRF:
2752 printf("***m%d***", inst->src[i].reg);
2753 break;
2754 case UNIFORM:
2755 printf("u%d", inst->src[i].reg);
2756 if (inst->src[i].reg_offset)
2757 printf(".%d", inst->src[i].reg_offset);
2758 break;
2759 case BAD_FILE:
2760 printf("(null)");
2761 break;
2762 case IMM:
2763 switch (inst->src[i].type) {
2764 case BRW_REGISTER_TYPE_F:
2765 printf("%ff", inst->src[i].imm.f);
2766 break;
2767 case BRW_REGISTER_TYPE_D:
2768 printf("%dd", inst->src[i].imm.i);
2769 break;
2770 case BRW_REGISTER_TYPE_UD:
2771 printf("%uu", inst->src[i].imm.u);
2772 break;
2773 default:
2774 printf("???");
2775 break;
2776 }
2777 break;
2778 default:
2779 printf("???");
2780 break;
2781 }
2782 if (inst->src[i].abs)
2783 printf("|");
2784
2785 if (i < 2)
2786 printf(", ");
2787 }
2788
2789 printf(" ");
2790
2791 if (inst->force_uncompressed)
2792 printf("1sthalf ");
2793
2794 if (inst->force_sechalf)
2795 printf("2ndhalf ");
2796
2797 printf("\n");
2798 }
2799
2800 /**
2801 * Possibly returns an instruction that set up @param reg.
2802 *
2803 * Sometimes we want to take the result of some expression/variable
2804 * dereference tree and rewrite the instruction generating the result
2805 * of the tree. When processing the tree, we know that the
2806 * instructions generated are all writing temporaries that are dead
2807 * outside of this tree. So, if we have some instructions that write
2808 * a temporary, we're free to point that temp write somewhere else.
2809 *
2810 * Note that this doesn't guarantee that the returned instruction wrote
2811 * only reg -- it might be the size=4 destination of a texture instruction.
2812 */
2813 fs_inst *
2814 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2815 fs_inst *end,
2816 fs_reg reg)
2817 {
2818 if (end == start ||
2819 end->is_partial_write() ||
2820 reg.reladdr ||
2821 !reg.equals(end->dst)) {
2822 return NULL;
2823 } else {
2824 return end;
2825 }
2826 }
2827
2828 void
2829 fs_visitor::setup_payload_gen6()
2830 {
2831 bool uses_depth =
2832 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2833 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2834
2835 assert(brw->gen >= 6);
2836
2837 /* R0-1: masks, pixel X/Y coordinates. */
2838 c->nr_payload_regs = 2;
2839 /* R2: only for 32-pixel dispatch. */
2840
2841 /* R3-26: barycentric interpolation coordinates. These appear in the
2842 * same order that they appear in the brw_wm_barycentric_interp_mode
2843 * enum. Each set of coordinates occupies 2 registers if dispatch width
2844 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2845 * appear if they were enabled using the "Barycentric Interpolation
2846 * Mode" bits in WM_STATE.
2847 */
2848 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2849 if (barycentric_interp_modes & (1 << i)) {
2850 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2851 c->nr_payload_regs += 2;
2852 if (dispatch_width == 16) {
2853 c->nr_payload_regs += 2;
2854 }
2855 }
2856 }
2857
2858 /* R27: interpolated depth if uses source depth */
2859 if (uses_depth) {
2860 c->source_depth_reg = c->nr_payload_regs;
2861 c->nr_payload_regs++;
2862 if (dispatch_width == 16) {
2863 /* R28: interpolated depth if not 8-wide. */
2864 c->nr_payload_regs++;
2865 }
2866 }
2867 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2868 if (uses_depth) {
2869 c->source_w_reg = c->nr_payload_regs;
2870 c->nr_payload_regs++;
2871 if (dispatch_width == 16) {
2872 /* R30: interpolated W if not 8-wide. */
2873 c->nr_payload_regs++;
2874 }
2875 }
2876 /* R31: MSAA position offsets. */
2877 /* R32-: bary for 32-pixel. */
2878 /* R58-59: interp W for 32-pixel. */
2879
2880 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2881 c->source_depth_to_render_target = true;
2882 }
2883 }
2884
2885 bool
2886 fs_visitor::run()
2887 {
2888 sanity_param_count = fp->Base.Parameters->NumParameters;
2889 uint32_t orig_nr_params = c->prog_data.nr_params;
2890
2891 if (brw->gen >= 6)
2892 setup_payload_gen6();
2893 else
2894 setup_payload_gen4();
2895
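/* Debug toggle: emit a trivial dummy shader instead of the real program. */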
2896 if (0) {
2897 emit_dummy_fs();
2898 } else {
2899 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2900 emit_shader_time_begin();
2901
2902 calculate_urb_setup();
2903 if (brw->gen < 6)
2904 emit_interpolation_setup_gen4();
2905 else
2906 emit_interpolation_setup_gen6();
2907
2908 /* We handle discards by keeping track of the still-live pixels in f0.1.
2909 * Initialize it with the dispatched pixels.
2910 */
2911 if (fp->UsesKill) {
2912 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2913 discard_init->flag_subreg = 1;
2914 }
2915
2916 /* Generate FS IR for main(). (The visitor only descends into
2917 * functions called "main".)
2918 */
2919 if (shader) {
2920 foreach_list(node, &*shader->ir) {
2921 ir_instruction *ir = (ir_instruction *)node;
2922 base_ir = ir;
2923 this->result = reg_undef;
2924 ir->accept(this);
2925 }
2926 } else {
2927 emit_fragment_program_code();
2928 }
2929 base_ir = NULL;
2930 if (failed)
2931 return false;
2932
2933 emit(FS_OPCODE_PLACEHOLDER_HALT);
2934
2935 emit_fb_writes();
2936
2937 split_virtual_grfs();
2938
2939 move_uniform_array_access_to_pull_constants();
2940 setup_pull_constants();
2941
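/* Iterate the cheap optimization passes to a fixed point: each pass can
* expose new opportunities for the others.
*/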
2942 bool progress;
2943 do {
2944 progress = false;
2945
2946 compact_virtual_grfs();
2947
2948 progress = remove_duplicate_mrf_writes() || progress;
2949
2950 progress = opt_algebraic() || progress;
2951 progress = opt_cse() || progress;
2952 progress = opt_copy_propagate() || progress;
2953 progress = dead_code_eliminate() || progress;
2954 progress = dead_code_eliminate_local() || progress;
2955 progress = register_coalesce() || progress;
2956 progress = register_coalesce_2() || progress;
2957 progress = compute_to_mrf() || progress;
2958 } while (progress);
2959
2960 remove_dead_constants();
2961
2962 schedule_instructions(false);
2963
2964 lower_uniform_pull_constant_loads();
2965
2966 assign_curb_setup();
2967 assign_urb_setup();
2968
2969 if (0) {
2970 /* Debug of register spilling: Go spill everything. */
2971 for (int i = 0; i < virtual_grf_count; i++) {
2972 spill_reg(i);
2973 }
2974 }
2975
2976 if (0)
2977 assign_regs_trivial();
2978 else {
2979 while (!assign_regs()) {
2980 if (failed)
2981 break;
2982 }
2983 }
2984 }
2985 assert(force_uncompressed_stack == 0);
2986 assert(force_sechalf_stack == 0);
2987
2988 /* This must come after all optimization and register allocation, since
2989 * it inserts dead code that happens to have side effects, and it does
2990 * so based on the actual physical registers in use.
2991 */
2992 insert_gen4_send_dependency_workarounds();
2993
2994 if (failed)
2995 return false;
2996
2997 schedule_instructions(true);
2998
2999 if (dispatch_width == 8) {
3000 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3001 } else {
3002 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3003
3004 /* Make sure we didn't try to sneak in an extra uniform */
3005 assert(orig_nr_params == c->prog_data.nr_params);
3006 (void) orig_nr_params;
3007 }
3008
3009 /* If any state parameters were appended, then ParameterValues could have
3010 * been realloced, in which case the driver uniform storage set up by
3011 * _mesa_associate_uniform_storage() would point to freed memory. Make
3012 * sure that didn't happen.
3013 */
3014 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3015
3016 return !failed;
3017 }
3018
3019 const unsigned *
3020 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3021 struct gl_fragment_program *fp,
3022 struct gl_shader_program *prog,
3023 unsigned *final_assembly_size)
3024 {
3025 bool start_busy = false;
3026 float start_time = 0;
3027
3028 if (unlikely(brw->perf_debug)) {
3029 start_busy = (brw->batch.last_bo &&
3030 drm_intel_bo_busy(brw->batch.last_bo));
3031 start_time = get_time();
3032 }
3033
3034 struct brw_shader *shader = NULL;
3035 if (prog)
3036 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3037
3038 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3039 if (prog) {
3040 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3041 _mesa_print_ir(shader->ir, NULL);
3042 printf("\n\n");
3043 } else {
3044 printf("ARB_fragment_program %d ir for native fragment shader\n",
3045 fp->Base.Id);
3046 _mesa_print_program(&fp->Base);
3047 }
3048 }
3049
3050 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3051 */
3052 fs_visitor v(brw, c, prog, fp, 8);
3053 if (!v.run()) {
3054 if (prog) {
3055 prog->LinkStatus = false;
3056 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3057 }
3058
3059 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3060 v.fail_msg);
3061
3062 return NULL;
3063 }
3064
3065 exec_list *simd16_instructions = NULL;
3066 fs_visitor v2(brw, c, prog, fp, 16);
3067 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3068 if (c->prog_data.nr_pull_params == 0) {
3069 /* Try a 16-wide compile */
3070 v2.import_uniforms(&v);
3071 if (!v2.run()) {
3072 perf_debug("16-wide shader failed to compile, falling back to "
3073 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3074 } else {
3075 simd16_instructions = &v2.instructions;
3076 }
3077 } else {
3078 perf_debug("Skipping 16-wide due to pull parameters.\n");
3079 }
3080 }
3081
3082 c->prog_data.dispatch_width = 8;
3083
3084 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3085 const unsigned *generated = g.generate_assembly(&v.instructions,
3086 simd16_instructions,
3087 final_assembly_size);
3088
3089 if (unlikely(brw->perf_debug) && shader) {
3090 if (shader->compiled_once)
3091 brw_wm_debug_recompile(brw, prog, &c->key);
3092 shader->compiled_once = true;
3093
3094 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3095 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3096 (get_time() - start_time) * 1000);
3097 }
3098 }
3099
3100 return generated;
3101 }
3102
3103 bool
3104 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3105 {
3106 struct brw_context *brw = brw_context(ctx);
3107 struct brw_wm_prog_key key;
3108
3109 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3110 return true;
3111
3112 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3113 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3114 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3115 bool program_uses_dfdy = fp->UsesDFdy;
3116
3117 memset(&key, 0, sizeof(key));
3118
3119 if (brw->gen < 6) {
3120 if (fp->UsesKill)
3121 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3122
3123 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3124 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3125
3126 /* Just assume depth testing. */
3127 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3128 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3129 }
3130
3131 if (brw->gen < 6)
3132 key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
3133
3134 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
3135 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
3136 continue;
3137
3138 if (brw->gen < 6) {
3139 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
3140 key.input_slots_valid |= BITFIELD64_BIT(i);
3141 }
3142 }
3143
3144 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3145
3146 for (int i = 0; i < MAX_SAMPLERS; i++) {
3147 if (fp->Base.ShadowSamplers & (1 << i)) {
3148 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3149 key.tex.swizzles[i] =
3150 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3151 } else {
3152 /* Color sampler: assume no swizzling. */
3153 key.tex.swizzles[i] = SWIZZLE_XYZW;
3154 }
3155 }
3156
3157 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3158 key.drawable_height = ctx->DrawBuffer->Height;
3159 }
3160
3161 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3162 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3163 }
3164
3165 key.nr_color_regions = 1;
3166
3167 key.program_string_id = bfp->id;
3168
3169 uint32_t old_prog_offset = brw->wm.prog_offset;
3170 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3171
3172 bool success = do_wm_prog(brw, prog, bfp, &key);
3173
3174 brw->wm.prog_offset = old_prog_offset;
3175 brw->wm.prog_data = old_prog_data;
3176
3177 return success;
3178 }